def logNihResultToCsvFile(self, site, keyword, article, articleDetails):
    name = site.get('name', '').lower()
    csvFileName = os.path.join(self.options['outputDirectory'], f'{name}_results.csv')

    # write the header row the first time this file is created
    if not os.path.exists(csvFileName):
        helpers.makeDirectory(os.path.dirname(csvFileName))
        helpers.toFile(
            'DateTime,Keyword,Title,Date_Submitted,URL,Abstract,Description,Details,ShortDetails,Resource,Type,Identifiers,Db,EntrezUID,Properties,all_authors,all_locations,first_author,firstauthor_location,lastauthor,last_author_location,citations',
            csvFileName)

    articleId = article.get('uid', '')

    dateSubmitted = article.get('sortpubdate', '')
    dateSubmitted = helpers.findBetween(dateSubmitted, '', ' ')
    dateSubmitted = dateSubmitted.replace('/', '-')

    properties = 'create date: ' + helpers.findBetween(
        article.get('sortpubdate', ''), '', ' ') + ' | first author: ' + article.get('sortfirstauthor', '')

    authors = []

    for author in article.get('authors', []):
        authors.append(author.get('name', ''))

    description = ', '.join(authors) + '.'

    publicationTypes = ', '.join(article.get('pubtype', []))

    details = article.get('fulljournalname', '') + '. ' + article.get(
        'elocationid', '') + '. ' + publicationTypes + '.'

    line = [
        datetime.datetime.now().strftime('%m%d%y-%H%M%S'),
        keyword,
        article.get('title', ''),
        dateSubmitted,
        f'/pubmed/{articleId}',
        articleDetails.get('abstract', ''),
        description,
        details,
        article.get('fulljournalname', '') + '. ' + helpers.findBetween(article.get('sortpubdate', ''), '', '/'),
        'PubMed',
        publicationTypes,
        f'PMID:{articleId}',
        'pubmed',
        articleId,
        properties,
        articleDetails.get('allAuthors', ''),
        articleDetails.get('allLocations', ''),
        article.get('sortfirstauthor', ''),
        articleDetails.get('firstAuthorLocation', ''),
        article.get('lastauthor', ''),
        articleDetails.get('lastAuthorLocation', ''),
        articleDetails.get('citations', '')
    ]

    self.appendCsvFile(line, csvFileName)
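# Illustrative only: the approximate shape of the two dicts the logger above consumes.
# 'article' is a PubMed esummary record and 'articleDetails' comes from getNihDetails.
# Every key shown is one the method actually reads, but the values are made-up
# placeholders, not real data.
exampleArticle = {
    'uid': '12345678',
    'title': 'Example article title',
    'sortpubdate': '2020/01/15 00:00',
    'sortfirstauthor': 'Smith J',
    'lastauthor': 'Doe A',
    'authors': [{'name': 'Smith J'}, {'name': 'Doe A'}],
    'pubtype': ['Journal Article'],
    'fulljournalname': 'Example Journal',
    'elocationid': 'doi: 10.0000/example'
}

exampleArticleDetails = {
    'abstract': 'Example abstract text.',
    'allAuthors': 'Smith J; Doe A',
    'allLocations': 'Example University',
    'firstAuthorLocation': 'Example University',
    'lastAuthorLocation': 'Example University',
    'citations': ''
}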
def arxivSearch(self, site, keyword): results = [] maximumResults = self.options['maximumResultsPerKeyword'] if maximumResults == -1: maximumResults = None items = arxiv.query(query=keyword, id_list=[], max_results=maximumResults, start=0, sort_by="relevance", sort_order="descending", prune=True, iterative=False, max_chunk_results=1000) ids = [] for item in items: id = item.get('id', '') id = self.getLastAfterSplit(id, '/') # avoids duplicates if id in ids: continue ids.append(id) pdfUrl = item.get('pdf_url', '') if not pdfUrl: siteName = helpers.getDomainName(site.get('url', '')) message = f'No pdf file found on {siteName} for {id}' logging.error(message) pdfUrl = f'Error: {message}' title = item.get('title', '') title = title.replace('\n', ' ') title = self.squeezeWhitespace(title) dateSubmitted = item.get('published', '') dateSubmitted = helpers.findBetween(dateSubmitted, '', 'T') shortTitle = title if len(shortTitle) > 50: shortTitle = shortTitle[0:50] + '...' abstract = item.get('summary', '') allAuthors = '; '.join(item.get('authors', '')) allLocations = '' firstAuthor = self.getFirst(item.get('authors', '')) firstAuthorLocation = '' lastAuthor = self.getLast(item.get('authors', '')) lastAuthorLocation = '' citations = '' result = [ id, pdfUrl, title, dateSubmitted, abstract, allAuthors, allLocations, firstAuthor, firstAuthorLocation, lastAuthor, lastAuthorLocation, citations ] results.append(result) logging.info( f'Results: {len(results)}. Id: {id}. Title: {shortTitle}.') self.totalResults = len(results) self.showResultCount() # log the search now because the download might fail self.logToCsvFiles(site, keyword, -1, [], '', False, True, False) return results
def getInformationFromDetailsPage(self, siteData, url):
    page = self.downloader.get(url)

    title = self.downloader.getXpath(page, siteData['titleInDetailsPageXpath'], True)

    dateSubmitted = self.downloader.getXpath(page, siteData['dateSubmittedXpath'], True)
    # it starts with a non-breaking space
    dateSubmitted = helpers.findBetween(dateSubmitted, '\xa0', '.')

    abstract = self.downloader.getXpath(page, siteData['abstractXpath'], True)

    if dateSubmitted:
        dateSubmitted = self.changeDateFormat(dateSubmitted, '%B %d, %Y')

    import lxml.html as lh
    document = lh.fromstring(page)

    allAuthors = []
    allLocations = []
    firstAuthor = ''
    firstAuthorLocation = ''
    lastAuthor = ''
    lastAuthorLocation = ''

    authorXpath = "//*[contains(@id, 'hw-article-author-popups-')]/div[contains(@class, 'author-tooltip-')]"

    elements = document.xpath(authorXpath)

    for i, element in enumerate(elements):
        name = self.downloader.getXpathInElement(
            element, ".//div[@class = 'author-tooltip-name']", False)
        name = name.strip()

        if not name:
            continue

        allAuthors.append(name)

        if not firstAuthor:
            firstAuthor = name
        # only if the article has a last author
        elif i > 0 and i == len(elements) - 1 and not lastAuthor:
            lastAuthor = name

        affiliations = element.xpath(".//span[@class = 'nlm-aff']")

        for affiliation in affiliations:
            location = affiliation.text_content()
            location = location.strip()

            if not location:
                continue

            if i == 0 and not firstAuthorLocation:
                firstAuthorLocation = location
            # only if the article has a last author
            elif i > 0 and i == len(elements) - 1 and not lastAuthorLocation:
                lastAuthorLocation = location

            # avoid duplicates
            if location not in allLocations:
                allLocations.append(location)

    result = {
        'title': title,
        'dateSubmitted': dateSubmitted,
        'abstract': abstract,
        'allAuthors': '; '.join(allAuthors),
        'allLocations': ' | '.join(allLocations),
        'firstAuthor': firstAuthor,
        'firstAuthorLocation': firstAuthorLocation,
        'lastAuthor': lastAuthor,
        'lastAuthorLocation': lastAuthorLocation,
        'citations': ''
    }

    return result
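# Illustrative only: getInformationFromDetailsPage expects siteData to carry three
# XPath strings under the keys shown here. The expressions below are placeholders,
# not the ones actually configured for any site.
exampleSiteData = {
    'titleInDetailsPageXpath': "//h1[@id = 'page-title']",
    'dateSubmittedXpath': "//div[contains(@class, 'pane-custom')]",
    'abstractXpath': "//div[contains(@class, 'abstract')]//p"
}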
def getNihPage(self, site, keyword, api, pageIndex, existingResults, resultCount):
    results = []

    resultsPerPage = 1000
    start = pageIndex * resultsPerPage

    response = ''

    if not self.options.get('useIdLists', ''):
        logging.info(f'Getting page {pageIndex + 1}')

        response = api.get(
            f'/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retstart={start}&retmax={resultsPerPage}&term={keyword}'
        )

        if not response:
            logging.error('No response')
            return []

        if not self.totalResults:
            self.totalResults = response['esearchresult']['count']
            self.showResultCount()

            # log the search now because the download might fail
            self.logToCsvFiles(site, keyword, -1, [], '', False, True, False)
    else:
        # in ID list mode the "keyword" is already a PubMed ID, so fake a one-item response
        if pageIndex == 0:
            response = {'esearchresult': {'idlist': [keyword]}}
        else:
            return []

    i = resultCount

    for item in response['esearchresult']['idlist']:
        if self.shouldStopForThisKeyword(i, False):
            break

        # avoid duplicates
        if self.isInArticleList(existingResults, item):
            continue

        i += 1

        try:
            summaryResponse = api.get(
                f'/entrez/eutils/esummary.fcgi?db=pubmed&id={item}&retmode=json'
            )

            # defaults in case the summary response does not contain this ID
            title = ''
            abstract = ''
            dateSubmitted = ''
            details = {}

            if 'result' in summaryResponse and item in summaryResponse['result']:
                articleSummary = summaryResponse['result'][item]

                title = articleSummary.get('title', '')

                shortTitle = title

                if len(shortTitle) > 50:
                    shortTitle = shortTitle[0:50] + '...'

                dateSubmitted = articleSummary.get('sortpubdate', '')
                dateSubmitted = helpers.findBetween(dateSubmitted, '', ' ')
                dateSubmitted = dateSubmitted.replace('/', '-')

                details = self.getNihDetails(api, item, articleSummary)

                abstract = details.get('abstract', '')

                logging.info(f'Results: {i}. Id: {item}. Title: {shortTitle}.')

                # write these results to a separate csv
                self.logNihResultToCsvFile(site, keyword, articleSummary, details)

            pdfUrl = ''
        except Exception as e:
            # if something goes wrong, we just go to the next item
            logging.error(f'Skipping {item}. Something went wrong.')
            logging.debug(traceback.format_exc())
            logging.error(e)
            continue

        result = [item, pdfUrl, title, dateSubmitted, abstract]

        fields = [
            'allAuthors', 'allLocations', 'firstAuthor', 'firstAuthorLocation',
            'lastAuthor', 'lastAuthorLocation', 'citations'
        ]

        for field in fields:
            result.append(details.get(field, ''))

        results.append(result)

    return results
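# Illustrative only: the envelope of the two NCBI E-utilities JSON responses the
# method above reads. Only the keys the code touches are shown; real responses carry
# many more fields, and the values here are placeholders.
exampleEsearchResponse = {
    'esearchresult': {
        'count': '2',
        'idlist': ['12345678', '23456789']
    }
}

exampleEsummaryResponse = {
    'result': {
        'uids': ['12345678'],
        '12345678': {'uid': '12345678', 'title': 'Example article title'}
    }
}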
def initialize(self):
    suffix = helpers.getArgument('-w', False)

    if suffix:
        suffix = '-' + helpers.fileNameOnly(suffix, False)

    helpers.setUpLogging(suffix)

    logging.info('Starting\n')

    self.onItemIndex = 0
    self.onKeywordIndex = 0

    # to store the time we finished given site/keyword combinations
    self.database = Database('database.sqlite')
    self.database.execute(
        'create table if not exists history ( siteName text, keyword text, directory text, gmDate text, primary key(siteName, keyword, directory) )'
    )

    self.downloader = Downloader()
    self.dateStarted = datetime.datetime.now().strftime('%m%d%y')

    outputDirectory = os.path.join(str(Path.home()), 'Desktop', f'WebSearch_{self.dateStarted}')

    # set default options
    self.options = {
        'inputWebsitesFile': 'input_websites.txt',
        'inputKeywordsFile': '',
        'outputDirectory': outputDirectory,
        'secondsBetweenItems': 0,
        'maximumDaysToKeepItems': 90,
        'maximumResultsPerKeyword': 25000,
        'directoryToCheckForDuplicates': '',
        'useIdLists': 0
    }

    self.keywordsFiles = {}
    self.idListFiles = {}

    # read the options file
    helpers.setOptions('options.ini', self.options)
    helpers.setOptions('options.ini', self.keywordsFiles, 'search terms')
    helpers.setOptions('options.ini', self.idListFiles, 'id lists')

    # read command line parameters
    self.setOptionFromParameter('inputWebsitesFile', '-w')
    self.setOptionFromParameter('inputKeywordsFile', '-s')
    self.setOptionFromParameter('outputDirectory', '-d')

    if '-i' in sys.argv:
        self.options['maximumResultsPerKeyword'] = 1

        logging.info('Downloading by ID list')
        self.options['useIdLists'] = 1

    # read websites file
    file = helpers.getFile(self.options['inputWebsitesFile'])

    self.sites = []

    for item in file.splitlines():
        name = helpers.findBetween(item, '', ' ')
        url = helpers.findBetween(item, ' ', '')

        site = {'name': name, 'url': url}

        self.sites.append(site)

    self.removeOldEntries()
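# Illustrative only: a minimal options.ini and input_websites.txt matching what
# initialize() reads. The option keys and the 'search terms' / 'id lists' section
# names come from the code above; the default section name, the per-site keys and
# all file names or paths are guesses, not the project's real configuration.
#
# options.ini
#   [main]
#   inputWebsitesFile = input_websites.txt
#   outputDirectory = /home/user/Desktop/WebSearch
#   secondsBetweenItems = 0
#   maximumDaysToKeepItems = 90
#   maximumResultsPerKeyword = 25000
#
#   [search terms]
#   nih = nih_search_terms.txt
#
#   [id lists]
#   nih = nih_id_list.txt
#
# input_websites.txt (one "name url" pair per line, separated by a space)
#   nih https://www.ncbi.nlm.nih.gov
#   arxiv https://arxiv.org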