def lookUpItem(self, site, keyword):
    siteName = helpers.getDomainName(site.get('url', ''))

    self.totalResults = 0
    articles = []

    # use pubmed's api
    if siteName == 'nih.gov':
        articles = self.nihSearch(site, keyword)
    # use arxiv's api
    elif siteName == 'arxiv.org':
        articles = self.arxivSearch(site, keyword)
    # get the website and parse it
    else:
        siteData = {}

        # quote_plus already encodes spaces as '+', so no further replacement is needed
        keywordWithPlusSigns = urllib.parse.quote_plus(keyword)

        # biorxiv and medrxiv share the same page structure, so they use the
        # same xpaths and differ only in their domain
        if siteName in ('biorxiv.org', 'medrxiv.org'):
            baseUrl = f'https://www.{siteName}'

            siteData = {
                'url': f'{baseUrl}/search/{keywordWithPlusSigns}%20numresults%3A75%20sort%3Arelevance-rank',
                'resultsXpath': "//a[@class = 'highwire-cite-linked-title']",
                'totalResultsXpath': "//*[@id = 'search-summary-wrapper']",
                'titleXpath': "./span[@class = 'highwire-cite-title']",
                'dateSubmittedXpath': "//div[@class = 'pane-content' and contains(., 'Posted')]",
                'urlPrefix': baseUrl,
                'afterFirstPageSuffix': '?page={}',
                'abstractXpath': "//*[@id = 'abstract-1']//*[@id = 'p-2']",
                'titleInDetailsPageXpath': "//*[@id = 'page-title']"
            }

        articles = self.genericSearch(site, keyword, siteData)

    # output each result we found
    for resultNumber, article in enumerate(articles, start=1):
        self.outputResult(site, keyword, resultNumber, article)
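# A minimal usage sketch (hypothetical): a `site` dict only needs a 'url' key
# here, since everything else is derived from the domain name. Assumes the
# surrounding object has already been constructed with its options and database.
#
#   site = {'url': 'https://arxiv.org'}
#   downloader.lookUpItem(site, 'spike protein')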
def outputResult(self, site, keyword, resultNumber, article):
    siteName = helpers.getDomainName(site.get('url', ''))

    # each article is a list: [id, pdfUrl, title, dateSubmitted, abstract, ...]
    articleId = article[0]
    pdfUrl = article[1]

    # this version only records the result; nothing is downloaded
    downloaded = 'Not downloaded'
    outputFileName = ''

    # log to the csv file anyway
    self.logToCsvFiles(site, keyword, resultNumber, article, outputFileName, downloaded, False, True)

    self.waitBetween()
def markDone(self, site, keyword):
    siteName = helpers.getDomainName(site.get('url', ''))

    # double any single quotes so the value is safe inside the sql string
    keyword = keyword.replace("'", "''")

    item = {
        'siteName': siteName,
        'keyword': keyword,
        'directory': self.options['outputDirectory'],
        'gmDate': str(datetime.datetime.utcnow())
    }

    logging.debug('Inserting into history table')
    logging.debug(item)

    self.database.insert('history', item)
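# A minimal sketch of the 'history' table the insert above assumes. This is
# hypothetical DDL inferred from the fields used in markDone; the real schema
# lives inside the database helper, which is not shown here. Assumes sqlite3.
def _createHistoryTableSketch(connection):
    # one row per (site, keyword, output directory) combination already processed
    connection.execute(
        'create table if not exists history '
        '(siteName text, keyword text, directory text, gmDate text)')
    connection.commit()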
def isDone(self, site, keyword):
    result = False

    siteName = helpers.getDomainName(site.get('url', ''))
    # double any single quotes so the value is safe inside the sql string
    keyword = keyword.replace("'", "''")
    directory = self.options['outputDirectory']

    row = self.database.getFirst(
        'history', 'siteName',
        f"siteName = '{siteName}' and keyword = '{keyword}' and directory = '{directory}'",
        '', '')

    if row:
        logging.info('Skipping. Already done this item.')
        result = True

    return result
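# A safer variant of the lookup above, shown as a sketch: parameterized
# queries make the manual quote-doubling unnecessary and rule out sql
# injection. This assumes the underlying store is sqlite3 and takes the
# connection explicitly rather than guessing at the database wrapper's
# internals; _isDoneParameterized is a hypothetical name.
def _isDoneParameterized(connection, siteName, keyword, directory):
    cursor = connection.execute(
        'select siteName from history '
        'where siteName = ? and keyword = ? and directory = ?',
        (siteName, keyword, directory))
    return cursor.fetchone() is not None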
def arxivSearch(self, site, keyword):
    results = []

    maximumResults = self.options['maximumResultsPerKeyword']

    # the arxiv package treats None as "no limit"
    if maximumResults == -1:
        maximumResults = None

    # this uses the legacy arxiv package api (pre-1.0); newer versions of the
    # package replaced arxiv.query with arxiv.Search
    items = arxiv.query(query=keyword,
                        id_list=[],
                        max_results=maximumResults,
                        start=0,
                        sort_by="relevance",
                        sort_order="descending",
                        prune=True,
                        iterative=False,
                        max_chunk_results=1000)

    ids = []

    for item in items:
        articleId = item.get('id', '')
        articleId = self.getLastAfterSplit(articleId, '/')

        # avoid duplicates
        if articleId in ids:
            continue

        ids.append(articleId)

        pdfUrl = item.get('pdf_url', '')

        if not pdfUrl:
            siteName = helpers.getDomainName(site.get('url', ''))
            message = f'No pdf file found on {siteName} for {articleId}'
            logging.error(message)
            pdfUrl = f'Error: {message}'

        title = item.get('title', '')
        title = title.replace('\n', ' ')
        title = self.squeezeWhitespace(title)

        dateSubmitted = item.get('published', '')
        # keep only the date part of the iso timestamp
        dateSubmitted = helpers.findBetween(dateSubmitted, '', 'T')

        shortTitle = title
        if len(shortTitle) > 50:
            shortTitle = shortTitle[0:50] + '...'

        abstract = item.get('summary', '')

        allAuthors = '; '.join(item.get('authors', []))
        allLocations = ''
        firstAuthor = self.getFirst(item.get('authors', []))
        firstAuthorLocation = ''
        lastAuthor = self.getLast(item.get('authors', []))
        lastAuthorLocation = ''
        citations = ''

        result = [
            articleId, pdfUrl, title, dateSubmitted, abstract, allAuthors, allLocations,
            firstAuthor, firstAuthorLocation, lastAuthor, lastAuthorLocation, citations
        ]

        results.append(result)

        logging.info(f'Results: {len(results)}. Id: {articleId}. Title: {shortTitle}.')

    self.totalResults = len(results)
    self.showResultCount()

    # log the search now because the download might fail
    self.logToCsvFiles(site, keyword, -1, [], '', False, True, False)

    return results
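# For reference, a sketch of the same search against the current arxiv
# package (1.x/2.x), which replaced arxiv.query with arxiv.Search; the result
# attributes map roughly onto the dictionary keys used above. This describes
# the newer api as an aside, not what this code runs.
def _arxivSearchSketch(keyword, maximumResults):
    import arxiv

    search = arxiv.Search(query=keyword,
                          max_results=maximumResults,
                          sort_by=arxiv.SortCriterion.Relevance,
                          sort_order=arxiv.SortOrder.Descending)

    for item in arxiv.Client().results(search):
        # entry_id, pdf_url, title and published replace the old
        # 'id', 'pdf_url', 'title' and 'published' keys
        print(item.entry_id, item.pdf_url, item.title, item.published)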
def showStatus(self, item, keyword):
    siteName = helpers.getDomainName(item.get('url', ''))

    logging.info(
        f'Site {self.onItemIndex + 1} of {len(self.sites)}: {siteName}. Keyword {self.onKeywordIndex + 1} of {len(self.keywords)}: {keyword}.'
    )
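# A sketch of how these methods likely fit together in the main run loop,
# reconstructed from the attributes they reference (self.sites, self.keywords,
# self.onItemIndex, self.onKeywordIndex); the real loop is not shown here and
# _runSketch is a hypothetical name.
def _runSketch(self):
    for self.onItemIndex, site in enumerate(self.sites):
        for self.onKeywordIndex, keyword in enumerate(self.keywords):
            self.showStatus(site, keyword)

            # skip (site, keyword) pairs already recorded in the history table
            if self.isDone(site, keyword):
                continue

            self.lookUpItem(site, keyword)
            self.markDone(site, keyword)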