Example #1
    def logNihResultToCsvFile(self, site, keyword, article, articleDetails):
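        # append one PubMed result as a row in a per-site CSV file,
        # creating the file and writing the header row on first use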
        name = site.get('name', '').lower()

        csvFileName = os.path.join(self.options['outputDirectory'],
                                   f'{name}_results.csv')

        if not os.path.exists(csvFileName):
            helpers.makeDirectory(os.path.dirname(csvFileName))
            helpers.toFile(
                'DateTime,Keyword,Title,Date_Submitted,URL,Abstract,Description,Details,ShortDetails,Resource,Type,Identifiers,Db,EntrezUID,Properties,all_authors,all_locations,first_author,firstauthor_location,lastauthor,last_author_location,citations',
                csvFileName)

        articleId = article.get('uid', '')

        dateSubmitted = article.get('sortpubdate', '')
        dateSubmitted = helpers.findBetween(dateSubmitted, '', ' ')
        dateSubmitted = dateSubmitted.replace('/', '-')

        properties = 'create date: ' + helpers.findBetween(
            article.get('sortpubdate', ''), '',
            ' ') + ' | first author: ' + article.get('sortfirstauthor', '')

        authors = []
        for author in article.get('authors', []):
            authors.append(author.get('name', ''))

        description = ', '.join(authors) + '.'

        publicationTypes = ', '.join(article.get('pubtype', []))

        details = article.get('fulljournalname', '') + '. ' + article.get(
            'elocationid', '') + '. ' + publicationTypes + '.'

        line = [
            datetime.datetime.now().strftime('%m%d%y-%H%M%S'), keyword,
            article.get('title', ''), dateSubmitted, f'/pubmed/{articleId}',
            articleDetails.get('abstract', ''), description, details,
            article.get('fulljournalname', '') + '. ' +
            helpers.findBetween(article.get('sortpubdate', ''), '', '/'),
            'PubMed', publicationTypes, f'PMID:{articleId}', 'pubmed',
            articleId, properties,
            articleDetails.get('allAuthors', ''),
            articleDetails.get('allLocations', ''),
            article.get('sortfirstauthor', ''),
            articleDetails.get('firstAuthorLocation', ''),
            article.get('lastauthor', ''),
            articleDetails.get('lastAuthorLocation', ''),
            articleDetails.get('citations', '')
        ]

        self.appendCsvFile(line, csvFileName)
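
The date handling above leans on helpers.findBetween with an empty start or end marker. A minimal sketch of how that helper presumably behaves, where an empty marker stands for the start or the end of the string (an assumption; the real implementation in helpers may differ):

    def findBetween(text, first, last):
        # an empty `first` means "from the start of the string"
        start = 0

        if first:
            index = text.find(first)

            if index == -1:
                return ''

            start = index + len(first)

        # an empty `last`, or a `last` that never occurs, means "to the end of the string"
        end = text.find(last, start) if last else -1

        if end == -1:
            end = len(text)

        return text[start:end]

Under that reading, findBetween('2020/05/12 00:00', '', ' ') returns '2020/05/12', which the code above then rewrites with dashes instead of slashes.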
Example #2
    def arxivSearch(self, site, keyword):
        results = []

        maximumResults = self.options['maximumResultsPerKeyword']

        if maximumResults == -1:
            maximumResults = None

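        # query arXiv through the arxiv package; arxiv.query is the older
        # pre-1.0 interface, which newer releases of the package have replaced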
        items = arxiv.query(query=keyword,
                            id_list=[],
                            max_results=maximumResults,
                            start=0,
                            sort_by="relevance",
                            sort_order="descending",
                            prune=True,
                            iterative=False,
                            max_chunk_results=1000)

        ids = []

        for item in items:
            id = item.get('id', '')
            id = self.getLastAfterSplit(id, '/')

            # avoids duplicates
            if id in ids:
                continue

            ids.append(id)

            pdfUrl = item.get('pdf_url', '')

            if not pdfUrl:
                siteName = helpers.getDomainName(site.get('url', ''))
                message = f'No pdf file found on {siteName} for {id}'
                logging.error(message)
                pdfUrl = f'Error: {message}'

            title = item.get('title', '')
            title = title.replace('\n', ' ')
            title = self.squeezeWhitespace(title)

            dateSubmitted = item.get('published', '')

            dateSubmitted = helpers.findBetween(dateSubmitted, '', 'T')

            shortTitle = title

            if len(shortTitle) > 50:
                shortTitle = shortTitle[0:50] + '...'

            abstract = item.get('summary', '')

            allAuthors = '; '.join(item.get('authors', []))
            allLocations = ''
            firstAuthor = self.getFirst(item.get('authors', []))
            firstAuthorLocation = ''
            lastAuthor = self.getLast(item.get('authors', []))
            lastAuthorLocation = ''
            citations = ''

            result = [
                id, pdfUrl, title, dateSubmitted, abstract, allAuthors,
                allLocations, firstAuthor, firstAuthorLocation, lastAuthor,
                lastAuthorLocation, citations
            ]

            results.append(result)

            logging.info(
                f'Results: {len(results)}. Id: {id}. Title: {shortTitle}.')

        self.totalResults = len(results)

        self.showResultCount()

        # log the search now because the download might fail
        self.logToCsvFiles(site, keyword, -1, [], '', False, True, False)

        return results
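
arxivSearch also calls a few small helpers whose behaviour can be inferred from how they are used here. A minimal sketch under that assumption (the project's actual implementations may differ):

    def getLastAfterSplit(self, value, delimiter):
        # e.g. 'http://arxiv.org/abs/2101.00001v1' -> '2101.00001v1'
        return value.split(delimiter)[-1] if value else ''

    def squeezeWhitespace(self, value):
        # collapse runs of spaces, tabs and newlines into single spaces
        return ' '.join(value.split())

    def getFirst(self, items):
        return items[0] if items else ''

    def getLast(self, items):
        return items[-1] if items else ''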
Example #3
    def getInformationFromDetailsPage(self, siteData, url):
        page = self.downloader.get(url)

        title = self.downloader.getXpath(page,
                                         siteData['titleInDetailsPageXpath'],
                                         True)

        dateSubmitted = self.downloader.getXpath(
            page, siteData['dateSubmittedXpath'], True)
        # the raw value starts with a non-breaking space and ends with a period
        dateSubmitted = helpers.findBetween(dateSubmitted, '\xa0', '.')

        abstract = self.downloader.getXpath(page, siteData['abstractXpath'],
                                            True)

        if dateSubmitted:
            dateSubmitted = self.changeDateFormat(dateSubmitted, '%B %d, %Y')

        import lxml.html as lh
        document = lh.fromstring(page)

        allAuthors = []
        allLocations = []
        firstAuthor = ''
        firstAuthorLocation = ''
        lastAuthor = ''
        lastAuthorLocation = ''

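        # each matched element is one author tooltip block containing the
        # author's name and affiliation spans (the hw-/nlm- prefixes suggest
        # HighWire-style article markup)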
        authorXpath = "//*[contains(@id, 'hw-article-author-popups-')]/div[contains(@class, 'author-tooltip-')]"

        elements = document.xpath(authorXpath)

        for i, element in enumerate(elements):
            name = self.downloader.getXpathInElement(
                element, ".//div[@class = 'author-tooltip-name']", False)

            name = name.strip()

            if not name:
                continue

            allAuthors.append(name)

            if not firstAuthor:
                firstAuthor = name
            # only if the article has a last author
            elif i > 0 and i == len(elements) - 1 and not lastAuthor:
                lastAuthor = name

            affiliations = element.xpath(".//span[@class = 'nlm-aff']")

            for affiliation in affiliations:
                location = affiliation.text_content()

                location = location.strip()

                if not location:
                    continue

                if i == 0 and not firstAuthorLocation:
                    firstAuthorLocation = location
                # only if the article has a last author
                elif i > 0 and i == len(
                        elements) - 1 and not lastAuthorLocation:
                    lastAuthorLocation = location

                # avoid duplicates
                if location not in allLocations:
                    allLocations.append(location)

        result = {
            'title': title,
            'dateSubmitted': dateSubmitted,
            'abstract': abstract,
            'allAuthors': '; '.join(allAuthors),
            'allLocations': ' | '.join(allLocations),
            'firstAuthor': firstAuthor,
            'firstAuthorLocation': firstAuthorLocation,
            'lastAuthor': lastAuthor,
            'lastAuthorLocation': lastAuthorLocation,
            'citations': ''
        }

        return result
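
The siteData dictionary passed in is expected to provide at least the three XPath entries read above. A hypothetical call, where searcher stands for an instance of this class and the XPath values are placeholders rather than values taken from the project:

    siteData = {
        'titleInDetailsPageXpath': "//h1[contains(@class, 'article-title')]",
        'dateSubmittedXpath': "//div[contains(@class, 'published-date')]",
        'abstractXpath': "//div[contains(@class, 'abstract')]"
    }

    details = searcher.getInformationFromDetailsPage(siteData, 'https://example.com/article/1')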
Example #4
    def getNihPage(self, site, keyword, api, pageIndex, existingResults,
                   resultCount):
        results = []

        resultsPerPage = 1000
        start = pageIndex * resultsPerPage
        response = ''

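        # two modes: a normal keyword search goes through the esearch endpoint;
        # with useIdLists set, the "keyword" is already a PubMed id and only the
        # first page is processed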
        if not self.options.get('useIdLists', ''):
            logging.info(f'Getting page {pageIndex + 1}')

            response = api.get(
                f'/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retstart={start}&retmax={resultsPerPage}&term={keyword}'
            )

            if not response:
                logging.error('No response')
                return []

            if not self.totalResults:
                self.totalResults = int(response['esearchresult']['count'])

                self.showResultCount()

                # log the search now because the download might fail
                self.logToCsvFiles(site, keyword, -1, [], '', False, True,
                                   False)
        else:
            if pageIndex == 0:
                response = {'esearchresult': {'idlist': [keyword]}}
            else:
                return []

        i = resultCount

        for item in response['esearchresult']['idlist']:
            if self.shouldStopForThisKeyword(i, False):
                break

            # avoid duplicates
            if self.isInArticleList(existingResults, item):
                continue

            i += 1

            try:
                summaryResponse = api.get(
                    f'/entrez/eutils/esummary.fcgi?db=pubmed&id={item}&retmode=json'
                )

                # default values in case the summary response is missing this item
                title = ''
                abstract = ''
                dateSubmitted = ''
                details = {}

                if 'result' in summaryResponse and item in summaryResponse[
                        'result']:
                    articleSummary = summaryResponse['result'][item]

                    title = articleSummary.get('title', '')

                    shortTitle = title

                    if len(shortTitle) > 50:
                        shortTitle = shortTitle[0:50] + '...'

                    dateSubmitted = articleSummary.get('sortpubdate', '')
                    dateSubmitted = helpers.findBetween(dateSubmitted, '', ' ')
                    dateSubmitted = dateSubmitted.replace('/', '-')

                    details = self.getNihDetails(api, item, articleSummary)

                    abstract = details.get('abstract', '')

                    logging.info(
                        f'Results: {i}. Id: {item}. Title: {shortTitle}.')

                    # write these results to a separate csv
                    self.logNihResultToCsvFile(site, keyword, articleSummary,
                                               details)

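                # no direct PDF URL is collected for PubMed results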
                pdfUrl = ''
            except Exception as e:
                # if something goes wrong, skip this id and move on to the next one
                logging.error(f'Skipping {item}. Something went wrong.')
                logging.debug(traceback.format_exc())
                logging.error(e)
                continue

            result = [item, pdfUrl, title, dateSubmitted, abstract]

            fields = [
                'allAuthors', 'allLocations', 'firstAuthor',
                'firstAuthorLocation', 'lastAuthor', 'lastAuthorLocation',
                'citations'
            ]

            for field in fields:
                result.append(details.get(field, ''))

            results.append(result)

        return results
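
getNihPage returns one page of results and an empty list once there is nothing more to fetch, so a caller elsewhere in the class could page through a keyword roughly like this (a hypothetical sketch, not the project's actual calling code):

    results = []
    pageIndex = 0

    while True:
        pageResults = self.getNihPage(site, keyword, api, pageIndex, results, len(results))

        # stop once a page comes back empty
        if not pageResults:
            break

        results += pageResults
        pageIndex += 1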
Example #5
    def initialize(self):
        suffix = helpers.getArgument('-w', False)

        if suffix:
            suffix = '-' + helpers.fileNameOnly(suffix, False)

        helpers.setUpLogging(suffix)

        logging.info('Starting\n')

        self.onItemIndex = 0
        self.onKeywordIndex = 0

        # stores when we finished each site/keyword/directory combination
        self.database = Database('database.sqlite')
        self.database.execute(
            'create table if not exists history ( siteName text, keyword text, directory text, gmDate text, primary key(siteName, keyword, directory) )'
        )

        self.downloader = Downloader()
        self.dateStarted = datetime.datetime.now().strftime('%m%d%y')

        outputDirectory = os.path.join(str(Path.home()), 'Desktop',
                                       f'WebSearch_{self.dateStarted}')

        # set default options
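        # (defaults here are overridden by options.ini, which command line
        # parameters can override in turn)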
        self.options = {
            'inputWebsitesFile': 'input_websites.txt',
            'inputKeywordsFile': '',
            'outputDirectory': outputDirectory,
            'secondsBetweenItems': 0,
            'maximumDaysToKeepItems': 90,
            'maximumResultsPerKeyword': 25000,
            'directoryToCheckForDuplicates': '',
            'useIdLists': 0
        }

        self.keywordsFiles = {}
        self.idListFiles = {}

        # read the options file
        helpers.setOptions('options.ini', self.options)
        helpers.setOptions('options.ini', self.keywordsFiles, 'search terms')
        helpers.setOptions('options.ini', self.idListFiles, 'id lists')

        # read command line parameters
        self.setOptionFromParameter('inputWebsitesFile', '-w')
        self.setOptionFromParameter('inputKeywordsFile', '-s')
        self.setOptionFromParameter('outputDirectory', '-d')

        if '-i' in sys.argv:
            self.options['maximumResultsPerKeyword'] = 1
            logging.info('Downloading by ID list')
            self.options['useIdLists'] = 1

        # read websites file
        file = helpers.getFile(self.options['inputWebsitesFile'])
        self.sites = []

        for item in file.splitlines():
            name = helpers.findBetween(item, '', ' ')
            url = helpers.findBetween(item, ' ', '')

            site = {'name': name, 'url': url}

            self.sites.append(site)

        self.removeOldEntries()
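
The websites file is read as one site per line, with the site name before the first space and its URL after it. A hypothetical input_websites.txt in that format (the entries are illustrative, not taken from the project):

    nih https://www.ncbi.nlm.nih.gov
    arxiv https://arxiv.org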