def parse_entry(entry):
    "parses a single search result from scopus"
    try:
        citedby_link = first(lfilter(lambda d: d["@ref"] == "scopus-citedby", entry['link']))
        ensure('prism:doi' in entry, "entry is missing 'doi'!", ParseError)
        ensure('citedby-count' in entry, "entry is missing 'citedby-count'!", ParseError)
        ensure(isint(entry['citedby-count']), "citedby count isn't an integer", ParseError)

        if isinstance(entry['prism:doi'], list):
            weird_key = "$"
            for struct in entry['prism:doi']:
                doi = struct[weird_key]
                if utils.doi2msid(doi, safe=True, allow_subresource=False):
                    entry['prism:doi'] = doi
                    break

        utils.doi2msid(entry['prism:doi'], allow_subresource=False) # throws AssertionError

        return {
            'doi': entry['prism:doi'],
            'num': int(entry['citedby-count']),
            'source': models.SCOPUS,
            'source_id': citedby_link['@href']
        }

    # errors handled here won't be caught by handler.capture_parse_error

    except AssertionError:
        LOG.warn("discarding scopus citation: failed to parse doi", extra={'response': entry})
        return {'bad': entry}

    except ParseError:
        LOG.warn("discarding scopus citation: failed to parse entry", extra={'response': entry})
        return {'bad': entry}
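
# Illustrative only: a minimal entry carrying the fields parse_entry() reads
# above ('prism:doi', 'citedby-count' and a 'scopus-citedby' link). The DOI and
# href below are made-up placeholders, not a real Scopus response.
def _example_parse_entry():
    entry = {
        'prism:doi': '10.7554/eLife.01234',
        'citedby-count': '42',
        'link': [{'@ref': 'scopus-citedby', '@href': 'https://example.org/citedby'}],
    }
    # roughly: {'doi': '10.7554/eLife.01234', 'num': 42,
    #           'source': models.SCOPUS, 'source_id': 'https://example.org/citedby'}
    return parse_entry(entry)
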
    def test_isint(self):
        int_list = [
            1,
            -1,
            '-1',
            '1',
            '1111111111',
            '99999999999999999999999999999999999',
            0xDEADBEEF,  # hex
        ]
        for int_val in int_list:
            self.assertTrue(utils.isint(int_val))
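
# A minimal sketch of what utils.isint might look like, consistent with
# test_isint/test_isnotint here (assumption: the real implementation may differ).
# It accepts ints and int-like strings of any length and rejects anything
# int() can't coerce.
def isint(val):
    try:
        int(val)
        return True
    except (ValueError, TypeError):
        return False
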
def search(api_key=settings.SCOPUS_KEY, doi_prefix=settings.DOI_PREFIX):
    """searches scopus, returning a generator that will iterate through each page
    of results until all pages have been consumed.
    results are cached and expire daily"""

    page = 0
    per_page = 25 # max per page

    data = fetch_page(api_key, doi_prefix, page=page, per_page=per_page).json()

    yield data['search-results']

    # 'opensearch:totalResults' is the total number of results, not pages,
    # but it serves as a generous upper bound here: you can certainly query
    # far beyond 'totalResults / per_page' and still get results back
    total_pages = int(data['search-results']['opensearch:totalResults'])

    # I think we're capped at 10k/day ? can't find their docs on this
    # eLife tends to hit 0 citations at about the 2.5k mark
    max_pages = 5000

    # figure out where to stop
    end_page = max_pages if total_pages > max_pages else total_pages

    try:
        for page in range(page + 1, end_page):
            try:
                data = fetch_page(api_key, doi_prefix, page=page, per_page=per_page).json()
                yield data['search-results']

                # find the first entry in the search results with a 'citedby-count'.
                # this is typically the first but we have results where it's missing
                fltrfn = lambda d: 'citedby-count' in d and isint(d['citedby-count'])
                entry = first(lfilter(fltrfn, data['search-results']['entry']))

                # exit early if we start hitting 0 results
                if entry and int(entry['citedby-count']) == 0:
                    raise GeneratorExit("no more articles with citations")

                # every ten pages print out our progress. 'entry' can be None
                # if nothing on this page carried a 'citedby-count'.
                if page % 10 == 0 and entry:
                    LOG.info("page %s of %s, last citation count: %s" % (page, end_page, entry['citedby-count']))

            except requests.HTTPError as err:
                raise GeneratorExit(str(err))

    except GeneratorExit:
        return
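
# Usage sketch (hypothetical helper, not part of the original module): search()
# is a generator, so pages are fetched lazily as they are consumed. Each page
# is the 'search-results' dict, whose 'entry' list parse_entry() handles.
def count_citations(api_key=settings.SCOPUS_KEY, doi_prefix=settings.DOI_PREFIX):
    results = []
    for page_of_results in search(api_key, doi_prefix):
        for entry in page_of_results.get('entry', []):
            results.append(parse_entry(entry))
    return results
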
def enplumpen(artid):
    "takes an article id like e01234 and returns a DOI like 10.7554/eLife.01234"
    if isint(artid):
        return msid2doi(artid)
    ensure(artid[0] == 'e', 'cannot convert article id %s to doi' % artid)
    return artid.replace('e', '10.7554/eLife.')
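
# illustrative calls, following the docstring above:
#   enplumpen('e01234')  # => '10.7554/eLife.01234'
#   enplumpen(1234)      # integer article ids are handed off to msid2doi
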
    def test_isnotint(self):
        not_int_list = ['one', 'a', utils]
        for not_int in not_int_list:
            self.assertFalse(utils.isint(not_int), "failed on %s" % not_int)