Example #1
def do(row):
    # find or create the Page for this row; update=False leaves an
    # existing page untouched. `ptypeobj` is presumably a page-type
    # object from the enclosing scope.
    page_data = {
        'type': ptypeobj,
        'identifier': row['identifier'],
    }
    pageobj = first(create_or_update(models.Page, page_data, update=False))

    # upsert the view count for this page and date, keyed on ('page', 'date')
    pagecount_data = {
        'page': pageobj,
        'views': row['views'],
        'date': row['date'],
    }
    key_list = ['page', 'date']
    pagecountobj = first(create_or_update(models.PageCount, pagecount_data, key_list, update=True))
    return pagecountobj
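
The `first` and `create_or_update` helpers above aren't standard library functions. A minimal sketch of what they might look like, assuming Django-style models and assuming `create_or_update` returns an `(instance, created, updated)` tuple whose head is the object itself:

def first(seq):
    "returns the first item of a sequence, or None when it's empty"
    try:
        return next(iter(seq))
    except StopIteration:
        return None

def create_or_update(model, data, key_list=None, create=True, update=True):
    """hypothetical upsert: looks up a row by the keys in `key_list`
    (every key in `data` when None) and creates or updates it according
    to the flags. returns (instance, created, updated)."""
    key_list = key_list or list(data.keys())
    lookup = {key: data[key] for key in key_list}
    instance, created, updated = None, False, False
    try:
        instance = model.objects.get(**lookup)
        if update:
            for attr, val in data.items():
                setattr(instance, attr, val)
            instance.save()
            updated = True
    except model.DoesNotExist:
        if create:
            instance = model.objects.create(**data)
            created = True
    return instance, created, updated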
Example #2
def parse_entry(entry):
    "parses a single search result from scopus"
    try:
        citedby_link = first(lfilter(lambda d: d["@ref"] == "scopus-citedby", entry['link']))
        ensure('prism:doi' in entry, "entry is missing 'doi'!", ParseError)
        ensure('citedby-count' in entry, "entry is missing 'citedby-count'!", ParseError)
        ensure(isint(entry['citedby-count']), "citedby count isn't an integer", ParseError)

        if isinstance(entry['prism:doi'], list):
            # 'prism:doi' is occasionally a list of structs keyed on "$";
            # use the first doi that maps to a manuscript id
            weird_key = "$"
            for struct in entry['prism:doi']:
                doi = struct[weird_key]
                if utils.doi2msid(doi, safe=True, allow_subresource=False):
                    entry['prism:doi'] = doi
                    break

        utils.doi2msid(entry['prism:doi'], allow_subresource=False) # raises AssertionError if the doi can't be mapped

        return {
            'doi': entry['prism:doi'],
            'num': int(entry['citedby-count']),
            'source': models.SCOPUS,
            'source_id': citedby_link['@href']
        }

    # errors handled here won't be caught by handler.capture_parse_error

    except AssertionError:
        LOG.warning("discarding scopus citation: failed to parse doi", extra={'response': entry})
        return {'bad': entry}

    except ParseError:
        LOG.warning("discarding scopus citation: failed to parse entry", extra={'response': entry})
        return {'bad': entry}
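
`ensure`, `lfilter` and `isint` are small project utilities assumed here; plausible definitions consistent with how they are called above:

def ensure(assertion, msg, exception_class=AssertionError):
    "raises `exception_class` with `msg` when `assertion` is falsy"
    if not assertion:
        raise exception_class(msg)

def lfilter(fn, seq):
    "a `filter` that returns a concrete list instead of a lazy iterator"
    return list(filter(fn, seq))

def isint(val):
    "True if `val` can be coerced to an integer"
    try:
        int(val)
        return True
    except (TypeError, ValueError):
        return False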
Example #3
def resolve_pmcid(artobj):
    pmcid = artobj.pmcid
    if pmcid:
        LOG.debug("no pmcid fetch necessary")
        return pmcid
    data = _fetch_pmids(artobj.doi)
    # don't use the doi from the response; prefer the doi we already have
    data['doi'] = artobj.doi
    artobj = first(utils.create_or_update(models.Article, data, ['doi'], create=False, update=True))
    return artobj.pmcid
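
`_fetch_pmids` isn't shown. A sketch of what it plausibly does, assuming it queries NCBI's public ID converter service; the endpoint, parameters and response handling here are assumptions, not the project's actual code:

import requests

IDCONV_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

def _fetch_pmids(doi):
    "hypothetical: resolve a doi to its pmid/pmcid via NCBI's id converter"
    resp = requests.get(IDCONV_URL, params={'ids': doi, 'format': 'json'})
    resp.raise_for_status()
    record = resp.json()['records'][0]  # assumes exactly one record per doi
    return {'doi': record.get('doi'),
            'pmid': record.get('pmid'),
            'pmcid': record.get('pmcid')}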
Example #4
def search(api_key=settings.SCOPUS_KEY, doi_prefix=settings.DOI_PREFIX):
    """searches scopus, returning a generator that will iterate through each page
    of results until all pages have been consumed.
    results are cached and expire daily"""

    page = 0
    per_page = 25 # max per page

    data = fetch_page(api_key, doi_prefix, page=page, per_page=per_page).json()

    yield data['search-results']

    # 'opensearch:totalResults' is the total number of *results*, not pages;
    # you can certainly query far far beyond 'totalResults / per_page'
    total_results = int(data['search-results']['opensearch:totalResults'])
    total_pages = (total_results + per_page - 1) // per_page  # ceiling division

    # I think we're capped at 10k/day? can't find their docs on this
    # eLife tends to hit 0 citations at about the 2.5k mark
    max_pages = 5000

    # figure out where to stop
    end_page = min(total_pages, max_pages)

    try:
        for page in range(page + 1, end_page):
            try:
                data = fetch_page(api_key, doi_prefix, page=page, per_page=per_page).json()
                yield data['search-results']

                # find the first entry in the search results with a 'citedby-count'.
                # this is typically the first but we have results where it's missing
                fltrfn = lambda d: 'citedby-count' in d and isint(d['citedby-count'])
                entry = first(lfilter(fltrfn, data['search-results']['entry']))

                # exit early if we start hitting 0 results
                if entry and int(entry['citedby-count']) == 0:
                    raise GeneratorExit("no more articles with citations")

                # every ten pages, log our progress (skip pages with no usable entry)
                if page % 10 == 0 and entry:
                    LOG.info("page %s of %s, last citation count: %s", page, end_page, entry['citedby-count'])

            except requests.HTTPError as err:
                raise GeneratorExit(str(err))

    except GeneratorExit:
        return
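
A hypothetical caller that drains the generator and feeds each entry through `parse_entry` from Example #2; this wiring is illustrative, not code from the project:

def all_parsed_entries(api_key=settings.SCOPUS_KEY, doi_prefix=settings.DOI_PREFIX):
    "hypothetical: yields parsed citation counts across all result pages"
    for page_of_results in search(api_key, doi_prefix):
        for entry in page_of_results.get('entry', []):
            result = parse_entry(entry)
            if 'bad' not in result:
                yield result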