Example #1
def generic_results_processor(ptype, frame, rows):
    # default to None so the `ensure` below fails cleanly instead of raising a NameError
    path_processor = None
    if 'path-map-file' in frame:
        mapping = parse_map_file(frame)
        path_processor = partial(process_mapped_path, mapping)
    elif 'prefix' in frame:
        path_processor = partial(process_prefixed_path, frame['prefix'])
    elif 'path-map' in frame:
        path_processor = partial(process_mapped_path, frame['path-map'])

    ensure(path_processor, "generic results processing requires a 'path-map-file', 'prefix' or 'path-map' key.")

    def _process(row):
        try:
            path, datestr, count = row
            identifier = path_processor(path)
            if identifier is None:
                return # raise ValueError?
            return {
                'views': int(count),
                'date': _str2dt(datestr),
                'identifier': identifier,
            }
        except ValueError as err:
            LOG.info("skipping row, bad value: %s" % str(err))
        except BaseException as err:
            LOG.exception("unhandled exception processing row: %s", str(err), extra={"row": row})
    return list(filter(None, map(_process, rows)))
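A hypothetical call, assuming a frame that only carries a 'prefix' key and rows shaped as (path, date-string, count) tuples; all values below are made up.

# illustrative only
frame = {'prefix': '/events'}
rows = [
    ('/events/foobar', '2017-01-01', '12'),
    ('/events/foobar', 'not-a-date', '3'),  # presumably skipped: _str2dt should raise a ValueError
]
results = generic_results_processor('event', frame, rows)
# => [{'views': 12, 'date': <date 2017-01-01>, 'identifier': 'foobar'}]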
Example #2
def process_prefixed_path(prefix, path):
    path = normalise_path(path)
    ensure(path.startswith(prefix), "path does not start with given prefix (%r): %s" % (prefix, path), ValueError)
    # we could just dispense with the prefix and discard the first segment ...
    prefix_len = len(prefix)
    path = path[prefix_len:].strip().strip('/') # /events/foobar => foobar
    identifier = path.split('/', 1)[0] # foobar/the-baz-in-bar-fooed-at-the-star => foobar
    return identifier
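For example, assuming normalise_path leaves these paths untouched (values illustrative):

process_prefixed_path('/events', '/events/foobar')                       # => 'foobar'
process_prefixed_path('/events', '/events/foobar/the-baz-in-bar-fooed')  # => 'foobar'
process_prefixed_path('/events', '/other/foobar')                        # raises ValueError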
Example #3
def writefile(xid, content, fname):
    path = join(settings.DUMP_PATH, xid)
    utils.ensure(utils.mkdirs(path), "failed to create path %s" % path)
    # e.g. /tmp/elife-metrics/pmc-asdfasdfasdf-482309849230/log
    path = join(path, fname)
    if isinstance(content, str):
        content = content.encode('utf8')
    with open(path, 'wb') as fh:
        fh.write(content)
    return path
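A sketch of how it might be called; the xid, content and filename are made up:

# writes to <DUMP_PATH>/<xid>/<fname> and returns the full path
path = writefile('pmc-asdfasdfasdf-482309849230', '{"some": "payload"}', 'log')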
Example #4
def _parse_line(line):
    "the file is a simple 'cat nginx-redirect-file | grep prefix > outfile'"
    # note: `frame` is a closure variable from the enclosing `parse_map_file` scope
    line = line.strip()
    if not line:
        return
    path, redirect = line.split("' '")
    path, redirect = path.strip(" '"), redirect.strip(" ';")
    prefix = frame['redirect-prefix']
    ensure(redirect.startswith(prefix), "redirect doesn't start with redirect-prefix: %s" % line)
    # /inside-elife/foobar => foobar
    bits = redirect.strip('/').split('/', 1) # '/inside-elife/foobar' -> 'inside-elife/foobar' -> ['inside-elife', 'foobar']
    redirect = models.LANDING_PAGE if len(bits) == 1 else bits[1]
    return (path, redirect)
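Illustratively, with frame['redirect-prefix'] set to '/inside-elife' in the enclosing scope (values made up):

_parse_line("'/old-path' '/inside-elife/foobar';")  # => ('/old-path', 'foobar')
_parse_line("'/old-home' '/inside-elife';")         # => ('/old-home', models.LANDING_PAGE)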
Example #5
def generic_query_processor(ptype, frame):
    # NOTE: ptype is unused, it's just to match a query processor function's signature
    ptype_filter = None
    if frame.get('pattern'):
        ptype_filter = frame['pattern']
    elif frame.get('prefix') and frame.get('path-list'):
        ptype_filter = generic_ga_filter_w_paths(frame['prefix'], frame['path-list'])
    elif frame.get('prefix'):
        ptype_filter = generic_ga_filter(frame['prefix'])
    elif frame.get('path-map'):
        ptype_filter = generic_ga_filter_w_paths('', frame['path-map'].keys())
    ensure(ptype_filter, "generic query processing requires a 'pattern', 'prefix' or 'path-map' key.")
    return ptype_filter
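Illustrative frames (values made up); exactly one branch should apply:

generic_query_processor(None, {'pattern': 'ga:pagePath=~^/about'})  # => the pattern, verbatim
generic_query_processor(None, {'prefix': '/events'})                # => generic_ga_filter('/events')
generic_query_processor(None, {})                                   # ensure fails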
Example #6
def fetch(pmcid_list):
    ensure(
        len(pmcid_list) <= MAX_PER_PAGE,
        "no more than %s results can be processed per-request. requested: %s" %
        (MAX_PER_PAGE, len(pmcid_list)))
    headers = {'accept': 'application/json'}
    params = {
        'dbfrom': 'pubmed',
        'linkname': 'pmc_pmc_citedby',
        'id': lmap(norm_pmcid, pmcid_list),
        'tool': 'elife-metrics',
        'email': settings.CONTACT_EMAIL,
        'retmode': 'json'
    }
    return handler.requests_get(PM_URL, params=params, headers=headers)
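A hypothetical call (pmcids made up), assuming handler.requests_get returns a requests-style response object:

resp = fetch(['PMC4559886', 'PMC4559887'])
data = resp.json()  # assumption: the response exposes .json() like requests does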
Example #7
def load_fn(dotted_path):
    try:
        dotted_path = dotted_path.strip().lower().replace('-', '_') # basic path normalisation
        package, funcname = dotted_path.rsplit('.', 1) # 'os.path.join' => 'os.path', 'join'
        module = importlib.import_module(package)
        ensure(hasattr(module, funcname),
               "could not find function %r in package %r for given path: %s" % (funcname, package, dotted_path))
        return getattr(module, funcname)
    except ImportError as err:
        # package doesn't exist
        LOG.debug(str(err))
    except AssertionError as err:
        # package exists but not the function
        LOG.debug(str(err))
    return None
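For example:

load_fn('os.path.join')    # => <function join>
load_fn('Os.Path.Join')    # same result after normalisation
load_fn('no.such.module')  # => None, the ImportError is logged at debug level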
Example #8
def query_ga(ptype, query, results_pp=MAX_GA_RESULTS, replace_cache_files=False):
    ensure(is_inrange(results_pp, 1, MAX_GA_RESULTS), "`results_pp` must be an integer between 1 and %s" % MAX_GA_RESULTS)
    sd, ed = query['start_date'], query['end_date']
    LOG.info("querying GA for %ss between %s and %s" % (ptype, sd, ed))
    dump_path = ga_core.output_path(ptype, sd, ed)
    # TODO: this settings.TESTING check is a code smell.
    if os.path.exists(dump_path) and not settings.TESTING:
        if not replace_cache_files:
            LOG.info("(cache hit)")
            with open(dump_path, 'r') as fh:
                return json.load(fh)
        # cache file will be replaced with results
        pass

    query['max_results'] = results_pp
    query['start_index'] = 1
    response = ga_core.query_ga(query)
    if not settings.TESTING:
        ga_core.write_results(response, dump_path)
    return response
Example #9
def update_article(row):
    data = {
        'doi': row['DOI'],
        'pmcid': row['PMCID'],
        'pmid': row['PMID'] or None,
    }
    ensure(data['doi'].startswith(settings.DOI_PREFIX),
           "refusing to create/update non-journal article: %s" % row)
    if not data['pmid']:
        LOG.warning("no pmid for %s" % data['doi'])

    # the doi values in the csv data look perfect and I've never had a problem with them,
    # however, we only do it once per new production machine and it doesn't hurt to check.
    # doi2msid throws an AssertionError if the doi can't be parsed.
    utils.doi2msid(data['doi'], allow_subresource=False)

    return create_or_update(models.Article,
                            data, ['doi'],
                            create=True,
                            update=True,
                            update_check=True)
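A made-up row shaped like the csv data, reusing the ids from the id-converter example further down:

row = {'DOI': '10.7554/eLife.09560', 'PMCID': 'PMC4559886', 'PMID': '26354291'}
update_article(row)  # creates or updates the models.Article row keyed on doi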
Example #10
def parse_entry(entry):
    "parses a single search result from scopus"
    try:
        citedby_link = first(lfilter(lambda d: d["@ref"] == "scopus-citedby", entry['link']))
        ensure('prism:doi' in entry, "entry is missing 'doi'!", ParseError)
        ensure('citedby-count' in entry, "entry is missing 'citedby-count'!", ParseError)
        ensure(isint(entry['citedby-count']), "citedby count isn't an integer", ParseError)

        if isinstance(entry['prism:doi'], list):
            # scopus may return a list of doi structs with the actual doi nested under a '$' key
            weird_key = "$"
            for struct in entry['prism:doi']:
                doi = struct[weird_key]
                if utils.doi2msid(doi, safe=True, allow_subresource=False):
                    entry['prism:doi'] = doi
                    break

        utils.doi2msid(entry['prism:doi'], allow_subresource=False) # throws AssertionError

        return {
            'doi': entry['prism:doi'],
            'num': int(entry['citedby-count']),
            'source': models.SCOPUS,
            'source_id': citedby_link['@href']
        }

    # errors handled here won't be caught by handler.capture_parse_error

    except AssertionError:
        LOG.warning("discarding scopus citation: failed to parse doi", extra={'response': entry})
        return {'bad': entry}

    except ParseError:
        LOG.warning("discarding scopus citation: failed to parse entry", extra={'response': entry})
        return {'bad': entry}
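A minimal, made-up scopus entry that would parse successfully, assuming isint accepts numeric strings:

entry = {
    'prism:doi': '10.7554/eLife.09560',
    'citedby-count': '42',
    'link': [{'@ref': 'scopus-citedby', '@href': 'https://www.scopus.com/...'}],
}
parse_entry(entry)
# => {'doi': '10.7554/eLife.09560', 'num': 42, 'source': models.SCOPUS, 'source_id': 'https://www.scopus.com/...'}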
Example #11
def build_ga_query(ptype, start_date=None, end_date=None, history_data=None):
    """As we go further back in history the query will change as known epochs
    overlap. These overlaps will truncate the current period to the epoch
    boundaries."""

    ensure(is_ptype(ptype), "bad page type")

    # if dates given, ensure they are date objects
    start_date and ensure(is_date(start_date), "bad start date")
    end_date and ensure(is_date(end_date), "bad end date")

    # if history data provided, ensure it validates
    if history_data:
        history_data = history.type_object.validate(history_data)

    # extract just the page type we're after
    ptype_history = history_data or history.ptype_history(ptype)
    frame_list = ptype_history['frames']

    # frames are ordered oldest to newest (asc)
    earliest_date = frame_list[0]['starts']
    latest_date = frame_list[-1]['ends']

    start_date = start_date or earliest_date
    end_date = end_date or latest_date
    ensure(start_date <= end_date, "start date %r cannot be greater than end date %r" % (start_date, end_date))

    # only those frames that overlap our start/end dates
    frame_list = interesting_frames(start_date, end_date, frame_list)

    # each timeframe requires its own pattern generation, post processing and normalisation
    query_list = [(frame, build_ga_query__queries_for_frame(ptype, frame, start_date, end_date)) for frame in frame_list]

    return query_list
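A sketch of a call, with a made-up page type and dates:

from datetime import date
query_list = build_ga_query('blog-article', start_date=date(2017, 1, 1), end_date=date(2017, 12, 31))
for frame, queries in query_list:
    ...  # one (frame, queries) pair per overlapping frame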
Example #12
def _fetch_pmids(doi):
    # article doesn't have a pmcid for whatever reason
    # go fetch one using doi
    # https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
    LOG.info("fetching pmcid for doi %s" % doi)
    params = {
        'ids': doi,
        'tool': 'elife-metrics',
        'email': settings.CONTACT_EMAIL,
        'format': 'json',
    }
    resp = requests.get(PMID_URL, params=params)
    resp.raise_for_status()

    data = resp.json()
    # response looks like:
    # {
    # "status": "ok",
    # "responseDate": "2017-01-31 13:35:10",
    # "request": "ids=10.7554%2FeLife.09560;format=json",
    # "records": [
    #   {
    #    "pmcid": "PMC4559886",
    #    "pmid": "26354291",
    #    "doi": "10.7554/eLife.09560",
    #    "versions": [
    #      {
    #       "pmcid": "PMC4559886.1",
    #       "current": "true"
    #      }
    #    ]
    #   }
    # ]
    # }
    ensure(data['status'] == 'ok', "response is not ok! %s" % data)
    return subdict(data['records'][0], ['pmid', 'pmcid'])
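For example, using the doi from the response sketch above:

_fetch_pmids('10.7554/eLife.09560')
# => {'pmid': '26354291', 'pmcid': 'PMC4559886'}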
Example #13
def page_views(pid, ptype, period=DAY):
    ensure(is_pid(pid), "bad page identifier", ValueError)
    ensure(is_ptype(ptype), "bad page type", ValueError)
    ensure(is_period(period), "bad period", ValueError)
    try:
        pobj = models.Page.objects.get(identifier=pid, type=ptype)
        dispatch = {
            DAY: daily_page_views,
            MONTH: monthly_page_views
        }
        return dispatch[period](pobj)
    except models.Page.DoesNotExist:
        return None
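A hypothetical call; the identifier and page type are made up:

views = page_views('foobar', 'event', period=DAY)  # None if no such page exists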
Example #14
def parse_map_file(frame, contents=None):
    contents and ensure(isinstance(contents, str), "'contents' must be a string")

    def _parse_line(line):
        "the file is a simple 'cat nginx-redirect-file | grep prefix > outfile'"
        line = line.strip()
        if not line:
            return
        path, redirect = line.split("' '")
        path, redirect = path.strip(" '"), redirect.strip(" ';")
        prefix = frame['redirect-prefix']
        ensure(redirect.startswith(prefix), "redirect doesn't start with redirect-prefix: %s" % line)
        # /inside-elife/foobar => foobar
        bits = redirect.strip('/').split('/', 1) # '/inside-elife/foobar' -> 'inside-elife/foobar' -> ['inside-elife', 'foobar']
        redirect = models.LANDING_PAGE if len(bits) == 1 else bits[1]
        return (path, redirect)
    if contents:
        contents = contents.splitlines()
    else:
        path = os.path.join(settings.GA_PTYPE_SCHEMA_PATH, frame['path-map-file'])
        with open(path, 'r') as fh:
            contents = fh.readlines()
    return OrderedDict(lfilter(None, lmap(_parse_line, contents)))
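Illustratively, feeding it a two-line redirect map as a string (frame values made up):

frame = {'redirect-prefix': '/inside-elife'}
contents = "'/old-path' '/inside-elife/foobar';\n'/old-home' '/inside-elife';"
parse_map_file(frame, contents)
# => OrderedDict([('/old-path', 'foobar'), ('/old-home', models.LANDING_PAGE)])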
Example #15
def ptype_history(ptype, history=None):
    history = history or load_from_file()
    ensure(ptype in history, "no historical data found: %s" % ptype,
           ValueError)
    return history[ptype]
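For example (page type made up):

frames = ptype_history('blog-article')['frames']  # raises ValueError if the page type is unknown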
Example #16
def enplumpen(artid):
    "takes an article id like e01234 and returns a DOI like 10.7554/eLife.01234"
    if isint(artid):
        return msid2doi(artid)
    ensure(artid[0] == 'e', 'cannot convert article id %s to doi' % artid)
    return artid.replace('e', '10.7554/eLife.', 1) # only replace the leading 'e'
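For example:

enplumpen('e01234')  # => '10.7554/eLife.01234'
enplumpen(1234)      # => msid2doi(1234)
enplumpen('x01234')  # assertion fails: cannot convert article id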