def test_scopus_data_dumps(self):
    """similar to `test_scopus_parse_entry` but geared for data dumps generated by handler.writefile.
    these dumps are slightly different from raw scopus results pages"""
    response_fixtures = utils.listfiles(
        join(self.fixture_dir, 'scopus-responses', 'dumps'))
    for response in response_fixtures:
        # neutralise the error-capturing decorator so parse errors propagate
        with patch('article_metrics.handler.capture_parse_error',
                   return_value=lambda fn: fn):
            try:
                # context manager so the fixture file handle is closed
                # (previously `json.load(open(...))` leaked it)
                with open(response, 'r') as fh:
                    fixture = json.load(fh)['data']
                utils.lmap(citations.parse_entry, fixture)
            except BaseException:
                # identify which fixture failed, then re-raise as-is
                print('caught error in', response)
                raise
def query_processor_frame_1(ptype, frame):
    """builds the GA pagePath filter string for frame 1 of the given page type.

    combines the generic prefix filters for interview pages with an ad-hoc
    list of interview pages that live outside those prefixes.
    `ptype` and `frame` are accepted for interface parity with the other
    query processors but are unused here."""

    # one exact-match filter per ad-hoc path
    adhoc = lmap(lambda path: "ga:pagePath==%s" % path, [
        "/from-ancient-dna-to-decay-an-interview-with-jessica-metcalf",
        "/food-for-thought-an-interview-with-ana-domingos",
        # NOTE: the trailing comma below was previously missing, so python's
        # implicit string-literal concatenation silently fused this path with
        # the next one into a single bogus path.
        "/helping-to-fight-tuberculosis-an-interview-with-david-dowdy",
        '/elife-news/chemistry-versus-cancer-an-interview-with-daniel-abankwa',
        '/elife-news/connecting-the-flight-controls-an-interview-with-tarjani-agrawal',
        '/elife-news/controlling-the-immune-response-an-interview-with-donna-macduff',
        '/elife-news/controlling-traffic-an-interview-with-ramanath-hegde',
        '/elife-news/decoding-behaviour-an-interview-with-fanny-cazettes',
        '/elife-news/developing-kidneys-an-interview-with-peter-hohenstein',
        '/elife-news/getting-under-the-skin-an-interview-with-elena-oancea',
        '/elife-news/helping-the-neighbours-an-interview-with-meredith-schuman',
        '/elife-news/imprinting-memories-an-interview-with-katja-kornysheva',
        '/elife-news/infection-statistics-and-public-health-an-interview-with-alicia-rosello',
        '/elife-news/looking-at-lipids-an-interview-with-jessica-hughes',
        '/elife-news/modelling-metabolism-an-interview-with-keren-yizhak',
        '/elife-news/of-plants-and-parasites-an-interview-with-yong-woo',
        '/elife-news/repeating-the-message-an-interview-with-yunsheng-cheng',
        '/elife-news/the-benefits-of-new-brain-cells-an-interview-with-antonia-marin-burgin',
        '/elife-news/the-regeneration-game-an-interview-with-brian-bradshaw',
        '/elife-news/understanding-the-evolution-of-defence-an-interview-with-maurijn-van-der-zee',
    ])

    # prefix filters covering the regular interview locations
    interviews = [
        generic_ga_filter('/early-careers-interviews'),
        generic_ga_filter('/interviews')
    ]

    # GA 'OR' is a comma-separated list of filters
    query = ",".join(interviews + adhoc)
    return query
# Example #3
def event_counts(row_list):
    "parses the list of rows returned by google to extract the doi and its count"

    def parse(row):
        # each row is a pair: ('<doi>::<label>', '<count>')
        label, count = row
        return label.split('::')[0], int(count)

    # dict() consumes the iterator directly; no need to materialise a list.
    # duplicate dois keep the last value seen, as before.
    return dict(map(parse, row_list))
# Example #4
def insert_metrics(list_of_rows):
    "creates PageType/Page/PageCount records for each (pid, ptype, date, views) row"
    def _create(row):
        page_id, page_type, date_stamp, num_views = row
        ptype_obj, _ = models.PageType.objects.get_or_create(name=page_type)
        page_obj, _ = models.Page.objects.get_or_create(type=ptype_obj, identifier=page_id)
        count_obj, _ = models.PageCount.objects.get_or_create(page=page_obj, views=num_views, date=date_stamp)
        return (ptype_obj, page_obj, count_obj)
    return lmap(_create, list_of_rows)
# Example #5
def frames_wrangler(frame_list):
    """normalises a list of frame dicts and returns them ordered by start date.

    a falsy 'starts' defaults to the inception date and a falsy 'ends'
    defaults to today, i.e. open-ended frames extend to the present.
    NOTE: frames are modified in place."""
    def _fill_empties(frame):
        frame['starts'] = frame['starts'] or settings.INCEPTION.date()
        frame['ends'] = frame['ends'] or date.today()
        return frame

    frame_list = [_fill_empties(frame) for frame in frame_list]
    frame_list = sorted(frame_list, key=lambda f: f['starts'])  # ASC

    # TODO: ensure no overlaps between frames

    return frame_list
 def test_scopus_parse_entry(self):
     "citations.parse_entry can handle all known fixtures"
     fixture_paths = [
         join(self.fixture_dir, 'scopus-responses/search-p1.json'),
         join(self.fixture_dir, 'scopus-responses/search-p2.json')
     ]
     response_fixtures = []
     for path in fixture_paths:
         # context manager so the fixture file handle is closed
         # (previously `json.load(open(...))` leaked it)
         with open(path, 'r') as fh:
             response_fixtures.append(json.load(fh))
     res = citations.all_entries(response_fixtures)
     # each fixture is one full scopus search results page of 25 entries
     per_page = 25
     expected_entries = per_page * len(response_fixtures)
     self.assertEqual(expected_entries, len(res))
# Example #7
def fetch_parse(pmcid_list):
    "pages through all results for a list of PMC ids (can be just one) and parses the results."
    linksets = []

    for page_num, id_chunk in enumerate(
            utils.paginate(pmcid_list, MAX_PER_PAGE)):
        LOG.debug("page %s, %s per-page", page_num + 1, MAX_PER_PAGE)

        response = fetch(id_chunk)
        # each response contributes a list of maps; gather them all ...
        linksets += response.json()["linksets"]

    # ... so everything is parsed in a single pass at the end.
    return lmap(parse_result, linksets)
# Example #8
def fetch(pmcid_list):
    "requests the pubmed 'citedby' data for a single chunk of PMC ids"
    num_ids = len(pmcid_list)
    ensure(
        num_ids <= MAX_PER_PAGE,
        "no more than %s results can be processed per-request. requested: %s" %
        (MAX_PER_PAGE, num_ids))
    params = {
        'dbfrom': 'pubmed',
        'linkname': 'pmc_pmc_citedby',
        'id': lmap(norm_pmcid, pmcid_list),
        'tool': 'elife-metrics',
        'email': settings.CONTACT_EMAIL,
        'retmode': 'json'
    }
    headers = {'accept': 'application/json'}
    return handler.requests_get(PM_URL, params=params, headers=headers)
# Example #9
def update_page_counts(ptype, page_counts):
    "persists a list of page-count rows under the given page type, updating existing counts in place"
    ptype_obj = first(create_or_update(models.PageType, {"name": ptype}, update=False))

    def _store(row):
        page_obj = first(create_or_update(models.Page, {
            'type': ptype_obj,
            'identifier': row['identifier'],
        }, update=False))

        count_data = {
            'page': page_obj,
            'views': row['views'],
            'date': row['date']
        }
        # (page, date) uniquely identify a count; the views value is updated
        return first(create_or_update(models.PageCount, count_data, ['page', 'date'], update=True))

    return lmap(_store, page_counts)
# Example #10
def parse_map_file(frame, contents=None):
    """parses an nginx path->redirect map for the given frame into an
    OrderedDict of {path: target}. if `contents` is given it is parsed
    directly, otherwise the frame's 'path-map-file' is read from disk."""
    if contents:
        # fixed stray trailing apostrophe in the error message
        ensure(isinstance(contents, str), "'contents' must be a string")

    def _parse_line(line):
        "the file is a simple 'cat nginx-redirect-file | grep prefix > outfile'"
        line = line.strip()
        if not line:
            # blank lines become None and are dropped by lfilter below
            return
        path, redirect = line.split("' '")
        path, redirect = path.strip(" '"), redirect.strip(" ';")
        prefix = frame['redirect-prefix']
        ensure(redirect.startswith(prefix), "redirect doesn't start with redirect-prefix: %s" % line)
        # /inside-elife/foobar => foobar
        bits = redirect.strip('/').split('/', 1) # '/inside-elife/foobar' -> 'inside-elife/foobar' -> ['inside-elife, 'foobar']
        redirect = models.LANDING_PAGE if len(bits) == 1 else bits[1]
        return (path, redirect)

    if contents:
        contents = contents.splitlines()
    else:
        path = os.path.join(settings.GA_PTYPE_SCHEMA_PATH, frame['path-map-file'])
        # context manager so the map file handle is closed promptly
        # (previously `open(path).readlines()` leaked it)
        with open(path, 'r') as fh:
            contents = fh.readlines()
    return OrderedDict(lfilter(None, lmap(_parse_line, contents)))
# Example #11
def bulk_query(query_list):
    "executes each query in turn, writing results, and returns the outcomes as a list"
    return [core.query_ga_write_results(query) for query in query_list]
# Example #12
    def test_build_ga_query_multiple_frames(self):
        "a query for a date range that overlaps epochs generates the correct queries"
        # fixed reference dates spanning the boundary between the two frames
        midJan18 = date(year=2018, month=1, day=15)
        midDec17 = date(year=2017, month=12, day=15)
        one_day = timedelta(days=1)
        two_days = timedelta(days=2)
        to_day = date.today()

        # two frames (given out of order): frame 1 covers midDec17 up to the
        # day before midJan18; frame 2 is open-ended ('ends': None) from midJan18
        history_data = {
            'frames': [{
                'id': 2,
                'ends': None,
                'starts': midJan18,
                'pattern': '/new/pants'
            }, {
                'id': 1,
                'ends': midJan18 - one_day,
                'starts': midDec17,
                'pattern': '/old/pants'
            }]
        }

        # starts/ends just outside frame boundaries
        starts = midDec17 - two_days
        ends = midJan18 + two_days

        ql = logic.build_ga_query(models.EVENT, starts, ends, history_data)

        frame_list = lmap(first,
                          ql)  # just the frames and not the queries for now

        # frames are not modified after being validated/coerced
        # (validation coerces the integer ids to strings, resolves the open
        # 'ends' to today, and sorts the frames chronologically)
        expected_frames = [{
            'id': '1',
            'starts': midDec17,
            'ends': midJan18 - one_day,
            'pattern': '/old/pants'
        }, {
            'id': '2',
            'starts': midJan18,
            'ends': to_day,
            'pattern': '/new/pants'
        }]
        self.assertEqual(frame_list, expected_frames)

        expected_query_dates = [
            # first query: starts and ends on frame boundaries, ignoring explicit start date
            {
                'start_date': midDec17,
                'end_date': midJan18 - one_day,
                'pattern': '/old/pants'
            },  # id=1

            # second query: starts on frame boundary and ends on explicit end date
            {
                'start_date': midJan18,
                'end_date': ends,
                'pattern': '/new/pants'
            },  # id=2
        ]
        for expected, query in zip(expected_query_dates, lmap(second, ql)):
            # compare only the date range and filter pattern of each query
            subquery = subdict(query, ['start_date', 'end_date', 'filters'])
            utils.renkeys(subquery, [('filters', 'pattern')])
            self.assertEqual(subquery, expected)
# Example #13
def parse_results(search_result):
    "parses citation counts from a page of search results from scopus"
    return [parse_entry(entry) for entry in search_result['entry']]
# Example #14
def load_csv(path):
    "reads the CSV at `path` and applies update_article to every row"
    with open(path, 'r') as fh:
        rows = csv.DictReader(fh)
        # materialise the list while the file is still open
        return [update_article(row) for row in rows]
# Example #15
def count_for_qs(qs):
    "resolves each item in the queryset to a PMC id, then fetches and processes their citation data"
    pmcid_list = lmap(resolve_pmcid, qs)
    return process_results(fetch_parse(pmcid_list))