def test_scopus_data_dumps(self):
    """similar to `test_scopus_parse_entry` but geared for data dumps generated by
    handler.writefile. these dumps are slightly different from raw scopus results pages"""
    response_fixtures = utils.listfiles(
        join(self.fixture_dir, 'scopus-responses', 'dumps'))
    for response in response_fixtures:
        # neutralise the error-capture decorator so parse errors propagate to the test
        with patch('article_metrics.handler.capture_parse_error', return_value=lambda fn: fn):
            try:
                # use a context manager so the fixture file is closed promptly
                # (the original `json.load(open(response, 'r'))` leaked the handle)
                with open(response, 'r') as fh:
                    fixture = json.load(fh)['data']
                utils.lmap(citations.parse_entry, fixture)
            except BaseException:
                # identify the offending fixture, then re-raise with a bare
                # `raise` to preserve the original traceback
                print('caught error in', response)
                raise
def query_processor_frame_1(ptype, frame):
    """builds the GA filter string for the interviews page-type: two prefix
    filters plus a fixed list of ad-hoc interview paths."""
    # BUG FIX: the original list was missing a comma after the
    # '...-david-dowdy' entry, so Python's implicit string-literal
    # concatenation silently fused it with the '/elife-news/chemistry-...'
    # path into a single bogus path, dropping both from the filter.
    adhoc = lmap(lambda path: "ga:pagePath==%s" % path, [
        "/from-ancient-dna-to-decay-an-interview-with-jessica-metcalf",
        "/food-for-thought-an-interview-with-ana-domingos",
        "/helping-to-fight-tuberculosis-an-interview-with-david-dowdy",
        '/elife-news/chemistry-versus-cancer-an-interview-with-daniel-abankwa',
        '/elife-news/connecting-the-flight-controls-an-interview-with-tarjani-agrawal',
        '/elife-news/controlling-the-immune-response-an-interview-with-donna-macduff',
        '/elife-news/controlling-traffic-an-interview-with-ramanath-hegde',
        '/elife-news/decoding-behaviour-an-interview-with-fanny-cazettes',
        '/elife-news/developing-kidneys-an-interview-with-peter-hohenstein',
        '/elife-news/getting-under-the-skin-an-interview-with-elena-oancea',
        '/elife-news/helping-the-neighbours-an-interview-with-meredith-schuman',
        '/elife-news/imprinting-memories-an-interview-with-katja-kornysheva',
        '/elife-news/infection-statistics-and-public-health-an-interview-with-alicia-rosello',
        '/elife-news/looking-at-lipids-an-interview-with-jessica-hughes',
        '/elife-news/modelling-metabolism-an-interview-with-keren-yizhak',
        '/elife-news/of-plants-and-parasites-an-interview-with-yong-woo',
        '/elife-news/repeating-the-message-an-interview-with-yunsheng-cheng',
        '/elife-news/the-benefits-of-new-brain-cells-an-interview-with-antonia-marin-burgin',
        '/elife-news/the-regeneration-game-an-interview-with-brian-bradshaw',
        '/elife-news/understanding-the-evolution-of-defence-an-interview-with-maurijn-van-der-zee',
    ])
    interviews = [
        generic_ga_filter('/early-careers-interviews'),
        generic_ga_filter('/interviews')
    ]
    # GA ORs comma-separated filter expressions together
    query = ",".join(interviews + adhoc)
    return query
def event_counts(row_list):
    "parses the list of rows returned by google to extract the doi and its count"
    # each row is a (label, count) pair where the label looks like '<doi>::<suffix>';
    # keep the doi portion and coerce the count to an int
    return {label.split('::')[0]: int(count) for label, count in row_list}
def insert_metrics(list_of_rows):
    "creates (or finds) PageType, Page and PageCount rows for each (pid, ptype, date, views) tuple given."
    def _insert(row):
        pid, ptype_name, date, views = row
        # get_or_create returns an (object, created?) pair; only the object is kept
        ptype_obj, _ = models.PageType.objects.get_or_create(name=ptype_name)
        page_obj, _ = models.Page.objects.get_or_create(type=ptype_obj, identifier=pid)
        count_obj, _ = models.PageCount.objects.get_or_create(page=page_obj, views=views, date=date)
        return (ptype_obj, page_obj, count_obj)
    return [_insert(row) for row in list_of_rows]
def frames_wrangler(frame_list):
    "normalises a list of frames: open boundaries are filled in and the frames are ordered chronologically."
    def _fill_boundaries(frame):
        # an empty 'starts' means 'since the beginning of time'
        if not frame['starts']:
            frame['starts'] = settings.INCEPTION.date()
        # an empty 'ends' means 'up until today'
        if not frame['ends']:
            frame['ends'] = date.today()
        return frame
    filled = [_fill_boundaries(frame) for frame in frame_list]
    # TODO: ensure no overlaps between frames
    return sorted(filled, key=lambda frame: frame['starts'])  # ASC
def test_scopus_parse_entry(self):
    "citations.parse_entry can handle all known fixtures"
    response_fixtures = [
        join(self.fixture_dir, 'scopus-responses/search-p1.json'),
        join(self.fixture_dir, 'scopus-responses/search-p2.json')
    ]
    def load_fixture(path):
        # use a context manager so the fixture file is closed promptly
        # (the original `json.load(open(path, 'r'))` leaked the handle)
        with open(path, 'r') as fh:
            return json.load(fh)
    response_fixtures = lmap(load_fixture, response_fixtures)
    res = citations.all_entries(response_fixtures)
    # each fixture is a full scopus results page of 25 entries
    per_page = 25
    expected_entries = per_page * len(response_fixtures)
    self.assertEqual(expected_entries, len(res))
def fetch_parse(pmcid_list):
    "pages through all results for a list of PMC ids (can be just one) and parses the results."
    results = []
    pages = utils.paginate(pmcid_list, MAX_PER_PAGE)
    for page_num, id_chunk in enumerate(pages, start=1):
        LOG.debug("page %s, %s per-page", page_num, MAX_PER_PAGE)
        # 'linksets' is a list of maps; accumulate them all into one list ...
        results.extend(fetch(id_chunk).json()["linksets"])
    # ... and parse them in a single pass.
    return lmap(parse_result, results)
def fetch(pmcid_list):
    "requests the pubmed 'cited-by' linkset for the given list of PMC ids (one request, capped at MAX_PER_PAGE ids)."
    num_requested = len(pmcid_list)
    ensure(
        num_requested <= MAX_PER_PAGE,
        "no more than %s results can be processed per-request. requested: %s" % (MAX_PER_PAGE, num_requested))
    # param order preserved so the outgoing query string is unchanged
    params = {
        'dbfrom': 'pubmed',
        'linkname': 'pmc_pmc_citedby',
        'id': lmap(norm_pmcid, pmcid_list),
        'tool': 'elife-metrics',
        'email': settings.CONTACT_EMAIL,
        'retmode': 'json',
    }
    return handler.requests_get(PM_URL, params=params, headers={'accept': 'application/json'})
def update_page_counts(ptype, page_counts):
    "creates-or-updates a PageCount row for every entry in `page_counts` under the given page type."
    # page types are created on demand but never modified
    ptype_obj = first(create_or_update(models.PageType, {"name": ptype}, update=False))
    def _upsert(row):
        # pages are also create-only
        page_obj = first(create_or_update(models.Page, {
            'type': ptype_obj,
            'identifier': row['identifier'],
        }, update=False))
        # counts for an existing (page, date) pair are overwritten
        return first(create_or_update(models.PageCount, {
            'page': page_obj,
            'views': row['views'],
            'date': row['date'],
        }, ['page', 'date'], update=True))
    return [_upsert(row) for row in page_counts]
def parse_map_file(frame, contents=None):
    """parses the nginx redirect map file referenced by `frame` into an ordered
    {path: redirect-slug} map. if `contents` (a string) is given it is parsed
    instead of reading the file from disk."""
    # explicit form of the original `contents and ensure(...)` short-circuit:
    # validation only happens when contents is truthy (non-empty)
    if contents:
        ensure(isinstance(contents, str), "'contents' must be a string'")
    def _parse_line(line):
        "the file is a simple 'cat nginx-redirect-file | grep prefix > outfile'"
        line = line.strip()
        if not line:
            # blank lines become None and are filtered out below
            return
        path, redirect = line.split("' '")
        path, redirect = path.strip(" '"), redirect.strip(" ';")
        prefix = frame['redirect-prefix']
        ensure(redirect.startswith(prefix), "redirect doesn't start with redirect-prefix: %s" % line)
        # /inside-elife/foobar => foobar
        bits = redirect.strip('/').split('/', 1) # '/inside-elife/foobar' -> 'inside-elife/foobar' -> ['inside-elife, 'foobar']
        # a bare prefix with no trailing path maps to the landing page
        redirect = models.LANDING_PAGE if len(bits) == 1 else bits[1]
        return (path, redirect)
    if contents:
        contents = contents.splitlines()
    else:
        path = os.path.join(settings.GA_PTYPE_SCHEMA_PATH, frame['path-map-file'])
        # read via a context manager so the file handle is closed promptly
        # (the original `open(path, 'r').readlines()` leaked the handle)
        with open(path, 'r') as fh:
            contents = fh.readlines()
    return OrderedDict(lfilter(None, lmap(_parse_line, contents)))
def bulk_query(query_list):
    "executes a list of queries"
    # run each query in order, writing results as a side effect of core.query_ga_write_results
    return [core.query_ga_write_results(query) for query in query_list]
def test_build_ga_query_multiple_frames(self):
    "a query for a date range that overlaps epochs generates the correct queries"
    midJan18 = date(year=2018, month=1, day=15)
    midDec17 = date(year=2017, month=12, day=15)
    one_day = timedelta(days=1)
    two_days = timedelta(days=2)
    to_day = date.today()
    # two contiguous frames: frame 1 ('/old/pants') ends the day before
    # frame 2 ('/new/pants') starts. frame 2's 'ends' is open (None) and is
    # expected to be filled in with today's date.
    history_data = {
        'frames': [{
            'id': 2, 'ends': None, 'starts': midJan18, 'pattern': '/new/pants'
        }, {
            'id': 1, 'ends': midJan18 - one_day, 'starts': midDec17, 'pattern': '/old/pants'
        }]
    }
    # starts/ends just outside frame boundaries
    starts = midDec17 - two_days
    ends = midJan18 + two_days
    ql = logic.build_ga_query(models.EVENT, starts, ends, history_data)
    frame_list = lmap(first, ql) # just the frames and not the queries for now
    # frames are not modified after being validated/coerced
    # NOTE(review): presumably coercion stringifies the frame ids ('1', '2'),
    # sorts frames chronologically and fills the open 'ends' with today
    expected_frames = [{
        'id': '1', 'starts': midDec17, 'ends': midJan18 - one_day, 'pattern': '/old/pants'
    }, {
        'id': '2', 'starts': midJan18, 'ends': to_day, 'pattern': '/new/pants'
    }]
    self.assertEqual(frame_list, expected_frames)
    expected_query_dates = [
        # first query: starts and ends on frame boundaries, ignoring explicit start date
        {
            'start_date': midDec17, 'end_date': midJan18 - one_day, 'pattern': '/old/pants'
        }, # id=1
        # second query: starts on frame boundary and ends on explicit end date
        {
            'start_date': midJan18, 'end_date': ends, 'pattern': '/new/pants'
        }, # id=2
    ]
    # compare only the date range and filter of each generated query,
    # renaming the query's 'filters' key to 'pattern' to match the expectation
    for expected, query in zip(expected_query_dates, lmap(second, ql)):
        subquery = subdict(query, ['start_date', 'end_date', 'filters'])
        utils.renkeys(subquery, [('filters', 'pattern')])
        self.assertEqual(subquery, expected)
def parse_results(search_result):
    "parses citation counts from a page of search results from scopus"
    entries = search_result['entry']
    return [parse_entry(entry) for entry in entries]
def load_csv(path):
    "reads the CSV file at `path`, passing each row (as a dict) through `update_article`."
    with open(path, 'r') as fh:
        return [update_article(row) for row in csv.DictReader(fh)]
def count_for_qs(qs):
    "resolves a PMC id for each object in `qs`, then fetches, parses and processes their citation counts."
    pmcid_list = [resolve_pmcid(obj) for obj in qs]
    return process_results(fetch_parse(pmcid_list))