def test_num_fields(self):
    """Metrics for a minimal record: verify the simple count fields.

    Seeds the citation/reference caches with one entry each and checks
    that citation_num, reference_num, author_num and refereed_citation_num
    come back consistent with the input record.
    """
    record = {
        'canonical': "1998PPGeo..22..553A",
        'refereed': {'refereed': False},
        'author': ["Arnfield, A. L."],
        'reads': [1, 2, 3, 4],
        'download': [0, 1, 2, 3],
        'citations': ['1998PPGeo..22..553A'],
        'id': 11,
        'reference': ["1997BoLMe..85..475M"],
    }
    # Patch open() so Processor's file I/O is a no-op during the test.
    opener = mock_open(read_data='')
    opener.return_value.__iter__ = lambda self: iter(self.readline, '')
    with patch('builtins.open', opener), Processor(compute_metrics=True) as processor:
        # One (non-refereed) citation and one reference in the cache.
        Cache.get('citation')['1998PPGeo..22..553A'].append('1998PPGeo..22..553B')
        Cache.get('reference')['1998PPGeo..22..553A'].append('1998PPGeo..22..553B')

        metrics = processor._compute_metrics(record)

        self.assertEqual(metrics['citation_num'], len(record['citations']))
        self.assertEqual(metrics['reference_num'], len(record['reference']))
        self.assertEqual(metrics['author_num'], len(record['author']))
        # The refereed cache was never populated, so no refereed citations.
        self.assertEqual(metrics['refereed_citation_num'], 0)
def _compute_metrics(self, d):
    """Compute the metrics dict for the full nonbib record *d* and the cache.

    Combines the record's canonical bibcode with the shared 'refereed',
    'reference' and 'citation' caches to produce citation counts,
    refereed-citation counts, per-citation normalized data and
    annualized citation rates.
    """
    bibcode = d['canonical']

    # Author count is floored at 1 so the auth_norm divisor is never zero.
    author_num = max(len(d['author']), 1) if d.get('author') else 1

    refereed = Cache.get('refereed')
    bibcode_to_references = Cache.get('reference')
    bibcode_to_cites = Cache.get('citation')

    citations = bibcode_to_cites[bibcode]
    citation_num = len(citations) if citations else 0
    reference_num = len(bibcode_to_references[bibcode])

    pub_year = int(bibcode[:4])
    citations_json_records = []
    refereed_citations = []
    total_normalized_citations = 0.0
    if citation_num:
        for citation_bibcode in citations:
            # Normalize by the citing paper's reference count, floored at 5
            # so papers with very short reference lists don't dominate.
            citing_reference_count = len(bibcode_to_references[citation_bibcode])
            ref_norm = 1.0 / float(max(5, citing_reference_count))
            total_normalized_citations += ref_norm
            citations_json_records.append({
                "bibcode": citation_bibcode,
                "ref_norm": ref_norm,
                "auth_norm": 1.0 / author_num,
                "pubyear": pub_year,
                "cityear": int(citation_bibcode[:4]),
            })
            if citation_bibcode in refereed:
                refereed_citations.append(citation_bibcode)
    refereed_citation_num = len(refereed_citations)

    # Annual citation rates: the record's age is at least one year.
    today = datetime.today()
    resource_age = max(1.0, today.year - pub_year + 1)
    an_citations = float(citation_num) / float(resource_age)
    an_refereed_citations = float(refereed_citation_num) / float(resource_age)

    return {
        'bibcode': bibcode,
        'an_citations': an_citations,
        'an_refereed_citations': an_refereed_citations,
        'author_num': author_num,
        'citation_num': citation_num,
        'citations': citations,
        'downloads': d['download'],
        'modtime': datetime.now(),
        'reads': d['reads'],
        # Refereed status comes from the cache, not from d['refereed'].
        'refereed': bibcode in refereed,
        'refereed_citations': refereed_citations,
        'refereed_citation_num': refereed_citation_num,
        'reference_num': reference_num,
        'rn_citations': total_normalized_citations,
        'rn_citation_data': citations_json_records,
    }
def test_with_citations(self):
    """End-to-end metrics check for a record with nine citations.

    Seeds the citation, reference and refereed caches, then verifies
    refereed-citation counts, per-citation normalized data (exact float32
    artifacts expected by compare_citation_data) and the annualized
    refereed-citation rate.
    """
    d = {
        'canonical': "1997BoLMe..85..475M",
        'refereed': {'refereed': True},
        'author': [
            "Meesters, A. G. C. A.", "Bink, N. J.", "Henneken, E. A. C.",
            "Vugts, H. F.", "Cannemeijer, F."
        ],
        'download': [],
        'reads': [],
        'citations': [
            "1998PPGeo..22..553A", "1999P&SS...47..951S",
            "2000BoLMe..97..385O", "2001MAP....78..115K",
            "2002BoLMe.103...49H", "2006QJRMS.132..779R",
            "2006QJRMS.132...61E", "2008Sci...320.1622D",
            "2016BoLMe.159..469G"
        ],
        'reference': [
            "1994BoLMe..71..393V", "1994GPC.....9...53M",
            "1994GPC.....9...53X"
        ]
    }
    # Patch open() so Processor's file I/O is a no-op during the test.
    m = mock_open(read_data='')
    m.return_value.__iter__ = lambda self: iter(self.readline, '')
    with patch('builtins.open', m), Processor(compute_metrics=True) as processor:
        # Mirror the record's citations/references into the shared cache,
        # which is what _compute_metrics actually reads.
        for bib in d['citations']:
            Cache.get('citation')['1997BoLMe..85..475M'].append(bib)
        for bib in d['reference']:
            Cache.get('reference')['1997BoLMe..85..475M'].append(bib)
        # All citations except the first ("1998PPGeo..22..553A") are refereed;
        # the record itself is also marked refereed.
        refereed = [
            '1997BoLMe..85..475M', "1999P&SS...47..951S",
            "2000BoLMe..97..385O", "2001MAP....78..115K",
            "2002BoLMe.103...49H", "2006QJRMS.132..779R",
            "2006QJRMS.132...61E", "2008Sci...320.1622D",
            "2016BoLMe.159..469G"
        ]
        for bib in refereed:
            Cache.get('refereed').add(bib)
        # Reference list for citing paper 1999P&SS...47..951S (48 entries),
        # used to exercise the ref_norm = 1/len(references) path.
        PSSreferences = [
            "1973JAtS...30...66B", "1973JAtS...30..749L",
            "1976JAtS...33..923B", "1977JGR....82.4121B",
            "1977JGR....82.4249K", "1977JGR....82.4559H",
            "1978Icar...33..417W", "1978JAtS...35.2346S",
            "1978JGR....83.1889D", "1979Icar...39..151H",
            "1979Icar...39..184H", "1979JGR....84.2889J",
            "1979JGR....84.2929P", "1979Natur.278..531H",
            "1981GeoRL...8..899R", "1981suma.book.....C",
            "1982JAtS...39.2701M", "1982JGR....87.9975M",
            "1982MWRv..110..994A", "1985AdSpR...5...93H",
            "1985PhDT.........2P", "1985TellA..37..156A",
            "1985wagp.book.....G", "1987MWRv..115..936Y",
            "1987MWRv..115.2214P", "1988aitb.book.....S",
            "1989BAMS...70..738B", "1990JAtS...47..612Y",
            "1990JGR....95.1359J", "1991ConAP..64..103S",
            "1992aitd.book.....H", "1992BoLMe..59..141G",
            "1992JGR....97.7781Z", "1993JAtS...50...77S",
            "1993JGR....98.3125B", "1994DPS....26.1806G",
            "1995Icar..113..277M", "1995JGR...100.5277H",
            "1995MWRv..123.1146H", "1996Icar..122...36C",
            "1996JGR...10114957S", "1996Sci...271..184S",
            "1997AdSpR..19.1241S", "1997AdSpR..19.1289M",
            "1997BoLMe..85..475M", "1997JGR...102.4463W",
            "1997Sci...278.1758S", "1998Sci...279.1686S"
        ]
        for bib in PSSreferences:
            Cache.get('reference')['1999P&SS...47..951S'].append(bib)

        met = processor._compute_metrics(d)

        self.assertEqual(len(met['citations']), len(d['citations']),
                         'citations check')
        # First entry of `refereed` is the record itself, not a citation.
        self.assertEqual(met['refereed_citation_num'], len(refereed) - 1)
        self.assertEqual(met['refereed_citations'], refereed[1:])
        # 1998PPGeo..22..553A has no cached references, so ref_norm uses the
        # floor of 5 (1/5); auth_norm is 1/5 for the five authors.
        rn_citation_data = {
            "cityear": 1998,
            "pubyear": 1997,
            "auth_norm": 0.20000000298023224,
            "bibcode": "1998PPGeo..22..553A",
            "ref_norm": 0.20000000298023224
        }
        # 1999P&SS...47..951S has 48 cached references, so ref_norm is 1/48.
        rn_citation_data1 = {
            "cityear": 1999,
            "pubyear": 1997,
            "auth_norm": 0.20000000298023224,
            "bibcode": "1999P&SS...47..951S",
            "ref_norm": 0.02083333395421505
        }
        self.compare_citation_data(met['rn_citation_data'][0],
                                   rn_citation_data)
        self.compare_citation_data(met['rn_citation_data'][1],
                                   rn_citation_data1)
        # Annualized refereed-citation rate depends on the current year,
        # so recompute the expected age here instead of hard-coding it.
        y = int(d['canonical'][:4])
        today = datetime.today()
        age = max(1.0, today.year - y + 1)
        self.assertAlmostEqual(met['an_refereed_citations'],
                               len(met['refereed_citations']) / float(age),
                               5)
def main():
    """Parse command-line arguments and run the requested nonbib action.

    Subcommands:
      COMPUTE_DIFF     -- compute changed bibcodes between current and
                          previous data sets.
      PROCESS_FILE     -- send nonbib/metrics protobufs for bibcodes listed
                          in a file, in batches of 100.
      PROCESS_BIBCODES -- send protobufs for bibcodes given on the
                          command line.
    """
    parser = argparse.ArgumentParser(
        description=
        'Process nonbib input data files and send data to master pipeline')
    subparsers = parser.add_subparsers(help='commands',
                                       dest="action",
                                       required=True)
    subparsers.add_parser(
        'COMPUTE_DIFF',
        help=
        'Compute changed bibcodes by comparing current and previous data sets. Changed bibcodes are stored in the file ./logs/input/current/changedBibcodes.txt.'
    )
    file_parser = subparsers.add_parser(
        'PROCESS_FILE',
        help=
        'Send nonbib and metrics protobufs to master for the list of bibcodes in the provided file'
    )
    file_parser.add_argument('input_filename',
                             action='store',
                             type=str,
                             help='Path to input file, required.')
    file_parser.add_argument(
        '--no-metrics',
        action='store_false',
        dest='compute_metrics',
        help=
        'Only send nonbib protobufs to master, do not init cache or send metrics protobufs'
    )
    bibcodes_parser = subparsers.add_parser(
        'PROCESS_BIBCODES',
        help=
        'Send data to master for the bibcodes provided on the command line.')
    bibcodes_parser.add_argument('--bibcodes',
                                 action='store',
                                 default=None,
                                 dest='bibcodes',
                                 nargs='+',
                                 required=True,
                                 type=str,
                                 help='Space delimited list of bibcodess.')
    bibcodes_parser.add_argument(
        '--no-metrics',
        dest='compute_metrics',
        action='store_false',
        help=
        'Only send nonbib protobufs to master, do not init cache or send metrics protobufs.'
    )

    args = parser.parse_args()
    if args.action == 'COMPUTE_DIFF':
        Diff.compute()
    else:
        # PROCESS_BIBCODES and PROCESS_FILE both need the cache when
        # metrics are being computed.
        if args.compute_metrics:
            Cache.init()
        if args.action == 'PROCESS_BIBCODES':
            # BUG FIX: list.sort() sorts in place and returns None, so the
            # original `bibcodes = args.bibcodes.sort()` passed None to
            # process_bibcodes. Use sorted() to get the sorted list.
            bibcodes = sorted(args.bibcodes)
            with Processor(compute_metrics=args.compute_metrics) as processor:
                processor.process_bibcodes(bibcodes)
            print('processedbibcodes {}'.format(bibcodes))
        elif args.action == 'PROCESS_FILE':
            # Sort the input file in place so batching is deterministic.
            Diff.execute('sort -o {} {}'.format(args.input_filename,
                                                args.input_filename))
            # Send bibcodes from the file to processing in batches of 100,
            # logging progress every 10000 lines.
            count = 0
            bibcodes = []
            with open(args.input_filename, 'r') as f, Processor(
                    compute_metrics=args.compute_metrics) as processor:
                for line in f:
                    if count % 10000 == 0:
                        print('{}: processed bibcodes count = {}'.format(
                            datetime.datetime.now(), count))
                    count = count + 1
                    line = line.strip()
                    if line:
                        bibcodes.append(line)
                        if len(bibcodes) % 100 == 0:
                            processor.process_bibcodes(bibcodes)
                            bibcodes = []
                # Flush the final partial batch.
                if len(bibcodes) > 0:
                    processor.process_bibcodes(bibcodes)
            print(
                '{}: completed processing bibcodes from {}, count = {}'.format(
                    datetime.datetime.now(), args.input_filename, count))