コード例 #1
0
 def test_num_fields(self):
     """Check the ``*_num`` fields of the metrics computed for one record.

     One bibcode is appended to both the citation and the reference cache
     entry of the record, and the resulting counts are compared with the
     lengths of the matching one-element lists in the input record.  The
     test adds nothing to the refereed cache and expects a refereed
     citation count of zero.
     """
     bibcode = "1998PPGeo..22..553A"
     other_bibcode = '1998PPGeo..22..553B'
     record = {
         'canonical': bibcode,
         'refereed': {
             'refereed': False
         },
         'author': ["Arnfield, A. L."],
         'reads': [1, 2, 3, 4],
         'download': [0, 1, 2, 3],
         'citations': ['1998PPGeo..22..553A'],
         'id': 11,
         'reference': ["1997BoLMe..85..475M"]
     }
     # Every open() inside the patch yields an empty file; iterating the
     # handle keeps calling readline() until it returns ''.
     fake_open = mock_open(read_data='')
     fake_open.return_value.__iter__ = lambda self: iter(self.readline, '')
     with patch('builtins.open', fake_open):
         with Processor(compute_metrics=True) as processor:
             Cache.get('citation')[bibcode].append(other_bibcode)
             Cache.get('reference')[bibcode].append(other_bibcode)
             metrics = processor._compute_metrics(record)
             expected_counts = (
                 ('citation_num', len(record['citations'])),
                 ('reference_num', len(record['reference'])),
                 ('author_num', len(record['author'])),
                 ('refereed_citation_num', 0),
             )
             for field, expected in expected_counts:
                 self.assertEqual(metrics[field], expected)
コード例 #2
0
    def _compute_metrics(self, d):
        """Build the metrics dict for a single nonbib record.

        The record ``d`` supplies the canonical bibcode, the author list
        (optional) and the ``reads`` / ``download`` values.  Citation,
        reference and refereed information is read from the 'citation',
        'reference' and 'refereed' caches.

        Returns a dict holding the counts, the per-citation normalisation
        records (``rn_citation_data``), their sum (``rn_citations``), the
        per-year citation rates and a modification timestamp.
        """
        bibcode = d['canonical']
        has_authors = 'author' in d and d['author']
        # a record without authors is still counted as having one author
        author_num = max(len(d['author']), 1) if has_authors else 1

        refereed = Cache.get('refereed')
        bibcode_to_references = Cache.get('reference')
        bibcode_to_cites = Cache.get('citation')

        citations = bibcode_to_cites[bibcode]
        citation_num = len(citations) if citations else 0
        reference_num = len(bibcode_to_references[bibcode])

        auth_norm = 1.0 / author_num
        rn_citation_data = []
        refereed_citations = []
        rn_citations = 0.0
        if citation_num:
            for citing in citations:
                is_refereed = citing in refereed
                citing_reference_count = len(bibcode_to_references[citing])
                # each citation weighs 1/(number of references of the citing
                # paper), with the reference count floored at 5
                ref_norm = 1.0 / float(max(5, citing_reference_count))
                rn_citations += ref_norm
                rn_citation_data.append({
                    "bibcode": citing,
                    "ref_norm": ref_norm,
                    "auth_norm": auth_norm,
                    "pubyear": int(bibcode[:4]),
                    "cityear": int(citing[:4])
                })
                if is_refereed:
                    refereed_citations.append(citing)

        refereed_citation_num = len(refereed_citations)

        # citations per year since publication; the age is at least one year
        current_year = datetime.today().year
        resource_age = float(max(1.0, current_year - int(bibcode[:4]) + 1))
        an_citations = float(citation_num) / resource_age
        an_refereed_citations = float(refereed_citation_num) / resource_age

        modtime = datetime.now()
        reads = d['reads']
        downloads = d['download']
        return {
            'bibcode': bibcode,
            'an_citations': an_citations,
            'an_refereed_citations': an_refereed_citations,
            'author_num': author_num,
            'citation_num': citation_num,
            'citations': citations,
            'downloads': downloads,
            'modtime': modtime,
            'reads': reads,
            'refereed': bibcode in refereed,
            'refereed_citations': refereed_citations,
            'refereed_citation_num': refereed_citation_num,
            'reference_num': reference_num,
            'rn_citations': rn_citations,
            'rn_citation_data': rn_citation_data
        }
コード例 #3
0
    def test_with_citations(self):
        """Check the metrics of a five-author record that has nine citations.

        The citation, reference and refereed caches are filled by hand, and
        one citing paper (1999P&SS...47..951S) is given a long reference
        list so that its ``ref_norm`` differs from the floor value used for
        citing papers with few references.
        """
        bibcode = "1997BoLMe..85..475M"
        record = {
            'canonical': bibcode,
            'refereed': {
                'refereed': True
            },
            'author': [
                "Meesters, A. G. C. A.", "Bink, N. J.", "Henneken, E. A. C.",
                "Vugts, H. F.", "Cannemeijer, F."
            ],
            'download': [],
            'reads': [],
            'citations': [
                "1998PPGeo..22..553A", "1999P&SS...47..951S",
                "2000BoLMe..97..385O", "2001MAP....78..115K",
                "2002BoLMe.103...49H", "2006QJRMS.132..779R",
                "2006QJRMS.132...61E", "2008Sci...320.1622D",
                "2016BoLMe.159..469G"
            ],
            'reference': [
                "1994BoLMe..71..393V", "1994GPC.....9...53M",
                "1994GPC.....9...53X"
            ]
        }
        # the record itself followed by every citing paper except the first
        refereed_bibcodes = [
            '1997BoLMe..85..475M', "1999P&SS...47..951S",
            "2000BoLMe..97..385O", "2001MAP....78..115K",
            "2002BoLMe.103...49H", "2006QJRMS.132..779R",
            "2006QJRMS.132...61E", "2008Sci...320.1622D",
            "2016BoLMe.159..469G"
        ]
        # references of the citing paper 1999P&SS...47..951S
        pss_references = [
            "1973JAtS...30...66B", "1973JAtS...30..749L",
            "1976JAtS...33..923B", "1977JGR....82.4121B",
            "1977JGR....82.4249K", "1977JGR....82.4559H",
            "1978Icar...33..417W", "1978JAtS...35.2346S",
            "1978JGR....83.1889D", "1979Icar...39..151H",
            "1979Icar...39..184H", "1979JGR....84.2889J",
            "1979JGR....84.2929P", "1979Natur.278..531H",
            "1981GeoRL...8..899R", "1981suma.book.....C",
            "1982JAtS...39.2701M", "1982JGR....87.9975M",
            "1982MWRv..110..994A", "1985AdSpR...5...93H",
            "1985PhDT.........2P", "1985TellA..37..156A",
            "1985wagp.book.....G", "1987MWRv..115..936Y",
            "1987MWRv..115.2214P", "1988aitb.book.....S",
            "1989BAMS...70..738B", "1990JAtS...47..612Y",
            "1990JGR....95.1359J", "1991ConAP..64..103S",
            "1992aitd.book.....H", "1992BoLMe..59..141G",
            "1992JGR....97.7781Z", "1993JAtS...50...77S",
            "1993JGR....98.3125B", "1994DPS....26.1806G",
            "1995Icar..113..277M", "1995JGR...100.5277H",
            "1995MWRv..123.1146H", "1996Icar..122...36C",
            "1996JGR...10114957S", "1996Sci...271..184S",
            "1997AdSpR..19.1241S", "1997AdSpR..19.1289M",
            "1997BoLMe..85..475M", "1997JGR...102.4463W",
            "1997Sci...278.1758S", "1998Sci...279.1686S"
        ]
        expected_first = {
            "cityear": 1998,
            "pubyear": 1997,
            "auth_norm": 0.20000000298023224,
            "bibcode": "1998PPGeo..22..553A",
            "ref_norm": 0.20000000298023224
        }
        expected_second = {
            "cityear": 1999,
            "pubyear": 1997,
            "auth_norm": 0.20000000298023224,
            "bibcode": "1999P&SS...47..951S",
            "ref_norm": 0.02083333395421505
        }

        # Every open() inside the patch yields an empty file; iterating the
        # handle keeps calling readline() until it returns ''.
        fake_open = mock_open(read_data='')
        fake_open.return_value.__iter__ = lambda self: iter(self.readline, '')
        with patch('builtins.open', fake_open):
            with Processor(compute_metrics=True) as processor:
                for citing in record['citations']:
                    Cache.get('citation')[bibcode].append(citing)
                for cited in record['reference']:
                    Cache.get('reference')[bibcode].append(cited)
                for refereed_bibcode in refereed_bibcodes:
                    Cache.get('refereed').add(refereed_bibcode)
                for cited in pss_references:
                    Cache.get('reference')['1999P&SS...47..951S'].append(
                        cited)

                metrics = processor._compute_metrics(record)

                self.assertEqual(len(metrics['citations']),
                                 len(record['citations']), 'citations check')
                self.assertEqual(metrics['refereed_citation_num'],
                                 len(refereed_bibcodes) - 1)
                self.assertEqual(metrics['refereed_citations'],
                                 refereed_bibcodes[1:])
                expected_records = (expected_first, expected_second)
                for position, expected in enumerate(expected_records):
                    self.compare_citation_data(
                        metrics['rn_citation_data'][position], expected)

                publication_year = int(record['canonical'][:4])
                age = max(1.0,
                          datetime.today().year - publication_year + 1)
                expected_rate = len(
                    metrics['refereed_citations']) / float(age)
                self.assertAlmostEqual(metrics['an_refereed_citations'],
                                       expected_rate, 5)
コード例 #4
0
ファイル: run.py プロジェクト: spacemansteve/ADSDataPipeline
def main():
    """Parse the command line and run the requested pipeline action.

    Supported sub-commands:

    * ``COMPUTE_DIFF`` -- calls ``Diff.compute()``.
    * ``PROCESS_FILE input_filename [--no-metrics]`` -- sorts the input file
      in place, then sends its non-empty lines to a ``Processor`` in batches
      of 100 bibcodes, printing a progress line every 10000 input lines.
    * ``PROCESS_BIBCODES --bibcodes B [B ...] [--no-metrics]`` -- sorts the
      bibcodes given on the command line and sends them to a ``Processor``.

    Unless ``--no-metrics`` is given, ``Cache.init()`` is called before any
    bibcodes are processed.
    """
    parser = argparse.ArgumentParser(
        description=
        'Process nonbib input data files and send data to master pipeline')
    subparsers = parser.add_subparsers(help='commands',
                                       dest="action",
                                       required=True)
    diff_parser = subparsers.add_parser(
        'COMPUTE_DIFF',
        help=
        'Compute changed bibcodes by comparing current and previous data sets.  Changed bibcodes are stored in the file ./logs/input/current/changedBibcodes.txt.'
    )
    file_parser = subparsers.add_parser(
        'PROCESS_FILE',
        help=
        'Send nonbib and metrics protobufs to master for the list of bibcodes in the provided file'
    )
    file_parser.add_argument('input_filename',
                             action='store',
                             type=str,
                             help='Path to input file, required.')
    file_parser.add_argument(
        '--no-metrics',
        action='store_false',
        dest='compute_metrics',
        help=
        'Only send nonbib protobufs to master, do not init cache or send metrics protobufs'
    )
    bibcodes_parser = subparsers.add_parser(
        'PROCESS_BIBCODES',
        help=
        'Send data to master for the bibcodes provided on the command line.')
    bibcodes_parser.add_argument('--bibcodes',
                                 action='store',
                                 default=None,
                                 dest='bibcodes',
                                 nargs='+',
                                 required=True,
                                 type=str,
                                 help='Space delimited list of bibcodess.')
    bibcodes_parser.add_argument(
        '--no-metrics',
        dest='compute_metrics',
        action='store_false',
        help=
        'Only send nonbib protobufs to master, do not init cache or send metrics protobufs.'
    )

    args = parser.parse_args()

    if args.action == 'COMPUTE_DIFF':
        Diff.compute()
    else:
        # we're dealing with PROCESS_BIBCODES or PROCESS_FILE
        if args.compute_metrics:
            Cache.init()
        if args.action == 'PROCESS_BIBCODES':
            # list.sort() sorts in place and returns None, so binding its
            # result would hand None to the processor; sorted() returns the
            # sorted list itself.
            bibcodes = sorted(args.bibcodes)
            with Processor(compute_metrics=args.compute_metrics) as processor:
                processor.process_bibcodes(bibcodes)
            print('processedbibcodes {}'.format(bibcodes))

        elif args.action == 'PROCESS_FILE':
            # NOTE(review): the filename is interpolated unquoted; if
            # Diff.execute runs this through a shell, a path containing
            # spaces or shell metacharacters will misbehave -- confirm.
            Diff.execute('sort -o {} {}'.format(args.input_filename,
                                                args.input_filename))
            # send bibcodes from file to processing in batches
            count = 0
            bibcodes = []
            with open(args.input_filename, 'r') as f, Processor(
                    compute_metrics=args.compute_metrics) as processor:
                for line in f:
                    # count tracks input lines read, blank ones included
                    if count % 10000 == 0:
                        print('{}: processed bibcodes count = {}'.format(
                            datetime.datetime.now(), count))
                    count = count + 1
                    line = line.strip()
                    if line:
                        bibcodes.append(line)
                        if len(bibcodes) % 100 == 0:
                            processor.process_bibcodes(bibcodes)
                            bibcodes = []
                # flush the final, partially filled batch
                if len(bibcodes) > 0:
                    processor.process_bibcodes(bibcodes)
            print(
                '{}: completed processing bibcodes from {}, count = {}'.format(
                    datetime.datetime.now(), args.input_filename, count))