def test_return_all_matches(self):
        aff_pickle_filename = stubdata_dir + '/aff.pickle'
        (adict, cdict) = utils.load_affil_dict(aff_pickle_filename)
        clause_pickle_filename = stubdata_dir + '/clause.pickle'
        clause_dict = utils.load_clause_dict(clause_pickle_filename)

        matcher = app.ADSAffilCelery('fnord')
        matcher.adict = adict
        matcher.cdict = cdict
        matcher.clausedict = clause_dict

        rec = 'Department of Physics, University of Hanging Out and Listening to Jazz Records, Merced, CA'

        matcher.crit = 0.0
        expected_record = {
            'A05210': 0.25,
            'A11557': 0.25,
            'A00921': 0.25,
            'A02585': 0.25,
            'A03439': 0.75
        }
        output_record = matcher.find_matches(rec)
        self.assertEqual(output_record, expected_record)

        matcher.crit = 0.75
        expected_record = {'A03439': 0.75}
        output_record = matcher.find_matches(rec)
        self.assertEqual(output_record, expected_record)

        matcher.crit = 0.99
        expected_record = {}
        output_record = matcher.find_matches(rec)
        self.assertEqual(output_record, expected_record)
 def test_create_base_instance(self):
     matcher = app.ADSAffilCelery('augment-pipeline')
     self.assertEqual(matcher.adict, {})
     self.assertEqual(matcher.cdict, {})
     self.assertEqual(matcher.clausedict, {})
     self.assertEqual(matcher.crit, 0.75)
     self.assertEqual(matcher.separator, ',')
     self.assertFalse(matcher.exact)
    def test_find_matches_with_exact(self):
        aff_pickle_filename = stubdata_dir + '/aff.pickle'
        (adict, cdict) = utils.load_affil_dict(aff_pickle_filename)
        clause_pickle_filename = stubdata_dir + '/clause.pickle'
        clause_dict = utils.load_clause_dict(clause_pickle_filename)

        matcher = app.ADSAffilCelery('augment-pipeline')
        matcher.adict = adict
        matcher.cdict = cdict
        matcher.clausedict = clause_dict

        rec = 'Department of Physics and Institute for the Early Universe, Ewha Womans University, Seodaaemun-gu, Seoul, South Korea'
        output_record = matcher.find_matches(rec)
        expected_record = {'A11557': 2.0}
        self.assertEqual(output_record, expected_record)
    def test_return_exact_matches(self):
        aff_pickle_filename = stubdata_dir + '/aff.pickle'
        (adict, cdict) = utils.load_affil_dict(aff_pickle_filename)
        clause_pickle_filename = stubdata_dir + '/clause.pickle'
        clause_dict = utils.load_clause_dict(clause_pickle_filename)

        matcher = app.ADSAffilCelery('augment-pipeline')
        matcher.adict = adict
        matcher.cdict = cdict
        matcher.clausedict = clause_dict

        rec = 'Department of Physics, University of Hanging Out and Listening to Jazz Records, Merced, CA'
        output_record = matcher.augment_affiliations(rec)
        expected_record = ('-', '-', None, '-')
        self.assertEqual(output_record, expected_record)
Exemple #5
0
import os
from kombu import Queue
from adsmsg import AugmentAffiliationRequestRecord, AugmentAffiliationResponseRecord
from ADSAffil import app as app_module
from ADSAffil import utils

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
app = app_module.ADSAffilCelery('augment-pipeline',
                                proj_home=proj_home,
                                config=globals().get('config', {}),
                                local_config=globals().get('local_config', {}))
logger = app.logger

app.conf.CELERY_QUEUES = (Queue('augment-affiliation',
                                app.exchange,
                                routing_key='augment-affiliation'),
                          Queue('output-record',
                                app.exchange,
                                routing_key='output-record'),
                          Queue('update-record',
                                app.exchange,
                                routing_key='update-record'),
                          Queue('api-matcher',
                                app.exchange,
                                rounting_key='match-affil'))

try:
    (app.adict,
     app.cdict) = utils.load_affil_dict(app.conf.AFFIL_PICKLE_FILENAME)
    app.clausedict = utils.load_clause_dict(app.conf.CLAUSE_PICKLE_FILENAME)
except Exception as err:
    def test_create_working_instance(self):
        aff_pickle_filename = stubdata_dir + '/aff.pickle'
        (adict, cdict) = utils.load_affil_dict(aff_pickle_filename)
        clause_pickle_filename = stubdata_dir + '/clause.pickle'
        clause_dict = utils.load_clause_dict(clause_pickle_filename)

        matcher = app.ADSAffilCelery('foo')
        matcher.adict = adict
        matcher.cdict = cdict
        matcher.clausedict = clause_dict

        # first test: use augment_affiliations to exactly match a string:
        rec = 'Department of Physics and Institute for the Early Universe, Ewha Womans University, Seodaaemun-gu, Seoul, South Korea'
        output_record = matcher.augment_affiliations(rec)
        expected_record = (
            'Ewha Wmns U/Inst Early Uni',
            'Ewha Womans University, Institute for the Early Universe',
            ['0/Ewha Wmns U', '1/Ewha Wmns U/Inst Early Uni'], 'A11557')
        self.assertEqual(output_record, expected_record)

        # second test: use augment_affiliations to augment a record
        test_json = stubdata_dir + '/record.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode':
            '2002ApJ...576..963T',
            'aff': [
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101'
            ],
            'aff_abbrev':
            ['Yale U/Dep Ast', 'Yale U/Dep Ast', 'Yale U/Dep Ast'],
            'aff_id': ['A00928', 'A00928', 'A00928'],
            'aff_canonical': [
                'Yale University, Department of Astronomy',
                'Yale University, Department of Astronomy',
                'Yale University, Department of Astronomy'
            ],
            'aff_facet_hier': ['0/Yale U', '1/Yale U/Dep Ast'],
            'aff_raw': [
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101'
            ],
            'institution':
            ['Yale U/Dep Ast', 'Yale U/Dep Ast', 'Yale U/Dep Ast']
        }
        self.assertEqual(output_record, expected_record)

        # third test: try augmenting a record with some unmatched affs:
        test_json = stubdata_dir + '/record2.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode':
            '2021FOO...576..963T',
            'aff': [
                'University of Unbridled Nonsense',
                'University of Irrational Exuberance',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101'
            ],
            'aff_abbrev': ['-', '-', 'Yale U/Dep Ast'],
            'aff_id': ['-', '-', 'A00928'],
            'aff_canonical':
            ['-', '-', 'Yale University, Department of Astronomy'],
            'aff_facet_hier': ['0/Yale U', '1/Yale U/Dep Ast'],
            'aff_raw': [
                'University of Unbridled Nonsense',
                'University of Irrational Exuberance',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101'
            ],
            'institution': ['-', '-', 'Yale U/Dep Ast']
        }
        self.assertEqual(output_record, expected_record)

        # fourth test: try augmenting a record with all unmatched affs:
        test_json = stubdata_dir + '/record3.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode':
            '2021FUBAR.576..963T',
            'aff': [
                'University of Unbridled Nonsense',
                'University of Irrational Exuberance',
                'University of Mass Hysteria'
            ],
            'aff_abbrev': ['-', '-', '-'],
            'aff_id': ['-', '-', '-'],
            'aff_canonical': ['-', '-', '-'],
            'aff_facet_hier': [],
            'aff_raw': [
                'University of Unbridled Nonsense',
                'University of Irrational Exuberance',
                'University of Mass Hysteria'
            ],
            'institution': ['-', '-', '-']
        }
        self.assertEqual(output_record, expected_record)

        # fifth test: try augmenting a record with empty affs:
        test_json = stubdata_dir + '/record4.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode': '2021BAZ...576..963T',
            'aff': [],
            'aff_abbrev': [],
            'aff_id': [],
            'aff_canonical': [],
            'aff_facet_hier': [],
            'aff_raw': [],
            'institution': []
        }
        self.assertEqual(output_record, expected_record)

        # sixth test: try augmenting a record with blank affs:
        test_json = stubdata_dir + '/record5.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode': '2021ASDFQ.576..963T',
            'aff': ['-'],
            'aff_abbrev': ['-'],
            'aff_id': ['-'],
            'aff_canonical': ['-'],
            'aff_facet_hier': [],
            'aff_raw': ['-'],
            'institution': ['-']
        }
        self.assertEqual(output_record, expected_record)

        # seventh test: try augmenting a record with some unmatched affs AND a multipart aff (one known ; one unknown):
        test_json = stubdata_dir + '/record6.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode':
            '2021FOO...576..963T',
            'aff': [
                'University of Unbridled Nonsense',
                'University of Irrational Exuberance',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101; University of Unbridled Nonsense'
            ],
            'aff_abbrev': ['-', '-', 'Yale U/Dep Ast; -'],
            'aff_id': ['-', '-', 'A00928; -'],
            'aff_canonical':
            ['-', '-', 'Yale University, Department of Astronomy; -'],
            'aff_facet_hier': ['0/Yale U', '1/Yale U/Dep Ast'],
            'aff_raw': [
                'University of Unbridled Nonsense',
                'University of Irrational Exuberance',
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101; University of Unbridled Nonsense'
            ],
            'institution': ['-', '-', 'Yale U/Dep Ast; -']
        }
        self.assertEqual(output_record, expected_record)

        # eighth test: try augmenting a record a multipart affil where both are known strings:
        test_json = stubdata_dir + '/record7.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode':
            '2021FOO...576..963T',
            'aff': [
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101; Institute of the Early Universe, Ewha Womans University, Seoul, 120-750 Korea'
            ],
            'aff_abbrev': ['Yale U/Dep Ast; Ewha Wmns U/Inst Early Uni'],
            'aff_id': ['A00928; A11557'],
            'aff_canonical': [
                'Yale University, Department of Astronomy; Ewha Womans University, Institute for the Early Universe'
            ],
            'aff_facet_hier': [
                '0/Yale U', '1/Yale U/Dep Ast', '0/Ewha Wmns U',
                '1/Ewha Wmns U/Inst Early Uni'
            ],
            'aff_raw': [
                'Astronomy Department, Yale University, P.O. Box 208101, New Haven, CT 06520-8101; Institute of the Early Universe, Ewha Womans University, Seoul, 120-750 Korea'
            ],
            'institution': ['Yale U/Dep Ast; Ewha Wmns U/Inst Early Uni']
        }
        self.assertEqual(output_record, expected_record)

        # ninth test: try augmenting a record having an affiliation whose raw text has an HTML entity with semicolon:
        test_json = stubdata_dir + '/record8.json'
        with open(test_json, 'r') as fj:
            jdata = json.load(fj)
        rec = jdata['response']['docs'][0]
        output_record = matcher.augment_affiliations(rec)
        expected_record = {
            'bibcode':
            '2021FOO...576..963T',
            'aff': [
                'Texas A & M University, College Station; Physics Department, Fisk University, Nashville, Tennessee 37203; University of ABCDEFG'
            ],
            'aff_abbrev':
            ['TX A&M Coll Station/TX A&M Coll Station; Fisk U/Fisk U; -'],
            'aff_id': ['A02250; A00921; -'],
            'aff_canonical': [
                'Texas A&M University, College Station; Fisk University, Tennessee; -'
            ],
            'aff_facet_hier': [
                '0/TX A&M Coll Station',
                '1/TX A&M Coll Station/TX A&M Coll Station', '0/Fisk U',
                '1/Fisk U/Fisk U'
            ],
            'aff_raw': [
                'Texas A & M University, College Station; Physics Department, Fisk University, Nashville, Tennessee 37203; University of ABCDEFG'
            ],
            'institution':
            ['TX A&M Coll Station/TX A&M Coll Station; Fisk U/Fisk U; -']
        }
        self.assertEqual(output_record, expected_record)
Exemple #7
0
from kombu import Queue
import os

from ADSAffil import app as app_module
from ADSAffil import utils
from adsmsg import AugmentAffiliationRequestRecord, AugmentAffiliationResponseRecord

import config

app = app_module.ADSAffilCelery('augment-pipeline',
                                proj_home=os.path.realpath(
                                    os.path.join(os.path.dirname(__file__),
                                                 '../')))

app.conf.CELERY_QUEUES = (Queue('augment-affiliation',
                                app.exchange,
                                routing_key='augment-affiliation'),
                          Queue('output-record',
                                app.exchange,
                                routing_key='output-record'),
                          Queue('update-record',
                                app.exchange,
                                routing_key='update-record'))
logger = app.logger

app.load_dicts(config.PICKLE_FILE)


@app.task(queue='update-record')
def task_update_record(msg):
    logger.warn('in update record with {}'.format(msg))