Example #1
    def test_object_to_dict(self):
        parents = [{
            "_id": 10,
            "_another_id": 2,
            "some_field": 20
        }, {
            "_id": 20,
            "_another_id": 2,
            "some_field": 20
        }]
        _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                               Base, DummyModel, parents)
        assert len(parents) == len(_parents)

        children = [{
            "_id": 10,
            "parent_id": 10
        }, {
            "_id": 10,
            "parent_id": 20
        }, {
            "_id": 20,
            "parent_id": 20
        }, {
            "_id": 30,
            "parent_id": 20
        }]
        _children = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                                Base, DummyChild, children)
        assert len(children) == len(_children)

        # Re-retrieve parents from the database
        found_children = set()
        engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
        with db_session(engine) as session:
            for p in session.query(DummyModel).all():
                row = object_to_dict(p)
                assert type(row) is dict
                assert len(row['children']) > 0
                _found_children = set(
                    (c['_id'], c['parent_id']) for c in row['children'])
                found_children = found_children.union(_found_children)
                _row = object_to_dict(p, shallow=True)
                assert 'children' not in _row
                del row['children']
                assert row == _row
            assert len(found_children) == len(children) == len(_children)
Example #2
def metadata(orm, session, appln_ids, field_selector=None):
    """Return each row of `orm` whose field_selector value (defaulting to
    orm.appln_id) is contained in appln_ids, converted to a dict."""
    if field_selector is None:
        field_selector = orm.appln_id
    _filter = field_selector.in_(appln_ids)
    return [
        object_to_dict(_obj)
        for _obj in session.query(orm).filter(_filter).all()
    ]
Example #3
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/cordis-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_description_project'])

    # collect file
    logging.info('Retrieving project ids')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    project_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(project_ids)} project IDs " "retrieved from s3")

    # process rows
    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate((session.query(Project).filter(
                Project.rcn.in_(project_ids)).all())):
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index,
                     doc_type=es_type,
                     id=row.pop('rcn'),
                     body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")
Example #4
def flatten(orm_instance):
    """Convert a SqlAlchemy ORM (i.e. a 'row' of data) to flat JSON.

    Args:
        orm_instance (sqlalchemy.Base): Instance of a SQLAlchemy ORM, i.e.
                                        a 'row' of data.
    Returns:
        row (dict): A flat row of data, inferred from `orm_instance`
    """
    row = object_to_dict(orm_instance, shallow=True)
    return {k: _flatten(v) for k, v in row.items()}
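
The helpers behind flatten are not shown above, so here is a minimal, self-contained sketch of how it could be exercised. Everything in it is an assumption made for illustration: ExampleModel is a made-up SQLAlchemy model, and the object_to_dict and _flatten stubs only approximate the project's real helpers (shallow column values only, and JSON-encoding of non-scalar values).

# Hypothetical usage sketch: ExampleModel, object_to_dict and _flatten below
# are stand-ins written for this illustration, not the project's own code.
import json

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class ExampleModel(Base):
    __tablename__ = 'example_model'
    _id = Column(Integer, primary_key=True)
    some_field = Column(Integer)
    label = Column(String(10))


def object_to_dict(instance, shallow=False):
    # Stub (assumption): return mapped column values only, no relationships.
    return {c.name: getattr(instance, c.name)
            for c in instance.__table__.columns}


def _flatten(value):
    # Stub (assumption): pass scalars through, JSON-encode anything else.
    if isinstance(value, (int, float, str, bool, type(None))):
        return value
    return json.dumps(value, default=str)


def flatten(orm_instance):
    row = object_to_dict(orm_instance, shallow=True)
    return {k: _flatten(v) for k, v in row.items()}


row = flatten(ExampleModel(_id=1, some_field=20, label='abc'))
print(row)  # {'_id': 1, 'some_field': 20, 'label': 'abc'}

Under these assumptions the contract is simply that flatten returns a flat, JSON-serialisable dict keyed by column name.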
Example #5
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    logging.info('Building FOS lookup')
    fos_lookup = build_fos_lookup(engine, max_lvl=6)

    nf = NutsFinder()

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/arxiv-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_abstract_article'])

    # collect file
    logging.info('Retrieving article ids')
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    art_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(art_ids)} article IDs " "retrieved from s3")

    # Get all grid countries
    # and country: continent lookup
    logging.info('Doing country lookup')
    country_lookup = get_country_region_lookup()
    eu_countries = get_eu_countries()
    with db_session(engine) as session:
        grid_regions = {
            obj.id: country_lookup[obj.country_code]
            for obj in session.query(Inst).all()
            if obj.country_code is not None
        }
        grid_countries = {
            obj.id: obj.country_code
            for obj in session.query(Inst).all()
            if obj.country_code is not None
        }
        grid_institutes = {
            obj.id: obj.name
            for obj in session.query(Inst).all()
        }
        grid_latlon = {
            obj.id: (obj.latitude, obj.longitude)
            for obj in session.query(Inst).all()
        }

    # process rows
    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate(
            (session.query(Art).filter(Art.id.in_(art_ids)).all())):
            row = object_to_dict(obj)
            # Extract year from date
            if row['created'] is not None:
                row['year'] = row['created'].year

            # Normalise citation count for searchkit
            if row['citation_count'] is None:
                row['citation_count'] = 0

            # Extract field of study
            row['fields_of_study'] = make_fos_tree(row['fields_of_study'],
                                                   fos_lookup)
            row['_fields_of_study'] = [
                f for fields in row['fields_of_study']['nodes'] for f in fields
                if f != []
            ]

            # Format hierarchical fields as expected by searchkit
            row['categories'] = [
                cat['description'] for cat in row.pop('categories')
            ]
            institutes = row.pop('institutes')
            good_institutes = [
                i['institute_id'] for i in institutes
                if i['matching_score'] > 0.9
            ]

            # Add NUTS regions
            for inst_id in good_institutes:
                if inst_id not in grid_latlon:
                    continue
                lat, lon = grid_latlon[inst_id]
                if lat is None or lon is None:
                    continue
                nuts = nf.find(lat=lat, lon=lon)
                for i in range(0, 4):
                    name = f'nuts_{i}'
                    if name not in row:
                        row[name] = set()
                    for nut in nuts:
                        if nut['LEVL_CODE'] != i:
                            continue
                        row[name].add(nut['NUTS_ID'])
            for i in range(0, 4):
                name = f'nuts_{i}'
                if name in row:
                    row[name] = list(row[name])

            # Add other geographies
            countries = set(grid_countries[inst_id]
                            for inst_id in good_institutes
                            if inst_id in grid_countries)
            regions = set(grid_regions[inst_id] for inst_id in good_institutes
                          if inst_id in grid_countries)
            row['countries'] = list(countries)
            row['regions'] = [r for c, r in regions]
            row['is_eu'] = any(c in eu_countries for c in countries)

            # Pull out international institute info
            has_mn = any(
                is_multinational(inst, grid_countries.values())
                for inst in good_institutes)
            row['has_multinational'] = has_mn

            # Generate author & institute properties
            mag_authors = row.pop('mag_authors')
            if mag_authors is None:
                row['authors'] = None
                row['institutes'] = None
            else:
                if all('author_order' in a for a in mag_authors):
                    mag_authors = sorted(mag_authors,
                                         key=lambda a: a['author_order'])
                row['authors'] = [
                    author['author_name'].title() for author in mag_authors
                ]
                gids = [
                    author['affiliation_grid_id'] for author in mag_authors
                    if 'affiliation_grid_id' in author
                ]
                row['institutes'] = [
                    grid_institutes[g].title() for g in gids
                    if g in grid_institutes and g in good_institutes
                ]
            if row['institutes'] in (None, []):
                row['institutes'] = [
                    grid_institutes[g].title() for g in good_institutes
                ]

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type, id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")

    logging.warning("Batch job complete.")
Example #6
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    _engine = get_mysql_engine("BATCHPAR_config", "readonly",
                               "patstat_2019_05_13")

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/patstat-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           auto_translate=True,
                           auto_translate_kwargs={'min_len': 20},
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           do_sort=True,
                           ngram_fields=['textBody_abstract_patent'])

    # collect file
    logging.info('Retrieving patent family ids')
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    docdb_fam_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(docdb_fam_ids)} patent family IDs "
                 "retrieved from s3")

    eu_countries = get_eu_countries()

    logging.info('Processing rows')
    _filter = ApplnFamily.docdb_family_id.in_(docdb_fam_ids)
    with db_session(engine) as session:
        for obj in session.query(ApplnFamily).filter(_filter).all():
            row = object_to_dict(obj)
            appln_ids = row.pop('appln_id')
            with db_session(_engine) as _session:
                _titles = metadata(Tls202ApplnTitle, _session, appln_ids)
                _abstrs = metadata(Tls203ApplnAbstr, _session, appln_ids)
                ipcs = metadata(Tls209ApplnIpc, _session, appln_ids)
                nace2s = metadata(Tls229ApplnNace2, _session, appln_ids)
                techs = metadata(Tls230ApplnTechnField, _session, appln_ids)
                # Get persons
                _pers_applns = metadata(Tls207PersAppln, _session, appln_ids)
                pers_ids = set(pa['person_id'] for pa in _pers_applns)
                persons = metadata(Tls906Person,
                                   _session,
                                   pers_ids,
                                   field_selector=Tls906Person.person_id)

            title = select_text(_titles, 'appln_title_lg', 'appln_title')
            abstr = select_text(_abstrs, 'appln_abstract_lg', 'appln_abstract')

            # Get names from lookups
            ipcs = list(set(i['ipc_class_symbol'].split()[0] for i in ipcs))
            nace2s = list(set(n['nace2_code'] for n in nace2s))
            techs = list(set(t['techn_field_nr'] for t in techs))
            ctrys = list(set(p['person_ctry_code'] for p in persons))
            nuts = list(set(p['nuts'] for p in persons))
            is_eu = any(c in eu_countries for c in ctrys)

            # Index the data
            row = dict(title=title,
                       abstract=abstr,
                       ipc=ipcs,
                       nace2=nace2s,
                       tech=techs,
                       ctry=ctrys,
                       nuts=nuts,
                       is_eu=is_eu,
                       **row)
            uid = row.pop('docdb_family_id')
            _row = es.index(index=es_index, doc_type=es_type, id=uid, body=row)

    logging.warning("Batch job complete.")
Example #7
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    fos_lookup = build_fos_lookup(engine)
    
    # Setup ngrammer
    os.environ['MYSQLDBCONF'] = os.environ['BATCHPAR_config']
    ngrammer = Ngrammer(database="production")

    # es setup
    strans_kwargs = {
        'filename': 'arxiv.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False)

    # collect file
    nrows = 20 if test else None

    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    art_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(art_ids)} article IDs "
                 "retrieved from s3")
    
    # Get all grid countries
    # and country: continent lookup
    country_lookup = get_country_region_lookup()                
    with db_session(engine) as session:
        grid_countries = {obj.id: country_lookup[obj.country_code]
                          for obj in session.query(Institute).all()
                          if obj.country_code is not None}
        grid_institutes = {obj.id: obj.name
                           for obj in session.query(Institute).all()}
    # process rows
    current_year = dt.now().year
    with db_session(engine) as session:
        for count, obj in enumerate((session.query(Article)
                                     .filter(Article.id.in_(art_ids))
                                     .all())):
            row = object_to_dict(obj)
            # Extract year from date
            year = 1990
            if row['created'] is not None:
                row['year'] = row['created'].year
                year = row['created'].year

            # Normalise citation count for searchkit
            if row['citation_count'] is None:
                row['citation_count'] = 0
            row['normalised_citation'] = (row['citation_count'] /
                                          np.log(current_year - year + 2))

            # If abstract doesn't meet requirements, zero novelty
            # all other novelty will be assigned in a later task
            text = row['abstract'] + ' ' + row['title']
            if (len(text) < 400
                or any(x in row['abstract'].lower()
                       for x in ('withdrawn', 'arxiv administrators'))
                or any(x in row['title'].lower()
                       for x in ('reply to', 'reply on', 
                                 'comment to', 'comment on',
                                 'remarks to', 'remarks on'))):
                row['novelty_of_article'] = 0

            processed_tokens = ngrammer.process_document(row['abstract'])
            row['tokens'] = [t.replace('_', ' ') 
                             for tokens in processed_tokens
                             for t in tokens]

            # Extract field of study Level 0 --> Level 1 paths
            fos = []
            fos_objs = row.pop('fields_of_study')
            fos_ids = set(fos_obj['id'] for fos_obj in fos_objs)
            for f in fos_objs:
                if f['level'] > 0:
                    continue
                fos += [reversed(fos_lookup[(f['id'], cid)])
                        for cid in split_ids(f['child_ids'])
                        if cid in fos_ids]

            # Format hierarchical fields as expected by searchkit
            cats = [(cat['description'], cat['id'].split('.')[0])
                    for cat in row.pop('categories')]
            institutes = row.pop('institutes')
            good_institutes = [i['institute_id'] for i in institutes
                               if i['matching_score'] > 0.9]
            countries = set(grid_countries[inst_id]
                            for inst_id in good_institutes
                            if inst_id in grid_countries)
            row['categories'], _, _ = hierarchy_field(cats)
            row['fos'], _, _ = hierarchy_field(fos)
            row['countries'], _, _ = hierarchy_field(countries)

            # Pull out international institute info
            has_mn = any(is_multinational(inst,
                                          grid_countries.values())
                         for inst in good_institutes)
            row['has_multinational'] = has_mn

            # Generate author & institute properties
            mag_authors = row.pop('mag_authors')
            if mag_authors is None:
                row['authors'] = None
                row['institutes'] = None
            else:
                if all('author_order' in a for a in mag_authors):
                    mag_authors = sorted(mag_authors,
                                         key=lambda a: a['author_order'])

                row['authors'] = [author['author_name'].title()
                                  for author in mag_authors]
                if len(row['authors']) > 10:
                    row['authors'] = [f"{row['authors'][0]}, et al"]

                gids = [author['affiliation_grid_id']
                        for author in mag_authors
                        if 'affiliation_grid_id' in author]
                row['institutes'] = [grid_institutes[g].title()
                                     for g in gids
                                     if g in grid_institutes
                                     and g in good_institutes]

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type,
                            id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.warning("Batch job complete.")