def test_object_to_dict(self):
    parents = [{"_id": 10, "_another_id": 2, "some_field": 20},
               {"_id": 20, "_another_id": 2, "some_field": 20}]
    _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                           Base, DummyModel, parents)
    assert len(parents) == len(_parents)

    children = [{"_id": 10, "parent_id": 10},
                {"_id": 10, "parent_id": 20},
                {"_id": 20, "parent_id": 20},
                {"_id": 30, "parent_id": 20}]
    _children = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                            Base, DummyChild, children)
    assert len(children) == len(_children)

    # Re-retrieve parents from the database
    found_children = set()
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    with db_session(engine) as session:
        for p in session.query(DummyModel).all():
            row = object_to_dict(p)
            assert type(row) is dict
            assert len(row['children']) > 0
            _found_children = set((c['_id'], c['parent_id'])
                                  for c in row['children'])
            found_children = found_children.union(_found_children)

            _row = object_to_dict(p, shallow=True)
            assert 'children' not in _row
            del row['children']
            assert row == _row
    assert len(found_children) == len(children) == len(_children)
def metadata(orm, session, appln_ids, field_selector=None):
    """Return all rows of `orm` matching the given application ids, as flat
    dicts. Rows are matched on `orm.appln_id` unless another `field_selector`
    column is provided."""
    if field_selector is None:
        field_selector = orm.appln_id
    _filter = field_selector.in_(appln_ids)
    return [object_to_dict(_obj)
            for _obj in session.query(orm).filter(_filter).all()]
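# Illustrative usage sketch (not part of the original module; the ids below
# are made up): assuming an open PATSTAT `session`, `metadata` pulls flat rows
# for a handful of application ids, either on the default `appln_id` column or
# via an explicit `field_selector` such as `Tls906Person.person_id`.
def _example_metadata_usage(session):
    # Titles matched on the default appln_id column
    titles = metadata(Tls202ApplnTitle, session, appln_ids=[1, 2, 3])
    # Persons matched on an explicitly selected column
    persons = metadata(Tls906Person, session, [10, 11],
                       field_selector=Tls906Person.person_id)
    return titles, persons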
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {'filename': 'eurito/cordis-eu.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_description_project'])

    # collect file
    logging.info('Retrieving project ids')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    project_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(project_ids)} project IDs retrieved from s3")

    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate(session.query(Project)
                                    .filter(Project.rcn.in_(project_ids))
                                    .all()):
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index, doc_type=es_type,
                     id=row.pop('rcn'), body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")
def flatten(orm_instance):
    """Convert a SqlAlchemy ORM (i.e. a 'row' of data) to flat JSON.

    Args:
        orm_instance (sqlalchemy.Base): Instance of a SqlAlchemy ORM,
                                        i.e. a 'row' of data.
    Returns:
        row (dict): A flat row of data, inferred from `orm_instance`
    """
    row = object_to_dict(orm_instance, shallow=True)
    return {k: _flatten(v) for k, v in row.items()}
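# Minimal usage sketch (an assumption for illustration, not in the original
# code): `MyOrm` stands in for any declarative ORM class bound to `engine`;
# `flatten` then yields one flat, JSON-serialisable dict per row.
def _example_flatten_usage(engine, MyOrm):
    with db_session(engine) as session:
        # Each ORM instance becomes a plain dict of scalar values
        return [flatten(obj) for obj in session.query(MyOrm).all()]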
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    logging.info('Building FOS lookup')
    fos_lookup = build_fos_lookup(engine, max_lvl=6)
    nf = NutsFinder()

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {'filename': 'eurito/arxiv-eu.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_abstract_article'])

    # collect file
    logging.info('Retrieving article ids')
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    art_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(art_ids)} article IDs retrieved from s3")

    # Get all grid countries and country: continent lookup
    logging.info('Doing country lookup')
    country_lookup = get_country_region_lookup()
    eu_countries = get_eu_countries()
    with db_session(engine) as session:
        grid_regions = {obj.id: country_lookup[obj.country_code]
                        for obj in session.query(Inst).all()
                        if obj.country_code is not None}
        grid_countries = {obj.id: obj.country_code
                          for obj in session.query(Inst).all()
                          if obj.country_code is not None}
        grid_institutes = {obj.id: obj.name
                           for obj in session.query(Inst).all()}
        grid_latlon = {obj.id: (obj.latitude, obj.longitude)
                       for obj in session.query(Inst).all()}

    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate(session.query(Art)
                                    .filter(Art.id.in_(art_ids))
                                    .all()):
            row = object_to_dict(obj)

            # Extract year from date
            if row['created'] is not None:
                row['year'] = row['created'].year

            # Normalise citation count for searchkit
            if row['citation_count'] is None:
                row['citation_count'] = 0

            # Extract field of study
            row['fields_of_study'] = make_fos_tree(row['fields_of_study'],
                                                   fos_lookup)
            row['_fields_of_study'] = [f for fields
                                       in row['fields_of_study']['nodes']
                                       for f in fields if f != []]

            # Format hierarchical fields as expected by searchkit
            row['categories'] = [cat['description']
                                 for cat in row.pop('categories')]
            institutes = row.pop('institutes')
            good_institutes = [i['institute_id'] for i in institutes
                               if i['matching_score'] > 0.9]

            # Add NUTS regions
            for inst_id in good_institutes:
                if inst_id not in grid_latlon:
                    continue
                lat, lon = grid_latlon[inst_id]
                if lat is None or lon is None:
                    continue
                nuts = nf.find(lat=lat, lon=lon)
                for i in range(0, 4):
                    name = f'nuts_{i}'
                    if name not in row:
                        row[name] = set()
                    for nut in nuts:
                        if nut['LEVL_CODE'] != i:
                            continue
                        row[name].add(nut['NUTS_ID'])
            for i in range(0, 4):
                name = f'nuts_{i}'
                if name in row:
                    row[name] = list(row[name])

            # Add other geographies
            countries = set(grid_countries[inst_id]
                            for inst_id in good_institutes
                            if inst_id in grid_countries)
            regions = set(grid_regions[inst_id]
                          for inst_id in good_institutes
                          if inst_id in grid_countries)
            row['countries'] = list(countries)
            row['regions'] = [r for c, r in regions]
            row['is_eu'] = any(c in eu_countries for c in countries)

            # Pull out international institute info
            has_mn = any(is_multinational(inst, grid_countries.values())
                         for inst in good_institutes)
            row['has_multinational'] = has_mn

            # Generate author & institute properties
            mag_authors = row.pop('mag_authors')
            if mag_authors is None:
                row['authors'] = None
                row['institutes'] = None
            else:
                if all('author_order' in a for a in mag_authors):
                    mag_authors = sorted(mag_authors,
                                         key=lambda a: a['author_order'])
                row['authors'] = [author['author_name'].title()
                                  for author in mag_authors]
                gids = [author['affiliation_grid_id']
                        for author in mag_authors
                        if 'affiliation_grid_id' in author]
                row['institutes'] = [grid_institutes[g].title()
                                     for g in gids
                                     if g in grid_institutes
                                     and g in good_institutes]
                if row['institutes'] in (None, []):
                    row['institutes'] = [grid_institutes[g].title()
                                         for g in good_institutes]

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type,
                            id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.warning("Batch job complete.")
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    _engine = get_mysql_engine("BATCHPAR_config", "readonly",
                               "patstat_2019_05_13")

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {'filename': 'eurito/patstat-eu.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           auto_translate=True,
                           auto_translate_kwargs={'min_len': 20},
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           do_sort=True,
                           ngram_fields=['textBody_abstract_patent'])

    # collect file
    logging.info('Retrieving patent family ids')
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    docdb_fam_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(docdb_fam_ids)} patent family IDs retrieved from s3")

    eu_countries = get_eu_countries()

    logging.info('Processing rows')
    _filter = ApplnFamily.docdb_family_id.in_(docdb_fam_ids)
    with db_session(engine) as session:
        for obj in session.query(ApplnFamily).filter(_filter).all():
            row = object_to_dict(obj)
            appln_ids = row.pop('appln_id')
            with db_session(_engine) as _session:
                _titles = metadata(Tls202ApplnTitle, _session, appln_ids)
                _abstrs = metadata(Tls203ApplnAbstr, _session, appln_ids)
                ipcs = metadata(Tls209ApplnIpc, _session, appln_ids)
                nace2s = metadata(Tls229ApplnNace2, _session, appln_ids)
                techs = metadata(Tls230ApplnTechnField, _session, appln_ids)
                # Get persons
                _pers_applns = metadata(Tls207PersAppln, _session, appln_ids)
                pers_ids = set(pa['person_id'] for pa in _pers_applns)
                persons = metadata(Tls906Person, _session, pers_ids,
                                   field_selector=Tls906Person.person_id)

            title = select_text(_titles, 'appln_title_lg', 'appln_title')
            abstr = select_text(_abstrs, 'appln_abstract_lg',
                                'appln_abstract')

            # Get names from lookups
            ipcs = list(set(i['ipc_class_symbol'].split()[0] for i in ipcs))
            nace2s = list(set(n['nace2_code'] for n in nace2s))
            techs = list(set(t['techn_field_nr'] for t in techs))
            ctrys = list(set(p['person_ctry_code'] for p in persons))
            nuts = list(set(p['nuts'] for p in persons))
            is_eu = any(c in eu_countries for c in ctrys)

            # Index the data
            row = dict(title=title, abstract=abstr, ipc=ipcs, nace2=nace2s,
                       tech=techs, ctry=ctrys, nuts=nuts, is_eu=is_eu, **row)
            uid = row.pop('docdb_family_id')
            _row = es.index(index=es_index, doc_type=es_type,
                            id=uid, body=row)

    logging.warning("Batch job complete.")
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    fos_lookup = build_fos_lookup(engine)

    # Setup ngrammer
    os.environ['MYSQLDBCONF'] = os.environ['BATCHPAR_config']
    ngrammer = Ngrammer(database="production")

    # es setup
    strans_kwargs = {'filename': 'arxiv.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False)

    # collect file
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    art_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(art_ids)} article IDs retrieved from s3")

    # Get all grid countries and country: continent lookup
    country_lookup = get_country_region_lookup()
    with db_session(engine) as session:
        grid_countries = {obj.id: country_lookup[obj.country_code]
                          for obj in session.query(Institute).all()
                          if obj.country_code is not None}
        grid_institutes = {obj.id: obj.name
                           for obj in session.query(Institute).all()}

    current_year = dt.now().year
    with db_session(engine) as session:
        for count, obj in enumerate(session.query(Article)
                                    .filter(Article.id.in_(art_ids))
                                    .all()):
            row = object_to_dict(obj)

            # Extract year from date
            year = 1990
            if row['created'] is not None:
                row['year'] = row['created'].year
                year = row['created'].year

            # Normalise citation count for searchkit
            if row['citation_count'] is None:
                row['citation_count'] = 0
            row['normalised_citation'] = (row['citation_count']
                                          / np.log(current_year - year + 2))

            # If abstract doesn't meet requirements, zero novelty;
            # all other novelty will be assigned in a later task
            text = row['abstract'] + ' ' + row['title']
            if (len(text) < 400
                    or any(x in row['abstract'].lower()
                           for x in ('withdrawn', 'arxiv administrators'))
                    or any(x in row['title'].lower()
                           for x in ('reply to', 'reply on', 'comment to',
                                     'comment on', 'remarks to',
                                     'remarks on'))):
                row['novelty_of_article'] = 0

            processed_tokens = ngrammer.process_document(row['abstract'])
            row['tokens'] = [t.replace('_', ' ')
                             for tokens in processed_tokens
                             for t in tokens]

            # Extract field of study Level 0 --> Level 1 paths
            fos = []
            fos_objs = row.pop('fields_of_study')
            fos_ids = set(fos['id'] for fos in fos_objs)
            for f in fos_objs:
                if f['level'] > 0:
                    continue
                fos += [reversed(fos_lookup[(f['id'], cid)])
                        for cid in split_ids(f['child_ids'])
                        if cid in fos_ids]

            # Format hierarchical fields as expected by searchkit
            cats = [(cat['description'], cat['id'].split('.')[0])
                    for cat in row.pop('categories')]
            institutes = row.pop('institutes')
            good_institutes = [i['institute_id'] for i in institutes
                               if i['matching_score'] > 0.9]
            countries = set(grid_countries[inst_id]
                            for inst_id in good_institutes
                            if inst_id in grid_countries)
            row['categories'], _, _ = hierarchy_field(cats)
            row['fos'], _, _ = hierarchy_field(fos)
            row['countries'], _, _ = hierarchy_field(countries)

            # Pull out international institute info
            has_mn = any(is_multinational(inst, grid_countries.values())
                         for inst in good_institutes)
            row['has_multinational'] = has_mn

            # Generate author & institute properties
            mag_authors = row.pop('mag_authors')
            if mag_authors is None:
                row['authors'] = None
                row['institutes'] = None
                # row['novelty_of_article'] = 0
            else:
                if all('author_order' in a for a in mag_authors):
                    mag_authors = sorted(mag_authors,
                                         key=lambda a: a['author_order'])
                row['authors'] = [author['author_name'].title()
                                  for author in mag_authors]
                if len(row['authors']) > 10:
                    row['authors'] = [f"{row['authors'][0]}, et al"]
                gids = [author['affiliation_grid_id']
                        for author in mag_authors
                        if 'affiliation_grid_id' in author]
                row['institutes'] = [grid_institutes[g].title()
                                     for g in gids
                                     if g in grid_institutes
                                     and g in good_institutes]
                # row['novelty_of_article'] = novelty0 + np.log(novelty1+1)

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type,
                            id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.warning("Batch job complete.")