def run(self):
    """Populate the project-mesh_term link table."""
    db = 'production' if not self.test else 'dev'
    # `bucket` and `key_prefix` are expected to be module-level constants
    keys = self.get_abstract_file_keys(bucket, key_prefix)
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)

    with db_session(engine) as session:
        # projects already present in the database (the filter below needs
        # this in test and production mode alike)
        existing_projects = set()
        projects = session.query(Projects.application_id).distinct()
        for p in projects:
            existing_projects.add(int(p.application_id))

        # projects which already have mesh terms linked
        projects_done = set()
        projects_mesh = session.query(ProjectMeshTerms.project_id).distinct()
        for p in projects_mesh:
            projects_done.add(int(p.project_id))

        # mesh terms already present in the mesh term table
        mesh_term_ids = {int(m.id) for m in session.query(MeshTerms.id).all()}

    logging.info('Inserting associations')
    for key_count, key in enumerate(keys):
        if self.test and (key_count > 2):
            continue
        # collect mesh results from the s3 file and group them by project id;
        # each project id has a set of mesh terms and corresponding term ids
        df_mesh = retrieve_mesh_terms(bucket, key)
        project_terms = self.format_mesh_terms(df_mesh)

        # go through documents
        for project_count, (project_id, terms) in enumerate(project_terms.items()):
            if self.test and (project_count > 2):
                continue
            if (project_id in projects_done) or (project_id not in existing_projects):
                continue
            rows = []
            for term, term_id in zip(terms['terms'], terms['ids']):
                term_id = int(term_id)
                # add the term to the mesh term table if not already present
                if term_id not in mesh_term_ids:
                    insert_data(self.db_config_env, 'mysqldb', db, Base, MeshTerms,
                                [{'id': term_id, 'term': term}], low_memory=True)
                    mesh_term_ids.add(term_id)
                # prepare a row for the project-mesh_term link table
                rows.append({'project_id': project_id, 'mesh_term_id': term_id})
            # insert the rows into the link table
            insert_data(self.db_config_env, 'mysqldb', db, Base, ProjectMeshTerms,
                        rows, low_memory=True)

    # mark the task as done
    self.output().touch()
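# The loop above only relies on self.format_mesh_terms() returning a mapping
# from project id to parallel 'terms'/'ids' lists. A sketch of that assumed
# shape with made-up values (the real helper builds it from the S3 mesh file):
project_terms_example = {
    1234567: {'terms': ['Humans', 'Neoplasms'],
              'ids': ['68006801', '68009369']},
}
for project_id, terms in project_terms_example.items():
    for term, term_id in zip(terms['terms'], terms['ids']):
        print(project_id, term, int(term_id))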
def run(self):
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    # collect mesh terms from S3
    bucket = 'innovation-mapping-general'
    key = 'crunchbase_descriptions/crunchbase_descriptions_mesh.txt'
    mesh_terms = retrieve_mesh_terms(bucket, key)
    mesh_terms = format_mesh_terms(mesh_terms)  # {org_id: ['term1', 'term2', ...], ...}
    logging.info(f"File contains {len(mesh_terms)} orgs with mesh terms")

    logging.info("Extracting previously processed orgs")
    with db_session(self.engine) as session:
        all_orgs = session.query(Organization.id,
                                 Organization.mesh_terms).all()
    processed_orgs = {org_id for (org_id, org_mesh_terms) in all_orgs
                      if org_mesh_terms is not None}
    all_orgs = {org_id for (org_id, _) in all_orgs}
    logging.info(f"{len(all_orgs)} total orgs in database")
    logging.info(f"{len(processed_orgs)} previously processed orgs")

    # reformat for batch insert, dropping orgs which are not in the database
    # or which have already been processed
    meshed_orgs = [{'id': org_id, 'mesh_terms': '|'.join(terms)}
                   for org_id, terms in mesh_terms.items()
                   if org_id in all_orgs and org_id not in processed_orgs]
    logging.info(f"{len(meshed_orgs)} organisations to update in database")

    for count, batch in enumerate(split_batches(meshed_orgs,
                                                self.insert_batch_size), 1):
        with db_session(self.engine) as session:
            session.bulk_update_mappings(Organization, batch)
        logging.info(f"{count} batch{'es' if count > 1 else ''} written to db")
        if self.test and count > 1:
            logging.info("Breaking after 2 batches while in test mode")
            break

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
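# The batch loop above only assumes that split_batches yields successive lists
# of at most insert_batch_size mappings. A minimal stand-in with that
# behaviour, for illustration (the project's own helper may differ in detail):
def split_batches_sketch(iterable, batch_size):
    """Yield successive batches of `batch_size` items from `iterable`."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch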
def run():
    # Fetch the input parameters
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    members_perc = int(os.environ["BATCHPAR_members_perc"])
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]
    routine_id = os.environ["BATCHPAR_routine_id"]

    # Get continent lookup
    url = ("https://nesta-open-data.s3.eu-west-2"
           ".amazonaws.com/rwjf-viz/continent_codes_names.json")
    continent_lookup = {row["Code"]: row["Name"]
                        for row in requests.get(url).json()}
    continent_lookup[None] = None

    # Extract the core topics
    logging.debug('Getting topics')
    s3 = boto3.resource('s3')
    topics_key = f'meetup-topics-{routine_id}.json'
    topics_obj = s3.Object(s3_bucket, topics_key)
    core_topics = set(json.loads(topics_obj.get()['Body'].read()))

    # Extract the group ids for this task
    ids_obj = s3.Object(s3_bucket, batch_file)
    group_ids = set(json.loads(ids_obj.get()['Body'].read()))

    # Extract the mesh terms for this task
    df_mesh = retrieve_mesh_terms('innovation-mapping-general',
                                  'meetup_mesh/meetup_mesh_processed.txt')
    mesh_terms = format_mesh_terms(df_mesh)

    # Set up Elasticsearch
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    strans_kwargs = {'filename': 'meetup.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': []}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           auto_translate=True)

    # Generate the lookup for geographies
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    geo_lookup = {}
    with db_session(engine) as session:
        query_result = session.query(Geographic).all()
        for geography in query_result:
            geo_lookup[geography.id] = {k: v for k, v in geography.__dict__.items()
                                        if k in geography.__table__.columns}

    # Pipe the groups
    members_limit = get_members_by_percentile(engine, perc=members_perc)
    with db_session(engine) as session:
        query_result = (session
                        .query(Group)
                        .filter(Group.members >= members_limit)
                        .filter(Group.id.in_(group_ids))
                        .all())
        for count, group in enumerate(query_result, 1):
            row = {k: v for k, v in group.__dict__.items()
                   if k in group.__table__.columns}

            # Filter out groups without any of the required topics
            topics = [topic['name'] for topic in group.topics
                      if topic['name'] in core_topics]
            if len(topics) == 0:
                continue

            # Assign mesh terms
            mesh_id = f'{row["id"]}'.zfill(8)
            row['mesh_terms'] = mesh_terms.get(mesh_id)

            # Get the geographic data for this row
            country_name = country_iso_code_to_name(row['country'], iso2=True)
            geo_key = generate_composite_key(row['city'], country_name)
            geo = geo_lookup[geo_key]

            # Clean up the input data
            row['topics'] = topics
            row['urlname'] = f"https://www.meetup.com/{row['urlname']}"
            row['coordinate'] = dict(lat=geo['latitude'], lon=geo['longitude'])
            row['created'] = dt.fromtimestamp(row['created'] / 1000).strftime("%Y-%m-%d")
            if row['description'] is not None:
                row['description'] = BeautifulSoup(row['description'], 'lxml').text
            row['continent'] = continent_lookup[geo['continent']]
            row['country_name'] = geo['country']
            row['continent_id'] = geo['continent']
            row['country'] = geo['country_alpha_2']
            row['iso3'] = geo['country_alpha_3']
            row['isoNumeric'] = geo['country_numeric']

            # Insert into ES
            _row = es.index(index=es_index, doc_type=es_type,
                            id=row['id'], body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.info("Batch job complete.")
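# The zfill(8) above implies that format_mesh_terms keys the meetup mesh file
# by the zero-padded group id; a sketch of that assumption with made-up terms:
mesh_terms_example = {'00012345': ['Health', 'Exercise']}
mesh_id = f'{12345}'.zfill(8)
assert mesh_terms_example.get(mesh_id) == ['Health', 'Exercise']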
def run():
    bucket = os.environ["BATCHPAR_s3_bucket"]
    abstract_file = os.environ["BATCHPAR_s3_key"]
    dupe_file = os.environ["BATCHPAR_dupe_file"]
    es_config = literal_eval(os.environ["BATCHPAR_outinfo"])
    db = os.environ["BATCHPAR_db"]
    entity_type = os.environ["BATCHPAR_entity_type"]

    # mysql setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # retrieve a batch of meshed terms
    mesh_terms = retrieve_mesh_terms(bucket, abstract_file)
    mesh_terms = format_mesh_terms(mesh_terms)
    logging.info(f'batch {abstract_file} contains '
                 f'{len(mesh_terms)} meshed abstracts')

    # retrieve duplicate map
    dupes = retrieve_duplicate_map(bucket, dupe_file)
    dupes = format_duplicate_map(dupes)

    # Set up the Elasticsearch connection
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_config['host'],
                           port=es_config['port'],
                           aws_auth_region=es_config['region'],
                           use_ssl=True,
                           entity_type=entity_type,
                           strans_kwargs=None,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           listify_terms=True)

    all_es_ids = get_es_ids(es, es_config)

    docs = []
    for doc_id, terms in mesh_terms.items():
        if doc_id not in all_es_ids:
            continue
        try:
            _filter = Abstracts.application_id == doc_id
            abstract = session.query(Abstracts).filter(_filter).one()
        except NoResultFound:
            logging.warning(f'Not found {doc_id} in database')
            raise NoResultFound(doc_id)
        clean_abstract_text = clean_abstract(abstract.abstract_text)
        docs.append({'doc_id': doc_id,
                     'terms_mesh_abstract': terms,
                     'textBody_abstract_project': clean_abstract_text})

        # apply the same terms and abstract to any duplicates of this doc
        duped_docs = dupes.get(doc_id, [])
        if len(duped_docs) > 0:
            logging.info(f'Found {len(duped_docs)} duplicates')
        for duped_doc in duped_docs:
            docs.append({'doc_id': duped_doc,
                         'terms_mesh_abstract': terms,
                         'textBody_abstract_project': clean_abstract_text,
                         'booleanFlag_duplicate_abstract': True})

    # output to elasticsearch
    logging.warning(f'Writing {len(docs)} documents to elasticsearch')
    for doc in docs:
        uid = doc.pop("doc_id")
        # Extract existing info
        existing = es.get(es_config['index'], doc_type=es_config['type'],
                          id=uid)['_source']
        # Merge existing info into the new doc
        doc = {**existing, **doc}
        es.index(index=es_config['index'], doc_type=es_config['type'],
                 id=uid, body=doc)
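# The duplicate handling above only relies on format_duplicate_map returning a
# mapping from an application id to the ids of its duplicates; a sketch with
# made-up ids showing how each duplicate is flagged:
dupes_example = {1000001: [1000002, 1000003]}
for duped_doc in dupes_example.get(1000001, []):
    # each duplicate receives the same terms and abstract text, plus
    # booleanFlag_duplicate_abstract=True
    print(duped_doc)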