def combine(self, job_params):
    '''Combine the outputs from the batch jobs'''
    # Retrieve the batched data
    country_data = defaultdict(dict)
    n_rows = 0
    for i, params in enumerate(job_params):
        print(i, " of ", len(job_params))
        _body = s3.S3Target(params["outinfo"]).open("rb")
        _country_data = json.loads(_body.read().decode('utf-8'))
        for country, data in _country_data.items():
            for var_name, data_row in data.items():
                n_rows += 1
                country_data[country][var_name] = data_row
    print(f"Got {n_rows} rows of data")

    # Merge with metadata, then flatten and clean
    country_metadata = get_worldbank_resource("countries")
    flat_country_data = flatten_country_data(country_data, country_metadata)
    cleaned_data = clean_variable_names(flat_country_data)

    # Commit the data
    engine = get_mysql_engine("MYSQLDB", "mysqldb", self.db_config['database'])
    Base.metadata.create_all(engine)
    Session = sessionmaker(engine)
    session = Session()
    for row in cleaned_data:
        country = WorldbankCountry(**row)
        session.add(country)
    session.commit()
    session.close()
    self.output().touch()
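A note on the `get_mysql_engine(<env var>, <section>, <database>)` calls used throughout these snippets: the helper itself is not shown here. A minimal sketch of what it might look like, assuming the environment variable names an ini-style MySQL config file with host, port, user and password keys under the given section:

# Hypothetical sketch of get_mysql_engine -- an assumption, not the library's actual code.
import os
from configparser import ConfigParser
from sqlalchemy import create_engine

def get_mysql_engine(db_env, section, database="dev"):
    """Build a SQLAlchemy engine from the MySQL config file named by db_env."""
    config = ConfigParser()
    config.read(os.environ[db_env])
    cfg = config[section]
    url = (f"mysql+pymysql://{cfg['user']}:{cfg['password']}"
           f"@{cfg['host']}:{cfg.get('port', '3306')}/{database}")
    return create_engine(url)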
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    logging.getLogger().setLevel(logging.INFO)
    _routine_id = f"{self.date}-{self.iso2}-{self.category}-{self.production}"
    engine = get_mysql_engine("MYSQLDB", "mysqldb",
                              "production" if self.production else "dev")
    Base.metadata.create_all(engine)
    yield GroupDetailsTask(iso2=self.iso2,
                           category=self.category,
                           _routine_id=_routine_id,
                           batchable=BATCHABLE.format("group_details"),
                           env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                                      find_filepath_from_pathstub("/config/mysqldb.config")],
                           job_def="py36_amzn1_image",
                           job_name="GroupDetails-%s" % _routine_id,
                           job_queue="HighPriority",
                           region_name="eu-west-2",
                           poll_time=10,
                           max_live_jobs=100,
                           test=(not self.production))
class TestWiktionaryNgram(unittest.TestCase):
    '''Check that the WiktionaryNgram ORM works as expected'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    def test_good_relation(self):
        session = self.Session()
        ngram = WiktionaryNgram(ngram="something")
        new_ngram = WiktionaryNgram(ngram="something")
        # Add the group and member
        session.add(ngram)
        session.commit()
        # Shouldn't be able to add duplicate data
        del ngram
        session.add(new_ngram)
        self.assertRaises(IntegrityError, session.commit)
        session.rollback()
        session.close()
def run():
    table_name = os.environ["BATCHPAR_table_name"]
    url = os.environ["BATCHPAR_url"]
    db_name = os.environ["BATCHPAR_db_name"]
    s3_path = os.environ["BATCHPAR_outinfo"]

    # Setup the database connectors
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    _class = get_class_by_tablename(Base, table_name)
    Session = try_until_allowed(sessionmaker, engine)
    session = try_until_allowed(Session)

    # Commit the data
    all_pks = set()
    objs = []
    pkey_cols = _class.__table__.primary_key.columns
    for row in iterrows(url):
        if len(row) == 0:
            continue
        if session.query(exists(_class, **row)).scalar():
            continue
        pk = tuple([row[pkey.name] for pkey in pkey_cols])
        if pk in all_pks:
            continue
        all_pks.add(pk)
        objs.append(_class(**row))
    session.bulk_save_objects(objs)
    session.commit()
    session.close()

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
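Several of these batchables mark completion by writing an empty object to the S3 path held in `BATCHPAR_outinfo` via `parse_s3_path`, which is not defined in this section. A minimal sketch, assuming paths of the form `s3://bucket/key`:

# Hypothetical sketch of parse_s3_path -- an assumption, not the source implementation.
def parse_s3_path(path):
    """Split an 's3://bucket/key' path into a (bucket, key) tuple."""
    prefix = "s3://"
    assert path.startswith(prefix), f"Not an S3 path: {path}"
    bucket, _, key = path[len(prefix):].partition("/")
    return bucket, key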
def run():
    batch_file = os.environ['BATCHPAR_batch_file']
    db = os.environ['BATCHPAR_db_name']
    bucket = os.environ['BATCHPAR_bucket']

    # database setup
    engine = get_mysql_engine('BATCHPAR_config', 'mysqldb', db)

    # collect data
    target = f"s3://{bucket}/{batch_file}"
    df = pd.read_json(target, orient='records')
    logging.info(f"{len(df)} locations to geocode")

    # append country iso codes and continent
    df = country_iso_code_dataframe(df)
    logging.info("Country ISO codes appended")

    # geocode, appending latitude and longitude columns, using the q= query method
    df = geocode_batch_dataframe(df, query_method='query_only')
    logging.info("Geocoding complete")

    # remove city and country columns and append done column
    df = df.drop(['city', 'country'], axis=1)
    df['done'] = True

    # convert to list of dict and output to database
    rows = df.to_dict(orient='records')
    logging.info(f"Writing {len(rows)} rows to database")
    with db_session(engine) as session:
        session.bulk_update_mappings(Geographic, rows)
    logging.warning("Batch task complete")
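Many of the snippets rely on `db_session` as a commit-or-rollback context manager around a SQLAlchemy session. Its definition is not included here; a minimal sketch under that assumption:

# Hypothetical sketch of db_session -- assumed behaviour, not the library's code.
from contextlib import contextmanager
from sqlalchemy.orm import sessionmaker

@contextmanager
def db_session(engine):
    """Yield a session bound to engine; commit on success, roll back on error."""
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()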
def load_arxiv_categories(db_config, db, bucket, cat_file):
    """Loads a file of categories and descriptions into mysql from a csv file on s3.

    Args:
        db_config (str): environmental variable pointing to mysql config file
        db (str): config header to use from the mysql config file
        bucket (str): s3 bucket where the csv is held
        cat_file (str): path to the file on s3
    """
    target = f's3://{bucket}/{cat_file}'
    categories = pd.read_csv(target)

    # Setup the database connectors
    engine = get_mysql_engine(db_config, "mysqldb", db)
    try_until_allowed(Base.metadata.create_all, engine)
    Session = try_until_allowed(sessionmaker, engine)
    session = try_until_allowed(Session)

    logging.info(f'found {session.query(Category).count()} existing categories')
    for idx, data in categories.iterrows():
        if not _category_exists(session, data['id']):
            _add_category(session,
                          cat_id=data['id'],
                          description=data['description'])
    session.close()
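`try_until_allowed` wraps calls such as `Base.metadata.create_all` that can fail transiently when many batch jobs hit the database at once. The helper itself is not shown; a minimal retry sketch, assuming the transient failure surfaces as an `OperationalError`:

# Hypothetical sketch of try_until_allowed -- the retry condition is an assumption.
import time
from sqlalchemy.exc import OperationalError

def try_until_allowed(func, *args, **kwargs):
    """Call func until it succeeds, sleeping briefly after each OperationalError."""
    while True:
        try:
            return func(*args, **kwargs)
        except OperationalError:
            time.sleep(5)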
def run(self):
    """Apply health labels using model."""
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    try_until_allowed(Base.metadata.create_all, self.engine)

    # collect and unpickle models from s3
    logging.info("Collecting models from S3")
    s3 = boto3.resource('s3')
    vectoriser_obj = s3.Object(self.bucket, self.vectoriser_key)
    vectoriser = pickle.loads(vectoriser_obj.get()['Body']._raw_stream.read())
    classifier_obj = s3.Object(self.bucket, self.classifier_key)
    classifier = pickle.loads(classifier_obj.get()['Body']._raw_stream.read())

    # retrieve organisations and categories
    nrows = 1000 if self.test else None
    logging.info("Collecting organisations from database")
    with db_session(self.engine) as session:
        orgs = (session
                .query(Organization.id)
                .filter(Organization.is_health.is_(None))
                .limit(nrows)
                .all())

    for batch_count, batch in enumerate(split_batches(orgs, self.insert_batch_size), 1):
        batch_orgs_with_cats = []
        for (org_id, ) in batch:
            with db_session(self.engine) as session:
                categories = (session
                              .query(OrganizationCategory.category_name)
                              .filter(OrganizationCategory.organization_id == org_id)
                              .all())
            # categories should be a list of str, comma separated: ['cat,cat,cat', 'cat,cat']
            categories = ','.join(cat_name for (cat_name, ) in categories)
            batch_orgs_with_cats.append({'id': org_id, 'categories': categories})

        logging.debug(f"{len(batch_orgs_with_cats)} organisations retrieved from database")
        logging.debug("Predicting health flags")
        batch_orgs_with_flag = predict_health_flag(batch_orgs_with_cats,
                                                   vectoriser, classifier)
        logging.debug(f"{len(batch_orgs_with_flag)} organisations to update")

        with db_session(self.engine) as session:
            session.bulk_update_mappings(Organization, batch_orgs_with_flag)
        logging.info(f"{batch_count} batches health labeled and written to db")

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
def extract_data(limit=None, db='patstat_2019_05_13'):
    '''Get all EU patents, grouped and aggregated by their doc families'''
    engine = get_mysql_engine('MYSQLDB', 'mysqldb', db)
    session = generate_temp_tables(engine, limit=limit)
    dfs = temp_tables_to_dfs(engine, limit=limit)
    session.close()
    del session
    return concat_dfs(dfs)
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {'filename': 'eurito/cordis-eu.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_description_project'])

    # collect file
    logging.info('Retrieving project ids')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    project_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(project_ids)} project IDs retrieved from s3")

    # logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate(session
                                    .query(Project)
                                    .filter(Project.rcn.in_(project_ids))
                                    .all()):
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index, doc_type=es_type,
                     id=row.pop('rcn'), body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")
def run(self):
    # database setup
    database = 'dev' if self.test else 'production'
    logging.info(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    # s3 setup
    s3 = boto3.resource('s3')
    intermediate_file = s3.Object(BUCKET, f"mag_estimate_{database}.json")

    eu = get_eu_countries()

    with db_session(self.engine) as session:
        eu_grid_ids = {i.id for i in (session
                                      .query(Institute.id)
                                      .filter(Institute.country.in_(eu))
                                      .all())}
    logging.info(f"{len(eu_grid_ids):,} EU institutes in GRID")

    # collect previous and exclude
    try:
        previous = json.loads(intermediate_file.get()['Body']._raw_stream.read())
        done_institutes = set(previous['institutes'])
        logging.info(f"{len(done_institutes)} previously processed institutes retrieved")
        eu_grid_ids = eu_grid_ids - done_institutes
        logging.info(f"{len(eu_grid_ids)} to process")
        paper_ids = set(previous['paper_ids'])
        logging.info(f"{len(paper_ids)} previously processed papers retrieved")
    except ClientError:
        logging.info("Unable to load previous file, starting from scratch")
        done_institutes = set()
        paper_ids = set()

    limit = 100 if self.test else None
    save_every = 50 if self.test else 1000000
    total = count_papers(eu_grid_ids,
                         done_institutes,
                         paper_ids,
                         intermediate_file,
                         save_every=save_every,
                         limit=limit)

    # mark as done
    logging.info("Task complete")
    logging.info(f"Total EU papers found: {total:,}")
    self.output().touch()
def __init__(self, config_filepath=None, database="dev"):
    if config_filepath is not None:
        os.environ["MYSQLDBCONF"] = config_filepath
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb", database=database)
    Session = sessionmaker(engine)
    Base.metadata.create_all(engine)
    session = Session()
    # Split out n-grams by size (speeds up the extraction later)
    self.ngrams = defaultdict(set)
    for row in session.query(WiktionaryNgram).all():
        size = row.ngram.count("_") + 1
        self.ngrams[size].add(row.ngram)
def __enter__(self):
    '''Set up the database connection, session and query stub.'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb", database=self.database)
    engine.execution_options(stream_results=True)
    Session = sessionmaker(engine)
    Base.metadata.create_all(engine)
    self.session = Session()
    self.query_stub = (self.session
                       .query(Abstracts)
                       .order_by(Abstracts.application_id))
    return self
def run(self):
    db = 'production' if not self.test else 'dev'
    keys = self.get_abstract_file_keys(bucket, key_prefix)
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)

    with db_session(engine) as session:
        if self.test:
            existing_projects = set()
            projects = session.query(Projects.application_id).distinct()
            for p in projects:
                existing_projects.add(int(p.application_id))

        projects_done = set()
        projects_mesh = session.query(ProjectMeshTerms.project_id).distinct()
        for p in projects_mesh:
            projects_done.add(int(p.project_id))

        mesh_term_ids = {int(m.id) for m in session.query(MeshTerms.id).all()}

    logging.info('Inserting associations')
    for key_count, key in enumerate(keys):
        if self.test and (key_count > 2):
            continue
        # collect mesh results from the s3 file and group by project id;
        # each project id has a set of mesh terms and corresponding term ids
        df_mesh = retrieve_mesh_terms(bucket, key)
        project_terms = self.format_mesh_terms(df_mesh)

        # go through documents
        for project_count, (project_id, terms) in enumerate(project_terms.items()):
            rows = []
            if self.test and (project_count > 2):
                continue
            if (project_id in projects_done) or (project_id not in existing_projects):
                continue
            for term, term_id in zip(terms['terms'], terms['ids']):
                term_id = int(term_id)
                # add term to mesh term table if not present
                if term_id not in mesh_term_ids:
                    objs = insert_data(self.db_config_env, 'mysqldb', db,
                                       Base, MeshTerms,
                                       [{'id': term_id, 'term': term}],
                                       low_memory=True)
                    mesh_term_ids.update({term_id})
                # prepare row to be added to the project-mesh_term link table
                rows.append({'project_id': project_id, 'mesh_term_id': term_id})
            # insert rows into the link table
            insert_data(self.db_config_env, 'mysqldb', db, Base,
                        ProjectMeshTerms, rows, low_memory=True)
    self.output().touch()  # populate project-mesh_term link table
def run(self):
    """Collect and process organizations, categories and long descriptions."""
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    try_until_allowed(Base.metadata.create_all, self.engine)
    limit = 2000 if self.test else None
    batch_size = 30 if self.test else 1000

    with db_session(self.engine) as session:
        all_orgs = (session
                    .query(Organisation.id, Organisation.addresses)
                    .limit(limit)
                    .all())
        existing_org_location_ids = session.query(OrganisationLocation.id).all()
    logging.info(f"{len(all_orgs)} organisations retrieved from database")
    logging.info(f"{len(existing_org_location_ids)} organisations have previously been processed")

    # convert to a list of dictionaries with the nested addresses unpacked
    orgs = get_orgs_to_process(all_orgs, existing_org_location_ids)
    logging.info(f"{len(orgs)} new organisations to geocode")

    total_batches = ceil(len(orgs) / batch_size)
    logging.info(f"{total_batches} batches")
    completed_batches = 0
    for batch in split_batches(orgs, batch_size=batch_size):
        # geocode first to add missing country for UK
        batch = map(geocode_uk_with_postcode, batch)
        batch = map(add_country_details, batch)

        # remove data not in OrganisationLocation columns
        org_location_cols = OrganisationLocation.__table__.columns.keys()
        batch = [{k: v for k, v in org.items() if k in org_location_cols}
                 for org in batch]

        insert_data(self.db_config_env, 'mysqldb', database,
                    Base, OrganisationLocation, batch)
        completed_batches += 1
        logging.info(f"Completed {completed_batches} of {total_batches} batches")

        if self.test and completed_batches > 1:
            logging.warning("Breaking after 2 batches in test mode")
            break

    # mark as done
    logging.warning("Finished task")
    self.output().touch()
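`split_batches` is used here and in later snippets to chunk rows before bulk inserts and updates. A minimal generator with the same calling convention (`iterable, batch_size`), offered as a sketch rather than the actual helper:

# Hypothetical sketch of split_batches -- assumed to yield lists of at most batch_size items.
def split_batches(iterable, batch_size):
    """Yield successive lists of up to batch_size items from iterable."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch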
def run(self):
    limit = 100 if self.test else None
    flush_freq = 33 if self.test else 5000

    # Get connection settings
    engine = get_mysql_engine('MYSQLDB', 'nesta',
                              'dev' if self.test else 'production')
    conf = get_config('neo4j.config', 'neo4j')
    gkwargs = dict(host=conf['host'], secure=True,
                   auth=(conf['user'], conf['password']))

    # Drop all neo4j data in advance
    # (WARNING: this is a hack in lieu of proper db staging/versioning)
    with graph_session(**gkwargs) as tx:
        logging.info('Dropping all previous data')
        tx.graph.delete_all()
        for constraint in tx.run('CALL db.constraints'):
            logging.info(f'Dropping constraint {constraint[0]}')
            tx.run(f'DROP {constraint[0]}')

    # Iterate over all tables in the ORM
    for tablename, table in Base.metadata.tables.items():
        entity_name = _extract_name(tablename)
        logging.info(f'\tProcessing {entity_name}')
        orm, parent_orm, rel_name = prepare_base_entities(table)
        # Insert data to neo4j in one session per table,
        # to enable constraint and relationship lookups
        # after insertion
        irow = 0
        uninterrupted = False
        while not uninterrupted:
            uninterrupted = True
            with graph_session(**gkwargs) as tx:
                # Iterate over rows in the database
                for db, orm_instance in db_session_query(query=orm,
                                                         engine=engine,
                                                         limit=limit,
                                                         offset=irow):
                    irow += 1
                    if irow == limit:
                        break
                    # Convert the ORM row to a neo4j object, and insert
                    orm_to_neo4j(session=db, transaction=tx,
                                 orm_instance=orm_instance,
                                 parent_orm=parent_orm,
                                 rel_name=rel_name)
                    if (irow % flush_freq) == 0:
                        logging.info(f'\t\tFlushing at row {irow}')
                        uninterrupted = False
                        break

    # Confirm the task is finished
    self.output().touch()
def run():
    batch_file = os.environ['BATCHPAR_batch_file']
    bucket = os.environ['BATCHPAR_bucket']
    db_name = os.environ['BATCHPAR_db_name']
    db_env = "BATCHPAR_config"
    db_section = "mysqldb"

    # Setup the database connectors
    engine = get_mysql_engine(db_env, db_section, db_name)
    try_until_allowed(Base.metadata.create_all, engine)

    # Retrieve RCNs to iterate over
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    all_rcn = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(all_rcn)} project RCNs retrieved from s3")

    # Retrieve all topics
    data = defaultdict(list)
    for i, rcn in enumerate(all_rcn):
        logging.info(i)
        project, orgs, reports, pubs = fetch_data(rcn)
        if project is None:
            continue
        _topics = project.pop('topics')
        _calls = project.pop('proposal_call')
        # NB: Order below matters due to FK constraints!
        data['projects'].append(project)
        data['reports'] += prepare_data(reports, rcn)
        data['publications'] += prepare_data(pubs, rcn)
        data['organisations'] += extract_core_orgs(orgs, rcn)
        data['project_organisations'] += prepare_data(orgs, rcn)
        for topics, project_topics in split_links(_topics, rcn):
            data['topics'].append(topics)
            data['project_topics'].append(project_topics)
        for calls, project_calls in split_links(_calls, rcn):
            data['proposal_calls'].append(calls)
            data['project_proposal_calls'].append(project_calls)

    # Pipe the data to the db
    for table_prefix, rows in data.items():
        table_name = f'cordis_{table_prefix}'
        logging.info(table_name)
        _class = get_class_by_tablename(Base, table_name)
        insert_data(db_env, db_section, db_name, Base,
                    _class, rows, low_memory=True)
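`get_class_by_tablename` maps a table name such as `cordis_projects` back to its ORM class. A plausible sketch, assuming a SQLAlchemy 1.4+ declarative `Base` whose registry exposes all mappers:

# Hypothetical sketch of get_class_by_tablename -- the registry lookup is an assumption.
def get_class_by_tablename(base, table_name):
    """Return the ORM class whose __tablename__ matches table_name."""
    for mapper in base.registry.mappers:
        cls = mapper.class_
        if getattr(cls, "__tablename__", None) == table_name:
            return cls
    raise NameError(f"No ORM class found for table {table_name}")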
def run(self):
    '''Extract the topics of interest'''
    database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    members_limit = get_members_by_percentile(engine, perc=self.members_perc)
    topics = get_core_topics(engine,
                             core_categories=self.core_categories,
                             members_limit=members_limit,
                             perc=self.topic_perc)

    # Write the intermediate output
    with self.output().open('wb') as outstream:
        outstream.write(json.dumps(list(topics)).encode('utf8'))
def prepare(self):
    # mysql setup
    db = 'production' if not self.test else 'dev'
    engine = get_mysql_engine(MYSQLDB_ENV, "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()
    project_query = session.query(Projects)

    # elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset='nih', aliases='health_scanner')

    batches = self.batch_limits(project_query, BATCH_SIZE)
    job_params = []
    for start, end in batches:
        params = {'start_index': start,
                  'end_index': end,
                  'config': "mysqldb.config",
                  'db': db,
                  'outinfo': es_config['host'],
                  'out_port': es_config['port'],
                  'out_index': es_config['index'],
                  'out_type': es_config['type'],
                  'aws_auth_region': es_config['region'],
                  'done': es.exists(index=es_config['index'],
                                    doc_type=es_config['type'],
                                    id=end),
                  'entity_type': 'paper'}
        job_params.append(params)
    return job_params
class TestPatstat(unittest.TestCase):
    '''Check that the Patstat EU ORM works as expected'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    def test_build(self):
        pass
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])  # example parameter
    s3_path = os.environ["BATCHPAR_outinfo"]
    start_string = os.environ["BATCHPAR_start_string"]  # example parameter
    offset = int(os.environ["BATCHPAR_offset"])

    # reduce records in test mode
    if test:
        limit = 50
        logging.info(f"Limiting to {limit} rows in test mode")
    else:
        limit = batch_size

    logging.info(f"Processing {offset} - {offset + limit}")

    # database setup
    logging.info(f"Using {db_name} database")
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)

    with db_session(engine) as session:
        # consider moving this query and the one from the prepare step into a package
        batch_records = (session
                         .query(MyTable.id, MyTable.name)
                         .filter(MyTable.founded_on > '2007-01-01')
                         .offset(offset)
                         .limit(limit))

    # process and insert data
    processed_batch = []
    for row in batch_records:
        processed_row = some_func(start_string=start_string, row=row)
        processed_batch.append(processed_row)

    logging.info(f"Inserting {len(processed_batch)} rows")
    insert_data("BATCHPAR_config", 'mysqldb', db_name, Base,
                MyOtherTable, processed_batch, low_memory=True)

    logging.info(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.info("Batch job complete.")
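`insert_data` appears throughout these snippets with the signature `(db_env, section, database, Base, OrmClass, rows, low_memory=...)`. The actual helper presumably does more (for example primary-key deduplication and a low-memory mode); the rough sketch below only creates the tables and bulk-inserts the row dicts, reusing the `get_mysql_engine` and `db_session` sketches above:

# Hypothetical sketch of insert_data -- ignores low_memory and any deduplication logic.
def insert_data(db_env, section, database, base, orm_class, rows, low_memory=False):
    """Create the ORM tables and insert rows (a list of dicts), returning the objects."""
    engine = get_mysql_engine(db_env, section, database)
    base.metadata.create_all(engine)
    objs = [orm_class(**row) for row in rows]
    with db_session(engine) as session:
        session.add_all(objs)
    return objs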
def run(self):
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    # collect mesh terms from S3
    bucket = 'innovation-mapping-general'
    key = 'crunchbase_descriptions/crunchbase_descriptions_mesh.txt'
    mesh_terms = retrieve_mesh_terms(bucket, key)
    mesh_terms = format_mesh_terms(mesh_terms)  # [{'id': ['term1', 'term2']}, ...]
    logging.info(f"File contains {len(mesh_terms)} orgs with mesh terms")

    logging.info("Extracting previously processed orgs")
    with db_session(self.engine) as session:
        all_orgs = session.query(Organization.id, Organization.mesh_terms).all()
    processed_orgs = {org_id for (org_id, mesh_terms) in all_orgs
                      if mesh_terms is not None}
    all_orgs = {org_id for (org_id, _) in all_orgs}
    logging.info(f"{len(all_orgs)} total orgs in database")
    logging.info(f"{len(processed_orgs)} previously processed orgs")

    # reformat for batch insert, removing not found and previously processed terms
    meshed_orgs = [{'id': org_id, 'mesh_terms': '|'.join(terms)}
                   for org_id, terms in mesh_terms.items()
                   if org_id in all_orgs and org_id not in processed_orgs]
    logging.info(f"{len(meshed_orgs)} organisations to update in database")

    for count, batch in enumerate(split_batches(meshed_orgs, self.insert_batch_size), 1):
        with db_session(self.engine) as session:
            session.bulk_update_mappings(Organization, batch)
        logging.info(f"{count} batch{'es' if count > 1 else ''} written to db")
        if self.test and count > 1:
            logging.info("Breaking after 2 batches while in test mode")
            break

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
def run(self):
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    # collect file
    logging.info("Collecting org_parents from crunchbase tar")
    org_parents = get_files_from_tar(['org_parents'])[0]
    logging.info(f"{len(org_parents)} parent ids in crunchbase export")

    # collect previously processed orgs
    logging.info("Extracting previously processed organisations")
    with db_session(self.engine) as session:
        processed_orgs = session.query(Organization.id, Organization.parent_id).all()
    all_orgs = {org for (org, _) in processed_orgs}
    logging.info(f"{len(all_orgs)} total orgs in database")
    processed_orgs = {org for (org, parent_id) in processed_orgs
                      if parent_id is not None}
    logging.info(f"{len(processed_orgs)} previously processed orgs")

    # reformat into a list of dicts, removing orgs that already have a parent_id
    # or are missing from the database
    org_parents = org_parents[['uuid', 'parent_uuid']]
    org_parents.columns = ['id', 'parent_id']
    org_parents = org_parents[org_parents['id'].isin(all_orgs)]
    org_parents = org_parents[~org_parents['id'].isin(processed_orgs)]
    org_parents = org_parents.to_dict(orient='records')
    logging.info(f"{len(org_parents)} organisations to update in MYSQL")

    # insert parent_ids into db in batches
    for count, batch in enumerate(split_batches(org_parents, self.insert_batch_size), 1):
        with db_session(self.engine) as session:
            session.bulk_update_mappings(Organization, batch)
        logging.info(f"{count} batch{'es' if count > 1 else ''} written to db")
        if self.test and count > 1:
            logging.info("Breaking after 2 batches while in test mode")
            break

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
def prepare(self):
    """Prepare the batch job parameters"""
    # database setup
    database = 'dev' if self.test else 'production'
    logging.info(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    if self.test and database == 'dev':
        logging.warning('Dropping tables')
        Base.metadata.drop_all(self.engine)
    try_until_allowed(Base.metadata.create_all, self.engine)

    with db_session(self.engine) as session:
        if self.test:
            logging.info("Adding test data")
            test_data = []
            for i in range(1000):
                test_data.append(MyTable(id=i, founded_on='2009-01-01'))
            session.add_all(test_data)
            session.commit()

        logging.info('Retrieving list of records to process')
        total_records = (session
                         .query(MyTable.id)
                         .filter(MyTable.founded_on > '2007-01-01')
                         .count())

    job_params = []  # dictionaries of environmental variables for each batch
    # potential method of generating batches:
    for count, offset in enumerate(range(0, total_records, self.batch_size)):
        key = f"{self.date}_batch_{offset}_{database}"
        done = key in DONE_KEYS
        params = {"config": "mysqldb.config",
                  "db_name": database,
                  "test": self.test,
                  "outinfo": f"s3://{self.intermediate_bucket}/{key}",
                  "done": done,
                  "batch_size": self.batch_size,  # example parameter
                  "start_string": self.start_string,  # example parameter
                  "offset": offset}
        job_params.append(params)
        logging.info(params)
        if self.test and count == TEST_BATCHES:
            logging.info(f"Only {TEST_BATCHES} batches created in test mode")
            break
    return job_params
def test_object_to_dict(self):
    parents = [{"_id": 10, "_another_id": 2, "some_field": 20},
               {"_id": 20, "_another_id": 2, "some_field": 20}]
    _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                           Base, DummyModel, parents)
    assert len(parents) == len(_parents)

    children = [{"_id": 10, "parent_id": 10},
                {"_id": 10, "parent_id": 20},
                {"_id": 20, "parent_id": 20},
                {"_id": 30, "parent_id": 20}]
    _children = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                            Base, DummyChild, children)
    assert len(children) == len(_children)

    # Re-retrieve parents from the database
    found_children = set()
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    with db_session(engine) as session:
        for p in session.query(DummyModel).all():
            row = object_to_dict(p)
            assert type(row) is dict
            assert len(row['children']) > 0
            _found_children = set((c['_id'], c['parent_id'])
                                  for c in row['children'])
            found_children = found_children.union(_found_children)

            _row = object_to_dict(p, shallow=True)
            assert 'children' not in _row
            del row['children']
            assert row == _row
    assert len(found_children) == len(children) == len(_children)
class TestMeetup(unittest.TestCase):
    '''Currently just a placeholder test to check that the schema compiles'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    def test_constraints(self):
        '''Placeholder for if any constraints are added'''
        session = self.Session()
        session.close()
class TestRun(TestCase):
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    @mock.patch.dict(os.environ, environ)
    @mock.patch('nesta.core.batchables.meetup.members_groups.run.boto3')
    def test_members_groups(self, boto3):
        n = members_groups.run.run()
        self.assertGreater(n, 0)
def prepare(self):
    """Copies any new city/county combinations from the input table into the
    geographic_data table. All rows which have previously not been processed
    will be split into batches.

    Returns:
        (:obj:`list` of :obj:`dict`) job parameters for each of the batch tasks
    """
    # set up database connectors
    self.database = 'dev' if self.test else 'production'
    self.engine = get_mysql_engine(self.db_config_env, "mysqldb", self.database)
    try_until_allowed(Base.metadata.create_all, self.engine)

    # s3 setup
    self.s3 = boto3.resource('s3')

    # identify new locations in the input table and copy them to the geographic table
    if self.location_key_col is not None:
        self._insert_new_locations()
    else:
        self._insert_new_locations_no_id()

    # create batches from all locations which have not previously been coded
    job_params = []
    uncoded_locations = self._get_uncoded()
    if uncoded_locations:
        for batch_file in self._create_batches(uncoded_locations):
            params = {"batch_file": batch_file,
                      "config": 'mysqldb.config',
                      "db_name": self.database,
                      "bucket": self.intermediate_bucket,
                      "done": False,
                      "outinfo": '',
                      "test": self.test}
            job_params.append(params)
            logging.info(params)
        logging.info(f"{len(job_params)} batches to run")
    else:
        logging.warning("no new locations to geocode")
    return job_params
def test_db_session_query(self):
    parents = [{"_id": i, "_another_id": i, "some_field": 20}
               for i in range(0, 1000)]
    _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                           Base, DummyModel, parents)

    # Re-retrieve parents from the database
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")

    # Test for limit = 3
    limit = 3
    old_db = mock.MagicMock()
    old_db.is_active = False
    n_rows = 0
    for db, row in db_session_query(query=DummyModel, engine=engine,
                                    chunksize=10, limit=limit):
        assert type(row) is DummyModel
        if old_db != db:
            assert len(old_db.transaction._connections) == 0
            assert len(db.transaction._connections) > 0
            old_db = db
        n_rows += 1
    assert n_rows == limit

    # Test for limit = None
    old_db = mock.MagicMock()
    old_db.is_active = False
    n_rows = 0
    for db, row in db_session_query(query=DummyModel, engine=engine,
                                    chunksize=100, limit=None):
        assert type(row) is DummyModel
        if old_db != db:
            assert len(old_db.transaction._connections) == 0
            assert len(db.transaction._connections) > 0
            old_db = db
        n_rows += 1
    assert n_rows == len(parents) == 1000
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    table = os.environ["BATCHPAR_table"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])
    s3_path = os.environ["BATCHPAR_outinfo"]

    logging.warning(f"Processing {table} file")

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    table_name = f"crunchbase_{table}"
    table_class = get_class_by_tablename(Base, table_name)

    # collect file
    nrows = 1000 if test else None
    df = get_files_from_tar([table], nrows=nrows)[0]
    logging.warning(f"{len(df)} rows in file")

    # get primary key fields and set of all those already existing in the db
    pk_cols = list(table_class.__table__.primary_key.columns)
    pk_names = [pk.name for pk in pk_cols]
    with db_session(engine) as session:
        existing_rows = set(session.query(*pk_cols).all())

    # process and insert data
    processed_rows = process_non_orgs(df, existing_rows, pk_names)
    for batch in split_batches(processed_rows, batch_size):
        insert_data("BATCHPAR_config", 'mysqldb', db_name, Base,
                    table_class, batch, low_memory=True)

    logging.warning(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.warning("Batch job complete.")
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    outinfo = os.environ["BATCHPAR_outinfo"]
    output_bucket = 'clio-text2vec'

    # reduce records in test mode
    if test:
        limit = 50
        logging.info(f"Limiting to {limit} rows in test mode")
    else:
        limit = None

    # database setup
    logging.info(f"Using {db_name} database")

    # Get IDs from S3
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(ids)} article IDs retrieved from s3")

    # Connect to SQL
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    with db_session(engine) as session:
        batch_records = (session
                         .query(Projects.abstractText)
                         .filter(Projects.id.in_(ids))
                         # .limit(limit)
                         .all())

    # Process and insert data
    vectors = docs2vectors([batch.abstractText for batch in batch_records])
    processed_batch = {id_: vector.tolist()
                       for id_, vector in zip(ids, vectors)}
    logging.info(f"Inserting {len(processed_batch)} rows")

    # Store batched vectors in S3
    s3 = boto3.resource('s3')
    obj = s3.Object(output_bucket, f'{outinfo}.json')
    obj.put(Body=json.dumps(processed_batch))