def test_setup_es_bad_es_mode(mock_get_es_mapping, mock_Elasticsearch,
                              mock_assert_correct_config, mock_get_config):
    with pytest.raises(ValueError):
        setup_es(es_mode="dave", test_mode=False, drop_and_recreate=False,
                 dataset=None, aliases=None)
def test_setup_es_no_create_if_exists(mock_get_es_mapping, mock_Elasticsearch,
                                      mock_assert_correct_config, mock_get_config):
    mock_Elasticsearch.return_value.indices.exists.return_value = True
    setup_es(es_mode="dev", test_mode=True, drop_and_recreate=False,
             dataset=None, aliases=None)
    assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
    assert mock_Elasticsearch.return_value.indices.create.call_count == 0
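# A possible companion test (a sketch, not part of the original suite): it assumes
# the same mock fixtures as the tests above and that setup_es creates the index
# when it does not already exist. The assertion on indices.create is therefore an
# assumption about setup_es's behaviour, not a documented guarantee.
def test_setup_es_create_if_missing(mock_get_es_mapping, mock_Elasticsearch,
                                    mock_assert_correct_config, mock_get_config):
    mock_Elasticsearch.return_value.indices.exists.return_value = False
    setup_es(es_mode="dev", test_mode=True, drop_and_recreate=False,
             dataset=None, aliases=None)
    # Nothing should be deleted, and the missing index should be created once
    assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
    assert mock_Elasticsearch.return_value.indices.create.call_count == 1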
def prepare(self):
    '''Chunk up article ids from the old index, and prepare batch job
    parameters over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size} while in test mode")
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset='nih', aliases='health_scanner',
                             increment_version=True)

    # Count articles from the old index
    _old_config = es_config.copy()
    _old_config['index'] = es_config['old_index']
    logging.info("Collecting article IDs...")
    _ids = get_es_ids(es, _old_config, size=10000)
    logging.info(f"Collected {len(_ids)} IDs")
    done_ids = get_es_ids(es, es_config, size=10000)

    # Generate the job params
    job_params = []
    batches = split_batches(_ids, self.process_batch_size)
    for count, batch in enumerate(batches, 1):
        # Magical '0.3' is the lower end of the deduplication
        # fraction found by inspection
        done = sum(_id in done_ids for _id in batch) / len(batch) > 0.3
        # Write batch of ids to s3
        batch_file = ''
        if not done:
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": 'mysqldb.config',
            "bucket": self.intermediate_bucket,
            "done": done,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'in_index': es_config['old_index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'entity_type': 'paper',
            'test': self.test,
            'routine_id': self.routine_id
        }
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in test mode.")
            logging.warning(job_params)
            break
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params
def prepare(self):
    '''Chunk up elasticsearch data, and submit batch jobs over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size} while in test mode")

    # Setup elasticsearch and extract all ids
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             drop_and_recreate=False,
                             dataset=self.dataset,
                             increment_version=False)
    ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
    ids = ids - self._done_ids  # Don't repeat done ids

    # Override the default index if specified
    es_config['index'] = (self.index if self.index is not None
                          else es_config['index'])

    # Generate the job params
    job_params = []
    batches = split_batches(ids, self.process_batch_size)
    for count, batch in enumerate(batches, 1):
        done = False  # Already taken care of with _done_ids
        # Write batch of ids to s3
        batch_file = ''
        if not done:
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": self.sql_config_filename,
            "bucket": self.intermediate_bucket,
            "done": done,
            "count": len(ids),
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'test': self.test,
            'routine_id': self.routine_id,
            'entity_type': self.entity_type,
            **self.kwargs
        }
        job_params.append(params)
        # Test mode
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in test mode.")
            logging.warning(job_params)
            break
    # Done
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params
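# For orientation only: a minimal sketch of the chunking behaviour that the
# prepare() methods here rely on. This is an assumption about what split_batches
# does (yield successive lists of at most batch_size items), not the library
# implementation itself.
def split_batches_sketch(iterable, batch_size):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # final, possibly smaller, batch
        yield batch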
def done_ids(self):
    '''Retrieve the ids of documents which already have the
    "terms_tokens_article" field, i.e. have already been processed.'''
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             drop_and_recreate=False,
                             dataset=self.dataset,
                             increment_version=False)
    field = "terms_tokens_article"
    ids = get_es_ids(es, es_config, size=10000,
                     query={"query": {"exists": {"field": field}}})
    return ids
def prepare(self):
    # mysql setup
    db = 'production' if not self.test else 'dev'
    engine = get_mysql_engine(MYSQLDB_ENV, "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()
    project_query = session.query(Projects)

    # elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset='nih', aliases='health_scanner')

    batches = self.batch_limits(project_query, BATCH_SIZE)
    job_params = []
    for start, end in batches:
        params = {
            'start_index': start,
            'end_index': end,
            'config': "mysqldb.config",
            'db': db,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'done': es.exists(index=es_config['index'],
                              doc_type=es_config['type'],
                              id=end),
            'entity_type': 'paper'
        }
        job_params.append(params)
    return job_params
def prepare(self):
    # mysql setup
    db = 'production' if not self.test else 'dev'

    # elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             drop_and_recreate=False,
                             dataset='nih', aliases='health_scanner')

    # s3 setup and file key collection
    bucket = 'innovation-mapping-general'
    key_prefix = 'nih_abstracts_processed/22-07-2019/nih_'
    keys = self.get_abstract_file_keys(bucket, key_prefix)
    logging.info(f"Found keys: {keys}")

    # In test mode, manually filter keys for those which
    # contain our data
    if self.test:
        keys = subset_keys(es, es_config, keys)

    job_params = []
    for key in keys:
        done = ((not self.test)
                and self.done_check(es, index=es_config['index'],
                                    doc_type=es_config['type'],
                                    key=key))
        params = {
            's3_key': key,
            's3_bucket': bucket,
            'dupe_file': ("nih_abstracts/24-05-19/"
                          "duplicate_mapping.json"),
            'config': "mysqldb.config",
            'db': db,
            'outinfo': es_config,
            'done': done,
            'entity_type': 'paper'
        }
        logging.info(params)
        job_params.append(params)
    return job_params
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    _routine_id = "{}-{}".format(self.date, self.production)
    grid_task_kwargs = {
        '_routine_id': _routine_id,
        'db_config_path': self.db_config_path,
        'db_config_env': 'MYSQLDB',
        'mag_config_path': 'mag.config',
        'test': not self.production,
        'insert_batch_size': self.insert_batch_size,
        'articles_from_date': self.articles_from_date,
        'date': self.date,
    }

    cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                     '.n_hidden_27-0.VECTORIZER.binary_True'
                     '.min_df_0-001.NGRAM.TEST_False.json')
    if not self.production:
        cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                         '.n_hidden_36-0.VECTORIZER.binary_True'
                         '.min_df_0-001.NGRAM.TEST_True.json')

    kwargs = {
        'score_field': 'metric_novelty_article',
        'fields': ['textBody_abstract_article']
    }
    test = not self.production
    routine_id = f"ArxivLolveltyTask-{self.date}-{test}"

    # Elasticsearch setup
    dataset = 'arxiv'
    _, es_config = setup_es('prod' if self.production else 'dev',
                            not self.production,
                            self.drop_and_recreate,
                            dataset=dataset)

    yield ArxivElasticsearchTask(date=self.date,
                                 routine_id=routine_id,
                                 grid_task_kwargs=grid_task_kwargs,
                                 test=not self.production,
                                 index=es_config['index'],
                                 dataset='arxiv',
                                 entity_type='article',
                                 kwargs=kwargs,
                                 batchable=f3p("batchables/novelty"
                                               "/lolvelty"),
                                 env_files=[f3p("nesta/"),
                                            f3p("config/mysqldb.config"),
                                            f3p("config/"
                                                "elasticsearch.config")],
                                 job_def="py36_amzn1_image",
                                 job_name=routine_id,
                                 job_queue="HighPriority",
                                 region_name="eu-west-2",
                                 poll_time=10,
                                 memory=1024,
                                 max_live_jobs=30)

    yield AnalysisTask(date=self.date,
                       grid_task_kwargs=grid_task_kwargs,
                       _routine_id=_routine_id,
                       db_config_path=self.db_config_path,
                       db_config_env='MYSQLDB',
                       mag_config_path='mag.config',
                       test=not self.production,
                       insert_batch_size=self.insert_batch_size,
                       articles_from_date=self.articles_from_date,
                       cherry_picked=cherry_picked)
logging.warning("Batch job complete.") if __name__ == "__main__": log_stream_handler = logging.StreamHandler() logging.basicConfig(handlers=[ log_stream_handler, ], level=logging.INFO, format="%(asctime)s:%(levelname)s:%(message)s") if 'BATCHPAR_outinfo' not in os.environ: from nesta.core.orms.orm_utils import setup_es es, es_config = setup_es('dev', True, True, dataset='crunchbase', aliases='health_scanner') environ = { "AWSBATCHTEST": "", 'BATCHPAR_batch_file': 'crunchbase_to_es-15597291977144725.json', 'BATCHPAR_config': ('/home/ec2-user/nesta/nesta/' 'core/config/mysqldb.config'), 'BATCHPAR_db_name': 'production', 'BATCHPAR_bucket': 'nesta-production-intermediate', 'BATCHPAR_done':
                           grid_institutes[g].title()
                           for g in good_institutes]
        uid = row.pop('id')
        _row = es.index(index=es_index, doc_type=es_type,
                        id=uid, body=row)
        if not count % 1000:
            logging.info(f"{count} rows loaded to elasticsearch")

    logging.warning("Batch job complete.")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        es, es_config = setup_es('dev', True, True, dataset='arxiv-eu')
        environ = {'config': ('/home/ec2-user/nesta-eu/nesta/'
                              'core/config/mysqldb.config'),
                   'batch_file': ('arxiv-eu_EURITO-ElasticsearchTask-'
                                  '2019-10-12-True-157124660046601.json'),
                   'db_name': 'dev',
                   'bucket': 'nesta-production-intermediate',
                   'done': "False",
                   'outinfo': ('https://search-eurito-dev-'
                               'vq22tw6otqjpdh47u75bh2g7ba.'
                               'eu-west-2.es.amazonaws.com'),
                   'out_port':
def prepare(self):
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size} while in test mode")

    # MySQL setup
    self.database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', self.database)

    # Elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset='crunchbase',
                             aliases='health_scanner')

    # Get set of existing ids from elasticsearch via scroll
    scanner = scan(es, query={"_source": False},
                   index=es_config['index'],
                   doc_type=es_config['type'])
    existing_ids = {s['_id'] for s in scanner}
    logging.info(f"Collected {len(existing_ids)} existing in Elasticsearch")

    # Get set of all organisations from mysql
    all_orgs = list(all_org_ids(engine))
    logging.info(f"{len(all_orgs)} organisations in MySQL")

    # Remove previously processed
    orgs_to_process = list(org for org in all_orgs
                           if org not in existing_ids)
    logging.info(f"{len(orgs_to_process)} to be processed")

    job_params = []
    for count, batch in enumerate(
            split_batches(orgs_to_process, self.process_batch_size), 1):
        logging.info(f"Processing batch {count} with size {len(batch)}")
        # Write batch of ids to s3
        batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                  'crunchbase_to_es')
        params = {"batch_file": batch_file,
                  "config": 'mysqldb.config',
                  "db_name": self.database,
                  "bucket": self.intermediate_bucket,
                  "done": False,
                  'outinfo': es_config['host'],
                  'out_port': es_config['port'],
                  'out_index': es_config['index'],
                  'out_type': es_config['type'],
                  'aws_auth_region': es_config['region'],
                  'entity_type': 'company',
                  "test": self.test}
        logging.info(params)
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in test mode.")
            break
    logging.warning("Batch preparation completed, "
                    f"with {len(job_params)} batches")
    return job_params
        row = object_to_dict(obj)
        row = reformat_row(row)
        es.index(index=es_index, doc_type=es_type,
                 id=row.pop('rcn'), body=row)
        if not count % 1000:
            logging.info(f"{count} rows loaded to elasticsearch")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        es, es_config = setup_es('dev', True, True, dataset='cordis-eu')
        environ = {'config': find_filepath_from_pathstub('mysqldb.config'),
                   'batch_file': ('cordis-eu_EURITO-ElasticsearchTask-'
                                  '2020-04-10-True-15865345336407135.json'),
                   'db_name': 'dev',
                   'bucket': 'nesta-production-intermediate',
                   'outinfo': es_config['host'],
                   'out_port': es_config['port'],
                   'out_index': es_config['index'],
def prepare(self):
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size} while in test mode")

    # MySQL setup
    database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, self.db_section, database)

    # Elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset=self.dataset,
                             aliases=self.aliases)

    # Get set of existing ids from elasticsearch via scroll
    existing_ids = get_es_ids(es, es_config)
    logging.info(f"Collected {len(existing_ids)} existing in Elasticsearch")

    # Get set of all organisations from mysql
    with db_session(engine) as session:
        result = session.query(self.id_field).all()
        all_ids = {r[0] for r in result}
    logging.info(f"{len(all_ids)} organisations in MySQL")

    # Remove previously processed
    ids_to_process = (org for org in all_ids
                      if org not in existing_ids)

    job_params = []
    for count, batch in enumerate(
            split_batches(ids_to_process, self.process_batch_size), 1):
        # Write batch of ids to s3
        batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                  self.routine_id)
        params = {"batch_file": batch_file,
                  "config": 'mysqldb.config',
                  "db_name": database,
                  "bucket": self.intermediate_bucket,
                  "done": False,
                  'outinfo': es_config['host'],
                  'out_port': es_config['port'],
                  'out_index': es_config['index'],
                  'out_type': es_config['type'],
                  'aws_auth_region': es_config['region'],
                  'entity_type': self.entity_type,
                  'test': self.test,
                  'routine_id': self.routine_id}
        params.update(self.kwargs)
        logging.info(params)
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in test mode.")
            logging.warning(job_params)
            break
    logging.warning("Batch preparation completed, "
                    f"with {len(job_params)} batches")
    return job_params
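# For orientation only: a hypothetical sketch of the S3 hand-off used by the
# prepare() methods above. The key format and boto3 usage here are assumptions;
# the real put_s3_batch lives elsewhere in the nesta codebase and may differ.
import json
import time

import boto3


def put_s3_batch_sketch(batch, bucket, prefix):
    """Serialise a batch of ids to JSON, upload it to S3 and return the object key."""
    key = f"{prefix}-{int(time.time() * 1000)}.json"
    boto3.resource("s3").Object(bucket, key).put(Body=json.dumps(list(batch)))
    return key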
        es.index(index=es_index, doc_type=es_type, id=uid, body=row)

    # Also upload the data to S3
    silo.put(data, dataset)


if __name__ == "__main__":
    log_stream_handler = logging.StreamHandler()
    logging.basicConfig(handlers=[log_stream_handler, ],
                        level=logging.INFO,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    if 'BATCHPAR_outinfo' not in os.environ:
        es, es_config = setup_es(es_mode="dev", test_mode=True,
                                 drop_and_recreate=True,
                                 dataset='example',
                                 aliases='example')
        #environ = {"AWSBATCHTEST": "",  ## << This means don't write to ES
        environ = {"BATCHPAR_aws_auth_region": es_config["region"],
                   "BATCHPAR_outinfo": es_config["host"],
                   "BATCHPAR_dataset": "example",
                   "BATCHPAR_done": "False",
                   "BATCHPAR_age_increment": "-3",
                   "BATCHPAR_start_index": "0",
                   "BATCHPAR_end_index": "3",
                   "BATCHPAR_out_type": es_config["type"],
                   "BATCHPAR_out_port": es_config["port"],
                   "BATCHPAR_out_index": es_config["index"],
                   "BATCHPAR_entity_type": "muppet"}
        for k, v in environ.items():