def prepare(self):
    '''Chunk up article IDs from the old index, and prepare batch job
    parameters over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             self.drop_and_recreate,
                             dataset='nih',
                             aliases='health_scanner',
                             increment_version=True)

    # Count articles from the old index
    _old_config = es_config.copy()
    _old_config['index'] = es_config['old_index']
    logging.info("Collecting article IDs...")
    _ids = get_es_ids(es, _old_config, size=10000)
    logging.info(f"Collected {len(_ids)} IDs")
    done_ids = get_es_ids(es, es_config, size=10000)

    # Generate the job params
    job_params = []
    batches = split_batches(_ids, self.process_batch_size)
    for count, batch in enumerate(batches, 1):
        # Magical '0.3' is the lower end of the deduplication
        # fraction found by inspection
        done = sum(_id in done_ids for _id in batch) / len(batch) > 0.3
        # write batch of ids to s3
        batch_file = ''
        if not done:
            batch_file = put_s3_batch(batch,
                                      self.intermediate_bucket,
                                      self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": 'mysqldb.config',
            "bucket": self.intermediate_bucket,
            "done": done,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'in_index': es_config['old_index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'entity_type': 'paper',
            'test': self.test,
            'routine_id': self.routine_id
        }
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches "
                            "while in test mode.")
            logging.warning(job_params)
            break
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params

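# The 'done' heuristic above treats a batch as already processed when more
# than 30% of its ids are present in the new index (the empirically chosen
# deduplication fraction). A minimal standalone sketch of that check, using
# a hypothetical helper name and made-up ids, might look like this:
def batch_mostly_done(batch, done_ids, threshold=0.3):
    """Return True if the fraction of already-indexed ids exceeds threshold."""
    return sum(_id in done_ids for _id in batch) / len(batch) > threshold

# e.g. batch_mostly_done(['a', 'b', 'c'], done_ids={'a', 'b'}) -> True (2/3 > 0.3)
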
def prepare(self):
    '''Chunk up elasticsearch data, and submit batch jobs over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")

    # Setup elasticsearch and extract all ids
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             drop_and_recreate=False,
                             dataset=self.dataset,
                             increment_version=False)
    ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
    ids = ids - self._done_ids  # Don't repeat done ids

    # Override the default index if specified
    es_config['index'] = (self.index if self.index is not None
                          else es_config['index'])

    # Generate the job params
    job_params = []
    batches = split_batches(ids, self.process_batch_size)
    for count, batch in enumerate(batches, 1):
        done = False  # Already taken care of with _done_ids
        # write batch of ids to s3
        batch_file = ''
        if not done:
            batch_file = put_s3_batch(batch,
                                      self.intermediate_bucket,
                                      self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": self.sql_config_filename,
            "bucket": self.intermediate_bucket,
            "done": done,
            "count": len(ids),
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'test': self.test,
            'routine_id': self.routine_id,
            'entity_type': self.entity_type,
            **self.kwargs
        }
        job_params.append(params)

        # Test mode
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches "
                            "while in test mode.")
            logging.warning(job_params)
            break

    # Done
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params

def prepare(self):
    if self.test:
        self.process_batch_size = 100

    # MySQL setup
    database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    # Subtract off all done ids
    Base.metadata.create_all(engine)
    with db_session(engine) as session:
        result = session.query(Project.rcn).all()
        done_rcn = {r[0] for r in result}

    # Get all possible ids (or "RCN" in Cordis-speak)
    nrows = 1000 if self.test else None
    all_rcn = set(get_framework_ids('fp7', nrows=nrows)
                  + get_framework_ids('h2020', nrows=nrows))
    all_rcn = all_rcn - done_rcn

    # Generate the job params
    batches = split_batches(all_rcn, self.process_batch_size)
    params = [{"batch_file": put_s3_batch(batch,
                                          self.intermediate_bucket,
                                          self.routine_id),
               "config": 'mysqldb.config',
               "db_name": database,
               "bucket": self.intermediate_bucket,
               "outinfo": 'dummy',
               "done": False,
               'test': self.test}
              for batch in batches]
    return params

def prepare(self):
    '''Prepare the batch job parameters'''
    db = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)
    with db_session(engine) as session:
        results = (session
                   .query(Projects.id,
                          func.length(Projects.abstractText))
                   .filter(Projects.abstractText.isnot(None))
                   .distinct(Projects.abstractText)
                   .all())

    # Keep documents with a length larger than the 10th percentile.
    perc = np.percentile([r[1] for r in results], 10)
    all_ids = [r.id for r in results if r[1] >= perc]

    job_params = []
    for count, batch in enumerate(split_batches(all_ids,
                                                self.process_batch_size),
                                  start=1):
        # write batch of ids to s3
        key = f'text2vec-{self.routine_id}-{self.date}-{count}'
        batch_file = put_s3_batch(batch, self.intermediate_bucket, key)
        done = key in DONE_KEYS  # mark as done if the output key already exists
        params = {
            "config": "mysqldb.config",
            "bucket": self.intermediate_bucket,
            "batch_file": batch_file,
            "db_name": db,
            "done": done,
            'outinfo': f"s3://{self.intermediate_bucket}/{key}",
            'test': self.test,
        }
        job_params.append(params)
        logging.info(params)
    return job_params

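# The percentile filter above drops the shortest 10% of abstracts before
# batching. A minimal sketch of the same idea on hypothetical (id, length)
# pairs, using numpy as the method above does:
import numpy as np

def filter_short_texts(id_length_pairs, pctile=10):
    """Keep ids whose text length is at or above the given percentile."""
    cutoff = np.percentile([length for _, length in id_length_pairs], pctile)
    return [_id for _id, length in id_length_pairs if length >= cutoff]

# e.g. filter_short_texts([(1, 5), (2, 200), (3, 350)]) drops id 1 only
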
def prepare(self):
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")

    # MySQL setup
    self.database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, 'mysqldb', self.database)

    # Elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset='crunchbase',
                             aliases='health_scanner')

    # Get set of existing ids from elasticsearch via scroll
    scanner = scan(es, query={"_source": False},
                   index=es_config['index'],
                   doc_type=es_config['type'])
    existing_ids = {s['_id'] for s in scanner}
    logging.info(f"Collected {len(existing_ids)} existing in "
                 "Elasticsearch")

    # Get set of all organisations from mysql
    all_orgs = list(all_org_ids(engine))
    logging.info(f"{len(all_orgs)} organisations in MySQL")

    # Remove previously processed
    orgs_to_process = list(org for org in all_orgs
                           if org not in existing_ids)
    logging.info(f"{len(orgs_to_process)} to be processed")

    job_params = []
    for count, batch in enumerate(
            split_batches(orgs_to_process, self.process_batch_size), 1):
        logging.info(f"Processing batch {count} with size {len(batch)}")
        # write batch of ids to s3
        batch_file = put_s3_batch(batch,
                                  self.intermediate_bucket,
                                  'crunchbase_to_es')
        params = {
            "batch_file": batch_file,
            "config": 'mysqldb.config',
            "db_name": self.database,
            "bucket": self.intermediate_bucket,
            "done": False,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'entity_type': 'company',
            "test": self.test
        }
        logging.info(params)
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in "
                            "test mode.")
            break
    logging.warning("Batch preparation completed, "
                    f"with {len(job_params)} batches")
    return job_params

def prepare(self):
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")

    # MySQL setup
    database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, self.db_section, database)

    # Elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate,
                             dataset=self.dataset,
                             aliases=self.aliases)

    # Get set of existing ids from elasticsearch via scroll
    existing_ids = get_es_ids(es, es_config)
    logging.info(f"Collected {len(existing_ids)} existing in "
                 "Elasticsearch")

    # Get set of all organisations from mysql
    with db_session(engine) as session:
        result = session.query(self.id_field).all()
        all_ids = {r[0] for r in result}
    logging.info(f"{len(all_ids)} organisations in MySQL")

    # Remove previously processed
    ids_to_process = (org for org in all_ids
                      if org not in existing_ids)

    job_params = []
    for count, batch in enumerate(
            split_batches(ids_to_process, self.process_batch_size), 1):
        # write batch of ids to s3
        batch_file = put_s3_batch(batch,
                                  self.intermediate_bucket,
                                  self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": 'mysqldb.config',
            "db_name": database,
            "bucket": self.intermediate_bucket,
            "done": False,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'entity_type': self.entity_type,
            'test': self.test,
            'routine_id': self.routine_id
        }
        params.update(self.kwargs)
        logging.info(params)
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in "
                            "test mode.")
            logging.warning(job_params)
            break
    logging.warning("Batch preparation completed, "
                    f"with {len(job_params)} batches")
    return job_params

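# All of the prepare() methods above rely on split_batches to chunk an
# iterable of ids into batches of at most process_batch_size. The real helper
# lives elsewhere in the codebase; this is only a sketch of the assumed
# behaviour (works for generators such as ids_to_process as well as lists):
def split_batches_sketch(iterable, batch_size):
    """Yield successive lists of at most batch_size items (assumed behaviour)."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# e.g. list(split_batches_sketch(range(5), 2)) -> [[0, 1], [2, 3], [4]]
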