def requires(self):
    """Yield an elasticsearch ingestion (Sql2Es) task per dataset.

    Runs in test mode unless the ``production`` flag is set. All batch
    configuration shared across datasets lives in ``default_kwargs``.
    """
    test = not self.production
    set_log_level(True)
    routine_id = f'EURITO-ElasticsearchTask-{self.date}-{test}'
    default_kwargs = dict(date=self.date,
                          process_batch_size=self.process_batch_size,
                          drop_and_recreate=self.drop_and_recreate,
                          job_def='py36_amzn1_image',
                          job_name=routine_id,
                          job_queue='HighPriority',
                          region_name='eu-west-2',
                          poll_time=10,
                          max_live_jobs=300,
                          db_config_env='MYSQLDB',
                          test=test,
                          memory=2048,
                          intermediate_bucket=S3_BUCKET)
    # Only arxiv is currently enabled. Previously toggled datasets
    # (crunchbase/Organization, patstat/ApplnFamily) were kept here as
    # commented-out code; re-add tuples to ``params`` to enable them.
    params = (('arxiv', 'article', Article.id),)
    for dataset, entity_type, id_field in params:
        yield Sql2EsTask(id_field=id_field,
                         entity_type=entity_type,
                         **kwarg_maker(dataset, routine_id),
                         **default_kwargs)
def requires(self):
    """Yield the AutoML task chain ending in the CorEx topic model."""
    logging.getLogger().setLevel(logging.INFO)
    test = not self.production
    luigi_logging.set_log_level(test, self.verbose)
    # Resolve the S3 prefix up front rather than inline in the call.
    s3_path_prefix = S3INTER.format(dataset=self.dataset, date=self.date)
    return AutoMLTask(s3_path_prefix=s3_path_prefix,
                      task_chain_filepath=CHAIN_PARAMETER_PATH,
                      input_task=S3Task,
                      input_task_kwargs={'s3_path': self.s3_path_in},
                      final_task='corex_topic_model',
                      test=test)
class RootTask(luigi.WrapperTask):
    """Root wrapper task: collect Cordis data via an AWS batch job."""
    production = luigi.BoolParameter(default=False)
    # NOTE: this default is evaluated once, at import time, not per run.
    date = luigi.DateParameter(default=dt.now())

    def requires(self):
        """Yield the Cordis collection batch task."""
        # Configure logging here rather than in the class body, so that
        # importing this module has no side effects (the original called
        # set_log_level(True) at class-definition time).
        set_log_level(True)
        batchable = f3p("batchables/cordis/cordis_api")
        env_files = [f3p("nesta"), f3p("config/mysqldb.config")]
        routine_id = f'Cordis-{self.date}-{self.production}'
        return CordisCollectTask(routine_id=routine_id,
                                 test=not self.production,
                                 batchable=batchable,
                                 env_files=env_files,
                                 job_def="py36_amzn1_image",
                                 job_name=f"Collect-{routine_id}",
                                 job_queue="HighPriority",
                                 region_name="eu-west-2",
                                 poll_time=10,
                                 memory=2048,
                                 max_live_jobs=20)
def requires(self):
    """Yield the Cordis-to-Neo4j pipeline task."""
    in_test_mode = not self.production
    set_log_level(in_test_mode)
    return CordisNeo4jTask(test=in_test_mode)
for calls, project_calls in split_links(_calls, rcn):
    data['proposal_calls'].append(calls)
    data['project_proposal_calls'].append(project_calls)

# Pipe the data to the db, one table per collected entity type
for table_prefix, rows in data.items():
    table_name = f'cordis_{table_prefix}'
    logging.info(table_name)
    _class = get_class_by_tablename(Base, table_name)
    insert_data(db_env, db_section, db_name, Base, _class,
                rows, low_memory=True)


if __name__ == "__main__":
    set_log_level(True)
    # Local-run fallback: fake the batch environment when not on AWS
    if 'BATCHPAR_config' not in os.environ:
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        os.environ['BATCHPAR_batch_file'] = (
            'Cordis-2020-04-12-True-1586709686976328.json')
        os.environ['BATCHPAR_db_name'] = 'production'
        # BUG FIX: the original line ended with a stray trailing comma,
        # assigning a 1-tuple (a TypeError for os.environ) instead of
        # the config-file path string.
        os.environ["BATCHPAR_config"] = find_filepath_from_pathstub(
            'mysqldb.config')
        os.environ["BATCHPAR_bucket"] = ('nesta-production'
                                         '-intermediate')
    run()
        ]
        # Fall back to GRID institute names when no institutes were
        # resolved for this row (None or empty list).
        if row['institutes'] in (None, []):
            row['institutes'] = [
                grid_institutes[g].title()
                for g in good_institutes
            ]
        # Use the row's own id as the elasticsearch document id
        uid = row.pop('id')
        _row = es.index(index=es_index, doc_type=es_type,
                        id=uid, body=row)
        # Progress heartbeat every 1000 rows
        if not count % 1000:
            logging.info(f"{count} rows loaded to "
                         "elasticsearch")

    logging.warning("Batch job complete.")


if __name__ == "__main__":
    set_log_level()
    # Local-run fallback: fake the batch environment when not on AWS.
    # NOTE(review): this chunk is truncated mid-expression at the
    # 'outinfo' entry; the remainder lies outside this view.
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        es, es_config = setup_es('dev', True, True,
                                 dataset='arxiv-eu')
        environ = {'config': ('/home/ec2-user/nesta-eu/nesta/'
                              'core/config/mysqldb.config'),
                   'batch_file': ('arxiv-eu_EURITO-ElasticsearchTask-'
                                  '2019-10-12-True-157124660046601.json'),
                   'db_name': 'dev',
                   'bucket': 'nesta-production-intermediate',
                   'done': "False",
                   'outinfo': ('https://search-eurito-dev-'
def requires(self):
    """Configure run mode; yields no upstream tasks.

    In production mode, disables test behaviour and raises the log
    level. Intentionally returns None (wrapper with no dependencies).
    """
    if self.production:
        self.test = False
        set_log_level(True)
    # Removed dead trailing `pass` statement (unreachable no-op after
    # the conditional body).
def requires(self):
    """Yield the PATSTAT preprocessing task for this date."""
    set_log_level(True)
    in_test_mode = not self.production
    yield PreprocessPatstatTask(date=self.date, test=in_test_mode)