コード例 #1
0
    def requires(self):
        """Yield one Sql2EsTask for each entry in ``targets``.

        Every yielded task receives the same settings (``shared_kwargs``)
        together with whatever ``kwarg_maker`` returns for the dataset name
        and the routine id.
        """
        is_test = not self.production
        set_log_level(True)
        routine_id = f'EURITO-ElasticsearchTask-{self.date}-{is_test}'
        # Settings handed unchanged to every task yielded below.
        shared_kwargs = {
            'date': self.date,
            'process_batch_size': self.process_batch_size,
            'drop_and_recreate': self.drop_and_recreate,
            'job_def': 'py36_amzn1_image',
            'job_name': routine_id,
            'job_queue': 'HighPriority',
            'region_name': 'eu-west-2',
            'poll_time': 10,
            'max_live_jobs': 300,
            'db_config_env': 'MYSQLDB',
            'test': is_test,
            'memory': 2048,
            'intermediate_bucket': S3_BUCKET,
        }

        # (dataset, entity type, id column) triples to process. Only arxiv is
        # enabled; these further triples are currently switched off:
        #   ('crunchbase', 'company', Organization.id)
        #   ('patstat', 'patent', ApplnFamily.docdb_family_id)
        targets = (('arxiv', 'article', Article.id), )
        for dataset, entity_type, id_field in targets:
            print(dataset, entity_type, id_field)
            yield Sql2EsTask(id_field=id_field,
                             entity_type=entity_type,
                             **kwarg_maker(dataset, routine_id),
                             **shared_kwargs)
コード例 #2
0
 def requires(self):
     """Return the AutoMLTask that this task depends on.

     Before the task is built, the root logger is set to INFO and
     ``luigi_logging.set_log_level`` is called with the test flag and
     ``self.verbose``.
     """
     logging.getLogger().setLevel(logging.INFO)
     is_test = not self.production
     luigi_logging.set_log_level(is_test, self.verbose)
     # S3 prefix derived from this task's dataset and date.
     s3_prefix = S3INTER.format(dataset=self.dataset, date=self.date)
     input_kwargs = {'s3_path': self.s3_path_in}
     return AutoMLTask(s3_path_prefix=s3_prefix,
                       task_chain_filepath=CHAIN_PARAMETER_PATH,
                       input_task=S3Task,
                       input_task_kwargs=input_kwargs,
                       final_task='corex_topic_model',
                       test=is_test)
コード例 #3
0
class RootTask(luigi.WrapperTask):
    """Wrapper task whose only requirement is a CordisCollectTask.

    Parameters:
        production: when False (the default) the collect task is created
            with ``test=True``.
        date: date embedded in the routine id; defaults to ``dt.now()``.
    """
    production = luigi.BoolParameter(default=False)
    # NOTE: this default is evaluated once, when the class body is executed.
    date = luigi.DateParameter(default=dt.now())
    # Executed at class-definition time, not once per task instance.
    set_log_level(True)

    def requires(self):
        """Build the CordisCollectTask for this date/production setting."""
        batchable_path = f3p("batchables/cordis/cordis_api")
        env_paths = [f3p(stub) for stub in ("nesta", "config/mysqldb.config")]
        routine_id = f'Cordis-{self.date}-{self.production}'
        # Fixed batch settings passed through to the collect task.
        batch_settings = {
            'job_def': "py36_amzn1_image",
            'job_name': f"Collect-{routine_id}",
            'job_queue': "HighPriority",
            'region_name': "eu-west-2",
            'poll_time': 10,
            'memory': 2048,
            'max_live_jobs': 20,
        }
        return CordisCollectTask(routine_id=routine_id,
                                 test=not self.production,
                                 batchable=batchable_path,
                                 env_files=env_paths,
                                 **batch_settings)
コード例 #4
0
 def requires(self):
     """Return a CordisNeo4jTask; it runs in test mode unless ``production``.

     The same test flag is also passed to ``set_log_level``.
     """
     is_test = not self.production
     set_log_level(is_test)
     upstream = CordisNeo4jTask(test=is_test)
     return upstream
コード例 #5
0
ファイル: run.py プロジェクト: yitzikc/nesta
        for calls, project_calls in split_links(_calls, rcn):
            data['proposal_calls'].append(calls)
            data['project_proposal_calls'].append(project_calls)

    # Pipe the data to the db
    for table_prefix, rows in data.items():
        table_name = f'cordis_{table_prefix}'
        logging.info(table_name)
        _class = get_class_by_tablename(Base, table_name)
        insert_data(db_env,
                    db_section,
                    db_name,
                    Base,
                    _class,
                    rows,
                    low_memory=True)


if __name__ == "__main__":
    set_log_level(True)
    # When BATCHPAR_config is not already set (presumably a local run outside
    # the batch environment -- confirm), fill in hard-coded BATCHPAR_* values
    # before calling run().
    if 'BATCHPAR_config' not in os.environ:
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        os.environ['BATCHPAR_batch_file'] = (
            'Cordis-2020-04-12-True-1586709686976328.json')
        os.environ['BATCHPAR_db_name'] = 'production'
        # os.environ only accepts str values, so no trailing comma after the
        # call: a 1-tuple here would raise TypeError on assignment.
        os.environ["BATCHPAR_config"] = find_filepath_from_pathstub(
            'mysqldb.config')
        os.environ["BATCHPAR_bucket"] = 'nesta-production-intermediate'
    run()
コード例 #6
0
ファイル: run.py プロジェクト: hmessafi/nesta
                ]
            if row['institutes'] in (None, []):
                row['institutes'] = [
                    grid_institutes[g].title() for g in good_institutes
                ]

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type, id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")

    logging.warning("Batch job complete.")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        es, es_config = setup_es('dev', True, True, dataset='arxiv-eu')
        environ = {
            'config': ('/home/ec2-user/nesta-eu/nesta/'
                       'core/config/mysqldb.config'),
            'batch_file': ('arxiv-eu_EURITO-ElasticsearchTask-'
                           '2019-10-12-True-157124660046601.json'),
            'db_name':
            'dev',
            'bucket':
            'nesta-production-intermediate',
            'done':
            "False",
            'outinfo': ('https://search-eurito-dev-'
コード例 #7
0
 def requires(self):
     """Adjust the test flag and log level; no upstream task is returned.

     ``self.test`` is forced to False whenever ``self.production`` is truthy.
     """
     if self.production:
         self.test = False
     set_log_level(True)
コード例 #8
0
ファイル: preprocess_patstat.py プロジェクト: hmessafi/nesta
 def requires(self):
     """Yield a single PreprocessPatstatTask for ``self.date``.

     The task is created in test mode unless ``self.production`` is set.
     """
     set_log_level(True)
     task_kwargs = {'date': self.date, 'test': not self.production}
     yield PreprocessPatstatTask(**task_kwargs)