def main():
    # In order to see errors during extension loading, you can uncomment the next line.
    # logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    configuration = luigi.configuration.get_config()
    if os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Using override.cfg')
        with open(OVERRIDE_CONFIGURATION_FILE, 'r') as override_file:
            log.debug(override_file.read())
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('override.cfg does not exist')

    # Tell luigi what dependencies to pass to the Hadoop nodes
    # - boto is used for all direct interactions with s3.
    # - cjson is used for parsing all event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    # - dependencies of opaque_keys: bson, stevedore
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    luigi.run()
def testUsage(self):
    class MetaTask(luigi.Task):
        task_namespace = "mynamespace"
        a = luigi.TaskParameter()

        def run(self):
            self.__class__.saved_value = self.a

    class OtherTask(luigi.Task):
        task_namespace = "other_namespace"

    self.assertEqual(MetaTask(a=MetaTask).a, MetaTask)
    self.assertEqual(MetaTask(a=OtherTask).a, OtherTask)

    # So I first thought this "should" work, but actually it should not,
    # because it should not need to parse values known at run-time
    self.assertNotEqual(MetaTask(a="mynamespace.MetaTask").a, MetaTask)

    # But it should be able to parse command line arguments
    self.assertRaises(luigi.task_register.TaskClassNotFoundException,
                      lambda: (luigi.run(['--local-scheduler', '--no-lock'] + 'mynamespace.MetaTask --a blah'.split())))
    self.assertRaises(luigi.task_register.TaskClassNotFoundException,
                      lambda: (luigi.run(['--local-scheduler', '--no-lock'] + 'mynamespace.MetaTask --a Taskk'.split())))
    self.assertTrue(luigi.run(['--local-scheduler', '--no-lock'] + 'mynamespace.MetaTask --a mynamespace.MetaTask'.split()))
    self.assertEqual(MetaTask.saved_value, MetaTask)
    self.assertTrue(luigi.run(['--local-scheduler', '--no-lock'] + 'mynamespace.MetaTask --a other_namespace.OtherTask'.split()))
    self.assertEqual(MetaTask.saved_value, OtherTask)
def run(self):
    def abrirExcel(direccionArchivo):
        wb = load_workbook(direccionArchivo)
        return wb.active

    def extraerLibros(direccionArchivo, descriptor):
        libros = list()
        ws = descriptor.rows
        for fila in range(1, len(ws)):
            # Stop at the first row with an empty first cell.
            if not ws[fila][0].value:
                break
            # Build a dict keyed by the header row (row 0).
            tmp = dict()
            for columna in range(len(ws[0])):
                tmp[ws[0][columna].value] = ws[fila][columna].value
            libros.append(tmp)
        logging.info(str(fila) + " records have been extracted from " + direccionArchivo)
        print str(fila) + " records have been extracted from " + direccionArchivo
        return libros

    print "Opening", self.input_dir
    descriptor = abrirExcel(self.input_dir)
    libros = extraerLibros(self.input_dir, descriptor)
    f = self.output().open('w')
    pickle.dump(libros, f)
    f.close()

def output(self):
    return luigi.LocalTarget('libros.pickle')


if __name__ == '__main__':
    luigi.run()
def run(cls, curr_task, date_value, check_output_recursive=True):
    ss = SensorSchedule(curr_task, date_value, check_output_recursive)
    while True:
        # 1. all output arrives
        if ss.is_external_all_complete():
            break
        else:
            cls.current_sleep_seconds += cls.default_wait_seconds
        # 2. time arrives
        if cls.current_sleep_seconds > cls.max_wait_seconds:
            exit()
        time.sleep(cls.default_wait_seconds)

    # A. Original Way
    # Can't turn into a task instance
    luigi.run(main_task_cls=curr_task)  # supports other task parameters, or yesterday (different from the current date_value)

    # B. New Way
    # Actually we do it manually.
    # No matter how many task instances there are, interface.run will only run once, and it exits
    # the current process, so we need to pass all current task instances to `interface.run`.
    # from luigi.interface import ArgParseInterface
    # interface = ArgParseInterface()
    # interface.run(ss.ordered_task_instances_list)  # the worker_scheduler_factory and override_defaults arguments can be ignored.
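# A note on the alternatives discussed in the comments above: recent Luigi releases
# expose luigi.build(), which accepts a list of already-constructed task instances
# and, unlike luigi.run()/interface.run(), does not exit the calling process.
# A minimal sketch, assuming `ss.ordered_task_instances_list` from the snippet
# above holds ready task instances:
import luigi

# build() returns True only if scheduling and all task runs succeeded.
ok = luigi.build(ss.ordered_task_instances_list, local_scheduler=True)
if not ok:
    raise RuntimeError("sensor-scheduled tasks did not all complete")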
def connection_rescalsim_eval():
    connection_count = [0, 1, 2, 5, 10]
    for con in connection_count:
        params = ['RESCALEvaluation'] + base_config() + RESCAL2_DEFAULT_PARAMS + \
                 ['--maxconnections', str(con)] + \
                 ['--outputfolder', output_folder_config() + '/results/connections'] + \
                 ['--tensorfolder', output_folder_config() + '/tensor'] + \
                 ['--connectionslice2']
        luigi.run(params)
def test_picard_alignmentmetrics(self):
    luigi.run(_luigi_args(['--target', sortbam.replace(".bam", ".align_metrics"),
                           '--options', 'REFERENCE_SEQUENCE={}'.format(bwaseqref),
                           '--config-file', localconf]),
              main_task_cls=ratatosk.lib.tools.picard.AlignmentMetrics)
    self.assertTrue(os.path.exists(sortbam.replace(".bam", ".align_metrics")))
    with open(sortbam.replace(".bam", ".align_metrics")) as fh:
        metrics = fh.readlines()
    self.assertTrue(metrics[7].startswith("FIRST_OF_PAIR\t1001"))
    self.assertTrue(metrics[9].startswith("PAIR\t2002"))
def test_fastqc(self):
    luigi.run(_luigi_args(['--target', os.path.basename(fastq1),
                           '--config-file', localconf,
                           '--parent-task', 'ratatosk.lib.files.fastq.FastqFileLink']),
              main_task_cls=ratatosk.lib.tools.fastqc.FastQCJobTask)
    fqc = os.path.join(os.curdir,
                       os.path.basename(fastq1).replace(".fastq.gz", "_fastqc"),
                       os.path.basename(fastq1).replace(".fastq.gz", "_fastqc"),
                       "summary.txt")
    self.assertTrue(os.path.exists(fqc))
    with open(fqc) as fh:
        summary = fh.readlines()
    self.assertEqual('PASS\tBasic Statistics\tP001_101_index3_TGACCA_L001_R1_001.fastq.gz\n', summary[0])
def run_semafor(urls, readability_token, aws_key, aws_secret, s3_path, luigi_scheduler_host=''):
    import luigi
    from semafor.minion import pipeline

    # 0. Set settings
    settings.S3_PATH = s3_path
    settings.AWS_KEY = aws_key
    settings.AWS_SECRET = aws_secret
    settings.READABILITY_TOKEN = readability_token

    # 1. Create input file for pipeline
    fin = os.path.join(settings.DATA_DIR, settings.URLS_FILE)
    fin = open(fin, 'w')
    fin.write('\n'.join(urls))
    fin.close()

    # 2. Run pipeline
    args = ['--local-scheduler']
    if luigi_scheduler_host:
        args = ['--scheduler-host', luigi_scheduler_host]
    luigi.run(main_task_cls=pipeline.CopyToS3, cmdline_args=args)
    return 'OK'
def test_cmdline_logger(self, setup_mock, warn):
    with mock.patch("luigi.interface.core") as env_params:
        env_params.return_value.logging_conf_file = None
        luigi.run(['SomeTask', '--n', '7', '--local-scheduler', '--no-lock'])
        self.assertEqual([mock.call(None)], setup_mock.call_args_list)

    with mock.patch("luigi.configuration.get_config") as getconf:
        def get_boolean_side_effect(section, option, default=None):
            if section == 'worker_history':
                return False
            else:
                return True

        def get_side_effect(section, option, default=luigi.configuration.LuigiConfigParser.NO_DEFAULT):
            if section == 'worker_metadata' and option == 'worker_id':
                return default
            else:
                raise ConfigParser.NoOptionError(section='foo', option='bar')

        getconf.return_value.get.side_effect = get_side_effect
        getconf.return_value.getboolean.side_effect = get_boolean_side_effect

        luigi.interface.setup_interface_logging.call_args_list = []
        luigi.run(['SomeTask', '--n', '42', '--local-scheduler', '--no-lock'])
        self.assertEqual([], setup_mock.call_args_list)
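# The (self, setup_mock, warn) signature implies two stacked mock.patch decorators
# on this test method. A hedged sketch of what they plausibly look like — the
# patch targets are assumptions inferred from the assertions, not taken from the
# source (decorators bind bottom-up, so the lower one supplies `setup_mock`):
@mock.patch("warnings.warn")                            # assumed; bound to `warn`
@mock.patch("luigi.interface.setup_interface_logging")  # assumed; bound to `setup_mock`
def test_cmdline_logger(self, setup_mock, warn):
    ...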
def main():
    # In order to see errors during extension loading, you can uncomment the next line.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    configuration = luigi.configuration.get_config()
    if os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Using %s', OVERRIDE_CONFIGURATION_FILE)
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Configuration file %s does not exist', OVERRIDE_CONFIGURATION_FILE)

    # Tell luigi what dependencies to pass to the Hadoop nodes
    # - boto is used for all direct interactions with s3.
    # - cjson is used for parsing all event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    # - dependencies of opaque_keys: bson, stevedore
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    with profile_if_necessary(os.getenv('WORKFLOW_PROFILER', ''), os.getenv('WORKFLOW_PROFILER_PATH', '')):
        luigi.run()
def test_cutadapt(self):
    luigi.run(_luigi_args(['--target', os.path.basename(fastq1.replace(".fastq.gz", ".trimmed.fastq.gz")),
                           '--config-file', localconf,
                           '--parent-task', 'ratatosk.lib.files.fastq.FastqFileLink']),
              main_task_cls=ratatosk.lib.utils.cutadapt.CutadaptJobTask)
    luigi.run(_luigi_args(['--target', os.path.basename(fastq2.replace(".fastq.gz", ".trimmed.fastq.gz")),
                           '--config-file', localconf,
                           '--parent-task', 'ratatosk.lib.files.fastq.FastqFileLink']),
              main_task_cls=ratatosk.lib.utils.cutadapt.CutadaptJobTask)
    mfile = "P001_101_index3_TGACCA_L001_R1_001.trimmed.fastq.cutadapt_metrics"
    self.assertTrue(os.path.exists(mfile))
    with open(mfile) as fp:
        metrics = fp.readlines()
    self.assertIn("Adapter 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC', length 34, was trimmed 31 times.\n", metrics)
def setUpClass(self):
    """Setup bwa wrappers by linking fastq files from ngs_test_data"""
    for f in [fastq1, fastq2]:
        if not os.path.exists(f):
            os.symlink(os.path.join(indir, os.path.basename(f)), f)
    luigi.run(_luigi_args(['--target', sortbam, '--config-file', localconf]),
              main_task_cls=ratatosk.lib.tools.samtools.SortBam)
    self.bam = sortbam
    luigi.run(_luigi_args(['--target', self.bam.replace(".bam", ".vcf")]),
              main_task_cls=ratatosk.lib.tools.gatk.UnifiedGenotyper)
def test_global_param_dep_cmdline_optparse(self):
    luigi.run(
        ["--local-scheduler", "--task", "HasGlobalParamDep", "--x", "xyz", "--global-param", "124"],
        use_optparse=True,
    )
    h = HasGlobalParam(x="xyz")
    self.assertEquals(h.global_param, 124)
    self.assertEquals(h.global_bool_param, False)
def test_fastqln(self):
    if os.path.exists(fastq1):
        os.unlink(fastq1)
    luigi.run(_luigi_args(['--target', fastq1, '--config-file', localconf, '--use-long-names']),
              main_task_cls=ratatosk.lib.files.fastq.FastqFileLink)
    self.assertTrue(os.path.exists(fastq1))
    with gzip.open(fastq1) as fp:
        h2 = fp.readlines()[0:3]
    self.assertTrue(h2[0].startswith("@SRR"))
    self.assertEqual(h2[2].rstrip(), "+")
def test_cmdline_optparse_existing(self):
    import optparse
    parser = optparse.OptionParser()
    parser.add_option('--blaha')
    luigi.run(['--local-scheduler', '--no-lock', '--task', 'Fib', '--n', '100'],
              use_optparse=True, existing_optparse=parser)
    self.assertEqual(MockFile.fs.get_data('/tmp/fib_10'), '55\n')
    self.assertEqual(MockFile.fs.get_data('/tmp/fib_100'), '354224848179261915075\n')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--additional-config',
                        help='additional configuration file to be loaded after default/override',
                        default=None,
                        action='append')
    arguments, _extra_args = parser.parse_known_args()

    # We get a cleaned command-line arguments list, free of the arguments *we* care about, since Luigi will throw
    # errors when it sees arguments that it or the workflow didn't specify. We pass these in when invoking Luigi.
    cmdline_args = get_cleaned_command_line_args()

    # In order to see errors during extension loading, you can uncomment the next line.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Load the override configuration if it's specified/exists.
    configuration = luigi.configuration.get_config()
    if os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Loading override configuration \'%s\'...', OVERRIDE_CONFIGURATION_FILE)
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Configuration file \'%s\' does not exist!', OVERRIDE_CONFIGURATION_FILE)

    # Load any additional configuration files passed in.
    if arguments.additional_config is not None:
        for additional_config in arguments.additional_config:
            if os.path.exists(additional_config):
                log.debug('Loading additional configuration file \'%s\'...', additional_config)
                configuration.add_config_path(additional_config)
            else:
                log.debug('Configuration file \'%s\' does not exist!', additional_config)

    # Tell luigi what dependencies to pass to the Hadoop nodes
    # - edx.analytics.tasks is used to load the pipeline code, since we cannot trust all will be loaded automatically.
    # - boto is used for all direct interactions with s3.
    # - cjson is used for parsing all event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    # - opaque_keys extensions: ccx_keys
    # - dependencies of opaque_keys: bson, stevedore
    luigi.hadoop.attach(edx.analytics.tasks)
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601, requests)

    if configuration.getboolean('ccx', 'enabled', default=False):
        import ccx_keys
        luigi.hadoop.attach(ccx_keys)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    with profile_if_necessary(os.getenv('WORKFLOW_PROFILER', ''), os.getenv('WORKFLOW_PROFILER_PATH', '')):
        luigi.run(cmdline_args)
def testBoolOverride(self):
    # See #743
    self.assertEqual(hdfs.config.hdfs().snakebite_autoconfig, True)

    class DummyTestTask(luigi.Task):
        pass

    luigi.run(['--local-scheduler', '--no-lock', 'DummyTestTask'])

    self.assertEqual(hdfs.config.hdfs().snakebite_autoconfig, True)
def test_cmdline_logger(self, setup_mock, warn):
    luigi.run(['Task', '--local-scheduler'])
    self.assertEqual([mock.call(None)], setup_mock.call_args_list)

    with mock.patch("luigi.configuration.get_config") as getconf:
        getconf.return_value.get.return_value = None
        getconf.return_value.get_boolean.return_value = True

        luigi.interface.setup_interface_logging.call_args_list = []
        luigi.run(['Task', '--local-scheduler'])
        self.assertEqual([], setup_mock.call_args_list)
def test_just_run_main_task_cls(self):
    class MyTestTask(luigi.Task):
        pass

    class MyOtherTestTask(luigi.Task):
        my_param = luigi.Parameter()

    with patch.object(sys, 'argv', ['my_module.py', '--no-lock', '--local-scheduler']):
        luigi.run(main_task_cls=MyTestTask)

    with patch.object(sys, 'argv', ['my_module.py', '--no-lock', '--my-param', 'my_value', '--local-scheduler']):
        luigi.run(main_task_cls=MyOtherTestTask)
def test_cmdline_logger(self, setup_mock, warn):
    with mock.patch("luigi.interface.core") as env_params:
        env_params.return_value.logging_conf_file = None
        luigi.run(['SomeTask', '--n', '7', '--local-scheduler', '--no-lock'])
        self.assertEqual([mock.call(None)], setup_mock.call_args_list)

    with mock.patch("luigi.configuration.get_config") as getconf:
        getconf.return_value.get.side_effect = ConfigParser.NoOptionError(section='foo', option='bar')
        getconf.return_value.getint.return_value = 0

        luigi.interface.setup_interface_logging.call_args_list = []
        luigi.run(['SomeTask', '--n', '42', '--local-scheduler', '--no-lock'])
        self.assertEqual([], setup_mock.call_args_list)
def test_cmdline_logger(self, setup_mock, warn):
    with mock.patch("luigi.interface.EnvironmentParamsContainer.env_params") as env_params:
        env_params.return_value.logging_conf_file = None
        luigi.run(['Task', '--local-scheduler'])
        self.assertEqual([mock.call(None)], setup_mock.call_args_list)

    with mock.patch("luigi.configuration.get_config") as getconf:
        getconf.return_value.get.side_effect = ConfigParser.NoOptionError(section='foo', option='bar')
        getconf.return_value.get_boolean.return_value = True

        luigi.interface.setup_interface_logging.call_args_list = []
        luigi.run(['Task', '--local-scheduler'])
        self.assertEqual([], setup_mock.call_args_list)
def test_config_update(self):
    """Test updating config with and without disable_parent_task_update"""
    # Main gatk task
    luigi.run(['--config-file', ratatosk_file, '--target', 'mock.fastq.gz', '--dry-run'],
              main_task_cls=ratatosk.lib.files.fastq.FastqFileLink)
    gatkjt = ratatosk.lib.tools.gatk.GATKJobTask()
    self.assertEqual(gatkjt.parent_task, ("ratatosk.lib.tools.gatk.InputBamFile", ))
    cnf.add_config_path("mock.yaml")
    kwargs = gatkjt._update_config(cnf, {})
    self.assertEqual(kwargs['parent_task'], 'another.class')
    kwargs = gatkjt._update_config(cnf, {}, disable_parent_task_update=True)
    self.assertIsNone(kwargs.get('parent_task'))
    cnf.del_config_path("mock.yaml")
    cnf.clear()
def test_misc_1(self):
    class Dogs(luigi.Config):
        n_dogs = luigi.IntParameter()

    class CatsWithoutSection(luigi.ConfigWithoutSection):
        n_cats = luigi.IntParameter()

    luigi.run(['--local-scheduler', '--no-lock', '--n-cats', '123', '--Dogs-n-dogs', '456', 'WithDefault'])
    self.assertEqual(Dogs().n_dogs, 456)
    self.assertEqual(CatsWithoutSection().n_cats, 123)

    luigi.run(['--local-scheduler', '--no-lock', 'WithDefault', '--n-cats', '321', '--Dogs-n-dogs', '654'])
    self.assertEqual(Dogs().n_dogs, 654)
    self.assertEqual(CatsWithoutSection().n_cats, 321)
def intersection_eval():
    params = ['IntersectionEvaluation'] + base_config() + \
             ['--outputfolder', output_folder_config() + '/results/intersection'] + \
             ['--tensorfolder', output_folder_config() + '/tensor']
    luigi.run(params + ['--rank', '500', '--rescalthreshold', '0.01', '--cosinethreshold', '0.5'])
    luigi.run(params + ['--rank', '500', '--rescalthreshold', '0.005', '--cosinethreshold', '0.5'])
    luigi.run(params + ['--rank', '500', '--rescalthreshold', '0.01', '--cosinethreshold', '0.6'])
    luigi.run(params + ['--rank', '500', '--rescalthreshold', '0.005', '--cosinethreshold', '0.6'])
def test_bwaaln_after_trim_resyncmates(self):
    with open("mock.yaml", "w") as fp:
        fp.write(yaml.safe_dump({
            'cutadapt': {'InputFastqFile': {'parent_task': 'ratatosk.lib.files.fastq.FastqFileLink'}},
            'misc': {'ResyncMates': {'parent_task': 'ratatosk.lib.utils.cutadapt.CutadaptJobTask'}},
            'fastq': {'link': {'indir': indir}},
            'bwa': {'bwaref': bwaref,
                    'aln': {'parent_task': 'ratatosk.lib.utils.misc.ResyncMatesJobTask'}}},
            default_flow_style=False))
    luigi.run(_luigi_args(['--target', sai1.replace(".sai", ".trimmed.sync.sai"), '--config-file', "mock.yaml"]),
              main_task_cls=ratatosk.lib.align.bwa.BwaAln)
    luigi.run(_luigi_args(['--target', sai2.replace(".sai", ".trimmed.sync.sai"), '--config-file', "mock.yaml"]),
              main_task_cls=ratatosk.lib.align.bwa.BwaAln)
    os.unlink("mock.yaml")
    self.assertTrue(os.path.exists(sai1.replace(".sai", ".trimmed.sync.sai")))
def test_1_create_cluster(self):
    success = luigi.run(['--local-scheduler', '--no-lock',
                         'CreateDataprocClusterTask',
                         '--gcloud-project-id=' + PROJECT_ID,
                         '--dataproc-cluster-name=' + CLUSTER_NAME])
    self.assertTrue(success)
def test_config_update_main(self):
    """Test updating main subsection"""
    # UnifiedGenotyper
    #
    # Incidentally, this verifies that subsection key value 'no.such.class'
    # overrides section key 'another.class'
    luigi.run(['--config-file', ratatosk_file, '--target', 'mock.bam', '--dry-run'],
              main_task_cls=ratatosk.lib.files.fastq.FastqFileLink)
    ug = ratatosk.lib.tools.gatk.UnifiedGenotyper()
    self.assertEqual(ug.parent_task, "ratatosk.lib.tools.gatk.ClipReads")
    cnf.del_config_path(ratatosk_file)
    cnf.add_config_path("mock.yaml")
    kwargs = ug._update_config(cnf, {})
    self.assertEqual(kwargs.get('parent_task'), 'no.such.class')
    kwargs = ug._update_config(cnf, {}, disable_parent_task_update=True)
    self.assertIsNone(kwargs.get('parent_task'))
    cnf.del_config_path("mock.yaml")
def test_9_delete_cluster_image_version(self):
    success = luigi.run(['--local-scheduler', '--no-lock',
                         'DeleteDataprocClusterTask',
                         '--gcloud-project-id=' + PROJECT_ID,
                         '--dataproc-cluster-name=' + CLUSTER_NAME + '-' + IMAGE_VERSION])
    self.assertTrue(success)
def rank_eval():
    rank_threshold = [(50, [0.001, 0.002, 0.003]),
                      (100, [0.005, 0.006, 0.007]),
                      (250, [0.01, 0.012, 0.015]),
                      (500, [0.012, 0.015, 0.02]),
                      (750, [0.015, 0.02, 0.025]),
                      (1000, [0.02, 0.025, 0.03]),
                      (2000, [0.02, 0.025, 0.03])]
    for rank, thresholds in rank_threshold:
        for threshold in thresholds:
            params = ['RESCALEvaluation'] + base_config() + \
                     ['--outputfolder', output_folder_config() + '/results/rank'] + \
                     ['--rank', str(rank), '--threshold', str(threshold)] + \
                     ['--tensorfolder', output_folder_config() + '/tensor']
            luigi.run(params)
def test_3_submit_minimal_job(self):
    # The job itself will fail because the job files don't exist.
    # We don't care, because then we would be testing Spark.
    # We care that the job was submitted correctly, so that's what we test.
    luigi.run(['--local-scheduler', '--no-lock',
               'DataprocSparkTask',
               '--gcloud-project-id=' + PROJECT_ID,
               '--dataproc-cluster-name=' + CLUSTER_NAME,
               '--main-class=my.MinimalMainClass'])

    response = dataproc.get_dataproc_client().projects().regions().jobs() \
        .list(projectId=PROJECT_ID, region=REGION, clusterName=CLUSTER_NAME).execute()
    lastJob = response['jobs'][0]['sparkJob']

    self.assertEqual(lastJob['mainClass'], "my.MinimalMainClass")
        mt = SeqrGenotypesSchema(mt).annotate_all(overwrite=True).select_annotated_mt()

        mt.describe()
        mt.write(self.output().path, stage_locally=True, overwrite=True)


class SeqrMTToESOptimizedTask(HailElasticSearchTask):
    def __init__(self, *args, **kwargs):
        # TODO: instead of hardcoded index, generate from project_guid, etc.
        super().__init__(*args, **kwargs)

    def requires(self):
        return [SeqrVCFToVariantMTTask(), SeqrVCFToGenotypesMTTask()]

    def run(self):
        variants_mt = hl.read_matrix_table(self.input()[0].path)
        genotypes_mt = hl.read_matrix_table(self.input()[1].path)
        row_ht = genotypes_mt.rows().join(variants_mt.rows())

        row_ht = SeqrVariantsAndGenotypesSchema.elasticsearch_row(row_ht)

        self.export_table_to_elasticsearch(row_ht, self._mt_num_shards(genotypes_mt))
        self.cleanup()


if __name__ == '__main__':
    # If run does not succeed, exit with 1 status code.
    luigi.run() or sys.exit(1)
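# The `luigi.run() or sys.exit(1)` idiom above relies on luigi.run() returning a
# truthy value on success. A sketch of an equivalent using the detailed_summary
# flag available in recent Luigi releases, which yields a LuigiRunResult instead
# of a bare boolean (assumption: a Luigi version new enough to support it):
import sys
import luigi

result = luigi.run(detailed_summary=True)
print(result.one_line_summary)       # human-readable outcome of the run
if not result.scheduling_succeeded:  # same exit-code contract as above
    sys.exit(1)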
def test_bwaaln(self):
    luigi.run(_luigi_args(['--target', read1.replace(".fastq.gz", ".sai"),
                           '--config-file', localconf]),
              main_task_cls=ratatosk.lib.align.bwa.Aln)
        image_download = image_predict_clouds.addBands(cloud_score_percentile.select(["cluster"], ["cloudscore"])) \
            .addBands(ground_truth.select(["b1"], ["fixedmask"])) \
            .addBands(pred_percentile).clip(region_of_interest).toFloat()

        properties = ["system:time_start", 'system:index']
        return image_download, properties


class DownloadAll(luigi.WrapperTask):
    basepath = luigi.Parameter(default="reproducibility_results")
    method = luigi.ChoiceParameter(choices=["percentile", "persistence", "linear", "kernel"],
                                   default="percentile", var_type=str)

    def requires(self):
        locations = get_location_splits()
        tareas = []
        for index_name, v in locations.items():
            for split_name, pol in v.items():
                tarea = DownloadImageResults(image_index=index_name,
                                             basepath=self.basepath,
                                             method=self.method,
                                             split=split_name)
                tareas.append(tarea)
        return tareas


if __name__ == "__main__":
    luigi.run(local_scheduler=True)
def test_sort(self):
    luigi.build([Task2(target="output.sort.txt")])
    luigi.run(['--pipe', '--target', "output.sort.txt"], main_task_cls=Task2)
def test_global_param_cmdline(self):
    luigi.run(['--local-scheduler', '--no-lock', 'HasGlobalParam', '--x', 'xyz', '--global-param', '124'])
    h = HasGlobalParam(x='xyz')
    self.assertEqual(h.global_param, 124)
    self.assertEqual(h.global_bool_param, False)
def test_global_param_shared(self):
    luigi.run(['--local-scheduler', '--no-lock', 'SharedGlobalParamA', '--shared-global-param', 'abc'])
    b = SharedGlobalParamB()
    self.assertEqual(b.shared_global_param, 'abc')
def test_bool_false(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Baz'])
    self.assertEqual(Baz._val, False)
def test_x_arg_y_arg_override(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Banana',
               '--x', 'foo', '--y', 'bar', '--style', 'x-arg-y-arg',
               '--BananaDep-y', 'xyz'])
    self.expect_keys(['banana-foo-bar', 'banana-dep-foo-bar'])
def test_x_arg(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Banana', '--x', 'foo', '--y', 'bar', '--style', 'x-arg'])
    self.expect_keys(['banana-foo-bar', 'banana-dep-foo-def'])
def test_global_param_dep_cmdline_optparse(self):
    luigi.run(['--local-scheduler', '--no-lock', '--task', 'HasGlobalParamDep',
               '--x', 'xyz', '--global-param', '124'],
              use_optparse=True)
    h = HasGlobalParam(x='xyz')
    self.assertEqual(h.global_param, 124)
    self.assertEqual(h.global_bool_param, False)
def run_and_check(self, args):
    run_exit_status = luigi.run(['--local-scheduler', '--no-lock'] + args)
    self.assertTrue(run_exit_status)
    return run_exit_status
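# Hypothetical usage of the run_and_check helper above inside a test case; the
# task name and parameter are illustrative, not taken from the source:
def test_some_task_runs(self):
    self.run_and_check(['SomeTask', '--n', '7'])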
            yield relations, json.dumps(tuple)

    ''' ...................... fill in your code above ........................'''


def parse_query_tree(relations: str, tuple: dict, raquery: radb.ast.Node) -> tuple:
    """
    Parse the combined operations before the current MapReduce job.

    :param raquery: query to execute.
    :param relations: relations on which to execute the query
    :param tuple: the tuple on which to execute the query
    :return: pairs of relations and tuples
    """
    assert isinstance(tuple, dict), f'expecting dict got {type(tuple)}'

    # If the previous operation is a MapReduce job, or this is the first job, return the line as it is.
    if isinstance(raquery, (radb.ast.Project, radb.ast.RelRef, radb.ast.Join)):
        return relations, tuple

    # If it is a select or rename, run the corresponding method and return the
    # result without writing it to a file.
    if isinstance(raquery, radb.ast.Select):
        relations, tuple = parse_query_tree(relations=relations, tuple=tuple, raquery=raquery.inputs[0])
        return select(relations=relations, tuple=tuple, raquery=raquery)

    if isinstance(raquery, radb.ast.Rename):
        relations, tuple = parse_query_tree(relations=relations, tuple=tuple, raquery=raquery.inputs[0])
        return rename(relations=relations, tuple=tuple, raquery=raquery)


if __name__ == '__main__':
    luigi.run()
        self.assertEqual('blah', p.value)

    @with_config({"foo": {"bar": "baz"}})
    def testGlobal(self):
        p = luigi.Parameter(config_path=dict(section="foo", name="bar"), is_global=True, default='blah')
        self.assertEqual('baz', p.value)
        p.set_global('meh')
        self.assertEqual('meh', p.value)

    def testGlobalAndMissing(self):
        p = luigi.Parameter(config_path=dict(section="foo", name="bar"), is_global=True, default='blah')
        self.assertEqual('blah', p.value)
        p.set_global('meh')
        self.assertEqual('meh', p.value)


class OverrideEnvStuff(unittest.TestCase):
    def setUp(self):
        env_params_cls = luigi.interface.EnvironmentParamsContainer
        env_params_cls.scheduler_port.reset_global()

    @with_config({"core": {"default-scheduler-port": '6543'}})
    def testOverrideSchedulerPort(self):
        env_params = luigi.interface.EnvironmentParamsContainer.env_params()
        self.assertEqual(env_params.scheduler_port, 6543)


if __name__ == '__main__':
    luigi.run(use_optparse=True)
def test_multibool(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Bar', '--multibool', 'true', '--multibool', 'false'])
    self.assertEqual(Bar._val, (True, False))
def test_bool_true(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Baz', '--bool'])
    self.assertEqual(Baz._val, True)
with self.output().open("w") as o: for input in inputs: con = sqlite3.connect(input.fn) cur = con.cursor() try: n_particles, t_min, t_max = cur.execute( "SELECT nParticles, tMin, tMax FROM particleSourceMessenger;" ).fetchone() n_eff_cells = np.sum( cur.execute( "SELECT weight FROM `g4sipmDigis-0` WHERE time >= %s AND time < %s;" % (t_min, t_max)).fetchall()) print >> o, n_particles, n_eff_cells except Exception as e: print "Failure in", input.fn print e class All(luigi.WrapperTask): def requires(self): models = [ "../sample/resources/hamamatsu-s13360-6025pe.properties", "../sample/resources/hamamatsu-s13360-6050pe.properties", "../sample/resources/sensl-microfj-60035-tsv.properties" ] return [DynamicRangeSimulation(model=model) for model in models] if __name__ == "__main__": luigi.run(main_task_cls=All)
def test_y_arg_override_banana(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Banana',
               '--y', 'bar', '--style', 'y-kwarg',
               '--BananaDep-x', 'xyz', '--Banana-x', 'baz'])
    self.expect_keys(['banana-baz-bar', 'banana-dep-xyz-bar'])
def test_default_param_cmdline(self):
    luigi.run(['--local-scheduler', '--no-lock', 'WithDefault'])
    self.assertEqual(WithDefault().x, 'xyz')
def test_x_arg_override_stupid(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Banana',
               '--x', 'foo', '--y', 'bar', '--style', 'x-arg',
               '--BananaDep-x', 'blabla'])
    self.expect_keys(['banana-foo-bar', 'banana-dep-foo-def'])
                save_directory_path=os.path.join(self.local_temporary_directory, 'factorization_machine'),
                scope_name='FactorizationMachineExample'),
            output_file_path='criteo/validation.zip')
        return dict(model=validation_task, test_data=test_data_task)

    def output(self):
        return self.make_target('criteo/example_results.txt')

    def run(self):
        tf.reset_default_graph()
        model = self.load('model')  # type: redshells.model.FactorizationMachine
        test_data = self.load_data_frame('test_data')
        y = test_data['label'].copy()
        x = test_data.drop('label', axis=1)
        predict = model.predict(x)
        auc = redshells.model.utils.calculate_auc(y, predict)
        self.dump(f'auc={auc}')


if __name__ == '__main__':
    # Please download the criteo data from https://www.kaggle.com/c/criteo-display-ad-challenge
    # and put train.txt at ./resources/criteo/train.txt.
    luigi.configuration.add_config_path('./config/example.ini')
    luigi.run([
        'examples.FactorizationMachineExample',
        '--text-data-file-path', './resources/criteo/train.txt',
        '--data-size-rate', '0.1',
        '--local-scheduler',
    ])
            output=temp)
        luigi.File(temp).move(self.output().path)

    def output(self):
        return luigi.LocalTarget(path=self.path(ext='ldj.gz'))


class CrossrefIntermediateSchema(Task):
    """
    Convert crossref into an intermediate schema. We use a tool called span,
    but it is really only converting one JSON format into another.
    """

    def requires(self):
        return CrossrefItems()

    def run(self):
        output = shellout("span-import -w 2 -i crossref <(unpigz -c {input}) | pigz -c > {output}",
                          input=self.input().path)
        luigi.File(output).move(self.output().path)

    def output(self):
        return luigi.LocalTarget(path=self.path(ext='ldj.gz'))


if __name__ == '__main__':
    luigi.run(['CrossrefIntermediateSchema', '--workers', '1', '--local-scheduler'])
def test_global_param_dep_cmdline_bool(self):
    luigi.run(['--local-scheduler', '--no-lock', 'HasGlobalParamDep', '--x', 'xyz', '--global-bool-param'])
    h = HasGlobalParam(x='xyz')
    self.assertEqual(h.global_param, 123)
    self.assertEqual(h.global_bool_param, True)
    task_namespace = 'iris_tasks'

    # output_path
    output_path = luigi.Parameter()

    def requires(self):
        yield SplitTrainTest()
        yield Predict()

    def run(self):
        from sklearn import metrics
        data = pd.read_pickle(self.input()[0].path)
        predict = pd.read_pickle(self.input()[1].path)
        ac_score = metrics.accuracy_score(data["test_y"], predict)
        evaluate = {"accuracy": ac_score}
        print(evaluate)
        with open(self.output_path, "wb") as f:
            pickle.dump(evaluate, f)

    def output(self):
        file_name = self.output_path.split("/")[-1]
        output_dir = self.output_path.split("/")[:-1]
        if not os.path.exists(os.path.join(*output_dir)):
            os.makedirs(os.path.join(*output_dir))
        output = os.path.join(*output_dir + [file_name])
        return luigi.LocalTarget(output)


if __name__ == "__main__":
    luigi.run(["iris_tasks.Evaluate", "--workers", "1", "--local-scheduler"])
def test_forgot_param_in_dep(self):
    # A programmatic missing parameter will cause an error email to be sent
    luigi.run(['--local-scheduler', '--no-lock', 'ForgotParamDep'])
    self.assertNotEquals(self.last_email, None)
        :param record:
        :return:
        """
        # Look at HTTP Responses:
        if (record.type == WarcRecord.RESPONSE and
                record.content_type.startswith(b'application/http')):
            # Parse the HTTP Headers, faking a socket wrapper:
            f = hanzo.warcpayload.FileHTTPResponse(record.content_file)
            f.begin()
            hostname = urlparse.urlparse(record.url).hostname
            yield hostname, f.status

    def reducer(self, key, values):
        """
        :param key:
        :param values:
        :return:
        """
        # Sum once per key: the original `for value in values: yield key, sum(values)`
        # consumed the iterator inside the loop and emitted duplicate rows.
        yield key, sum(values)


if __name__ == '__main__':
    luigi.run([
        'GenerateWarcStats',
        '--input-file', 'daily-warcs-test.txt',
        '--local-scheduler'
    ])
from tasks.save_boe_diary_entry import SaveBoeDiaryEntry, SaveOption


class Pipeline(luigi.WrapperTask):
    date = luigi.DateParameter()

    def complete(self):
        return False

    def requires(self):
        return ProcessBoeDiary(date=self.date)

    def run(self):
        with self.input().open('r') as f:
            diary_entries = json.loads(f.read())

        yield (SaveBoeDiaryEntry(entry=entry,
                                 save_options=(SaveOption.DATABASE.value, SaveOption.ELASTICSEARCH.value))
               for entry in diary_entries)


if __name__ == '__main__':
    luigi.run(['Pipeline', '--local-scheduler',
               '--workers', '3',
               '--GlobalParams-base-dir', './temp',
               '--DBParams-host', 'localhost',
               '--DBParams-user', 'root',
               '--DBParams-password', 'pass',
               '--DBParams-database', 'boe',
               '--date', sys.argv[1]])
    def output(self):
        print("Task1: output")
        return luigi.LocalTarget("intermediate/task1_%s.txt" % time.asctime())


class Task2(luigi.Task):
    task_namespace = 'tasks'

    def requires(self):
        print("Task2: requires")
        return Task1()

    def run(self):
        print("Task2: run")
        with self.input().open("r") as intermediate, self.output().open("w") as target:
            task1_text = intermediate.read()
            target.write(task1_text)
            target.write("This file was generated by Task2 at %s." % time.asctime())

    def output(self):
        print("Task2: output")
        return luigi.LocalTarget("output/task2_%s.txt" % time.asctime())


if __name__ == '__main__':
    # luigi.run(['tasks.Task2', '--workers', '1', '--local-scheduler'])
    luigi.run(['tasks.Task2', '--workers', '1'])
def test_bampe(self):
    luigi.run(['--target', "data/sample1.bam", '--config-file', localconf],
              main_task_cls=Bampe)
class ExternalFile(luigi.ExternalTask):
    """
    Represents an external file for the Luigi pipeline
    """
    ext_file = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget(self.ext_file)


class MatrixAnalysis(luigi.WrapperTask):
    """
    WrapperTask to trigger analysis of Domain matrices
    """

    def requires(self):
        apps = defaultdict(set)
        target_folders = [cfg.fake_apks_folder]
        for f in target_folders:
            for root, dirs, files in os.walk(f):
                for basename in files:
                    if basename.endswith('.apk'):
                        pkg = "_".join(basename.split("_")[:-2])
                        vercode = basename.split("_")[-2]
                        date = basename.split("_")[-1].split('.')[0]
                        apps[pkg].add((pkg, vercode, date))
        for pkg, apks in apps.items():
            yield StringoidAnalysis(pkg=pkg)
            yield DnsAnalysis(pkg=pkg)


if __name__ == '__main__':
    luigi.run(main_task_cls=MatrixAnalysis)
def test_multibool_empty(self):
    luigi.run(['--local-scheduler', '--no-lock', 'Bar'])
    self.assertEqual(Bar._val, tuple())