def main():
    """Load task extensions and override configuration, then start Luigi.

    The override configuration file, when present, is dumped to the debug log
    and added to Luigi's configuration before ``luigi.run()`` is invoked.
    """
    # To see errors raised while extensions are loading, enable debug logging:
    # logging.basicConfig(level=logging.DEBUG)

    # Load tasks registered under the 'edx.analytics.tasks' entry point.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    config = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('override.cfg does not exist')
    else:
        log.debug('Using override.cfg')
        with open(OVERRIDE_CONFIGURATION_FILE, 'r') as handle:
            override_contents = handle.read()
        log.debug(override_contents)
        config.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Modules that luigi must pass along to the Hadoop nodes:
    # - boto: all direct interactions with s3
    # - cjson: parsing event logs
    # - filechunkio: multipart uploads of large files to s3
    # - opaque_keys: interpreting serialized course_ids
    #   (bson and stevedore are dependencies of opaque_keys)
    hadoop_dependencies = (boto, cjson, filechunkio, opaque_keys, bson, stevedore)
    luigi.hadoop.attach(*hadoop_dependencies)

    # TODO: setup logging for tasks or configured logging mechanism

    # Hand control to Luigi's default builder.
    luigi.run()
Exemple #2
0
    def testUsage(self):
        # Exercises luigi.TaskParameter both with task classes passed directly
        # and with task names parsed from the command line.

        class MetaTask(luigi.Task):
            task_namespace = "mynamespace"
            a = luigi.TaskParameter()

            def run(self):
                # Store the received parameter on the class so the test can
                # inspect it after luigi.run() returns.
                task_cls = self.__class__
                task_cls.saved_value = self.a

        class OtherTask(luigi.Task):
            task_namespace = "other_namespace"

        for task_cls in (MetaTask, OtherTask):
            self.assertEqual(MetaTask(a=task_cls).a, task_cls)

        # A string given at construction time is not parsed into a task class:
        # values known at run-time should not need parsing.
        self.assertNotEqual(MetaTask(a="mynamespace.MetaTask").a, MetaTask)

        # Command line arguments, on the other hand, must be parsed.
        scheduler_flags = ['--local-scheduler', '--no-lock']

        for unknown_name in ('blah', 'Taskk'):
            with self.assertRaises(luigi.task_register.TaskClassNotFoundException):
                luigi.run(scheduler_flags + ['mynamespace.MetaTask', '--a', unknown_name])

        known_tasks = (
            ('mynamespace.MetaTask', MetaTask),
            ('other_namespace.OtherTask', OtherTask),
        )
        for task_name, expected_cls in known_tasks:
            self.assertTrue(luigi.run(scheduler_flags + ['mynamespace.MetaTask', '--a', task_name]))
            self.assertEqual(MetaTask.saved_value, expected_cls)
	def run(self):
		"""Extract the rows of the workbook at ``self.input_dir`` and pickle them.

		The first worksheet row supplies the dictionary keys; each following
		row becomes one dictionary until a row with an empty first cell is
		found. The resulting list is pickled to ``self.output()``.
		"""

		def abrirExcel(direccionArchivo):
			"""Open the workbook with ``load_workbook`` and return its active sheet."""
			wb = load_workbook(direccionArchivo)
			return wb.active

		def extraerLibros(direccionArchivo, descriptor):
			"""Return one dict per data row of ``descriptor``, keyed by the header row.

			``direccionArchivo`` is only used in the log message.
			"""
			# Materialize the rows so they can be sliced even when ``rows`` is
			# a lazy iterable rather than a sequence.
			ws = tuple(descriptor.rows)
			libros = []
			if ws:
				encabezados = [celda.value for celda in ws[0]]
				for fila in ws[1:]:
					# A row whose first cell is empty marks the end of the data.
					if not fila[0].value:
						break
					libros.append(dict(zip(encabezados, (celda.value for celda in fila))))
			# Report the number of records actually extracted; the loop index
			# is off by one after a break and undefined when there are no rows.
			mensaje = "se han extraido "+str(len(libros))+" registros de "+direccionArchivo
			logging.info(mensaje)
			print(mensaje)
			return libros

		print("Abriendo %s" % (self.input_dir,))

		descriptor = abrirExcel(self.input_dir)
		libros = extraerLibros(self.input_dir, descriptor)

		# NOTE(review): the target is opened in text mode ('w'); under Python 3
		# pickle.dump needs a binary stream - confirm the target's format.
		with self.output().open('w') as f:
			pickle.dump(libros, f)

	def output(self):
		"""Return the local target ``libros.pickle`` that ``run`` writes to."""
		return luigi.LocalTarget('libros.pickle')

# When executed as a script, invoke luigi.run() with no explicit arguments.
if __name__ == '__main__':
    luigi.run()     	        
Exemple #4
0
    def run(cls, curr_task, date_value, check_output_recursive=True):
        """Wait until the external outputs are complete, then launch the task.

        ``SensorSchedule.is_external_all_complete()`` is polled in a loop. Each
        unsuccessful poll adds ``cls.default_wait_seconds`` to
        ``cls.current_sleep_seconds``; once that total exceeds
        ``cls.max_wait_seconds`` the process exits, otherwise the loop sleeps
        ``cls.default_wait_seconds`` and polls again.

        :param curr_task: passed to ``SensorSchedule`` and to ``luigi.run`` as
            ``main_task_cls``.
        :param date_value: passed to ``SensorSchedule``.
        :param check_output_recursive: passed to ``SensorSchedule``.
        :raises SystemExit: when the accumulated wait exceeds the maximum.
        """
        # NOTE(review): presumably decorated as a classmethod outside this view.
        # Local import: the bare ``exit()`` helper is injected by the ``site``
        # module for interactive use and is missing under ``python -S``.
        import sys

        ss = SensorSchedule(curr_task, date_value, check_output_recursive)

        # 1. Loop until all output has arrived ...
        while not ss.is_external_all_complete():
            cls.current_sleep_seconds += cls.default_wait_seconds
            # 2. ... or until the waiting time is used up.
            if cls.current_sleep_seconds > cls.max_wait_seconds:
                sys.exit()
            time.sleep(cls.default_wait_seconds)

        # A. Original Way
        # Can't turn into a task instance
        luigi.run(main_task_cls=curr_task)  # support other task parameter, or yesterday (diff from current date_value)

        # B. New Way
        # Actually we do it manually.
        #   No matter how many task instances there are, interface.run will only run once and then exits the current process.
        #   So we need to pass all current task instances to `interface.run` .
        # from luigi.interface import ArgParseInterface
        # interface = ArgParseInterface()
        # interface.run(ss.ordered_task_instances_list)  # , worker_scheduler_factory, override_defaults=override_defaults) can be ignored.
        """
def connection_rescalsim_eval():
    """Run ``RESCALEvaluation`` once per maximum-connection setting (0, 1, 2, 5, 10)."""
    for max_connections in (0, 1, 2, 5, 10):
        # Assemble the argument list piece by piece, in command-line order.
        params = ['RESCALEvaluation'] + base_config() + RESCAL2_DEFAULT_PARAMS
        params += ['--maxconnections', str(max_connections)]
        params += ['--outputfolder', output_folder_config() + '/results/connections']
        params += ['--tensorfolder', output_folder_config() + '/tensor']
        params += ['--connectionslice2']
        luigi.run(params)
 def test_picard_alignmentmetrics(self):
     # Run the AlignmentMetrics task for the sorted BAM and check two lines of
     # the metrics file it is expected to produce.
     metrics_path = sortbam.replace(".bam", ".align_metrics")
     cmdline = _luigi_args([
         '--target', metrics_path,
         '--options', 'REFERENCE_SEQUENCE={}'.format(bwaseqref),
         '--config-file', localconf,
     ])
     luigi.run(cmdline, main_task_cls=ratatosk.lib.tools.picard.AlignmentMetrics)
     self.assertTrue(os.path.exists(metrics_path))
     with open(metrics_path) as fh:
         metrics = fh.readlines()
     self.assertTrue(metrics[7].startswith("FIRST_OF_PAIR\t1001"))
     self.assertTrue(metrics[9].startswith("PAIR\t2002"))
 def test_fastqc(self):
     # Run the FastQC job task on the first fastq file and check the first
     # line of the summary it is expected to write.
     fastq_name = os.path.basename(fastq1)
     cmdline = _luigi_args([
         '--target', fastq_name,
         '--config-file', localconf,
         '--parent-task', 'ratatosk.lib.files.fastq.FastqFileLink',
     ])
     luigi.run(cmdline, main_task_cls=ratatosk.lib.tools.fastqc.FastQCJobTask)
     # The summary path repeats the "_fastqc" directory name twice.
     report_dir = fastq_name.replace(".fastq.gz", "_fastqc")
     fqc = os.path.join(os.curdir, report_dir, report_dir, "summary.txt")
     self.assertTrue(os.path.exists(fqc))
     with open(fqc) as fh:
         summary = fh.readlines()
     self.assertEqual('PASS\tBasic Statistics\tP001_101_index3_TGACCA_L001_R1_001.fastq.gz\n', summary[0])
Exemple #8
0
def run_semafor(urls, readability_token, aws_key, aws_secret, s3_path,
                luigi_scheduler_host=''):
    """Configure the semafor settings, write the URL list and run the pipeline.

    :param urls: iterable of URL strings, written one per line to the file
        ``settings.URLS_FILE`` inside ``settings.DATA_DIR``.
    :param readability_token: stored in ``settings.READABILITY_TOKEN``.
    :param aws_key: stored in ``settings.AWS_KEY``.
    :param aws_secret: stored in ``settings.AWS_SECRET``.
    :param s3_path: stored in ``settings.S3_PATH``.
    :param luigi_scheduler_host: when non-empty, luigi is pointed at this
        scheduler host instead of using the local scheduler.
    :return: the string ``'OK'``.
    """
    import luigi
    from semafor.minion import pipeline

    # 0. Set settings
    settings.S3_PATH = s3_path
    settings.AWS_KEY = aws_key
    settings.AWS_SECRET = aws_secret
    settings.READABILITY_TOKEN = readability_token

    # 1. Create input file for pipeline; the context manager closes the file
    # even if the write fails.
    urls_path = os.path.join(settings.DATA_DIR, settings.URLS_FILE)
    with open(urls_path, 'w') as urls_file:
        urls_file.write('\n'.join(urls))

    # 2. Run pipeline
    if luigi_scheduler_host:
        args = ['--scheduler-host', luigi_scheduler_host]
    else:
        args = ['--local-scheduler']

    # NOTE(review): the result of luigi.run is discarded, so 'OK' is returned
    # whatever the pipeline outcome - confirm this is intended.
    luigi.run(main_task_cls=pipeline.CopyToS3, cmdline_args=args)

    return 'OK'
Exemple #9
0
    def test_cmdline_logger(self, setup_mock, warn):
        # First run: with logging_conf_file set to None on the patched core
        # config, setup_mock must have been called exactly once with None.
        with mock.patch("luigi.interface.core") as env_params:
            env_params.return_value.logging_conf_file = None
            luigi.run(['SomeTask', '--n', '7', '--local-scheduler', '--no-lock'])
            self.assertEqual([mock.call(None)], setup_mock.call_args_list)

        # Second run: with get_config patched, setup_mock must not be called.
        with mock.patch("luigi.configuration.get_config") as getconf:
            def get_boolean_side_effect(section, option, default=None):
                # Every boolean option is True except those in 'worker_history'.
                return section != 'worker_history'

            def get_side_effect(section, option, default=luigi.configuration.LuigiConfigParser.NO_DEFAULT):
                # Only worker_metadata/worker_id resolves (to its default);
                # every other lookup raises NoOptionError.
                if not (section == 'worker_metadata' and option == 'worker_id'):
                    raise ConfigParser.NoOptionError(section='foo', option='bar')
                return default

            fake_config = getconf.return_value
            fake_config.get.side_effect = get_side_effect
            fake_config.getboolean.side_effect = get_boolean_side_effect

            # Forget the calls recorded during the first run.
            luigi.interface.setup_interface_logging.call_args_list = []
            luigi.run(['SomeTask', '--n', '42', '--local-scheduler', '--no-lock'])
            self.assertEqual([], setup_mock.call_args_list)
def main():
    """Load task extensions and override configuration, then run Luigi.

    ``luigi.run()`` is executed inside ``profile_if_necessary``, which receives
    the ``WORKFLOW_PROFILER`` and ``WORKFLOW_PROFILER_PATH`` environment
    variables (empty strings when unset).
    """
    # Debug-level logging makes errors raised during extension loading visible.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks registered under the 'edx.analytics.tasks' entry point.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    config = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Configuration file %s does not exist', OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Using %s', OVERRIDE_CONFIGURATION_FILE)
        config.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Modules that luigi must pass along to the Hadoop nodes:
    # - boto: all direct interactions with s3
    # - cjson: parsing event logs
    # - filechunkio: multipart uploads of large files to s3
    # - opaque_keys: interpreting serialized course_ids
    #   (bson and stevedore are dependencies of opaque_keys)
    hadoop_dependencies = (boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601)
    luigi.hadoop.attach(*hadoop_dependencies)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder.
    profiler = os.getenv('WORKFLOW_PROFILER', '')
    profiler_path = os.getenv('WORKFLOW_PROFILER_PATH', '')
    with profile_if_necessary(profiler, profiler_path):
        luigi.run()
 def test_cutadapt(self):
     # Run the cutadapt job task for both fastq files, then check the metrics
     # file written for the first read.
     for fastq in (fastq1, fastq2):
         trimmed_name = os.path.basename(fastq.replace(".fastq.gz", ".trimmed.fastq.gz"))
         cmdline = _luigi_args([
             '--target', trimmed_name,
             '--config-file', localconf,
             '--parent-task', 'ratatosk.lib.files.fastq.FastqFileLink',
         ])
         luigi.run(cmdline, main_task_cls=ratatosk.lib.utils.cutadapt.CutadaptJobTask)
     mfile = "P001_101_index3_TGACCA_L001_R1_001.trimmed.fastq.cutadapt_metrics"
     self.assertTrue(os.path.exists(mfile))
     with open(mfile) as fp:
         metrics = fp.readlines()
     self.assertIn("Adapter 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC', length 34, was trimmed 31 times.\n", metrics)
 def setUpClass(self):
     """Symlink missing fastq files from ``indir``, then run SortBam and UnifiedGenotyper."""
     for fastq in (fastq1, fastq2):
         if os.path.exists(fastq):
             continue
         os.symlink(os.path.join(indir, os.path.basename(fastq)), fastq)
     sort_args = _luigi_args(['--target', sortbam, '--config-file', localconf])
     luigi.run(sort_args, main_task_cls=ratatosk.lib.tools.samtools.SortBam)
     self.bam = sortbam
     # The VCF target is named after the sorted BAM.
     vcf_args = _luigi_args(['--target', self.bam.replace(".bam", ".vcf")])
     luigi.run(vcf_args, main_task_cls=ratatosk.lib.tools.gatk.UnifiedGenotyper)
Exemple #13
0
 def test_global_param_dep_cmdline_optparse(self):
     # Run HasGlobalParamDep through the optparse interface, then check the
     # parameter values seen by a fresh HasGlobalParam instance.
     luigi.run(
         ["--local-scheduler", "--task", "HasGlobalParamDep", "--x", "xyz", "--global-param", "124"],
         use_optparse=True,
     )
     h = HasGlobalParam(x="xyz")
     # assertEqual is used instead of the deprecated assertEquals alias.
     self.assertEqual(h.global_param, 124)
     self.assertEqual(h.global_bool_param, False)
 def test_fastqln(self):
     # Remove any existing file, let FastqFileLink recreate it, then check the
     # first lines of the gzip-compressed result.
     if os.path.exists(fastq1):
         os.unlink(fastq1)
     cmdline = _luigi_args(['--target', fastq1, '--config-file', localconf, '--use-long-names'])
     luigi.run(cmdline, main_task_cls=ratatosk.lib.files.fastq.FastqFileLink)
     self.assertTrue(os.path.exists(fastq1))
     with gzip.open(fastq1) as fp:
         first_lines = fp.readlines()[:3]
     self.assertTrue(first_lines[0].startswith("@SRR"))
     self.assertEqual(first_lines[2].rstrip(), "+")
Exemple #15
0
    def test_cmdline_optparse_existing(self):
        # Run Fib through optparse while handing luigi a pre-existing parser
        # that already defines an unrelated option.
        import optparse

        existing_parser = optparse.OptionParser()
        existing_parser.add_option('--blaha')

        argv = ['--local-scheduler', '--no-lock', '--task', 'Fib', '--n', '100']
        luigi.run(argv, use_optparse=True, existing_optparse=existing_parser)

        expected_outputs = (
            ('/tmp/fib_10', '55\n'),
            ('/tmp/fib_100', '354224848179261915075\n'),
        )
        for path, contents in expected_outputs:
            self.assertEqual(MockFile.fs.get_data(path), contents)
def main():
    """Load extensions and configuration files, then run Luigi with cleaned arguments.

    Configuration is layered in this order: Luigi's own configuration, the
    override file (if it exists), then every ``--additional-config`` file that
    exists, in the order given on the command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--additional-config',
        help='additional configuration file to be loaded after default/override',
        default=None,
        action='append',
    )
    # Unknown arguments are tolerated here; they are meant for Luigi.
    arguments = parser.parse_known_args()[0]

    # Luigi throws errors for arguments that neither it nor the workflow
    # declares, so it receives a command line with *our* arguments removed.
    cmdline_args = get_cleaned_command_line_args()

    # Debug-level logging makes errors raised during extension loading visible.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks registered under the 'edx.analytics.tasks' entry point.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Load the override configuration if it exists.
    config = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Configuration file \'%s\' does not exist!', OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Loading override configuration \'%s\'...', OVERRIDE_CONFIGURATION_FILE)
        config.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Load any additional configuration files passed in.
    for extra_path in arguments.additional_config or []:
        if not os.path.exists(extra_path):
            log.debug('Configuration file \'%s\' does not exist!', extra_path)
            continue
        log.debug('Loading additional configuration file \'%s\'...', extra_path)
        config.add_config_path(extra_path)

    # Modules that luigi must pass along to the Hadoop nodes:
    # - edx.analytics.tasks: the pipeline code itself, since we cannot trust
    #   that all of it will be loaded automatically
    # - boto: all direct interactions with s3
    # - cjson: parsing event logs
    # - filechunkio: multipart uploads of large files to s3
    # - opaque_keys: interpreting serialized course_ids
    #   (ccx_keys is an opaque_keys extension; bson and stevedore are its dependencies)
    luigi.hadoop.attach(edx.analytics.tasks)
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601, requests)

    # ccx_keys is only imported and attached when the 'ccx' section enables it.
    if config.getboolean('ccx', 'enabled', default=False):
        import ccx_keys
        luigi.hadoop.attach(ccx_keys)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder.
    profiler = os.getenv('WORKFLOW_PROFILER', '')
    profiler_path = os.getenv('WORKFLOW_PROFILER_PATH', '')
    with profile_if_necessary(profiler, profiler_path):
        luigi.run(cmdline_args)
Exemple #17
0
    def testBoolOverride(self):
        # See #743: snakebite_autoconfig must read True both before and after
        # an unrelated task is run through luigi.run.
        autoconfig_before = hdfs.config.hdfs().snakebite_autoconfig
        self.assertEqual(autoconfig_before, True)

        class DummyTestTask(luigi.Task):
            pass

        luigi.run(['--local-scheduler', '--no-lock', 'DummyTestTask'])

        autoconfig_after = hdfs.config.hdfs().snakebite_autoconfig
        self.assertEqual(autoconfig_after, True)
Exemple #18
0
    def test_cmdline_logger(self, setup_mock, warn):
        # First run, unpatched configuration: setup_mock must have been called
        # exactly once with None.
        argv = ('Task', '--local-scheduler')
        luigi.run(list(argv))
        self.assertEqual([mock.call(None)], setup_mock.call_args_list)

        # Second run, get_config patched: setup_mock must not be called.
        with mock.patch("luigi.configuration.get_config") as getconf:
            fake_config = getconf.return_value
            fake_config.get.return_value = None
            fake_config.get_boolean.return_value = True

            # Forget the calls recorded during the first run.
            luigi.interface.setup_interface_logging.call_args_list = []
            luigi.run(list(argv))
            self.assertEqual([], setup_mock.call_args_list)
Exemple #19
0
    def test_just_run_main_task_cls(self):
        # luigi.run is given only main_task_cls; everything else comes from a
        # patched sys.argv.
        class MyTestTask(luigi.Task):
            pass

        class MyOtherTestTask(luigi.Task):
            my_param = luigi.Parameter()

        plain_argv = ['my_module.py', '--no-lock', '--local-scheduler']
        with patch.object(sys, 'argv', plain_argv):
            luigi.run(main_task_cls=MyTestTask)

        param_argv = ['my_module.py', '--no-lock', '--my-param', 'my_value', '--local-scheduler']
        with patch.object(sys, 'argv', param_argv):
            luigi.run(main_task_cls=MyOtherTestTask)
Exemple #20
0
    def test_cmdline_logger(self, setup_mock, warn):
        # First run: with logging_conf_file set to None on the patched core
        # config, setup_mock must have been called exactly once with None.
        with mock.patch("luigi.interface.core") as env_params:
            env_params.return_value.logging_conf_file = None
            luigi.run(['SomeTask', '--n', '7', '--local-scheduler', '--no-lock'])
            self.assertEqual([mock.call(None)], setup_mock.call_args_list)

        # Second run: every get() raises NoOptionError and getint() returns 0;
        # setup_mock must not be called.
        with mock.patch("luigi.configuration.get_config") as getconf:
            fake_config = getconf.return_value
            fake_config.get.side_effect = ConfigParser.NoOptionError(section='foo', option='bar')
            fake_config.getint.return_value = 0

            # Forget the calls recorded during the first run.
            luigi.interface.setup_interface_logging.call_args_list = []
            luigi.run(['SomeTask', '--n', '42', '--local-scheduler', '--no-lock'])
            self.assertEqual([], setup_mock.call_args_list)
Exemple #21
0
    def test_cmdline_logger(self, setup_mock, warn):
        # First run: with logging_conf_file set to None on the patched
        # env_params, setup_mock must have been called exactly once with None.
        argv = ('Task', '--local-scheduler')
        with mock.patch("luigi.interface.EnvironmentParamsContainer.env_params") as env_params:
            env_params.return_value.logging_conf_file = None
            luigi.run(list(argv))
            self.assertEqual([mock.call(None)], setup_mock.call_args_list)

        # Second run: every get() raises NoOptionError; setup_mock must not be
        # called.
        with mock.patch("luigi.configuration.get_config") as getconf:
            fake_config = getconf.return_value
            fake_config.get.side_effect = ConfigParser.NoOptionError(section='foo', option='bar')
            fake_config.get_boolean.return_value = True

            # Forget the calls recorded during the first run.
            luigi.interface.setup_interface_logging.call_args_list = []
            luigi.run(list(argv))
            self.assertEqual([], setup_mock.call_args_list)
Exemple #22
0
 def test_config_update(self):
     """Check ``_update_config`` with and without ``disable_parent_task_update``."""
     # Main gatk task
     argv = ['--config-file', ratatosk_file, '--target', 'mock.fastq.gz', '--dry-run']
     luigi.run(argv, main_task_cls=ratatosk.lib.files.fastq.FastqFileLink)
     gatkjt = ratatosk.lib.tools.gatk.GATKJobTask()
     self.assertEqual(gatkjt.parent_task, ("ratatosk.lib.tools.gatk.InputBamFile", ))

     cnf.add_config_path("mock.yaml")
     # With updates enabled the parent task comes from mock.yaml ...
     updated = gatkjt._update_config(cnf, {})
     self.assertEqual(updated['parent_task'], 'another.class')
     # ... and with updates disabled no parent task is reported.
     untouched = gatkjt._update_config(cnf, {}, disable_parent_task_update=True)
     self.assertIsNone(untouched.get('parent_task'))
     cnf.del_config_path("mock.yaml")
     cnf.clear()
Exemple #23
0
    def test_misc_1(self):
        # Config-class parameters given on the command line must be visible on
        # fresh instances, whether they come before or after the task name.
        class Dogs(luigi.Config):
            n_dogs = luigi.IntParameter()

        class CatsWithoutSection(luigi.ConfigWithoutSection):
            n_cats = luigi.IntParameter()

        scheduler_flags = ['--local-scheduler', '--no-lock']

        # Options before the task name.
        luigi.run(scheduler_flags + ['--n-cats', '123', '--Dogs-n-dogs', '456', 'WithDefault'])
        self.assertEqual(Dogs().n_dogs, 456)
        self.assertEqual(CatsWithoutSection().n_cats, 123)

        # Options after the task name.
        luigi.run(scheduler_flags + ['WithDefault', '--n-cats', '321', '--Dogs-n-dogs', '654'])
        self.assertEqual(Dogs().n_dogs, 654)
        self.assertEqual(CatsWithoutSection().n_cats, 321)
def intersection_eval():
    """Run ``IntersectionEvaluation`` at rank 500 for four threshold combinations."""
    params = ['IntersectionEvaluation'] + base_config() + \
                 ['--outputfolder', output_folder_config() + '/results/intersection'] + \
                 ['--tensorfolder', output_folder_config() + '/tensor']
    # (rescal threshold, cosine threshold) pairs, in execution order.
    threshold_pairs = (
        ('0.01', '0.5'),
        ('0.005', '0.5'),
        ('0.01', '0.6'),
        ('0.005', '0.6'),
    )
    for rescal_threshold, cosine_threshold in threshold_pairs:
        luigi.run(params + ['--rank',  '500',
                            '--rescalthreshold', rescal_threshold,
                            '--cosinethreshold', cosine_threshold])
 def test_bwaaln_after_trim_resyncmates(self):
     # Write a temporary mock.yaml that chains the parent tasks, run BwaAln
     # for both reads against it, then check the output for the first read.
     mock_config = {
         'cutadapt': {'InputFastqFile': {'parent_task': 'ratatosk.lib.files.fastq.FastqFileLink'}},
         'misc': {'ResyncMates': {'parent_task': 'ratatosk.lib.utils.cutadapt.CutadaptJobTask'}},
         'fastq': {'link': {'indir': indir}},
         'bwa': {
             'bwaref': bwaref,
             'aln': {'parent_task': 'ratatosk.lib.utils.misc.ResyncMatesJobTask'},
         },
     }
     with open("mock.yaml", "w") as fp:
         fp.write(yaml.safe_dump(mock_config, default_flow_style=False))
     for sai in (sai1, sai2):
         target = sai.replace(".sai", ".trimmed.sync.sai")
         cmdline = _luigi_args(['--target', target, '--config-file', "mock.yaml"])
         luigi.run(cmdline, main_task_cls=ratatosk.lib.align.bwa.BwaAln)
     os.unlink("mock.yaml")
     self.assertTrue(os.path.exists(sai1.replace(".sai", ".trimmed.sync.sai")))
Exemple #26
0
 def test_1_create_cluster(self):
     # Run CreateDataprocClusterTask for the test project and cluster and
     # require a truthy result from luigi.run.
     argv = [
         '--local-scheduler',
         '--no-lock',
         'CreateDataprocClusterTask',
         '--gcloud-project-id=' + PROJECT_ID,
         '--dataproc-cluster-name=' + CLUSTER_NAME,
     ]
     self.assertTrue(luigi.run(argv))
Exemple #27
0
 def test_config_update_main(self):
     """Check ``_update_config`` on the main subsection, using UnifiedGenotyper."""
     # Incidentally, this verifies that subsection key value 'no.such.class'
     # overrides section key 'another.class'
     argv = ['--config-file', ratatosk_file, '--target', 'mock.bam', '--dry-run']
     luigi.run(argv, main_task_cls=ratatosk.lib.files.fastq.FastqFileLink)
     ug = ratatosk.lib.tools.gatk.UnifiedGenotyper()
     self.assertEqual(ug.parent_task, "ratatosk.lib.tools.gatk.ClipReads")

     # Swap the configuration source from ratatosk_file to mock.yaml.
     cnf.del_config_path(ratatosk_file)
     cnf.add_config_path("mock.yaml")
     updated = ug._update_config(cnf, {})
     self.assertEqual(updated.get('parent_task'), 'no.such.class')
     untouched = ug._update_config(cnf, {}, disable_parent_task_update=True)
     self.assertIsNone(untouched.get('parent_task'))
     cnf.del_config_path("mock.yaml")
Exemple #28
0
 def test_9_delete_cluster_image_version(self):
     # Run DeleteDataprocClusterTask for the cluster whose name carries the
     # image-version suffix and require a truthy result from luigi.run.
     versioned_cluster = CLUSTER_NAME + '-' + IMAGE_VERSION
     argv = [
         '--local-scheduler',
         '--no-lock',
         'DeleteDataprocClusterTask',
         '--gcloud-project-id=' + PROJECT_ID,
         '--dataproc-cluster-name=' + versioned_cluster,
     ]
     self.assertTrue(luigi.run(argv))
def rank_eval():
    """Run ``RESCALEvaluation`` for every (rank, threshold) combination below.

    Each rank is paired with its own list of three thresholds; one
    ``luigi.run`` call is made per combination, in the listed order.
    """
    rank_threshold = [(50, [0.001, 0.002, 0.003]),
                      (100, [0.005, 0.006, 0.007]),
                      (250, [0.01, 0.012, 0.015]),
                      (500, [0.012, 0.015, 0.02]),
                      (750, [0.015, 0.02, 0.025]),
                      (1000, [0.02, 0.025, 0.03]),
                      (2000, [0.02, 0.025, 0.03])]
    # Unpack each pair directly instead of binding it to a name that shadows
    # the builtin ``tuple``.
    for rank, thresholds in rank_threshold:
        for threshold in thresholds:
            params = ['RESCALEvaluation'] + base_config() + \
                     ['--outputfolder', output_folder_config() + '/results/rank'] + \
                     ['--rank', str(rank), '--threshold', str(threshold)] + \
                     ['--tensorfolder', output_folder_config() + '/tensor']
            luigi.run(params)
Exemple #30
0
    def test_3_submit_minimal_job(self):
        # The job itself will fail because the job files don't exist.
        # That is fine: testing the job would mean testing spark. What is
        # checked here is that the job was submitted correctly.
        argv = [
            '--local-scheduler',
            '--no-lock',
            'DataprocSparkTask',
            '--gcloud-project-id=' + PROJECT_ID,
            '--dataproc-cluster-name=' + CLUSTER_NAME,
            '--main-class=my.MinimalMainClass',
        ]
        luigi.run(argv)

        # List the cluster's jobs and inspect the first one returned.
        jobs_api = dataproc.get_dataproc_client().projects().regions().jobs()
        listing = jobs_api.list(projectId=PROJECT_ID, region=REGION, clusterName=CLUSTER_NAME)
        response = listing.execute()
        last_job = response['jobs'][0]['sparkJob']

        self.assertEqual(last_job['mainClass'], "my.MinimalMainClass")
        mt = SeqrGenotypesSchema(mt).annotate_all(overwrite=True).select_annotated_mt()

        mt.describe()
        mt.write(self.output().path, stage_locally=True, overwrite=True)


class SeqrMTToESOptimizedTask(HailElasticSearchTask):
    # Joins the rows of the variants and genotypes matrix tables produced by
    # the two required tasks and exports the result to Elasticsearch.

    def __init__(self, *args, **kwargs):
        # TODO: instead of hardcoded index, generate from project_guid, etc.
        super().__init__(*args, **kwargs)

    def requires(self):
        # Order matters: run() reads input()[0] as variants and input()[1]
        # as genotypes.
        return [SeqrVCFToVariantMTTask(), SeqrVCFToGenotypesMTTask()]

    def run(self):
        variants_path = self.input()[0].path
        variants_mt = hl.read_matrix_table(variants_path)
        genotypes_path = self.input()[1].path
        genotypes_mt = hl.read_matrix_table(genotypes_path)

        # Join the genotype rows (left side) with the variant rows.
        joined_rows = genotypes_mt.rows().join(variants_mt.rows())
        es_rows = SeqrVariantsAndGenotypesSchema.elasticsearch_row(joined_rows)

        num_shards = self._mt_num_shards(genotypes_mt)
        self.export_table_to_elasticsearch(es_rows, num_shards)

        self.cleanup()


if __name__ == '__main__':
    # Exit with status code 1 when luigi.run() reports a falsy result.
    if not luigi.run():
        sys.exit(1)
Exemple #32
0
 def test_bwaaln(self):
     # Run the bwa Aln task for the .sai file derived from the first read.
     sai_target = read1.replace(".fastq.gz", ".sai")
     cmdline = _luigi_args(['--target', sai_target, '--config-file', localconf])
     luigi.run(cmdline, main_task_cls=ratatosk.lib.align.bwa.Aln)
Exemple #33
0
        image_download = image_predict_clouds.addBands(cloud_score_percentile.select(["cluster"], ["cloudscore"])) \
            .addBands(ground_truth.select(["b1"], ["fixedmask"]))\
            .addBands(pred_percentile).clip(region_of_interest).toFloat()

        properties = ["system:time_start", 'system:index']

        return image_download, properties


class DownloadAll(luigi.WrapperTask):
    # Wrapper task requiring one DownloadImageResults task per (image index,
    # split) pair returned by get_location_splits().
    basepath = luigi.Parameter(default="reproducibility_results")
    method = luigi.ChoiceParameter(choices=["percentile","persistence","linear","kernel"],
                                   default="percentile",var_type=str)

    def requires(self):
        # The value attached to each split is not used here, only its name.
        return [
            DownloadImageResults(image_index=index_name,
                                 basepath=self.basepath,
                                 method=self.method,
                                 split=split_name)
            for index_name, splits in get_location_splits().items()
            for split_name, _unused in splits.items()
        ]

# When executed as a script, invoke luigi.run() with local_scheduler=True.
if __name__ == "__main__":
    luigi.run(local_scheduler=True)

Exemple #34
0
 def test_sort(self):
     # Build Task2 for the target directly, then run it again through the
     # command-line interface with --pipe.
     target = "output.sort.txt"
     luigi.build([Task2(target=target)])
     luigi.run(['--pipe', '--target', target], main_task_cls=Task2)
Exemple #35
0
 def test_global_param_cmdline(self):
     # Run HasGlobalParam with --global-param on the command line, then check
     # the values seen by a fresh instance.
     argv = ['--local-scheduler', '--no-lock', 'HasGlobalParam']
     argv += ['--x', 'xyz', '--global-param', '124']
     luigi.run(argv)
     task = HasGlobalParam(x='xyz')
     self.assertEqual(task.global_param, 124)
     self.assertEqual(task.global_bool_param, False)
Exemple #36
0
 def test_global_param_shared(self):
     # A value given while running SharedGlobalParamA must be visible on a
     # fresh SharedGlobalParamB instance.
     argv = ['--local-scheduler', '--no-lock', 'SharedGlobalParamA']
     argv += ['--shared-global-param', 'abc']
     luigi.run(argv)
     other_task = SharedGlobalParamB()
     self.assertEqual(other_task.shared_global_param, 'abc')
Exemple #37
0
 def test_bool_false(self):
     # Running Baz without any flag must leave Baz._val equal to False.
     argv = ['--local-scheduler', '--no-lock', 'Baz']
     luigi.run(argv)
     self.assertEqual(Baz._val, False)
Exemple #38
0
 def test_x_arg_y_arg_override(self):
     # Run Banana in 'x-arg-y-arg' style with an extra --BananaDep-y option
     # and check the resulting keys.
     argv = ['--local-scheduler', '--no-lock', 'Banana']
     argv += ['--x', 'foo', '--y', 'bar']
     argv += ['--style', 'x-arg-y-arg']
     argv += ['--BananaDep-y', 'xyz']
     luigi.run(argv)
     self.expect_keys(['banana-foo-bar', 'banana-dep-foo-bar'])
Exemple #39
0
 def test_x_arg(self):
     # Run Banana in 'x-arg' style and check the resulting keys.
     argv = ['--local-scheduler', '--no-lock', 'Banana']
     argv += ['--x', 'foo', '--y', 'bar']
     argv += ['--style', 'x-arg']
     luigi.run(argv)
     self.expect_keys(['banana-foo-bar', 'banana-dep-foo-def'])
Exemple #40
0
 def test_global_param_dep_cmdline_optparse(self):
     # Run HasGlobalParamDep through the optparse interface, then check the
     # values seen by a fresh HasGlobalParam instance.
     argv = ['--local-scheduler', '--no-lock', '--task', 'HasGlobalParamDep']
     argv += ['--x', 'xyz', '--global-param', '124']
     luigi.run(argv, use_optparse=True)
     task = HasGlobalParam(x='xyz')
     self.assertEqual(task.global_param, 124)
     self.assertEqual(task.global_bool_param, False)
Exemple #41
0
 def run_and_check(self, args):
     # Run luigi with the local scheduler and no lock, assert that the result
     # is truthy, and hand it back to the caller.
     scheduler_flags = ['--local-scheduler', '--no-lock']
     succeeded = luigi.run(scheduler_flags + args)
     self.assertTrue(succeeded)
     return succeeded
Exemple #42
0
            yield relations, json.dumps(tuple)
        ''' ...................... fill in your code above ........................'''


def parse_query_tree(relations: str, tuple: dict, raquery: radb.ast.Node) -> tuple:
    """
    Apply the select/rename operations of ``raquery`` to a single tuple.

    Select and Rename nodes are handled recursively: the node's first input is
    processed first, then ``select``/``rename`` is applied to that result.
    Project, RelRef and Join nodes stop the recursion and hand the arguments
    back unchanged.

    :param relations: relations on which to execute the query
    :param tuple: the tuple (a dict) on which to execute the query
    :param raquery: query to execute.
    :return: ``(relations, tuple)`` for Project/RelRef/Join nodes, the result
        of ``select``/``rename`` for Select/Rename nodes, and ``None`` for any
        other node type.
    """
    assert isinstance(tuple, dict), f'expecting dict got {type(tuple)}'

    # The previous operation is a map-reduce job or the first job: return the
    # line as it is.
    passthrough_nodes = (radb.ast.Project, radb.ast.RelRef, radb.ast.Join)
    if isinstance(raquery, passthrough_nodes):
        return relations, tuple

    # Select and rename are evaluated in place, without writing to a file.
    if isinstance(raquery, radb.ast.Select):
        operation = select
    elif isinstance(raquery, radb.ast.Rename):
        operation = rename
    else:
        return None

    relations, tuple = parse_query_tree(relations=relations, tuple=tuple, raquery=raquery.inputs[0])
    return operation(relations=relations, tuple=tuple, raquery=raquery)


# When executed as a script, invoke luigi.run() with no explicit arguments.
if __name__ == '__main__':
    luigi.run()
Exemple #43
0
        self.assertEqual('blah', p.value)

    @with_config({"foo": {"bar": "baz"}})
    def testGlobal(self):
        # With foo.bar configured, the global parameter reads the configured
        # value until set_global() replaces it.
        config_path = {"section": "foo", "name": "bar"}
        param = luigi.Parameter(config_path=config_path, is_global=True, default='blah')
        self.assertEqual('baz', param.value)
        param.set_global('meh')
        self.assertEqual('meh', param.value)

    def testGlobalAndMissing(self):
        # Without a with_config override, the global parameter reads its
        # default until set_global() replaces it.
        config_path = {"section": "foo", "name": "bar"}
        param = luigi.Parameter(config_path=config_path, is_global=True, default='blah')
        self.assertEqual('blah', param.value)
        param.set_global('meh')
        self.assertEqual('meh', param.value)


class OverrideEnvStuff(unittest.TestCase):
    # Checks that the scheduler port is picked up from the configuration.

    def setUp(self):
        # Reset the global scheduler_port parameter before each test.
        luigi.interface.EnvironmentParamsContainer.scheduler_port.reset_global()

    @with_config({"core": {"default-scheduler-port": '6543'}})
    def testOverrideSchedulerPort(self):
        container_cls = luigi.interface.EnvironmentParamsContainer
        env_params = container_cls.env_params()
        self.assertEqual(env_params.scheduler_port, 6543)


# When executed as a script, invoke luigi.run() with use_optparse=True.
if __name__ == '__main__':
    luigi.run(use_optparse=True)
Exemple #44
0
 def test_multibool(self):
     # Passing --multibool twice must yield both values, in order, as a tuple.
     argv = ['--local-scheduler', '--no-lock', 'Bar']
     argv += ['--multibool', 'true', '--multibool', 'false']
     luigi.run(argv)
     self.assertEqual(Bar._val, (True, False))
Exemple #45
0
 def test_bool_true(self):
     # Running Baz with --bool must leave Baz._val equal to True.
     argv = ['--local-scheduler', '--no-lock', 'Baz', '--bool']
     luigi.run(argv)
     self.assertEqual(Baz._val, True)
        with self.output().open("w") as o:
            for input in inputs:
                con = sqlite3.connect(input.fn)
                cur = con.cursor()
                try:
                    n_particles, t_min, t_max = cur.execute(
                        "SELECT nParticles, tMin, tMax FROM particleSourceMessenger;"
                    ).fetchone()
                    n_eff_cells = np.sum(
                        cur.execute(
                            "SELECT weight FROM `g4sipmDigis-0` WHERE time >= %s AND time < %s;"
                            % (t_min, t_max)).fetchall())
                    print >> o, n_particles, n_eff_cells
                except Exception as e:
                    print "Failure in", input.fn
                    print e


class All(luigi.WrapperTask):
    """Wrapper task that requires one ``DynamicRangeSimulation`` per properties file."""

    def requires(self):
        """Return a list with a ``DynamicRangeSimulation`` for each listed model file."""
        property_files = (
            "../sample/resources/hamamatsu-s13360-6025pe.properties",
            "../sample/resources/hamamatsu-s13360-6050pe.properties",
            "../sample/resources/sensl-microfj-60035-tsv.properties",
        )
        simulations = []
        for property_file in property_files:
            simulations.append(DynamicRangeSimulation(model=property_file))
        return simulations


if __name__ == "__main__":
    # Script entry point: run luigi with the All wrapper task as the main task class.
    luigi.run(main_task_cls=All)
Exemple #47
0
 def test_y_arg_override_banana(self):
     """Run ``Banana`` in ``y-kwarg`` style with per-task ``x`` values and check the keys."""
     scheduler_flags = ['--local-scheduler', '--no-lock']
     task_args = [
         'Banana',
         '--y', 'bar',
         '--style', 'y-kwarg',
         '--BananaDep-x', 'xyz',
         '--Banana-x', 'baz',
     ]
     luigi.run(scheduler_flags + task_args)

     expected_keys = ['banana-baz-bar', 'banana-dep-xyz-bar']
     self.expect_keys(expected_keys)
Exemple #48
0
 def test_default_param_cmdline(self):
     """After running ``WithDefault`` with no ``x`` argument, ``x`` should be ``'xyz'``."""
     cmdline = ['--local-scheduler', '--no-lock', 'WithDefault']
     luigi.run(cmdline)

     task = WithDefault()
     self.assertEqual(task.x, 'xyz')
Exemple #49
0
 def test_x_arg_override_stupid(self):
     """Run ``Banana`` in ``x-arg`` style with a ``--BananaDep-x`` value and check the keys."""
     scheduler_flags = ['--local-scheduler', '--no-lock']
     task_args = [
         'Banana',
         '--x', 'foo',
         '--y', 'bar',
         '--style', 'x-arg',
         '--BananaDep-x', 'blabla',
     ]
     luigi.run(scheduler_flags + task_args)

     expected_keys = ['banana-foo-bar', 'banana-dep-foo-def']
     self.expect_keys(expected_keys)
Exemple #50
0
                save_directory_path=os.path.join(self.local_temporary_directory, 'factorization_machine'),
                scope_name='FactorizationMachineExample'),
            output_file_path='criteo/validation.zip')
        return dict(model=validation_task, test_data=test_data_task)

    def output(self):
        """Return the target that ``make_target`` builds for the results file."""
        results_path = 'criteo/example_results.txt'
        return self.make_target(results_path)

    def run(self):
        """Score the loaded model on the test data and dump the AUC.

        Resets the default TensorFlow graph, loads the ``model`` and
        ``test_data`` inputs, predicts on every column except ``label`` and
        dumps the result as the string ``auc=<value>``.
        """
        tf.reset_default_graph()
        fm_model = self.load('model')  # type: redshells.model.FactorizationMachine
        frame = self.load_data_frame('test_data')

        # Split the frame into the target column and the remaining columns.
        labels = frame['label'].copy()
        features = frame.drop('label', axis=1)

        predictions = fm_model.predict(features)
        auc = redshells.model.utils.calculate_auc(labels, predictions)
        self.dump(f'auc={auc}')


if __name__ == '__main__':
    # Before running, download the Criteo data from
    # https://www.kaggle.com/c/criteo-display-ad-challenge and put train.txt at
    # ./resources/criteo/train.txt (the path passed as --text-data-file-path below).
    luigi.configuration.add_config_path('./config/example.ini')
    luigi.run([
        'examples.FactorizationMachineExample',
        '--text-data-file-path',
        './resources/criteo/train.txt',
        '--data-size-rate',
        '0.1',
        '--local-scheduler',
    ])
Exemple #51
0
                    output=temp)
        luigi.File(temp).move(self.output().path)

    def output(self):
        """Return a local target at this task's path with the ``ldj.gz`` extension."""
        target_path = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=target_path)


class CrossrefIntermediateSchema(Task):
    """
    Turn the crossref items into the intermediate schema.

    The conversion is delegated to the ``span-import`` command line tool,
    which here only maps one JSON format onto another.
    """

    def requires(self):
        """Depend on the ``CrossrefItems`` task."""
        return CrossrefItems()

    def run(self):
        """Pipe the decompressed input through ``span-import`` and move the result into place."""
        command = "span-import -w 2 -i crossref <(unpigz -c {input}) | pigz -c > {output}"
        source_path = self.input().path
        converted = shellout(command, input=source_path)
        luigi.File(converted).move(self.output().path)

    def output(self):
        """Return a local target at this task's path with the ``ldj.gz`` extension."""
        target_path = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=target_path)


if __name__ == '__main__':
    # Script entry point: run CrossrefIntermediateSchema with one worker,
    # passing the --local-scheduler flag.
    luigi.run(
        ['CrossrefIntermediateSchema', '--workers', '1', '--local-scheduler'])
Exemple #52
0
 def test_global_param_dep_cmdline_bool(self):
     """After running with ``--global-bool-param``, ``HasGlobalParam`` should report it as True."""
     cmdline = [
         '--local-scheduler', '--no-lock', 'HasGlobalParamDep',
         '--x', 'xyz',
         '--global-bool-param',
     ]
     luigi.run(cmdline)

     task = HasGlobalParam(x='xyz')
     self.assertEqual(task.global_param, 123)
     self.assertEqual(task.global_bool_param, True)
    # Task namespace; the __main__ block of this file addresses a task with an
    # "iris_tasks." prefix.
    task_namespace = 'iris_tasks'

    # Path of the file that run() pickles the evaluation result into.
    output_path = luigi.Parameter()

    def requires(self):
        """Yield the two upstream tasks: first ``SplitTrainTest``, then ``Predict``."""
        for task_cls in (SplitTrainTest, Predict):
            yield task_cls()

    def run(self):
        """Compute the accuracy of the predictions and pickle it to ``output_path``.

        The first input is read as the pickled data (its ``"test_y"`` entry holds
        the expected values) and the second input as the pickled predictions.
        The result is printed and written as ``{"accuracy": <score>}``.
        """
        from sklearn.metrics import accuracy_score

        split_data = pd.read_pickle(self.input()[0].path)
        predicted = pd.read_pickle(self.input()[1].path)

        report = {"accuracy": accuracy_score(split_data["test_y"], predicted)}
        print(report)

        with open(self.output_path, "wb") as sink:
            pickle.dump(report, sink)

    def output(self):
        """Return the local target for ``output_path``, creating its directory if needed.

        The directory part is taken with ``os.path.dirname`` so that absolute
        paths keep their leading separator and a bare file name (no directory
        part) does not fail. The target points at ``output_path`` itself, which
        is the same path ``run()`` writes to.

        :return: ``luigi.LocalTarget`` for ``self.output_path``.
        """
        output_dir = os.path.dirname(self.output_path)
        # An empty dirname means the file lives in the current directory,
        # so there is nothing to create.
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        return luigi.LocalTarget(self.output_path)


if __name__ == "__main__":
    # Script entry point: run the task addressed as "iris_tasks.Evaluate" with
    # one worker, passing the --local-scheduler flag.
    luigi.run(["iris_tasks.Evaluate", "--workers", "1", "--local-scheduler"])
Exemple #54
0
 def test_forgot_param_in_dep(self):
     """Running ``ForgotParamDep`` should leave ``last_email`` set (not ``None``)."""
     # A programmatic missing parameter will cause an error email to be sent
     luigi.run(['--local-scheduler', '--no-lock', 'ForgotParamDep'])
     # assertIsNotNone replaces the deprecated assertNotEquals alias, which
     # newer unittest versions no longer provide.
     self.assertIsNotNone(self.last_email)
        :param record:
        :return:
        """

        # Look at HTTP Responses:
        if (record.type == WarcRecord.RESPONSE
                and record.content_type.startswith(b'application/http')):
            # Parse the HTTP Headers, faking a socket wrapper:
            f = hanzo.warcpayload.FileHTTPResponse(record.content_file)
            f.begin()

            hostname = urlparse.urlparse(record.url).hostname
            yield hostname, f.status

    def reducer(self, key, values):
        """
        Emit one total per key.

        :param key: the key the values were grouped under.
        :param values: iterable of numeric values collected for ``key``; it may
            be a one-shot iterator.
        :return: generator yielding a single ``(key, sum(values))`` pair.
        """
        # Sum exactly once, outside any loop: looping over ``values`` while also
        # summing it drops the first item of a one-shot iterator and repeats the
        # total once per element for a list.
        # NOTE(review): the mapper in this file emits (hostname, status), so this
        # adds up status values; confirm whether a per-host count was intended.
        yield key, sum(values)


if __name__ == '__main__':
    # Script entry point: run GenerateWarcStats on daily-warcs-test.txt,
    # passing the --local-scheduler flag.
    luigi.run([
        'GenerateWarcStats', '--input-file', 'daily-warcs-test.txt',
        '--local-scheduler'
    ])
Exemple #56
0
from tasks.save_boe_diary_entry import SaveBoeDiaryEntry, SaveOption

class Pipeline(luigi.WrapperTask):
    """Process the BOE diary for one date and save every resulting entry."""

    date = luigi.DateParameter()

    def complete(self):
        """Always report the task as not complete."""
        return False

    def requires(self):
        """Depend on ``ProcessBoeDiary`` for the same date."""
        return ProcessBoeDiary(date=self.date)

    def run(self):
        """Read the diary entries as JSON and yield one save task per entry.

        The save tasks are yielded together as a single lazy generator; each is
        asked to save to both the database and Elasticsearch.
        """
        with self.input().open('r') as diary_file:
            diary_entries = json.load(diary_file)

        def save_tasks():
            # Built lazily, one task per entry, when the generator is consumed.
            for entry in diary_entries:
                yield SaveBoeDiaryEntry(
                    entry=entry,
                    save_options=(
                        SaveOption.DATABASE.value,
                        SaveOption.ELASTICSEARCH.value,
                    ),
                )

        yield save_tasks()

if __name__ == '__main__':
    # Script entry point: run Pipeline with three workers; the date comes from
    # the first command line argument.
    # NOTE(review): sys.argv[1] is read without a length check, so running the
    # script with no argument raises IndexError.
    # NOTE(review): the database host, user and password are hard-coded below.
    luigi.run(['Pipeline',
        '--local-scheduler',
        '--workers', '3',
        '--GlobalParams-base-dir', './temp',
        '--DBParams-host', 'localhost',
        '--DBParams-user', 'root',
        '--DBParams-password', 'pass',
        '--DBParams-database', 'boe',
        '--date', sys.argv[1]
        ])
Exemple #57
0
    def output(self):
        """Log the call and return a local target whose name embeds the current time."""
        print("Task1: output")
        # time.asctime() is evaluated on every call, so the path depends on
        # when output() is invoked.
        target_path = "intermediate/task1_%s.txt" % time.asctime()
        return luigi.LocalTarget(target_path)


class Task2(luigi.Task):
    """Copy the output of ``Task1`` and append a line stamped with the current time.

    Every method prints a short trace message when it is called.
    """

    task_namespace = 'tasks'

    def requires(self):
        """Depend on ``Task1``."""
        print("Task2: requires")
        return Task1()

    def run(self):
        """Write the upstream text followed by a generated-at footer to the output."""
        print("Task2: run")
        with self.input().open("r") as source:
            with self.output().open("w") as sink:
                upstream_text = source.read()
                sink.write(upstream_text)

                footer = "This file was generated by Task2 at %s." % time.asctime()
                sink.write(footer)

    def output(self):
        """Return a local target whose name embeds the current time."""
        print("Task2: output")
        # time.asctime() is evaluated on every call, so the path depends on
        # when output() is invoked.
        target_path = "output/task2_%s.txt" % time.asctime()
        return luigi.LocalTarget(target_path)


if __name__ == '__main__':
    # Script entry point: run tasks.Task2 with one worker.
    # '--local-scheduler' is not passed here; append it to the argument list
    # to run with that flag instead.
    luigi.run(['tasks.Task2', '--workers', '1'])
Exemple #58
0
 def test_bampe(self):
     """Run the ``Bampe`` task on ``data/sample1.bam`` with the local config file."""
     cmdline = [
         '--target', "data/sample1.bam",
         '--config-file', localconf,
     ]
     luigi.run(cmdline, main_task_cls=Bampe)
class ExternalFile(luigi.ExternalTask):
    """Luigi external task wrapping a file supplied from outside the pipeline."""

    ext_file = luigi.Parameter()

    def output(self):
        """Return a local target pointing at ``ext_file``."""
        file_path = self.ext_file
        return luigi.LocalTarget(file_path)


class MatrixAnalysis(luigi.WrapperTask):
    """Wrapper task that triggers the domain-matrix analyses for every package found."""

    def requires(self):
        """Yield a ``StringoidAnalysis`` and a ``DnsAnalysis`` for each package.

        Packages are discovered by walking ``cfg.fake_apks_folder`` for ``.apk``
        files. File names are split on ``_``: the last two fields are read as
        the version code and the date, everything before them as the package.
        """
        apps = defaultdict(set)
        for folder in [cfg.fake_apks_folder]:
            for _root, _dirs, files in os.walk(folder):
                for basename in files:
                    if not basename.endswith('.apk'):
                        continue
                    fields = basename.split("_")
                    pkg = "_".join(fields[:-2])
                    vercode = fields[-2]
                    # Drop the file extension from the trailing date field.
                    date = fields[-1].split('.')[0]
                    apps[pkg].add((pkg, vercode, date))

        # Only the package name is needed to build the analysis tasks.
        for pkg in apps:
            yield StringoidAnalysis(pkg=pkg)
            yield DnsAnalysis(pkg=pkg)


if __name__ == '__main__':
    # Script entry point: run luigi with MatrixAnalysis as the main task class.
    luigi.run(main_task_cls=MatrixAnalysis)
Exemple #60
0
 def test_multibool_empty(self):
     """Running ``Bar`` without ``--multibool`` should leave ``Bar._val`` as an empty tuple."""
     cmdline = ['--local-scheduler', '--no-lock', 'Bar']
     luigi.run(cmdline)
     self.assertEqual(Bar._val, ())