def addModeledPrt():
    for tname in [_.rstrip() for _ in file("../dat/casf_names.txt")]:
        luigi.build([AddCenter2Modeled(tname, version="0.7"),
                     AddCenter2Modeled(tname, version="0.5")],
                    local_scheduler=True)
def test_row_overload(self):
    """Overload the rows method and we should be able to insert data into the database."""

    class SQLARowOverloadTest(sqla.CopyToTable):
        columns = [(["item", sqlalchemy.String(64)], {}),
                   (["property", sqlalchemy.String(64)], {})]
        connection_string = CONNECTION_STRING
        table = "item_property"
        chunk_size = 1

        def rows(self):
            tasks = [
                ("item0", "property0"),
                ("item1", "property1"),
                ("item2", "property2"),
                ("item3", "property3"),
                ("item4", "property4"),
                ("item5", "property5"),
                ("item6", "property6"),
                ("item7", "property7"),
                ("item8", "property8"),
                ("item9", "property9"),
            ]
            for row in tasks:
                yield row

    task = SQLARowOverloadTest()
    luigi.build([task], local_scheduler=True, workers=self.NUM_WORKERS)
    self._check_entries(self.engine)
def main(tname, subset):
    luigi.build([Curate.PairWisePsScore(tname=tname, subset=subset),
                 Curate.PairWiseTanimoto(tname=tname, subset=subset)],
                local_scheduler=True)
def test_unified_snapshot(self):
    task = UnifiedSnapshotMock()
    luigi.build([task], local_scheduler=True)
    print(task.output().path)
    with task.output().open() as handle:
        lines = handle.readlines()
        self.assertEquals(186, len(lines))
def setUp(self):
    super(TestVisualiser, self).setUp()
    x = 'I scream for ice cream'
    task = UberTask(base_task=FailingMergeSort, x=x, copies=4)
    luigi.build([task], workers=1, scheduler_port=self.get_http_port())
    self.done = threading.Event()

    def _do_ioloop():
        # Enter ioloop for maximum TEST_TIMEOUT. Check every 2s whether the test has finished.
        print('Entering event loop in separate thread')
        for i in range(TEST_TIMEOUT):
            try:
                self.wait(timeout=1)
            except AssertionError:
                pass
            if self.done.is_set():
                break
        print('Exiting event loop thread')

    self.iothread = threading.Thread(target=_do_ioloop)
    self.iothread.start()
def test():
    luigi.build(
        [
            Extract("1b9vA"),
            PrepareLigStateEnsemble("1b9vA"),
        ],
        local_scheduler=True,
    )
def closest(self):
    task = VIAFLatestDate()
    luigi.build([task], local_scheduler=True)
    with task.output().open() as handle:
        date, _ = handle.iter_tsv(cols=('date', 'url')).next()
        dateobj = datetime.date(*map(int, date.split('-')))
    return dateobj
def main():
    # get the arguments from the command line
    parser = build_arg_parser()
    cmdline_args = parser.parse_args()

    # get a named tuple of indexes v1...vN
    indexes = Load.label_indices(cmdline_args.backup_count + 1, cmdline_args.index)

    # get the end class
    task = Load(indexes=indexes,
                mapping_file=cmdline_args.mapping_file,
                settings_file=cmdline_args.settings_file,
                docs_file=cmdline_args.docs_file,
                table=cmdline_args.table,
                sql_filter=cmdline_args.sql_filter,
                marker_table=cmdline_args.marker_table,
                es_timeout=cmdline_args.es_timeout)

    if cmdline_args.clear:
        clear(task)
    else:
        if cmdline_args.restart:
            clear(task)
        luigi.build([task], local_scheduler=True)

        # luigi suppresses exceptions
        if capture_task_exceptions:
            raise Exception(
                'A luigi.Task failed, see traceback in the Luigi Execution Summary')
def requires(self):
    task = SWBOpenDataDates(date=self.date)
    luigi.build([task], local_scheduler=True)
    with task.output().open() as handle:
        for row in handle.iter_tsv(cols=('date',)):
            dateobj = datetime.date(*map(int, row.date.split('-')))
            yield SWBOpenDataListified(date=dateobj)
def run(self):
    output = shellout("cut -f2 {input} | LANG=C sort | LANG=C uniq > {output}",
                      input=self.input().path)
    with open(output) as handle:
        dates = map(string.strip, handle.readlines())

    with self.output().open('w') as output:
        for date in dates:
            dateobj = datetime.date(*map(int, date.split('-')))
            marc = SWBOpenDataMarc(date=dateobj)
            sdb = SWBOpenDataSeekMapDB(date=dateobj)
            luigi.build([marc, sdb], local_scheduler=True)
            with open(marc.output().path) as handle:
                with sqlite3db(sdb.output().path) as cursor:
                    idset = df[df.date == date].id.values.tolist()
                    limit, offset = self.limit, 0
                    while True:
                        cursor.execute("""
                            SELECT offset, length FROM seekmap WHERE id IN (%s)""" % (
                            ','.join(("'%s'" % id for id in idset[offset:offset + limit]))))
                        rows = cursor.fetchall()
                        if not rows:
                            break
                        else:
                            copyregions(handle, output, rows)
                            offset += limit
def test():
    task1 = VinaResultAccuracy("3ofl_JHM_B_1.pdb")
    task2 = QueryVinaResultOnIdenticalTemplate("3ofl_JHM_B_1.pdb")
    task3 = QueryVinaResultOnIdenticalTemplate("1yst_U10_L_1.pdb")
    task4 = QueryVinaResultOnIdenticalTemplate("3mbm_717_C_1.pdb")
    task5 = QueryVinaResultOnIdenticalTemplate("3jdw_ORN_A_1.pdb")
    luigi.build([task1, task2, task3, task4, task5], local_scheduler=True)
def test_update_table_task(self, mock_config):
    mock_config.get_config.return_value.get.return_value = AWS_ACCESS_KEY
    t = TestUpdateDynamoDBTableTask()

    # mock s3 location for writing output token
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket("mybucket")

    # create table
    table_name = "dynamo_table1"
    schema = [HashKey("my_hash", data_type=STRING)]
    indexes = [
        AllIndex("IndexName",
                 parts=[HashKey("my_hash", data_type=STRING),
                        RangeKey("range_index", data_type=NUMBER)])
    ]
    throughput = {"read": 2, "write": 4}
    client = DynamoDBClient(aws_access_key_id=AWS_ACCESS_KEY,
                            aws_secret_access_key=AWS_SECRET_KEY)
    client.create_table(table_name, schema, throughput, indexes=indexes)

    luigi.build([t], local_scheduler=True)

    table = client.get_table("dynamo_table1")
    self.assertEquals(8, table.throughput["read"])
    self.assertEquals(16, table.throughput["write"])
def test():
    luigi.build(
        [AddNativeCenter("3owjA00"),
         AddCenter2Modeled("3owjA00", version="0.7"),
         AddCenter2Modeled("3owjA00", version="0.5")],
        local_scheduler=True)
def test_column_row_separator(self):
    """
    Test that an alternate column row separator works.

    :return:
    """
    class ModBaseTask(luigi.Task):

        def output(self):
            return MockFile("ModBaseTask", mirror_on_stderr=True)

        def run(self):
            out = self.output().open("w")
            tasks = ["item%d,property%d\n" % (i, i) for i in range(10)]
            for task in tasks:
                out.write(task)
            out.close()

    class ModSQLATask(sqla.CopyToTable):
        columns = [(["item", sqlalchemy.String(64)], {}),
                   (["property", sqlalchemy.String(64)], {})]
        connection_string = CONNECTION_STRING
        table = "item_property"
        column_separator = ","
        chunk_size = 1

        def requires(self):
            return ModBaseTask()

    task1, task2 = ModBaseTask(), ModSQLATask()
    luigi.build([task1, task2], local_scheduler=True, workers=self.NUM_WORKERS)
    self._check_entries(self.engine)
def mariobros(targets=('DEFAULT',), mariofile='mario.ini', print_ns=False, dry_run=False,
              workers=1, port=8082, **kwargs):
    """Main mariobros entry point. Parse the configuration file and launch the build of targets.

    :param sequence targets: List of targets.
    :param unicode mariofile: MarioFile name.
    :param bool print_ns: Flag to print namespace.
    :param bool dry_run: Dry run flag.
    :param int workers: Number of workers.
    :param dict kwargs: Passed to the luigi.build function.
    """
    assert all(isinstance(target, str) for target in targets)
    if dry_run and workers > 1:
        workers = 1
        mario.LOGGER.warning('Dry run is incompatible with multiprocessing. Setting --workers=1')
    section_namespaces = mariofile_.parse_mariofile(mariofile)
    default_namespace, rendered_namespaces = mario.render_config(section_namespaces)
    if print_ns:
        namespaces = mario.print_namespaces(default_namespace, section_namespaces)
        print(namespaces)
    else:
        target_tasks = mario.mario(
            rendered_namespaces, default_namespace, targets=targets, dry_run=dry_run)
        luigi.build(target_tasks, workers=workers, scheduler_port=port, **kwargs)
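A minimal usage sketch of the entry point above; the target name 'all' and the extra local_scheduler keyword (forwarded to luigi.build via **kwargs) are assumptions, not part of the original snippet.

# Hypothetical invocation, assuming a mario.ini with an 'all' target exists
# in the working directory. local_scheduler is passed through **kwargs to luigi.build.
mariobros(targets=('all',), mariofile='mario.ini', workers=2, local_scheduler=True)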
def test_multiple_tasks(self):
    """
    Test a case where there are multiple tasks.

    :return:
    """
    class SmallSQLATask(sqla.CopyToTable):
        item = luigi.Parameter()
        property = luigi.Parameter()
        columns = [(["item", sqlalchemy.String(64)], {}),
                   (["property", sqlalchemy.String(64)], {})]
        connection_string = CONNECTION_STRING
        table = "item_property"
        chunk_size = 1

        def rows(self):
            yield (self.item, self.property)

    class ManyBaseTask(luigi.Task):

        def requires(self):
            for t in TASK_LIST:
                item, property = t.strip().split("\t")
                yield SmallSQLATask(item=item, property=property)

    task2 = ManyBaseTask()
    luigi.build([task2], local_scheduler=True, workers=self.NUM_WORKERS)
    self._check_entries(self.engine)
def test_customized_worker(self):
    a = DummyTask(3)
    self.assertFalse(a.complete())
    self.assertFalse(self.worker.complete())
    luigi.build([a], scheduler_instance=self.scheduler, worker_instance=self.worker)
    self.assertTrue(a.complete())
    self.assertTrue(self.worker.complete())
def test_map_only(self):
    luigi.build([MapOnlyJob()], local_scheduler=True)
    c = []
    for line in File("luigitest-3").open("r"):
        c.append(line.strip())
    self.assertEquals(c[0], "kj")
    self.assertEquals(c[4], "ljoi")
def test_map_only(self):
    luigi.build([MapOnlyJob()], local_scheduler=True)
    c = []
    for line in File('luigitest-3').open('r'):
        c.append(line.strip())
    self.assertEquals(c[0], 'kj')
    self.assertEquals(c[4], 'ljoi')
def run(self):
    self.setup_arguments()
    self.process_arguments(self.parser.parse_args())
    self.log_arguments()
    luigi.build(self.create_tasks(),
                local_scheduler=self.local_scheduler,
                workers=self.workers)
def test_extract(self, mock_config):
    """
    Tests that the mysql extraction runs correctly.
    """
    output_file = os.path.join(tempfile.gettempdir(), 'extract-%s.txt' % uuid.uuid4().hex)
    conf = ConfigParser.ConfigParser()
    conf.add_section('mysql')
    conf.set('mysql', 'dbname', 'mydb')
    conf.set('mysql', 'host', 'myhost')
    conf.set('mysql', 'port', '3306')
    conf.set('mysql', 'user', 'myuser')
    conf.set('mysql', 'password', 'my_password')
    mock_config.get_config.return_value = conf

    with mock.patch("mortar.luigi.dbms.subprocess") as subprocess:
        subprocess_return = mock.Mock()
        subprocess_return.stdout = StringIO.StringIO()
        subprocess_return.stderr = StringIO.StringIO()
        subprocess_return.communicate.return_value = (None, None)
        subprocess_return.returncode = 0
        subprocess.Popen.return_value = subprocess_return

        t = dbms.ExtractFromMySQL(table='foo', output_path=output_file)
        self.assertFalse(t.output()[0].exists())
        luigi.build([t], local_scheduler=True)
        self.assertTrue(t.output()[0].exists())

    os.remove(output_file)
def main():
    op = OptionParser()
    op.add_option('--interval', '-i', default=1000)
    op.add_option('--directory', '-d')
    op.add_option('--config', '-c')
    op.add_option('--start', '-s')
    op.add_option('--end', '-e')
    op.add_option('--workflow', '-w')
    op.add_option('--filetype', '-f', default='json')
    options, arguments = op.parse_args()

    if not options.config:
        op.error('No configuration YAML')
    if not options.directory:
        op.error('No input file directory')
    if not options.workflow:
        op.error('No workflow specified')

    files = glob.glob(os.path.join(options.directory, '*.{0}'.format(options.filetype)))
    if not files:
        op.error('Empty input file directory (no {0})'.format(options.filetype))

    try:
        interval = int(options.interval)
    except:
        op.error('Non-integer interval value')

    try:
        start_index = int(options.start)
    except:
        start_index = 0

    try:
        end_index = int(options.end)
    except:
        end_index = len(files)

    # this only works for the workflows imported above (and they
    # need to be imported, obv). things like the eda, which aren't
    # working on a single file dependency tree, need something else
    try:
        workflow_class = getattr(sys.modules[__name__], options.workflow)
    except AttributeError:
        op.error('Unable to load workflow for {0}'.format(options.workflow))

    for i in xrange(start_index, end_index, interval):
        w = workflow_class(
            doc_dir=options.directory,
            yaml_file=options.config,
            start_index=i,
            end_index=(i + interval) if (i + interval) < end_index else end_index
        )
        luigi.build([w], local_scheduler=True)
def test_rows(self):
    task, task0 = SQLATask(), BaseTask()
    luigi.build([task, task0], local_scheduler=True, workers=self.NUM_WORKERS)
    for i, row in enumerate(task.rows()):
        given = TASK_LIST[i].strip("\n").split("\t")
        self.assertEqual(row, given)
def test_customized_worker(self):
    a = DummyTask(3)
    self.assertFalse(a.complete())
    self.assertFalse(self.worker_scheduler_factory.worker.complete())
    luigi.build([a], worker_scheduler_factory=self.worker_scheduler_factory)
    self.assertTrue(a.complete())
    self.assertTrue(self.worker_scheduler_factory.worker.complete())
def test_reflect(self):
    """
    If the table is already set up, one can set reflect to True and skip the
    columns declaration entirely; it is not even required at that point.

    :return:
    """
    class AnotherSQLATask(sqla.CopyToTable):
        connection_string = CONNECTION_STRING
        table = "item_property"
        reflect = True
        chunk_size = 1

        def requires(self):
            return SQLATask()

        def copy(self, conn, ins_rows, table_bound):
            ins = (table_bound.update()
                   .where(table_bound.c.property == sqlalchemy.bindparam("_property"))
                   .values({table_bound.c.item: sqlalchemy.bindparam("_item")}))
            conn.execute(ins, ins_rows)

        def rows(self):
            for line in TASK_LIST:
                yield line.strip("\n").split("\t")

    task0, task1, task2 = AnotherSQLATask(), SQLATask(), BaseTask()
    luigi.build([task0, task1, task2], local_scheduler=True, workers=self.NUM_WORKERS)
    self._check_entries(self.engine)
def test_use_json_as_data_interchange_format_job(test_case):
    job = UseJsonAsDataInteterchangeFormatJob(use_hdfs=test_case.use_hdfs)
    luigi.build([job], local_scheduler=True)
    c = []
    for line in job.output().open("r"):
        c.append(line)
    test_case.assertEqual(c, ['{"data type": "json"}\n'])
def main(sdf):
    luigi.build([
        PairwiseCms(sdf, maximum_radius=8.0),
        PairwiseCmsEvenlyDistributed(sdf, maximum_radius=8.0),
        PairwiseCmsEvenlyGryration(sdf)
    ], local_scheduler=True)
def test_run_job_with_dump(self, mock_check_output):
    mock_check_output.side_effect = [
        'Your job 12345 ("test_job") has been submitted',
        ''
    ]
    task = TestJobTask(i=1, n_cpu=1, shared_tmp_dir='/tmp')
    luigi.build([task], local_scheduler=True)
    self.assertEqual(mock_check_output.call_count, 2)
def test_output_token(self):
    """
    Output should be the given token path and the class name.
    """
    t = TestShellScriptTask(self.token_path)
    self.t = t
    luigi.build([t], local_scheduler=True)
    self.assertEquals('%s/%s' % (self.token_path, t.__class__.__name__),
                      t.output()[0].path)
def test_map_only(test_case):
    job = MapOnlyJob(use_hdfs=test_case.use_hdfs)
    luigi.build([job], local_scheduler=True)
    c = []
    for line in job.output().open("r"):
        c.append(line.strip())
    test_case.assertEqual(c[0], "kj")
    test_case.assertEqual(c[4], "ljoi")
def test_inheritance(self):
    t = PowerSum(lo=42, hi=45, p=2)
    luigi.build([t], local_scheduler=True)
    self.assertEqual(t.s, 42**2 + 43**2 + 44**2)
    yield bss2_handle


if __name__ == "__main__":
    # Set up the command line parameters
    PARSER = argparse.ArgumentParser(description="BS Seeker 2 Peak Caller Pipeline Wrapper")
    PARSER.add_argument("--genome_fa", help="")
    PARSER.add_argument("--genome_idx", help="")
    PARSER.add_argument("--aligner", help="")
    PARSER.add_argument("--aligner_path", help="")
    PARSER.add_argument("--bss_path", help="")
    PARSER.add_argument("--shared_tmp_dir", help="")
    PARSER.add_argument("--python_path", default=sys.executable, help="")

    # Get the matching parameters from the command line
    ARGS = PARSER.parse_args()

    SHARED_TMP_DIR = ARGS.shared_tmp_dir

    luigi.build([
        BSseeker2Indexer(
            genome_fa=ARGS.genome_fa,
            genome_idx=ARGS.genome_idx,
            aligner=ARGS.aligner,
            aligner_path=ARGS.aligner_path,
            bss_path=ARGS.bss_path,
            user_python_path=ARGS.python_path)
    ], local_scheduler=True, workers=5)
import state_to_state_transitions2 as sst
import pandas as pd
import numpy as np
import luigi


class wrapper(luigi.WrapperTask):

    def requires(self):
        files = ['Session.csv', 'lead.csv', 'opportunity.csv', 'complete.csv']
        task_list = []
        for i in range(1, len(files)):
            path = '/Users/emmanuels/Documents/AttributionData/Data/'
            one = path + str(files[i - 1])
            two = path + str(files[i])
            task_list.append(sst.state_to_state(first_file=one, second_file=two))
        return task_list

    def run(self):
        print('Wrapper ran')
        pd.DataFrame().to_csv('/Users/emmanuels/Documents/AttributionData/Data/wrangler1.csv')

    def output(self):
        return luigi.LocalTarget('/Users/emmanuels/Documents/AttributionData/Data/wrangler1.csv')


if __name__ == '__main__':
    luigi.build([wrapper()], workers=8, local_scheduler=True)
def test_priority_w_dep(self):
    x, y, z = PrioTask(25), PrioTask(15), PrioTask(5)
    a, b, c = PrioTask(24), PrioTask(14), PrioTask(4)
    luigi.build([a, b, c, x, y, z], local_scheduler=True)
    self.assertTrue(z.t < y.t < x.t < c.t < b.t < a.t)
def run_tasks_for_dry_run(tasks_to_run):
    for type in [
        "failure",
        "success",
        "timeout",
        "process_failure",
        "processing_time",
        "broken_task",
    ]:
        os.makedirs(Path(constants.RESULTS_DIRECTORY) / type)

    run_result = luigi.build(
        tasks_to_run,
        local_scheduler=True,
        detailed_summary=True,
        workers=10,
        log_level='INFO',
    )

    for filename in glob('results/failure/*.json'):
        result = json.loads(open(filename, 'r').read())
        click.echo(colorclass.Color("{red}" + result.get('task_type') + " failed{/red}"))
        click.echo(f"{yaml.safe_dump({'parameters': result.get('task_params')})}")
        click.echo("\n".join(result.get('exception_stack_trace')))
        click.echo('')

    exit_status_codes = {
        LuigiStatusCode.SUCCESS: 0,
        LuigiStatusCode.SUCCESS_WITH_RETRY: 0,
        LuigiStatusCode.FAILED: 1,
        LuigiStatusCode.FAILED_AND_SCHEDULING_FAILED: 2,
        LuigiStatusCode.SCHEDULING_FAILED: 3,
        LuigiStatusCode.NOT_RUN: 4,
        LuigiStatusCode.MISSING_EXT: 5,
    }

    click.echo("Dry run results")
    table_data = [
        ['Result', 'Launch', 'Account', 'Region', 'Current Version', 'New Version', 'Notes'],
    ]
    table = terminaltables.AsciiTable(table_data)

    for filename in glob('output/TerminateProductDryRunTask/*.json'):
        result = json.loads(open(filename, 'r').read())
        table_data.append([
            result.get('effect'),
            result.get('params').get('launch_name'),
            result.get('params').get('account_id'),
            result.get('params').get('region'),
            result.get('current_version'),
            result.get('new_version'),
            result.get('notes'),
        ])

    for filename in glob('output/ProvisionProductDryRunTask/*.json'):
        result = json.loads(open(filename, 'r').read())
        table_data.append([
            result.get('effect'),
            result.get('params').get('launch_name'),
            result.get('params').get('account_id'),
            result.get('params').get('region'),
            result.get('current_version'),
            result.get('new_version'),
            result.get('notes'),
        ])

    click.echo(table.table)
    sys.exit(exit_status_codes.get(run_result.status))
def run(cls):
    cls.logger.info('Running Systems Checks...')
    return luigi.build([SystemCheckTask()], local_scheduler=True)
def main(config):
    luigi.build([groupEmbTags(config=config)])
    combinations = {
        'experiment': 'FF',
        'setups': [setup],
        'iterations': [50000, 100000, 150000],
        'samples': [validation],
        'thresholds': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'merge_functions': ['mean_aff', 'max_aff'],
        'init_with_maxs': [False],
        'custom_fragments': True,
        'histogram_quantiless': [False],
        'discrete_queues': [True],
        'keep_segmentation': True,
        'dilate_mask': 0,
        'mask_fragments': True
    }

    range_keys = [
        'setups',
        'iterations',
        'samples',
        'merge_functions',
        'init_with_maxs',
        'histogram_quantiless',
        'discrete_queues'
    ]

    jobs.append(EvaluateCombinations(combinations, range_keys))

set_base_dir(os.path.abspath('..'))

luigi.build(
    jobs,
    workers=50,
    scheduler_host='slowpoke1.int.janelia.org',
    logging_conf_file='/groups/saalfeld/home/funkej/.luigi/logging.conf')
def test_Ok(self):
    task = TestTask()
    luigi.build([task], local_scheduler=True)
from luigi import build

import bqe_search
from bqe_search import combined_search

build(
    [
        combined_search(gene_list=["p53", "CCND1"],
                        combination_terms=["apoptosis", "cell cycle"])
    ],
    local_scheduler=True,
)
def run(tasks, forced=None, forced_all=False, forced_all_upstream=False, confirm=True,
        workers=1, abort=True, **kwargs):
    """
    Run tasks locally. See luigi.build for additional details.

    Args:
        tasks (obj, list): task or list of tasks
        forced (list): list of forced tasks
        forced_all (bool): force all tasks
        forced_all_upstream (bool): force all tasks including upstream
        confirm (bool): confirm invalidating tasks
        workers (int): number of workers
        abort (bool): on errors raise exception
        kwargs: keywords to pass to luigi.build
    """
    if not isinstance(tasks, (list,)):
        tasks = [tasks]

    if forced_all:
        forced = tasks
    if forced_all_upstream:
        for t in tasks:
            invalidate_upstream(t, confirm=confirm)
    if forced is not None:
        if not isinstance(forced, (list,)):
            forced = [forced]
        invalidate = []
        for tf in forced:
            for tup in tasks:
                invalidate.append(d6tflow.taskflow_downstream(tf, tup))
        invalidate = set().union(*invalidate)
        invalidate = {t for t in invalidate if t.complete()}
        if len(invalidate) > 0:
            if confirm:
                print('Forced tasks', invalidate)
                c = input('Confirm invalidating forced tasks (y/n)')
            else:
                c = 'y'
            if c == 'y':
                [t.invalidate(confirm=False) for t in invalidate]
            else:
                return None

    opts = {**{'workers': workers, 'local_scheduler': True,
               'log_level': d6tflow.settings.log_level},
            **kwargs}
    result = luigi.build(tasks, **opts)
    if abort and not result:
        raise RuntimeError('Exception found running flow, check trace')
    return result
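A minimal usage sketch for the helper above; the task classes TaskTrain and TaskPreprocess and their parameters are hypothetical, not part of the original snippet.

# Hypothetical usage of the run() helper defined above.
# TaskTrain, TaskPreprocess and do_preprocess are assumed example names.
result = run(TaskTrain(do_preprocess=False), workers=2)

# Force a rerun of everything downstream of an upstream task,
# skipping the interactive confirmation prompt.
run([TaskTrain()], forced=TaskPreprocess(), confirm=False)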
            'Args': [
                "spark-submit",
                "s3a://tar7/agg.py",
            ]
        }
    }])

    # a step is still missing to wait for the steps to finish
    # client.terminate_job_flows(JobFlowIds=[clusters['Clusters'][0]['Id']])
    while (time.time() - start_time < 360):
        flag = 1
        clusters = client.list_clusters()
        if (clusters['Clusters'][0]['Status']['State'] in ['TERMINATING']):
            # client.terminate_job_flows(JobFlowIds=[clusters['Clusters'][0]['Id']])
            print('cluster finished')
        else:
            sys.exit('time limit exceeded')

    os.system("aws s3 cp --recursive s3://tar7/promedio.parquet ./")
    # THE CODE GOES IN HERE -------------------------------------------------


luigi.build([StartPipeline()])
def predict(device_mapping, target):
    max_jobs = len(device_mapping)
    tmp_folder = './tmp_inference'

    input_path = '/g/kreshuk/data/FIB25/data.n5'
    output_path = input_path
    roi_begin = roi_end = None

    in_key = 'volumes/raw/s0'
    out_key = {'volumes/affinities/s0': (0, 3)}

    mask_path = input_path
    mask_key = 'volumes/masks/minfilter/s5'

    config_folder = './config_inference'
    if not os.path.exists(config_folder):
        os.mkdir(config_folder)

    input_blocks = (192,) * 3
    # remove (16, 16, 16) pixels from each side in the output
    output_blocks = (160,) * 3
    halo = [(ib - ob) // 2 for ib, ob in zip(input_blocks, output_blocks)]
    print("Found halo", halo)

    shebang = "#! /g/kreshuk/pape/Work/software/conda/miniconda3/envs/torch/bin/python"
    global_config = InferenceLocal.default_global_config()
    global_config.update({'shebang': shebang,
                          'block_shape': output_blocks,
                          'roi_begin': roi_begin,
                          'roi_end': roi_end})
    with open(os.path.join(config_folder, 'global.config'), 'w') as f:
        json.dump(global_config, f)

    config = InferenceLocal.default_task_config()
    config.update({'chunks': [ob // 2 for ob in output_blocks],
                   'mem_limit': 32,
                   'time_limit': 720,
                   'threads_per_job': 4,
                   'set_visible_device': False})
    with open(os.path.join(config_folder, 'inference.config'), 'w') as f:
        json.dump(config, f)

    ckpt = '/g/kreshuk/matskevych/boundary_map_prediction/project_folder_old/Weights'
    task = InferenceLocal if target == 'local' else InferenceSlurm
    t = task(tmp_folder=tmp_folder, max_jobs=max_jobs, config_dir=config_folder,
             input_path=input_path, input_key=in_key,
             output_path=output_path, output_key=out_key,
             mask_path=mask_path, mask_key=mask_key,
             checkpoint_path=ckpt, framework='inferno',
             halo=halo)
    ret = luigi.build([t], local_scheduler=True)
    assert ret, "Failure"
import logging
import os

import luigi

from OrthoEvol.Tools.sge import SGEPipelineTask  # TIP Works on linux

logger = logging.getLogger('luigi-interface')

SGEPipelineTask.shared_tmp_dir = os.getcwd()
SGEPipelineTask.parallel_env = None


class TestPipelineTask(SGEPipelineTask):
    """Example pipeline task."""

    i = luigi.Parameter()

    def work(self):  # Use work instead of run to DEBUG
        logger.info('Running test job...')
        with open(self.output().path, 'w') as f:
            f.write('This is a test job.')
            f.close()

    def output(self):
        return luigi.LocalTarget(path=os.path.join(os.getcwd(),
                                                   'testjob_' + str(self.i) + '.txt'))


if __name__ == '__main__':
    tasks = [TestPipelineTask(i=str(i), select=i + 1) for i in range(3)]
    luigi.build(tasks, local_scheduler=True, workers=3)
def test_priority(self):
    p, q, r = PrioTask(1), PrioTask(2), PrioTask(3)
    luigi.build([p, q, r], local_scheduler=True)
    self.assertTrue(r.t < q.t < p.t)
def test_invoke(self):
    luigi.build([Popularity(datetime.date(2009, 1, 5))], local_scheduler=True)
    self.assertEqual(MockTarget.fs.get_data('/tmp/popularity/2009-01-05.txt'), b'4\n')
from sqlalchemy import String

import luigi
from luigi.contrib import sqla


class SQLATask(sqla.CopyToTable):
    # columns defines the table schema, with each element corresponding
    # to a column in the format (args, kwargs) which will be sent to
    # the sqlalchemy.Column(*args, **kwargs)
    reflect = True
    connection_string = "sqlite://"  # in memory SQLite database
    table = "Author"  # name of the table to store data
    columns = [(["First_Name", String(64)], {}),
               (["Last_Name", String(64)], {})]

    def rows(self):
        for row in [("First_Name", "Babacar"), ("Last_Name", "Kane")]:
            yield row


if __name__ == '__main__':
    task = SQLATask()
    luigi.build([task], local_scheduler=True)
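For reference, a sketch of what the (args, kwargs) pairs in the columns list above correspond to once expanded into sqlalchemy.Column calls; the variable name author_columns is illustrative only.

import sqlalchemy
from sqlalchemy import String

# Illustrative expansion of the column spec above: each (args, kwargs) pair
# becomes sqlalchemy.Column(*args, **kwargs).
author_columns = [
    sqlalchemy.Column("First_Name", String(64)),  # from (["First_Name", String(64)], {})
    sqlalchemy.Column("Last_Name", String(64)),   # from (["Last_Name", String(64)], {})
]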
import luigi
import os


class FileTarget(luigi.Target):

    def __init__(self, filename):
        self.filename = filename

    def exists(self):
        return os.path.isfile(self.filename)


class HelloTask(luigi.Task):

    def run(self):
        with open("hello.txt", "w") as f:
            f.write("Hello Mario!")

    def output(self):
        return FileTarget("hello.txt")


if __name__ == "__main__":
    luigi.build([HelloTask()])
def downscale_seg(sample, max_jobs=8, target='local'):
    input_path = '/g/kreshuk/data/cremi/realigned/sample%s_small.n5' % sample
    input_key = 'segmentation/multicut/s0'

    config_dir = './config_ds_seg'
    tmp_folder = './tmp_ds_seg_%s' % sample

    try:
        os.mkdir(config_dir)
    except OSError:
        pass

    config = DownscalingWorkflow.get_config()
    global_config = config['global']
    shebang = '#! /g/kreshuk/pape/Work/software/conda/miniconda3/envs/cluster_env/bin/python'
    global_config.update({'shebang': shebang})
    with open(os.path.join(config_dir, 'global.config'), 'w') as f:
        json.dump(global_config, f)

    ds_config = config['downscaling']
    # FIXME majority vote downscaling is broken
    # ds_config.update({'library': 'skimage', 'threads_per_job': 8})
    ds_config.update({'library': 'vigra',
                      'library_kwargs': {'order': 0},
                      'threads_per_job': 8})
    with open(os.path.join(config_dir, 'downscaling.config'), 'w') as f:
        json.dump(ds_config, f)

    scale_factors = [[1, 2, 2], [1, 2, 2], [1, 2, 2], 2]
    halos = [[0, 10, 10], [0, 10, 10], [0, 10, 10], [10, 10, 10]]

    task = DownscalingWorkflow(tmp_folder=tmp_folder,
                               max_jobs=1,
                               config_dir=config_dir,
                               target='local',
                               input_path=input_path,
                               input_key='segmentation/multicut/s0',
                               output_key_prefix='segmentation/multicut',
                               scale_factors=scale_factors,
                               halos=halos)
    success = luigi.build([task], local_scheduler=True)

    if success and target == 'local':
        with z5py.File(input_path) as f:
            ds = f['raw/s2']
            ds.n_threads = 8
            raw = ds[:]
            rshape = raw.shape

            ds = f['segmentation/multicut/s2']
            ds.n_threads = 8
            seg = ds[:]
            mshape = seg.shape
        assert mshape == rshape, "%s %s" % (str(mshape), str(rshape))
        view([raw, seg])
def test_run(self, os_mock):
    t = MortarSqoopTaskTest(path=S3_PATH)
    luigi.build([t], local_scheduler=True)
    self.assertEquals(EXPECTED_ARGV, t.argv)
    self.assertEquals(os.environ['AWS_ACCESS_KEY'], AWS_ACCESS_KEY)
    self.assertEquals(os.environ['AWS_SECRET_KEY'], AWS_SECRET_KEY)
class HelloTask(luigi.Task):
    to_whom = luigi.Parameter()

    def run(self):
        print("thinking about what to say...")
        time.sleep(5)
        with open("hello_%s.txt" % self.to_whom, "w") as f:
            f.write("Hello %s!" % self.to_whom)

    def output(self):
        return FileTarget("hello_%s.txt" % self.to_whom)


class ReplyTask(luigi.Task):

    def run(self):
        print("thinking about what to say...")
        time.sleep(5)
        with open("reply.txt", "w") as f:
            f.write("Hello Luigi!")

    def output(self):
        return FileTarget("reply.txt")

    def requires(self):
        return HelloTask(to_whom="Luigi")


if __name__ == "__main__":
    luigi.build([ReplyTask()])
def test_run_options_with_jdbc_jar(self, os_mock):
    t = MortarSqoopTaskTest(path=S3_PATH, jdbc_driver='some/path')
    luigi.build([t], local_scheduler=True)
    option_string = EXPECTED_ARGV + ['-j', 'some/path']
    self.assertEqual(option_string, t.argv)
def test_recursion(self):
    t = LinearSum(lo=42, hi=45)
    luigi.build([t], local_scheduler=True)
    self.assertEqual(t.s, 42 + 43 + 44)
def test_run_options_with_direct(self, os_mock):
    t = MortarSqoopTaskTest(path=S3_PATH, direct=True)
    luigi.build([t], local_scheduler=True)
    option_string = EXPECTED_ARGV + ['--direct']
    self.assertEqual(option_string, t.argv)
def test_run_job(self, mock_open, mock_communicate):
    if on_lsf_master():
        outfile = os.path.join(DEFAULT_HOME, 'testfile_1')
        tasks = [TestJobTask(i=str(i), n_cpu_flag=1) for i in range(3)]
        luigi.build(tasks, local_scheduler=True, workers=3)
        self.assertTrue(os.path.exists(outfile))
def run_full_pipeline(project_abs_path, config_dir, config_file_name, local_scheduler_=True):
    # Run pipeline this way
    luigi.build([BuildSolution(project_abs_path, config_dir, config_file_name)],
                local_scheduler=local_scheduler_)
def test_build_internal(self):
    luigi.build([Fib(100)], local_scheduler=True)
    self.assertEqual(MockTarget.fs.get_data('/tmp/fib_10'), b'55\n')
    self.assertEqual(MockTarget.fs.get_data('/tmp/fib_100'), b'354224848179261915075\n')
def run_stacking_only(project_abs_path, config_dir, config_file_name, local_scheduler_=True):
    # Run pipeline this way
    luigi.build([MakeStackingPredictions(project_abs_path, config_dir, config_file_name)],
                local_scheduler=local_scheduler_)
def handle(self, *args, **options):
    build([LineDimLoad(subset=options["full"])], local_scheduler=True)
    df_LineDim = pd.read_parquet("./data/linedim/part.0.parquet", engine="fastparquet")

    build([AccountDimLoad(subset=options["full"])], local_scheduler=True)
    df_AccountDim = pd.read_parquet("./data/accountdim/part.0.parquet", engine="fastparquet")

    build([LimitFactLoad(subset=options["full"])], local_scheduler=True)
    df_LimitFact = pd.read_parquet("./data/limitfact/part.0.parquet", engine="fastparquet")

    build([ByMdn()], local_scheduler=True)
    local_root = "file://data/amplitude/by_mdn/"
    target = ParquetTarget(local_root, flag=False, glob="*.parquet")
    df_ActivityFact = target.read_dask(columns=["mdn", "event_time"],
                                       parse_dates=["event_time"]).compute()

    LineDim.objects.all().delete()
    with transaction.atomic():
        df_LineDim_objs = [
            LineDim(
                MTN=line["MTN"],
                Device_Grouping=line["DEVICE_GROUPING"],
                Sales_Channel=line["SALES_CHANNEL"],
                SVC_ACT_DT=line["SVC_ACT_DT"],
            ) for idx, line in df_LineDim.iterrows()
        ]
        LineDim.objects.bulk_create(df_LineDim_objs)

    AccountDim.objects.all().delete()
    with transaction.atomic():
        df_AcctDim_objs = [
            AccountDim(
                Cust_Acct=acct["CUST_ACCT"],
                Segment_Name=acct["SEGMENT_NAME"],
                SVC_Plan=acct["SVC_PLAN"],
            ) for idx, acct in df_AccountDim.iterrows()
        ]
        AccountDim.objects.bulk_create(df_AcctDim_objs)

    LimitFact.objects.all().delete()
    with transaction.atomic():
        df_LimitFact_objs = [
            LimitFact(
                MTN=LineDim.objects.get_or_create(MTN=limit["MTN"])[0],
                Cust_Acct=AccountDim.objects.get_or_create(Cust_Acct=limit["CUST_ACCT"])[0],
                LIMIT_DT=limit["LIMITING_DT"],
                LIMIT_TYPE=limit["LIMIT_TYPE"],
            ) for idx, limit in df_LimitFact.iterrows()
        ]
        LimitFact.objects.bulk_create(df_LimitFact_objs)

    ActivityFact.objects.all().delete()
    with transaction.atomic():
        df_ActivityFact_objs = [
            ActivityFact(
                MTN=LineDim.objects.get_or_create(MTN=activity["mdn"])[0],
                EVENT_DT=activity["event_time"],
            ) for idx, activity in df_ActivityFact.iterrows()
        ]
        ActivityFact.objects.bulk_create(df_ActivityFact_objs)
def mws_segmentation(offsets, path, input_key, fg_mask_key, output_key,
                     tmp_folder, target, max_jobs, stitch_mode):
    task = MwsWorkflow
    qos = 'normal'

    config_folder = os.path.join(tmp_folder, 'configs')
    os.makedirs(config_folder, exist_ok=True)
    configs = task.get_config()

    # we use a smaller block shape to speed up MWS
    block_shape = [64, 256, 256]
    conf = configs['global']
    shebang = get_default_shebang()
    conf.update({'shebang': shebang, 'block_shape': block_shape})
    with open(os.path.join(config_folder, 'global.config'), 'w') as f:
        json.dump(conf, f)

    # write config for mws block task
    strides = [4, 4, 4]
    conf = configs['mws_blocks']
    conf.update({'randomize_strides': True, 'strides': strides,
                 'mem_limit': 12, 'time_limit': 900})
    with open(os.path.join(config_folder, 'mws_blocks.config'), 'w') as f:
        json.dump(conf, f)

    # determine config for the given stitching mode
    if stitch_mode == '':
        stitch_mc = False
    elif stitch_mode == 'biased':
        stitch_mc = True
        beta1, beta2 = 0.5, 0.75
    elif stitch_mode == 'unbiased':
        stitch_mc = True
        beta1 = beta2 = 0.5

    if stitch_mc:
        # write config for stitching multicut
        conf = configs['stitching_multicut']
        conf.update({'beta1': beta1, 'beta2': beta2, 'qos': qos})
        with open(os.path.join(config_folder, 'stitching_multicut.config'), 'w') as f:
            json.dump(conf, f)

        conf = configs['write']
        conf.update({'mem_limit': 8, 'time_limit': 120, 'qos': qos})
        with open(os.path.join(config_folder, 'write.config'), 'w') as f:
            json.dump(conf, f)

        # write config for edge feature task
        conf = configs['block_edge_features']
        conf.update({'offsets': offsets, 'mem_limit': 4, 'qos': qos})
        with open(os.path.join(config_folder, 'block_edge_features.config'), 'w') as f:
            json.dump(conf, f)

        conf_names = ['merge_edge_features', 'merge_sub_graphs',
                      'map_edge_ids', 'simple_stitch_assignments']
        for name in conf_names:
            conf = configs[name]
            conf.update({'mem_limit': 128, 'time_limit': 240,
                         'threads_per_job': 16, 'qos': qos})
            with open(os.path.join(config_folder, '%s.config' % name), 'w') as f:
                json.dump(conf, f)

        conf = configs['stitching_multicut']
        # set time limit for the multicut task to 18 hours (in minutes)
        tlim_task = 18 * 60
        # set time limit for the solver to 16 hours (in seconds)
        tlim_solver = 16 * 60 * 60
        conf.update({'mem_limit': 256, 'time_limit': tlim_task,
                     'threads_per_job': 16, 'qos': qos,
                     'agglomerator': 'greedy-additive',
                     'time_limit_solver': tlim_solver})
        with open(os.path.join(config_folder, 'stitching_multicut.config'), 'w') as f:
            json.dump(conf, f)

    t = task(tmp_folder=tmp_folder, config_dir=config_folder,
             max_jobs=max_jobs, target=target,
             input_path=path, input_key=input_key,
             output_path=path, output_key=output_key,
             mask_path=path, mask_key=fg_mask_key,
             stitch_via_mc=stitch_mc, offsets=offsets)
    ret = luigi.build([t], local_scheduler=True)
    if not ret:
        raise RuntimeError("Mws segmentation failed")