class MergeTables(luigi.Task):
    output_prefix = luigi.Parameter()
    output_path = luigi.Parameter()
    max_jobs = luigi.IntParameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run(self):
        # load all job sub results
        tables = []
        for job_id in range(self.max_jobs):
            path = self.output_prefix + '_job%i.csv' % job_id
            # NOTE: not all jobs might have been scheduled, so
            # we need to check if the result actually exists
            if not os.path.exists(path):
                continue
            sub_table = pd.read_csv(path, sep='\t')
            tables.append(sub_table)

        table = pd.concat(tables)
        table.sort_values('label_id', inplace=True)
        table.to_csv(self.output_path, index=False, sep='\t')

    def output(self):
        return luigi.LocalTarget(self.output_path)

class FindMergesBase(luigi.Task):
    task_name = 'find_merges'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    path = luigi.Parameter()
    key = luigi.Parameter()
    out_path = luigi.Parameter()
    clear_ids = luigi.ListParameter()
    min_overlap = luigi.IntParameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        # get the global config and init configs
        shebang = self.global_config_values()[0]
        self.init(shebang)

        # load the task config
        config = self.get_task_config()
        config.update({'path': self.path, 'key': self.key,
                       'clear_ids': self.clear_ids, 'out_path': self.out_path,
                       'min_overlap': self.min_overlap})

        # prime and run the jobs
        n_jobs = 1
        self.prepare_jobs(n_jobs, None, config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)

class ApplyThreshold(luigi.Task):
    feature_path = luigi.Parameter()
    feature_key = luigi.Parameter()
    out_path = luigi.Parameter()
    threshold = luigi.FloatParameter()
    threshold_mode = luigi.Parameter(default='less')
    dependency = luigi.TaskParameter()

    threshold_modes = ('less', 'greater', 'equal')

    def requires(self):
        return self.dependency

    def run(self):
        f = z5py.File(self.feature_path)
        ds = f[self.feature_key]
        feats = ds[:]

        assert self.threshold_mode in self.threshold_modes
        if self.threshold_mode == 'less':
            filter_ids = feats < self.threshold
        elif self.threshold_mode == 'greater':
            filter_ids = feats > self.threshold
        elif self.threshold_mode == 'equal':
            filter_ids = feats == self.threshold

        filter_ids = np.where(filter_ids)[0].tolist()
        with open(self.out_path, 'w') as f:
            json.dump(filter_ids, f)

    def output(self):
        return luigi.LocalTarget(self.out_path)

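# A minimal usage sketch (not part of the source): ApplyThreshold wired behind
# a hypothetical upstream feature task. ComputeFeatures and all paths below are
# placeholder assumptions.
def example_apply_threshold():
    task = ApplyThreshold(feature_path='features.n5',
                          feature_key='features/edge_scores',
                          out_path='filter_ids.json',
                          threshold=0.5,
                          threshold_mode='greater',
                          dependency=ComputeFeatures())
    luigi.build([task], local_scheduler=True)
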
class ExtractSingleCopyRegions(NMETask):
    """Get a BED of single-copy regions from a hal file.

    Delegates to a toil pipeline to parallelize the process."""
    chunk_size = luigi.IntParameter(default=500)
    prev_task = luigi.TaskParameter()

    def requires(self):
        return self.prev_task

    def output(self):
        return self.target_in_work_dir('singleCopyRegions-%s.bed' % self.genome)

    def run(self):
        jobStorePath = '%s/jobStore-singleCopy-%s' % (self.work_dir, self.genome)
        opts = Job.Runner.getDefaultOptions(jobStorePath)
        if os.path.exists(jobStorePath):
            opts.restart = True
        opts.disableCaching = True
        opts.batchSystem = self.batchSystem
        opts.parasolCommand = self.parasolCommand
        opts.environment = ["LD_LIBRARY_PATH"]
        with Toil(opts) as toil:
            if opts.restart:
                result = toil.restart()
            else:
                bed_file = toil.importFile('file://' + self.input().path)
                result = toil.start(
                    Job.wrapJobFn(extract_single_copy_regions_parallel,
                                  os.path.abspath(self.hal_file),
                                  bed_file,
                                  self.genome,
                                  self.chunk_size))
            toil.exportFile(result, 'file://' + os.path.abspath(self.output().path))

class McSolverExact(luigi.Task):
    problem = luigi.TaskParameter()

    def requires(self):
        return self.problem

    @run_decorator
    def run(self):
        mcProblem = self.input()

        g = nifty.graph.UndirectedGraph()
        edgeCosts = mcProblem.read("costs")
        g.deserialize(mcProblem.read("graph"))
        assert g.numberOfEdges == edgeCosts.shape[0]

        obj = nifty.graph.multicut.multicutObjective(g, edgeCosts)
        factory = nifty_ilp_factory(obj)
        ret, mc_energy, t_inf = run_nifty_solver(obj, factory, verbose=True)
        mc_energy = mc_energy[-1]
        t_inf = t_inf[-1]

        workflow_logger.info(
            "McSolverExact: inference with exact solver in %i s" % t_inf)
        workflow_logger.info("McSolverExact: energy of the solution %f" % mc_energy)
        self.output().write(ret)

    def output(self):
        save_path = os.path.join(PipelineParameter().cache, "McSolverExact.h5")
        return HDF5DataTarget(save_path)

class WorkflowBase(luigi.Task):
    """
    Base class for a workflow task that just chains together
    a workflow of multiple tasks.
    """
    # temporary folder for configurations etc
    tmp_folder = luigi.Parameter()
    # maximum number of concurrent jobs
    max_jobs = luigi.IntParameter()
    # path for the global configuration
    config_dir = luigi.Parameter()
    # target can be local, slurm, lsf (case insensitive)
    target = luigi.Parameter()
    # the workflow can have dependencies; per default we
    # set this to a dummy task that is always successful
    dependency = luigi.TaskParameter(default=DummyTask())

    _target_dict = {'lsf': 'LSF', 'slurm': 'Slurm', 'local': 'Local'}

    def _get_task_name(self, task_base_name):
        target_postfix = self._target_dict[self.target.lower()]
        return task_base_name + target_postfix

    def output(self):
        # we just mirror the target of the last task
        return luigi.LocalTarget(self.input().path)

    @staticmethod
    def get_config():
        """ Return all default configs and their save_path indexed by the task name
        """
        return {'global': BaseClusterTask.default_global_config()}

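# A minimal sketch of a concrete workflow on top of WorkflowBase (not from the
# source). It assumes a module first_tasks that defines FirstLocal / FirstSlurm /
# FirstLSF task classes, so that _get_task_name can resolve the right class for
# the chosen target; all of these names are hypothetical.
class ExampleWorkflow(WorkflowBase):
    path = luigi.Parameter()

    def requires(self):
        first_task = getattr(first_tasks, self._get_task_name('First'))
        return first_task(tmp_folder=self.tmp_folder,
                          max_jobs=self.max_jobs,
                          config_dir=self.config_dir,
                          dependency=self.dependency,
                          path=self.path)
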
class MoveToHdfsTask(luigi.Task):
    """Move the output of a task (assuming it's a LocalTarget) onto HDFS
    """
    description = "Move the output of a task to HDFS"

    upstream_task = luigi.TaskParameter()
    cache_invalidator = Parameter(
        default=None,
        description="Can be used to invalidate Luigi's instance cache "
                    "(which doesn't work with task params)"
    )

    def requires(self):
        return self.upstream_task

    def run(self):
        source = self.input().path
        target = self.output().path
        client = HdfsClient()
        client.put(source, target)

    def output(self):
        return HdfsTarget(os.path.basename(self.input().path))

class FilterBlocksBase(luigi.Task):
    """ FilterBlocks base class
    """

    task_name = 'filter_blocks'
    src_file = os.path.abspath(__file__)

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    filter_path = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def clean_up_for_retry(self, block_list):
        super().clean_up_for_retry(block_list)
        # TODO remove any output of failed blocks because it might be corrupted

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end, block_list_path \
            = self.global_config_values(with_block_list_path=True)
        self.init(shebang)

        # load the task config
        config = self.get_task_config()

        # update the config with input and graph paths and keys
        # as well as block shape
        config.update({'input_path': self.input_path, 'input_key': self.input_key,
                       'block_shape': block_shape, 'filter_path': self.filter_path,
                       'output_path': self.output_path, 'output_key': self.output_key})

        # create output dataset
        shape = vu.get_shape(self.input_path, self.input_key)
        chunks = tuple(min(bs // 2, sh) for bs, sh in zip(block_shape, shape))
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key, shape=shape, dtype='uint64',
                              chunks=chunks, compression='gzip')

        if self.n_retries == 0:
            block_list = vu.blocks_in_volume(shape, block_shape, roi_begin, roi_end,
                                             block_list_path=block_list_path)
        else:
            block_list = self.block_list
            self.clean_up_for_retry(block_list)

        n_jobs = min(len(block_list), self.max_jobs)

        # prime and run the jobs
        self.prepare_jobs(n_jobs, block_list, config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)

class ProbsToCostsBase(luigi.Task):
    """ ProbsToCosts base class
    """

    task_name = 'probs_to_costs'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input and output volumes
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    features_path = luigi.Parameter()
    features_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        # we use this to also get the common default config
        config = LocalTask.default_task_config()
        config.update({'invert_inputs': False, 'transform_to_costs': True,
                       'weight_edges': False, 'weighting_exponent': 1.,
                       'beta': 0.5})
        return config

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # load the task config
        config = self.get_task_config()

        with vu.file_reader(self.input_path) as f:
            n_edges = f[self.input_key].shape[0]
        # chunk size = 64**3
        chunk_size = min(262144, n_edges)

        # require output dataset
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key, shape=(n_edges,), compression='gzip',
                              dtype='float32', chunks=(chunk_size,))

        # update the config with input and output paths and keys
        # as well as block shape
        config.update({'input_path': self.input_path, 'input_key': self.input_key,
                       'output_path': self.output_path, 'output_key': self.output_key,
                       'features_path': self.features_path, 'features_key': self.features_key})

        # prime and run the jobs
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(1)

class MergePairwiseDistances(luigi.Task):
    tmp_folder = luigi.Parameter()
    max_jobs = luigi.IntParameter()
    output_path = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run(self):
        res_dict = {}
        for job_id in range(self.max_jobs):
            path = os.path.join(self.tmp_folder, 'object_distances_%i.pkl' % job_id)
            # path might not exist because the number of actual jobs is smaller than max_jobs
            if not os.path.exists(path):
                continue
            with open(path, 'rb') as f:
                distances = pickle.load(f)
            res_dict.update(distances)

        with open(self.output_path, 'wb') as f:
            pickle.dump(res_dict, f)

    def output(self):
        return luigi.LocalTarget(self.output_path)

class GenerateImageByBounds(luigi.WrapperTask):
    """ Schedule Download Tasks
    """
    west = luigi.FloatParameter()
    north = luigi.FloatParameter()
    south = luigi.FloatParameter()
    east = luigi.FloatParameter()
    zoom = luigi.IntParameter()
    targetTask = luigi.TaskParameter(default=GenerateImageCSReliefMap)

    def requires(self):
        """ scheduling tasks
        """
        candidateTasks = [
            GenerateImageCSReliefMap, GenerateImageCurvature, GenerateImageSlope
        ]
        if self.targetTask not in candidateTasks:
            raise ValueError('targetTask must be one of %s' % candidateTasks)
        edge_nw_x, edge_nw_y, _, _ = deg_to_num(self.north, self.west, self.zoom)
        edge_se_x, edge_se_y, _, _ = deg_to_num(self.south, self.east, self.zoom)
        # xRange = [edge_nw_x, edge_se_x]
        # yRange = [edge_nw_y, edge_se_y]
        print(deg_to_num(self.north, self.west, self.zoom) +
              deg_to_num(self.south, self.east, self.zoom))
        for tile_x in range(edge_nw_x - 3, edge_se_x + 3):
            for tile_y in range(edge_nw_y - 3, edge_se_y + 3):
                yield self.targetTask(x=tile_x, y=tile_y, z=self.zoom)

class CreateFolds(luigi.Task):
    dataset = luigi.TaskParameter()
    num_folds = luigi.IntParameter(default=5)

    def requires(self):
        return self.dataset

    def output(self):
        return [
            luigi.LocalTarget(f'_folds/fold_{fold_id}.h5')
            for fold_id in range(self.num_folds)
        ]

    def run(self):
        for out in self.output():
            out.makedirs()

        df = pd.read_csv(self.input().path)
        folds = KFold(n_splits=self.num_folds, shuffle=True, random_state=0)
        for fold_id, (train_idx, val_idx) in enumerate(folds.split(df)):
            out_path = self.output()[fold_id].path
            dump({
                'train': train_idx,
                'val': val_idx,
            }, out_path)

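# Assuming dump above is joblib.dump, a downstream task could read a fold back
# like this (a sketch; 'dataset.csv' is a placeholder for the dataframe the
# fold indices refer to).
def example_load_fold(fold_id=0):
    from joblib import load  # same assumption as for dump above
    df = pd.read_csv('dataset.csv')
    fold = load(f'_folds/fold_{fold_id}.h5')
    return df.iloc[fold['train']], df.iloc[fold['val']]
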
class SolveGlobalBase(luigi.Task):
    """ SolveGlobal base class
    """

    task_name = 'solve_global'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input volumes and graph
    problem_path = luigi.Parameter()
    assignment_path = luigi.Parameter()
    assignment_key = luigi.Parameter()
    scale = luigi.IntParameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        # we use this to also get the common default config
        config = LocalTask.default_task_config()
        config.update({'agglomerator': 'kernighan-lin',
                       'time_limit_solver': None})
        return config

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # load the task config
        config = self.get_task_config()

        # update the config with input and graph paths and keys
        # as well as block shape
        config.update({'assignment_path': self.assignment_path,
                       'assignment_key': self.assignment_key,
                       'scale': self.scale,
                       'problem_path': self.problem_path})

        # prime and run the job
        prefix = 's%i' % self.scale
        self.prepare_jobs(1, None, config, prefix)
        self.submit_jobs(1, prefix)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(1, prefix)

    # part of the luigi API
    def output(self):
        return luigi.LocalTarget(
            os.path.join(self.tmp_folder, self.task_name + '_s%i.log' % self.scale))

class Tabixed(luigi.Task):
    """ Class that ensures that the BGZIPped external dependency
    also has a tabix index
    """
    target = luigi.TaskParameter()

    @property
    def filename(self):
        return os.path.basename(self.target.output().path)

    def requires(self):
        return self.target

    def run(self):
        shutil.copyfile(self.target.output().path, self.output().path)
        tabix(self.output().path)

    def complete(self):
        if not os.path.exists(self.output().path + ".tbi"):
            return False
        return luigi.Task.complete(self)

    def output(self):
        return GlobalConfig().local_target(self.filename)

class CallTask(luigi.WrapperTask):
    """An entry point for calling most tasks defined in the above workflow.

    Useful for submitting a list of datasets to be processed by a given task,
    which could be the entire workflow or only up to the desired task.
    """

    level1_list = luigi.Parameter()
    acq_parser_hint = luigi.OptionalParameter(default="")
    outdir = luigi.Parameter()
    task = luigi.TaskParameter()

    def requires(self):
        with open(self.level1_list) as src:
            level1_list = [level1.strip() for level1 in src.readlines()]

        for level1 in level1_list:
            work_name = "{}-wagl".format(basename(level1))
            container = acquisitions(level1, self.acq_parser_hint)

            for granule in container.granules:
                # as each granule is independent, include the granule as the work root
                work_root = pjoin(self.outdir, work_name, granule)
                if "group" in self.task.get_param_names():
                    for group in container.supported_groups:
                        yield self.task(level1, work_root, granule, group)
                else:
                    yield self.task(level1, work_root, granule)

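# A hypothetical programmatic invocation (not from the source); Standardise
# stands in for whichever workflow task should be run for every granule in the
# listed level-1 datasets.
def example_call_task():
    luigi.build([CallTask(level1_list='level1-scenes.txt',
                          outdir='/data/output',
                          task=Standardise)],
                workers=4, local_scheduler=True)
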
class GenerateFeatures(luigi.WrapperTask):
    id_column = luigi.Parameter(default='item_id')
    dataset = luigi.TaskParameter(default=TrainSet())

    text_features = [
        'region', 'city', 'parent_category_name', 'category_name',
        'param_1', 'param_2', 'param_3', 'title', 'description', 'user_type'
    ]

    def requires(self):
        yield self.clone(CorrectImagePath, feature_name='image')
        yield self.clone(ApplyLogTransform, feature_name='deal_probability')
        yield self.clone(MarkNullInstances, feature_name='price')
        yield self.clone(FillNaTransform, feature_name='price')
        yield self.clone(StdScaled, feature_name='price_fillna')
        yield self.clone(MarkNullInstances, feature_name='image_top_1')
        yield self.clone(FillNaTransform, feature_name='image_top_1')
        yield self.clone(StdScaled, feature_name='image_top_1_fillna')
        yield self.clone(ExtractFeature, feature_name='city')
        yield self.clone(CreateFolds)
        yield self.clone(OneHotEncode, feature_name='user_type')
        yield self.clone(OneHotEncode, feature_name='parent_category_name')
        yield self.clone(OneHotEncode, feature_name='category_name')
        yield self.clone(OneHotEncode, feature_name='region')
        yield self.clone(LabelEncode, feature_name='city')
        yield self.clone(LabelEncode, feature_name='param_1')
        yield self.clone(LabelEncode, feature_name='param_2')
        yield self.clone(LabelEncode, feature_name='param_3')
        yield self.clone(CharVocabulary, feature_name='description')
        yield self.clone(WordVectors, feature_name='description',
                         train_features=','.join(self.text_features))

class SubsampleBed(NMETask):
    """Randomly sample only a portion of the bases covered by the input BED."""
    num_bases = luigi.FloatParameter()
    prev_task = luigi.TaskParameter()

    def requires(self):
        return self.prev_task

    def output(self):
        return self.target_in_work_dir('%s-sampled-%s.bed' % (self.genome, self.num_bases))

    def run(self):
        with self.input().open() as in_bed:
            bases = []
            for line in in_bed:
                fields = line.split()
                chrom = fields[0]
                start = int(fields[1])
                stop = int(fields[2])
                for i in range(start, stop):
                    bases.append((chrom, i))
        # num_bases is a FloatParameter, so cast to int for random.sample
        sample_size = int(min(self.num_bases, len(bases)))
        sample = random.sample(bases, sample_size)
        with self.output().open('w') as out_bed:
            for base in sample:
                out_bed.write("\t".join([base[0], str(base[1]), str(base[1] + 1)]) + "\n")

class MergeMorphologyBase(luigi.Task):
    """ MergeMorphology base class
    """

    task_name = 'merge_morphology'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    number_of_labels = luigi.IntParameter()
    prefix = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        # get the global config and init configs
        shebang = self.global_config_values()[0]
        self.init(shebang)

        # load the task config
        config = self.get_task_config()

        out_shape = (self.number_of_labels, 11)
        out_chunks = (min(self.number_of_labels, 100000), 11)
        block_list = vu.blocks_in_volume([out_shape[0]], [out_chunks[0]])

        # create output dataset
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key, shape=out_shape, chunks=out_chunks,
                              compression='gzip', dtype='float64')

        # update the config with input and graph paths and keys
        # as well as block shape
        config.update({'input_path': self.input_path, 'input_key': self.input_key,
                       'output_path': self.output_path, 'output_key': self.output_key,
                       'out_shape': out_shape, 'out_chunks': out_chunks})

        # prime and run the jobs
        self.prepare_jobs(self.max_jobs, block_list, config, self.prefix)
        self.submit_jobs(self.max_jobs, self.prefix)

        # wait till jobs finish and check for job success
        self.wait_for_jobs(self.prefix)
        self.check_jobs(self.max_jobs, self.prefix)

    # part of the luigi API
    def output(self):
        return luigi.LocalTarget(os.path.join(self.tmp_folder,
                                              self.task_name + '_%s.log' % self.prefix))

class RunAnywayTask(luigi.Task):
    targ = luigi.TaskParameter()
    # try_once = luigi.BoolParameter(False)

    def run(self):
        cls = self.targ
        task = cls()
        task.run()

class all_some_task(luigi.WrapperTask):
    require = luigi.TaskParameter()

    def requires(self):
        yield {
            sample: self.require(sample=sample)
            for sample in brp.fastqs().output()
        }

class GrowRegionsTaskBase(luigi.Task):
    """ GrowRegionsTask base class
    """

    task_name = 'grow_regions'
    src_file = os.path.abspath(__file__)

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    de_labels_path = luigi.Parameter()
    de_labels_key = luigi.Parameter()
    boundaries_path = luigi.Parameter()
    boundaries_key = luigi.Parameter()
    graph_path = luigi.Parameter()
    graph_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        config = LocalTask.default_task_config()
        return config

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # load the task config and update it with the input and output paths and keys
        config = self.get_task_config()
        config.update({'input_path': self.input_path, 'input_key': self.input_key,
                       'de_labels_path': self.de_labels_path,
                       'de_labels_key': self.de_labels_key,
                       'boundaries_path': self.boundaries_path,
                       'boundaries_key': self.boundaries_key,
                       'graph_path': self.graph_path, 'graph_key': self.graph_key,
                       'output_path': self.output_path, 'output_key': self.output_key})

        # prime and run the jobs
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(1)

class RelabelWorkflow(luigi.Task):
    # path to the n5 file and keys
    path = luigi.Parameter()
    key = luigi.Parameter()
    # maximal number of jobs that will be run in parallel
    max_jobs = luigi.IntParameter()
    # path to the configuration
    # TODO allow individual paths for individual blocks
    config_path = luigi.Parameter()
    tmp_folder = luigi.Parameter()
    dependency = luigi.TaskParameter()
    # FIXME default does not work; this still needs to be specified
    # TODO different time estimates for different sub-tasks
    time_estimate = luigi.IntParameter(default=10)
    run_local = luigi.BoolParameter(default=False)

    def requires(self):
        with open(self.config_path) as f:
            n_jobs_write = json.load(f).get('n_jobs_write', 50)
        uniques_task = FindUniquesTask(path=self.path,
                                       key=self.key,
                                       max_jobs=self.max_jobs,
                                       config_path=self.config_path,
                                       tmp_folder=self.tmp_folder,
                                       dependency=self.dependency,
                                       time_estimate=self.time_estimate,
                                       run_local=self.run_local)
        labels_task = FindLabelingTask(path=self.path,
                                       key=self.key,
                                       max_jobs=self.max_jobs,
                                       config_path=self.config_path,
                                       tmp_folder=self.tmp_folder,
                                       dependency=uniques_task,
                                       time_estimate=self.time_estimate,
                                       run_local=self.run_local)
        write_task = WriteAssignmentTask(path=self.path,
                                         in_key=self.key,
                                         out_key=self.key,
                                         config_path=self.config_path,
                                         max_jobs=n_jobs_write,
                                         tmp_folder=self.tmp_folder,
                                         identifier='write_relabel',
                                         dependency=labels_task,
                                         time_estimate=self.time_estimate,
                                         run_local=self.run_local)
        return write_task

    def run(self):
        out_path = self.input().path
        assert os.path.exists(out_path)
        res_path = self.output().path
        with open(res_path, 'w') as f:
            f.write("Success")

    def output(self):
        out_file = os.path.join(self.tmp_folder, 'relabeling_workflow.log')
        return luigi.LocalTarget(out_file)

class CreateMultisetBase(luigi.Task):
    """ CreateMultiset base class
    """

    task_name = 'create_multiset'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input and output volumes
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    # dependency
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        config = LocalTask.default_task_config()
        config.update({'compression': 'gzip'})
        return config

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # get shape and make block config
        shape = vu.get_shape(self.input_path, self.input_key)

        # load the create_multiset config
        config = self.get_task_config()
        compression = config.get('compression', 'gzip')

        # require output dataset
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key, shape=shape, chunks=tuple(block_shape),
                              compression=compression, dtype='uint8')

        # update the config with input and output paths and keys
        # as well as block shape
        config.update({'input_path': self.input_path, 'input_key': self.input_key,
                       'output_path': self.output_path, 'output_key': self.output_key,
                       'block_shape': block_shape})

        block_list = vu.blocks_in_volume(shape, block_shape, roi_begin, roi_end)
        self._write_log('scheduling %i blocks to be processed' % len(block_list))
        n_jobs = min(len(block_list), self.max_jobs)

        # prime and run the jobs
        self.prepare_jobs(n_jobs, block_list, config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)

class BlockwiseSolver(luigi.Task):
    pathToSeg = luigi.Parameter()
    globalProblem = luigi.TaskParameter()
    # used with range() below, so this must be an integer parameter
    numberOfLevels = luigi.IntParameter()
    keyToSeg = luigi.Parameter(default='data')

    def requires(self):
        # block size in first hierarchy level
        initialBlockShape = PipelineParameter().multicutBlockShape
        # block overlap, for now same for each hierarchy lvl
        block_overlap = PipelineParameter().multicutBlockOverlap

        problems = [self.globalProblem]
        block_factor = 1
        for level in range(self.numberOfLevels):
            # TODO check that we don't get larger than the actual shape here
            block_shape = list(map(lambda x: x * block_factor, initialBlockShape))
            problems.append(ReducedProblem(pathToSeg=self.pathToSeg,
                                           problem=problems[-1],
                                           blockShape=block_shape,
                                           blockOverlap=block_overlap,
                                           level=level,
                                           keyToSeg=self.keyToSeg))
            block_factor *= 2
        return problems

    def run(self):
        raise NotImplementedError(
            "BlockwiseSolver is abstract and does not implement a run functionality!")

    # map back to the global solution
    def map_node_result_to_global(self, problems, reduced_node_result,
                                  reduced_problem_index=-1):
        n_nodes_global = problems[0].read('number_of_nodes')
        reduced_problem = problems[reduced_problem_index]
        to_global_nodes = reduced_problem.read("new2global")

        # TODO vectorize
        node_result = np.zeros(n_nodes_global, dtype='uint32')
        for node_id, node_res in enumerate(reduced_node_result):
            node_result[to_global_nodes[node_id]] = node_res

        return node_result

    def output(self):
        raise NotImplementedError(
            "BlockwiseSolver is abstract and does not implement the output functionality!")

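# For the TODO above: assuming to_global_nodes is a sequence of index arrays
# (one per reduced node), the mapping loop could be vectorized roughly like
# this (a sketch, not taken from the source).
def map_node_result_to_global_vectorized(to_global_nodes, reduced_node_result,
                                         n_nodes_global):
    # flatten all global node ids and repeat each reduced result once per
    # global node it maps to
    flat_ids = np.concatenate(to_global_nodes)
    repeats = [len(ids) for ids in to_global_nodes]
    flat_res = np.repeat(reduced_node_result, repeats)
    node_result = np.zeros(n_nodes_global, dtype='uint32')
    node_result[flat_ids] = flat_res
    return node_result
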
class postgres_count_matrix(luigi.Task):
    password = luigi.Parameter(significant=False)
    host = luigi.Parameter(significant=False)
    database = 'rna'
    user = luigi.Parameter(default='rna', significant=False)
    table = luigi.Parameter(default='gene_counts')
    feature_counter = luigi.TaskParameter(default=gene_counter, significant=False)

    def requires(self):
        return {x: self.feature_counter(sample=x) for x in fastqs(sample='').run()}

    def run(self):
        engine = create_engine('postgresql://%s:%s@%s/%s' %
                               (self.user, self.password, self.host, self.database))
        try:
            engine.execute(CreateSchema(parameters().exp_name))
        except:  # should catch psycopg2.ProgrammingError, but doesn't work
            pass
        pandas_files = [
            pd.read_table(self.input()[name].path,
                          skiprows=2,
                          index_col=0,
                          names=['Gene', 'Chr', 'Start', 'End', 'Strand', 'Length', name],
                          usecols=['Gene', name],
                          header=None)
            for name in self.input()
        ]
        count_table = pd.concat(pandas_files, axis=1).sort_index(axis=1)
        count_table.to_csv("%s/%s.csv" % (parameters().exp_dir, self.table))
        count_table.to_sql(self.table, con=engine, schema=parameters().exp_name)
        # Taken from luigi source code: makes the marker table and adds an entry
        self.output().create_marker_table()
        connection = self.output().connect()
        self.output().touch(connection)
        connection.commit()
        connection.close()

    def output(self):
        return luigi.postgres.PostgresTarget(host=self.host,
                                             database=self.database,
                                             user=self.user,
                                             password=self.password,
                                             table=self.table,
                                             update_id=parameters().exp_name + '_' + self.table)

class TableImplBase(luigi.Task):
    """ table_impl base class
    """

    task_name = "table_impl"
    src_file = os.path.abspath(__file__)

    input_files = luigi.ListParameter()
    output_files = luigi.ListParameter()
    input_key = luigi.Parameter()
    resolution = luigi.ListParameter()
    dependency = luigi.TaskParameter(default=DummyTask())

    def requires(self):
        return self.dependency

    def require_output_folders(self):
        output_folders = [os.path.split(out_file)[0] for out_file in self.output_files]
        output_folders = list(set(output_folders))
        for out_folder in output_folders:
            os.makedirs(out_folder, exist_ok=True)

    def run_impl(self):
        # get the global config and init configs
        shebang = self.global_config_values()[0]
        self.init(shebang)

        self.require_output_folders()

        # luigi may randomly shuffle the file lists, so we need to make sure they are ordered here
        input_files = list(self.input_files)
        input_files.sort()
        output_files = list(self.output_files)
        output_files.sort()

        # load and update the task config
        task_config = self.get_task_config()
        task_config.update({"input_files": input_files,
                            "output_files": output_files,
                            "resolution": self.resolution,
                            "input_key": self.input_key})

        block_list = list(range(len(input_files)))
        self._write_log("scheduled %i blocks to run" % len(block_list))

        # prime and run the jobs
        n_jobs = min(len(block_list), self.max_jobs)
        self.prepare_jobs(n_jobs, block_list, task_config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)

class EdgeLabelsBase(luigi.Task):
    """ Edge labels base class
    """

    task_name = 'edge_labels_mc'
    src_file = os.path.abspath(__file__)
    # retry is too complicated for now ...
    allow_retry = False

    # input and output volumes
    node_labels_path = luigi.Parameter()
    node_labels_key = luigi.Parameter()
    graph_path = luigi.Parameter()
    graph_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    ws_path = luigi.Parameter()
    ws_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        # we use this to also get the common default config
        config = LocalTask.default_task_config()
        config.update({'ignore_label_gt': False})
        return config

    def run_impl(self):
        # get the global config and init configs
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        # load the task config
        config = self.get_task_config()

        # update the task config
        config.update({'node_labels_path': self.node_labels_path,
                       'node_labels_key': self.node_labels_key,
                       'output_path': self.output_path, 'output_key': self.output_key,
                       'graph_path': self.graph_path, 'graph_key': self.graph_key,
                       'ws_path': self.ws_path, 'ws_key': self.ws_key})

        # prime and run the jobs
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(1)

class SkeletonizeBase(luigi.Task):
    """ Skeletonize base class
    """

    task_name = 'skeletonize'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input and output volumes
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter(default=DummyTask())

    def requires(self):
        return self.dependency

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # get shape, dtype and make block config
        with vu.file_reader(self.input_path, 'r') as f:
            shape = f[self.input_key].shape

        # load the skeletonize config
        task_config = self.get_task_config()

        # require output dataset
        chunks = (25, 256, 256)
        chunks = tuple(min(sh, ch) for sh, ch in zip(shape, chunks))
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key, shape=shape, chunks=chunks,
                              compression='gzip', dtype='uint64')

        # update the config with input and output paths and keys
        # as well as block shape
        task_config.update({'input_path': self.input_path, 'input_key': self.input_key,
                            'output_path': self.output_path, 'output_key': self.output_key})

        # prime and run the jobs
        n_jobs = 1
        self.prepare_jobs(n_jobs, None, task_config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)

class SimpleStitchAssignmentsBase(luigi.Task):
    """ SimpleStitchAssignments base class
    """

    task_name = 'simple_stitch_assignments'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    problem_path = luigi.Parameter()
    features_key = luigi.Parameter()
    graph_key = luigi.Parameter()
    assignments_path = luigi.Parameter()
    assignments_key = luigi.Parameter()
    edge_size_threshold = luigi.IntParameter(default=0)
    serialize_edges = luigi.BoolParameter(default=False)
    # task that is required before running this task
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        with vu.file_reader(self.problem_path, 'r') as f:
            shape = f[self.graph_key].attrs['shape']

        block_list = vu.blocks_in_volume(shape, block_shape, roi_begin, roi_end)
        n_jobs = min(len(block_list), self.max_jobs)

        config = self.get_task_config()
        tmp_file = os.path.join(self.tmp_folder, 'stitch_edges.n5')
        config.update({'input_path': tmp_file,
                       'problem_path': self.problem_path,
                       'features_key': self.features_key,
                       'graph_key': self.graph_key,
                       'assignments_path': self.assignments_path,
                       'assignments_key': self.assignments_key,
                       'edge_size_threshold': self.edge_size_threshold,
                       'serialize_edges': self.serialize_edges,
                       'n_jobs': n_jobs})

        with vu.file_reader(tmp_file) as f:
            f.require_group('job_results')

        # we only have a single job to find the labeling
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        # log the save-path again
        self.check_jobs(1)

class R2EFeaturesBase(luigi.Task):
    """ R2EFeatures base class
    """

    task_name = 'r2e_features'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input volumes and graph
    graph_path = luigi.Parameter()
    graph_key = luigi.Parameter()
    region_feature_paths = luigi.ListParameter()
    region_feature_keys = luigi.ListParameter()
    edge_feature_paths = luigi.ListParameter(default=None)
    edge_feature_keys = luigi.ListParameter(default=None)
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        # we use this to also get the common default config
        config = LocalTask.default_task_config()
        return config

    def run_impl(self):
        # get the global config and init configs
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # load the task config and update it with the feature and output paths and keys
        config = self.get_task_config()
        config.update({'graph_path': self.graph_path, 'graph_key': self.graph_key,
                       'region_feature_paths': self.region_feature_paths,
                       'region_feature_keys': self.region_feature_keys,
                       'edge_feature_paths': self.edge_feature_paths,
                       'edge_feature_keys': self.edge_feature_keys,
                       'output_path': self.output_path, 'output_key': self.output_key})

        # prime and run the jobs
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(1)
