Ejemplo n.º 1
0
class MergeTables(luigi.Task):
    """Merge the per-job result tables into one table sorted by label id.

    Every existing file ``<output_prefix>_job<N>.csv`` with N in
    ``range(max_jobs)`` is read as a tab-separated table. The tables are
    concatenated, sorted by the ``label_id`` column and written
    tab-separated (without the index) to ``output_path``.

    Raises:
        ValueError: if none of the expected job result files exists.
    """
    # common prefix of the per-job result files
    output_prefix = luigi.Parameter()
    # path of the merged table written by this task
    output_path = luigi.Parameter()
    # upper bound (exclusive) of the job ids that are probed for results
    max_jobs = luigi.IntParameter()
    # task that has to be complete before the merge runs
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run(self):
        # NOTE: not all jobs might have been scheduled, so only the
        # results that actually exist on disk are loaded
        candidate_paths = (self.output_prefix + '_job%i.csv' % job_id
                           for job_id in range(self.max_jobs))
        tables = [pd.read_csv(path, sep='\t')
                  for path in candidate_paths if os.path.exists(path)]

        # pd.concat fails with an uninformative "No objects to concatenate"
        # on an empty list, so report which results are missing instead
        if not tables:
            raise ValueError(
                "MergeTables: no job results found for prefix %s"
                % self.output_prefix)

        table = pd.concat(tables).sort_values('label_id')
        table.to_csv(self.output_path, index=False, sep='\t')

    def output(self):
        return luigi.LocalTarget(self.output_path)
Ejemplo n.º 2
0
class FindMergesBase(luigi.Task):
    """ FindMerges base class

    Configures and runs the 'find_merges' task as a single job.
    The helpers called in run_impl (global_config_values, init,
    get_task_config, prepare_jobs, submit_jobs, wait_for_jobs, check_jobs)
    are not defined here; they have to come from the class this base
    is combined with.
    """

    task_name = 'find_merges'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    path = luigi.Parameter()
    key = luigi.Parameter()
    out_path = luigi.Parameter()
    clear_ids = luigi.ListParameter()
    min_overlap = luigi.IntParameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        """ Write the job config, submit the single job and wait for it
        """
        # the first global config value is the shebang passed to init
        global_values = self.global_config_values()
        self.init(global_values[0])

        # task config, extended by the parameters of this task
        config = self.get_task_config()
        task_params = {'path': self.path, 'key': self.key,
                       'clear_ids': self.clear_ids,
                       'out_path': self.out_path,
                       'min_overlap': self.min_overlap}
        config.update(task_params)

        # this task always runs as exactly one job without a block list
        num_jobs = 1
        self.prepare_jobs(num_jobs, None, config)
        self.submit_jobs(num_jobs)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(num_jobs)
Ejemplo n.º 3
0
class ApplyThreshold(luigi.Task):
    """Write the indices of all features that pass a threshold test as json.

    The dataset ``feature_key`` is read completely from the z5py file at
    ``feature_path`` and compared against ``threshold`` according to
    ``threshold_mode``. The indices along the first axis where the
    comparison holds are dumped as a json list to ``out_path``.

    Raises:
        ValueError: if ``threshold_mode`` is not one of ``threshold_modes``.
    """
    feature_path = luigi.Parameter()
    feature_key = luigi.Parameter()
    out_path = luigi.Parameter()
    threshold = luigi.FloatParameter()
    # comparison applied as `feature <mode> threshold`
    threshold_mode = luigi.Parameter(default='less')
    dependency = luigi.TaskParameter()

    threshold_modes = ('less', 'greater', 'equal')

    def requires(self):
        return self.dependency

    def run(self):
        # validate the mode before any data is loaded; this is a real check
        # and not an assert because asserts are stripped under `python -O`
        if self.threshold_mode not in self.threshold_modes:
            raise ValueError(
                "invalid threshold_mode %r, expected one of %s"
                % (self.threshold_mode, ", ".join(self.threshold_modes)))
        comparisons = {'less': np.less,
                       'greater': np.greater,
                       'equal': np.equal}
        compare = comparisons[self.threshold_mode]

        feature_file = z5py.File(self.feature_path)
        feats = feature_file[self.feature_key][:]

        # indices along the first axis where the comparison holds
        filter_ids = np.where(compare(feats, self.threshold))[0].tolist()
        with open(self.out_path, 'w') as out_file:
            json.dump(filter_ids, out_file)

    def output(self):
        return luigi.LocalTarget(self.out_path)
Ejemplo n.º 4
0
class ExtractSingleCopyRegions(NMETask):
    """Produce a BED file with the single-copy regions of a hal file.

    The extraction itself is handed to a toil pipeline so that it can be
    parallelized."""
    chunk_size = luigi.IntParameter(default=500)
    prev_task = luigi.TaskParameter()

    def requires(self):
        return self.prev_task

    def output(self):
        bed_name = 'singleCopyRegions-%s.bed' % self.genome
        return self.target_in_work_dir(bed_name)

    def run(self):
        job_store = '%s/jobStore-singleCopy-%s' % (self.work_dir, self.genome)
        toil_opts = Job.Runner.getDefaultOptions(job_store)
        # if the job store is already on disk, ask toil to restart from it
        if os.path.exists(job_store):
            toil_opts.restart = True
        toil_opts.disableCaching = True
        toil_opts.batchSystem = self.batchSystem
        toil_opts.parasolCommand = self.parasolCommand
        toil_opts.environment = ["LD_LIBRARY_PATH"]
        with Toil(toil_opts) as workflow:
            if toil_opts.restart:
                result = workflow.restart()
            else:
                # fresh run: import the input BED, then start the root job
                bed_file = workflow.importFile('file://' + self.input().path)
                root_job = Job.wrapJobFn(extract_single_copy_regions_parallel,
                                         os.path.abspath(self.hal_file),
                                         bed_file, self.genome,
                                         self.chunk_size)
                result = workflow.start(root_job)
            # copy the pipeline result to the output target of this task
            export_url = 'file://' + os.path.abspath(self.output().path)
            workflow.exportFile(result, export_url)
Ejemplo n.º 5
0
class McSolverExact(luigi.Task):
    """Solve the multicut problem given by ``problem`` with the nifty ILP
    factory and write the solver result to an HDF5 target in the cache."""

    problem = luigi.TaskParameter()

    def requires(self):
        return self.problem

    @run_decorator
    def run(self):
        problem_target = self.input()

        # rebuild the graph and load the edge costs from the problem target
        costs = problem_target.read("costs")
        graph = nifty.graph.UndirectedGraph()
        graph.deserialize(problem_target.read("graph"))

        # there must be exactly one cost per edge
        assert graph.numberOfEdges == costs.shape[0]

        objective = nifty.graph.multicut.multicutObjective(graph, costs)
        solver_factory = nifty_ilp_factory(objective)
        result, energies, run_times = run_nifty_solver(objective,
                                                       solver_factory,
                                                       verbose=True)
        # only the last entry of the energy and time records is reported
        final_energy = energies[-1]
        final_time = run_times[-1]

        workflow_logger.info(
            "McSolverExact: inference with exact solver in %i s" % final_time)
        workflow_logger.info(
            "McSolverExact: energy of the solution %f" % final_energy)

        self.output().write(result)

    def output(self):
        cache_dir = PipelineParameter().cache
        return HDF5DataTarget(os.path.join(cache_dir, "McSolverExact.h5"))
Ejemplo n.º 6
0
class WorkflowBase(luigi.Task):
    """
    Base class for workflow tasks, i.e. tasks that only chain
    several other tasks together.
    """
    # folder for temporary files such as configurations
    tmp_folder = luigi.Parameter()
    # maximum number of jobs that run concurrently
    max_jobs = luigi.IntParameter()
    # directory holding the global configuration
    config_dir = luigi.Parameter()
    # one of local, slurm or lsf (matched case insensitively)
    target = luigi.Parameter()
    # optional upstream dependency of the workflow; defaults to a DummyTask
    dependency = luigi.TaskParameter(default=DummyTask())

    # maps the lower-case target to the postfix of the task class names
    _target_dict = {'lsf': 'LSF', 'slurm': 'Slurm', 'local': 'Local'}

    def _get_task_name(self, task_base_name):
        """ Return the task base name with the target specific postfix
        (LSF, Slurm or Local) appended; an unknown target raises KeyError
        """
        target_key = self.target.lower()
        return task_base_name + self._target_dict[target_key]

    def output(self):
        # the workflow is done once its input target exists,
        # so the path of that target is simply mirrored
        mirrored_path = self.input().path
        return luigi.LocalTarget(mirrored_path)

    @staticmethod
    def get_config():
        """ Return the default configs indexed by task name;
        the base workflow only provides the 'global' config
        """
        configs = {'global': BaseClusterTask.default_global_config()}
        return configs
Ejemplo n.º 7
0
class MoveToHdfsTask(luigi.Task):
    """Put the output of the upstream task onto HDFS.

    The upstream output is assumed to be a LocalTarget; the HDFS target
    keeps only the base name of the local file.
    """

    description = "Move the output of a task to HDFS"

    upstream_task = luigi.TaskParameter()
    # extra parameter that only serves to make otherwise equal tasks differ
    cache_invalidator = Parameter(
        default=None,
        description="Can be used to invalidate Luigi's instance cacher (which doesn't work with task params)",
    )

    def requires(self):
        return self.upstream_task

    def run(self):
        local_path = self.input().path
        hdfs_path = self.output().path
        HdfsClient().put(local_path, hdfs_path)

    def output(self):
        file_name = os.path.basename(self.input().path)
        return HdfsTarget(file_name)
Ejemplo n.º 8
0
class FilterBlocksBase(luigi.Task):
    """ FilterBlocks base class

    Creates the output dataset and distributes the blocks of the input
    volume over at most max_jobs jobs. The job helpers used in run_impl
    (global_config_values, init, get_task_config, prepare_jobs, ...) as well
    as n_retries, block_list and max_jobs are not defined here; they have to
    come from the class this base is combined with.
    """

    task_name = 'filter_blocks'
    src_file = os.path.abspath(__file__)

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    filter_path = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    # task that has to be complete before this one runs
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def clean_up_for_retry(self, block_list):
        # TODO remove any output of failed blocks because it might be corrupted
        super().clean_up_for_retry(block_list)

    def run_impl(self):
        """ Set up the output dataset, then submit the jobs and wait for them
        """
        # global config values, including the path to an optional block list
        global_values = self.global_config_values(with_block_list_path=True)
        shebang, block_shape, roi_begin, roi_end, block_list_path = global_values
        self.init(shebang)

        # task config, extended by the paths and keys of this task
        # and by the block shape
        config = self.get_task_config()
        task_params = {'input_path': self.input_path, 'input_key': self.input_key,
                       'block_shape': block_shape, 'filter_path': self.filter_path,
                       'output_path': self.output_path, 'output_key': self.output_key}
        config.update(task_params)

        # the output has the shape of the input; chunks are half a block
        # per axis, but never larger than the volume itself
        shape = vu.get_shape(self.input_path, self.input_key)
        chunks = tuple(min(block_len // 2, axis_len)
                       for block_len, axis_len in zip(block_shape, shape))
        with vu.file_reader(self.output_path) as out_file:
            out_file.require_dataset(self.output_key, shape=shape,
                                     dtype='uint64', chunks=chunks,
                                     compression='gzip')

        if self.n_retries != 0:
            # retry: only process the blocks stored on the task
            block_list = self.block_list
            self.clean_up_for_retry(block_list)
        else:
            # first attempt: process all blocks in the (roi of the) volume
            block_list = vu.blocks_in_volume(shape, block_shape, roi_begin, roi_end,
                                             block_list_path=block_list_path)
        n_jobs = min(len(block_list), self.max_jobs)

        self.prepare_jobs(n_jobs, block_list, config)
        self.submit_jobs(n_jobs)

        # block until all jobs are done, then verify that they succeeded
        self.wait_for_jobs()
        self.check_jobs(n_jobs)
Ejemplo n.º 9
0
class ProbsToCostsBase(luigi.Task):
    """ ProbsToCosts base class

    Creates a 1d float32 output dataset with one entry per row of the input
    dataset and runs the 'probs_to_costs' task as a single job. The job
    helpers used in run_impl are not defined here; they have to come from
    the class this base is combined with.
    """

    task_name = 'probs_to_costs'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input and output volumes
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    features_path = luigi.Parameter()
    features_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        """ Return the common default config extended by the task defaults
        """
        task_defaults = {'invert_inputs': False, 'transform_to_costs': True,
                         'weight_edges': False, 'weighting_exponent': 1.,
                         'beta': 0.5}
        config = LocalTask.default_task_config()
        config.update(task_defaults)
        return config

    def run_impl(self):
        """ Set up the output dataset, then submit the single job and wait for it
        """
        # of the four global config values only the shebang is used
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        config = self.get_task_config()

        # the output gets one entry per row of the input dataset
        with vu.file_reader(self.input_path) as in_file:
            n_edges = in_file[self.input_key].shape[0]
        # chunks hold at most 64**3 entries
        chunk_size = min(262144, n_edges)

        with vu.file_reader(self.output_path) as out_file:
            out_file.require_dataset(self.output_key, shape=(n_edges,),
                                     compression='gzip', dtype='float32',
                                     chunks=(chunk_size,))

        # extend the task config by the paths and keys of this task
        task_params = {'input_path': self.input_path, 'input_key': self.input_key,
                       'output_path': self.output_path, 'output_key': self.output_key,
                       'features_path': self.features_path,
                       'features_key': self.features_key}
        config.update(task_params)

        # this task always runs as exactly one job without a block list
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(1)
Ejemplo n.º 10
0
class MergePairwiseDistances(luigi.Task):
    """Merge the pickled per-job distance dictionaries into one dictionary.

    Reads ``object_distances_<N>.pkl`` from ``tmp_folder`` for every N in
    ``range(max_jobs)`` that exists and pickles the merged dictionary to
    ``output_path``. On duplicate keys the entry of the higher job id wins.
    """
    tmp_folder = luigi.Parameter()
    max_jobs = luigi.IntParameter()
    output_path = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run(self):
        job_paths = (os.path.join(self.tmp_folder,
                                  'object_distances_%i.pkl' % job_id)
                     for job_id in range(self.max_jobs))

        merged = {}
        for job_path in job_paths:
            # fewer than max_jobs jobs may have run, so a result can be missing
            if os.path.exists(job_path):
                with open(job_path, 'rb') as job_file:
                    job_distances = pickle.load(job_file)
                merged.update(job_distances)

        with open(self.output_path, 'wb') as out_file:
            pickle.dump(merged, out_file)

    def output(self):
        return luigi.LocalTarget(self.output_path)
Ejemplo n.º 11
0
class GenerateImageByBounds(luigi.WrapperTask):
    """
    Schedule one ``targetTask`` per map tile in the range spanned by the
    given bounds at zoom level ``zoom``.
    """
    west = luigi.FloatParameter()
    north = luigi.FloatParameter()
    south = luigi.FloatParameter()
    east = luigi.FloatParameter()
    zoom = luigi.IntParameter()
    # tile task class to schedule; must be one of the candidates in requires
    targetTask = luigi.TaskParameter(default=GenerateImageCSReliefMap)

    def requires(self):
        """
        Yield the tile tasks for the requested bounds.

        Raises:
            ValueError: if ``targetTask`` is not one of the supported
                tile tasks.
        """
        candidate_tasks = (GenerateImageCSReliefMap, GenerateImageCurvature,
                           GenerateImageSlope)
        if self.targetTask not in candidate_tasks:
            # a bare `raise` is invalid here because no exception is active
            raise ValueError(
                "unsupported targetTask %r, expected one of: %s"
                % (self.targetTask,
                   ", ".join(task.__name__ for task in candidate_tasks)))

        # tile numbers of the north-west and the south-east corner
        nw_tile = deg_to_num(self.north, self.west, self.zoom)
        se_tile = deg_to_num(self.south, self.east, self.zoom)
        edge_nw_x, edge_nw_y, _, _ = nw_tile
        edge_se_x, edge_se_y, _, _ = se_tile
        # single parenthesized expression: prints the same on python 2 and 3
        print(nw_tile + se_tile)

        # the range starts 3 tiles before the north-west tile and ends
        # (exclusively) 3 tiles after the south-east tile
        for tile_x in range(edge_nw_x - 3, edge_se_x + 3):
            for tile_y in range(edge_nw_y - 3, edge_se_y + 3):
                yield self.targetTask(x=tile_x, y=tile_y, z=self.zoom)
class CreateFolds(luigi.Task):
    """Split the rows of the dataset csv into shuffled K folds.

    For every fold the train and validation indices are dumped to
    ``_folds/fold_<fold_id>.h5``. The shuffling uses a fixed random state
    of 0.
    """
    dataset = luigi.TaskParameter()
    num_folds = luigi.IntParameter(default=5)

    def requires(self):
        return self.dataset

    def output(self):
        targets = []
        for fold_id in range(self.num_folds):
            targets.append(luigi.LocalTarget(f'_folds/fold_{fold_id}.h5'))
        return targets

    def run(self):
        targets = self.output()
        # make sure the parent folder of every fold file exists
        for target in targets:
            target.makedirs()

        frame = pd.read_csv(self.input().path)
        splitter = KFold(n_splits=self.num_folds, shuffle=True, random_state=0)

        # the splitter yields exactly one split per output target
        for target, (train_idx, val_idx) in zip(targets, splitter.split(frame)):
            fold_indices = {
                'train': train_idx,
                'val': val_idx,
            }
            dump(fold_indices, target.path)
Ejemplo n.º 13
0
class SolveGlobalBase(luigi.Task):
    """ SolveGlobal base class

    Runs the 'solve_global' task for one scale as a single job. The job
    helpers used in run_impl and the tmp_folder attribute are not defined
    here; they have to come from the class this base is combined with.
    """

    task_name = 'solve_global'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # problem and assignment locations and the scale to solve
    problem_path = luigi.Parameter()
    assignment_path = luigi.Parameter()
    assignment_key = luigi.Parameter()
    scale = luigi.IntParameter()
    # task that has to be complete before this one runs
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        """ Return the common default config extended by the task defaults
        """
        task_defaults = {
            'agglomerator': 'kernighan-lin',
            'time_limit_solver': None
        }
        config = LocalTask.default_task_config()
        config.update(task_defaults)
        return config

    def run_impl(self):
        """ Write the job config, submit the single job and wait for it
        """
        # of the four global config values only the shebang is used
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        # task config, extended by the parameters of this task
        config = self.get_task_config()
        task_params = {
            'assignment_path': self.assignment_path,
            'assignment_key': self.assignment_key,
            'scale': self.scale,
            'problem_path': self.problem_path
        }
        config.update(task_params)

        # the scale is passed as prefix to the job helpers
        scale_prefix = 's%i' % self.scale
        self.prepare_jobs(1, None, config, scale_prefix)
        self.submit_jobs(1, scale_prefix)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(1, scale_prefix)

    # part of the luigi API
    def output(self):
        log_name = self.task_name + '_s%i.log' % self.scale
        return luigi.LocalTarget(os.path.join(self.tmp_folder, log_name))
Ejemplo n.º 14
0
class Tabixed(luigi.Task):
    """
    Copies the output of the target task to a local target and runs
    tabix on the copy. The task only counts as complete when the
    ".tbi" index file exists next to the copy.
    """
    target = luigi.TaskParameter()

    @property
    def filename(self):
        # base name of the file produced by the target task
        source_path = self.target.output().path
        return os.path.basename(source_path)

    def requires(self):
        return self.target

    def run(self):
        local_path = self.output().path
        shutil.copyfile(self.target.output().path, local_path)
        tabix(local_path)

    def complete(self):
        # without the index the task is never complete; otherwise
        # defer to the regular luigi completeness check
        index_path = self.output().path + ".tbi"
        return os.path.exists(index_path) and luigi.Task.complete(self)

    def output(self):
        return GlobalConfig().local_target(self.filename)
Ejemplo n.º 15
0
class CallTask(luigi.WrapperTask):

    """Entry point that schedules ``task`` for every granule of every
    dataset listed in the ``level1_list`` file (one dataset per line).
    If the task has a "group" parameter, it is scheduled once per
    supported group of the dataset instead of once per granule.
    """

    level1_list = luigi.Parameter()
    acq_parser_hint = luigi.OptionalParameter(default="")
    outdir = luigi.Parameter()
    task = luigi.TaskParameter()

    def requires(self):
        with open(self.level1_list) as src:
            datasets = [line.strip() for line in src]

        for level1 in datasets:
            work_name = "{}-wagl".format(basename(level1))
            container = acquisitions(level1, self.acq_parser_hint)
            for granule in container.granules:
                # granules are independent of each other, so every granule
                # gets its own work root
                work_root = pjoin(self.outdir, work_name, granule)
                if "group" not in self.task.get_param_names():
                    yield self.task(level1, work_root, granule)
                    continue
                for group in container.supported_groups:
                    yield self.task(level1, work_root, granule, group)
Ejemplo n.º 16
0
class GenerateFeatures(luigi.WrapperTask):
    """Wrapper task that requires all feature generation tasks.

    Every required task is created with ``self.clone``; the extra keyword
    arguments listed in ``requires`` are passed along with the clone.
    """
    id_column = luigi.Parameter(default='item_id')
    dataset = luigi.TaskParameter(default=TrainSet())

    text_features = [
        'region', 'city', 'parent_category_name', 'category_name', 'param_1',
        'param_2', 'param_3', 'title', 'description', 'user_type'
    ]

    def requires(self):
        # (task class, extra clone arguments), yielded in exactly this order
        steps = [
            (CorrectImagePath, {'feature_name': 'image'}),
            (ApplyLogTransform, {'feature_name': 'deal_probability'}),
            (MarkNullInstances, {'feature_name': 'price'}),
            (FillNaTransform, {'feature_name': 'price'}),
            (StdScaled, {'feature_name': 'price_fillna'}),
            (MarkNullInstances, {'feature_name': 'image_top_1'}),
            (FillNaTransform, {'feature_name': 'image_top_1'}),
            (StdScaled, {'feature_name': 'image_top_1_fillna'}),
            (ExtractFeature, {'feature_name': 'city'}),
            (CreateFolds, {}),
            (OneHotEncode, {'feature_name': 'user_type'}),
            (OneHotEncode, {'feature_name': 'parent_category_name'}),
            (OneHotEncode, {'feature_name': 'category_name'}),
            (OneHotEncode, {'feature_name': 'region'}),
            (LabelEncode, {'feature_name': 'city'}),
            (LabelEncode, {'feature_name': 'param_1'}),
            (LabelEncode, {'feature_name': 'param_2'}),
            (LabelEncode, {'feature_name': 'param_3'}),
            (CharVocabulary, {'feature_name': 'description'}),
            (WordVectors, {'feature_name': 'description',
                           'train_features': ','.join(self.text_features)}),
        ]
        for task_cls, clone_kwargs in steps:
            yield self.clone(task_cls, **clone_kwargs)
Ejemplo n.º 17
0
class SubsampleBed(NMETask):
    """Randomly sample single bases from the regions of the input BED.

    Every region ``chrom start stop`` of the input is expanded into its
    individual positions. At most ``num_bases`` positions are sampled
    without replacement and written as one-base BED lines.
    """
    # float parameter (kept for compatibility); truncated to an int
    # before it is used as the sample size
    num_bases = luigi.FloatParameter()
    prev_task = luigi.TaskParameter()

    def requires(self):
        return self.prev_task

    def output(self):
        return self.target_in_work_dir('%s-sampled-%s.bed' %
                                       (self.genome, self.num_bases))

    def run(self):
        bases = []
        with self.input().open() as in_bed:
            for line in in_bed:
                fields = line.split()
                # skip blank lines instead of failing on fields[0]
                if not fields:
                    continue
                chrom = fields[0]
                start = int(fields[1])
                stop = int(fields[2])
                # `range` exists on python 2 and 3, unlike `xrange`
                bases.extend((chrom, pos) for pos in range(start, stop))

        # random.sample needs an integer sample size, but num_bases is a float
        sample_size = min(int(self.num_bases), len(bases))
        sample = random.sample(bases, sample_size)

        with self.output().open('w') as out_bed:
            for chrom, pos in sample:
                out_bed.write(
                    "\t".join([chrom, str(pos), str(pos + 1)]) + "\n")
class MergeMorphologyBase(luigi.Task):
    """ MergeMorphology base class

    Creates a float64 output dataset of shape (number_of_labels, 11) and
    runs the 'merge_morphology' jobs over blocks of label ids. The job
    helpers used in run_impl as well as max_jobs and tmp_folder are not
    defined here; they have to come from the class this base is combined
    with.
    """

    task_name = 'merge_morphology'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    number_of_labels = luigi.IntParameter()
    prefix = luigi.Parameter()
    # task that has to be complete before this one runs
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        """ Set up the output dataset, then submit the jobs and wait for them
        """
        # the first global config value is the shebang passed to init
        global_values = self.global_config_values()
        self.init(global_values[0])

        config = self.get_task_config()

        # one row with 11 columns per label, chunked in at most 100000 rows
        n_labels = self.number_of_labels
        out_shape = (n_labels, 11)
        out_chunks = (min(n_labels, 100000), 11)
        # the blocks are 1d ranges of label ids with the chunk length
        block_list = vu.blocks_in_volume([out_shape[0]], [out_chunks[0]])

        with vu.file_reader(self.output_path) as out_file:
            out_file.require_dataset(self.output_key, shape=out_shape,
                                     chunks=out_chunks, compression='gzip',
                                     dtype='float64')

        # extend the task config by the paths and keys of this task
        # and by the output layout
        task_params = {'input_path': self.input_path,
                       'input_key': self.input_key,
                       'output_path': self.output_path,
                       'output_key': self.output_key,
                       'out_shape': out_shape,
                       'out_chunks': out_chunks}
        config.update(task_params)

        n_jobs = self.max_jobs
        self.prepare_jobs(n_jobs, block_list, config, self.prefix)
        self.submit_jobs(n_jobs, self.prefix)

        # block until all jobs are done, then verify that they succeeded
        self.wait_for_jobs(self.prefix)
        self.check_jobs(n_jobs, self.prefix)

    # part of the luigi API
    def output(self):
        log_name = self.task_name + '_%s.log' % self.prefix
        return luigi.LocalTarget(os.path.join(self.tmp_folder, log_name))
Ejemplo n.º 19
0
class RunAnywayTask(luigi.Task):
    """Instantiate the task class given as ``targ`` without arguments and
    call its ``run`` method directly."""
    targ = luigi.TaskParameter()
    # try_once = luigi.BoolParameter(False)

    def run(self):
        self.targ().run()
Ejemplo n.º 20
0
class all_some_task(luigi.WrapperTask):
    """Require the ``require`` task once for every sample in the output of
    ``brp.fastqs()``; the requirements are yielded as one dict keyed by
    sample."""
    require = luigi.TaskParameter()

    def requires(self):
        per_sample = {}
        for sample in brp.fastqs().output():
            per_sample[sample] = self.require(sample=sample)
        yield per_sample
Ejemplo n.º 21
0
class GrowRegionsTaskBase(luigi.Task):
    """ GrowRegionsTask base class

    Runs the 'grow_regions' task as a single job. The job helpers used in
    run_impl are not defined here; they have to come from the class this
    base is combined with.
    """

    task_name = 'grow_regions'
    src_file = os.path.abspath(__file__)

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    de_labels_path = luigi.Parameter()
    de_labels_key = luigi.Parameter()
    boundaries_path = luigi.Parameter()
    boundaries_key = luigi.Parameter()
    graph_path = luigi.Parameter()
    graph_key = luigi.Parameter()

    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        """ Return the common default config; this task adds no own defaults
        """
        return LocalTask.default_task_config()

    def run_impl(self):
        """ Write the job config, submit the single job and wait for it
        """
        # of the four global config values only the shebang is used
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        config = self.get_task_config()

        # every path / key parameter of this task is copied into the
        # job config under its own name
        param_names = ('input_path', 'input_key',
                       'de_labels_path', 'de_labels_key',
                       'boundaries_path', 'boundaries_key',
                       'graph_path', 'graph_key',
                       'output_path', 'output_key')
        config.update({name: getattr(self, name) for name in param_names})

        # this task always runs as exactly one job without a block list
        self.prepare_jobs(1, None, config)
        self.submit_jobs(1)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(1)
class RelabelWorkflow(luigi.Task):
    """Chain the three relabeling tasks: find the unique ids, find the new
    labeling and write the assignment. Once the last task is done, a
    success marker is written to ``relabeling_workflow.log`` in the
    temporary folder."""

    # path to the n5 file and keys
    path = luigi.Parameter()
    key = luigi.Parameter()
    # maximal number of jobs that will be run in parallel
    max_jobs = luigi.IntParameter()
    # path to the configuration
    # TODO allow individual paths for individual blocks
    config_path = luigi.Parameter()
    tmp_folder = luigi.Parameter()
    dependency = luigi.TaskParameter()
    # FIXME default does not work; this still needs to be specified
    # TODO different time estimates for different sub-tasks
    time_estimate = luigi.IntParameter(default=10)
    run_local = luigi.BoolParameter(default=False)

    def requires(self):
        # the number of write jobs comes from the config file (default 50)
        with open(self.config_path) as config_file:
            n_jobs_write = json.load(config_file).get('n_jobs_write', 50)

        # arguments that are identical for all three sub-tasks
        shared = dict(path=self.path,
                      config_path=self.config_path,
                      tmp_folder=self.tmp_folder,
                      time_estimate=self.time_estimate,
                      run_local=self.run_local)

        # each sub-task depends on the previous one
        uniques_task = FindUniquesTask(key=self.key,
                                       max_jobs=self.max_jobs,
                                       dependency=self.dependency,
                                       **shared)
        labels_task = FindLabelingTask(key=self.key,
                                       max_jobs=self.max_jobs,
                                       dependency=uniques_task,
                                       **shared)
        # the assignment is written in place: input and output key are equal
        return WriteAssignmentTask(in_key=self.key,
                                   out_key=self.key,
                                   max_jobs=n_jobs_write,
                                   identifier='write_relabel',
                                   dependency=labels_task,
                                   **shared)

    def run(self):
        # sanity check: the target of the write task must be on disk
        upstream_path = self.input().path
        assert os.path.exists(upstream_path)
        marker_path = self.output().path
        with open(marker_path, 'w') as marker:
            marker.write("Success")

    def output(self):
        log_path = os.path.join(self.tmp_folder, 'relabeling_workflow.log')
        return luigi.LocalTarget(log_path)
Ejemplo n.º 23
0
class CreateMultisetBase(luigi.Task):
    """ CreateMultiset base class

    Creates a uint8 output dataset with the shape of the input and
    distributes the blocks of the volume over at most max_jobs jobs. The
    job helpers used in run_impl as well as max_jobs are not defined here;
    they have to come from the class this base is combined with.
    """

    task_name = 'create_multiset'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # input and output volumes
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    # task that has to be complete before this one runs
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    @staticmethod
    def default_task_config():
        """ Return the common default config extended by the compression
        """
        task_defaults = {'compression': 'gzip'}
        config = LocalTask.default_task_config()
        config.update(task_defaults)
        return config

    def run_impl(self):
        """ Set up the output dataset, then submit the jobs and wait for them
        """
        global_values = self.global_config_values()
        shebang, block_shape, roi_begin, roi_end = global_values
        self.init(shebang)

        # the output dataset has the shape of the input dataset
        shape = vu.get_shape(self.input_path, self.input_key)

        config = self.get_task_config()

        # one chunk per block, compressed as configured (gzip by default)
        compression = config.get('compression', 'gzip')
        with vu.file_reader(self.output_path) as out_file:
            out_file.require_dataset(self.output_key, shape=shape,
                                     chunks=tuple(block_shape),
                                     compression=compression, dtype='uint8')

        # extend the task config by the paths and keys of this task
        # and by the block shape
        task_params = {'input_path': self.input_path, 'input_key': self.input_key,
                       'output_path': self.output_path, 'output_key': self.output_key,
                       'block_shape': block_shape}
        config.update(task_params)

        block_list = vu.blocks_in_volume(shape, block_shape, roi_begin, roi_end)
        n_blocks = len(block_list)
        self._write_log('scheduling %i blocks to be processed' % n_blocks)
        n_jobs = min(n_blocks, self.max_jobs)

        self.prepare_jobs(n_jobs, block_list, config)
        self.submit_jobs(n_jobs)

        # block until all jobs are done, then verify that they succeeded
        self.wait_for_jobs()
        self.check_jobs(n_jobs)
class BlockwiseSolver(luigi.Task):
    """Abstract base for solvers that work on a hierarchy of reduced problems.

    ``requires`` returns the global problem followed by one ReducedProblem
    per hierarchy level; every level builds on the problem of the previous
    one and doubles the block shape. Subclasses have to implement ``run``
    and ``output``.
    """

    pathToSeg = luigi.Parameter()
    globalProblem = luigi.TaskParameter()
    # number of hierarchy levels; declared as a plain Parameter, so the
    # value is converted with int() where it is used
    numberOfLevels = luigi.Parameter()
    keyToSeg = luigi.Parameter(default='data')

    def requires(self):
        # block size in first hierarchy level
        initial_block_shape = PipelineParameter().multicutBlockShape
        # block overlap, for now same for each hierarchy lvl
        block_overlap = PipelineParameter().multicutBlockOverlap

        # a plain luigi.Parameter is a string when it is set from the
        # command line or a config file, which range() does not accept
        n_levels = int(self.numberOfLevels)

        problems = [self.globalProblem]
        for level in range(n_levels):
            # the block shape doubles with every level
            block_factor = 2 ** level
            # TODO check that we don't get larger than the actual shape here
            block_shape = [dim * block_factor for dim in initial_block_shape]
            problems.append(
                ReducedProblem(pathToSeg=self.pathToSeg,
                               problem=problems[-1],
                               blockShape=block_shape,
                               blockOverlap=block_overlap,
                               level=level,
                               keyToSeg=self.keyToSeg))

        return problems

    def run(self):
        raise NotImplementedError(
            "BlockwiseSolver is abstract and does not implement a run functionality!"
        )

    # map back to the global solution
    def map_node_result_to_global(self,
                                  problems,
                                  reduced_node_result,
                                  reduced_problem_index=-1):
        """Map a node result of a reduced problem back to the global nodes.

        Args:
            problems: problem targets; the first one has to provide
                'number_of_nodes' and the selected one 'new2global'.
            reduced_node_result: one result value per node of the
                reduced problem.
            reduced_problem_index: index into ``problems`` of the reduced
                problem the result belongs to (default: the last one).

        Returns:
            uint32 array with one entry per global node; entries that are
            not covered by 'new2global' stay 0.
        """
        n_nodes_global = problems[0].read('number_of_nodes')
        reduced_problem = problems[reduced_problem_index]
        to_global_nodes = reduced_problem.read("new2global")

        # TODO vectorize
        node_result = np.zeros(n_nodes_global, dtype='uint32')
        for node_id, node_res in enumerate(reduced_node_result):
            node_result[to_global_nodes[node_id]] = node_res

        return node_result

    def output(self):
        raise NotImplementedError(
            "BlockwiseSolver is abstract and does not implement the output functionality!"
        )
Ejemplo n.º 25
0
class postgres_count_matrix(luigi.Task):
    """Merge the per-sample count files into a single table.

    The merged table is written to ``<exp_dir>/<table>.csv`` and to the
    ``table`` relation inside the ``exp_name`` schema of the postgres
    database. Completion is tracked through a ``PostgresTarget`` marker.
    """
    password = luigi.Parameter(significant=False)
    host = luigi.Parameter(significant=False)
    database = 'rna'
    user = luigi.Parameter(default='rna', significant=False)
    table = luigi.Parameter(default='gene_counts')
    feature_counter = luigi.TaskParameter(default=gene_counter,
                                          significant=False)

    def requires(self):
        """Require one ``feature_counter`` task per sample.

        The sample names are the items produced by
        ``fastqs(sample='').run()``; the returned dict maps each name to its
        counting task.
        """
        return {
            x: self.feature_counter(sample=x)
            for x in fastqs(sample='').run()
        }

    def run(self):
        """Read all count files, export the merged table and touch the marker."""
        engine = create_engine(
            'postgresql://%s:%s@%s/%s' %
            (self.user, self.password, self.host, self.database))

        # Creating the schema is best effort (presumably it fails when the
        # schema already exists). SQLAlchemy re-raises driver errors as its
        # own exception classes, which is likely why catching
        # psycopg2.ProgrammingError never matched. `except Exception` keeps
        # the best-effort behaviour without also swallowing
        # KeyboardInterrupt / SystemExit like a bare `except` would.
        try:
            engine.execute(CreateSchema(parameters().exp_name))
        except Exception:
            pass

        # self.input() is a dict keyed like requires(); fetch it only once
        inputs = self.input()
        pandas_files = [
            pd.read_table(target.path,
                          skiprows=2,
                          index_col=0,
                          names=[
                              'Gene', 'Chr', 'Start', 'End', 'Strand',
                              'Length', name
                          ],
                          usecols=['Gene', name],
                          header=None) for name, target in inputs.items()
        ]
        # one column per sample, columns ordered by sample name
        count_table = pd.concat(pandas_files, axis=1).sort_index(axis=1)
        count_table.to_csv("%s/%s.csv" % (parameters().exp_dir, self.table))
        count_table.to_sql(self.table,
                           con=engine,
                           schema=parameters().exp_name)

        # Taken from luigi source code, makes marker table and adds entry
        marker = self.output()
        marker.create_marker_table()
        connection = marker.connect()
        try:
            marker.touch(connection)
            connection.commit()
        finally:
            # always release the connection, even if touching the marker fails
            connection.close()

    def output(self):
        """Return the postgres marker target ``<exp_name>_<table>``."""
        return luigi.postgres.PostgresTarget(host=self.host,
                                             database=self.database,
                                             user=self.user,
                                             password=self.password,
                                             table=self.table,
                                             update_id=parameters().exp_name +
                                             '_' + self.table)
Ejemplo n.º 26
0
class TableImplBase(luigi.Task):
    """ table_impl base class

    Schedules one block per input file and distributes the blocks over at
    most ``self.max_jobs`` jobs.
    """

    task_name = "table_impl"
    src_file = os.path.abspath(__file__)

    input_files = luigi.ListParameter()
    output_files = luigi.ListParameter()
    input_key = luigi.Parameter()
    resolution = luigi.ListParameter()
    dependency = luigi.TaskParameter(default=DummyTask())

    def requires(self):
        """Return the task that has to finish before this one."""
        return self.dependency

    def require_output_folders(self):
        """Create the parent folder of every output file if it is missing."""
        output_folders = {
            os.path.dirname(out_file) for out_file in self.output_files
        }
        for out_folder in output_folders:
            # an output file without a directory component has an empty
            # dirname; os.makedirs('') would raise FileNotFoundError
            if out_folder:
                os.makedirs(out_folder, exist_ok=True)

    def run_impl(self):
        """Write the task config, then run the jobs and check their success."""
        # get the global config and init configs
        shebang = self.global_config_values()[0]
        self.init(shebang)

        self.require_output_folders()

        # luigi may randomly shuffle the file lists, so we need to make sure
        # they are ordered here
        # NOTE(review): both lists are sorted independently, so input/output
        # pairing presumably relies on matching sort orders -- confirm
        input_files = sorted(self.input_files)
        output_files = sorted(self.output_files)

        # load and update the task config
        task_config = self.get_task_config()
        task_config.update({
            "input_files": input_files,
            "output_files": output_files,
            "resolution": self.resolution,
            "input_key": self.input_key
        })

        # one block per input file
        block_list = list(range(len(input_files)))
        self._write_log("scheduled %i blocks to run" % len(block_list))

        # prime and run the jobs
        n_jobs = min(len(block_list), self.max_jobs)
        self.prepare_jobs(n_jobs, block_list, task_config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)
class EdgeLabelsBase(luigi.Task):
    """ Base class of the 'edge_labels_mc' task: forwards its path and key
    parameters to the task config and runs a single job.
    """

    task_name = 'edge_labels_mc'
    src_file = os.path.abspath(__file__)
    # retrying is too complicated for now, so it is disabled
    allow_retry = False

    # paths and keys forwarded to the job, plus the upstream task
    node_labels_path = luigi.Parameter()
    node_labels_key = luigi.Parameter()
    graph_path = luigi.Parameter()
    graph_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    ws_path = luigi.Parameter()
    ws_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        """Return the task that has to finish before this one."""
        return self.dependency

    @staticmethod
    def default_task_config():
        """Return the common LocalTask defaults extended by
        'ignore_label_gt' (False).
        """
        defaults = LocalTask.default_task_config()
        defaults['ignore_label_gt'] = False
        return defaults

    def run_impl(self):
        """Write the task config, then run one job and check its success."""
        # only the shebang of the global config is needed here
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        # extend the task config by the parameters the job reads
        forwarded = ('node_labels_path', 'node_labels_key',
                     'output_path', 'output_key',
                     'graph_path', 'graph_key',
                     'ws_path', 'ws_key')
        job_config = self.get_task_config()
        job_config.update({name: getattr(self, name) for name in forwarded})

        # everything is done by a single job
        n_jobs = 1
        self.prepare_jobs(n_jobs, None, job_config)
        self.submit_jobs(n_jobs)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(n_jobs)
Ejemplo n.º 28
0
class SkeletonizeBase(luigi.Task):
    """ Base class of the 'skeletonize' task: makes sure the output dataset
    exists and runs a single job.
    """

    task_name = 'skeletonize'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # locations of the input and output datasets, plus the upstream task
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter(default=DummyTask())

    def requires(self):
        """Return the task that has to finish before this one."""
        return self.dependency

    def run_impl(self):
        """Require the output dataset, then run one job and check its success."""
        # only the shebang of the global config is used by this task
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        # the output dataset gets the same shape as the input dataset
        with vu.file_reader(self.input_path, 'r') as f_in:
            shape = f_in[self.input_key].shape

        task_config = self.get_task_config()

        # clip the default chunks so that they never exceed the volume
        default_chunks = (25, 256, 256)
        chunks = tuple(
            min(extent, chunk) for extent, chunk in zip(shape, default_chunks)
        )
        with vu.file_reader(self.output_path) as f_out:
            f_out.require_dataset(self.output_key,
                                  shape=shape,
                                  chunks=chunks,
                                  compression='gzip',
                                  dtype='uint64')

        # tell the job where to read from and where to write to
        task_config.update(input_path=self.input_path,
                           input_key=self.input_key,
                           output_path=self.output_path,
                           output_key=self.output_key)

        # everything is done by a single job
        self.prepare_jobs(1, None, task_config)
        self.submit_jobs(1)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(1)
Ejemplo n.º 29
0
class SimpleStitchAssignmentsBase(luigi.Task):
    """ Base class of the 'simple_stitch_assignments' task: prepares the
    task config and runs a single job.
    """

    task_name = 'simple_stitch_assignments'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    problem_path = luigi.Parameter()
    features_key = luigi.Parameter()
    graph_key = luigi.Parameter()
    assignments_path = luigi.Parameter()
    assignments_key = luigi.Parameter()
    edge_size_threshold = luigi.IntParameter(default=0)
    serialize_edges = luigi.BoolParameter(default=False)
    # upstream task that has to be complete first
    dependency = luigi.TaskParameter()

    def requires(self):
        """Return the task that has to finish before this one."""
        return self.dependency

    def run_impl(self):
        """Write the task config, then run one job and check its success."""
        shebang, block_shape, roi_begin, roi_end = self.global_config_values()
        self.init(shebang)

        # the volume shape is stored in the attributes of the graph
        with vu.file_reader(self.problem_path, 'r') as f_problem:
            volume_shape = f_problem[self.graph_key].attrs['shape']
        blocks = vu.blocks_in_volume(volume_shape, block_shape,
                                     roi_begin, roi_end)
        # this value is only passed on through the config ('n_jobs');
        # the task itself submits exactly one job below
        n_block_jobs = min(len(blocks), self.max_jobs)

        config = self.get_task_config()
        edge_file = os.path.join(self.tmp_folder, 'stitch_edges.n5')
        job_settings = {
            'input_path': edge_file,
            'problem_path': self.problem_path,
            'features_key': self.features_key,
            'graph_key': self.graph_key,
            'assignments_path': self.assignments_path,
            'assignments_key': self.assignments_key,
            'edge_size_threshold': self.edge_size_threshold,
            'serialize_edges': self.serialize_edges,
            'n_jobs': n_block_jobs
        }
        config.update(job_settings)

        # make sure the 'job_results' group exists in the temporary file
        with vu.file_reader(edge_file) as f_edges:
            f_edges.require_group('job_results')

        # we only have a single job to find the labeling
        n_jobs = 1
        self.prepare_jobs(n_jobs, None, config)
        self.submit_jobs(n_jobs)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(n_jobs)
class R2EFeaturesBase(luigi.Task):
    """ Base class of the 'r2e_features' task: forwards the graph, feature
    and output locations to the task config and runs a single job.
    """

    task_name = 'r2e_features'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # graph, feature inputs (edge features are optional) and output location
    graph_path = luigi.Parameter()
    graph_key = luigi.Parameter()
    region_feature_paths = luigi.ListParameter()
    region_feature_keys = luigi.ListParameter()
    edge_feature_paths = luigi.ListParameter(default=None)
    edge_feature_keys = luigi.ListParameter(default=None)
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        """Return the task that has to finish before this one."""
        return self.dependency

    @staticmethod
    def default_task_config():
        """Return the common LocalTask default config without additions."""
        return LocalTask.default_task_config()

    def run_impl(self):
        """Write the task config, then run one job and check its success."""
        # only the shebang of the global config is used by this task
        shebang, _, _, _ = self.global_config_values()
        self.init(shebang)

        # the parameter values are handed to the config exactly as luigi
        # provides them
        forwarded = ('graph_path', 'graph_key',
                     'region_feature_paths', 'region_feature_keys',
                     'edge_feature_paths', 'edge_feature_keys',
                     'output_path', 'output_key')
        job_config = self.get_task_config()
        job_config.update({name: getattr(self, name) for name in forwarded})

        # everything is done by a single job
        n_jobs = 1
        self.prepare_jobs(n_jobs, None, job_config)
        self.submit_jobs(n_jobs)

        # block until the job is done, then verify that it succeeded
        self.wait_for_jobs()
        self.check_jobs(n_jobs)