class FindMergesBase(luigi.Task):
    """Luigi task that configures and submits a single 'find_merges' job.

    The job helpers called from ``run_impl`` (``init``, ``prepare_jobs``,
    ``submit_jobs``, ...) are not defined in this class; presumably they are
    supplied by a mixin combined with this base -- verify against subclasses.
    """
    task_name = 'find_merges'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    # these five values are copied verbatim into the job config in run_impl
    path = luigi.Parameter()
    key = luigi.Parameter()
    out_path = luigi.Parameter()
    clear_ids = luigi.ListParameter()
    min_overlap = luigi.IntParameter()
    # upstream task returned from requires()
    dependency = luigi.TaskParameter()

    def requires(self):
        """Return the upstream task passed in as ``dependency``."""
        return self.dependency

    def run_impl(self):
        """Assemble the job config, then submit one job and wait for it."""
        # the first global config value is what ``init`` is called with
        global_values = self.global_config_values()
        self.init(global_values[0])

        # extend the task config with this task's own parameters
        config = self.get_task_config()
        job_params = {
            'path': self.path,
            'key': self.key,
            'clear_ids': self.clear_ids,
            'out_path': self.out_path,
            'min_overlap': self.min_overlap,
        }
        config.update(job_params)

        # exactly one job is prepared and submitted
        job_count = 1
        self.prepare_jobs(job_count, None, config)
        self.submit_jobs(job_count)

        # block until the job is done, then verify it succeeded
        self.wait_for_jobs()
        self.check_jobs(job_count)
class NeuroproofLearnTaskMixin:
    """Parameters plus input/output wiring for a Neuroproof learning task."""

    prob_loading_plan_path = luigi.Parameter(
        description="Location of the probability prediction volume")
    additional_locations = luigi.ListParameter(
        default=[],
        description="Additional probability map locations for Neuroproof")
    seg_loading_plan_path = luigi.Parameter(
        description="Location of the pipeline's watershed segmentation")
    gt_loading_plan_path = luigi.Parameter(
        description="Location of the ground truth segmentation")
    output_location = luigi.Parameter(
        description="Location for the classifier file. Use an .xml extension "
        "to use the OpenCV random forest classifier. Use an .h5 extension "
        "to use the Vigra random forest classifier")

    def input(self):
        """Yield the source targets of every loading plan this task reads.

        Plans are visited in the order probability, segmentation, ground
        truth, followed by each entry of ``additional_locations``.
        """
        plan_paths = [self.prob_loading_plan_path,
                      self.seg_loading_plan_path,
                      self.gt_loading_plan_path]
        plan_paths.extend(self.additional_locations)
        for plan_path in plan_paths:
            reader = DestVolumeReader(plan_path)
            for target in reader.get_source_targets():
                yield target

    def output(self):
        """Return a local target at ``output_location``."""
        return luigi.LocalTarget(self.output_location)
# Example #3
# 0
class ScheduledExternalProgramTask(ExternalProgramTask):
    """
    Variant of :class:`luigi.contrib.external_program.ExternalProgramTask` that
    executes the task with a :class:`Scheduler`.

    With ``scheduler='local'`` the task runs through the parent class; any
    other choice is looked up in ``_schedulers`` and the task is handed to
    that scheduler's ``run_task``.
    """
    scheduler = luigi.ChoiceParameter(
        default=cfg.scheduler,
        choices=['local'] + list(_schedulers),
        positional=False,
        significant=False,
        description='Scheduler to use for running the task')
    scheduler_partition = luigi.OptionalParameter(
        default=cfg.scheduler_partition,
        positional=False,
        significant=False,
        description='Scheduler partition (or queue) to use if supported')
    scheduler_extra_args = luigi.ListParameter(
        default=cfg.scheduler_extra_args,
        positional=False,
        significant=False,
        description='Extra arguments to pass to the scheduler')

    walltime = luigi.TimeDeltaParameter(
        default=datetime.timedelta(),
        positional=False,
        significant=False,
        description='Amount of time to allocate for the task, default value of zero implies unlimited time')
    cpus = luigi.IntParameter(
        default=1,
        positional=False,
        significant=False,
        description='Number of CPUs to allocate for the task')
    memory = luigi.FloatParameter(
        default=1,
        positional=False,
        significant=False,
        description='Amount of memory (in gigabyte) to allocate for the task')

    def __init__(self, *args, **kwds):
        """Initialise the task and resolve the non-local scheduler, if any.

        Raises:
            ValueError: if ``scheduler`` is not ``'local'`` and has no entry
                in ``_schedulers``.
        """
        super(ScheduledExternalProgramTask, self).__init__(*args, **kwds)
        try:
            # 'local' needs no scheduler object; run() uses the parent class
            if self.scheduler != 'local':
                self._scheduler = _schedulers[self.scheduler]
        except KeyError:
            raise ValueError('Unsupported scheduler {}'.format(self.scheduler))

    @property
    def resources(self):
        """Resources claimed by this task.

        Local runs claim ``cpus`` and ``memory``; scheduled runs claim one
        unit of the ``<scheduler>_jobs`` resource instead.
        """
        if self.scheduler == 'local':
            # local_jobs is actually constrained by the number of workers
            return {'cpus': self.cpus, 'memory': self.memory}
        else:
            return {'{}_jobs'.format(self.scheduler): 1}

    def run(self):
        """Run locally via the parent class, or delegate to the scheduler."""
        if self.scheduler == 'local':
            return super(ScheduledExternalProgramTask, self).run()
        else:
            return self._scheduler.run_task(self)
class BootstrapSpokeAsTask(tasks.PuppetTask):
    """Task that calls ``sdk.bootstrap_spoke_as`` for one account."""

    puppet_account_id = luigi.Parameter()
    account_id = luigi.Parameter()
    iam_role_arns = luigi.ListParameter()
    role_name = luigi.Parameter()
    permission_boundary = luigi.Parameter()
    puppet_role_name = luigi.Parameter()
    puppet_role_path = luigi.Parameter()

    def params_for_results_display(self):
        """Return the two account ids; ``run`` writes this dict as output."""
        return dict(
            puppet_account_id=self.puppet_account_id,
            account_id=self.account_id,
        )

    def run(self):
        """Bootstrap using the given role ARNs plus this account's role ARN."""
        partition = config.get_partition()
        # the configured ARNs come first, the ARN built from role_name last
        account_role_arn = (
            f"arn:{partition}:iam::{self.account_id}:role/{self.role_name}")
        iam_role_arns_to_use = list(self.iam_role_arns) + [account_role_arn]
        sdk.bootstrap_spoke_as(
            self.puppet_account_id,
            iam_role_arns_to_use,
            self.permission_boundary,
            self.puppet_role_name,
            self.puppet_role_path,
        )
        self.write_output(self.params_for_results_display())
# Example #5
# 0
class DeleteStoragePlan(RunMixin, RequiresMixin, luigi.Task):
    '''a task to delete a storage plan's .tif files'''

    task_namespace = "ariadne_microns_pipeline"

    dependency_outputs = luigi.ListParameter(
        default=[],
        description="The outputs of this task's dependencies. The task "
        "requests these as inputs so that all of them must be present "
        "before the storage plan is deleted.")
    storage_plan_path = luigi.Parameter(description="Storage plan to delete")

    def input(self):
        '''Yield the storage plan's volume target, then each dependency output

        The volume target is always the first item yielded; ariadne_run
        relies on that ordering.
        '''
        yield SrcVolumeTarget(self.storage_plan_path)
        for dependency_output in self.dependency_outputs:
            yield luigi.LocalTarget(dependency_output)

    def output(self):
        '''Return the local "deleted" marker file for the storage plan'''
        return luigi.LocalTarget(
            SrcVolumeTarget.storage_plan_path_to_deleted_file(
                self.storage_plan_path))

    def ariadne_run(self):
        '''Remove the storage plan's volume and write the marker file'''
        # The builtin next() works on Python 2.6+ and Python 3; the
        # generator method .next() exists only on Python 2.
        volume_target = next(self.input())
        volume_target.remove()
        with self.output().open("w") as fd:
            fd.write("So sorry.\n")
# Example #6
# 0
class StringTieMerge(SlurmExecutableTask, CheckTargetNonEmpty):
    """Merge the StringTie outputs of several libraries into one GTF."""

    lib_list = luigi.ListParameter()
    output_prefix = luigi.Parameter()
    library = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 2000
        self.n_cpu = 4
        self.partition = "nbi-medium"

    def requires(self):
        """One cloned ``StringTie`` task per entry of ``lib_list``."""
        return [self.clone(StringTie, library=lib) for lib in self.lib_list]

    def output(self):
        """Target for the merged ``stringtie.gtf`` under ``base_dir``."""
        gtf_path = os.path.join(self.base_dir, VERSION, PIPELINE,
                                self.output_prefix, 'stringtie.gtf')
        return LocalTarget(gtf_path)

    def work_script(self):
        """Return the bash script that runs ``stringtie --merge``.

        The input paths are written, one per line, to a temporary file that
        is then passed to ``--merge``; the result is moved into place last.
        """
        self.temp = TemporaryFile()
        input_listing = "\n".join(target.path for target in self.input())
        script = '''#!/bin/bash
        source stringtie-1.3.0;
        set -euo pipefail

        echo '{input}' > {temp}
        stringtie  -p {n_cpu} --merge {temp} > {output}.temp

        mv {output}.temp {output}
        '''
        return script.format(input=input_listing,
                             output=self.output().path,
                             temp=self.temp.path,
                             n_cpu=self.n_cpu)
# Example #7
# 0
class CreateBwaIndices(FtarcTask):
    """Run ``<bwa> index`` on a FASTA file and expose the index files."""

    fa_path = luigi.Parameter()
    bwa = luigi.Parameter(default='bwa')
    use_bwa_mem2 = luigi.BoolParameter(default=False)
    add_index_args = luigi.ListParameter(default=list())
    sh_config = luigi.DictParameter(default=dict())
    priority = 100

    def output(self):
        """Return one target per index suffix appended to ``fa_path``.

        The suffix set depends on ``use_bwa_mem2``.
        """
        if self.use_bwa_mem2:
            suffixes = ['0123', 'amb', 'ann', 'pac', 'bwt.2bit.64']
        else:
            suffixes = ['pac', 'bwt', 'ann', 'amb', 'sa']
        return [
            luigi.LocalTarget(f'{self.fa_path}.{suffix}')
            for suffix in suffixes
        ]

    def run(self):
        """Set up the shell in the FASTA's directory and run the indexer."""
        fa = Path(self.fa_path)
        run_id = fa.stem
        self.print_log(f'Create BWA indices:\t{run_id}')
        self.setup_shell(run_id=run_id,
                         commands=self.bwa,
                         cwd=fa.parent,
                         **self.sh_config)
        # each extra argument is appended with a single leading space
        extra_args = ''.join(f' {a}' for a in self.add_index_args)
        index_paths = [o.path for o in self.output()]
        self.run_shell(args=f'set -e && {self.bwa} index{extra_args} {fa}',
                       input_files_or_dirs=fa,
                       output_files_or_dirs=index_paths)
# Example #8
# 0
class CybersourceDataValidationTask(WarehouseMixin, luigi.WrapperTask):
    """Require one per-day Cybersource TSV for every configured merchant."""

    import_date = luigi.DateParameter()

    cybersource_merchant_ids = luigi.ListParameter(
        config_path={'section': 'payment', 'name': 'cybersource_merchant_ids'},
    )

    def requires(self):
        """Yield an ``ExternalURL`` for each merchant and each interval date.

        The interval starts at the merchant's configured ``interval_start``
        and ends at ``import_date``, or at ``merchant_close_date`` when that
        is configured and earlier.
        """
        config = get_config()
        for merchant_id in self.cybersource_merchant_ids:
            section_name = 'cybersource:' + merchant_id
            start_text = config.get(section_name, 'interval_start')
            interval_start = luigi.DateParameter().parse(start_text)

            interval_end = self.import_date
            close_text = config.get(section_name, 'merchant_close_date', '')
            if close_text:
                close_date = luigi.DateParameter().parse(close_text)
                interval_end = min(self.import_date, close_date)

            # the file name depends only on the merchant, not on the date
            filename = "cybersource_{}.tsv".format(merchant_id)
            for day in date_interval.Custom(interval_start, interval_end):
                partition = 'dt=' + day.isoformat()
                yield ExternalURL(url=url_path_join(
                    self.warehouse_path, 'payments', partition, filename))
# Example #9
# 0
class SmaliList(luigi.Task):
    """Collect the de-duplicated smali lists of a package's APKs by version."""

    pkg = luigi.Parameter()
    apks = luigi.ListParameter(significant=False)

    def requires(self):
        """One ``ApiExtractorRun`` per APK file name."""
        return [ApiExtractorRun(file_name=fn, pkg=self.pkg) for fn in self.apks]

    def output(self):
        """Target ``<soot_smalilist_folder>/<pkg>/<pkg>.json``."""
        json_path = os.path.join(cfg.soot_smalilist_folder,
                                 self.pkg,
                                 self.pkg + ".json")
        return ExternalFileTarget(json_path)

    def run(self):
        """Write a JSON object mapping each APK version to its smali entries."""
        smali_by_version = {}

        for dep_output in self.input():
            loc_target = dep_output['loc']
            with loc_target.open() as in_file:
                # the second element returned by get_apk_data is used as key
                _, version, _ = commons().get_apk_data(loc_target.path)
                entries = json.load(in_file)

                # going through a set drops duplicate entries
                smali_by_version[version] = list(set(entries))

        with self.output().open('w') as out_file:
            json.dump(smali_by_version, out_file, indent=2)
class TestDockerBuildBase(DockerBuildBase):
    """Test implementation of ``DockerBuildBase`` with two analyze goals."""

    goals = luigi.ListParameter([])

    def get_goal_class_map(self) -> Dict[str, DockerAnalyzeImageTask]:
        """Map each goal name to a freshly created child analyze task."""
        first_task = self.create_child_task(
            task_class=TestDockerBuildBaseTestAnalyzeImage,
            task_name="test-analyze-image-1")
        second_task = self.create_child_task(
            TestDockerBuildBaseTestAnalyzeImage,
            task_name="test-analyze-image-2")
        return {
            "test-analyze-image-1": first_task,
            "test-analyze-image-2": second_task,
        }

    def get_default_goals(self) -> Set[str]:
        """Only the first analyze goal is a default goal."""
        return {"test-analyze-image-1"}

    def get_goals(self):
        """Return the goals given through the ``goals`` parameter."""
        return self.goals

    def run_task(self):
        """Run the build tasks as dependencies and return their image infos."""
        tasks_to_build = self.create_build_tasks(False)
        futures = yield from self.run_dependencies(tasks_to_build)
        self.return_object(self.get_values_from_futures(futures))
class PostImportDatabaseTask(SchemaManagementTask):
    """
    Task needed to run after importing database into warehouse.
    """

    # Override the standard roles here since these tables will be rather raw. We may want to restrict access to a
    # subset of users.
    roles = luigi.ListParameter(
        config_path={
            'section': 'vertica-export',
            'name': 'business_intelligence_team_roles',
        },
    )

    @property
    def queries(self):
        """SQL statements that swap the loading schema in and grant access.

        In order: drop the existing schema, rename the loading schema to it,
        then grant usage and select rights to ``vertica_roles``.
        """
        templates = (
            "DROP SCHEMA IF EXISTS {schema} CASCADE;",
            "ALTER SCHEMA {schema_loading} RENAME TO {schema};",
            "GRANT USAGE ON SCHEMA {schema} TO {roles};",
            "GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {roles};",
        )
        # str.format ignores keyword arguments a template does not use
        values = {
            'schema': self.schema,
            'schema_loading': self.schema_loading,
            'roles': self.vertica_roles,
        }
        return [template.format(**values) for template in templates]

    @property
    def marker_name(self):
        """Marker name built from the schema and the ISO-formatted date."""
        date_text = self.date.strftime('%Y-%m-%d')
        return 'post_database_import_{schema}_{date}'.format(
            schema=self.schema, date=date_text)
class lvl1(luigi.Task):
    """Task that passes ``files`` to ``dakd.extract.prepareFiles``.

    The output location is read from the configuration returned by
    ``initConf()`` when the task is constructed.
    """
    files = luigi.ListParameter()
    outputpath = ""

    def __init__(self, *args, **kwargs):
        super(lvl1, self).__init__(*args, **kwargs)
        self.conf = initConf()
        self.preparePaths()

    @basicLoggerDecorator(pipelineLogger)
    def preparePaths(self):
        """Copy the ``lvl1_outputpath`` config entry onto the instance."""
        self.outputpath = self.conf['lvl1_outputpath']

    @basicLoggerDecorator(pipelineLogger)
    def requires(self):
        """This task has no upstream dependencies."""
        return []

    @basicLoggerDecorator(pipelineLogger)
    def output(self):
        """Local target at the configured output path."""
        return luigi.LocalTarget(self.outputpath)

    @basicLoggerDecorator(pipelineLogger)
    def run(self):
        """Open the output for writing and let ``prepareFiles`` fill it."""
        target = self.output()
        with target.open('w') as out_file:
            dakd.extract.prepareFiles(self.files, out_file)
# Example #13
# 0
class RGBAnnotationMapImage(luigi.Task):
    """Render an RGB annotation map image for a data array stored on disk.

    The image is written next to ``input_path``; its name encodes the
    selected ``rgb_components``.
    """
    input_path = luigi.Parameter()
    rgb_components = luigi.ListParameter(default=[0, 1, 2])
    src_data_path = luigi.Parameter()
    render_tiles = luigi.BoolParameter(default=False)

    def make_plot(self, da_emb):
        """Create the figure for ``da_emb``.

        Returns whatever ``make_rgb_annotation_map_image`` returns; ``run``
        unpacks it as ``(fig, axes)``.
        """
        return make_rgb_annotation_map_image(
            da=da_emb,
            rgb_components=self.rgb_components,
            # This class declares `src_data_path`; there is no `dataset_path`
            # attribute, so reading it raised AttributeError.
            dataset_path=self.src_data_path,
        )

    def run(self):
        """Open the input data array, plot it and save the image."""
        da_emb = xr.open_dataarray(self.input_path)
        fig, _axes = self.make_plot(da_emb=da_emb)

        # `.path` replaces the deprecated `LocalTarget.fn` alias
        out_path = Path(self.output().path)
        out_path.parent.mkdir(exist_ok=True, parents=True)
        # Save through the figure itself: `fig` is not a savefig keyword.
        fig.savefig(str(out_path), bbox_inches="tight")

    def output(self):
        """Target beside the input file, with ".nc" replaced by the image suffix."""
        image_fullpath = Path(self.input_path)
        src_path, src_fn = image_fullpath.parent, image_fullpath.name

        components = "_".join(str(v) for v in self.rgb_components)
        fn_out = src_fn.replace(
            ".nc",
            ".rgb_map.{}__comp.png".format(components),
        )

        return luigi.LocalTarget(str(src_path / fn_out))
class PreprocessingPipeline(luigi.Task):
    """Top-level task that requires the preprocessing steps of all datasets."""

    datasets = luigi.ListParameter(description="Names of the datasets to use")
    export_csv = luigi.BoolParameter(
        description="If specified, exports spectra as csv files",
        significant=False,
        visibility=luigi.parameter.ParameterVisibility.HIDDEN)
    pool_size = luigi.IntParameter(
        default=os.cpu_count() or 1,
        description='Size of parallel pool to use for computations. '
        'Choose carefully to not exceed the memory.',
        significant=False,
        visibility=luigi.parameter.ParameterVisibility.HIDDEN)

    def requires(self):
        """Yield the required tasks, grouped by task type.

        Order: ``AssembleMetadata`` for every dataset, then ``MergeDataset``
        for every dataset and, only if ``export_csv`` is set, ``ExportCsv``
        for every dataset.
        """
        for name in self.datasets:
            yield AssembleMetadata(dataset=name, pool_size=self.pool_size)

        # these task types additionally receive the full dataset list
        dependent_types = [MergeDataset]
        if self.export_csv:
            dependent_types.append(ExportCsv)
        for task_type in dependent_types:
            for name in self.datasets:
                yield task_type(dataset=name,
                                datasets=self.datasets,
                                pool_size=self.pool_size)
# Example #15
# 0
class NeuralNetworkClassificatorTask(ClassificatorTask):
    """Classification task backed by scikit-learn's ``MLPClassifier``."""

    _name = 'neural_network'

    solver = luigi.Parameter(default='lbfgs')
    activation = luigi.Parameter(default='relu')
    hidden_layer_sizes = luigi.ListParameter()
    # either the literal 'auto' or a value that int() can convert
    batch_size = luigi.Parameter(default='auto')

    def build_and_train(self, x, y):
        """Fit the classifier on ``x``/``y`` and return the fitted model.

        ``x`` must be three-dimensional; its last two axes are flattened
        into one before fitting. The training score is stored in
        ``self.meta['score']``.
        """
        from sklearn.neural_network import MLPClassifier
        from sklearn.multiclass import OneVsRestClassifier

        dim0, n_words, vec_size = x.shape
        self.meta.update({
            "word_vec_size": vec_size,
            "len_words": n_words,
        })
        x_train = x.reshape(dim0, n_words * vec_size)

        if self.batch_size == 'auto':
            batch_size = 'auto'
        else:
            batch_size = int(self.batch_size)
        model = MLPClassifier(
            hidden_layer_sizes=list(self.hidden_layer_sizes),
            solver=self.solver,
            activation=self.activation,
            batch_size=batch_size,
        )
        if self.ovr_strategy:
            model = OneVsRestClassifier(model)

        model.fit(x_train, y)
        # the score is computed on the same data the model was fitted on
        score = model.score(x_train, y)
        self.meta['score'] = score
        self.set_status_message(f'Model fit complete. Score {score}')

        return model
# Example #16
# 0
class UploadFilesToAzureAndRecord(luigi.Task):
    """
    Records the per-file upload results and the whole set in the task DB.
    """
    part_id = luigi.Parameter()
    path_list = luigi.ListParameter()

    task_namespace = 'azure'

    def requires(self):
        """The upload task for this part and path list."""
        return UploadFilesToAzure(self.part_id, self.path_list)

    def run(self):
        """Touch every missing per-file target, then this task's own target."""
        for path in self.path_list:
            file_record = UploadToAzure(path).output()
            if file_record.exists():
                continue
            file_record.touch()

        # Finally mark the whole set as done:
        self.output().touch()

    def output(self):
        """Task DB target in 'azure_upload_set', keyed by the part id."""
        return taskdb_target('azure_upload_set', '%s UPLOADED' % self.part_id)
# Example #17
# 0
class MergeBam(SlurmExecutableTask, CheckTargetNonEmpty):
    """Merge the BAM files of several libraries with ``samtools merge``."""

    base_dir = luigi.Parameter(significant=False)
    scratch_dir = luigi.Parameter(default="/tgac/scratch/buntingd/", significant=False)

    lib_list = luigi.ListParameter()
    output_prefix = luigi.Parameter()
    library = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 16000
        self.n_cpu = 3
        self.partition = "nbi-medium"

    def requires(self):
        """One cloned ``Library.MarkDuplicates`` task per library."""
        return [self.clone(Library.MarkDuplicates, library=lib) for lib in self.lib_list]

    def output(self):
        """Target for ``merged.bam`` under ``scratch_dir``."""
        bam_path = os.path.join(self.scratch_dir, VERSION, PIPELINE,
                                self.output_prefix, 'merged.bam')
        return LocalTarget(bam_path)

    def work_script(self):
        """Return the bash script that merges the input BAMs.

        The input paths are written, one per line, to a temporary file that
        is passed to ``samtools merge -b``; the result is moved into place
        last.
        """
        self.temp = TemporaryFile()
        input_listing = "\n".join(target.path for target in self.input())
        script = '''#!/bin/bash
        source samtools-1.3;
        set -euo pipefail

        echo '{input}' > {temp}
        samtools merge -f  {output}.temp.bam -b {temp} --threads 2

        mv {output}.temp.bam {output}
        '''
        return script.format(input=input_listing,
                             output=self.output().path,
                             temp=self.temp.path)
class DownloadArticle(luigi.Task):
    """Download the articles linked from each listing page into one pickle."""

    lexology_urls = luigi.ListParameter()  # luigi parameter

    def run(self):
        """Fetch every listing page, download its articles, pickle the result.

        Links are taken from the ``<a href=...>`` elements found inside
        ``<h4>`` elements of each page. An article that fails to download is
        reported on stdout and skipped.
        """
        # Collected across ALL listing pages. Resetting this inside the loop
        # kept only the last page's articles and left the name unbound when
        # no URL was given.
        all_text = []

        for lexology_url in self.lexology_urls:
            response = requests.get(lexology_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            article_link_list = [
                link.get('href')
                for header in soup.select('h4')
                for link in header.find_all('a', href=True)
            ]
            for article_link in article_link_list:
                try:
                    all_text.append(get_article(article_link))
                except Exception as e:
                    # best effort: one broken article must not stop the run
                    print(f'error in {article_link}, {e}')

        df = pd.DataFrame(all_text,
                          columns=['article_title', 'article_date', 'text'])
        # pandas writes to the path itself, so only the parent directory is
        # needed; opening the target just to read its path leaked a handle.
        target = self.output()
        target.makedirs()
        df.to_pickle(target.path)

    def output(self):
        """Local target for the pickled articles."""
        return luigi.LocalTarget('data/raw/raw_articles.pkl')
class PairwiseDistanceWorkflow(WorkflowBase):
    """Workflow that runs object distances, then merges the pairwise results."""

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    morphology_path = luigi.Parameter()
    morphology_key = luigi.Parameter()
    output_path = luigi.Parameter()
    max_distance = luigi.FloatParameter()
    resolution = luigi.ListParameter()
    max_size = luigi.IntParameter(default=None)

    def requires(self):
        """Return the merge task, which depends on the object-distance task."""
        # the task class is resolved by name from the distance_tasks module
        task_name = self._get_task_name('ObjectDistances')
        distance_cls = getattr(distance_tasks, task_name)
        distances = distance_cls(
            tmp_folder=self.tmp_folder,
            max_jobs=self.max_jobs,
            config_dir=self.config_dir,
            input_path=self.input_path,
            input_key=self.input_key,
            morphology_path=self.morphology_path,
            morphology_key=self.morphology_key,
            max_distance=self.max_distance,
            resolution=self.resolution,
            max_size=self.max_size,
        )
        return MergePairwiseDistances(
            tmp_folder=self.tmp_folder,
            max_jobs=self.max_jobs,
            output_path=self.output_path,
            dependency=distances,
        )

    @staticmethod
    def get_config():
        """Return the parent's configs plus the 'object_distances' defaults."""
        configs = super(PairwiseDistanceWorkflow, PairwiseDistanceWorkflow).get_config()
        distance_defaults = distance_tasks.ObjectDistancesLocal.default_task_config()
        configs.update({'object_distances': distance_defaults})
        return configs
# Example #20
# 0
class MergeSampleBams(luigi.Task):
    """Merge Multiple Bam Files for One Sample and Coverage Statistics

    Attributes:
        inbams (list): a list of bam files
        outbam (str): output bam filename

    Output:
        - {outdir}/mapping/{sample}.merged.bam
    """
    resources = {"cpu": 2, "memory": 1}
    outbam = luigi.Parameter()
    inbams = luigi.ListParameter()

    def requires(self):
        """This task has no upstream dependencies."""
        return []

    def output(self):
        """Local target at ``outbam``."""
        return luigi.LocalTarget(self.outbam)

    def run(self):
        """Merge ``inbams`` into ``outbam`` and index it via a shell pipeline.

        Raises:
            subprocess.CalledProcessError: if the pipeline exits non-zero.
        """
        cmd = (
            "samtools merge - {bams} | tee {outfile} "
            "| samtools index - {outfile}.bai "
        ).format(bams=" ".join(self.inbams), outfile=self.outbam)
        logging.info(cmd)
        # check=True: `tee` creates the output file even when the merge
        # fails, so an ignored exit status would leave the task looking
        # complete. The shell reports the status of the last pipeline stage.
        subprocess.run(cmd, shell=True, check=True)
# Example #21
# 0
class CreateSequenceDictionary(FtarcTask):
    """Run ``<gatk> CreateSequenceDictionary`` on a FASTA file."""

    fa_path = luigi.Parameter()
    gatk = luigi.Parameter(default='gatk')
    add_createsequencedictionary_args = luigi.ListParameter(default=list())
    n_cpu = luigi.IntParameter(default=1)
    memory_mb = luigi.FloatParameter(default=4096)
    sh_config = luigi.DictParameter(default=dict())
    priority = 70

    def output(self):
        """Target ``<stem>.dict`` beside the resolved FASTA path."""
        fa = Path(self.fa_path).resolve()
        return luigi.LocalTarget(fa.parent / f'{fa.stem}.dict')

    def run(self):
        """Set up the shell with GATK's Java options and build the dictionary."""
        run_id = Path(self.fa_path).stem
        self.print_log(f'Create a sequence dictionary:\t{run_id}')
        fa = Path(self.fa_path).resolve()
        seq_dict_path = self.output().path

        java_options = self.generate_gatk_java_options(
            n_cpu=self.n_cpu, memory_mb=self.memory_mb)
        self.setup_shell(run_id=run_id,
                         commands=self.gatk,
                         cwd=fa.parent,
                         **self.sh_config,
                         env={'JAVA_TOOL_OPTIONS': java_options})

        # each extra argument is appended with a single leading space
        extra_args = ''.join(
            f' {a}' for a in self.add_createsequencedictionary_args)
        command = (
            f'set -e && {self.gatk} CreateSequenceDictionary'
            f' --REFERENCE {fa}'
            f'{extra_args}'
            f' --OUTPUT {seq_dict_path}'
        )
        self.run_shell(args=command,
                       input_files_or_dirs=fa,
                       output_files_or_dirs=seq_dict_path)
# Example #22
# 0
class Freebayes(luigi.Task):
    """Freebayes Calling Variants

    Attributes:
        inbam (list): input dedup bam file(s); a single path string is
            accepted as well
        outvcf (str): output vcf file

    """
    resources = {"cpu": 1, "memory": 1}
    inbam = luigi.ListParameter()
    outvcf = luigi.Parameter()

    def requires(self):
        """This task has no upstream dependencies."""
        return []

    def output(self):
        """Local target at ``outvcf``."""
        return luigi.LocalTarget(self.outvcf)

    def run(self):
        """Run freebayes on the input BAM(s), redirecting stdout to ``outvcf``.

        Raises:
            subprocess.CalledProcessError: if freebayes exits non-zero.
        """
        # A ListParameter value is a sequence; formatting it directly put a
        # tuple repr such as "('a.bam',)" on the command line. A plain
        # string is kept as a single path.
        bams = [self.inbam] if isinstance(self.inbam, str) else self.inbam
        cmd = "freebayes -f {genome} {bam} > {outfile}".format(
            genome=Reference().genome,
            bam=" ".join(bams),
            outfile=self.outvcf,
        )
        # bcftools filter -e 'QUAL < 20' -s LOWQUAL {rawvcf} {outvcf}
        logging.info(cmd)
        # check=True: the redirect creates the output file even when
        # freebayes fails, so the exit status must not be ignored.
        subprocess.run(cmd, shell=True, check=True)
# Example #23
# 0
class WideRecommender(ClassifierWithTransferLearningKerasModelTraining):
    """Keras training task whose model is one wide dense layer plus a sigmoid."""

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Build the 'Wide' model: input -> dense layer -> one sigmoid unit.

        The dense layer has as many units as the first entry of
        ``input_shape``.
        """
        x_input = Input(shape=self.input_shape, name='wide_inp')

        wide_layer = Dense(self.input_shape[0],
                           activation=self.activation_function,
                           kernel_initializer=self.kernel_initializer,
                           name='wide_mlp')
        wide = wide_layer(x_input)

        sigmoid_layer = Dense(1,
                              activation='sigmoid',
                              kernel_initializer=self.kernel_initializer)
        output = sigmoid_layer(wide)

        return Model(x_input, output, name='Wide')

    def create_model_with(self, base_model: Model) -> Model:
        """Return ``base_model`` unchanged."""
        return base_model
# Example #24
# 0
class Mpileup(luigi.Task):
    """Bcftools Mpileup calling variants

    Attributes:
        inbam (list): input dedup bam file(s); a single path string is
            accepted as well
        outvcf (str): output vcf file

    """
    resources = {"cpu": 1, "memory": 1}
    inbam = luigi.ListParameter()
    outvcf = luigi.Parameter()

    def requires(self):
        """This task has no upstream dependencies."""
        return []

    def output(self):
        """Local target at ``outvcf``."""
        return luigi.LocalTarget(self.outvcf)

    def run(self):
        """Pipe ``bcftools mpileup`` into ``bcftools call`` to write ``outvcf``.

        Raises:
            subprocess.CalledProcessError: if the pipeline exits non-zero.
        """
        # A ListParameter value is a sequence; formatting it directly put a
        # tuple repr on the command line. A plain string is kept as one path.
        bams = [self.inbam] if isinstance(self.inbam, str) else self.inbam
        # The template's placeholder is {outvcf}; passing the value as
        # `vcf=` raised KeyError before the command was ever run.
        cmd = (
            "bcftools mpileup -f {genome} {bam} "
            "| bcftools call -mv --ploidy {ploidy} -o {outvcf}"
        ).format(
            bam=" ".join(bams),
            outvcf=self.outvcf,
            genome=Reference().genome,
            ploidy=Reference().genome_version,
        )
        logging.info(cmd)
        # check=True so that a failing call is not silently ignored; the
        # shell reports the status of the last pipeline stage.
        subprocess.run(cmd, shell=True, check=True)
# Example #25
# 0
class AbyssSealerReduced(CheckTargetNonEmpty, SlurmExecutableTask):
    """Run ``abyss-sealer`` on the remapped scaffolds with several k values."""

    sealer_klist = luigi.ListParameter()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 6000
        self.n_cpu = 2
        self.partition = "nbi-medium"

    def output(self):
        """Target ``K<K>_scaffold.fa`` inside the ``K<K>`` sealer directory."""
        k_label = 'K' + str(self.K)
        return LocalTarget(os.path.join(self.base_dir, PIPELINE, VERSION,
                                        "sealer", "SOAP", k_label,
                                        k_label + "_scaffold.fa"))

    def requires(self):
        """One bloom filter task per sealer k, plus the scaffolds task."""
        bloom_tasks = [self.clone(Assemble.AbyssBloomBuild, bloom_k=k)
                       for k in self.sealer_klist]
        return {'bloomfilters': bloom_tasks,
                'scaffolds': self.clone(SOAPNremap)}

    def work_script(self):
        """Return the bash script that runs the sealer.

        Results are written to a ``temp`` subdirectory of the output
        directory and moved up one level at the end.
        """
        inputs = self.input()
        k_flags = ' '.join('-k' + str(k) for k in self.sealer_klist)
        bloom_flags = ' '.join('-i ' + x.path for x in inputs['bloomfilters'])
        script = '''#!/bin/bash
                    source abyss-2.0.2;
                    mkdir -p {output}/temp
                    set -euo pipefail

                    abyss-sealer {k_args} -P25 --flank-length=150 -j {n_cpu} -o {output}/temp/{prefix} -S {scaffolds} {bloomfilters}

                    mv {output}/temp/{prefix}* {output}/
        '''
        return script.format(k_args=k_flags,
                             bloomfilters=bloom_flags,
                             scaffolds=inputs['scaffolds'].path,
                             n_cpu=self.n_cpu,
                             output=os.path.dirname(self.output().path),
                             prefix='K' + str(self.K))
# Example #26
# 0
class _Tracking2DExtraction(luigi.Task):
    """
    Base task for extracting fields from object tracking in 2D. Do not
    instantiate it directly; use TrackingVariable2D or TrackingLabels2D.
    """

    base_name = luigi.Parameter()
    track_without_gal_transform = luigi.BoolParameter(default=False)
    tracking_type = luigi.EnumParameter(enum=TrackingType)
    tracking_timestep_interval = luigi.ListParameter(default=[])

    def requires(self):
        """Return the 2D object tracking task this extraction depends on.

        When ``track_without_gal_transform`` is set, the dataset's ``U_gal``
        meta value is passed on as the tracking offset.

        Raises:
            Exception: if the offset is requested but the dataset meta info
                has no ``U_gal`` entry.
        """
        tracking_offset = None
        if self.track_without_gal_transform:
            dataset_meta = _get_dataset_meta_info(self.base_name)
            tracking_offset = dataset_meta.get("U_gal", None)
            if tracking_offset is None:
                message = (
                    "To remove the Galilean transformation before tracking"
                    " please define the transform velocity"
                    " as `U_gal` in datasources.yaml for"
                    " dataset `{}`"
                ).format(self.base_name)
                raise Exception(message)

        return PerformObjectTracking2D(
            base_name=self.base_name,
            tracking_type=self.tracking_type,
            timestep_interval=self.tracking_timestep_interval,
            U_offset=tracking_offset,
        )
# Example #27
# 0
class RunFeatureUnion(PickleTask):
    """Fit and time one concatenator classifier per signature level."""

    dataset = luigi.Parameter()
    levels = luigi.ListParameter()
    sig_type = luigi.Parameter(default='logsig')

    def output(self):
        """Target ``<sig_type>_concat_<levels>.pkl`` in the dataset folder."""
        joined_levels = '_'.join(str(level) for level in self.levels)
        filename = f"{self.sig_type}_concat_{joined_levels}.pkl"
        return luigi.LocalTarget(PIPELINE_DIR / self.dataset / filename)

    def run(self):
        """Dump a table of test score and fit time, indexed by level."""
        X_train, y_train, X_test, y_test = load_data(self.dataset)
        logit = LogisticRegression(random_state=42)

        rows = []
        for level in self.levels:
            model = classifiers.create_concatenator(logit,
                                                    sig_type=self.sig_type,
                                                    level=level)
            # only the call to fit() is timed
            t_start = timeit.default_timer()
            model.fit(X_train, y_train)
            fit_seconds = timeit.default_timer() - t_start
            rows.append([model.score(X_test, y_test), fit_seconds])

        results = pd.DataFrame(rows,
                               columns=["Score", "Elapsed"],
                               index=self.levels)
        self.dump(results)
class MultiSampleWorkflow(sl.WorkflowTask):
    """
    This workflow is meant to take an entire dataset description and run the SingleSampleWorkflow on each sample
    """
    midas_db = sl.Parameter()
    dataset_description = sl.Parameter()
    workdir = sl.Parameter()
    contaminant_removal_method = sl.Parameter(default="bbsplit")
    filter_genomes = luigi.ListParameter()
    ref_info_dir = sl.Parameter()
    ref_combo_hash = sl.Parameter()

    def workflow(self):
        """Return the tasks to run: an optional index task, then one per sample.

        The contamination-removal index task is created only when
        ``filter_genomes`` is non-empty.
        """
        # NOTE(review): json.load expects a file-like object; confirm what
        # callers pass as `dataset_description` (a path or a JSON string
        # would need open()/json.loads instead).
        dataset_spec = json.load(self.dataset_description)
        tasks = []
        if self.filter_genomes:
            index_task = self.new_task("ref_index",
                                       CreateIndexForContamRemoval)
            # Append inside the branch: `index_task` is unbound when there
            # are no genomes to filter, and the unconditional append raised
            # UnboundLocalError.
            tasks.append(index_task)
        # Samples are in an array in the json. Each sample has a prefix and two read files
        for sample in dataset_spec["samples"]:
            wf = self.new_task('SampleWorkflow_' + sample["prefix"],
                               SingleSampleWorkflow,
                               workdir=self.workdir,
                               prefix=sample["prefix"],
                               in_fastq1=sample["in_fastq1"],
                               in_fastq2=sample["in_fastq2"],
                               midas_db=self.midas_db,
                               filter_genomes=self.filter_genomes,
                               ref_info_dir=self.ref_info_dir,
                               ref_combo_hash=self.ref_combo_hash)
            tasks.append(wf)
        return tasks
# Example #29
# 0
class AggregateArtists(luigi.Task):
    """Count streams per artist over the stream files of several months."""

    months = luigi.ListParameter()
    user_id = luigi.Parameter()

    def output(self):
        """CSV target named after the user id and the joined months."""
        return luigi.LocalTarget(
            "output/artist_streams_{}_{}.csv".format(
                self.user_id, "-".join(self.months)
            ),
            format=luigi.format.Nop,
        )

    def requires(self):
        """One ``Streams`` task per month."""
        return [Streams(month, self.user_id) for month in self.months]

    def run(self):
        """Concatenate all monthly CSVs and write the per-artist counts.

        Raises:
            ValueError: from ``pd.concat`` when there are no inputs.
        """
        frames = []
        for t in self.input():
            with t.open("r") as in_file:
                frames.append(pd.read_csv(in_file))
        # One concat over all frames: DataFrame.append was removed in
        # pandas 2.0 and copied the accumulated frame on every call.
        main_df = pd.concat(frames)

        counts = (
            main_df.groupby("Artist Name")
            .count()["Apple Id Number"]
            .sort_values(ascending=False)
            .rename("Count")
        )

        with self.output().open("w") as out_file:
            counts.to_csv(out_file)
# Example #30
# 0
class B(PipeTask):
    """ B required by A """
    int_array = luigi.ListParameter(default=None)

    def pipe_run(self):
        """Print the type of ``int_array`` and return the value unchanged."""
        value = self.int_array
        print("B saving type [{}]".format(type(value)))
        return value