def workflow(self):
    # Input files are located either in SRA or AWS S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Make tasks that will make sure the reference databases exist
    ref_fasta = self.new_task("load_ref_fasta", LoadFile, path=self.ref_fasta)

    # Keep track of all of the jobs for getting the input files
    tasks_load_inputs = {}
    # Keep track of all of the jobs for aligning against the FASTA
    tasks_align_bwa = {}

    # Iterate over all of the rows of samples
    for ix, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # If the inputs are on SRA, execute jobs that will download them
        if self.input_location == "SRA":
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_sra_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=4096,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            # Make sure the file exists on S3
            assert self.input_location == "S3"
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)

        # Make a task to align the reads, wherever they came from
        tasks_align_bwa[sample_name] = self.new_task(
            "align_bwa_{}".format(sample_name),
            AlignFastqTask,
            ref_name=self.ref_name,
            base_s3_folder=self.base_s3_folder,
            sample_name=sample_name,
            threads=self.align_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.align_threads),
                mem=int(self.align_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

    # Assign the output from tasks_load_inputs to the input to tasks_align_bwa
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_align_bwa
        # Assign the input for the reference database
        tasks_align_bwa[sample_name].in_ref_fasta = ref_fasta.out_file
        # NOTE: LoadFile tasks elsewhere in this codebase expose out_file
        # rather than out_fastq, so the S3 branch may need out_file here
        tasks_align_bwa[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_fastq

    return tasks_align_bwa
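# A minimal sketch of the metadata-sheet validation that the workflows in this
# file repeat inline, pulled out as a standalone helper for illustration. The
# column-presence and uniqueness rules mirror the asserts above;
# validate_metadata itself is hypothetical and not part of the pipeline.
import pandas as pd


def validate_metadata(metadata_fp, sample_col, input_col, sep="\t"):
    # Read the sheet the same way the workflows do
    metadata = pd.read_table(metadata_fp, sep=sep)
    for col_name in [sample_col, input_col]:
        # Every required column must be present...
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, metadata_fp)
        # ...and every value in it must be unique (one row per sample/file)
        assert metadata[col_name].unique().shape[0] == metadata.shape[0], \
            "Duplicate values in column {}".format(col_name)
    return metadata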
def workflow(self):
    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Keep track of the jobs for each step, for each sample
    tasks_load_inputs = {}
    tasks_fastqp = {}
    tasks_humann = {}

    # Iterate over all of the rows of samples
    for ix, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # Make a UUID to isolate temp files for this task from any others
        task_uuid = str(uuid.uuid4())[:8]

        # 1. LOAD THE INPUT FILES
        tasks_load_inputs[sample_name] = self.new_task(
            "load_from_s3_{}".format(sample_name),
            LoadFile,
            path=input_path)

        # 2. CALCULATE FASTQ QUALITY METRICS
        tasks_fastqp[sample_name] = self.new_task(
            "fastqp_{}".format(sample_name),
            FastqpTask,
            summary_path=os.path.join(self.base_s3_folder, "fastqp",
                                      sample_name + ".fastqp.tsv"),
            input_mount_point="/scratch/{}_fastqp/input/".format(task_uuid),
            output_mount_point="/scratch/{}_fastqp/output/".format(task_uuid),
            containerinfo=sl.ContainerInfo(
                vcpu=1,
                mem=10000,
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "fastqp_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # 3. ANALYZE WITH HUMAnN2
        tasks_humann[sample_name] = self.new_task(
            "humann2_{}".format(sample_name),
            HUMAnN2Task,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "humann2"),
            threads=self.humann2_threads,
            ref_db=self.humann2_ref_db,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.humann2_threads),
                mem=int(self.humann2_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "humann2_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    },
                    "/refdbs": {
                        "bind": "/refdbs",
                        "mode": "ro"
                    }
                }))

    # Assign the output from tasks_load_inputs to the inputs of the
    # downstream fastqp and HUMAnN2 tasks
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_fastqp
        tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file
        assert sample_name in tasks_humann
        tasks_humann[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file

    return tasks_fastqp, tasks_humann
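# The workflows above truncate a UUID4 to eight characters so that each task's
# scratch mount points stay distinct; two tasks for the same sample would
# otherwise share "/scratch/.../input/" and clobber each other. A small sketch
# of that pattern as a hypothetical helper (eight characters makes collisions
# merely improbable, not impossible):
import uuid


def scratch_mounts(step_name):
    # e.g. ("/scratch/3f2a1b9c_fastqp/input/", "/scratch/3f2a1b9c_fastqp/output/")
    task_uuid = str(uuid.uuid4())[:8]
    return (
        "/scratch/{}_{}/input/".format(task_uuid, step_name),
        "/scratch/{}_{}/output/".format(task_uuid, step_name),
    )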
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')

    #
    # Load the manifest of files
    #
    manifest = self.new_task(
        'load_manifest',
        LoadManifest,
        path=self.manifest,
    )

    # For each specimen....
    specimen_tasks = defaultdict(dict)
    specimens = manifest.get_specimens()
    for specimen in specimens:
        # Load the specimen reads.
        specimen_tasks[specimen]['reads'] = self.new_task(
            'specimen_load_{}'.format(specimen),
            LoadSpecimenReads,
            specimen=specimen)
        specimen_tasks[specimen]['reads'].in_manifest = manifest.out_file

        if self.barcodecop and "I1" in specimen_tasks[specimen][
                'reads'].out_reads() and manifest.is_paired():
            specimen_tasks[specimen]['verified_reads'] = self.new_task(
                'specimen_bcc_{}'.format(specimen),
                BCCSpecimenReads,
                containerinfo=light_containerinfo,
                specimen=specimen,
                path=os.path.join(self.working_dir, 'sv', 'bcc'))
            specimen_tasks[specimen][
                'verified_reads'].in_reads = specimen_tasks[specimen][
                    'reads'].out_reads
        else:
            specimen_tasks[specimen]['verified_reads'] = specimen_tasks[
                specimen]['reads']

        # DADA2 filter and trim
        specimen_tasks[specimen]['dada2_ft'] = self.new_task(
            'dada2_ft_{}'.format(specimen),
            DADA2_FilterAndTrim,
            containerinfo=light_containerinfo,
            specimen=specimen,
            f_trunc=self.truncLenF,
            r_trunc=self.truncLenR,
            trim_left=self.trimLeft,
            maxN=self.maxN,
            maxEE=self.maxEE,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'ft'))
        specimen_tasks[specimen]['dada2_ft'].in_reads = specimen_tasks[
            specimen]['verified_reads'].out_reads

        # DADA2 dereplicate
        specimen_tasks[specimen]['dada2_derep'] = self.new_task(
            'dada2_derep_{}'.format(specimen),
            DADA2_Dereplicate,
            containerinfo=light_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'derep'))
        specimen_tasks[specimen]['dada2_derep'].in_reads = specimen_tasks[
            specimen]['dada2_ft'].out_reads

    # Now we need the specimens grouped by batch to create error models.
    batch_errModels = {}
    for batch, batched_specimens in manifest.batched_specimens():
        batch_errModels[batch] = self.new_task(
            'dada2_learn_error_batch_{}'.format(batch),
            DADA2_LearnError,
            containerinfo=midcpu_containerinfo,
            batch=batch,
            tar_reads=False,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'errM'))
        batch_errModels[batch].in_reads = [
            specimen_tasks[s]['dada2_ft'].out_reads
            for s in specimen_tasks if s in batched_specimens
        ]
        for specimen in batched_specimens:
            specimen_tasks[specimen]['dada2_errM'] = batch_errModels[batch]

    # Back to for each specimen...
    for specimen in specimens:
        # DADA
        specimen_tasks[specimen]['dada2_dada'] = self.new_task(
            'dada2_dada_{}'.format(specimen),
            DADA2_DADA,
            containerinfo=midcpu_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'dada'))
        specimen_tasks[specimen]['dada2_dada'].in_derep = specimen_tasks[
            specimen]['dada2_derep'].out_rds
        specimen_tasks[specimen]['dada2_dada'].in_errM = specimen_tasks[
            specimen]['dada2_errM'].out_rds

        # MERGE
        specimen_tasks[specimen]['dada2_merge'] = self.new_task(
            'dada2_merge_{}'.format(specimen),
            DADA2_Merge,
            containerinfo=light_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'merged'))
        specimen_tasks[specimen]['dada2_merge'].in_dada = specimen_tasks[
            specimen]['dada2_dada'].out_rds
        specimen_tasks[specimen]['dada2_merge'].in_derep = specimen_tasks[
            specimen]['dada2_derep'].out_rds

        # Seqtab
        specimen_tasks[specimen]['dada2_seqtab'] = self.new_task(
            'dada2_seqtab_{}'.format(specimen),
            DADA2_Specimen_Seqtab,
            containerinfo=light_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'seqtab'))
        specimen_tasks[specimen]['dada2_seqtab'].in_merge = specimen_tasks[
            specimen]['dada2_merge'].out_rds

    # Combine seqtabs by batch
    batch_seqtab = {}
    for batch, batched_specimens in manifest.batched_specimens():
        batch_seqtab[batch] = self.new_task(
            'dada2_combine_seqtabs_{}'.format(batch),
            DADA2_Combine_Seqtabs,
            containerinfo=light_containerinfo,
            fn=os.path.join(
                self.working_dir, 'sv', 'dada2', 'seqtab', 'batches',
                'seqtab.{}.combined.rds'.format(batch),
            ))
        batch_seqtab[batch].in_seqtabs = [
            specimen_tasks[s]['dada2_seqtab'].out_rds
            for s in specimen_tasks if s in batched_specimens
        ]

    # Now combine all the batch seqtabs into one master seqtab
    combined_seqtab = self.new_task(
        'dada2_combine_seqtabs',
        DADA2_Combine_Seqtabs,
        containerinfo=highmem_containerinfo,
        fn=os.path.join(self.working_dir, 'sv', 'dada2',
                        'seqtab.combined.rds'))
    combined_seqtab.in_seqtabs = [
        st.out_rds for st in batch_seqtab.values()
    ]

    # Remove chimeras
    combined_seqtab_nochim = self.new_task(
        'dada2_remove_chimera',
        DADA2_Remove_Chimera,
        containerinfo=heavy_containerinfo,
        fn_rds=os.path.join(self.working_dir, 'sv', 'dada2',
                            'seqtab.combined.nochim.rds'),
        fn_csv=os.path.join(self.destination_dir,
                            'seqtab.combined.nochim.csv'))
    combined_seqtab_nochim.in_seqtab = combined_seqtab.out_rds

    # Convert the sequence variants to pplacer-ready inputs
    dada2_sv_to_pplacer = self.new_task(
        'dada2_sv_to_pplacer',
        DADA2_SV_to_PPlacer,
        containerinfo=light_containerinfo,
        fasta_fn=os.path.join(self.destination_dir, 'dada2.sv.fasta'),
        weights_fn=os.path.join(self.destination_dir, 'dada2.sv.weights.csv'),
        map_fn=os.path.join(self.destination_dir, 'dada2.sv.map.csv'))
    dada2_sv_to_pplacer.in_seqtab_csv = combined_seqtab_nochim.out_csv

    return dada2_sv_to_pplacer
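# manifest.batched_specimens() is assumed above to yield (batch, specimens)
# pairs so that one DADA2 error model is learned per sequencing batch, since
# specimens from the same run share error characteristics. A plausible sketch
# of that grouping, assuming the manifest is a DataFrame with 'specimen' and
# 'batch' columns (the real LoadManifest task may differ):
import pandas as pd


def batched_specimens(manifest_df):
    # Group specimen IDs by their sequencing batch
    for batch, rows in manifest_df.groupby('batch'):
        yield batch, set(rows['specimen'])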
def workflow(self):
    # Make sure the project name is alphanumeric (underscores allowed)
    assert all([s.isalnum() or s == "_" for s in self.project_name
                ]), "Project name must be alphanumeric"

    # Data can come from either SRA or S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Keep track of the jobs for each step, for each sample
    tasks_load_inputs = {}
    tasks_fastqp = {}
    tasks_metaspades = {}
    tasks_prokka = {}
    tasks_famli = {}

    # Iterate over all of the rows of samples
    for _, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # Make a UUID to isolate temp files for this task from any others
        task_uuid = str(uuid.uuid4())[:8]

        # 1. LOAD THE INPUT FILES
        if self.input_location == "S3":
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)
        elif self.input_location == "SRA":
            assert input_path.startswith("SRR"), input_path
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_SRA_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                input_mount_point="/scratch/{}_get_sra/input/".format(
                    task_uuid),
                output_mount_point="/scratch/{}_get_sra/output/".format(
                    task_uuid),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=32000,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "get_sra_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            raise Exception("Data must be from S3 or SRA")

        # 2. CALCULATE FASTQ QUALITY METRICS
        tasks_fastqp[sample_name] = self.new_task(
            "fastqp_{}".format(sample_name),
            FastqpTask,
            summary_path=os.path.join(self.base_s3_folder, "fastqp",
                                      sample_name + ".fastqp.tsv"),
            input_mount_point="/scratch/{}_fastqp/input/".format(task_uuid),
            output_mount_point="/scratch/{}_fastqp/output/".format(task_uuid),
            containerinfo=sl.ContainerInfo(
                vcpu=1,
                mem=32000,
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "fastqp_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # 3. ASSEMBLE WITH METASPADES
        tasks_metaspades[sample_name] = self.new_task(
            "metaspades_{}".format(sample_name),
            AssembleMetaSPAdes,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "metaspades"),
            threads=self.assemble_threads,
            max_mem=int(int(self.assemble_mem) / 1000),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.assemble_threads),
                mem=int(self.assemble_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "metaspades_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # 4. ANNOTATE ASSEMBLIES WITH PROKKA
        tasks_prokka[sample_name] = self.new_task(
            "prokka_{}".format(sample_name),
            AnnotateProkka,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "prokka"),
            threads=self.assemble_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.assemble_threads),
                mem=int(self.assemble_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "prokka_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

    # Assign the output from tasks_load_inputs to the inputs of the
    # downstream fastqp, metaspades, and prokka tasks
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_fastqp
        if self.input_location == "S3":
            tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        assert sample_name in tasks_metaspades
        if self.input_location == "S3":
            tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        assert sample_name in tasks_prokka
        tasks_prokka[sample_name].in_fasta = tasks_metaspades[
            sample_name].out_fasta

    # 5. COMBINE ASSEMBLIES
    task_integrate_assemblies = self.new_task(
        "integrate_assemblies-{}".format(self.project_name),
        IntegrateAssembliesTask,
        output_prefix=self.project_name,
        output_folder=os.path.join(self.base_s3_folder,
                                   "integrated_assembly"),
        gff_folder=os.path.join(self.base_s3_folder, "prokka"),
        fastp_folder=os.path.join(self.base_s3_folder, "prokka"),
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=8,
            mem=120000,
            engine=self.engine,
            aws_s3_scratch_loc=self.aws_s3_scratch_loc,
            aws_batch_job_poll_sec=120,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_prefix="integrate_assemblies_{}".format(
                self.project_name),
            mounts={
                "/docker_scratch": {
                    "bind": self.temp_folder,
                    "mode": "rw"
                }
            }))
    task_integrate_assemblies.in_fastp_list = [
        t.out_faa for t in tasks_prokka.values()
    ]
    task_integrate_assemblies.in_gff_list = [
        t.out_gff for t in tasks_prokka.values()
    ]

    # 6. ALIGN AGAINST THE ASSEMBLY USING FAMLI
    # Iterate over all of the rows of samples
    for _, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        tasks_famli[sample_name] = self.new_task(
            "famli_{}".format(sample_name),
            FAMLITask,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "famli"),
            threads=self.famli_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.famli_threads),
                mem=int(self.famli_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="famli_{}".format(sample_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Connect the raw FASTQ input
        if self.input_location == "S3":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        # Connect the reference database
        tasks_famli[
            sample_name].in_ref_dmnd = task_integrate_assemblies.out_daa

    return tasks_famli, tasks_fastqp
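# AWS Batch job names tolerate only a narrow character set, which is why the
# workflows above pass every aws_batch_job_prefix through the same re.sub().
# A tiny standalone sketch of that sanitizer (safe_job_prefix is a
# hypothetical name for illustration):
import re


def safe_job_prefix(name):
    # Replace anything outside [a-zA-Z0-9-_] with an underscore
    return re.sub('[^a-zA-Z0-9-_]', '_', name)


# e.g. safe_job_prefix("famli_sample.1/lane2") -> "famli_sample_1_lane2"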
def workflow(self):
    # Make sure that the S3 folder is formatted with the proper prefix
    assert self.s3_folder.startswith("s3://")

    # Parse the bucket and key prefix for the S3 folder holding all results
    s3_bucket, s3_prefix = self.s3_folder[5:].split("/", 1)

    # Connect to S3
    s3 = boto3.resource('s3')

    # 1. Get the summary of all genomes
    genome_metadata_fp = os.path.join(s3_prefix, "patric_genome_metadata.tsv")
    print("Writing PATRIC genome metadata to s3://{}/{}".format(
        s3_bucket, genome_metadata_fp))
    with urlopen("ftp://ftp.patricbrc.org/RELEASE_NOTES/genome_metadata") as fi:
        s3.Bucket(s3_bucket).put_object(
            Key=genome_metadata_fp,
            Body=fi.read())

    # Now read in all of that information as a table
    genome_metadata = read_tsv_from_s3_as_dataframe(
        s3_bucket, genome_metadata_fp, sep="\t")

    # 2. Fetch the transcripts and annotation files for every genome
    fetch_transcripts_tasks = {}
    fetch_annotation_tasks = {}
    for genome_accession in map(str, genome_metadata.index.values):
        # NOTE: both suffixes below write to the same s3_url, so the second
        # transfer will overwrite the first when both files exist upstream
        fetch_annotation_tasks[genome_accession] = [
            self.new_task(
                "fetch_patric_annotations_{}".format(genome_accession),
                TransferFTPtoS3,
                ftp_url="ftp://ftp.patricbrc.org/genomes/{}/{}.{}".format(
                    genome_accession, genome_accession, suffix),
                s3_url=os.path.join(self.s3_folder, genome_accession,
                                    "annotation.tsv"),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=1000,
                    engine=self.engine,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "fetch_patric_annotations_{}".format(
                            genome_accession))))
            for suffix in ["PATRIC.pathway.tab", "RefSeq.pathway.tab"]
        ]
        fetch_transcripts_tasks[genome_accession] = [
            self.new_task(
                "fetch_patric_transcripts_{}".format(genome_accession),
                TransferFTPtoS3,
                ftp_url="ftp://ftp.patricbrc.org/genomes/{}/{}.{}".format(
                    genome_accession, genome_accession, suffix),
                s3_url=os.path.join(self.s3_folder, genome_accession,
                                    "transcripts.frn"),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=1000,
                    engine=self.engine,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "fetch_patric_transcripts_{}".format(
                            genome_accession))))
            for suffix in ["PATRIC.frn", "RefSeq.frn"]
        ]

    # 3. Make a flat file for the 16S records
    extract_all_16S = self.new_task(
        "extract_all_16S",
        Extract16S,
        s3_parent_folder=self.s3_folder,
        s3_url=os.path.join(self.s3_folder, "transcripts.fasta"),
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=1,
            mem=1000,
            engine=self.engine,
            aws_batch_job_poll_sec=120,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_prefix="extract_all_16s"))
    extract_all_16S.in_fastas = [
        genome_transcript[0].out_file
        for genome_transcript in fetch_transcripts_tasks.values()
    ]

    # 4. Make a flat file for the annotations
    extract_all_annotations = self.new_task(
        "extract_all_annotations",
        ExtractAnnotations,
        s3_parent_folder=self.s3_folder,
        s3_url=os.path.join(self.s3_folder, "annotations.tsv"),
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=1,
            mem=1000,
            engine=self.engine,
            aws_batch_job_poll_sec=120,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_prefix="extract_all_annotations"))
    extract_all_annotations.in_fastas = {
        genome_id: genome_transcript[0].out_file
        for genome_id, genome_transcript in fetch_transcripts_tasks.items()
    }
    extract_all_annotations.in_annotations = {
        genome_id: genome_annotation[0].out_file
        for genome_id, genome_annotation in fetch_annotation_tasks.items()
    }

    return extract_all_16S, extract_all_annotations
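# read_tsv_from_s3_as_dataframe() is called above but not defined in this
# section. A minimal sketch of what such a helper could look like, using only
# boto3 and pandas; the actual implementation elsewhere in the repo may
# differ:
import io

import boto3
import pandas as pd


def read_tsv_from_s3_as_dataframe(bucket, key, sep="\t"):
    # Fetch the object body and parse it as a delimited table
    obj = boto3.resource('s3').Object(bucket, key).get()
    return pd.read_csv(io.BytesIO(obj['Body'].read()), sep=sep)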
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    long_containerinfo = light_containerinfo
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')

    #
    # Load the refpkg (in tgz format)
    #
    refpkg_tgz = self.new_task(
        'load_refpkg_tgz',
        LoadRefpkgTGZ,
        path=self.refpkg_tgz,
        file_format='gzip',
    )

    # Load the jplace
    jplace = self.new_task(
        'load_jplace',
        LoadFile,
        path=self.jplace,
    )

    # Load the seq map
    seq_map = self.new_task('load_seq_map', LoadFile, path=self.seq_map_csv)

    # Load the weights if provided
    if self.sv_weights_csv:
        sv_weights = self.new_task('load_sv_weight',
                                   LoadFile,
                                   path=self.sv_weights_csv)
    else:
        sv_weights = None

    # Load the labels if provided
    if self.labels:
        labels = self.new_task('load_labels', LoadFile, path=self.labels)
    else:
        labels = None

    # And unpack the refpkg to the relevant bits
    refpkg_alignments = self.new_task(
        'refpkg_alignments',
        ExtractRefpkgAlignment,
        aln_fasta_fn=os.path.join(self.working_dir, 'placement',
                                  'refpkg.aln.fasta'),
        aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                'refpkg.aln.sto'),
    )
    refpkg_alignments.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

    #
    # Load the sequence variants (fasta format)
    #
    sv_fasta = self.new_task('load_sv',
                             LoadFastaSeqs,
                             fasta_seq_path=self.sv_fasta)

    #
    # Align the sequence variants
    #
    sv_aligned = self.new_task(
        'align_sv',
        CMAlignSeqs,
        containerinfo=heavy_containerinfo,
        alignment_sto_fn=os.path.join(self.working_dir, 'placement',
                                      'sv.aln.sto'),
        alignment_score_fn=os.path.join(self.working_dir, 'placement',
                                        'sv.aln.scores'),
    )
    sv_aligned.in_seqs = sv_fasta.out_seqs

    sv_aligned_fasta = self.new_task(
        'align_sv_to_fasta',
        AlignmentStoToFasta,
        align_fasta_fn=os.path.join(self.working_dir, 'placement',
                                    'sv.aln.fasta'),
    )
    sv_aligned_fasta.in_align_sto = sv_aligned.out_align_sto

    #
    # Combine the refpkg alignment with the sequence variant alignment
    #
    sv_refpkg_aln_sto = self.new_task(
        'combine_sv_refpkg_aln_sto',
        CombineAlignmentsSTO,
        containerinfo=light_containerinfo,
        combined_aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                         'sv_refpkg_aln.sto'))
    sv_refpkg_aln_sto.in_aln_sto_1 = refpkg_alignments.out_aln_sto
    sv_refpkg_aln_sto.in_aln_sto_2 = sv_aligned.out_align_sto

    #
    # Prep the placements.db using the refpkg
    #
    prepped_placementdb = self.new_task(
        'prep_placementdb',
        PlacementDB_Prep,
        containerinfo=light_containerinfo,
        placement_db_fn=os.path.join(self.destination_dir, 'classification',
                                     'placement.db'))
    prepped_placementdb.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

    #
    # Insert the seq_info / map of sv -> specimens
    #
    placement_db_w_si = self.new_task(
        'placement_db_add_si',
        PlacementDB_AddSI,
        containerinfo=light_containerinfo,
    )
    placement_db_w_si.in_placement_db = prepped_placementdb.out_placement_db
    placement_db_w_si.in_seq_map = seq_map.out_file

    #
    # Classify the sequence variants
    #
    placement_db_classified = self.new_task(
        'classify_into_placement_db',
        PlacementDB_Classify_SV,
        containerinfo=midcpu_containerinfo,
    )
    placement_db_classified.in_placement_db = placement_db_w_si.out_placement_db
    placement_db_classified.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    placement_db_classified.in_sv_refpkg_aln_sto = sv_refpkg_aln_sto.out_aln_sto
    placement_db_classified.in_jplace = jplace.out_file

    #
    # Multiclass concat names
    #
    placement_db_mcc = self.new_task(
        'placement_db_multiclass_concat',
        PlacementDB_MCC,
        containerinfo=long_containerinfo,
    )
    placement_db_mcc.in_placement_db = placement_db_classified.out_placement_db
    # NOTE: sv_weights is None when sv_weights_csv was not provided;
    # this assignment assumes weights were supplied
    placement_db_mcc.in_weights = sv_weights.out_file

    #
    # Tabular CSV outputs
    #
    tables_for_rank = {}
    for rank in ['phylum', 'class', 'order', 'family', 'genus', 'species']:
        tables_for_rank[rank] = self.new_task(
            'by_specimen_{}'.format(rank),
            GenerateTables,
            containerinfo=light_containerinfo,
            tables_path=os.path.join(self.destination_dir, 'classification',
                                     'tables'),
            rank=rank)
        tables_for_rank[
            rank].in_placement_db = placement_db_mcc.out_placement_db
        tables_for_rank[rank].in_seq_map = seq_map.out_file
        if labels:
            tables_for_rank[rank].in_labels = labels.out_file

    return (placement_db_mcc, tables_for_rank)
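# The 'light' / 'midcpu' / 'heavy' / 'highmem' sections read via
# ContainerInfo.from_config() above are assumed to live in an INI-style
# config file. A sketch of reading such a file with the standard library;
# the key names are guesses based on the ContainerInfo() keyword arguments
# used elsewhere in this code, not a documented format:
import configparser


def read_containerinfo_section(config_fp, section):
    config = configparser.ConfigParser()
    config.read(config_fp)
    # Pull out the per-tier resource requests
    return {
        'vcpu': config.getint(section, 'vcpu'),
        'mem': config.getint(section, 'mem'),
        'engine': config.get(section, 'engine'),
    }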
class Workflow_NCBI_16s(sl.WorkflowTask):
    #
    # Take a set of sequence variants in FASTA format and at least one
    # repository of reference sequences.
    # Search the repository / repositories for matches above a specified
    # threshold for the sequence variants.
    # Use those recruited full-length repo sequences to build a refpkg.
    #

    working_dir = sl.Parameter()
    ncbi_email = sl.Parameter()
    repo_url = sl.Parameter()
    example_seqs = sl.Parameter()

    heavy_containerinfo = sl.ContainerInfo(
        vcpu=36,
        mem=70000,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='aws_batch',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')
    light_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=2024,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='aws_batch',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')
    test_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=4096,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine=ENGINE,
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')
    local_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=4096,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='docker',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')

    def workflow(self):
        #
        # Load current accessions with 16S in a genome
        #
        repo_url = self.new_task(
            'load_repo_url',
            LoadFile,
            path=self.repo_url,
        )
        example_seqs = self.new_task('load_example_seqs',
                                     LoadFile,
                                     path=self.example_seqs)

        acc_genome_16s = self.new_task(
            'genome_16s_accessions',
            NT_AccessionsForQuery,
            containerinfo=self.test_containerinfo,
            email=self.ncbi_email,
            accessions_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                       'accession', 'genome_16s.csv'),
            query=("16s[All Fields] AND rRNA[Feature Key]"
                   " AND Bacteria[Organism]"
                   " AND 500000 : 99999999999[Sequence Length]"
                   " AND genome[All Fields]"),
        )

        repo_genome_update = self.new_task(
            'repo_genome_update',
            NT_Repo_Update_Accessions,
            extra_values={'is_genome': True},
        )
        repo_genome_update.in_repo_url = repo_url.out_file
        repo_genome_update.in_accessions = acc_genome_16s.out_accessions

        repo_filled = self.new_task(
            'repo_fill',
            NT_Repo_Fill,
            containerinfo=self.test_containerinfo,
            email=self.ncbi_email,
            working_dir=os.path.join(self.working_dir, 'ncbi_16s'),
        )
        repo_filled.in_repo = repo_genome_update.out_repo

        # Now dump out 16S / seq_info from the genomes.
        repo_dumped = self.new_task(
            'repo_dump',
            NT_Repo_Output_FastaSeqInfo,
            fn_fasta_gz=os.path.join(self.working_dir, 'ncbi_16s',
                                     'genomes.16s.fasta.gz'),
            fn_seq_info=os.path.join(self.working_dir, 'ncbi_16s',
                                     'genomes.16s.seq_info.csv'),
        )
        repo_dumped.in_repo = repo_filled.out_repo

        # Find genomes missing peptide / rRNA annotations
        prokka_annotation = self.new_task(
            'prokka_annotation',
            NT_Repo_Prokka,
            containerinfo=self.light_containerinfo,
            num_concurrent=100,
            workdir=os.path.join(self.working_dir, 'ncbi_16s', 'prokka'))
        prokka_annotation.in_repo = repo_filled.out_repo

        # NOTE: returning here makes everything below unreachable; this
        # looks like a leftover early return from development
        return prokka_annotation

        # Use cmsearch to be sure these are vaguely like rRNA
        cmsearch_verify = self.new_task(
            'cmsearch_verify',
            CMSearchVerify,
            containerinfo=self.heavy_containerinfo,
            results_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                    'genomes.16s.cmsearch.tsv'),
        )
        cmsearch_verify.in_seqs = repo_dumped.out_seqs

        # And filter to rRNA.
        verified_seqs = self.new_task(
            'verify_repo',
            VerifyRepo,
            containerinfo=self.heavy_containerinfo,
            uc_fn=os.path.join(self.working_dir, 'ncbi_16s',
                               'genomes.16s.verified.uc'),
            verified_seqs_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                          'genomes.16s.verified.fasta.gz'),
            unverified_seqs_fn=os.path.join(
                self.working_dir, 'ncbi_16s',
                'genomes.16s.unverified.fasta.gz'),
        )
        verified_seqs.in_repo_seqs = repo_dumped.out_seqs
        verified_seqs.in_expected_seqs = example_seqs.out_file

        return repo_dumped
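# A sketch of how a WorkflowTask like Workflow_NCBI_16s is typically launched
# with sciluigi. run_local() is part of the sciluigi interface; the parameter
# values below are placeholders, and the exact flag spelling follows how
# luigi renders each Parameter name:
import sciluigi as sl

if __name__ == '__main__':
    sl.run_local(
        main_task_cls=Workflow_NCBI_16s,
        cmdline_args=[
            '--working-dir=../working',
            '--ncbi-email=user@example.org',
            '--repo-url=repo.db',
            '--example-seqs=example.16s.fasta',
        ])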
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    long_containerinfo = light_containerinfo
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')

    #
    # Load the refpkg (in tgz format)
    #
    refpkg_tgz = self.new_task(
        'load_refpkg_tgz',
        LoadRefpkgTGZ,
        path=self.refpkg_tgz,
        file_format='gzip',
    )

    # Load the seq map
    seq_map = self.new_task('load_seq_map', LoadFile, path=self.seq_map_csv)

    # Load the weights if provided
    if self.sv_weights_csv:
        sv_weights = self.new_task('load_sv_weight',
                                   LoadFile,
                                   path=self.sv_weights_csv)
    else:
        sv_weights = None

    # And unpack the refpkg to the relevant bits
    refpkg_alignments = self.new_task(
        'refpkg_alignments',
        ExtractRefpkgAlignment,
        aln_fasta_fn=os.path.join(self.working_dir, 'placement',
                                  'refpkg.aln.fasta'),
        aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                'refpkg.aln.sto'),
    )
    refpkg_alignments.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

    #
    # Load the sequence variants (fasta format)
    #
    sv_fasta = self.new_task('load_sv',
                             LoadFastaSeqs,
                             fasta_seq_path=self.sv_fasta)

    #
    # Align the sequence variants
    #
    sv_aligned = self.new_task(
        'align_sv',
        CMAlignSeqs,
        containerinfo=heavy_containerinfo,
        alignment_sto_fn=os.path.join(self.working_dir, 'placement',
                                      'sv.aln.sto'),
        alignment_score_fn=os.path.join(self.working_dir, 'placement',
                                        'sv.aln.scores'),
    )
    sv_aligned.in_seqs = sv_fasta.out_seqs

    sv_aligned_fasta = self.new_task(
        'align_sv_to_fasta',
        AlignmentStoToFasta,
        align_fasta_fn=os.path.join(self.working_dir, 'placement',
                                    'sv.aln.fasta'),
    )
    sv_aligned_fasta.in_align_sto = sv_aligned.out_align_sto

    #
    # Combine the refpkg alignment with the sequence variant alignment
    #
    sv_refpkg_aln_sto = self.new_task(
        'combine_sv_refpkg_aln_sto',
        CombineAlignmentsSTO,
        containerinfo=heavy_containerinfo,
        combined_aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                         'sv_refpkg_aln.sto'))
    sv_refpkg_aln_sto.in_aln_sto_1 = refpkg_alignments.out_aln_sto
    sv_refpkg_aln_sto.in_aln_sto_2 = sv_aligned.out_align_sto

    #
    # Place the sequence variants using this combined alignment
    #
    dedup_jplace = self.new_task(
        'make_dedup_jplace',
        PPLACER_PlaceAlignment,
        containerinfo=heavy_containerinfo,
        jplace_fn=os.path.join(self.destination_dir, 'placement',
                               'dedup.jplace'))
    dedup_jplace.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    dedup_jplace.in_merged_aln_sto = sv_refpkg_aln_sto.out_aln_sto

    #
    # Reduplicate
    #
    if not sv_weights:
        redup_jplace = dedup_jplace
    else:
        redup_jplace = self.new_task(
            'reduplicate_jplace',
            Jplace_Reduplicate,
            containerinfo=light_containerinfo,
            jplace_fn=os.path.join(self.destination_dir, 'placement',
                                   'redup.jplace.gz'))
        redup_jplace.in_jplace = dedup_jplace.out_jplace
        redup_jplace.in_weights = sv_weights.out_file

    #
    # ADCL
    #
    adcl = self.new_task(
        'create_adcl',
        Jplace_ADCL,
        containerinfo=light_containerinfo,
        adcl_fn=os.path.join(self.destination_dir, 'placement', 'adcl.gz'))
    adcl.in_jplace = redup_jplace.out_jplace

    #
    # EDPL
    #
    edpl = self.new_task(
        'calculate_edpl',
        Jplace_EDPL,
        containerinfo=highmem_containerinfo,
        edpl_fn=os.path.join(self.destination_dir, 'placement', 'edpl.gz'))
    edpl.in_jplace = redup_jplace.out_jplace

    #
    # EPCA
    #
    epca = self.new_task(
        'calculate_epca',
        Jplace_PCA,
        containerinfo=long_containerinfo,
        path=os.path.join(self.destination_dir, 'placement', 'pca'),
        prefix='epca',
        pca='epca')
    epca.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    epca.in_seq_map = seq_map.out_file
    epca.in_jplace = redup_jplace.out_jplace

    #
    # LPCA
    #
    lpca = self.new_task(
        'calculate_lpca',
        Jplace_PCA,
        containerinfo=highmem_containerinfo,
        path=os.path.join(self.destination_dir, 'placement', 'pca'),
        prefix='lpca',
        pca='lpca')
    lpca.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    lpca.in_seq_map = seq_map.out_file
    lpca.in_jplace = redup_jplace.out_jplace

    #
    # KR-distance
    #
    kr_distance = self.new_task(
        'calculate_kr_distance',
        Jplace_KR_Distance,
        containerinfo=long_containerinfo,
        kr_fn=os.path.join(self.destination_dir, 'placement',
                           'kr_distance.csv'),
    )
    kr_distance.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    kr_distance.in_seq_map = seq_map.out_file
    kr_distance.in_jplace = redup_jplace.out_jplace

    #
    # Alpha-diversity
    #
    alpha_diversity = self.new_task(
        'calculate_alpha_diversity',
        Jplace_Alpha_Diversity,
        containerinfo=light_containerinfo,
        alpha_diversity_fn=os.path.join(self.destination_dir, 'placement',
                                        'alpha_diversity.csv'),
    )
    alpha_diversity.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    alpha_diversity.in_seq_map = seq_map.out_file
    alpha_diversity.in_jplace = redup_jplace.out_jplace

    return (epca, lpca, adcl, edpl, kr_distance, alpha_diversity)
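# Jplace_Reduplicate above restores the abundance information that was lost
# when identical reads were collapsed into unique sequence variants before
# placement. A conceptual sketch of the idea on the jplace JSON structure
# (per the published jplace standard, each placement names its reads either
# as "n" or as "nm" [name, multiplicity] pairs); this is an illustration of
# the concept, not the task's actual code:
import json


def reduplicate(jplace_fp, weights, out_fp):
    # weights: dict mapping sequence-variant name -> total read count
    with open(jplace_fp) as fh:
        placed = json.load(fh)
    for placement in placed['placements']:
        # Normalize "n" (names only) to "nm", then scale multiplicities
        names = placement.pop('n', None)
        nm = placement.get('nm', [[n, 1] for n in (names or [])])
        placement['nm'] = [[name, weights.get(name, mult)]
                           for name, mult in nm]
    with open(out_fp, 'w') as fh:
        json.dump(placed, fh)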
def workflow(self):
    # Input files are located either in SRA or AWS S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Make tasks that will make sure the reference databases exist
    ref_db_dmnd = self.new_task("load_ref_db_dmnd",
                                LoadFile,
                                path=self.ref_db_dmnd)
    ref_db_metadata = self.new_task("load_ref_db_metadata",
                                    LoadFile,
                                    path=self.ref_db_metadata)

    # Keep track of all of the jobs for getting the input files
    tasks_load_inputs = {}
    # Keep track of all of the jobs for aligning against the viral database
    tasks_map_viruses = {}
    # Assembling datasets de novo
    tasks_metaspades = {}
    # Running VirFinder on assembled contigs
    tasks_virfinder = {}

    # Iterate over all of the rows of samples
    for ix, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # If the inputs are on SRA, execute jobs that will download them
        if self.input_location == "SRA":
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_sra_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=4096,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "download_from_sra_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            # Make sure the file exists on S3
            assert self.input_location == "S3"
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)

        # Make a task to align the reads, wherever they came from
        tasks_map_viruses[sample_name] = self.new_task(
            "map_viruses_{}".format(sample_name),
            MapVirusesTask,
            output_folder=os.path.join(self.base_s3_folder,
                                       self.mapping_output_folder),
            sample_name=sample_name,
            threads=self.align_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.align_threads),
                mem=int(self.align_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "map_viruses_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # De novo assembly with metaSPAdes
        tasks_metaspades[sample_name] = self.new_task(
            "metaspades_{}".format(sample_name),
            AssembleMetaSPAdes,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "metaspades"),
            threads=self.assemble_threads,
            max_mem=int(int(self.assemble_mem) / 1000),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.assemble_threads),
                mem=int(self.assemble_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "metaspades_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Run VirFinder on the assembled contigs
        tasks_virfinder[sample_name] = self.new_task(
            "virfinder_{}".format(sample_name),
            VirFinderTask,
            base_s3_folder=self.base_s3_folder,
            sample_name=sample_name,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.align_threads),
                mem=int(self.align_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "virfinder_{}".format(sample_name)),
            ))

    # Assign the output from tasks_load_inputs to the input to tasks_map_viruses
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_map_viruses
        # Assign the input for the reference database
        tasks_map_viruses[sample_name].in_ref_db_dmnd = ref_db_dmnd.out_file
        tasks_map_viruses[
            sample_name].in_ref_db_metadata = ref_db_metadata.out_file
        # NOTE: ImportSRAFastq tasks elsewhere in this codebase expose
        # out_fastq, so the SRA branch may need out_fastq rather than
        # out_file here
        tasks_map_viruses[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file

        # VirFinder depends on metaspades
        tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file
        tasks_virfinder[sample_name].in_fasta = tasks_metaspades[
            sample_name].out_fasta

    return tasks_map_viruses, tasks_virfinder
def workflow(self):
    # Make sure the project name is alphanumeric (underscores allowed)
    assert all([s.isalnum() or s == "_" for s in self.project_name
                ]), "Project name must be alphanumeric"

    # Data can come from either SRA or S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Keep track of the jobs for each step, for each sample
    tasks_load_inputs = {}
    tasks_famli = {}

    # 0. LOAD THE DATABASE
    # (created once, outside the sample loop, since every sample shares it)
    tasks_load_db = self.new_task("load_db_from_s3",
                                  LoadFile,
                                  path=self.famli_db_location)

    # Iterate over all of the rows of samples
    for _, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # Make a UUID to isolate temp files for this task from any others
        task_uuid = str(uuid.uuid4())[:8]

        # 1. LOAD THE INPUT FILES
        if self.input_location == "S3":
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)
        elif self.input_location == "SRA":
            assert input_path.startswith("SRR"), input_path
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_SRA_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                input_mount_point="/scratch/{}_get_sra/input/".format(
                    task_uuid),
                output_mount_point="/scratch/{}_get_sra/output/".format(
                    task_uuid),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=32000,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "get_sra_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            raise Exception("Data must be from S3 or SRA")

        # 2. ALIGN AGAINST THE DATABASE USING FAMLI
        tasks_famli[sample_name] = self.new_task(
            "famli_{}".format(sample_name),
            FAMLITask,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder,
                                       self.output_folder),
            threads=self.famli_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.famli_threads),
                mem=int(self.famli_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="famli_{}".format(sample_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Connect the raw FASTQ input
        if self.input_location == "S3":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        # Connect the reference database
        tasks_famli[sample_name].in_ref_dmnd = tasks_load_db.out_file

    return tasks_famli
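# Several workflows above repeat the same branch: LoadFile tasks expose
# out_file while ImportSRAFastq tasks expose out_fastq. A hypothetical helper
# that would collapse those branches (a refactoring sketch, not existing
# code in this repo):
def fastq_output(load_task, input_location):
    # S3 inputs are plain LoadFile tasks; SRA inputs are ImportSRAFastq tasks
    if input_location == "S3":
        return load_task.out_file
    elif input_location == "SRA":
        return load_task.out_fastq
    raise ValueError("Data must be from S3 or SRA")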
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')

    #
    # Build our taxonomy db
    #
    taxonomy_db = self.new_task(
        'taxonomy_db',
        BuildTaxtasticDB,
        containerinfo=light_containerinfo,
        tax_db_path=os.path.join(self.working_dir, 'refpkg', 'taxonomy.db'))

    #
    # Load the sequence variants
    #
    sequence_variants = self.new_task(
        'load_sequence_variants',
        LoadFastaSeqs,
        fasta_seq_path=self.sequence_variants_path)
    log.info("Loaded sequence variants")

    # Load the sequence information
    seq_info_files = [
        self.new_task('load_si_{}'.format(si_i), LoadFile, path=si_path)
        for si_i, si_path in enumerate(self.repo_seq_info.split(','))
    ]
    log.info("Loaded %d sequence information files", len(seq_info_files))

    #
    # Load the annotated repositories
    #
    repo_annotated = [
        self.new_task('load_annotated_repo_{}'.format(r_i),
                      LoadFastaSeqs,
                      fasta_seq_path=r_path)
        for r_i, r_path in enumerate(self.repo_annotated_fasta.split(','))
    ]
    log.info("Loaded %d Annotated Repositories", len(repo_annotated))

    #
    # Search the sequence variants in the annotated repository
    #
    search_sv_annotated = []
    for ra_i, r_annotated in enumerate(repo_annotated):
        r_a_task = self.new_task(
            'search_sv_annotated_{}'.format(ra_i),
            SearchRepoForMatches,
            containerinfo=midcpu_containerinfo,
            matches_uc_path=os.path.join(
                self.working_dir, 'refpkg',
                'repo.annotated__{}.matches.uc'.format(ra_i)),
            unmatched_exp_seqs_path=os.path.join(
                self.working_dir, 'refpkg',
                'repo.annotated__{}.annotated.exp_seqs_unmatched.fasta'.format(
                    ra_i)),
            matched_repo_seqs_path=os.path.join(
                self.working_dir, 'refpkg',
                'repo.annotated__{}.recruited_repo_seqs.fasta'.format(ra_i)),
            min_id=self.min_id_annotated,
            # Default: take the top 10 hits (roughly corresponding to a
            # 95% identity for most)
            maxaccepts=10,
        )
        r_a_task.in_exp_seqs = sequence_variants.out_seqs
        r_a_task.in_repo_seqs = r_annotated.out_seqs
        search_sv_annotated.append(r_a_task)

    #
    # Combine recruits into one file
    #
    combined_repo_matches = self.new_task(
        'combine_repo_matches',
        CombineRepoMatches,
        seqs_fn=os.path.join(self.working_dir, 'refpkg',
                             'combined.repo.matches.fasta'),
        seq_info_fn=os.path.join(self.working_dir, 'refpkg',
                                 'combined.repo.matches.seq_info.csv'),
    )
    combined_repo_matches.in_seqs = [
        ssv.out_matched_repo_seqs for ssv in search_sv_annotated
    ]
    combined_repo_matches.in_seq_info = [
        sif.out_file for sif in seq_info_files
    ]
    refpkg_seqs = combined_repo_matches.out_seqs
    refpkg_seqinfo = combined_repo_matches.out_seq_info

    #
    # Verify the taxonomy for the refpkg seqinfo file.
    #
    verified_refpkg_seqinfo = self.new_task(
        'verify_refpkg_seqinfo_taxonomy',
        ConfirmSeqInfoTaxonomy,
        email=self.entrez_email,
        containerinfo=light_containerinfo,
        confirmed_seqinfo_path=os.path.join(
            self.working_dir, 'refpkg', 'seq_info.refpkg.verified_tax.csv'))
    verified_refpkg_seqinfo.in_seq_info = refpkg_seqinfo
    verified_refpkg_seqinfo.in_tax_db = taxonomy_db.out_tax_db

    # TODO: Parse the UC file to determine if we achieved our minimum-best
    # goal for each SV.

    #
    # Align recruited repo seqs
    #
    align_recruits = self.new_task(
        'align_recruits',
        CMAlignSeqs,
        containerinfo=highmem_containerinfo,
        alignment_sto_fn=os.path.join(self.working_dir, 'refpkg',
                                      'recruit.aln.sto'),
        alignment_score_fn=os.path.join(self.working_dir, 'refpkg',
                                        'recruit.aln.scores'),
    )
    align_recruits.in_seqs = refpkg_seqs

    #
    # Make a fasta version of the alignment
    #
    align_fasta = self.new_task(
        'align_fasta',
        AlignmentStoToFasta,
        align_fasta_fn=os.path.join(self.working_dir, 'refpkg',
                                    'recruit.aln.fasta'),
    )
    align_fasta.in_align_sto = align_recruits.out_align_sto

    #
    # Make a tree of the reference package sequences
    #
    raxml_tree = self.new_task(
        'raxml_tree',
        RAxMLTree,
        containerinfo=heavy_containerinfo,
        tree_path=os.path.join(self.working_dir, 'refpkg', 'refpkg.tre'),
        tree_stats_path=os.path.join(self.working_dir, 'refpkg',
                                     'refpkg.tre.info'),
    )
    raxml_tree.in_align_fasta = align_fasta.out_align_fasta

    #
    # Clean up the tree info to remove cruft
    #
    tree_info_cleanup = self.new_task(
        'tree_info_cleanup',
        CleanupTreeInfo,
        tree_info_path=os.path.join(self.working_dir, 'refpkg',
                                    'refpkg.tre.cleaned.info'),
    )
    tree_info_cleanup.in_tree_info = raxml_tree.out_tree_stats

    #
    # Start to assemble the reference package at this point
    #

    # Taxtable
    refpkg_taxtable = self.new_task(
        'refpkg_taxtable',
        TaxTableForSeqInfo,
        containerinfo=light_containerinfo,
        taxtable_path=os.path.join(self.working_dir, 'refpkg',
                                   'taxtable.csv'))
    refpkg_taxtable.in_seq_info = verified_refpkg_seqinfo.out_seq_info
    refpkg_taxtable.in_tax_db = taxonomy_db.out_tax_db

    # Covariance matrix
    obtain_cm = self.new_task(
        'obtain_cm',
        ObtainCM,
        containerinfo=light_containerinfo,
        cm_destination=os.path.join(self.working_dir, 'refpkg',
                                    'rRNA_16S_SSU.cm'))

    # And the actual combination step
    combine_refpkg = self.new_task(
        'combine_refpkg',
        CombineRefpkg,
        containerinfo=light_containerinfo,
        refpkg_path=os.path.join(self.new_refpkg_path, 'refpkg'),
        refpkg_name=self.new_refpkg_name,
    )
    combine_refpkg.in_aln_fasta = align_fasta.out_align_fasta
    combine_refpkg.in_aln_sto = align_recruits.out_align_sto
    combine_refpkg.in_tree = raxml_tree.out_tree
    combine_refpkg.in_tree_stats = tree_info_cleanup.out_tree_info
    combine_refpkg.in_taxtable = refpkg_taxtable.out_taxtable
    combine_refpkg.in_seq_info = verified_refpkg_seqinfo.out_seq_info
    combine_refpkg.in_cm = obtain_cm.out_cm

    return combine_refpkg

    # NOTE: everything below is unreachable (it follows the return above)
    # and references names that are never defined in this workflow
    # (search_sv_genomes, search_sv_filtered, repo_genomes_seq_info,
    # repo_filtered_seq_info); it appears to be leftover from an earlier
    # version and is preserved here as-is.

    #
    # Combine the sequences, avoiding duplicate sequences
    #
    combined_recruits = self.new_task(
        'combine_repo_recruits',
        CombineRepoMatches,
        seqs_fn=os.path.join(self.working_dir, 'refpkg',
                             'recruits.combined.fasta'),
        seq_info_fn=os.path.join(self.working_dir, 'refpkg',
                                 'recruits.combined.seq_info.csv'))
    combined_recruits.in_seqs = [
        search_sv_genomes.out_matched_repo_seqs,
        search_sv_filtered.out_matched_repo_seqs,
    ]
    combined_recruits.in_seq_info = [
        repo_genomes_seq_info.out_file,
        repo_filtered_seq_info.out_file,
    ]

    refpkg_taxtable = self.new_task(
        'refpkg_taxtable',
        TaxTableForSeqInfo,
        containerinfo=self.local_containerinfo,
        taxtable_path=os.path.join(self.working_dir, 'refpkg',
                                   'taxtable.csv'))
    refpkg_taxtable.in_seq_info = combined_recruits.out_seq_info
    refpkg_taxtable.in_tax_db = taxonomy_db.out_tax_db

    obtain_cm = self.new_task(
        'obtain_cm',
        ObtainCM,
        containerinfo=self.local_containerinfo,
        cm_destination=os.path.join(self.working_dir, 'refpkg',
                                    'rRNA_16S_SSU.cm'))

    combine_refpkg = self.new_task(
        'combine_refpkg',
        CombineRefpkg,
        containerinfo=self.local_containerinfo,
        refpkg_path=os.path.join(self.working_dir, 'refpkg'),
        refpkg_name='test',
    )
    combine_refpkg.in_aln_fasta = align_fasta.out_align_fasta
    combine_refpkg.in_aln_sto = align_recruits.out_align_sto
    combine_refpkg.in_tree = raxml_tree.out_tree
    combine_refpkg.in_tree_stats = raxml_tree.out_tree_stats
    combine_refpkg.in_taxtable = refpkg_taxtable.out_taxtable
    combine_refpkg.in_seq_info = combined_recruits.out_seq_info
    combine_refpkg.in_cm = obtain_cm.out_cm

    return combine_refpkg
def workflow(self):
    # Load the input file
    genome_fasta = self.new_task("load_genome_fasta",
                                 LoadFile,
                                 path=self.genome_fasta)

    # Run Prokka
    annotate_prokka = self.new_task(
        "annotate_prokka_{}".format(self.genome_name),
        AnnotateProkka,
        sample_name=self.genome_name,
        output_folder=os.path.join(self.base_s3_folder, "prokka"),
        threads=self.checkm_threads,
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=int(self.checkm_threads),
            mem=int(self.checkm_memory),
            engine=self.engine,
            aws_s3_scratch_loc=self.aws_s3_scratch_loc,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_name="annotate_prokka_{}".format(self.genome_name),
            mounts={
                "/docker_scratch": {
                    "bind": self.temp_folder,
                    "mode": "rw"
                }
            }))
    # Link the file for prokka annotation
    annotate_prokka.in_fasta = genome_fasta.out_file

    # Run CheckM
    checkm = self.new_task(
        "checkm_{}".format(self.genome_name),
        CheckM,
        sample_name=self.genome_name,
        output_folder=os.path.join(self.base_s3_folder, "checkm"),
        threads=8,
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=8,
            mem=64000,
            engine=self.engine,
            aws_s3_scratch_loc=self.aws_s3_scratch_loc,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_name="checkm_{}".format(self.genome_name),
            mounts={
                "/docker_scratch": {
                    "bind": self.temp_folder,
                    "mode": "rw"
                }
            }))
    # Link the protein coding sequences from Prokka into the inputs for CheckM
    checkm.in_faa = annotate_prokka.out_faa

    return checkm