def workflow(self):
    # Input files are located either in SRA or AWS S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Make tasks that will make sure the reference databases exist
    ref_fasta = self.new_task("load_ref_fasta", LoadFile, path=self.ref_fasta)

    # Keep track of all of the jobs for getting the input files
    tasks_load_inputs = {}
    # Keep track of all of the jobs for aligning against the FASTA
    tasks_align_bwa = {}

    # Iterate over all of the rows of samples
    for ix, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # If the inputs are on SRA, execute jobs that will download them
        if self.input_location == "SRA":
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_sra_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=4096,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            # Make sure the file exists on S3
            assert self.input_location == "S3"
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)

        # Make a task to align the reads, wherever they came from
        tasks_align_bwa[sample_name] = self.new_task(
            "align_bwa_{}".format(sample_name),
            AlignFastqTask,
            ref_name=self.ref_name,
            base_s3_folder=self.base_s3_folder,
            sample_name=sample_name,
            threads=self.align_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.align_threads),
                mem=int(self.align_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

    # Assign the output from tasks_load_inputs to the input to tasks_align_bwa
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_align_bwa
        # Assign the input for the reference database
        tasks_align_bwa[sample_name].in_ref_fasta = ref_fasta.out_file
        # NOTE: LoadFile tasks elsewhere in this codebase expose out_file
        # rather than out_fastq, so the S3 branch may need out_file here
        tasks_align_bwa[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_fastq

    return tasks_align_bwa
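# A minimal sketch of the metadata-sheet validation that the workflows in this
# file repeat inline, pulled out as a standalone helper for illustration. The
# column-presence and uniqueness rules mirror the asserts above;
# validate_metadata itself is hypothetical and not part of the pipeline.
import pandas as pd


def validate_metadata(metadata_fp, sample_col, input_col, sep="\t"):
    # Read the sheet the same way the workflows do
    metadata = pd.read_table(metadata_fp, sep=sep)
    for col_name in [sample_col, input_col]:
        # Every required column must be present...
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, metadata_fp)
        # ...and every value in it must be unique (one row per sample/file)
        assert metadata[col_name].unique().shape[0] == metadata.shape[0], \
            "Duplicate values in column {}".format(col_name)
    return metadata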
def workflow(self):
    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Keep track of the jobs for each step, for each sample
    tasks_load_inputs = {}
    tasks_fastqp = {}
    tasks_humann = {}

    # Iterate over all of the rows of samples
    for ix, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # Make a UUID to isolate temp files for this task from any others
        task_uuid = str(uuid.uuid4())[:8]

        # 1. LOAD THE INPUT FILES
        tasks_load_inputs[sample_name] = self.new_task(
            "load_from_s3_{}".format(sample_name),
            LoadFile,
            path=input_path)

        # 2. CALCULATE FASTQ QUALITY METRICS
        tasks_fastqp[sample_name] = self.new_task(
            "fastqp_{}".format(sample_name),
            FastqpTask,
            summary_path=os.path.join(self.base_s3_folder, "fastqp",
                                      sample_name + ".fastqp.tsv"),
            input_mount_point="/scratch/{}_fastqp/input/".format(task_uuid),
            output_mount_point="/scratch/{}_fastqp/output/".format(task_uuid),
            containerinfo=sl.ContainerInfo(
                vcpu=1,
                mem=10000,
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "fastqp_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # 3. ANALYZE WITH HUMAnN2
        tasks_humann[sample_name] = self.new_task(
            "humann2_{}".format(sample_name),
            HUMAnN2Task,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "humann2"),
            threads=self.humann2_threads,
            ref_db=self.humann2_ref_db,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.humann2_threads),
                mem=int(self.humann2_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "humann2_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    },
                    "/refdbs": {
                        "bind": "/refdbs",
                        "mode": "ro"
                    }
                }))

    # Assign the output from tasks_load_inputs to the inputs of the
    # downstream fastqp and HUMAnN2 tasks
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_fastqp
        tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file
        assert sample_name in tasks_humann
        tasks_humann[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file

    return tasks_fastqp, tasks_humann
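# The workflows above truncate a UUID4 to eight characters so that each task's
# scratch mount points stay distinct; two tasks for the same sample would
# otherwise share "/scratch/.../input/" and clobber each other. A small sketch
# of that pattern as a hypothetical helper (eight characters makes collisions
# merely improbable, not impossible):
import uuid


def scratch_mounts(step_name):
    # e.g. ("/scratch/3f2a1b9c_fastqp/input/", "/scratch/3f2a1b9c_fastqp/output/")
    task_uuid = str(uuid.uuid4())[:8]
    return (
        "/scratch/{}_{}/input/".format(task_uuid, step_name),
        "/scratch/{}_{}/output/".format(task_uuid, step_name),
    )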
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')

    #
    # Load the manifest of files
    #
    manifest = self.new_task(
        'load_manifest',
        LoadManifest,
        path=self.manifest,
    )

    # For each specimen....
    specimen_tasks = defaultdict(dict)
    specimens = manifest.get_specimens()
    for specimen in specimens:
        # Load the specimen reads.
        specimen_tasks[specimen]['reads'] = self.new_task(
            'specimen_load_{}'.format(specimen),
            LoadSpecimenReads,
            specimen=specimen)
        specimen_tasks[specimen]['reads'].in_manifest = manifest.out_file

        if self.barcodecop and "I1" in specimen_tasks[specimen][
                'reads'].out_reads() and manifest.is_paired():
            specimen_tasks[specimen]['verified_reads'] = self.new_task(
                'specimen_bcc_{}'.format(specimen),
                BCCSpecimenReads,
                containerinfo=light_containerinfo,
                specimen=specimen,
                path=os.path.join(self.working_dir, 'sv', 'bcc'))
            specimen_tasks[specimen][
                'verified_reads'].in_reads = specimen_tasks[specimen][
                    'reads'].out_reads
        else:
            specimen_tasks[specimen]['verified_reads'] = specimen_tasks[
                specimen]['reads']

        # DADA2 filter and trim
        specimen_tasks[specimen]['dada2_ft'] = self.new_task(
            'dada2_ft_{}'.format(specimen),
            DADA2_FilterAndTrim,
            containerinfo=light_containerinfo,
            specimen=specimen,
            f_trunc=self.truncLenF,
            r_trunc=self.truncLenR,
            trim_left=self.trimLeft,
            maxN=self.maxN,
            maxEE=self.maxEE,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'ft'))
        specimen_tasks[specimen]['dada2_ft'].in_reads = specimen_tasks[
            specimen]['verified_reads'].out_reads

        # DADA2 dereplicate
        specimen_tasks[specimen]['dada2_derep'] = self.new_task(
            'dada2_derep_{}'.format(specimen),
            DADA2_Dereplicate,
            containerinfo=light_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'derep'))
        specimen_tasks[specimen]['dada2_derep'].in_reads = specimen_tasks[
            specimen]['dada2_ft'].out_reads

    # Now we need the specimens grouped by batch to create error models.
    batch_errModels = {}
    for batch, batched_specimens in manifest.batched_specimens():
        batch_errModels[batch] = self.new_task(
            'dada2_learn_error_batch_{}'.format(batch),
            DADA2_LearnError,
            containerinfo=midcpu_containerinfo,
            batch=batch,
            tar_reads=False,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'errM'))
        batch_errModels[batch].in_reads = [
            specimen_tasks[s]['dada2_ft'].out_reads
            for s in specimen_tasks if s in batched_specimens
        ]
        for specimen in batched_specimens:
            specimen_tasks[specimen]['dada2_errM'] = batch_errModels[batch]

    # Back to for each specimen...
    for specimen in specimens:
        # DADA
        specimen_tasks[specimen]['dada2_dada'] = self.new_task(
            'dada2_dada_{}'.format(specimen),
            DADA2_DADA,
            containerinfo=midcpu_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'dada'))
        specimen_tasks[specimen]['dada2_dada'].in_derep = specimen_tasks[
            specimen]['dada2_derep'].out_rds
        specimen_tasks[specimen]['dada2_dada'].in_errM = specimen_tasks[
            specimen]['dada2_errM'].out_rds

        # MERGE
        specimen_tasks[specimen]['dada2_merge'] = self.new_task(
            'dada2_merge_{}'.format(specimen),
            DADA2_Merge,
            containerinfo=light_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'merged'))
        specimen_tasks[specimen]['dada2_merge'].in_dada = specimen_tasks[
            specimen]['dada2_dada'].out_rds
        specimen_tasks[specimen]['dada2_merge'].in_derep = specimen_tasks[
            specimen]['dada2_derep'].out_rds

        # Seqtab
        specimen_tasks[specimen]['dada2_seqtab'] = self.new_task(
            'dada2_seqtab_{}'.format(specimen),
            DADA2_Specimen_Seqtab,
            containerinfo=light_containerinfo,
            specimen=specimen,
            path=os.path.join(self.working_dir, 'sv', 'dada2', 'seqtab'))
        specimen_tasks[specimen]['dada2_seqtab'].in_merge = specimen_tasks[
            specimen]['dada2_merge'].out_rds

    # Combine seqtabs by batch
    batch_seqtab = {}
    for batch, batched_specimens in manifest.batched_specimens():
        batch_seqtab[batch] = self.new_task(
            'dada2_combine_seqtabs_{}'.format(batch),
            DADA2_Combine_Seqtabs,
            containerinfo=light_containerinfo,
            fn=os.path.join(
                self.working_dir, 'sv', 'dada2', 'seqtab', 'batches',
                'seqtab.{}.combined.rds'.format(batch),
            ))
        batch_seqtab[batch].in_seqtabs = [
            specimen_tasks[s]['dada2_seqtab'].out_rds
            for s in specimen_tasks if s in batched_specimens
        ]

    # Now combine all the batch seqtabs into one master seqtab
    combined_seqtab = self.new_task(
        'dada2_combine_seqtabs',
        DADA2_Combine_Seqtabs,
        containerinfo=highmem_containerinfo,
        fn=os.path.join(self.working_dir, 'sv', 'dada2',
                        'seqtab.combined.rds'))
    combined_seqtab.in_seqtabs = [
        st.out_rds for st in batch_seqtab.values()
    ]

    # Remove chimeras
    combined_seqtab_nochim = self.new_task(
        'dada2_remove_chimera',
        DADA2_Remove_Chimera,
        containerinfo=heavy_containerinfo,
        fn_rds=os.path.join(self.working_dir, 'sv', 'dada2',
                            'seqtab.combined.nochim.rds'),
        fn_csv=os.path.join(self.destination_dir,
                            'seqtab.combined.nochim.csv'))
    combined_seqtab_nochim.in_seqtab = combined_seqtab.out_rds

    # Convert the sequence variants to pplacer-ready inputs
    dada2_sv_to_pplacer = self.new_task(
        'dada2_sv_to_pplacer',
        DADA2_SV_to_PPlacer,
        containerinfo=light_containerinfo,
        fasta_fn=os.path.join(self.destination_dir, 'dada2.sv.fasta'),
        weights_fn=os.path.join(self.destination_dir, 'dada2.sv.weights.csv'),
        map_fn=os.path.join(self.destination_dir, 'dada2.sv.map.csv'))
    dada2_sv_to_pplacer.in_seqtab_csv = combined_seqtab_nochim.out_csv

    return dada2_sv_to_pplacer
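# manifest.batched_specimens() is assumed above to yield (batch, specimens)
# pairs so that one DADA2 error model is learned per sequencing batch, since
# specimens from the same run share error characteristics. A plausible sketch
# of that grouping, assuming the manifest is a DataFrame with 'specimen' and
# 'batch' columns (the real LoadManifest task may differ):
import pandas as pd


def batched_specimens(manifest_df):
    # Group specimen IDs by their sequencing batch
    for batch, rows in manifest_df.groupby('batch'):
        yield batch, set(rows['specimen'])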
def workflow(self):
    # Make sure the project name is alphanumeric (underscores allowed)
    assert all([s.isalnum() or s == "_" for s in self.project_name
                ]), "Project name must be alphanumeric"

    # Data can come from either SRA or S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Keep track of the jobs for each step, for each sample
    tasks_load_inputs = {}
    tasks_fastqp = {}
    tasks_metaspades = {}
    tasks_prokka = {}
    tasks_famli = {}

    # Iterate over all of the rows of samples
    for _, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # Make a UUID to isolate temp files for this task from any others
        task_uuid = str(uuid.uuid4())[:8]

        # 1. LOAD THE INPUT FILES
        if self.input_location == "S3":
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)
        elif self.input_location == "SRA":
            assert input_path.startswith("SRR"), input_path
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_SRA_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                input_mount_point="/scratch/{}_get_sra/input/".format(
                    task_uuid),
                output_mount_point="/scratch/{}_get_sra/output/".format(
                    task_uuid),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=32000,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "get_sra_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            raise Exception("Data must be from S3 or SRA")

        # 2. CALCULATE FASTQ QUALITY METRICS
        tasks_fastqp[sample_name] = self.new_task(
            "fastqp_{}".format(sample_name),
            FastqpTask,
            summary_path=os.path.join(self.base_s3_folder, "fastqp",
                                      sample_name + ".fastqp.tsv"),
            input_mount_point="/scratch/{}_fastqp/input/".format(task_uuid),
            output_mount_point="/scratch/{}_fastqp/output/".format(task_uuid),
            containerinfo=sl.ContainerInfo(
                vcpu=1,
                mem=32000,
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "fastqp_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # 3. ASSEMBLE WITH METASPADES
        tasks_metaspades[sample_name] = self.new_task(
            "metaspades_{}".format(sample_name),
            AssembleMetaSPAdes,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "metaspades"),
            threads=self.assemble_threads,
            max_mem=int(int(self.assemble_mem) / 1000),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.assemble_threads),
                mem=int(self.assemble_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "metaspades_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # 4. ANNOTATE ASSEMBLIES WITH PROKKA
        tasks_prokka[sample_name] = self.new_task(
            "prokka_{}".format(sample_name),
            AnnotateProkka,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "prokka"),
            threads=self.assemble_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.assemble_threads),
                mem=int(self.assemble_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "prokka_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

    # Assign the output from tasks_load_inputs to the inputs of the
    # downstream fastqp, metaspades, and prokka tasks
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_fastqp
        if self.input_location == "S3":
            tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        assert sample_name in tasks_metaspades
        if self.input_location == "S3":
            tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        assert sample_name in tasks_prokka
        tasks_prokka[sample_name].in_fasta = tasks_metaspades[
            sample_name].out_fasta

    # 5. COMBINE ASSEMBLIES
    task_integrate_assemblies = self.new_task(
        "integrate_assemblies-{}".format(self.project_name),
        IntegrateAssembliesTask,
        output_prefix=self.project_name,
        output_folder=os.path.join(self.base_s3_folder,
                                   "integrated_assembly"),
        gff_folder=os.path.join(self.base_s3_folder, "prokka"),
        fastp_folder=os.path.join(self.base_s3_folder, "prokka"),
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=8,
            mem=120000,
            engine=self.engine,
            aws_s3_scratch_loc=self.aws_s3_scratch_loc,
            aws_batch_job_poll_sec=120,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_prefix="integrate_assemblies_{}".format(
                self.project_name),
            mounts={
                "/docker_scratch": {
                    "bind": self.temp_folder,
                    "mode": "rw"
                }
            }))
    task_integrate_assemblies.in_fastp_list = [
        t.out_faa for t in tasks_prokka.values()
    ]
    task_integrate_assemblies.in_gff_list = [
        t.out_gff for t in tasks_prokka.values()
    ]

    # 6. ALIGN AGAINST THE ASSEMBLY USING FAMLI
    # Iterate over all of the rows of samples
    for _, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        tasks_famli[sample_name] = self.new_task(
            "famli_{}".format(sample_name),
            FAMLITask,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "famli"),
            threads=self.famli_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.famli_threads),
                mem=int(self.famli_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="famli_{}".format(sample_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Connect the raw FASTQ input
        if self.input_location == "S3":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        # Connect the reference database
        tasks_famli[
            sample_name].in_ref_dmnd = task_integrate_assemblies.out_daa

    return tasks_famli, tasks_fastqp
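# AWS Batch job names tolerate only a narrow character set, which is why the
# workflows above pass every aws_batch_job_prefix through the same re.sub().
# A tiny standalone sketch of that sanitizer (safe_job_prefix is a
# hypothetical name for illustration):
import re


def safe_job_prefix(name):
    # Replace anything outside [a-zA-Z0-9-_] with an underscore
    return re.sub('[^a-zA-Z0-9-_]', '_', name)


# e.g. safe_job_prefix("famli_sample.1/lane2") -> "famli_sample_1_lane2"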
def workflow(self):
    # Make sure that the S3 folder is formatted with the proper prefix
    assert self.s3_folder.startswith("s3://")

    # Parse the bucket and key prefix for the S3 folder holding all results
    s3_bucket, s3_prefix = self.s3_folder[5:].split("/", 1)

    # Connect to S3
    s3 = boto3.resource('s3')

    # 1. Get the summary of all genomes
    genome_metadata_fp = os.path.join(s3_prefix, "patric_genome_metadata.tsv")
    print("Writing PATRIC genome metadata to s3://{}/{}".format(
        s3_bucket, genome_metadata_fp))
    with urlopen("ftp://ftp.patricbrc.org/RELEASE_NOTES/genome_metadata") as fi:
        s3.Bucket(s3_bucket).put_object(
            Key=genome_metadata_fp,
            Body=fi.read())

    # Now read in all of that information as a table
    genome_metadata = read_tsv_from_s3_as_dataframe(
        s3_bucket, genome_metadata_fp, sep="\t")

    # 2. Fetch the transcripts and annotation files for every genome
    fetch_transcripts_tasks = {}
    fetch_annotation_tasks = {}
    for genome_accession in map(str, genome_metadata.index.values):
        # NOTE: both suffixes below write to the same s3_url, so the second
        # transfer will overwrite the first when both files exist upstream
        fetch_annotation_tasks[genome_accession] = [
            self.new_task(
                "fetch_patric_annotations_{}".format(genome_accession),
                TransferFTPtoS3,
                ftp_url="ftp://ftp.patricbrc.org/genomes/{}/{}.{}".format(
                    genome_accession, genome_accession, suffix),
                s3_url=os.path.join(self.s3_folder, genome_accession,
                                    "annotation.tsv"),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=1000,
                    engine=self.engine,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "fetch_patric_annotations_{}".format(
                            genome_accession))))
            for suffix in ["PATRIC.pathway.tab", "RefSeq.pathway.tab"]
        ]
        fetch_transcripts_tasks[genome_accession] = [
            self.new_task(
                "fetch_patric_transcripts_{}".format(genome_accession),
                TransferFTPtoS3,
                ftp_url="ftp://ftp.patricbrc.org/genomes/{}/{}.{}".format(
                    genome_accession, genome_accession, suffix),
                s3_url=os.path.join(self.s3_folder, genome_accession,
                                    "transcripts.frn"),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=1000,
                    engine=self.engine,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "fetch_patric_transcripts_{}".format(
                            genome_accession))))
            for suffix in ["PATRIC.frn", "RefSeq.frn"]
        ]

    # 3. Make a flat file for the 16S records
    extract_all_16S = self.new_task(
        "extract_all_16S",
        Extract16S,
        s3_parent_folder=self.s3_folder,
        s3_url=os.path.join(self.s3_folder, "transcripts.fasta"),
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=1,
            mem=1000,
            engine=self.engine,
            aws_batch_job_poll_sec=120,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_prefix="extract_all_16s"))
    extract_all_16S.in_fastas = [
        genome_transcript[0].out_file
        for genome_transcript in fetch_transcripts_tasks.values()
    ]

    # 4. Make a flat file for the annotations
    extract_all_annotations = self.new_task(
        "extract_all_annotations",
        ExtractAnnotations,
        s3_parent_folder=self.s3_folder,
        s3_url=os.path.join(self.s3_folder, "annotations.tsv"),
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=1,
            mem=1000,
            engine=self.engine,
            aws_batch_job_poll_sec=120,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_prefix="extract_all_annotations"))
    extract_all_annotations.in_fastas = {
        genome_id: genome_transcript[0].out_file
        for genome_id, genome_transcript in fetch_transcripts_tasks.items()
    }
    extract_all_annotations.in_annotations = {
        genome_id: genome_annotation[0].out_file
        for genome_id, genome_annotation in fetch_annotation_tasks.items()
    }

    return extract_all_16S, extract_all_annotations
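# read_tsv_from_s3_as_dataframe() is called above but not defined in this
# section. A minimal sketch of what such a helper could look like, using only
# boto3 and pandas; the actual implementation elsewhere in the repo may
# differ:
import io

import boto3
import pandas as pd


def read_tsv_from_s3_as_dataframe(bucket, key, sep="\t"):
    # Fetch the object body and parse it as a delimited table
    obj = boto3.resource('s3').Object(bucket, key).get()
    return pd.read_csv(io.BytesIO(obj['Body'].read()), sep=sep)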
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    long_containerinfo = light_containerinfo
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')

    #
    # Load the refpkg (in tgz format)
    #
    refpkg_tgz = self.new_task(
        'load_refpkg_tgz',
        LoadRefpkgTGZ,
        path=self.refpkg_tgz,
        file_format='gzip',
    )

    # Load the jplace
    jplace = self.new_task(
        'load_jplace',
        LoadFile,
        path=self.jplace,
    )

    # Load the seq map
    seq_map = self.new_task('load_seq_map', LoadFile, path=self.seq_map_csv)

    # Load the weights if provided
    if self.sv_weights_csv:
        sv_weights = self.new_task('load_sv_weight',
                                   LoadFile,
                                   path=self.sv_weights_csv)
    else:
        sv_weights = None

    # Load the labels if provided
    if self.labels:
        labels = self.new_task('load_labels', LoadFile, path=self.labels)
    else:
        labels = None

    # And unpack the refpkg to the relevant bits
    refpkg_alignments = self.new_task(
        'refpkg_alignments',
        ExtractRefpkgAlignment,
        aln_fasta_fn=os.path.join(self.working_dir, 'placement',
                                  'refpkg.aln.fasta'),
        aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                'refpkg.aln.sto'),
    )
    refpkg_alignments.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

    #
    # Load the sequence variants (fasta format)
    #
    sv_fasta = self.new_task('load_sv',
                             LoadFastaSeqs,
                             fasta_seq_path=self.sv_fasta)

    #
    # Align the sequence variants
    #
    sv_aligned = self.new_task(
        'align_sv',
        CMAlignSeqs,
        containerinfo=heavy_containerinfo,
        alignment_sto_fn=os.path.join(self.working_dir, 'placement',
                                      'sv.aln.sto'),
        alignment_score_fn=os.path.join(self.working_dir, 'placement',
                                        'sv.aln.scores'),
    )
    sv_aligned.in_seqs = sv_fasta.out_seqs

    sv_aligned_fasta = self.new_task(
        'align_sv_to_fasta',
        AlignmentStoToFasta,
        align_fasta_fn=os.path.join(self.working_dir, 'placement',
                                    'sv.aln.fasta'),
    )
    sv_aligned_fasta.in_align_sto = sv_aligned.out_align_sto

    #
    # Combine the refpkg alignment with the sequence variant alignment
    #
    sv_refpkg_aln_sto = self.new_task(
        'combine_sv_refpkg_aln_sto',
        CombineAlignmentsSTO,
        containerinfo=light_containerinfo,
        combined_aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                         'sv_refpkg_aln.sto'))
    sv_refpkg_aln_sto.in_aln_sto_1 = refpkg_alignments.out_aln_sto
    sv_refpkg_aln_sto.in_aln_sto_2 = sv_aligned.out_align_sto

    #
    # Prep the placements.db using the refpkg
    #
    prepped_placementdb = self.new_task(
        'prep_placementdb',
        PlacementDB_Prep,
        containerinfo=light_containerinfo,
        placement_db_fn=os.path.join(self.destination_dir, 'classification',
                                     'placement.db'))
    prepped_placementdb.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

    #
    # Insert the seq_info / map of sv -> specimens
    #
    placement_db_w_si = self.new_task(
        'placement_db_add_si',
        PlacementDB_AddSI,
        containerinfo=light_containerinfo,
    )
    placement_db_w_si.in_placement_db = prepped_placementdb.out_placement_db
    placement_db_w_si.in_seq_map = seq_map.out_file

    #
    # Classify the sequence variants
    #
    placement_db_classified = self.new_task(
        'classify_into_placement_db',
        PlacementDB_Classify_SV,
        containerinfo=midcpu_containerinfo,
    )
    placement_db_classified.in_placement_db = placement_db_w_si.out_placement_db
    placement_db_classified.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    placement_db_classified.in_sv_refpkg_aln_sto = sv_refpkg_aln_sto.out_aln_sto
    placement_db_classified.in_jplace = jplace.out_file

    #
    # Multiclass concat names
    #
    placement_db_mcc = self.new_task(
        'placement_db_multiclass_concat',
        PlacementDB_MCC,
        containerinfo=long_containerinfo,
    )
    placement_db_mcc.in_placement_db = placement_db_classified.out_placement_db
    # NOTE: sv_weights is None when sv_weights_csv was not provided;
    # this assignment assumes weights were supplied
    placement_db_mcc.in_weights = sv_weights.out_file

    #
    # Tabular CSV outputs
    #
    tables_for_rank = {}
    for rank in ['phylum', 'class', 'order', 'family', 'genus', 'species']:
        tables_for_rank[rank] = self.new_task(
            'by_specimen_{}'.format(rank),
            GenerateTables,
            containerinfo=light_containerinfo,
            tables_path=os.path.join(self.destination_dir, 'classification',
                                     'tables'),
            rank=rank)
        tables_for_rank[
            rank].in_placement_db = placement_db_mcc.out_placement_db
        tables_for_rank[rank].in_seq_map = seq_map.out_file
        if labels:
            tables_for_rank[rank].in_labels = labels.out_file

    return (placement_db_mcc, tables_for_rank)
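# The 'light' / 'midcpu' / 'heavy' / 'highmem' sections read via
# ContainerInfo.from_config() above are assumed to live in an INI-style
# config file. A sketch of reading such a file with the standard library;
# the key names are guesses based on the ContainerInfo() keyword arguments
# used elsewhere in this code, not a documented format:
import configparser


def read_containerinfo_section(config_fp, section):
    config = configparser.ConfigParser()
    config.read(config_fp)
    # Pull out the per-tier resource requests
    return {
        'vcpu': config.getint(section, 'vcpu'),
        'mem': config.getint(section, 'mem'),
        'engine': config.get(section, 'engine'),
    }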
class Workflow_NCBI_16s(sl.WorkflowTask):
    #
    # Take a set of sequence variants in FASTA format and at least one
    # repository of reference sequences.
    # Search the repository / repositories for matches above a specified
    # threshold for the sequence variants.
    # Use those recruited full-length repo sequences to build a refpkg.
    #

    working_dir = sl.Parameter()
    ncbi_email = sl.Parameter()
    repo_url = sl.Parameter()
    example_seqs = sl.Parameter()

    heavy_containerinfo = sl.ContainerInfo(
        vcpu=36,
        mem=70000,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='aws_batch',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')
    light_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=2024,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='aws_batch',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')
    test_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=4096,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine=ENGINE,
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')
    local_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=4096,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='docker',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn='arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')

    def workflow(self):
        #
        # Load current accessions with 16S in a genome
        #
        repo_url = self.new_task(
            'load_repo_url',
            LoadFile,
            path=self.repo_url,
        )
        example_seqs = self.new_task('load_example_seqs',
                                     LoadFile,
                                     path=self.example_seqs)

        acc_genome_16s = self.new_task(
            'genome_16s_accessions',
            NT_AccessionsForQuery,
            containerinfo=self.test_containerinfo,
            email=self.ncbi_email,
            accessions_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                       'accession', 'genome_16s.csv'),
            query=("16s[All Fields] AND rRNA[Feature Key]"
                   " AND Bacteria[Organism]"
                   " AND 500000 : 99999999999[Sequence Length]"
                   " AND genome[All Fields]"),
        )

        repo_genome_update = self.new_task(
            'repo_genome_update',
            NT_Repo_Update_Accessions,
            extra_values={'is_genome': True},
        )
        repo_genome_update.in_repo_url = repo_url.out_file
        repo_genome_update.in_accessions = acc_genome_16s.out_accessions

        repo_filled = self.new_task(
            'repo_fill',
            NT_Repo_Fill,
            containerinfo=self.test_containerinfo,
            email=self.ncbi_email,
            working_dir=os.path.join(self.working_dir, 'ncbi_16s'),
        )
        repo_filled.in_repo = repo_genome_update.out_repo

        # Now dump out 16S / seq_info from the genomes.
        repo_dumped = self.new_task(
            'repo_dump',
            NT_Repo_Output_FastaSeqInfo,
            fn_fasta_gz=os.path.join(self.working_dir, 'ncbi_16s',
                                     'genomes.16s.fasta.gz'),
            fn_seq_info=os.path.join(self.working_dir, 'ncbi_16s',
                                     'genomes.16s.seq_info.csv'),
        )
        repo_dumped.in_repo = repo_filled.out_repo

        # Find genomes missing peptide / rRNA annotations
        prokka_annotation = self.new_task(
            'prokka_annotation',
            NT_Repo_Prokka,
            containerinfo=self.light_containerinfo,
            num_concurrent=100,
            workdir=os.path.join(self.working_dir, 'ncbi_16s', 'prokka'))
        prokka_annotation.in_repo = repo_filled.out_repo

        # NOTE: returning here makes everything below unreachable; this
        # looks like a leftover early return from development
        return prokka_annotation

        # Use cmsearch to be sure these are vaguely like rRNA
        cmsearch_verify = self.new_task(
            'cmsearch_verify',
            CMSearchVerify,
            containerinfo=self.heavy_containerinfo,
            results_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                    'genomes.16s.cmsearch.tsv'),
        )
        cmsearch_verify.in_seqs = repo_dumped.out_seqs

        # And filter to rRNA.
        verified_seqs = self.new_task(
            'verify_repo',
            VerifyRepo,
            containerinfo=self.heavy_containerinfo,
            uc_fn=os.path.join(self.working_dir, 'ncbi_16s',
                               'genomes.16s.verified.uc'),
            verified_seqs_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                          'genomes.16s.verified.fasta.gz'),
            unverified_seqs_fn=os.path.join(
                self.working_dir, 'ncbi_16s',
                'genomes.16s.unverified.fasta.gz'),
        )
        verified_seqs.in_repo_seqs = repo_dumped.out_seqs
        verified_seqs.in_expected_seqs = example_seqs.out_file

        return repo_dumped
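# A sketch of how a WorkflowTask like Workflow_NCBI_16s is typically launched
# with sciluigi. run_local() is part of the sciluigi interface; the parameter
# values below are placeholders, and the exact flag spelling follows how
# luigi renders each Parameter name:
import sciluigi as sl

if __name__ == '__main__':
    sl.run_local(
        main_task_cls=Workflow_NCBI_16s,
        cmdline_args=[
            '--working-dir=../working',
            '--ncbi-email=user@example.org',
            '--repo-url=repo.db',
            '--example-seqs=example.16s.fasta',
        ])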
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    long_containerinfo = light_containerinfo
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')

    #
    # Load the refpkg (in tgz format)
    #
    refpkg_tgz = self.new_task(
        'load_refpkg_tgz',
        LoadRefpkgTGZ,
        path=self.refpkg_tgz,
        file_format='gzip',
    )

    # Load the seq map
    seq_map = self.new_task('load_seq_map', LoadFile, path=self.seq_map_csv)

    # Load the weights if provided
    if self.sv_weights_csv:
        sv_weights = self.new_task('load_sv_weight',
                                   LoadFile,
                                   path=self.sv_weights_csv)
    else:
        sv_weights = None

    # And unpack the refpkg to the relevant bits
    refpkg_alignments = self.new_task(
        'refpkg_alignments',
        ExtractRefpkgAlignment,
        aln_fasta_fn=os.path.join(self.working_dir, 'placement',
                                  'refpkg.aln.fasta'),
        aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                'refpkg.aln.sto'),
    )
    refpkg_alignments.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

    #
    # Load the sequence variants (fasta format)
    #
    sv_fasta = self.new_task('load_sv',
                             LoadFastaSeqs,
                             fasta_seq_path=self.sv_fasta)

    #
    # Align the sequence variants
    #
    sv_aligned = self.new_task(
        'align_sv',
        CMAlignSeqs,
        containerinfo=heavy_containerinfo,
        alignment_sto_fn=os.path.join(self.working_dir, 'placement',
                                      'sv.aln.sto'),
        alignment_score_fn=os.path.join(self.working_dir, 'placement',
                                        'sv.aln.scores'),
    )
    sv_aligned.in_seqs = sv_fasta.out_seqs

    sv_aligned_fasta = self.new_task(
        'align_sv_to_fasta',
        AlignmentStoToFasta,
        align_fasta_fn=os.path.join(self.working_dir, 'placement',
                                    'sv.aln.fasta'),
    )
    sv_aligned_fasta.in_align_sto = sv_aligned.out_align_sto

    #
    # Combine the refpkg alignment with the sequence variant alignment
    #
    sv_refpkg_aln_sto = self.new_task(
        'combine_sv_refpkg_aln_sto',
        CombineAlignmentsSTO,
        containerinfo=heavy_containerinfo,
        combined_aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                         'sv_refpkg_aln.sto'))
    sv_refpkg_aln_sto.in_aln_sto_1 = refpkg_alignments.out_aln_sto
    sv_refpkg_aln_sto.in_aln_sto_2 = sv_aligned.out_align_sto

    #
    # Place the sequence variants using this combined alignment
    #
    dedup_jplace = self.new_task(
        'make_dedup_jplace',
        PPLACER_PlaceAlignment,
        containerinfo=heavy_containerinfo,
        jplace_fn=os.path.join(self.destination_dir, 'placement',
                               'dedup.jplace'))
    dedup_jplace.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    dedup_jplace.in_merged_aln_sto = sv_refpkg_aln_sto.out_aln_sto

    #
    # Reduplicate
    #
    if not sv_weights:
        redup_jplace = dedup_jplace
    else:
        redup_jplace = self.new_task(
            'reduplicate_jplace',
            Jplace_Reduplicate,
            containerinfo=light_containerinfo,
            jplace_fn=os.path.join(self.destination_dir, 'placement',
                                   'redup.jplace.gz'))
        redup_jplace.in_jplace = dedup_jplace.out_jplace
        redup_jplace.in_weights = sv_weights.out_file

    #
    # ADCL
    #
    adcl = self.new_task(
        'create_adcl',
        Jplace_ADCL,
        containerinfo=light_containerinfo,
        adcl_fn=os.path.join(self.destination_dir, 'placement', 'adcl.gz'))
    adcl.in_jplace = redup_jplace.out_jplace

    #
    # EDPL
    #
    edpl = self.new_task(
        'calculate_edpl',
        Jplace_EDPL,
        containerinfo=highmem_containerinfo,
        edpl_fn=os.path.join(self.destination_dir, 'placement', 'edpl.gz'))
    edpl.in_jplace = redup_jplace.out_jplace

    #
    # EPCA
    #
    epca = self.new_task(
        'calculate_epca',
        Jplace_PCA,
        containerinfo=long_containerinfo,
        path=os.path.join(self.destination_dir, 'placement', 'pca'),
        prefix='epca',
        pca='epca')
    epca.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    epca.in_seq_map = seq_map.out_file
    epca.in_jplace = redup_jplace.out_jplace

    #
    # LPCA
    #
    lpca = self.new_task(
        'calculate_lpca',
        Jplace_PCA,
        containerinfo=highmem_containerinfo,
        path=os.path.join(self.destination_dir, 'placement', 'pca'),
        prefix='lpca',
        pca='lpca')
    lpca.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    lpca.in_seq_map = seq_map.out_file
    lpca.in_jplace = redup_jplace.out_jplace

    #
    # KR-distance
    #
    kr_distance = self.new_task(
        'calculate_kr_distance',
        Jplace_KR_Distance,
        containerinfo=long_containerinfo,
        kr_fn=os.path.join(self.destination_dir, 'placement',
                           'kr_distance.csv'),
    )
    kr_distance.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    kr_distance.in_seq_map = seq_map.out_file
    kr_distance.in_jplace = redup_jplace.out_jplace

    #
    # Alpha-diversity
    #
    alpha_diversity = self.new_task(
        'calculate_alpha_diversity',
        Jplace_Alpha_Diversity,
        containerinfo=light_containerinfo,
        alpha_diversity_fn=os.path.join(self.destination_dir, 'placement',
                                        'alpha_diversity.csv'),
    )
    alpha_diversity.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
    alpha_diversity.in_seq_map = seq_map.out_file
    alpha_diversity.in_jplace = redup_jplace.out_jplace

    return (epca, lpca, adcl, edpl, kr_distance, alpha_diversity)
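# Jplace_Reduplicate above restores the abundance information that was lost
# when identical reads were collapsed into unique sequence variants before
# placement. A conceptual sketch of the idea on the jplace JSON structure
# (per the published jplace standard, each placement names its reads either
# as "n" or as "nm" [name, multiplicity] pairs); this is an illustration of
# the concept, not the task's actual code:
import json


def reduplicate(jplace_fp, weights, out_fp):
    # weights: dict mapping sequence-variant name -> total read count
    with open(jplace_fp) as fh:
        placed = json.load(fh)
    for placement in placed['placements']:
        # Normalize "n" (names only) to "nm", then scale multiplicities
        names = placement.pop('n', None)
        nm = placement.get('nm', [[n, 1] for n in (names or [])])
        placement['nm'] = [[name, weights.get(name, mult)]
                           for name, mult in nm]
    with open(out_fp, 'w') as fh:
        json.dump(placed, fh)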
def workflow(self):
    # Input files are located either in SRA or AWS S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Make tasks that will make sure the reference databases exist
    ref_db_dmnd = self.new_task("load_ref_db_dmnd",
                                LoadFile,
                                path=self.ref_db_dmnd)
    ref_db_metadata = self.new_task("load_ref_db_metadata",
                                    LoadFile,
                                    path=self.ref_db_metadata)

    # Keep track of all of the jobs for getting the input files
    tasks_load_inputs = {}
    # Keep track of all of the jobs for aligning against the viral database
    tasks_map_viruses = {}
    # Assembling datasets de novo
    tasks_metaspades = {}
    # Running VirFinder on assembled contigs
    tasks_virfinder = {}

    # Iterate over all of the rows of samples
    for ix, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # If the inputs are on SRA, execute jobs that will download them
        if self.input_location == "SRA":
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_sra_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=4096,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "download_from_sra_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            # Make sure the file exists on S3
            assert self.input_location == "S3"
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)

        # Make a task to align the reads, wherever they came from
        tasks_map_viruses[sample_name] = self.new_task(
            "map_viruses_{}".format(sample_name),
            MapVirusesTask,
            output_folder=os.path.join(self.base_s3_folder,
                                       self.mapping_output_folder),
            sample_name=sample_name,
            threads=self.align_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.align_threads),
                mem=int(self.align_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "map_viruses_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # De novo assembly with metaSPAdes
        tasks_metaspades[sample_name] = self.new_task(
            "metaspades_{}".format(sample_name),
            AssembleMetaSPAdes,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder, "metaspades"),
            threads=self.assemble_threads,
            max_mem=int(int(self.assemble_mem) / 1000),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.assemble_threads),
                mem=int(self.assemble_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "metaspades_{}".format(sample_name)),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Run VirFinder on the assembled contigs
        tasks_virfinder[sample_name] = self.new_task(
            "virfinder_{}".format(sample_name),
            VirFinderTask,
            base_s3_folder=self.base_s3_folder,
            sample_name=sample_name,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.align_threads),
                mem=int(self.align_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix=re.sub(
                    '[^a-zA-Z0-9-_]', '_',
                    "virfinder_{}".format(sample_name)),
            ))

    # Assign the output from tasks_load_inputs to the input to tasks_map_viruses
    for sample_name in tasks_load_inputs:
        assert sample_name in tasks_map_viruses
        # Assign the input for the reference database
        tasks_map_viruses[sample_name].in_ref_db_dmnd = ref_db_dmnd.out_file
        tasks_map_viruses[
            sample_name].in_ref_db_metadata = ref_db_metadata.out_file
        # NOTE: ImportSRAFastq tasks elsewhere in this codebase expose
        # out_fastq, so the SRA branch may need out_fastq rather than
        # out_file here
        tasks_map_viruses[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file

        # VirFinder depends on metaspades
        tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
            sample_name].out_file
        tasks_virfinder[sample_name].in_fasta = tasks_metaspades[
            sample_name].out_fasta

    return tasks_map_viruses, tasks_virfinder
def workflow(self):
    # Make sure the project name is alphanumeric (underscores allowed)
    assert all([s.isalnum() or s == "_" for s in self.project_name
                ]), "Project name must be alphanumeric"

    # Data can come from either SRA or S3
    assert self.input_location in ["SRA", "S3"]

    # Read in the metadata sheet
    metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
    for col_name in [self.input_column_name, self.sample_column_name]:
        assert col_name in metadata.columns, "{} not found in {}".format(
            col_name, self.metadata_fp)
        # Make sure that all samples and files are unique
        assert metadata[col_name].unique().shape[0] == metadata.shape[0]

    # Keep track of the jobs for each step, for each sample
    tasks_load_inputs = {}
    tasks_famli = {}

    # 0. LOAD THE DATABASE
    # (created once, outside the sample loop, since every sample shares it)
    tasks_load_db = self.new_task("load_db_from_s3",
                                  LoadFile,
                                  path=self.famli_db_location)

    # Iterate over all of the rows of samples
    for _, r in metadata.iterrows():
        # Get the sample name and the file location
        sample_name = r[self.sample_column_name]
        input_path = r[self.input_column_name]

        # Make a UUID to isolate temp files for this task from any others
        task_uuid = str(uuid.uuid4())[:8]

        # 1. LOAD THE INPUT FILES
        if self.input_location == "S3":
            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)
        elif self.input_location == "SRA":
            assert input_path.startswith("SRR"), input_path
            tasks_load_inputs[sample_name] = self.new_task(
                "download_from_SRA_{}".format(sample_name),
                ImportSRAFastq,
                sra_accession=input_path,
                base_s3_folder=self.base_s3_folder,
                input_mount_point="/scratch/{}_get_sra/input/".format(
                    task_uuid),
                output_mount_point="/scratch/{}_get_sra/output/".format(
                    task_uuid),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=32000,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "get_sra_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        else:
            raise Exception("Data must be from S3 or SRA")

        # 2. ALIGN AGAINST THE DATABASE USING FAMLI
        tasks_famli[sample_name] = self.new_task(
            "famli_{}".format(sample_name),
            FAMLITask,
            sample_name=sample_name,
            output_folder=os.path.join(self.base_s3_folder,
                                       self.output_folder),
            threads=self.famli_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.famli_threads),
                mem=int(self.famli_mem),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="famli_{}".format(sample_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Connect the raw FASTQ input
        if self.input_location == "S3":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
        elif self.input_location == "SRA":
            tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        # Connect the reference database
        tasks_famli[sample_name].in_ref_dmnd = tasks_load_db.out_file

    return tasks_famli
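# Several workflows above repeat the same branch: LoadFile tasks expose
# out_file while ImportSRAFastq tasks expose out_fastq. A hypothetical helper
# that would collapse those branches (a refactoring sketch, not existing
# code in this repo):
def fastq_output(load_task, input_location):
    # S3 inputs are plain LoadFile tasks; SRA inputs are ImportSRAFastq tasks
    if input_location == "S3":
        return load_task.out_file
    elif input_location == "SRA":
        return load_task.out_fastq
    raise ValueError("Data must be from S3 or SRA")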
def workflow(self):
    # Initialize our container info
    light_containerinfo = sl.ContainerInfo()
    light_containerinfo.from_config(section='light')
    midcpu_containerinfo = sl.ContainerInfo()
    midcpu_containerinfo.from_config(section='midcpu')
    heavy_containerinfo = sl.ContainerInfo()
    heavy_containerinfo.from_config(section='heavy')
    highmem_containerinfo = sl.ContainerInfo()
    highmem_containerinfo.from_config(section='highmem')

    #
    # Build our taxonomy db
    #
    taxonomy_db = self.new_task(
        'taxonomy_db',
        BuildTaxtasticDB,
        containerinfo=light_containerinfo,
        tax_db_path=os.path.join(self.working_dir, 'refpkg', 'taxonomy.db'))

    #
    # Load the sequence variants
    #
    sequence_variants = self.new_task(
        'load_sequence_variants',
        LoadFastaSeqs,
        fasta_seq_path=self.sequence_variants_path)
    log.info("Loaded sequence variants")

    # Load the sequence information
    seq_info_files = [
        self.new_task('load_si_{}'.format(si_i), LoadFile, path=si_path)
        for si_i, si_path in enumerate(self.repo_seq_info.split(','))
    ]
    log.info("Loaded %d sequence information files", len(seq_info_files))

    #
    # Load the annotated repositories
    #
    repo_annotated = [
        self.new_task('load_annotated_repo_{}'.format(r_i),
                      LoadFastaSeqs,
                      fasta_seq_path=r_path)
        for r_i, r_path in enumerate(self.repo_annotated_fasta.split(','))
    ]
    log.info("Loaded %d Annotated Repositories", len(repo_annotated))

    #
    # Search the sequence variants in the annotated repository
    #
    search_sv_annotated = []
    for ra_i, r_annotated in enumerate(repo_annotated):
        r_a_task = self.new_task(
            'search_sv_annotated_{}'.format(ra_i),
            SearchRepoForMatches,
            containerinfo=midcpu_containerinfo,
            matches_uc_path=os.path.join(
                self.working_dir, 'refpkg',
                'repo.annotated__{}.matches.uc'.format(ra_i)),
            unmatched_exp_seqs_path=os.path.join(
                self.working_dir, 'refpkg',
                'repo.annotated__{}.annotated.exp_seqs_unmatched.fasta'.format(
                    ra_i)),
            matched_repo_seqs_path=os.path.join(
                self.working_dir, 'refpkg',
                'repo.annotated__{}.recruited_repo_seqs.fasta'.format(ra_i)),
            min_id=self.min_id_annotated,
            # Default: take the top 10 hits (roughly corresponding to a
            # 95% identity for most)
            maxaccepts=10,
        )
        r_a_task.in_exp_seqs = sequence_variants.out_seqs
        r_a_task.in_repo_seqs = r_annotated.out_seqs
        search_sv_annotated.append(r_a_task)

    #
    # Combine recruits into one file
    #
    combined_repo_matches = self.new_task(
        'combine_repo_matches',
        CombineRepoMatches,
        seqs_fn=os.path.join(self.working_dir, 'refpkg',
                             'combined.repo.matches.fasta'),
        seq_info_fn=os.path.join(self.working_dir, 'refpkg',
                                 'combined.repo.matches.seq_info.csv'),
    )
    combined_repo_matches.in_seqs = [
        ssv.out_matched_repo_seqs for ssv in search_sv_annotated
    ]
    combined_repo_matches.in_seq_info = [
        sif.out_file for sif in seq_info_files
    ]
    refpkg_seqs = combined_repo_matches.out_seqs
    refpkg_seqinfo = combined_repo_matches.out_seq_info

    #
    # Verify the taxonomy for the refpkg seqinfo file.
    #
    verified_refpkg_seqinfo = self.new_task(
        'verify_refpkg_seqinfo_taxonomy',
        ConfirmSeqInfoTaxonomy,
        email=self.entrez_email,
        containerinfo=light_containerinfo,
        confirmed_seqinfo_path=os.path.join(
            self.working_dir, 'refpkg', 'seq_info.refpkg.verified_tax.csv'))
    verified_refpkg_seqinfo.in_seq_info = refpkg_seqinfo
    verified_refpkg_seqinfo.in_tax_db = taxonomy_db.out_tax_db

    # TODO: Parse the UC file to determine if we achieved our minimum-best
    # goal for each SV.

    #
    # Align recruited repo seqs
    #
    align_recruits = self.new_task(
        'align_recruits',
        CMAlignSeqs,
        containerinfo=highmem_containerinfo,
        alignment_sto_fn=os.path.join(self.working_dir, 'refpkg',
                                      'recruit.aln.sto'),
        alignment_score_fn=os.path.join(self.working_dir, 'refpkg',
                                        'recruit.aln.scores'),
    )
    align_recruits.in_seqs = refpkg_seqs

    #
    # Make a fasta version of the alignment
    #
    align_fasta = self.new_task(
        'align_fasta',
        AlignmentStoToFasta,
        align_fasta_fn=os.path.join(self.working_dir, 'refpkg',
                                    'recruit.aln.fasta'),
    )
    align_fasta.in_align_sto = align_recruits.out_align_sto

    #
    # Make a tree of the reference package sequences
    #
    raxml_tree = self.new_task(
        'raxml_tree',
        RAxMLTree,
        containerinfo=heavy_containerinfo,
        tree_path=os.path.join(self.working_dir, 'refpkg', 'refpkg.tre'),
        tree_stats_path=os.path.join(self.working_dir, 'refpkg',
                                     'refpkg.tre.info'),
    )
    raxml_tree.in_align_fasta = align_fasta.out_align_fasta

    #
    # Clean up the tree info to remove cruft
    #
    tree_info_cleanup = self.new_task(
        'tree_info_cleanup',
        CleanupTreeInfo,
        tree_info_path=os.path.join(self.working_dir, 'refpkg',
                                    'refpkg.tre.cleaned.info'),
    )
    tree_info_cleanup.in_tree_info = raxml_tree.out_tree_stats

    #
    # Start to assemble the reference package at this point
    #

    # Taxtable
    refpkg_taxtable = self.new_task(
        'refpkg_taxtable',
        TaxTableForSeqInfo,
        containerinfo=light_containerinfo,
        taxtable_path=os.path.join(self.working_dir, 'refpkg',
                                   'taxtable.csv'))
    refpkg_taxtable.in_seq_info = verified_refpkg_seqinfo.out_seq_info
    refpkg_taxtable.in_tax_db = taxonomy_db.out_tax_db

    # Covariance matrix
    obtain_cm = self.new_task(
        'obtain_cm',
        ObtainCM,
        containerinfo=light_containerinfo,
        cm_destination=os.path.join(self.working_dir, 'refpkg',
                                    'rRNA_16S_SSU.cm'))

    # And the actual combination step
    combine_refpkg = self.new_task(
        'combine_refpkg',
        CombineRefpkg,
        containerinfo=light_containerinfo,
        refpkg_path=os.path.join(self.new_refpkg_path, 'refpkg'),
        refpkg_name=self.new_refpkg_name,
    )
    combine_refpkg.in_aln_fasta = align_fasta.out_align_fasta
    combine_refpkg.in_aln_sto = align_recruits.out_align_sto
    combine_refpkg.in_tree = raxml_tree.out_tree
    combine_refpkg.in_tree_stats = tree_info_cleanup.out_tree_info
    combine_refpkg.in_taxtable = refpkg_taxtable.out_taxtable
    combine_refpkg.in_seq_info = verified_refpkg_seqinfo.out_seq_info
    combine_refpkg.in_cm = obtain_cm.out_cm

    return combine_refpkg

    # NOTE: everything below is unreachable (it follows the return above)
    # and references names that are never defined in this workflow
    # (search_sv_genomes, search_sv_filtered, repo_genomes_seq_info,
    # repo_filtered_seq_info); it appears to be leftover from an earlier
    # version and is preserved here as-is.

    #
    # Combine the sequences, avoiding duplicate sequences
    #
    combined_recruits = self.new_task(
        'combine_repo_recruits',
        CombineRepoMatches,
        seqs_fn=os.path.join(self.working_dir, 'refpkg',
                             'recruits.combined.fasta'),
        seq_info_fn=os.path.join(self.working_dir, 'refpkg',
                                 'recruits.combined.seq_info.csv'))
    combined_recruits.in_seqs = [
        search_sv_genomes.out_matched_repo_seqs,
        search_sv_filtered.out_matched_repo_seqs,
    ]
    combined_recruits.in_seq_info = [
        repo_genomes_seq_info.out_file,
        repo_filtered_seq_info.out_file,
    ]

    refpkg_taxtable = self.new_task(
        'refpkg_taxtable',
        TaxTableForSeqInfo,
        containerinfo=self.local_containerinfo,
        taxtable_path=os.path.join(self.working_dir, 'refpkg',
                                   'taxtable.csv'))
    refpkg_taxtable.in_seq_info = combined_recruits.out_seq_info
    refpkg_taxtable.in_tax_db = taxonomy_db.out_tax_db

    obtain_cm = self.new_task(
        'obtain_cm',
        ObtainCM,
        containerinfo=self.local_containerinfo,
        cm_destination=os.path.join(self.working_dir, 'refpkg',
                                    'rRNA_16S_SSU.cm'))

    combine_refpkg = self.new_task(
        'combine_refpkg',
        CombineRefpkg,
        containerinfo=self.local_containerinfo,
        refpkg_path=os.path.join(self.working_dir, 'refpkg'),
        refpkg_name='test',
    )
    combine_refpkg.in_aln_fasta = align_fasta.out_align_fasta
    combine_refpkg.in_aln_sto = align_recruits.out_align_sto
    combine_refpkg.in_tree = raxml_tree.out_tree
    combine_refpkg.in_tree_stats = raxml_tree.out_tree_stats
    combine_refpkg.in_taxtable = refpkg_taxtable.out_taxtable
    combine_refpkg.in_seq_info = combined_recruits.out_seq_info
    combine_refpkg.in_cm = obtain_cm.out_cm

    return combine_refpkg
def workflow(self):
    # Load the input file
    genome_fasta = self.new_task("load_genome_fasta",
                                 LoadFile,
                                 path=self.genome_fasta)

    # Run Prokka
    annotate_prokka = self.new_task(
        "annotate_prokka_{}".format(self.genome_name),
        AnnotateProkka,
        sample_name=self.genome_name,
        output_folder=os.path.join(self.base_s3_folder, "prokka"),
        threads=self.checkm_threads,
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=int(self.checkm_threads),
            mem=int(self.checkm_memory),
            engine=self.engine,
            aws_s3_scratch_loc=self.aws_s3_scratch_loc,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_name="annotate_prokka_{}".format(self.genome_name),
            mounts={
                "/docker_scratch": {
                    "bind": self.temp_folder,
                    "mode": "rw"
                }
            }))
    # Link the file for prokka annotation
    annotate_prokka.in_fasta = genome_fasta.out_file

    # Run CheckM
    checkm = self.new_task(
        "checkm_{}".format(self.genome_name),
        CheckM,
        sample_name=self.genome_name,
        output_folder=os.path.join(self.base_s3_folder, "checkm"),
        threads=8,
        temp_folder=self.temp_folder,
        containerinfo=sl.ContainerInfo(
            vcpu=8,
            mem=64000,
            engine=self.engine,
            aws_s3_scratch_loc=self.aws_s3_scratch_loc,
            aws_jobRoleArn=self.aws_job_role_arn,
            aws_batch_job_queue=self.aws_batch_job_queue,
            aws_batch_job_name="checkm_{}".format(self.genome_name),
            mounts={
                "/docker_scratch": {
                    "bind": self.temp_folder,
                    "mode": "rw"
                }
            }))
    # Link the protein coding sequences from Prokka into the inputs for CheckM
    checkm.in_faa = annotate_prokka.out_faa

    return checkm