class Main(Step): """From multi-lane FASTQ files to merged duplicate-marked BAMs, with conditional execution depending on QC metrics. To run locally, change into project directory and type: $ python -m sampleqc run --manifest_filename manifest.tsv --project_name my_project """ manifest_filename = Input(str) project_name = Input(str) qc_summary = Output(File) def execute(self): "Main execution method. Execution starts here." # setup execution project, stage apps, ref files Context().initialize(project_name=self.project_name) # parse manifest into cohort, import fastq files, set metadata cohort = load_manifest(self.manifest_filename) # process samples in loop # note: processing happens in parallel due to use of promises processed_bams = [ ProcessSample(fastqs=s.fastqs, name_=s.id).processed_bam for s in cohort.samples ] # collect BAM QC metrics and upload summary file self.qc_summary = CollectAndUploadQCSummary( processed_bams=processed_bams).uploaded_file
class WESsomatic(AppStep):
    case_id = Input(str)
    tumor_bam = Input(File)
    normal_bam = Input(File)
    annotated_mutect_variants = Output(File)

    def execute(self):
        ctx = Context()
        self.run_task(
            app_name="wes_somatic",
            inputs={
                "tumor_reads": self.tumor_bam,
                "normal_reads": self.normal_bam,
                "target_bed": ctx.refs["v5_core"],
                "kgsnp_database": ctx.refs["g1k_snps"],
                "kgindel_database": ctx.refs["g1k_indels"],
                "mgindel_database": ctx.refs["hc_indels"],
                "snpEff_database": ctx.refs["snpeff"],
                "cosmic_database": ctx.refs["cosmic"],
                "cache_file": ctx.refs["vep"],
                "annotation_reference": ctx.refs["grch37_fasta"],
                "ExAC_database": ctx.refs["exac"],
                "input_tar_with_reference": ctx.refs["hg19_fasta"],
                "dbSNP": ctx.refs["dbsnp_138"],
            },
            task_name="WESsomatic-" + self.case_id,
        )
        self.annotated_mutect_variants = self.task.outputs[
            "annotated_mutect_variants"]
class Main(Step): """Example automation that imports FASTq files from volume, aligns FASTq files with BWA, and exports resulting BAM files back to volume. Inputs of the automation are name of the SB project, SB volume ID, and volume source and destination directories. To run this automation on your computer, type the following command while inside the project root directory: python -m app run --project_name <project-name> [--volume_id <volume-id> --src_dir <source-directory> --dest_dir <destination-directory>] """ project_name = Input(str, description="Name of SB execution project") volume_id = Input( str, description="ID of volume for file import and export", default="external-demos/volumes_api_demo_bucket", ) src_dir = Input( str, description="Source directory on volume containing input FASTq files", default="", ) dest_dir = Input( str, description= "Target directory on volume where outputs will be exported to", default="automation/import-run-export/result", ) def execute(self): "Execution starts here." # initialize automation context with execution project and volume ctx = Context().initialize(self.project_name, self.volume_id) # stage input FASTq files and group them into samples samples = ImportFiles(src_dir=self.src_dir).samples # run BWA for each sample; samples are processed in parallel # because app outputs are promises and we can access them # before output values become available (lazy evaluation) bams = [] for sample in samples: bwa = BWAmem(f"BWAmem-{sample.sample_id}", input_reads=sample.fastq_files) bams.append(bwa.aligned_reads) # export all BAM files to volume; export step starts executing # as soon as all BAM files have become available ExportFiles(files=bams, to_volume=ctx.volume, prefix=self.dest_dir, overwrite=True)
class ProcessBam(Step): """Processes single BAM file with execution conditioned on automation setting (static conditional) and alignment QC metric (dynamic conditional).""" input_bam = Input(File) processed_bam = Output(File) qc_metrics = Output(QCMetrics) def execute(self): # compute alignment quality metrics and provide on output picard = PicardAlignmentSummaryMetrics(input_bam=self.input_bam) self.qc_metrics = picard.qc_metrics # if mark duplicates is not required return input BAM; # note: static conditional that does not cause exeuction block if self.config_.skip_duplicate_marking: self.processed_bam = self.input_bam return # if BAM failed QC do not process further and return input BAM; # note: dynamic conditional that blocks this thread until QC # metrics finished computing if not bam_qc_metrics_ok(self.qc_metrics, self.config_): logging.info(f"Sample failed QC: {self.input_bam.name}") self.processed_bam = self.input_bam return # mark duplicates and return de-duped BAM as result self.processed_bam = PicardMarkDuplicates(input_bam=self.input_bam).deduped_bam
class ProcessSample(Step):
    "Processes a single sample"

    fastqs = Input(List[File])
    aligned_bam = Output(File)
    processed_bam = Output(File)
    bam_qc_metrics = Output(QCMetrics)

    def execute(self):
        # run Trim Galore on all lanes
        tg = Trimgalore(reads=self.fastqs, paired=True, fastqc=True)

        # only keep FASTQ pairs that meet quality cutoff
        filtered = FilterFastq(input_fastq=tg.trimmed_reads)

        # run BWA on remaining lanes and provide BAM on output;
        # immediately unblocks other steps waiting for BAM output,
        # even before this execute() function finishes
        self.aligned_bam = BWAmem(fastqs=filtered.pass_fastq).merged_bam

        # process BAM with conditional execution inside
        process_bam = ProcessBam(input_bam=self.aligned_bam)

        # return processed BAM and BAM QC metrics as output
        self.processed_bam = process_bam.processed_bam
        self.bam_qc_metrics = process_bam.qc_metrics
class Trimgalore(AppStep):
    reads = Input(List[File])
    paired = Input(bool)
    fastqc = Input(bool)
    trimmed_reads = Output(List[File])

    def execute(self):
        self.run_task(
            app_name="trimgalore",
            inputs={
                "reads": self.reads,
                "paired": self.paired,
                "fastqc": self.fastqc
            },
            task_name="Trimgalore-" + self.reads[0].metadata["sample_id"],
        )
        self.trimmed_reads = self.task.outputs["trimmed_reads"]
class FilterFastq(Step):
    "Filters out FASTQ files not meeting QC criteria"

    input_fastq = Input(List[File])
    pass_fastq = Output(List[File])

    def execute(self):
        self.pass_fastq = [
            fq for fq in self.input_fastq
            if fq.size > self.config_.qc.min_fastq_size
        ]
class BAMprep(AppStep):
    sample_id = Input(str)
    fastq_files = Input(List[File])
    dedup_metrics = Output(File)
    recal_table = Output(File)
    alignment_metrics = Output(File)
    hs_metrics = Output(File)
    per_target_coverage = Output(File)
    output_bam = Output(File)
    median_target_coverage = Output(int)

    def execute(self):
        ctx = Context()
        self.run_task(
            app_name="bam_prep",
            inputs={
                "input_reads": self.fastq_files,
                "input_tar_with_reference": ctx.refs["hg19_fasta"],
                "bait_bed": ctx.refs["sureselect_xt"],
                "target_bed": ctx.refs["v5_core"],
                "kgsnp_database": ctx.refs["g1k_snps"],
                "mgindel_database": ctx.refs["hc_indels"],
            },
            task_name="BAMprep-" + self.sample_id,
        )
        self.dedup_metrics = self.task.outputs["dedup_metrics"]
        self.recal_table = self.task.outputs["recal_table"]
        self.alignment_metrics = self.task.outputs["alignment_metrics"]
        self.hs_metrics = self.task.outputs["hs_metrics"]
        self.per_target_coverage = self.task.outputs["per_target_coverage"]
        self.output_bam = self.task.outputs["output_bam"]
        self.median_target_coverage = self.get_median_target_coverage(
            self.hs_metrics)

    def get_median_target_coverage(self, file):
        "Parses median target coverage from hs metrics file"
        for line in file.content().split("\n"):
            if line.startswith("SureSelect"):
                return int(line.strip().split("\t")[23])
class Main(Step):
    file_name = Input(str)

    def execute(self):
        with open(str(self.file_name), "r") as f:
            counts = [
                WordCounter(f"counter{idx}", line=line).count
                for idx, line in enumerate(f)
            ]
        logging.info(f"Found {sum(counts)} words.")
class ProcessSample(Step): "Processes a single sample" fastqs = Input(List[File]) processed_bam = Output(ProcessedBam) def execute(self): tg = Trimgalore(reads=self.fastqs, paired=True, fastqc=True) filter = FilterFastq(input_fastq=tg.trimmed_reads) aligned_bam = BWAmem(fastqs=filter.pass_fastq).merged_bam self.processed_bam = ProcessBam(input_bam=aligned_bam).processed_bam
class ImportFilesAndGroupBySample(Step):
    """Finds FASTQ files on volume, imports them into project, sets file
    metadata, and returns updated files grouped by sample"""

    src_dir = Input(VolumeFolder)
    samples = Output(List[Sample])

    def execute(self):
        imported_files = self.import_files_from_volume()
        updated_files = self.update_file_metadata(imported_files)
        self.samples = self.group_files_by_sample(updated_files)

    def import_files_from_volume(self):
        "Imports all FASTQ files found at volume source location"
        volume = SBApi().volumes.get(self.src_dir.volume_id)
        fastq_paths = [
            l.location for l in volume.list(prefix=self.src_dir.prefix)
            if "TCRBOA7" in l.location and l.location.endswith(".fastq")
        ]
        return FindOrImportFiles(
            filepaths=fastq_paths,
            from_volume=volume,
            to_project=Context().project,
        ).imported_files

    def update_file_metadata(self, files):
        """Sets file metadata in bulk for list of files based on file names.
        Setting metadata in bulk instead of per-file reduces API calls.
        Example filename: TCRBOA7-N-WEX-TEST.read1.fastq"""
        metadata = []
        for file in files:
            sample_id = file.name.split("-WEX")[0]
            paired_end = file.name.split("read")[1].split(".")[0]
            metadata.append({
                "sample_id": sample_id,
                "paired_end": paired_end
            })
        return SetMetadataBulk(to_files=files,
                               metadata=metadata).updated_files

    def group_files_by_sample(self, files):
        """Groups files into list of sample objects for easier
        downstream processing."""
        samples = {}
        for file in files:
            sample_id = file.metadata["sample_id"]
            if sample_id not in samples:
                samples[sample_id] = Sample(sample_id)
            samples[sample_id].fastq_files.append(file)
        return list(samples.values())
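# The `Sample` container used for grouping is not defined in this excerpt.
# A minimal sketch consistent with its usage above (`Sample(sample_id)`,
# `.sample_id`, `.fastq_files.append(...)`); assumed, not from the source:


class Sample:
    "Groups FASTQ files belonging to one sample."

    def __init__(self, sample_id):
        self.sample_id = sample_id
        self.fastq_files = []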
class PicardMarkDuplicates(AppStep):
    input_bam = Input(File)
    deduped_bam = Output(File)

    def execute(self):
        self.run_task(
            app_name="markdup",
            inputs={"input_bam": [self.input_bam]},
            task_name="MarkDup-" + self.input_bam.metadata["sample_id"],
        )
        self.deduped_bam = self.task.outputs["deduped_bam"]
class MultiQC(AppStep):
    input_files = Input(List[File])
    config_files = Input(Optional[List[File]])
    sample_names = Input(Optional[File])
    html_report = Output(File)
    pdf_report = Output(File)

    def execute(self):
        self.run_task(
            app_name="multi_qc",
            inputs={
                "in_reports": self.input_files,
                "config": self.config_files,
                "sample_names": self.sample_names,
                "pdf": True,
            },
            task_name="MultiQC",
        )
        self.html_report = self.task.outputs["out_html"]
        self.pdf_report = self.task.outputs["out_pdf"]
class Main(Step): """From multi-lane FASTQ files to merged duplicate-marked BAMs, with conditional execution depending on QC metrics. To run locally, change into project directory and type: $ python -m sampleqc run --manifest_filename manifest.tsv --project_name my_project """ manifest_filename = Input(str) project_name = Input(str) qc_summary = Output(File) def execute(self): "Main execution method. Execution starts here." # setup execution project, stage apps, ref files Context().initialize(project_name=self.project_name) # parse manifest into cohort, import fastq files, set metadata cohort = load_manifest(self.manifest_filename) for sample in cohort.samples: # process sample in seprate step # step must be named explicitly b/c of loop ps = ProcessSample(f"Process-{sample.id}", fastqs=sample.fastqs) # collect results for downstream aggregation steps sample.aligned_bam = ps.aligned_bam sample.bam_qc_metrics = ps.bam_qc_metrics # upload QC metrics summary file to SB platform and # provide uploaded file on output self.qc_summary = CollectAndUploadQCSummary( qc_metrics=[s.bam_qc_metrics for s in cohort.samples] ).uploaded_file
class CollectAndUploadQCSummary(Step):
    """Collects BAM QC metrics from all processed samples and uploads
    summary file in tab-separated format to SB project. Overwrites
    existing file. Returns uploaded file object."""

    qc_metrics = Input(List[QCMetrics])
    uploaded_file = Output(File)

    def execute(self):
        # NOTE: don't create a transient temporary file (not thread safe)
        # because the actual upload happens in another thread
        temp_filename = tempfile.gettempdir() + "/bam_qc_metrics.tsv"
        temp = open(temp_filename, "wt")

        # write header
        temp.write("\t".join([
            "sample_id",
            "bam_file",
            "pct_pf_reads_aligned",
            "strand_balance",
            "status",
        ]) + "\n")

        # write content
        for qc in self.qc_metrics:
            temp.write("\t".join([
                qc.bam_file.metadata["sample_id"],
                qc.bam_file.name,
                str(qc.pct_pf_reads_aligned),
                str(qc.strand_balance),
                "PASS" if bam_qc_metrics_ok(qc, self.config_) else "FAIL",
            ]) + "\n")
        temp.close()

        # upload file to platform (overwrites existing file)
        self.uploaded_file = UploadFile(local_path=temp.name,
                                        to_project=Context().project,
                                        overwrite=True).file
class BWAmem(AppStep):
    fastqs = Input(List[File])
    merged_bam = Output(File)

    def execute(self):
        ctx = Context()
        self.run_task(
            app_name="bwa",
            inputs={
                "FASTQ": self.fastqs,
                "Input_reference": ctx.refs["bwa_bundle"]
            },
            task_name="BWAmem-" + self.fastqs[0].metadata["sample_id"],
        )
        self.merged_bam = self.task.outputs["merged_bam"]
class ProcessBam(Step): """Processes single BAM file including alignment QC. If mark duplicates is not required (static conditional) or BAM failed alignment QC (dynamic conditional), returns input BAM without further processing. Otherwise, runs deduplication and return deduplicated BAM""" input_bam = Input(File) processed_bam = Output(ProcessedBam) def execute(self): asm = PicardAlignmentSummaryMetrics(input_bam=self.input_bam) qc_failed = not bam_qc_metrics_ok(asm.qc_metrics, self.config_) if self.config_.skip_duplicate_marking or qc_failed: self.processed_bam = ProcessedBam(self.input_bam, asm.qc_metrics) else: md = PicardMarkDuplicates(input_bam=self.input_bam) self.processed_bam = ProcessedBam(md.deduped_bam, asm.qc_metrics)
class BWAmem(Step):
    """App wrapper step that runs BWA-MEM on SB platform.
    Names task after sample ID metadata."""

    input_reads = Input(List[File])
    aligned_reads = Output(File)

    def execute(self):
        ctx = Context()
        task = FindOrCreateAndRunTask(
            new_name="BWAmem - " +
            self.input_reads[0].metadata["sample_id"] + " - " +
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            inputs={
                "input_reads": self.input_reads,
                "reference_index_tar": ctx.refs["bwa_bundle"],
            },
            app=ctx.apps["bwa"],
            in_project=ctx.project,
        ).finished_task
        self.aligned_reads = task.outputs["aligned_reads"]
class PicardAlignmentSummaryMetrics(AppStep):
    input_bam = Input(File)
    summary_metrics_file = Output(File)
    qc_metrics = Output(QCMetrics)

    def execute(self):
        ctx = Context()
        self.run_task(
            app_name="alignmentqc",
            inputs={
                "input_bam": self.input_bam,
                "reference": ctx.refs["reference_fasta"],
            },
            task_name="AlignmentQC-" + self.input_bam.metadata["sample_id"],
        )
        self.summary_metrics_file = self.task.outputs["summary_metrics"]
        self.qc_metrics = self.parse_qc_from_metrics_file()
        logging.info(
            f"pct_pf_reads_aligned: {self.qc_metrics.pct_pf_reads_aligned}")
        logging.info(f"strand balance: {self.qc_metrics.strand_balance}")

    def parse_qc_from_metrics_file(self):
        "Reads QC metrics from Picard output file into QC object"
        # join streamed chunks before splitting so a metrics line
        # cannot be broken across chunk boundaries
        content = b"".join(self.summary_metrics_file.stream()).decode("utf-8")
        for line in content.split("\n"):
            if not line.startswith("PAIR"):
                continue
            record = line.strip().split("\t")
            return QCMetrics(
                bam_file=self.input_bam,
                pct_pf_reads_aligned=float(record[6]),
                strand_balance=float(record[19]),
            )
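# `QCMetrics` holds the parsed Picard alignment summary values. Its
# definition is not part of this excerpt; a minimal sketch consistent with
# the fields constructed above and accessed in CollectAndUploadQCSummary
# (assumed, not the original definition):

from dataclasses import dataclass


@dataclass
class QCMetrics:
    bam_file: File
    pct_pf_reads_aligned: float
    strand_balance: float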
def generate_cwl_step(app,
                      project,
                      execute_method=run_task,
                      import_suggested_files=True):
    """Generates a Step object with Input and Output ports named the same
    as the given CWL app.

    :param app: App to wrap
    :param project: Project where the app resides
    :param execute_method: Execute method to be used in this step
    :param import_suggested_files: Import suggested files into the project
    :return: Freyja Step
    """
    input_dict = {}
    outp_dict = {}

    if isinstance(app, sb.App):
        pass
    elif isinstance(app, str):
        app = FindOrCopyApp(app_id=app, to_project=project,
                            name_=f"Copy {app}").app

    suggested_values = get_suggested_values(app)
    for key in suggested_values:
        if isinstance(suggested_values[key], list):
            if isinstance(suggested_values[key][0], sb.File):
                if import_suggested_files:
                    suggested_values[key] = FindOrCopyFiles(
                        "Copying suggested file {} for {}".format(key, app.id),
                        files=suggested_values[key],
                        to_project=project,
                    ).copied_files
                else:
                    suggested_values[key] = None
        elif isinstance(suggested_values[key], sb.File):
            if import_suggested_files:
                suggested_values[key] = FindOrCopyFiles(
                    "Copying suggested file {} for {}".format(key, app.id),
                    files=[suggested_values[key]],
                    to_project=project,
                ).copied_files[0]
            else:
                suggested_values[key] = None

    cwl_version = app.raw["cwlVersion"]
    if cwl_version == "v1.0":
        for inp in app.raw["inputs"]:
            inp_id = inp["id"]
            inp_type = inp["type"]
            value = None
            if inp_id in suggested_values:
                value = suggested_values[inp_id]
            if isinstance(inp_type, str) or isinstance(inp_type, dict):
                if inp_type in CWLTypes.File:
                    input_dict[inp_id] = Input(Optional[sb.File], default=value)
                elif inp_type in CWLTypes.Array:
                    input_dict[inp_id] = Input(Optional[list], default=value)
                elif inp_type in CWLTypes.String:
                    input_dict[inp_id] = Input(Optional[str], default=value)
                elif inp_type in CWLTypes.Int:
                    input_dict[inp_id] = Input(Optional[int], default=value)
                elif inp_type in CWLTypes.Bool:
                    input_dict[inp_id] = Input(Optional[bool], default=value)
                elif inp_type in CWLTypes.Float:
                    input_dict[inp_id] = Input(Optional[float], default=value)
            elif isinstance(inp_type, list):
                if inp_type[1] in CWLTypes.Array:
                    input_dict[inp_id] = Input(Optional[list], default=value)
                elif inp_type[1]["type"] == "enum":
                    input_dict[inp_id] = Input(Optional[str], default=value)
            else:
                if inp["type"][1]["type"] == "enum":
                    input_dict[inp_id] = Input(Optional[str], default=value)
        for outp in app.raw["outputs"]:
            outp_id = outp["id"]
            outp_type = get_type(outp_id, app.raw)
            if outp_type in CWLTypes.File:
                outp_dict[outp_id] = Output(Optional[sb.File])
            elif outp_type in CWLTypes.Array:
                outp_dict[outp_id] = Output(Optional[list])
            elif outp_type in CWLTypes.String:
                outp_dict[outp_id] = Output(Optional[str])
            elif outp_type in CWLTypes.Int:
                outp_dict[outp_id] = Output(Optional[int])
            elif outp_type in CWLTypes.Bool:
                outp_dict[outp_id] = Output(Optional[bool])
            elif outp_type in CWLTypes.Float:
                outp_dict[outp_id] = Output(Optional[float])
    elif cwl_version == "sbg:draft-2":
        for inp in app.raw["inputs"]:
            inp_id = inp["id"][1:]
            inp_type = [t for t in inp["type"] if t != "null"][0]
            value = None
            if inp_id in suggested_values:
                value = suggested_values[inp_id]
            if inp_type in CWLDraft2Types.File:
                input_dict[inp_id] = Input(Optional[sb.File], default=value)
            elif inp_type in CWLDraft2Types.Array:
                input_dict[inp_id] = Input(Optional[list], default=value)
            elif inp_type in CWLDraft2Types.String:
                input_dict[inp_id] = Input(Optional[str], default=value)
            elif inp_type in CWLDraft2Types.Int:
                input_dict[inp_id] = Input(Optional[int], default=value)
            elif inp_type in CWLDraft2Types.Bool:
                input_dict[inp_id] = Input(Optional[bool], default=value)
            elif inp_type in CWLDraft2Types.Float:
                input_dict[inp_id] = Input(Optional[float], default=value)
            elif "type" in inp_type:
                if inp_type["type"] == "enum":
                    input_dict[inp_id] = Input(Optional[str], default=value)
                elif inp_type["type"] == "array":
                    input_dict[inp_id] = Input(Optional[list], default=value)
                elif inp_type["type"] == "record":
                    input_dict[inp_id] = Input(Optional[dict], default=value)
        for outp in app.raw["outputs"]:
            outp_id = outp["id"][1:]
            outp_type = get_type(outp["id"], app.raw)
            if outp_type in CWLDraft2Types.File:
                outp_dict[outp_id] = Output(Optional[sb.File])
            elif outp_type in CWLDraft2Types.Array:
                outp_dict[outp_id] = Output(Optional[list])
            elif outp_type in CWLDraft2Types.String:
                outp_dict[outp_id] = Output(Optional[str])
            elif outp_type in CWLDraft2Types.Int:
                outp_dict[outp_id] = Output(Optional[int])
            elif outp_type in CWLDraft2Types.Bool:
                outp_dict[outp_id] = Output(Optional[bool])
            elif outp_type in CWLDraft2Types.Float:
                outp_dict[outp_id] = Output(Optional[float])
            elif "type" in outp_type:
                if outp_type["type"] == "enum":
                    outp_dict[outp_id] = Output(Optional[str])
                elif outp_type["type"] == "array":
                    outp_dict[outp_id] = Output(Optional[list])
    else:
        logger.error(f"CWL version not recognised: {cwl_version}")
        raise Exception(f"CWL version not recognised: {cwl_version}")

    input_dict["app_"] = Input(sb.App, default=app)
    input_dict["project_"] = Input(sb.Project, default=project)
    subt = Step.new(inputs=input_dict,
                    outputs=outp_dict,
                    execute=execute_method,
                    cls_name="RunApp")
    return subt
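# A possible usage sketch for the generated step class. The app ID and the
# port names below are hypothetical and depend entirely on the wrapped CWL
# app; instantiation follows the step-naming convention used elsewhere in
# this document (first positional argument is the step name):

RunBwa = generate_cwl_step(
    app="<username>/<project>/bwa-mem",  # app ID string or sb.App object
    project=my_project,
)
aligned = RunBwa("RunBwa-sample1", input_reads=fastqs).aligned_reads
# `aligned` is a promise; it resolves once the platform task finishes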
class WordCounter(Step):
    line = Input(str)
    count = Output(int)

    def execute(self):
        self.count = len(self.line.strip().split())
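# Running this word-count demo locally (the module name `demo` is
# hypothetical; substitute the actual package name):
#
#   $ python -m demo run --file_name words.txt
#
# Each WordCounter step runs in its own thread, so the per-line counts are
# computed concurrently; `sum(counts)` in Main blocks until every count
# promise has resolved.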
class Main(Step): """Imports FASTq files from a cloud bucket, aligns them with BWA, and exports resulting BAM files back to a cloud bucket location. To run this automation from your local computer, type the following command while inside the project root directory: python -m app run --project_name <project-name> [--src_dir <location>] [--dest_dir <location>] whereas <location> refers to a cloud bucket directory in format <sb-volume-id>:<bucket-prefix>. If not provided, location defaults as specified in the automation code are used. """ project_name = Input( str, name="Project name", description= "Name of platform project. Re-uses existing project if found, otherwise create new one.", ) src_dir = Input( VolumeFolder, name="Input folder", description="Cloud bucket location containing input FASTq files.", default="external-demos/volumes_api_demo_bucket:inputs", ) dest_dir = Input( VolumeFolder, name="Output folder", description= "Cloud bucket location for result files. Overwrites already existing files.", default= "external-demos/volumes_api_demo_bucket:automation/import-run-export/result", ) project = Output( Project, name="Analysis project", description="SB project in which processing took place.", ) bams = Output( List[File], name="BAM files", description="BAM files containing aligned reads.", ) def execute(self): "Execution starts here." # initialize context singleton used througout the automation ctx = Context().initialize(self.project_name) # stage input FASTq files, set file metadata, and group by samples samples = ImportFilesAndGroupBySample(src_dir=self.src_dir).samples # run BWA for each sample; samples are processed in parallel # because app outputs are promises and we can use them # before results are available (lazy evaluation) self.bams = [ BWAmem( f"BWAmem-{sample.sample_id}", # name the step (must be unique) input_reads=sample.fastq_files).aligned_reads for sample in samples ] # export all BAM files to volume; export step starts executing # as soon as all BAM files have become available export_volume = SBApi().volumes.get(self.dest_dir.volume_id) ExportFiles( files=self.bams, to_volume=export_volume, prefix=self.dest_dir.prefix, overwrite=True, ) # capture analysis project as output self.project = ctx.project