def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and
    # turns them into GATK VCFs.
    pipeline = Pipeline(name='genericpipe')
    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')
    stages = Stages(state)
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    return pipeline
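# `safe_make_dir` is used throughout these pipeline builders but is defined
# elsewhere in the package. A minimal sketch of the behaviour it is assumed
# to have (mkdir -p semantics: create the directory, tolerate it existing):
import os

def safe_make_dir_sketch(path):
    '''Hypothetical stand-in for safe_make_dir.'''
    os.makedirs(path, exist_ok=True)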
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and
    # turns them into GATK and undr_rover VCFs.
    pipeline = Pipeline(name='hiplexpipe')
    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')
    stages = Stages(state)
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the BAM file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### sort and index undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    return pipeline
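# The formatter() strings above are plain Python regexes; Ruffus uses the
# named groups to fill placeholders like {sample[0]} in the output path.
# A standalone illustration of the BAM pattern used by 'apply_undr_rover'
# (dots escaped here for precision; the sample path is made up):
import re

_bam_re = re.compile(r'.+/(?P<sample>[a-zA-Z0-9_-]+)\.clipped\.sort\.hq\.bam')
_m = _bam_re.match('alignments/SAMPLE-01.clipped.sort.hq.bam')
assert _m is not None and _m.group('sample') == 'SAMPLE-01'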
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')
    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #     task_func=stages.original_reference,
    #     name='original_reference',
    #     output=reference_file)

    # Run FastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    # pipeline.transform(
    #     task_func=stages.index_reference_bwa,
    #     name='index_reference_bwa',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #     name='index_reference_samtools',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #     task_func=stages.alignment_coverage_gatk,
    #     name='alignment_coverage_gatk',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs([reference_file]),
    #     output='{path[0]}/{sample[0]}.coverage_summary',
    #     extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its
    # argument, so we pass this as an extra argument. However, Ruffus needs to
    # know the full name of the output bam file, so we pass that as the normal
    # output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_discordants',
    #     input=output_from('sort_discordants'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #     output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split reads.
    # Samtools annoyingly takes the prefix of the output bam name as its
    # argument, so we pass this as an extra argument. However, Ruffus needs to
    # know the full name of the output bam file, so we pass that as the normal
    # output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_splitters',
    #     input=output_from('sort_splitters'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #     output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs(['{path[0]}/{sample[0]}.splitters.bam',
                               '{path[0]}/{sample[0]}.discordants.bam']),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
     .follows('index_alignment')
     .follows('sort_splitters')
     .follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #     task_func=stages.genotype_svtyper,
    #     name='genotype_svtyper',
    #     input=output_from('structural_variants_lumpy'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam',
    #                            '{path[0]}/{sample[0]}.splitters.bam']),
    #     output='{path[0]}/{sample[0]}.svtyper.vcf')
    #  .follows('align_bwa')
    #  .follows('sort_splitters')
    #  .follows('index_alignment')
    #  .follows('index_splitters')
    #  .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #     task_func=stages.gustaf_mate_joining,
    #     name='gustaf_mate_joining',
    #     input=output_from('fastq_to_fasta'),
    #     # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #     # This will be the first input to the stage.
    #     # We assume the sample name may consist of only alphanumeric
    #     # characters.
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #     # Add one more input to the stage:
    #     #    1. The corresponding R2 FASTA file
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #     output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #     task_func=stages.structural_variants_pindel,
    #     name='structural_variants_pindel',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt',
    #                            reference_file]),
    #     output='{path[0]}/{sample[0]}.pindel')
    #  .follows('index_reference_bwa')
    #  .follows('index_reference_samtools'))
    return pipeline
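# The sort_bam comments above note that samtools (pre-1.0) takes an output
# *prefix* and writes "<prefix>.bam", which is why the stage receives both an
# extra (the prefix) and the full output name that Ruffus tracks. A minimal
# sketch of such a stage body, assuming the old samtools sort interface
# (hypothetical helper, not the actual Stages implementation):
import subprocess

def sort_bam_sketch(input_bam, output_bam, output_prefix):
    '''Sort input_bam; old-style samtools writes output_prefix + ".bam".'''
    assert output_prefix + '.bam' == output_bam
    subprocess.check_call(['samtools', 'sort', input_bam, output_prefix])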
def main():
    # Preparatory actions
    os.system("clear")
    os.chdir(".")
    Data.default_cwd = os.getcwd()
    Data.logs = []
    Data.fails = []
    Data.PROFILES_HISTORY_DIR_PATH = (os.path.expanduser("~") + os.sep +
                                      Data.PROFILES_HISTORY_DIR_PATH)

    # Getting deploy profiles from history
    deploy_profile = None
    is_hist_profile_selected = False
    if Data.USE_PROFILES_HISTORY_SEARCH:
        profiles_history = ProfilesHistory.get_profiles_paths_from_history()
        if len(profiles_history) > 0:
            print("[i] Found profiles in history:")
            print("0. Enter deploy profile path manually")
            profiles_history_counter = 0
            for one_hist_profile_path in profiles_history:
                profiles_history_counter = profiles_history_counter + 1
                print(str(profiles_history_counter) + ". " +
                      one_hist_profile_path)
            selected_hist_profile_idx = input('Select variant: ')
            if selected_hist_profile_idx.isdigit():
                selected_hist_profile_idx = int(selected_hist_profile_idx)
                if len(profiles_history) >= selected_hist_profile_idx:
                    if selected_hist_profile_idx > 0:
                        is_hist_profile_selected = True
                        deploy_profile = Helpers.get_profile_file(
                            profiles_history[selected_hist_profile_idx - 1])
            else:
                exit("[X] Invalid input!")

    if not is_hist_profile_selected:
        # Reading deploy profile manually
        deploy_profile = Helpers.get_profile_file(None)
    if not deploy_profile:
        exit("[X] Invalid deploy profile or file not found!")

    start_time = datetime.datetime.now()

    # Profile validation
    profile_validation_res = Helpers.validate_profile(deploy_profile)
    if profile_validation_res != True:
        exit("[X] Profile validation error: " + str(profile_validation_res))

    # Project name
    project_name = deploy_profile["project_name"]
    print("=" * 40)
    print("[i] Selected project name: " + str(project_name))

    # Parsing profile environments
    print("[i] Found environments: " +
          str(Helpers.get_all_profile_envs(deploy_profile)))
    selected_profile_env = Helpers.select_profile_env(deploy_profile)
    if selected_profile_env == False:
        exit("[X] Unknown environment!")
    print("[i] Selected environment: " + selected_profile_env)

    # Processing stages
    print("[i] Processing stages...")
    deploy_stages = deploy_profile["environments"][selected_profile_env]["stages"]
    deploy_credentials = deploy_profile["environments"][selected_profile_env]["credentials"]
    stages_counter = 0
    for one_stage in deploy_stages:
        if one_stage["ignore"]:
            print("[i] " + str(stages_counter) + ". " +
                  str(one_stage["name"]) + " (IGNORED)")
            continue
        stages_counter = stages_counter + 1
        print("[i] " + str(stages_counter) + ". " + str(one_stage["print"]))
        result = Stages.run_stage(one_stage["name"], one_stage["details"],
                                  deploy_credentials)
        if result != True:
            print("[!] Current stage failed!\nResult:\n" + str(result) +
                  "\nContinue? (yes/no)")
            continue_or_not = input()
            if continue_or_not.replace(" ", "") != "yes":
                exit("[X] Exited with error!")

    # Saving current profile path to local history
    ProfilesHistory.save_profile_path_to_history(Data.curr_profile_path)

    end_time = datetime.datetime.now()
    print("[i] All done in " +
          str(int((end_time - start_time).total_seconds())) + " seconds!")
    print("=" * 40)

    # Slack notification
    print("Send notification to Slack? yes/no")
    send_notice_or_not = input()
    if send_notice_or_not.lower().replace(" ", "") == "yes":
        is_slack_bot_creds_valid = Notifications.validate_slack_bot_credentials(
            deploy_credentials)
        if is_slack_bot_creds_valid:
            print("Enter the message to send:")
            slack_msg_to_send = input()
            if slack_msg_to_send.lower().replace(" ", "") != "":
                slack_msg_to_send = (
                    "Backend update!" + "\n" +
                    "Start time: " + str(start_time) + "\n" +
                    "End time: " + str(end_time) + "\n" +
                    "Elapsed time: " +
                    str(int((end_time - start_time).total_seconds())) +
                    " seconds\n" +
                    "Environment: " + selected_profile_env + "\n" +
                    "Service: " + project_name + "\n" +
                    "Message: " + slack_msg_to_send)
                Notifications.send_msg_to_slack(
                    deploy_credentials["slack_bot"]["main"]["bot_token"],
                    deploy_credentials["slack_bot"]["main"]["project_channel"],
                    slack_msg_to_send,
                    deploy_credentials["slack_bot"]["main"]["icon_emoji"])
        else:
            print("[!] Invalid Slack credentials!")
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}',
                '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        output='.primary.bam')

    # Index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.bam.bai')

    # Clip the primer_seq from BAM file
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
     .follows('index_bam'))

    ###### GATK VARIANT CALLING - MuTect2 ######
    # Call somatic variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
    # .follows('clip_bam')
    ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------
    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal',
        #                        'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
     .follows('call_mutect2_gatk'))

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal',
        #                        'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
     .follows('apply_vt'))

    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal',
        #                        'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
     .follows('apply_vep'))
    return pipeline
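# 'apply_vt' above is assumed to run vt's decompose/normalise pair on the
# MuTect2 output. A sketch of the underlying command line (the reference
# path is a placeholder; the real Stages implementation may differ):
import subprocess

def apply_vt_sketch(input_vcf, output_vcf, reference='reference.fa'):
    '''Decompose multiallelic records, then left-align and normalise.'''
    with open(output_vcf, 'w') as out:
        decompose = subprocess.Popen(['vt', 'decompose', '-s', input_vcf],
                                     stdout=subprocess.PIPE)
        subprocess.check_call(['vt', 'normalize', '-r', reference, '-'],
                              stdin=decompose.stdout, stdout=out)
        decompose.stdout.close()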
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
     .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
     .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf')
     .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.indel_recal',
                               'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf')
     .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf')
     .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')
    return pipeline
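# A note on the parenthesised (pipeline.transform(...).follows(...)) calls
# used above: add_inputs() only names extra files, it does not make the task
# that produces them a dependency, so .follows() adds that ordering
# explicitly. A self-contained toy with the same Ruffus API (task bodies are
# placeholders):
from ruffus import Pipeline, suffix, output_from

def _seed(output):
    open(output, 'w').close()

def _step(inputs, output):
    open(output, 'w').close()

_toy = Pipeline(name='follows_toy')
_toy.originate(task_func=_seed, name='seed', output=['a.txt'])
_toy.transform(task_func=_step, name='first', input=output_from('seed'),
               filter=suffix('.txt'), output='.one.txt')
(_toy.transform(task_func=_step, name='second', input=output_from('seed'),
                filter=suffix('.txt'), output='.two.txt')
 .follows('first'))  # 'second' waits for 'first' despite no data dependency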
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz')
     .follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf')
     .follows('index_final_vcf'))
    return pipeline
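# The regex given to pipeline.collate() above groups the four per-sample
# metric files by the shared prefix captured in group 1, so generate_stats
# sees them together. Illustration with plain re (made-up path):
import re

_metrics_re = re.compile(
    r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt')
_m2 = _metrics_re.match('metrics/summary/sampleA.mapped_to_target.txt')
assert _m2 is not None and _m2.group(1) == 'sampleA'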
def make_pipeline_process(state):
    '''Build the post-run processing pipeline'''
    # Define empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))
    stages = Stages(state)

    # Dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='processed/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    ####### vardict stuff
    vardict_files = []
    for directory in run_directories:
        vardict_files.extend(
            glob.glob(directory + '/variants/vardict/*sorted.vcf.gz'))

    # Dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_vardict,
                       name='glob_vardict',
                       output=vardict_files)

    safe_make_dir('processed/vardict')

    # Concatenate all vardict vcfs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_vardict'),
                   output='processed/vardict/combined.vcf.gz')

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise_vardict',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise_vardict'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep_vardict',
        input=output_from('vt_decompose_normalise_vardict'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf')
     .follows('index_final_vcf'))
    return pipeline
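# 'concatenate_vcfs' above merges the per-run vardict VCFs into one bgzipped
# file. A plausible sketch of the underlying command, assuming bcftools is
# the tool used (the real Stages implementation may differ):
import subprocess

def concatenate_vcfs_sketch(input_vcfs, output_vcf_gz):
    '''Concatenate sorted, indexed VCFs (allowing overlaps) to bgzipped VCF.'''
    subprocess.check_call(['bcftools', 'concat', '-a', '-O', 'z',
                           '-o', output_vcf_gz] + list(input_vcfs))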
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    # fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        # filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        # add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # The output only needs to know about one file to track progress of
        # the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
    # output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        # filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        # add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run locatit from Agilent. This should produce sorted bam files, so no
    # sorting is needed at the next step.
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # Generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced above
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        # filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt')
     .follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch passed bams into the pass_samples folder and pass the glob of
    # that folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
     .follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])
     .follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')
    return pipeline
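# 'index_vcfs' produces the .tbi that downstream tools expect next to each
# sorted, bgzipped VCF. A minimal sketch, assuming tabix performs the
# indexing:
import subprocess

def index_vcf_sketch(sorted_vcf_gz):
    '''Write sorted_vcf_gz + ".tbi" alongside the input.'''
    subprocess.check_call(['tabix', '-p', 'vcf', sorted_vcf_gz])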
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    # Performs fastqc on fastq inputs
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'),
        output='{path[0]}/{filename[0]}_fastqc')

    # Converts the fastq inputs to pre-aligned bams
    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/{sample[0]}.bam')

    # Validates pre-aligned bams: x.bam -> x.validation
    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    # Aligns pre-aligned bam: x.bam -> x.mapped.bam
    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    # Generates stats about an aligned bam
    pipeline.transform(
        task_func=stages.align_stats_bedtools,
        name='align_stats_bedtools',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.genomecov.stats')

    # Generates stats about an aligned bam
    pipeline.transform(
        task_func=stages.align_stats_picard,
        name='align_stats_picard',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.picard.stats')

    # Runs the Sanger variant calling pipeline
    # pipeline.transform(
    #     task_func=stages.analyse_wgs,
    #     name='analyse_wgs',
    #     input=output_from('align'),
    #     filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
    #     output='{path[0]}/{sample[0]}.wgs/manifest')

    # Runs the components of the Sanger variant calling pipeline
    pipeline.transform(
        task_func=stages.analyse_wgs_prepare,
        name='analyse_wgs_prepare',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.prepare')

    pipeline.transform(
        task_func=stages.analyse_wgs_reference_files,
        name='analyse_wgs_reference_files',
        input=[output_from('align'), output_from('analyse_wgs_prepare')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.reference_files')

    pipeline.transform(
        task_func=stages.analyse_wgs_init,
        name='analyse_wgs_init',
        input=[output_from('align'),
               output_from('analyse_wgs_reference_files')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.init')

    # block 1
    pipeline.transform(
        task_func=stages.analyse_wgs_verify_WT,
        name='analyse_wgs_verify_WT',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.verify_WT')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel_input,
        name='analyse_wgs_cgpPindel_input',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_input')

    pipeline.transform(
        task_func=stages.analyse_wgs_alleleCount,
        name='analyse_wgs_alleleCount',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.alleleCount')

    # block 2
    pipeline.transform(
        task_func=stages.analyse_wgs_ascat,
        name='analyse_wgs_ascat',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.ascat')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel,
        name='analyse_wgs_cgpPindel',
        input=[output_from('align'),
               output_from('analyse_wgs_cgpPindel_input')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS_input,
        name='analyse_wgs_BRASS_input',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS_input')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS_cover,
        name='analyse_wgs_BRASS_cover',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS_cover')

    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan_split,
        name='analyse_wgs_CaVEMan_split',
        input=[output_from('align'), output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_split')

    # after block 2
    pipeline.transform(
        task_func=stages.analyse_wgs_ascat_prep,
        name='analyse_wgs_ascat_prep',
        input=[output_from('align'), output_from('analyse_wgs_ascat')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.ascat_prep')

    pipeline.transform(
        task_func=stages.analyse_wgs_pindel_prep,
        name='analyse_wgs_pindel_prep',
        input=[output_from('align'), output_from('analyse_wgs_cgpPindel')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.pindel_prep')

    # parallel block 3
    pipeline.transform(
        task_func=stages.analyse_wgs_verify_MT,
        name='analyse_wgs_verify_MT',
        input=[output_from('align'),
               output_from('analyse_wgs_verify_WT'),
               output_from('analyse_wgs_ascat_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.verify_MT')

    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan,
        name='analyse_wgs_CaVEMan',
        input=[output_from('align'),
               output_from('analyse_wgs_CaVEMan_split'),
               output_from('analyse_wgs_ascat_prep'),
               output_from('analyse_wgs_cgpPindel')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS,
        name='analyse_wgs_BRASS',
        input=[output_from('align'),
               output_from('analyse_wgs_BRASS_cover'),
               output_from('analyse_wgs_BRASS_input'),
               output_from('analyse_wgs_ascat_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel_annot,
        name='analyse_wgs_cgpPindel_annot',
        input=[output_from('align'), output_from('analyse_wgs_pindel_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_annot')

    # pre block 4
    pipeline.transform(
        task_func=stages.analyse_wgs_caveman_prep,
        name='analyse_wgs_caveman_prep',
        input=[output_from('align'), output_from('analyse_wgs_CaVEMan')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.caveman_prep')

    # block 4
    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan_annot,
        name='analyse_wgs_CaVEMan_annot',
        input=[output_from('align'), output_from('analyse_wgs_caveman_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_annot')

    # done
    pipeline.transform(
        task_func=stages.analyse_wgs_finish,
        name='analyse_wgs_finish',
        input=[output_from('align'),
               output_from('analyse_wgs_CaVEMan_annot'),
               output_from('analyse_wgs_BRASS'),
               output_from('analyse_wgs_cgpPindel_annot'),
               output_from('analyse_wgs_alleleCount'),
               output_from('analyse_wgs_verify_MT')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.finish')

    # Runs the delly singularity container
    pipeline.transform(
        task_func=stages.delly,
        name='delly',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.delly.completed')

    pipeline.transform(
        task_func=stages.gridss,
        name='gridss',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.gridss.completed')

    pipeline.transform(
        task_func=stages.muse,
        name='muse',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.muse.completed')

    pipeline.transform(
        task_func=stages.mutect2,
        name='mutect2',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.mutect2.completed')
    return pipeline
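# Each analyse_wgs_* stage above checkpoints with a "completed.*" sentinel
# file instead of its (large) real outputs, letting Ruffus resume a long run
# without re-running finished components. A sketch of that pattern (the
# command is a placeholder):
import subprocess

def sentinel_stage_sketch(command, sentinel_path):
    '''Run command; create the sentinel only if the command succeeded.'''
    subprocess.check_call(command)
    open(sentinel_path, 'w').close()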
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam')

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        extras=['{sample[0]}', '{readid[0]}'],
        # The output is a per-sample VCF under variants/undr_rover.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # Index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM file
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
     .follows('index_bam'))

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-_]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    # .follows('index_sort_bam_picard')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk_lenient,
                       name='apply_variant_filtration_gatk_lenient',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered_lenient.vcf')
    return pipeline
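# 'call_haplotypecaller_gatk' emits per-sample GVCFs that the next stages
# combine and genotype. A sketch of the underlying call, assuming GATK 4
# syntax (an older pipeline may instead invoke GATK 3 via java -jar):
import subprocess

def haplotypecaller_sketch(input_bam, output_gvcf, reference='reference.fa'):
    '''Per-sample GVCF-mode HaplotypeCaller call.'''
    subprocess.check_call(['gatk', 'HaplotypeCaller',
                           '-R', reference,
                           '-I', input_bam,
                           '-O', output_gvcf,
                           '-ERC', 'GVCF'])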
def make_pipeline_process(state):
    # Define empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    undr_rover_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))
        undr_rover_files.extend(
            glob.glob(directory + '/variants/undr_rover/*sorted.vcf.gz'))
    # Stages are dependent on the state
    stages = Stages(state)

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Dummy stage to grab the undr_rover files
    pipeline.originate(task_func=stages.glob_undr_rover,
                       name='glob_undr_rover',
                       output=undr_rover_files)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')

    # Concatenate the undr_rover VCFs from all runs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_undr_rover'),
                   output='variants/undr_rover/combined_undr_rover.vcf.gz')

    # Index the combined undr_rover VCF
    pipeline.transform(task_func=stages.index_final_vcf,
                       name='index_final_vcf',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to the genotyped VCF
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate the VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter the VCF
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        add_inputs=add_inputs(
            ['variants/undr_rover/combined_undr_rover.vcf.gz']),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')
     .follows('index_final_vcf'))

    return pipeline
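# pipeline.merge hands every globbed input to a single task invocation, so a
# concatenation stage receives the full list of per-run VCFs at once. Below is
# a minimal sketch of such a stage, assuming the inputs are sorted, bgzipped
# and indexed (as produced upstream) and that bcftools is the concatenation
# tool; the repository's actual implementation may differ.

import subprocess


def concatenate_vcfs_sketch(vcf_inputs, combined_output):
    '''Concatenate indexed, sorted per-run VCFs into one compressed VCF.'''
    # -a allows overlapping inputs; -O z writes bgzip-compressed output
    command = ['bcftools', 'concat', '-a', '-O', 'z', '-o', combined_output]
    command.extend(vcf_inputs)
    subprocess.check_call(command)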
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired-end reads in FASTQ to the reference, producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        # Add "extra" arguments to the stage (beyond the inputs and outputs):
        # the sample name, needed within the stage for looking up
        # sample-specific configuration options.
        extras=['{sample[0]}', '{lib[0]}'],
        # The output file name is the sample name with a
        # .clipped.sort.hq.bam extension.
        output='alignments/{sample[0]}.clipped.sort.hq.bam')

    # Generate per-amplicon mapping metrics
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the BAM with the target region BED
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Per-target coverage histogram
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count reads mapped to the genome
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count reads mapped to the target region
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count total raw reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Collate the four per-sample metric files into one summary per sample;
    # a standalone demonstration of this grouping follows the function.
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file)
     .follows('generate_stats'))

    # Split the summary into passed and failed samples
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline
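# pipeline.collate groups inputs whose filter captures match: here the regex's
# first group is the sample name and the second names the metric, so the four
# metric files for each sample collapse into a single generate_stats task.
# A standalone demonstration of that grouping (the filenames are made up):

import re
from collections import defaultdict

metric_re = re.compile(
    r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt')

filenames = ['metrics/summary/sampleA.bedtools_hist_all.txt',
             'metrics/summary/sampleA.mapped_to_genome.txt',
             'metrics/summary/sampleA.mapped_to_target.txt',
             'metrics/summary/sampleA.total_raw_reads.txt']

groups = defaultdict(list)
for name in filenames:
    match = metric_re.match(name)
    if match:
        # Files sharing the same first capture group land in one output task
        groups[match.group(1)].append(name)

print(dict(groups))  # one key, 'sampleA', holding all four metric files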
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired-end reads in FASTQ to the reference, producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # Add "extra" arguments to the stage (beyond the inputs and outputs):
        # the sample name and read identifiers. These are needed within the
        # stage for looking up sample-specific configuration options.
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam')

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        extras=['{sample[0]}', '{readid[0]}'],
        # The output file name is the sample name with a .vcf extension.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Keep only high-quality, primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # Index the BAM file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer sequences from the BAM file
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
     .follows('index_bam'))

    ###### GATK VARIANT CALLING ######

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate the VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk,
                       name='apply_variant_filtration_gatk',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered.vcf')

    # Normalise variants using vt
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('apply_variant_filtration_gatk'),
        filter=suffix('.raw.annotate.filtered.vcf'),
        output='.raw.annotate.filtered.norm.vcf')
     .follows('apply_variant_filtration_gatk'))

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.raw.annotate.filtered.norm.vcf'),
        output='.raw.annotate.filtered.norm.vep.vcf')
     .follows('apply_vt'))

    # Apply SnpEff
    (pipeline.transform(
        task_func=stages.apply_snpeff,
        name='apply_snpeff',
        input=output_from('apply_vep'),
        filter=suffix('.raw.annotate.filtered.norm.vep.vcf'),
        output='.raw.annotate.filtered.norm.vep.snpeff.vcf')
     .follows('apply_vep'))

    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_snpeff'),
        filter=suffix('.raw.annotate.filtered.norm.vep.snpeff.vcf'),
        output='.annotated.vcf')
     .follows('apply_snpeff'))

    # Concatenate undr_rover VCF files
    pipeline.merge(task_func=stages.apply_cat_vcf,
                   name='apply_cat_vcf',
                   input=output_from('apply_undr_rover'),
                   output='variants/undr_rover/ur.vcf.gz')

    # Apply VEP on the concatenated undr_rover VCF file
    # (pipeline.transform(
    #     task_func=stages.apply_vep,
    #     name='apply_vep_ur',
    #     input=output_from('apply_cat_vcf'),
    #     filter=suffix('.vcf.gz'),
    #     output='.vep.vcf')
    #  .follows('apply_cat_vcf'))

    # Apply vcfanno on the concatenated/VEP undr_rover VCF file
    # (pipeline.transform(
    #     task_func=stages.apply_vcfanno,
    #     name='apply_vcfanno_ur',
    #     input=output_from('apply_vep_ur'),
    #     filter=suffix('.vep.vcf'),
    #     output='.vep.anno.vcf')
    #  .follows('apply_vep_ur'))

    # Apply SnpEff
    # (pipeline.transform(
    #     task_func=stages.apply_snpeff,
    #     name='apply_snpeff_ur',
    #     input=output_from('apply_vcfanno_ur'),
    #     filter=suffix('.vep.anno.vcf'),
    #     output='.vep.anno.snpeff.vcf.gz')
    #  .follows('apply_vcfanno_ur'))

    # Index the concatenated undr_rover VCF with tabix
    pipeline.transform(task_func=stages.apply_tabix,
                       name='apply_tabix',
                       input=output_from('apply_cat_vcf'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # Apply HomopolymerRun annotation
    # (pipeline.transform(
    #     task_func=stages.apply_homopolymer_ann,
    #     name='apply_homopolymer_ann',
    #     input=output_from('apply_snpeff_ur'),
    #     filter=suffix('.vep.anno.snpeff.vcf.gz'),
    #     output='.annotated.vcf')
    #  .follows('apply_tabix'))

    # Summarise multi-sample coverage
    # (pipeline.merge(
    #     task_func=stages.apply_multicov,
    #     name='apply_multicov',
    #     input=output_from('primary_bam'),
    #     output='coverage/all.multicov.txt')
    #  .follows('index_bam'))

    # Summarise Picard coverage
    # (pipeline.merge(
    #     task_func=stages.apply_summarize_picard,
    #     name='apply_summarize_picard',
    #     input=output_from('target_coverage'),
    #     output='coverage/all.hsmetrics.txt')
    #  .follows('target_coverage'))

    # Generate multicov coverage plots
    # (pipeline.merge(
    #     task_func=stages.apply_multicov_plots,
    #     name='apply_multicov_plots',
    #     input=output_from('apply_multicov'),
    #     output='coverage/coverage_analysis_main.html')
    #  .follows('apply_multicov'))

    return pipeline
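# apply_cat_vcf produces a bgzip-compressed VCF, and apply_tabix then builds
# the .tbi index that downstream tools need for random access. A minimal
# sketch of such a stage, assuming tabix is on the PATH (the repository's
# stage may wrap this differently):

import subprocess


def apply_tabix_sketch(vcf_gz_input, tbi_output):
    '''Index a bgzipped VCF with tabix, producing vcf_gz_input + ".tbi".'''
    # -p vcf selects the VCF preset; tabix derives the .tbi name itself,
    # so tbi_output is only here to match the pipeline's stage signature
    subprocess.check_call(['tabix', '-p', 'vcf', vcf_gz_input])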
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_vcf,
                       name='original_vcf',
                       output=vcf_files)

    # Decompose VCF using vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name).
        # This is needed within the stage for finding out sample-specific
        # configuration options.
        extras=['{sample[0]}'],
        # The output file name is the sample name with a
        # .decompose.normalize.vcf extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should we make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
     .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
     .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
     .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
     .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
     .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
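# safe_make_dir, used throughout these pipeline builders, is a small helper
# defined elsewhere in the repository. A plausible minimal implementation
# (an assumption, not the repository's exact code) simply tolerates
# pre-existing directories:

import errno
import os


def safe_make_dir_sketch(path):
    '''Create a directory, doing nothing if it already exists.'''
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only swallow the "already exists" error; re-raise anything else
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise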