Example #1
 def call_mutect2_gatk(self, inputs, vcf_out):
     '''Call somatic variants from a tumour/normal pair using GATK MuTect2'''
     tumor_in, normal_in = inputs
     tumor_samfile = pysam.AlignmentFile(tumor_in, "rb")
     normal_samfile = pysam.AlignmentFile(normal_in, "rb")
     tumor_id = tumor_samfile.header['RG'][0]['SM']
     normal_id = normal_samfile.header['RG'][0]['SM']
     tumor_samfile.close()
     normal_samfile.close()
     # safe_make_dir('variants/mutect2/{sample}'.format(sample=sample_id))
     safe_make_dir('variants/mutect2/')
     command = "gatk Mutect2 -R {reference} " \
         "-I {tumor_in} " \
         "-tumor {tumor_id} " \
         "-I {normal_in} " \
         "-normal {normal_id} " \
         "--germline-resource {mutect2_gnomad} " \
         "--af-of-alleles-not-in-resource 0.001 " \
         "-O {out} " \
         "-L {gatk_bed} " \
         "--max-reads-per-alignment-start 0 " \
         "--dont-use-soft-clipped-bases".format(reference=self.reference,
                     tumor_in=tumor_in,
                     normal_in=normal_in,
                     tumor_id=tumor_id,
                     normal_id=normal_id,
                     mutect2_gnomad=self.mutect2_gnomad,
                     gatk_bed=self.gatk_bed,
                     out=vcf_out)
     # "--af-of-alleles-not-in-resource 0.00003125 " \
     run_stage(self.state, 'call_mutect2_gatk', command)
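These stage methods lean on two small helpers that never appear in the snippets: safe_make_dir, which creates output directories, and run_stage, which dispatches the assembled shell command for a named stage. A minimal sketch of both follows; the exact signatures, and the idea that run_stage simply executes the command locally, are assumptions rather than the pipeline's real implementation (which would normally consult self.state for per-stage cores, memory and modules).

import os
import subprocess


def safe_make_dir(path):
    '''Create a directory (and any missing parents) if it does not already exist.'''
    if not os.path.exists(path):
        os.makedirs(path)


def run_stage(state, stage_name, command):
    '''Hypothetical sketch: log and run the shell command for one stage.
    A real implementation would look up resources for stage_name in
    state.config and hand the command to a job scheduler.'''
    print("[{stage}] {command}".format(stage=stage_name, command=command))
    subprocess.check_call(command, shell=True)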
Example #2
    def apply_undr_rover(self, inputs, vcf_output, sample_id):
        '''Apply undr_rover to call variants from paired end fastq files'''
        fastq_read1_in, fastq_read2_in = inputs
        cores = self.get_stage_options('apply_undr_rover', 'cores')
        safe_make_dir('variants/undr_rover')
        safe_make_dir('variants/undr_rover/coverdir')
        coverdir = "variants/undr_rover/coverdir"
        coverfile = sample_id + ".coverage"

        command = 'undr_rover --primer_coords {coord_file} ' \
                  '--primer_sequences {primer_file} ' \
                  '--reference {reference} ' \
                  '--out {vcf_output} ' \
                  '--coverfile {coverdir}/{coverfile} ' \
                  '--proportionthresh {proportionthresh} ' \
                  '--absthresh {absthresh} ' \
                  '--max_variants {maxvariants} ' \
                  '{fastq_read1} {fastq_read2}'.format(
                        coord_file=self.coord_file, primer_file=self.primer_file,
                        reference=self.reference,
                        vcf_output=vcf_output,
                        coverdir=coverdir,
                        proportionthresh=self.proportionthresh,
                        absthresh=self.absthresh,
                        maxvariants=self.maxvariants,
                        coverfile=coverfile,
                        fastq_read1=fastq_read1_in,
                        fastq_read2=fastq_read2_in)
        run_stage(self.state, 'apply_undr_rover', command)
Example #3
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into gatk and undr_rover vcfs
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = [line for line in inputf.read().splitlines() if line]

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example #4
 def apply_multicov(self, bam_in, multicov):
     '''Count reads overlapping the target regions using bedtools multicov'''
     bams = ' '.join(bam_in)
     safe_make_dir('coverage')
     command = 'bedtools multicov -bams {bams} -bed {target_bed} > {multicov}'.format(
         bams=bams, target_bed=self.target_bed, multicov=multicov)
     run_stage(self.state, 'apply_multicov', command)
Example #5
 def apply_samtools_mpileup(self, bam_in, mpileup_out_bcf):
     '''Samtools mpileup'''
     bams = ' '.join(bam_in)
     safe_make_dir('variants')
     command = 'samtools mpileup -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -go {mpileup_out_bcf} ' \
               '-f {reference} {bams}'.format(
                       mpileup_out_bcf=mpileup_out_bcf,reference=self.reference,bams=bams)
     run_stage(self.state, 'apply_samtools_mpileup', command)
Example #6
 def target_coverage(self, bam_in, coverage_out):
     '''Calculate coverage using Picard'''
     safe_make_dir('coverage')
     picard_args = 'CollectHsMetrics INPUT={bam_in} OUTPUT={coverage_out} ' \
                   'R={reference} BAIT_INTERVALS={interval_file} ' \
                   'TARGET_INTERVALS={interval_file}'.format(
                       bam_in=bam_in, coverage_out=coverage_out,
                       reference=self.reference, interval_file=self.interval_file)
     self.run_picard('target_coverage', picard_args)
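self.run_picard is another wrapper that is not shown in these snippets. A plausible sketch is below; the self.picard_jar attribute and the use of the stage's 'mem' option are assumptions, modelled on the explicit java -Xmx{mem}g -jar invocations that appear elsewhere in these examples.

 def run_picard(self, stage_name, picard_args):
     '''Hypothetical wrapper: build and run a Picard command line.'''
     mem = self.state.config.get_stage_options(stage_name, 'mem')
     command = 'java -Xmx{mem}g -jar {jar} {args}'.format(
         mem=mem, jar=self.picard_jar, args=picard_args)
     run_stage(self.state, stage_name, command)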
Example #7
    def structural_variants_socrates(self, bam_in, variants_out, sample_dir):
        '''Call structural variants with Socrates'''
        threads = self.state.config.get_stage_option('structural_variants_socrates', 'cores') 
        # jvm_mem is in gb
        jvm_mem = self.state.config.get_stage_option('structural_variants_socrates', 'jvm_mem') 
        bowtie2_ref_dir = self.state.config.get_stage_option('structural_variants_socrates', 'bowtie2_ref_dir') 
        output_dir = os.path.join(sample_dir, 'socrates')
        safe_make_dir(output_dir)
        command = \
        '''
cd {output_dir}
export _JAVA_OPTIONS='-Djava.io.tmpdir={output_dir}'
Socrates all -t {threads} --bowtie2_threads {threads} --bowtie2_db {bowtie2_ref_dir} --jvm_memory {jvm_mem}g {bam}
        '''.format(output_dir=output_dir, threads=threads, bowtie2_ref_dir=bowtie2_ref_dir, jvm_mem=jvm_mem, bam=bam_in)
        run_stage(self.state, 'structural_variants_socrates', command)
Example #8
 def align_bwa(self, inputs, bam_out, fam, sample, id):
     '''Align the paired end fastq files to the reference genome using bwa'''
     fastq_read1_in, fastq_read2_in = inputs
     cores = self.get_stage_options('align_bwa', 'cores')
     read_group = '"@RG\\tID:{id}\\tSM:{sample}\\tPL:Illumina"'.format(sample=sample, id=id)
     command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
               '| samtools view -b -h -o {bam} -' \
               .format(cores=cores,
                   read_group=read_group,
                   fastq_read1=fastq_read1_in,
                   fastq_read2=fastq_read2_in,
                   reference=self.reference,
                   bam=bam_out)
     safe_make_dir('results/alignments/FAM_{fam}_SM_{sample}'.format(fam=fam, sample=sample))
     run_stage(self.state, 'align_bwa', command)
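Getting the escaping of the read-group string right is fiddly: the doubled backslashes in the Python source become single backslashes in the string, and bwa then interprets the "\t" escapes as tab separators in the @RG header. The fragment below just prints what the -R argument expands to, using made-up sample and id values.

sample, id = 'sampleA', 'idx46'
read_group = '"@RG\\tID:{id}\\tSM:{sample}\\tPL:Illumina"'.format(sample=sample, id=id)
print(read_group)   # "@RG\tID:idx46\tSM:sampleA\tPL:Illumina"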
Example #9
 def align_bwa(self, inputs, bam_out, read_id, lib, lane, sample_id):
     # def align_bwa(self, inputs, bam_out, sample_id):
     '''Align the paired end fastq files to the reference genome using bwa'''
     fastq_read1_in, fastq_read2_in = inputs
     cores = self.get_stage_options('align_bwa', 'cores')
     safe_make_dir('alignments/{sample}'.format(sample=sample_id))
     read_group = '"@RG\\tID:{readid}\\tSM:{sample}\\tPU:lib1\\tLN:{lane}\\tPL:Illumina"' \
         .format(readid=read_id, lib=lib, lane=lane, sample=sample_id)
     command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
               '| samtools view -b -h -o {bam} -' \
               .format(cores=cores,
                       read_group=read_group,
                       fastq_read1=fastq_read1_in,
                       fastq_read2=fastq_read2_in,
                       reference=self.reference,
                       bam=bam_out)
     run_stage(self.state, 'align_bwa', command)
Example #10
 def align_bwa(self, inputs, bam_out, read_id, lib, lane, sample_id):
     # def align_bwa(self, inputs, bam_out, sample_id):
     '''Align the paired end fastq files to the reference genome using bwa'''
     fastq_read1_in, fastq_read2_in = inputs
     cores = self.get_stage_options('align_bwa', 'cores')
     safe_make_dir('alignments/{sample}'.format(sample=sample_id))
     read_group = '"@RG\\tID:{readid}\\tSM:{sample}\\tPU:lib1\\tLN:{lane}\\tPL:Illumina"' \
         .format(readid=read_id, lib=lib, lane=lane, sample=sample_id)
     command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
               '| samtools view -b -h -o {bam} -' \
               .format(cores=cores,
                       read_group=read_group,
                       fastq_read1=fastq_read1_in,
                       fastq_read2=fastq_read2_in,
                       reference=self.reference,
                       bam=bam_out)
     run_stage(self.state, 'align_bwa', command)
Example #11
    def structural_variants_socrates(self, bam_in, variants_out, sample_dir):
        '''Call structural variants with Socrates'''
        threads = self.state.config.get_stage_option(
            'structural_variants_socrates', 'cores')
        # jvm_mem is in gb
        jvm_mem = self.state.config.get_stage_option(
            'structural_variants_socrates', 'jvm_mem')
        bowtie2_ref_dir = self.state.config.get_stage_option(
            'structural_variants_socrates', 'bowtie2_ref_dir')
        output_dir = os.path.join(sample_dir, 'socrates')
        safe_make_dir(output_dir)
        command = \
        '''
cd {output_dir}
export _JAVA_OPTIONS='-Djava.io.tmpdir={output_dir}'
Socrates all -t {threads} --bowtie2_threads {threads} --bowtie2_db {bowtie2_ref_dir} --jvm_memory {jvm_mem}g {bam}
        '''.format(output_dir=output_dir, threads=threads, bowtie2_ref_dir=bowtie2_ref_dir, jvm_mem=jvm_mem, bam=bam_in)
        run_stage(self.state, 'structural_variants_socrates', command)
Example #12
 def align_bwa(self, inputs, bam_out, sample, id):
     """Align the paired end fastq files to the reference genome using bwa"""
     fastq_read1_in, fastq_read2_in = inputs
     cores = self.get_stage_options("align_bwa", "cores")
     read_group = '"@RG\\tID:{id}\\tSM:{sample}\\tPL:Illumina"'.format(sample=sample, id=id)
     command = (
         "bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} "
         "| samtools view -b -h -o {bam} -".format(
             cores=cores,
             read_group=read_group,
             fastq_read1=fastq_read1_in,
             fastq_read2=fastq_read2_in,
             reference=self.reference,
             bam=bam_out,
         )
     )
     safe_make_dir("results/alignments/{sample}".format(sample=sample))
     run_stage(self.state, "align_bwa", command)
Example #13
 def align_bwa(self, inputs, bam_out, sample_id, lib):
     '''Align the paired end fastq files to the reference genome using bwa'''
     fastq_read1_in, fastq_read2_in = inputs
     cores = self.get_stage_options('align_bwa', 'cores')
     safe_make_dir('alignments')
     read_group = '"@RG\\tID:{sample}\\tSM:{sample}\\tPU:lib1\\tPL:Illumina"' \
         .format(sample=sample_id)
     command = 'bwa mem -M -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
               '| {bamclipper} -i -p {primer_bedpe_file} -n 1 ' \
               '| samtools view -b -h -o {bam} -' \
               .format(cores=cores,
                       read_group=read_group,
                       fastq_read1=fastq_read1_in,
                       fastq_read2=fastq_read2_in,
                       reference=self.reference,
                       bamclipper=self.bamclipper,
                       primer_bedpe_file=self.primer_bedpe_file,
                       bam=bam_out)
     run_stage(self.state, 'align_bwa', command)
Example #14
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into gatk and undr_rover vcfs
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = [line for line in inputf.read().splitlines() if line]

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the clipped, sorted BAM file for the sample and grab the sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### sort and index undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example #15
 def call_haplotypecaller_gatk(self, bam_in, vcf_out):
     '''Call variants using GATK'''
     safe_make_dir('variants/gatk')
     # safe_make_dir('variants/{sample}'.format(sample=sample_id))
     gatk_args = "-T HaplotypeCaller -R {reference} --min_base_quality_score 20 " \
                 "--emitRefConfidence GVCF " \
                 "-A AlleleBalance -A AlleleBalanceBySample " \
                 "-A ChromosomeCounts -A ClippingRankSumTest " \
                 "-A Coverage -A DepthPerAlleleBySample " \
                 "-A DepthPerSampleHC -A FisherStrand " \
                 "-A GCContent -A GenotypeSummaries " \
                 "-A HardyWeinberg -A HomopolymerRun " \
                 "-A LikelihoodRankSumTest -A LowMQ " \
                 "-A MappingQualityRankSumTest -A MappingQualityZero " \
                 "-A QualByDepth " \
                 "-A RMSMappingQuality -A ReadPosRankSumTest " \
                 "-A SampleList -A SpanningDeletions " \
                 "-A StrandBiasBySample -A StrandOddsRatio " \
                 "-A TandemRepeatAnnotator -A VariantType " \
                 "-I {bam} -L {interval_list} -o {out}".format(reference=self.reference,
                                                               bam=bam_in, interval_list=self.interval_file, out=vcf_out)
     self.run_gatk('call_haplotypecaller_gatk', gatk_args)
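self.run_gatk is the GATK 3 counterpart of run_picard and is likewise not shown here. A minimal sketch, assuming the jar path is held on the stages object as self.gatk_jar (as in the CombineGVCFs example below) and that memory comes from the stage options:

 def run_gatk(self, stage_name, gatk_args):
     '''Hypothetical wrapper: run a GATK 3 walker via the GenomeAnalysisTK jar.'''
     mem = self.state.config.get_stage_options(stage_name, 'mem')
     command = 'java -Xmx{mem}g -jar {jar} {args}'.format(
         mem=mem, jar=self.gatk_jar, args=gatk_args)
     run_stage(self.state, stage_name, command)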
Example #16
    def combine_gvcf_gatk(self, vcf_files_in, vcf_out):
        '''Combine G.VCF files for all samples using GATK'''
        safe_make_dir('processed')
        safe_make_dir('processed/gatk')
        merge_commands = []
        temp_merge_outputs = []
        for n in range(0, int(math.ceil(float(len(vcf_files_in)) / 200.0))):
            start = n * 200
            filelist = vcf_files_in[start:start + 200]
            filelist_command = ' '.join(
                ['--variant ' + vcf for vcf in filelist])
            temp_merge_filename = vcf_out.rstrip(
                '.vcf') + ".temp_{start}.vcf".format(start=str(start))
            gatk_args_full = "java -Xmx{mem}g -jar {jar_path} -T CombineGVCFs -R {reference} " \
                             "--disable_auto_index_creation_and_locking_when_reading_rods " \
                             "{g_vcf_files} -o {vcf_out}; ".format(reference=self.reference,
                                                                   jar_path=self.gatk_jar,
                                                                   mem=self.state.config.get_stage_options('combine_gvcf_gatk', 'mem'),
                                                                   g_vcf_files=filelist_command,
                                                                   vcf_out=temp_merge_filename)
            merge_commands.append(gatk_args_full)
            temp_merge_outputs.append(temp_merge_filename)

        final_merge_vcfs = ' '.join(
            ['--variant ' + vcf for vcf in temp_merge_outputs])
        gatk_args_full_final = "java -Xmx{mem}g -jar {jar_path} -T CombineGVCFs -R {reference} " \
                               "--disable_auto_index_creation_and_locking_when_reading_rods " \
                               "{g_vcf_files} -o {vcf_out}".format(reference=self.reference,
                                                                   jar_path=self.gatk_jar,
                                                                   mem=self.state.config.get_stage_options('combine_gvcf_gatk', 'mem'),
                                                                   g_vcf_files=final_merge_vcfs,
                                                                   vcf_out=vcf_out)

        merge_commands.append(gatk_args_full_final)
        final_command = ''.join(merge_commands)
        run_stage(self.state, 'combine_gvcf_gatk', final_command)
Example #17
    def concatenate_vcfs(self, vcf_files_in, vcf_out):
        safe_make_dir('processed')
        safe_make_dir('processed/vardict')

        merge_commands = []
        temp_merge_outputs = []
        for n in range(0, int(math.ceil(float(len(vcf_files_in)) / 200.0))):
            start = n * 200
            filelist = vcf_files_in[start:start + 200]
            filelist_command = ' '.join(filelist)
            temp_merge_filename = vcf_out.rstrip(
                '.vcf') + ".temp_{start}.vcf".format(start=str(start))
            command1 = 'bcftools merge -O z -o {vcf_out} {join_vcf_files} && bcftools index -t -f {vcf_out}; '.format(
                vcf_out=temp_merge_filename, join_vcf_files=filelist_command)
            merge_commands.append(command1)
            temp_merge_outputs.append(temp_merge_filename)

        final_merge_vcfs = ' '.join(temp_merge_outputs)
        command2 = 'bcftools merge -O z -o {vcf_out} {join_vcf_files} '.format(
            vcf_out=vcf_out, join_vcf_files=final_merge_vcfs)

        merge_commands.append(command2)
        final_command = ''.join(merge_commands)
        run_stage(self.state, 'concatenate_vcfs', final_command)
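Both combine_gvcf_gatk and concatenate_vcfs batch their inputs in groups of 200 so that no single java or bcftools invocation receives an unmanageably long argument list. The slice arithmetic is easy to sanity-check on its own; the file names below are invented purely to show the batching.

import math

vcfs = ['sample{:03d}.sorted.vcf.gz'.format(i) for i in range(450)]
batches = [vcfs[n * 200:(n + 1) * 200]
           for n in range(int(math.ceil(float(len(vcfs)) / 200.0)))]
print([len(b) for b in batches])   # [200, 200, 50]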
Example #18
 def fastqc(self, fastq_in, dir_out):
     '''Quality check fastq file using fastqc'''
     safe_make_dir(dir_out)
     command = "fastqc --quiet -o {dir} {fastq}".format(dir=dir_out,
                                                        fastq=fastq_in)
     run_stage(self.state, 'fastqc', command)
Example #19
 def fastqc(self, fastq_in, dir_out):
     '''Quality check fastq file using fastqc'''
     safe_make_dir(dir_out)
     command = "fastqc --quiet -o {dir} {fastq}".format(dir=dir_out, fastq=fastq_in)
     run_stage(self.state, 'fastqc', command)
Example #20
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
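None of the make_pipeline* functions run anything on their own; they only wire stages together and return a Ruffus Pipeline object. A minimal driver is sketched below, assuming Ruffus 2.6's object-oriented interface and a hypothetical load_state() helper that builds the state object from the pipeline's YAML configuration.

import ruffus.cmdline as cmdline

parser = cmdline.get_argparse(description='cellfree_seq pipeline')
options = parser.parse_args()
state = load_state(options)   # hypothetical: parse the YAML config into a state object
pipeline = make_pipeline(state)
pipeline.run(multiprocess=4)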
Example #21
def make_pipeline_process(state):
    # Build the post-processing pipeline that combines the outputs of the processed runs

    # Define empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    #grab files from each of the processed directories in "runs"
    gatk_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))

    stages = Stages(state)

    #dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='processed/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    #Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    ####### vardict stuff

    vardict_files = []
    for directory in run_directories:
        vardict_files.extend(
            glob.glob(directory + '/variants/vardict/*sorted.vcf.gz'))

    #dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_vardict,
                       name='glob_vardict',
                       output=vardict_files)

    safe_make_dir('processed/vardict')

    #concatenate all vardict vcfs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_vardict'),
                   output='processed/vardict/combined.vcf.gz')

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise_vardict',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise_vardict'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep_vardict',
        input=output_from('vt_decompose_normalise_vardict'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
Example #22
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
    #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'
        ),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run LocatIt from Agilent. This should produce sorted BAM files, so no sorting is needed at the next step
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced above
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Put the BAMs that pass the filters into the pass_samples folder and pass the glob of that folder on to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
Example #23
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fampipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # Make directories for outputs
    safe_make_dir('results')
    safe_make_dir('results/alignments')
    safe_make_dir('results/variants')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # IF THE READS ARE SPLIT IN LANES e.g. FAM_f2_SM_f2i5_ID_idx46-TCCCGA-L001-L002_LB_lb_PL_ILLUMINA_R2
        # filter=formatter(
            # '.+/FAM_(?P<famid>[a-zA-Z0-9]+)_SM_(?P<sample>[a-zA-Z0-9-]+)_ID_(?P<runid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1.fastq.gz'),
        filter=formatter('.+/FAM_(?P<fam>[a-zA-Z0-9]+)_SM_(?P<sample>[a-zA-Z0-9]+)' \
                 '_ID_(?P<id>[a-zA-Z0-9-]+)_LB_(?P<lb>[a-zA-Z0-9]+)' \
                 '_PL_(?P<pl>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/FAM_{fam[0]}_SM_{sample[0]}_ID_{id[0]}' \
                              '_LB_{lb[0]}_PL_{pl[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{id[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='results/alignments/{sample[0]}/FAM_{fam[0]}_SM_{sample[0]}_ID_{id[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.dedup.realn.recal.bam'),
        output='results/variants/{sample[0]}.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='FAMExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['FAMExomes.snp_recal', 'FAMExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['FAMExomes.indel_recal', 'FAMExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['FAMExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Filter variants using GATK
    pipeline.transform(
        task_func=stages.filter_variants_gatk,
        name='filter_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.filtered.vcf')

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('filter_variants_gatk'),
        filter=suffix('.filtered.vcf'),
        output='.selected.vcf')

    # Rare variant genotyping using FamSeq
    pipeline.transform(
        task_func=stages.rare_variants_famseq,
        name='rare_variants_famseq',
        input=output_from('select_variants_gatk'),
        filter=suffix('.selected.vcf'),
        output='.famseq.vcf')

    return pipeline
Example #24
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipelinex')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # Create output directories
    safe_make_dir('fastqc')
    safe_make_dir('alignments')
    safe_make_dir('variants')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # FastQC on all FASTQ files
    pipeline.transform(
        task_func=stages.qc_fastqc,
        name='qc_fastqc',
        input=output_from('original_fastqs'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_(?P<read>[12]).fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        ##add_inputs=add_inputs(
        ##    '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        # extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        extras=['{sample[0]}'],
        # The output is a per-sample FastQC output directory.
        output='fastqc/{sample[0]}/')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')
        .follows('qc_fastqc'))

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsdup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator2',
        input=output_from('mark_duplicates_picard2'),
        filter=suffix('.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).merged.intervals'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(
            ['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))
    #
    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')


    return pipeline
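
The collate step above regroups lane-level recalibrated BAMs into per-sample merged BAMs purely by filename: every input whose formatter() captures agree on the sample group is sent to one merged output. Below is a small standalone sketch of that grouping; the file names are hypothetical and the capture groups mirror the merge_sample_bams filter above.

import re
from collections import defaultdict

# Same capture groups as the merge_sample_bams filter above.
LANE_BAM = re.compile(
    r'.+/(?P<readid>[a-zA-Z0-9.-]+)_(?P<lib>[a-zA-Z0-9:-]+)_'
    r'(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)'
    r'\.sort\.dedup\.realn\.recal\.bam$')

# Hypothetical lane-level BAMs as produced by print_reads_gatk.
lane_bams = [
    'alignments/sample-A/read1_lib1_L001_sample-A.sort.dedup.realn.recal.bam',
    'alignments/sample-A/read1_lib1_L002_sample-A.sort.dedup.realn.recal.bam',
    'alignments/sample-B/read2_lib1_L001_sample-B.sort.dedup.realn.recal.bam',
]

# collate() groups every input whose regex captures agree on 'sample'.
by_sample = defaultdict(list)
for path in lane_bams:
    match = LANE_BAM.match(path)
    if match:
        by_sample[match.group('sample')].append(path)

for sample, bams in sorted(by_sample.items()):
    merged = 'alignments/{0}/{0}.merged.bam'.format(sample)
    print(merged, '<-', bams)
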
Example #25
0
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'
        ),
        # Add one more input to the stage:
        #    1. The corresponding R2 (read 2) FASTQ file; the R1/R2 filename
        #       pairing is sketched after this example.
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{lib[0]}'],
        # The output BAM is written under alignments/ and named after the sample.
        output='alignments/{sample[0]}.clipped.sort.hq.bam')

    # Generate per-amplicon mapping metrics.
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file).follows('generate_stats'))

    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline
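
The align_bwa stage pairs each R1 FASTQ with its R2 mate by filename alone: the formatter() regex captures sample and lib from the R1 path, and add_inputs() rebuilds the matching R2 path from those captures. Below is a standalone sketch of that pairing using a hypothetical FASTQ name; the regex is the one used in the filter above.

import re

# Same capture groups as the align_bwa filter above.
R1_PATTERN = re.compile(
    r'.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+)\.fastq\.gz$')

def paired_fastqs(r1_path):
    '''Resolve the R2 mate, sample and lib for a hypothetical R1 FASTQ path,
    mirroring what formatter() plus add_inputs() supply to align_bwa.'''
    match = R1_PATTERN.match(r1_path)
    if match is None:
        raise ValueError('not an R1 FASTQ: {0}'.format(r1_path))
    sample, lib = match.group('sample'), match.group('lib')
    directory = r1_path.rsplit('/', 1)[0]
    r2_path = '{0}/{1}_R2_{2}.fastq.gz'.format(directory, sample, lib)
    return r1_path, r2_path, sample, lib

# Hypothetical FASTQ pair under fastqs/.
print(paired_fastqs('fastqs/sample-A01_R1_lib1.fastq.gz'))
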
Example #26
0
def make_pipeline_process(state):
    # Define an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab the GATK g.vcf and undr_rover VCF files from each of the processed run directories in "runs"
    gatk_files = []
    undr_rover_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))
        undr_rover_files.extend(
            glob.glob(directory + '/variants/undr_rover/*sorted.vcf.gz'))

    # Stages are dependent on the state
    stages = Stages(state)

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Dummy stage to grab the undr_rover VCF files
    pipeline.originate(task_func=stages.glob_undr_rover,
                       name='glob_undr_rover',
                       output=undr_rover_files)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')

    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_undr_rover'),
                   output='variants/undr_rover/combined_undr_rover.vcf.gz')

    pipeline.transform(task_func=stages.index_final_vcf,
                       name='index_final_vcf',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply genotype (GT) filters to the genotyped VCF
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter the annotated VCF
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        add_inputs=add_inputs(
            ['variants/undr_rover/combined_undr_rover.vcf.gz']),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf').follows(
            'index_final_vcf'))

    return pipeline
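
The concatenate_vcfs and index_final_vcf stages are only referenced in make_pipeline_process, not defined here. The following is a hypothetical sketch of what such Stages methods could look like, written in the same style as the other stage methods; it assumes bcftools and tabix are available and that run_stage and self.state behave as elsewhere in the pipeline. The real implementations may differ.

    def concatenate_vcfs(self, vcf_files_in, vcf_out):
        '''Concatenate the per-sample undr_rover VCFs into one compressed VCF.
        Hypothetical sketch: assumes bcftools is on the PATH.'''
        merged_vcfs = ' '.join(vcf_files_in)
        command = 'bcftools concat -a -O z -o {vcf_out} {merged_vcfs}'.format(
            vcf_out=vcf_out, merged_vcfs=merged_vcfs)
        run_stage(self.state, 'concatenate_vcfs', command)

    def index_final_vcf(self, vcf_in, tbi_out):
        '''Tabix-index the combined VCF (hypothetical sketch). tbi_out is the
        .tbi file ruffus tracks; tabix writes it alongside vcf_in.'''
        command = 'tabix -p vcf {vcf_in}'.format(vcf_in=vcf_in)
        run_stage(self.state, 'index_final_vcf', command)
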