Example #1
def indel_realigner(core_req=4,  # proxy for mem_req until I test mem_req out
                    mem_req=8 * 1024,
                    contig=None,
                    in_bams=find('bam$', n='>0'),
                    in_bais=find('bai$', n='>0'),
                    in_sites=find('denovo_realign_targets.bed'),
                    out_bam=out_dir('realigned.bam'),
                    out_bai=out_dir('realigned.bai')):
    in_bams = bam_list_to_inputs(in_bams)

    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]

    return r"""
        # IR does not support parallelization
        {gatk} \
        -T IndelRealigner \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_bam} \
        -targetIntervals {in_sites} \
        {knowns} \
        -model USE_READS \
        --filter_bases_not_stored \
        {intervals}

        {s[opt][samtools]} index {out_bam}
    """.format(s=s,
               intervals=arg('--intervals', contig),
               gatk=gatk(mem_req),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
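The snippet leans on helpers from the surrounding pipeline modules. A minimal sketch of what the template implies, assuming arg() omits the flag entirely when its value is None and bam_list_to_inputs() joins paths so the single leading -I expands to one -I per BAM (both are guesses at the helpers, not the library's actual code):

def arg(flag, value=None):
    # Render "flag value", or nothing when value is None, so optional
    # arguments such as --intervals disappear from the command line.
    return '%s %s' % (flag, value) if value is not None else ''


def bam_list_to_inputs(in_bams):
    # The template writes "-I {in_bams}", so joining with " -I " yields
    # "-I a.bam -I b.bam" after interpolation.
    return ' -I '.join(map(str, in_bams))

Under that reading, contig=None leaves the {intervals} slot empty and IndelRealigner runs genome-wide.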
Example #2
def indel_realigner(core_req=4,  # proxy for mem_req until I test mem_req out
                    mem_req=8 * 1024,
                    contig=None,
                    in_bams=find('bam$', n='>0'),
                    in_bais=find('bai$', n='>0'),
                    in_sites=find('denovo_realign_targets.bed'),
                    out_bam=out_dir('realigned.bam'),
                    out_bai=out_dir('realigned.bai')):
    in_bams = bam_list_to_inputs(in_bams)

    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]

    return r"""
        # IR does not support parallelization
        {gatk} \
        -T IndelRealigner \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_bam} \
        -targetIntervals {in_sites} \
        {knowns} \
        -model USE_READS \
        --filter_bases_not_stored \
        {intervals}

        {s[opt][samtools]} index {out_bam}
    """.format(s=s,
               intervals=arg('--intervals', contig),
               gatk=gatk(mem_req),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Example #3
def merge_sam_files(in_bams=find('bam', n='>=1'),
                    out_bam=out_dir('merged.bam'),
                    out_bai=out_dir('merged.bai')):
    return r"""
        {picard} MergeSamFiles \
        {inputs} \
        O={out_bam} \
        ASSUME_SORTED=True \
        CREATE_INDEX=True
    """.format(picard=picard(), inputs=list_to_input(in_bams), **locals())
Example #4
def mark_illumina_adapters(mem_req=8 * 1024,
                           in_bam=find('bam'),
                           out_bam=out_dir('unaligned_trimmed.bam'),
                           out_metrics=out_dir('adapter.metrics')):
    return r"""
        {picard} MarkIlluminaAdapters \
        I={in_bam} \
        O={out_bam} \
        METRICS={out_metrics}
    """.format(s=s, picard=picard(), **locals())
Example #5
def bwa_mem(rgid, sample_name, library, platform, platform_unit,
            reference=s['ref']['reference_fasta'],
            core_req=16,
            in_fastqs=find('.fastq|.fq|.fq.gz|.fastq.gz', n=2),
            out_cutadapt_log=out_dir('cutadapt.log'),
            out_bam=out_dir('aligned.bam'),
            out_bai=out_dir('aligned.bai')):
    in_fastq1, in_fastq2 = in_fastqs
    fifo1 = out_bam.replace('aligned.bam', 'fifo1')  # unused; left over from the
    fifo2 = out_bam.replace('aligned.bam', 'fifo2')  # streaming variant commented out below

    return r"""
        {s[opt][bwa]} mem \
          -t {bwa_cores} -L 0 -M \
          -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
          {reference} \
          {in_fastq1} \
          {in_fastq2} \
        | {picard} SortSam I=/dev/stdin O={out_bam} CREATE_INDEX=true SORT_ORDER=coordinate
        """.format(s=s,
                   bwa_cores=core_req - 2,  # reserve two cores for the piped SortSam
                   picard=picard.picard(),
                   **locals())


    # @can_stream(['in_fastq1', 'in_fastq2'])
    # def bwa_mem_with_trimming(rgid, sample_name, library, platform, platform_unit,
    # reference=s['ref']['reference_fasta'],
    #             core_req=16,
    #             in_fastq1=find('.fastq', tags=dict(read_pair='1')),
    #             in_fastq2=find('.fastq', tags=dict(read_pair='2')),
    #             out_bam=out_dir('aligned.bam'),
    #             out_bai=out_dir('aligned.bam.bai'),
    #             out_adapter_metrics=out_dir('adapter.metrics')):
    #     return r"""
    #
    #             {fastq_to_sam} \
    #             | {mark_illumina_adapters} \
    #             | {sam_to_fastq}
    #             | {s[opt][bwa]} mem \
    #               -t {core_req} -L 0 -M -p \
    #               -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
    #               {reference} \
    #               /dev/stdin \
    #             | {s[opt][samtools]} sort -@ 2 -m 2G - {samtools_out}
    #
    #             {s[opt][samtools]} index {out_bam}
    #             """.format(s=s,
    #                        fastq_to_sam=picard.fastq_to_sam(rgid=rgid, sample_name=sample_name, library=library, platform=platform, platform_unit=platform_unit,
    #                                                         in_fastq1=in_fastq1, in_fastq2=in_fastq2, out_bam='/dev/stdout').strip(),
    #                        mark_illumina_adapters=picard.mark_illumina_adapters(in_bam='/dev/stdin', out_bam='/dev/stdout', metrics=out_adapter_metrics).strip(),
    #                        sam_to_fastq=picard.sam_to_fastq_interleave('/dev/stdin', '/dev/stdout'),
    #
    #                        samtools_out=out_bam.replace('.bam', ''),
    #                        **locals())
Example #6
def merge_sam_files(in_bams=find('bam', n='>=1'),
                    out_bam=out_dir('merged.bam'),
                    out_bai=out_dir('merged.bai')):
    return r"""
        {picard} MergeSamFiles \
        {inputs} \
        O={out_bam} \
        ASSUME_SORTED=True \
        CREATE_INDEX=True
    """.format(picard=picard(),
               inputs=list_to_input(in_bams),
               **locals())
Example #7
def mark_illumina_adapters(mem_req=8 * 1024,
                           in_bam=find('bam'),
                           out_bam=out_dir('unaligned_trimmed.bam'),
                           out_metrics=out_dir('adapter.metrics')):
    return r"""
        {picard} MarkIlluminaAdapters \
        I={in_bam} \
        O={out_bam} \
        METRICS={out_metrics}
    """.format(s=s,
               picard=picard(),
               **locals())
Example #8
def trim_galore(in_fastq1=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
                in_fastq2=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
                out_directory=out_dir(''),
                out_fastq1=out_dir('trimmed_r1.fastq.gz'),
                out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    return r"""
        {s[opt][trim_galore]} \
        --paired \
        --dont_gzip \
        -o {out_directory} \
        --path_to_cutadapt {s[opt][cutadapt]} \
        {in_fastq1} {in_fastq2}
    """.format(s=s, **locals())
Example #9
def trim_galore(in_fastq1=find('fq.gz|\.fastq|fastq.gz',
                               tags=dict(read_pair='1')),
                in_fastq2=find('fq.gz|\.fastq|fastq.gz',
                               tags=dict(read_pair='2')),
                out_directory=out_dir(''),
                out_fastq1=out_dir('trimmed_r1.fastq.gz'),
                out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    return r"""
        {s[opt][trim_galore]} \
        --paired \
        --dont_gzip \
        -o {out_directory} \
        --path_to_cutadapt {s[opt][cutadapt]} \
        {in_fastq1} {in_fastq2}
    """.format(s=s, **locals())
Example #10
def haplotype_caller(core_req=16,
                     mem_req=12 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)
    return r"""
        {gatk} \
        -T HaplotypeCaller \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nct {core_req} \
        --emitRefConfidence GVCF \
        -stand_call_conf 30 \
        -stand_emit_conf 10 \
        -I {in_bams} \
        -o {out_vcf} \
        {intervals} \
        -A Coverage \
        -A GCContent \
        -A AlleleBalanceBySample \
        -A AlleleBalance \
        -A MappingQualityRankSumTest \
        -A InbreedingCoeff \
        -A FisherStrand \
        -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())
Example #11
def word_count(chars=False,
               in_txts=find('txt$', n='>=1'),
               out_txt=out_dir('wc.txt')):
    c = ' -c' if chars else ''
    return 'wc{c} {input} > {out_txt}'.format(
        input=' '.join(map(str, in_txts)), **locals())
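Calling the task directly (bypassing the find()/out_dir() defaults) shows the rendered command:

print(word_count(chars=True, in_txts=['a.txt', 'b.txt'], out_txt='wc.txt'))
# wc -c a.txt b.txt > wc.txt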
Example #12
def realigner_target_creator(core_req=8,
                             mem_req=8 * 1024,
                             in_target_bed=find('target.bed'),
                             in_bams=find('bam$', n='>0'),
                             in_bais=find('bai$', n='>0'),
                             out_bams=forward('in_bams'),
                             out_bais=forward('in_bais'),
                             out_sites=out_dir('denovo_realign_targets.bed')):
    in_bams = bam_list_to_inputs(in_bams)

    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]

    # TODO should we pad intervals?  There might be indels on the perimeter that need realignment.  Not too worried because we're using HaplotypeCaller, though.
    return r"""
        #could add more knowns from ESP and other seq projects...
        {gatk} \
        -T RealignerTargetCreator \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_sites} \
        {knowns} \
        -nt {core_req} \
        {args}
    """.format(s=s, gatk=gatk(mem_req),
               args=arg('--intervals', in_target_bed),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Example #13
def sam_to_fastq_interleave(in_bam=find('bam$'),
                            out_fastq=out_dir('reads.fastq')):
    return r"""
        {picard} SamToFastq \
        I={in_bam} \
        FASTQ={out_fastq} \
        INTERLEAVE=true
    """.format(s=s, picard=picard(), **locals())
Example #14
def haplotype_caller(core_req=16,
                     mem_req=29 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)
    return r"""
        {gatk} \
        -T HaplotypeCaller \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nct {core_req} \
        --emitRefConfidence GVCF \
        -I {in_bams} \
        -o {out_vcf} \
        {intervals} \
        -A Coverage \
        -A GCContent \
        -A AlleleBalanceBySample \
        -A AlleleBalance \
        -A MappingQualityRankSumTest \
        -A InbreedingCoeff \
        -A FisherStrand \
        -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())
Example #15
def filter_bed_by_contig(contig,
                         drm='local',
                         in_bed=find('bed$'),
                         out_bed=out_dir('target.bed')):
    return r"""
        grep -P "^{contig}\t" {in_bed} > {out_bed}
    """.format(s=s, **locals())
Example #16
def realigner_target_creator(core_req=8,
                             mem_req=8 * 1024,
                             in_target_bed=find('target.bed'),
                             in_bams=find('bam$', n='>0'),
                             in_bais=find('bai$', n='>0'),
                             out_bams=forward('in_bams'),
                             out_bais=forward('in_bais'),
                             out_sites=out_dir('denovo_realign_targets.bed')):
    in_bams = bam_list_to_inputs(in_bams)

    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]

    # TODO should we pad intervals?  There might be indels on the perimeter that need realignment.  Not too worried because we're using HaplotypeCaller, though.
    return r"""
        #could add more knowns from ESP and other seq projects...
        {gatk} \
        -T RealignerTargetCreator \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_sites} \
        {knowns} \
        -nt {core_req} \
        {args}
    """.format(s=s, gatk=gatk(mem_req),
               args=arg('--intervals', in_target_bed),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Example #17
def filter_bed_by_contig(contig,
                         drm='local',
                         in_bed=find('bed$'),
                         out_bed=out_dir('target.bed')):
    return r"""
        grep -P "^{contig}\t" {in_bed} > {out_bed}
    """.format(s=settings, **locals())
Example #18
def run_germline(execution,
                 max_cores,
                 max_attempts,
                 target_bed,
                 input_path=None,
                 s3fs=None):
    """
    Executes the germline variant calling pipeline

    :type execution: Execution
    :param str target_bed: The target bed to call variants in
    :param str input_path: The path to the input_file tsv of fastq files
    """
    #: chrom -> target_bed_path
    target_bed = os.path.abspath(os.path.expanduser(target_bed))
    input_path = os.path.abspath(os.path.expanduser(input_path))

    # Copy the target.bed to the output_dir
    assert os.path.exists(target_bed), '%s does not exist' % target_bed
    cp_target_bed_task = execution.add_task(
        lambda drm='local', out_bed=out_dir('target.bed'): 'cp %s %s' %
        (target_bed, out_bed),
        out_dir='',
        stage_name='Copy_Target_Bed')

    target_bed_tasks = [
        execution.add_task(bed.filter_bed_by_contig, dict(contig=contig),
                           [cp_target_bed_task], 'work/contigs/{contig}')
        for contig in util.get_bed_contigs(target_bed)
    ]

    fastq_tasks = list(util.gen_fastq_tasks(execution, input_path))
    # fastq_tasks = [execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
    # for fastq_path, tags in parse_inputs(input_path)]

    fastqc_tasks = many2one(fastqc.fastqc,
                            fastq_tasks, ['sample_name', 'library'],
                            out_dir='SM_{sample_name}/qc/LB_{library}')

    # fastq_tasks = split_large_fastq_files(execution, fastq_tasks) # not working yet
    aligned_tasks = align(execution, fastq_tasks, target_bed_tasks)
    call_task = variant_call(execution, aligned_tasks, target_bed_tasks)

    execution.run(max_cores=max_cores,
                  max_attempts=max_attempts,
                  cmd_wrapper=make_s3_cmd_fxn_wrapper(s3fs)
                  if s3fs else shared_fs_cmd_fxn_wrapper)

    if execution.successful:
        execution.log.info('Final vcf: %s' % opj(
            s3fs if s3fs else execution.output_dir, call_task.output_files[0]))

    # Copy the sqlite db to s3
    dburl = env.config['gk']['database_url']
    if s3fs and dburl.startswith('sqlite'):
        # TODO implement so there is a 1-to-1 relationship between a sqlite database and an Execution.  Currently this is pushing way too much information,
        # TODO but will soon be replaced.  Alternative: use amazon RDS!  Or perhaps both?  Could do a sqlalchemy merge and save to sqlite, or implement
        # TODO cosmos multiverse
        s3cmd.cp(dburl.replace('sqlite:///', ''), opj(s3fs,
                                                      'sqlite.db.backup'))
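A hypothetical driver for this entry point. The import path and Cosmos calls follow the usual COSMOS2 pattern, but the database URL, output directory, and file paths are illustrative assumptions:

from cosmos.api import Cosmos  # assumed import path

cosmos = Cosmos('sqlite:///germline.db')
cosmos.initdb()
execution = cosmos.start('germline', output_dir='out/germline', restart=False)
run_germline(execution,
             max_cores=32,
             max_attempts=2,
             target_bed='capture/target.bed',
             input_path='fastqs.tsv')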
Example #19
def cut_adapt(minimum_length=50,
              in_fastq1=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
              in_fastq2=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
              out_fastq1=out_dir('trimmed_r1.fastq.gz'),
              out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    # out_fastq1='>( gzip > %s)' % out_fastq1
    # out_fastq2='>( gzip > %s)' % out_fastq2
    return r"""
        {s[opt][cutadapt]} \
        -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
        {args} \
        -o {out_fastq1} -p {out_fastq2} \
        {in_fastq1} {in_fastq2}
    """.format(s=s,
               args=args(('--minimum-length', minimum_length)),
               **locals())
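args() (plural) takes (flag, value) pairs; freebayes below passes no_complex=True and expects a bare flag, so a plausible sketch is:

def args(*pairs):
    # True renders a bare flag, None/False omits it, anything else
    # renders "flag value" (a guess at the helper's contract).
    rendered = []
    for flag, value in pairs:
        if value is True:
            rendered.append(str(flag))
        elif value not in (None, False):
            rendered.append('%s %s' % (flag, value))
    return ' '.join(rendered)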
Example #20
def gen_fastq_tasks(execution, input_path):
    for fastq_path, tags in parse_inputs(input_path):
        if fastq_path.startswith('s3://'):
            yield execution.add_task(download_from_s3,
                                     dict(in_file=fastq_path, out_file=out_dir('SM_{sample_name}/work/input/%s' % os.path.basename(fastq_path)), **tags),
                                     stage_name='Download_Fastqs_From_S3')
        else:
            yield execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
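parse_inputs() must yield (fastq_path, tags) pairs whose tags include at least sample_name, since the S3 branch formats it into the output path. A minimal stand-in that assumes a headered TSV with a fastq_path column plus one column per tag (the column names are guesses):

import csv

def parse_inputs(input_path):
    with open(input_path) as fh:
        for row in csv.DictReader(fh, delimiter='\t'):
            yield row.pop('fastq_path'), row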
Example #21
def sam_to_fastq_interleave(in_bam=find('bam$'),
                            out_fastq=out_dir('reads.fastq')):
    return r"""
        {picard} SamToFastq \
        I={in_bam} \
        FASTQ={out_fastq} \
        INTERLEAVE=true
    """.format(s=s,
               picard=picard(),
               **locals())
Example #22
def collect_variant_calling_metrics(in_vcf=find('in_vcf'),
                                    in_dbsnp=s['ref']['dbsnp_vcf'],
                                    out_path=out_dir('picard.variant_metrics')):
    return r"""
        {picard} CollectVariantCallingMetrics \
        I={in_vcf} \
        DBSNP={in_dbsnp} \
        O={out_path}
    """.format(picard=picard(), **locals())
Example #23
def cut_adapt(minimum_length=50,
              in_fastq1=find('fq.gz|\.fastq|fastq.gz',
                             tags=dict(read_pair='1')),
              in_fastq2=find('fq.gz|\.fastq|fastq.gz',
                             tags=dict(read_pair='2')),
              out_fastq1=out_dir('trimmed_r1.fastq.gz'),
              out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    # out_fastq1='>( gzip > %s)' % out_fastq1
    # out_fastq2='>( gzip > %s)' % out_fastq2
    return r"""
        {s[opt][cutadapt]} \
        -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
        {args} \
        -o {out_fastq1} -p {out_fastq2} \
        {in_fastq1} {in_fastq2}
    """.format(s=s,
               args=args(('--minimum-length', minimum_length)),
               **locals())
Example #24
def collect_variant_calling_metrics(
        in_vcf=find('in_vcf'),
        in_dbsnp=s['ref']['dbsnp_vcf'],
        out_path=out_dir('picard.variant_metrics')):
    return r"""
        {picard} CollectVariantCallingMetrics \
        I={in_vcf} \
        DBSNP={in_dbsnp} \
        O={out_path}
    """.format(picard=picard(), **locals())
Example #25
def mark_duplicates(
        core_req=4,  # for scratch space
        mem_req=12 * 1024,
        in_bams=find('bam$', n='>=1'),
        in_bais=find('bai$', n='>=1'),
        out_bam=out_dir('deduped.bam'),
        out_bai=out_dir('deduped.bam.bai'),
        out_metrics=out_dir('deduped.metrics')):
    return r"""
        {picard} MarkDuplicates \
        {inputs} \
        O={out_bam} \
        METRICS_FILE={out_metrics} \
        ASSUME_SORTED=True \
        MAX_RECORDS_IN_RAM=1000000 \
        VALIDATION_STRINGENCY=SILENT \
        VERBOSITY=INFO

        {s[opt][samtools]} index {out_bam}
    """.format(inputs=list_to_input(in_bams), s=s, picard=picard(), **locals())
Example #26
def merge(in_bams=find('bam$', n='>0'), out_bam=out_dir('merged.bam')):
    if len(in_bams) == 1:
        # Can't merge 1 bam, just copy it
        return r"""
        cp {in_bams[0]} {out_bam}
        """.format(**locals())
    else:
        in_bams = ' '.join(map(str, in_bams))
        return r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """.format(s=s, **locals())
Example #27
def collect_wgs_metrics(in_bam=find('bam'),
                        out_path=out_dir('picard.raw_wgs_metrics.txt'),
                        reference_fasta=s['ref']['reference_fasta']):
    return r"""
    {picard} CollectRawWgsMetrics \
      I={in_bam} \
      O={out_path} \
      R={reference_fasta} \
      INCLUDE_BQ_HISTOGRAM=true
    """.format(picard=picard(),
               **locals())
Example #28
def mark_duplicates(core_req=4,  # for scratch space
                    mem_req=12 * 1024,
                    in_bams=find('bam$', n='>=1'),
                    in_bais=find('bai$', n='>=1'),
                    out_bam=out_dir('deduped.bam'),
                    out_bai=out_dir('deduped.bam.bai'),
                    out_metrics=out_dir('deduped.metrics')):
    return r"""
        {picard} MarkDuplicates \
        {inputs} \
        O={out_bam} \
        METRICS_FILE={out_metrics} \
        ASSUME_SORTED=True \
        MAX_RECORDS_IN_RAM=1000000 \
        VALIDATION_STRINGENCY=SILENT \
        VERBOSITY=INFO

        {s[opt][samtools]} index {out_bam}
    """.format(inputs=list_to_input(in_bams), s=s,
               picard=picard(),
               **locals())
Example #29
def combine_gvcfs(mem_req=12 * 1024,
                  in_vcfs=find('vcf|vcf.gz$', n='>0'),
                  out_vcf=out_dir('variants.g.vcf')):
    in_vcfs = vcf_list_to_input(in_vcfs)

    return r"""
        {gatk} \
        -T CombineGVCFs \
        -R {s[ref][reference_fasta]} \
        {in_vcfs} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), **locals())
Example #30
def combine_gvcfs(mem_req=12 * 1024,
                  in_vcfs=find('vcf|vcf.gz$', n='>0'),
                  out_vcf=out_dir('variants.g.vcf')):
    in_vcfs = vcf_list_to_input(in_vcfs)

    return r"""
        {gatk} \
        -T CombineGVCFs \
        -R {s[ref][reference_fasta]} \
        {in_vcfs} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), **locals())
Example #31
def merge(in_bams=find('bam$', n='>0'),
          out_bam=out_dir('merged.bam')):
    if len(in_bams) == 1:
        # Can't merge 1 bam, just copy it
        return r"""
        cp {in_bams[0]} {out_bam}
        """.format(**locals())
    else:
        in_bams = ' '.join(map(str, in_bams))
        return r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """.format(s=s, **locals())
Example #32
def genotype_gvcfs(core_req=8,
                   mem_req=12 * 1024,
                   in_vcfs=find('vcf|vcf.gz$', n='>0'),
                   out_vcf=out_dir('variants.vcf')):
    return r"""
        {gatk} \
        -T GenotypeGVCFs \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nt {core_req} \
        {inputs} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), inputs=vcf_list_to_input(in_vcfs), **locals())
Example #33
def genotype_gvcfs(core_req=8,
                   mem_req=12 * 1024,
                   in_vcfs=find('vcf|vcf.gz$', n='>0'),
                   out_vcf=out_dir('variants.vcf')):
    return r"""
        {gatk} \
        -T GenotypeGVCFs \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nt {core_req} \
        {inputs} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), inputs=vcf_list_to_input(in_vcfs), **locals())
Example #34
def gen_fastq_tasks(execution, input_path):
    for fastq_path, tags in parse_inputs(input_path):
        if fastq_path.startswith('s3://'):
            yield execution.add_task(
                download_from_s3,
                dict(in_file=fastq_path,
                     out_file=out_dir('SM_{sample_name}/work/input/%s' %
                                      os.path.basename(fastq_path)),
                     **tags),
                stage_name='Download_Fastqs_From_S3')
        else:
            yield execution.add_task(load_input,
                                     dict(in_file=fastq_path, **tags),
                                     stage_name='Load_Fastqs')
Example #35
def freebayes(reference_fasta=settings['ref']['reference_fasta'],
              max_complex_gap=2,
              no_complex=True,
              in_target_bed=find('bed$'), in_bam=find('bam$'),
              out_vcf=out_dir('variants.vcf')):
    return r"""
        {s[opt][freebayes]} -f {reference_fasta} \
        --vcf {out_vcf} \
        --targets {in_target_bed} \
        {args} \
        -m 30 -q 10 -R 0 -S 0 -F 0.1 \
        {in_bam}
    """.format(s=settings,
               args=args(('--max-complex-gap', max_complex_gap),
                         ('--no-complex', no_complex)),
               **locals())
Example #36
def collect_multiple_metrics(in_bam=find('bam'),
                             out_path=out_dir('picard'),
                             reference_fasta=s['ref']['reference_fasta']):
    return r"""
      {picard} CollectMultipleMetrics \
      I={in_bam} \
      O={out_path} \
      R={reference_fasta} \
      {programs}
    """.format(picard=picard(),
               programs=' '.join('PROGRAM=%s' % p for p in
                                 ['CollectAlignmentSummaryMetrics', 'CollectInsertSizeMetrics',
                                  'QualityScoreDistribution', 'MeanQualityByCycle',
                                  'CollectBaseDistributionByCycle', 'CollectGcBiasMetrics',
                                  'CollectSequencingArtifactMetrics', 'CollectQualityYieldMetrics',
                                  ]),
               **locals())
Example #37
def freebayes(reference_fasta=settings['ref']['reference_fasta'],
              max_complex_gap=2,
              no_complex=True,
              in_target_bed=find('bed$'),
              in_bam=find('bam$'),
              out_vcf=out_dir('variants.vcf')):
    return r"""
        {s[opt][freebayes]} -f {reference_fasta} \
        --vcf {out_vcf} \
        --targets {in_target_bed} \
        {args} \
        -m 30 -q 10 -R 0 -S 0 -F 0.1 \
        {in_bam}
    """.format(s=settings,
               args=args(('--max-complex-gap', max_complex_gap),
                         ('--no-complex', no_complex)),
               **locals())
Example #38
def run_germline(execution, max_cores, max_attempts, target_bed, input_path=None, s3fs=None):
    """
    Executes the germline variant calling pipeline

    :type execution: Execution
    :param str target_bed: The target bed to call variants in
    :param str input_path: The path to the input_file tsv of fastq files
    """
    #: chrom -> target_bed_path
    target_bed = os.path.abspath(os.path.expanduser(target_bed))
    input_path = os.path.abspath(os.path.expanduser(input_path))

    # Copy the target.bed to the output_dir
    assert os.path.exists(target_bed), '%s does not exist' % target_bed
    cp_target_bed_task = execution.add_task(lambda drm='local', out_bed=out_dir('target.bed'): 'cp %s %s' % (target_bed, out_bed),
                                            out_dir='', stage_name='Copy_Target_Bed')

    target_bed_tasks = [execution.add_task(bed.filter_bed_by_contig, dict(contig=contig), [cp_target_bed_task], 'work/contigs/{contig}')
                        for contig in util.get_bed_contigs(target_bed)]

    fastq_tasks = list(util.gen_fastq_tasks(execution, input_path))
    # fastq_tasks = [execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
    # for fastq_path, tags in parse_inputs(input_path)]

    fastqc_tasks = many2one(fastqc.fastqc, fastq_tasks, ['sample_name', 'library'], out_dir='SM_{sample_name}/qc/LB_{library}')

    # fastq_tasks = split_large_fastq_files(execution, fastq_tasks) # not working yet
    aligned_tasks = align(execution, fastq_tasks, target_bed_tasks)
    call_task = variant_call(execution, aligned_tasks, target_bed_tasks)

    execution.run(max_cores=max_cores, max_attempts=max_attempts,
                  cmd_wrapper=make_s3_cmd_fxn_wrapper(s3fs) if s3fs else shared_fs_cmd_fxn_wrapper)

    if execution.successful:
        execution.log.info('Final vcf: %s' % opj(s3fs if s3fs else execution.output_dir,
                                                 call_task.output_files[0]))


    # Copy the sqlite db to s3
    dburl = env.config['gk']['database_url']
    if s3fs and dburl.startswith('sqlite'):
        # TODO implement so there is a 1-to-1 relationship between a sqlite database and an Execution.  Currently this is pushing way too much information,
        # TODO but will soon be replaced.  Alternative: use amazon RDS!  Or perhaps both?  Could do a sqlalchemy merge and save to sqlite, or implement
        # TODO cosmos multiverse
        s3cmd.cp(dburl.replace('sqlite:///', ''), opj(s3fs, 'sqlite.db.backup'))
Example #39
def fastq_to_sam(rgid, sample_name, library, platform, platform_unit,
                 in_fastq1=find('.fastq', tags=dict(read_pair='1')),
                 in_fastq2=find('.fastq', tags=dict(read_pair='2')),
                 out_bam=out_dir('unaligned.bam')):
    return r"""
        {picard} FastqToSam \
        FASTQ={in_fastq1} \
        FASTQ2={in_fastq2} \
        O={out_bam} \
        SAMPLE_NAME={sample_name} \
        LIBRARY_NAME={library} \
        PLATFORM_UNIT={platform_unit} \
        PLATFORM={platform} \
        READ_GROUP_NAME={rgid}

    """.format(s=s,
               picard=picard(),
               **locals())
Example #40
def collect_multiple_metrics(in_bam=find('bam'),
                             out_path=out_dir('picard'),
                             reference_fasta=s['ref']['reference_fasta']):
    return r"""
      {picard} CollectMultipleMetrics \
      I={in_bam} \
      O={out_path} \
      R={reference_fasta} \
      {programs}
    """.format(picard=picard(),
               programs=' '.join('PROGRAM=%s' % p for p in [
                   'CollectAlignmentSummaryMetrics',
                   'CollectInsertSizeMetrics', 'QualityScoreDistribution',
                   'MeanQualityByCycle', 'CollectBaseDistributionByCycle',
                   'CollectGcBiasMetrics', 'CollectSequencingArtifactMetrics',
                   'CollectQualityYieldMetrics', 'CollectWgsMetrics'
               ]),
               **locals())
Example #41
def select_variants(select_type,
                    in_vcfs=find('vcf|vcf.gz$', n='>0'),
                    out_vcf=out_dir('variants.vcf'),
                    in_reference_fasta=s['ref']['reference_fasta'],
                    mem_req=6 * 1024):
    """
    :param select_type: "SNP" or "INDEL"
    """

    return r"""
        {gatk} \
        -T SelectVariants \
        -R {in_reference_fasta} \
        {inputs} \
        -selectType {select_type} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req),
               inputs=vcf_list_to_input(in_vcfs),
               **locals())
Example #42
def fastq_to_sam(rgid,
                 sample_name,
                 library,
                 platform,
                 platform_unit,
                 in_fastq1=find('.fastq', tags=dict(read_pair='1')),
                 in_fastq2=find('.fastq', tags=dict(read_pair='2')),
                 out_bam=out_dir('unaligned.bam')):
    return r"""
        {picard} FastqToSam \
        FASTQ={in_fastq1} \
        FASTQ2={in_fastq2} \
        O={out_bam} \
        SAMPLE_NAME={sample_name} \
        LIBRARY_NAME={library} \
        PLATFORM_UNIT={platform_unit} \
        PLATFORM={platform} \
        READ_GROUP_NAME={rgid}

    """.format(s=s, picard=picard(), **locals())
Example #43
def fastqc(core_req=8,
           in_r1s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    assert len(in_r1s) == len(in_r2s)

    # if len(in_r1s) > 1 or in_r1s[0].startswith('<('):
    #     # If there are more than 1 fastqs per read_pair, merge them into one file per read_pair
    #     # Note, catting compressed files together seems fine
    #     # Have to cat because fastqc does not support streaming
    #     # TODO make sure we are concating to local temp disc if available.  For the usual S3 option this is fine, since we're already in a tmp dir
    #     # TODO stream from s3 into a cat command when input files start with s3://
    #
    #     r1, r2 = 'cat_r1.fastq.gz', 'cat_r2.fastq.gz'
    #     cat = r"""
    #         cat {r1s_join} > {r1}
    #         cat {r2s_join} > {r2}
    #         """.format(s=s,
    #                    r1s_join=' '.join(map(str, in_r1s)),
    #                    r2s_join=' '.join(map(str, in_r2s)),
    #                    **locals())
    #     cleanup = 'rm %s %s' % (r1, r2)
    # else:
    #     r1, r2 = in_r1s[0], in_r2s[0]
    #     cat = ""
    #     cleanup = ""

    cat = 'cat {fqs} | {zcat_or_cat}'.format(
        fqs=' '.join(map(str, in_r1s + in_r2s)),
        zcat_or_cat='zcat' if '.gz' in str(in_r1s[0]) else 'cat')

    return r"""
            mkdir -p {out_dir}

            {cat} | \
            {s[opt][fastqc]} \
            --threads {core_req} \
            --dir {s[gk][tmp_dir]} \
            -o {out_dir} \
            /dev/stdin

            """.format(s=s, **locals())
Example #44
def fastqc(core_req=8,
           in_r1s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    assert len(in_r1s) == len(in_r2s)

    # if len(in_r1s) > 1 or in_r1s[0].startswith('<('):
    #     # If there are more than 1 fastqs per read_pair, merge them into one file per read_pair
    #     # Note, catting compressed files together seems fine
    #     # Have to cat because fastqc does not support streaming
    #     # TODO make sure we are concating to local temp disc if available.  For the usual S3 option this is fine, since we're already in a tmp dir
    #     # TODO stream from s3 into a cat command when input files start with s3://
    #
    #     r1, r2 = 'cat_r1.fastq.gz', 'cat_r2.fastq.gz'
    #     cat = r"""
    #         cat {r1s_join} > {r1}
    #         cat {r2s_join} > {r2}
    #         """.format(s=s,
    #                    r1s_join=' '.join(map(str, in_r1s)),
    #                    r2s_join=' '.join(map(str, in_r2s)),
    #                    **locals())
    #     cleanup = 'rm %s %s' % (r1, r2)
    # else:
    #     r1, r2 = in_r1s[0], in_r2s[0]
    #     cat = ""
    #     cleanup = ""


    return r"""
            mkdir -p {out_dir}

            {s[opt][fastqc]} \
            --threads {core_req} \
            --dir {s[gk][tmp_dir]} \
            -o {out_dir} \
            {fqs}

            """.format(s=s, fqs=' '.join(in_r1s + in_r2s),**locals())
Example #45
def view(f, in_bam=find('bam$'), out_bam=out_dir('reads.bam')):
    return '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'.format(
        s=s, **locals())
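The align() pipeline below calls this with f='12': SAM flag 12 combines 0x4 (read unmapped) with 0x8 (mate unmapped), which is how the Filter_Both_Pairs_Unmapped stage collects read pairs where neither mate aligned. Rendered directly:

print(view(f=12, in_bam='sample.bam', out_bam='unmapped_pairs.bam'))
# <samtools> view -f 12 -h sample.bam > unmapped_pairs.bam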
Example #46
def cat(in_txts=find('txt$', n='>=1'), out_txt=out_dir('cat.txt')):
    return 'cat {input_str} > {out_txt}'.format(
        input_str=' '.join(map(str, in_txts)), **locals())
Example #47
def echo(word, out_txt=out_dir('echo.txt')):
    return '{s[echo_path]} {word} > {out_txt}'.format(s=s, **locals())
Example #48
def md5sum(in_file=find('.*', n=1), out_md5=out_dir('checksum.md5')):
    out_md5.basename = in_file.basename + '.md5'
    return 'md5sum {in_file}'.format(**locals())
Example #49
def word_count(chars=False, in_txts=find('txt$', n='>=1'), out_txt=out_dir('wc.txt')):
    c = ' -c' if chars else ''
    return 'wc{c} {input} > {out_txt}'.format(
        input=' '.join(map(str, in_txts)),
        **locals()
    )
Example #50
def md5sum(in_file=find('.*', n=1), out_md5=out_dir('checksum.md5')):
    out_md5.basename = in_file.basename + '.md5'
    return 'md5sum {in_file}'.format(**locals())
Example #51
def download_from_s3(in_file, out_file=out_dir('{in_file}')):
    assert in_file.startswith('s3://')
    return s3cmd.cp(in_file, out_file)
Example #52
def paste(in_txts=find('txt$', n='>=1'), out_txt=out_dir('paste.txt')):
    return 'paste {input} > {out_txt}'.format(
        input=' '.join(map(str, in_txts)), **locals())
Example #53
def paste(in_txts=find('txt$', n='>=1'), out_txt=out_dir('paste.txt')):
    return 'paste {input} > {out_txt}'.format(
        input=' '.join(map(str, in_txts)),
        **locals()
    )
Example #54
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """

    # Do we need to split fastqs into smaller pieces?
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks,
                                        by=[
                                            'sample_name', 'library',
                                            'platform', 'platform_unit',
                                            'rgid', 'chunk'
                                        ]):
        # trim_task = execution.add_task(fastq.trim_galore,
        # tags=dict(**tags),
        # parents=fastq_task_group,
        # out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(
            bwa.bwa_mem,
            tags=dict(**tags),
            parents=fastq_task_group,
            out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates,
                      aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples, especially if low coverage
    # for tags, parents in group(dedupe, ['sample_name']):
    # for target_bed_task in target_bed_tasks:
    # d = dict(contig=target_bed_task.tags['contig'],
    # in_target_bed=target_bed_task.output_files[0],
    # **tags)
    #

    rtc_tasks = [
        execution.add_task(gatk.realigner_target_creator,
                           dict(contig=target_bed_task.tags['contig'],
                                in_target_bed=target_bed_task.output_files[0],
                                **tags),
                           parents + [target_bed_task],
                           out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(dedupe, ['sample_name'])  # Many2one
        for target_bed_task in target_bed_tasks
    ]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks
    ]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.  Returning realign, so bams remained split by contig for downstream
    # parallelization
    merged = many2one(picard.merge_sam_files,
                      realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}',
                      stage_name="Merge_Sample_Bams")
    one2one(picard.collect_multiple_metrics,
            merged,
            out_dir='SM_{sample_name}/metrics')

    return merged
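group(), one2one(), and many2one() come from the workflow helpers. The grouping step they imply can be sketched self-containedly with itertools; this is a stand-in that assumes each Task carries a .tags dict, while the real helpers presumably also thread the Execution through:

from itertools import groupby

def group(tasks, by):
    # Yield (tags, tasks) pairs for tasks sharing values on the `by` keys.
    def key(t):
        return tuple(sorted((k, t.tags[k]) for k in by))
    for tag_items, grouped in groupby(sorted(tasks, key=key), key=key):
        yield dict(tag_items), list(grouped)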
Example #55
def vcf_concat_parts(in_vcfs=find('vcf$', n='>0'),
                     out_vcf=out_dir('freebayes.vcf')):
    return r"""
        {s[opt][vcf_concat_parts]} {vcfs} > {out_vcf}
    """.format(s=settings, vcfs=' '.join(in_vcfs), **locals())
Example #56
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """

    # Do we need to split fastqs into smaller pieces?
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks, by=['sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk']):
        # trim_task = execution.add_task(fastq.trim_galore,
        # tags=dict(**tags),
        # parents=fastq_task_group,
        # out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates, aligns, groupby=['sample_name', 'library'], out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples, especially if low coverage
    # for tags, parents in group(dedupe, ['sample_name']):
    # for target_bed_task in target_bed_tasks:
    # d = dict(contig=target_bed_task.tags['contig'],
    # in_target_bed=target_bed_task.output_files[0],
    # **tags)
    #

    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0], **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)
    realigned_by_sample_contig_tasks += [execution.add_task(samtools.view,
                                                            dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                                                 f='12',
                                                                 sample_name=tags['sample_name'],
                                                                 contig='BOTH_PAIRS_UNMAPPED',
                                                                 library=lb_task.tags['library']),
                                                            parents=lb_task,
                                                            out_dir='SM_{sample_name}/work/LB_{library}',
                                                            stage_name='Filter_Both_Pairs_Unmapped')
                                         for tags, sm_tasks in group(dedupe, ['sample_name'])
                                         for lb_task in sm_tasks]


    # Skipping BQSR.  Will improve results only slightly, if at all.


    # Merge bams so we have a sample bam.  Returning realign, so bams remained split by contig for downstream
    # parallelization
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'], out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")
    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged
Example #57
def echo(word, out_txt=out_dir('echo.txt')):
    return '{s[echo_path]} {word} > {out_txt}'.format(s=s, **locals())
Example #58
def view(f, in_bam=find('bam$'), out_bam=out_dir('reads.bam')):
    return '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'.format(s=s,
                                                                           **locals())
Example #59
def cat(in_txts=find('txt$', n='>=1'), out_txt=out_dir('cat.txt')):
    return 'cat {input_str} > {out_txt}'.format(
        input_str=' '.join(map(str, in_txts)),
        **locals()
    )