Example #1
def run_germline(execution,
                 max_cores,
                 max_attempts,
                 target_bed,
                 input_path=None,
                 s3fs=None):
    """
    Executes the germline variant calling pipeline

    :type execution: Execution
    :param str target_bed: The target bed to call variants in
    :param str input_path: The path to the input_file tsv of fastq files
    """
    #: chrom -> target_bed_path
    target_bed = os.path.abspath(os.path.expanduser(target_bed))
    input_path = os.path.abspath(os.path.expanduser(input_path))

    # Copy the target.bed to the output_dir
    assert os.path.exists(target_bed), '%s does not exist' % target_bed
    cp_target_bed_task = execution.add_task(
        lambda drm='local', out_bed=out_dir('target.bed'): 'cp %s %s' %
        (target_bed, out_bed),
        out_dir='',
        stage_name='Copy_Target_Bed')

    target_bed_tasks = [
        execution.add_task(bed.filter_bed_by_contig, dict(contig=contig),
                           [cp_target_bed_task], 'work/contigs/{contig}')
        for contig in util.get_bed_contigs(target_bed)
    ]

    fastq_tasks = list(util.gen_fastq_tasks(execution, input_path))
    # fastq_tasks = [execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
    # for fastq_path, tags in parse_inputs(input_path)]

    fastqc_tasks = many2one(fastqc.fastqc,
                            fastq_tasks, ['sample_name', 'library'],
                            out_dir='SM_{sample_name}/qc/LB_{library}')

    # fastq_tasks = split_large_fastq_files(execution, fastq_tasks) # not working yet
    aligned_tasks = align(execution, fastq_tasks, target_bed_tasks)
    call_task = variant_call(execution, aligned_tasks, target_bed_tasks)

    execution.run(max_cores=max_cores,
                  max_attempts=max_attempts,
                  cmd_wrapper=make_s3_cmd_fxn_wrapper(s3fs)
                  if s3fs else shared_fs_cmd_fxn_wrapper)

    if execution.successful:
        execution.log.info('Final vcf: %s' % opj(
            s3fs if s3fs else execution.output_dir, call_task.output_files[0]))

    # Copy the sqlite db to s3
    dburl = env.config['gk']['database_url']
    if s3fs and dburl.startswith('sqlite'):
        # TODO implement so there is a 1-to-1 relationship between a sqlite database and an Execution.  Currently this is pushing way too much information,
        # TODO but will soon be replaced.  Alternative: use amazon RDS!  Or perhaps both?  Could do a sqlalchemy merge and save to sqlite, or implement
        # TODO cosmos multiverse
        s3cmd.cp(dburl.replace('sqlite:///', ''), opj(s3fs,
                                                      'sqlite.db.backup'))
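
The pipeline above splits the target BED into one file per contig (bed.filter_bed_by_contig) so that downstream alignment and calling stages can run per contig in parallel. A minimal sketch of that kind of split is below; the signature and output handling are assumptions for illustration, not the project's actual task function.

import os

def filter_bed_by_contig(in_bed, contig, out_bed):
    """Write only the records of in_bed whose first column equals contig.

    Hypothetical helper illustrating the per-contig split used above; the
    real bed.filter_bed_by_contig task function may differ.
    """
    os.makedirs(os.path.dirname(out_bed) or '.', exist_ok=True)
    with open(in_bed) as fin, open(out_bed, 'w') as fout:
        for line in fin:
            # Skip header/comment lines; BED data lines are tab-separated.
            if line.startswith(('#', 'track', 'browser')):
                continue
            if line.split('\t', 1)[0] == contig:
                fout.write(line)
    return out_bed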
Example #2
def variant_call(execution, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow
    """
    contigs = sp.check_output("cat %s |cut -f1|uniq" % target_bed_path, shell=True).strip().split("\n")

    bed_tasks = [execution.add_task(tools.filter_bed_by_contig, tags=dict(in_bam=bam_path, in_bed=target_bed_path, contig=contig), out_dir='work/{contig}')
                 for contig in contigs ]

    freebayes_tasks = one2one(tools.freebayes, bed_tasks, dict(max_complex_gap=max_complex_gap))

    merge_vcf_tasks = many2one(tools.vcf_concat_parts, freebayes_tasks)

    execution.run()
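
The contig list above comes from a shell pipeline (cat ... | cut -f1 | uniq), which only collapses adjacent duplicates and, under Python 3, leaves sp.check_output returning bytes. A pure-Python sketch of the same extraction (roughly what util.get_bed_contigs in Example #1 might do; that behavior is an assumption):

def get_bed_contigs(bed_path):
    """Return contig names from column 1 of a BED file, in first-seen order.

    Sketch only: equivalent in spirit to cut -f1 | uniq on a sorted BED, but
    independent of sort order and of the bytes-vs-str subprocess issue.
    """
    contigs = []
    with open(bed_path) as fh:
        for line in fh:
            if not line.strip() or line.startswith(('#', 'track', 'browser')):
                continue
            contig = line.split('\t', 1)[0]
            if contig not in contigs:
                contigs.append(contig)
    return contigs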
Example #3
def variant_call(execution, max_attempts, max_cores, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow
    """
    contigs = sp.check_output("cat %s |cut -f1|uniq" % target_bed_path, shell=True).strip().split("\n")

    freebayes_tasks = []
    for contig in contigs:
        bed_task = execution.add_task(tools.filter_bed_by_contig, tags=dict(in_bam=bam_path, in_bed=target_bed_path, contig=contig), out_dir='work/{contig}')
        freebayes_task = execution.add_task(tools.freebayes, tags=dict(max_complex_gap=max_complex_gap), parents=bed_task, out_dir='work/{contig}')
        freebayes_tasks.append(freebayes_task)

    merge_vcf_tasks = many2one(tools.vcf_concat_parts, parents=freebayes_tasks)

    execution.run(max_attempts=max_attempts, max_cores=max_cores)
Example #4
def run_germline(execution, max_cores, max_attempts, target_bed, input_path=None, s3fs=None):
    """
    Executes the germline variant calling pipeline

    :type execution: Execution
    :param str target_bed: The target bed to call variants in
    :param str input_path: The path to the input_file tsv of fastq files
    """
    #: chrom -> target_bed_path
    target_bed = os.path.abspath(os.path.expanduser(target_bed))
    input_path = os.path.abspath(os.path.expanduser(input_path))

    # Copy the target.bed to the output_dir
    assert os.path.exists(target_bed), '%s does not exist' % target_bed
    cp_target_bed_task = execution.add_task(lambda drm='local', out_bed=out_dir('target.bed'): 'cp %s %s' % (target_bed, out_bed),
                                            out_dir='', stage_name='Copy_Target_Bed')

    target_bed_tasks = [execution.add_task(bed.filter_bed_by_contig, dict(contig=contig), [cp_target_bed_task], 'work/contigs/{contig}')
                        for contig in util.get_bed_contigs(target_bed)]

    fastq_tasks = list(util.gen_fastq_tasks(execution, input_path))
    # fastq_tasks = [execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
    # for fastq_path, tags in parse_inputs(input_path)]

    fastqc_tasks = many2one(fastqc.fastqc, fastq_tasks, ['sample_name', 'library'], out_dir='SM_{sample_name}/qc/LB_{library}')

    # fastq_tasks = split_large_fastq_files(execution, fastq_tasks) # not working yet
    aligned_tasks = align(execution, fastq_tasks, target_bed_tasks)
    call_task = variant_call(execution, aligned_tasks, target_bed_tasks)

    execution.run(max_cores=max_cores, max_attempts=max_attempts,
                  cmd_wrapper=make_s3_cmd_fxn_wrapper(s3fs) if s3fs else shared_fs_cmd_fxn_wrapper)

    if execution.successful:
        execution.log.info('Final vcf: %s' % opj(s3fs if s3fs else execution.output_dir,
                                                 call_task.output_files[0]))


    # Copy the sqlite db to s3
    dburl = env.config['gk']['database_url']
    if s3fs and dburl.startswith('sqlite'):
        # TODO implement so there is a 1-to-1 relationship between a sqlite database and an Execution.  Currently this is pushing way too much information,
        # TODO but will soon be replaced.  Alternative: use amazon RDS!  Or perhaps both?  Could do a sqlalchemy merge and save to sqlite, or implement
        # TODO cosmos multiverse
        s3cmd.cp(dburl.replace('sqlite:///', ''), opj(s3fs, 'sqlite.db.backup'))
Example #5
def variant_call(execution, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow
    """
    contigs = sp.check_output("cat %s |cut -f1|uniq" % target_bed_path,
                              shell=True).strip().split("\n")

    bed_tasks = [
        execution.add_task(tools.filter_bed_by_contig,
                           tags=dict(in_bam=bam_path,
                                     in_bed=target_bed_path,
                                     contig=contig),
                           out_dir='work/{contig}') for contig in contigs
    ]

    freebayes_tasks = one2one(tools.freebayes, bed_tasks,
                              dict(max_complex_gap=max_complex_gap))

    merge_vcf_tasks = many2one(tools.vcf_concat_parts, freebayes_tasks)

    execution.run()
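
The one2one and many2one helpers used in these examples are recipe shortcuts: one2one adds one child task per parent, while many2one groups parents by shared tag values and adds a single child per group. A rough sketch of that pattern, assuming each Task exposes .tags and a reference to its execution (the real helpers may differ):

from itertools import groupby as _groupby

def many2one(tool, parents, groupby=(), out_dir='', stage_name=None):
    """Sketch: one child task per group of parents sharing the tag values
    listed in groupby.  Assumes Task objects expose .tags and .execution."""
    ex = parents[0].execution
    keyfn = lambda t: tuple(t.tags[k] for k in groupby)
    return [ex.add_task(tool,
                        tags=dict(zip(groupby, key)),
                        parents=list(grp),
                        out_dir=out_dir,
                        stage_name=stage_name)
            for key, grp in _groupby(sorted(parents, key=keyfn), key=keyfn)]

def one2one(tool, parents, extra_tags=None, out_dir=''):
    """Sketch: one child task per parent, inheriting the parent's tags plus
    any overrides."""
    return [p.execution.add_task(tool,
                                 tags=dict(p.tags, **(extra_tags or {})),
                                 parents=[p],
                                 out_dir=out_dir)
            for p in parents]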
Example #6
def variant_call(ex, aligned_tasks, target_bed_tasks):
    """
    Alignments -> Variants

    :param Execution ex:
    :param list[Task] aligned_tasks:
    :param list[Task] target_bed_tasks:
    :return:
    """
    mkdir('work output', ex.output_dir)

    contig_to_targets = {t.tags['contig']: t for t in target_bed_tasks}

    hapcall_tasks = [
        ex.add_task(gatk.haplotype_caller,
                    tags=dict(contig=contig, **tags),
                    parents=parents + [target_bed_task],
                    out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(aligned_tasks, ['sample_name'])
        for contig, target_bed_task in contig_to_targets.items()
    ]

    # combine_gvcf_tasks = many2one(gatk.combine_gvcfs, hapcall_tasks, groupby=['sample_name'], out_dir='SM_{sample_name}')

    genotype_task = many2one(gatk.genotype_gvcfs,
                             hapcall_tasks,
                             groupby=[],
                             out_dir='work/variants_raw.vcf')[0]

    select_snps_task = ex.add_task(gatk.select_variants,
                                   tags=dict(
                                       in_vcfs=[genotype_task.output_files[0]],
                                       out_vcf='work/snps_raw.vcf',
                                       select_type='SNP'),
                                   parents=genotype_task)

    filter_snps_task = ex.add_task(
        gatk.variant_filtration,
        tags=dict(in_vcfs=[select_snps_task.output_files[0]],
                  out_vcf='work/snps_filtered.vcf',
                  filters=[('Qual', 'QUAL < 30'), ('QD', 'QD < 2.0'),
                           ('FS_snp', 'FS > 60.0'), ('MQ', 'MQ < 40.0'),
                           ('MQRankSum', 'MQRankSum < -12.5'),
                           ('ReadPosRankSum', 'ReadPosRankSum < -8.0')]),
        parents=select_snps_task)

    select_indels_task = ex.add_task(
        gatk.select_variants,
        tags=dict(in_vcfs=[genotype_task.output_files[0]],
                  out_vcf='work/indels_raw.vcf',
                  select_type='INDEL'),
        parents=genotype_task)

    filter_indels_task = ex.add_task(
        gatk.variant_filtration,
        tags=dict(in_vcfs=[select_indels_task.output_files[0]],
                  out_vcf='work/indels_filtered.vcf',
                  filters=[('Qual', 'QUAL < 30'), ('QD', 'QD < 2.0'),
                           ('FS_indel', 'FS > 200.0'),
                           ('ReadPosRankSum_indel', 'ReadPosRankSum < -2')]),
        parents=select_indels_task)

    combine_variants_task = ex.add_task(
        gatk.combine_variants,
        tags=dict(in_vcfs=[
            filter_indels_task.tags['out_vcf'],
            filter_snps_task.tags['out_vcf']
        ],
                  out_vcf='output/variants.vcf',
                  genotype_merge_option='PRIORITIZE'),
        parents=[filter_snps_task, filter_indels_task])

    variant_stats_task = ex.add_task(
        picard.collect_variant_calling_metrics,
        tags=dict(in_vcf=combine_variants_task.tags['out_vcf'],
                  out_path='output/picard'),
        parents=[combine_variants_task])

    # Run VQSR?

    return combine_variants_task
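
The filters passed to gatk.variant_filtration are (name, JEXL expression) pairs for GATK hard filtering. A hedged sketch of how such a list could be rendered into VariantFiltration arguments, assuming GATK4-style flag spelling (the project's gatk.variant_filtration wrapper may build its command differently):

def variant_filtration_args(in_vcf, out_vcf, filters):
    """Build a GATK4 VariantFiltration argument list from (name, expression) pairs.

    Illustrative only; assumes --filter-name/--filter-expression flag spelling.
    """
    args = ['gatk', 'VariantFiltration', '-V', in_vcf, '-O', out_vcf]
    for name, expression in filters:
        args += ['--filter-name', name, '--filter-expression', expression]
    return args

# e.g. variant_filtration_args('work/snps_raw.vcf', 'work/snps_filtered.vcf',
#                              [('Qual', 'QUAL < 30'), ('QD', 'QD < 2.0')])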
Example #7
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """

    # Do we need to split fastqs into smaller pieces?
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks,
                                        by=[
                                            'sample_name', 'library',
                                            'platform', 'platform_unit',
                                            'rgid', 'chunk'
                                        ]):
        # trim_task = execution.add_task(fastq.trim_galore,
        # tags=dict(**tags),
        # parents=fastq_task_group,
        # out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(
            bwa.bwa_mem,
            tags=dict(**tags),
            parents=fastq_task_group,
            out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates,
                      aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples, especially if low coverage
    # for tags, parents in group(dedupe, ['sample_name']):
    # for target_bed_task in target_bed_tasks:
    # d = dict(contig=target_bed_task.tags['contig'],
    # in_target_bed=target_bed_task.output_files[0],
    # **tags)
    #

    rtc_tasks = [
        execution.add_task(gatk.realigner_target_creator,
                           dict(contig=target_bed_task.tags['contig'],
                                in_target_bed=target_bed_task.output_files[0],
                                **tags),
                           parents + [target_bed_task],
                           out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(dedupe, ['sample_name'])  # Many2one
        for target_bed_task in target_bed_tasks
    ]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks
    ]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.  Returning realign, so bams remained split by contig for downstream
    # parallelization
    merged = many2one(picard.merge_sam_files,
                      realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}',
                      stage_name="Merge_Sample_Bams")
    one2one(picard.collect_multiple_metrics,
            merged,
            out_dir='SM_{sample_name}/metrics')

    return merged
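
The tags grouped on in align() (sample_name, library, platform, platform_unit, rgid) map onto the standard SAM @RG read-group fields. A hedged sketch of how they could be folded into the -R argument of bwa mem (the project's bwa.bwa_mem wrapper may assemble this differently):

def read_group_string(tags):
    """Render read-group tags into a bwa mem -R argument.

    The mapping of tag names to @RG fields is an assumption based on the tag
    names used in align() above.
    """
    rg = {'ID': tags['rgid'], 'SM': tags['sample_name'], 'LB': tags['library'],
          'PL': tags['platform'], 'PU': tags['platform_unit']}
    return '@RG\\t' + '\\t'.join('%s:%s' % (k, v) for k, v in rg.items())

# e.g. ['bwa', 'mem', '-R', read_group_string(tags), 'ref.fa', 'r1.fq', 'r2.fq']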
Example #8
def variant_call(ex, aligned_tasks, target_bed_tasks):
    """
    Alignments -> Variants

    :param Execution ex:
    :param list[Task] aligned_tasks:
    :param list[Task] target_bed_tasks:
    :return:
    """
    mkdir('work output', ex.output_dir)

    contig_to_targets = {t.tags['contig']: t for t in target_bed_tasks}

    hapcall_tasks = [ex.add_task(gatk.haplotype_caller,
                                 tags=dict(contig=contig, **tags),
                                 parents=parents + [target_bed_task],
                                 out_dir='SM_{sample_name}/work/contigs/{contig}')
                     for tags, parents in group(aligned_tasks, ['sample_name'])
                     for contig, target_bed_task in contig_to_targets.items()]

    # combine_gvcf_tasks = many2one(gatk.combine_gvcfs, hapcall_tasks, groupby=['sample_name'], out_dir='SM_{sample_name}')

    genotype_tasks = many2one(gatk.genotype_gvcfs, hapcall_tasks, groupby=['contig'], out_dir='work/contigs/{contig}/variants_raw.vcf')

    combine_variants_task1 = ex.add_task(gatk.combine_variants,
                                         stage_name='Combine_Raw_Variants',
                                         tags=dict(in_vcfs=[t.output_files[0] for t in genotype_tasks],
                                                   out_vcf='work/variants.combined.raw.vcf',
                                                   genotype_merge_option='UNSORTED'),
                                         parents=genotype_tasks)

    select_snps_task = ex.add_task(gatk.select_variants,
                                   parents=[combine_variants_task1],
                                   tags=dict(select_type='SNP',
                                             in_vcfs=[combine_variants_task1.output_files[0]],
                                             out_vcf='work/variants.raw.snps.vcf'))

    filter_snps_task = ex.add_task(gatk.variant_filtration,
                                   tags=dict(in_vcfs=[select_snps_task.output_files[0]],
                                             out_vcf='work/variants.filtered.snps.vcf',
                                             filters=[('Qual', 'QUAL < 30'),
                                                      ('QD', 'QD < 2.0'),
                                                      ('FS_snp', 'FS > 60.0'),
                                                      ('MQ', 'MQ < 40.0'),
                                                      ('MQRankSum', 'MQRankSum < -12.5'),
                                                      ('ReadPosRankSum', 'ReadPosRankSum < -8.0')]),
                                   parents=select_snps_task)

    select_indels_task = ex.add_task(gatk.select_variants,
                                     parents=[combine_variants_task1],
                                     tags=dict(select_type='INDEL',
                                               in_vcfs=[combine_variants_task1.output_files[0]],
                                               out_vcf='work/variants.raw.indels.vcf'))

    filter_indels_task = ex.add_task(gatk.variant_filtration,
                                     tags=dict(in_vcfs=[select_indels_task.output_files[0]],
                                               out_vcf='work/variants.filtered.indels.vcf',
                                               filters=[('Qual', 'QUAL < 30'),
                                                        ('QD', 'QD < 2.0'),
                                                        ('FS_indel', 'FS > 200.0'),
                                                        ('ReadPosRankSum_indel', 'ReadPosRankSum < -2')]),
                                     parents=select_indels_task)

    combine_variants_task2 = ex.add_task(gatk.combine_variants,
                                         tags=dict(in_vcfs=[filter_indels_task.tags['out_vcf'], filter_snps_task.tags['out_vcf']],
                                                   out_vcf='output/variants.vcf',
                                                   genotype_merge_option='PRIORITIZE'),
                                         parents=[filter_snps_task, filter_indels_task])

    variant_stats_task = ex.add_task(picard.collect_variant_calling_metrics,
                                     tags=dict(in_vcf=combine_variants_task2.tags['out_vcf'],
                                               out_path='output/picard'),
                                     parents=[combine_variants_task2])

    # Run VQSR?



    return combine_variants_task2
Example #9
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """

    # Do we need to split fastqs into smaller pieces?
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks, by=['sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk']):
        # trim_task = execution.add_task(fastq.trim_galore,
        # tags=dict(**tags),
        # parents=fastq_task_group,
        # out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates, aligns, groupby=['sample_name', 'library'], out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples, especially if low coverage
    # for tags, parents in group(dedupe, ['sample_name']):
    # for target_bed_task in target_bed_tasks:
    # d = dict(contig=target_bed_task.tags['contig'],
    # in_target_bed=target_bed_task.output_files[0],
    # **tags)
    #

    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0], **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)
    realigned_by_sample_contig_tasks += [execution.add_task(samtools.view,
                                                            dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                                                 f='12',
                                                                 sample_name=tags['sample_name'],
                                                                 contig='BOTH_PAIRS_UNMAPPED',
                                                                 library=lb_task.tags['library']),
                                                            parents=lb_task,
                                                            out_dir='SM_{sample_name}/work/LB_{library}',
                                                            stage_name='Filter_Both_Pairs_Unmapped')
                                         for tags, sm_tasks in group(dedupe, ['sample_name'])
                                         for lb_task in sm_tasks]


    # Skipping BQSR.  Will improve results only slightly, if at all.


    # Merge bams so we have a sample bam.  Returning realign, so bams remained split by contig for downstream
    # parallelization
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'], out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")
    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged
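
The Filter_Both_Pairs_Unmapped stage passes f='12' to samtools.view; flag 12 is 0x4 (read unmapped) plus 0x8 (mate unmapped), so it keeps pairs in which both mates failed to align. A sketch of the equivalent command line (the samtools.view wrapper's exact parameters are an assumption):

import subprocess

def both_pairs_unmapped(in_bam, out_bam):
    """Keep only read pairs where both mates are unmapped (SAM flags 0x4 + 0x8 = 12)."""
    subprocess.check_call(['samtools', 'view', '-b', '-f', '12', '-o', out_bam, in_bam])
    return out_bam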