Exemple #1
0
def finalize_one(cnf, *abnormal_regions_reports):
    """Log the abnormal-region report paths produced for ``cnf.sample``.

    Every path passed in ``abnormal_regions_reports`` is written to the log,
    indented under an "Abnormal region reports:" header. The same lines are
    also accumulated in a local summary list (not used further in this block).
    """
    summary_lines = ['Regions with abnormal regions finished for ' + cnf.sample + ':']

    if abnormal_regions_reports:
        summary_lines.append('Abnormal region reports: ')
        info('Abnormal region reports:')
        for report_fpath in abnormal_regions_reports:
            indented_fpath = '  ' + report_fpath
            summary_lines.append(indented_fpath)
            info(indented_fpath)
Exemple #2
0
def process_one(cnf):
    """Annotate the VCF of a single sample.

    Steps, in order: fix the sample name in the VCF, drop rejected records,
    run the annotators, and hand the annotated file to
    ``finialize_annotate_file``, whose result is returned unchanged.
    """
    sample = VarSample(
        cnf.sample,
        cnf.output_dir,
        vcf=cnf.vcf,
        bam=cnf.bam,
        genome=cnf.genome)

    step_greetings('Fixing "SAMPLE" INFO annotation and SAMPLE header...')
    fixed_vcf_fpath = fix_vcf_sample_name(cnf, sample.name, cnf.vcf)

    # Disabled step: chromosome name fixing / explicit gunzip of the input.
    # this method will also gunzip the vcf file
    # sample.vcf = fix_chromosome_names(cnf, sample.vcf)
    # if cnf.vcf.endswith('.gz'):
    #     vcf_fpath = intermediate_fname(cnf, splitext(sample.vcf)[0], None)
    #     info('Ungzipping ' + sample.vcf + ', writing to ' + vcf_fpath)
    #     gunzip = get_system_path(cnf, 'gunzip', is_critical=True)
    #     cmdl = '{gunzip} {sample.vcf} --to-stdout'.format(**locals())
    #     call(cnf, cmdl, output_fpath=vcf_fpath)
    #     verify_vcf(vcf_fpath)
    #     sample.vcf = vcf_fpath

    step_greetings('Removing rejeted records...')
    passed_vcf_fpath = remove_rejected(cnf, fixed_vcf_fpath)
    info()

    # Disabled step: bail out when every record was rejected.
    # if sample.vcf is None:
    #     err('No variants left for ' + cnf.vcf + ': all rejected and removed.')
    #     return None, None, None

    # Disabled step: drop the "none" sample column.
    # # In mutect, running paired analysis on a single sample could lead
    # # to a "none" sample column. Removing that column.
    # info('get_sample_column_index')
    # none_idx = get_sample_column_index(sample.vcf, 'none', suppress_warn=True)
    # if none_idx is not None:
    #     info('Removing the "none" column.')
    #     def fn(line, i):
    #         if line and not line.startswith('##'):
    #             ts = line.split('\t')
    #             del ts[9 + none_idx]
    #             return '\t'.join(ts) + '\n'
    #         return line
    #     sample.vcf = iterate_file(cnf, sample.vcf, fn, suffix='none_col')

    # Disabled step: move the main sample column to the first position.
    # Replacing so the main sample goes first (if it is not already)
    # main_idx = get_sample_column_index(sample.vcf, sample.name)
    # if main_idx:
    #     info('Moving the main sample column (' + sample.name + ') to the first place.')
    #     def fn(line, i):
    #         if line and not line.startswith('##'):
    #             ts = line.split('\t')
    #             main_sample_field = ts[9 + main_idx]
    #             del ts[9 + main_idx]
    #             ts = ts[:9] + [main_sample_field] + ts[9:]
    #             return '\t'.join(ts) + '\n'
    #         return line
    #     sample.vcf = iterate_file(cnf, sample.vcf, fn, suffix='main_col')

    annotated_vcf_fpath = run_annotators(cnf, passed_vcf_fpath, sample.bam)
    return finialize_annotate_file(cnf, annotated_vcf_fpath, sample, cnf.caller)
Exemple #3
0
def finalize_all(cnf, samples, results):
    """Log the result file paths of every sample.

    ``samples`` is iterated with ``.items()`` and paired positionally with
    ``results``; each result is unpacked as a ``(vcf, tsv, maf)`` triple.
    For each sample that has at least one non-empty path, the sample name is
    logged as a header, followed by each non-empty path indented beneath it.

    The header is emitted when *any* of the three paths is present, so a
    sample that only produced a MAF file is not listed without its name.
    """
    # The per-sample config is bound to its own name rather than re-using
    # `cnf`, so the function argument is not overwritten by the loop.
    for (sample_name, _sample_cnf), (vcf, tsv, maf) in zip(samples.items(), results):
        if vcf or tsv or maf:
            info(sample_name + ':')
        for fpath in (vcf, tsv, maf):
            if fpath:
                info('  ' + fpath)
def picard_ins_size_hist(cnf, sample, bam_fpath, output_dir):
    """Run Picard ``CollectInsertSizeMetrics`` on ``bam_fpath``.

    The text and PDF outputs go to the paths stored on ``sample``
    (``picard_ins_size_hist_txt_fpath`` / ``picard_ins_size_hist_pdf_fpath``);
    their parent directories are created first. Nothing is done when
    ``get_system_path`` does not resolve Picard. ``output_dir`` is not used
    in this block.
    """
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        return

    for out_fpath in (sample.picard_ins_size_hist_txt_fpath,
                      sample.picard_ins_size_hist_pdf_fpath):
        safe_mkdir(dirname(out_fpath))

    info('Picard ins size hist for "' + basename(bam_fpath) + '"')

    template = ('{picard} CollectInsertSizeMetrics'
                ' I={bam_fpath}'
                ' O={sample.picard_ins_size_hist_txt_fpath}'
                ' H={sample.picard_ins_size_hist_pdf_fpath}'
                ' VALIDATION_STRINGENCY=LENIENT')
    cmdline = template.format(picard=picard, bam_fpath=bam_fpath, sample=sample)

    # A failing Picard run is not fatal: exit_on_error is False.
    call(cnf,
         cmdline,
         output_fpath=sample.picard_ins_size_hist_txt_fpath,
         stdout_to_outputfile=False,
         exit_on_error=False)
def proc_fastq(cnf, sample, l_fpath, r_fpath):
    """Align a pair of FASTQ files to the reference and return the BAM path.

    When ``cnf.downsample_to`` is set, the reads are downsampled first and
    the subset files are aligned instead of the originals. sambamba, bwa and
    bammarkduplicates must all be resolvable; otherwise ``critical`` is
    called. The aligned BAM is passed through ``verify_bam`` and ``critical``
    is called if that check fails.

    :param cnf: run configuration (reads ``downsample_to``, ``work_dir``,
        ``genome.bwa`` and ``is_pcr``)
    :param sample: sample object being aligned
    :param l_fpath: path to the left (first) reads file
    :param r_fpath: path to the right (second) reads file
    :return: the verified BAM file path
    """
    if cnf.downsample_to:
        info('Downsampling the reads to ' + str(cnf.downsample_to))
        # NOTE(review): `sample.nname` is kept as found; other code in this
        # file reads `sample.name` - confirm `nname` is a real attribute.
        l_fpath, r_fpath = downsample(cnf,
                                      sample.nname,
                                      l_fpath,
                                      r_fpath,
                                      cnf.downsample_to,
                                      output_dir=cnf.work_dir,
                                      suffix='subset')

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not (sambamba and bwa and bammarkduplicates):
        critical(
            'sambamba, BWA, and bammarkduplicates are required to align BAM')
    info()
    info('Aligning reads to the reference')
    bam_fpath = align(cnf, sample, l_fpath, r_fpath, sambamba, bwa,
                      bammarkduplicates, cnf.genome.bwa, cnf.is_pcr)
    bam_fpath = verify_bam(bam_fpath)
    if not bam_fpath:
        # Use the sample's name: concatenating the sample object itself with
        # a str raises TypeError and hides the real alignment failure.
        critical('Sample ' + sample.name + ' was not aligned successfully.')
    return bam_fpath
def process_one(cnf, output_dir, bam_fpath, features_bed,
                features_no_genes_bed):
    """Build the TargQC reports for a single sample and return them.

    Creates the sample object, verifies and indexes its BAM, optionally
    prepares the BED files (unless ``cnf.prep_bed`` is ``False``), runs the
    Picard insert size histogram and finally ``make_targqc_reports``, whose
    ``reports`` element is returned.
    """
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # Disabled: aligning from FASTQ when no BAM is given.
    # if not sample.bam and sample.l_fpath and sample.r_fpath:
    #     sample.bam = proc_fastq(cnf, sample, verify_file(cnf.l_fpath), verify_file(cnf.r_fpath))

    info('Using alignment ' + sample.bam)

    if not bam_fpath:
        critical(sample.name + ': BAM file is required.')

    target_bed = None
    if cnf.bed:
        target_bed = verify_file(cnf.bed, is_critical=True)

    # From here on the BAM path comes from the sample, not from the argument.
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    if cnf.prep_bed is False:
        info('The BED file is ready, skipping preparing.')
        # Only the gene key list is needed; the filtered BEDs are discarded.
        _gene_keys_set, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(
                cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, _seq2c_bed = \
            prepare_beds(cnf, features_bed, target_bed)

        _gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(
                cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    _avg_depth, _gene_by_name_and_chrom, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath, features_bed,
        features_no_genes_bed, target_bed, gene_keys_list)

    # Disabled: flagged regions report.
    # #if cnf.extended:
    # try:
    #     info('Generating flagged regions report...')
    #     flagged_report = generate_flagged_regions_report(cnf, cnf.output_dir, sample, avg_depth, gene_by_name_and_chrom)
    #     if not flagged_report:
    #         err('Flagged regions report was not generated')
    #         err()
    # except:
    #     err(format_exc())

    return reports
def main(args):
    """Entry point of the TargQC run for a single BAM file.

    Reads options and configs, resolves the features and target BED files,
    verifies the BAM, runs ``process_one`` and logs where the resulting
    reports were written. Unless ``keep_intermediate`` is set in the config,
    the work directory is removed at the end.

    ``args`` is not referenced in this body.
    """
    # Tool-specific command line options passed on to the shared option reader.
    cnf = read_opts_and_cnfs(extra_opts=[
        (['--bam'], dict(dest='bam', help='a path to the BAM file to study')),
        (['-1'], dict(dest='l_fpath')), (['-2'], dict(dest='r_fpath')),
        (['--bed', '--capture', '--amplicons'],
         dict(dest='bed', help='a BED file for capture panel or amplicons')),
        (['--exons', '--exome', '--features'],
         dict(
             dest='features',
             help=
             'a BED file with real CDS/Exon/Gene/Transcript regions with annotations (default "features" is in system_config)'
         )),
        (['--exons-no-genes', '--features-no-genes'],
         dict(
             dest='features_no_genes',
             help=
             'a BED file with real CDS/Exon regions with annotations, w/o Gene/Transcript records (default "features" is in system_config)'
         )),
        (['--original-bed'],
         dict(dest='original_target_bed', help=SUPPRESS_HELP)),
        (['--original-exons', '--original-features'],
         dict(
             dest='original_features_bed',
             help='original features genes bed file path (just for reporting)')
         ),
        (['--reannotate'],
         dict(dest='reannotate',
              help='re-annotate BED file with gene names',
              action='store_true',
              default=False)),
        (['--no-prep-bed'],
         dict(dest='prep_bed',
              help='do not fix input beds and exons',
              action='store_false',
              default=True)),
        (['-e', '--extended'],
         dict(dest='extended',
              help='extended - flagged regions and missed variants',
              action='store_true',
              default=False)),
        (['--genes'], dict(dest='genes', help='custom list of genes')),
        (['--padding'],
         dict(
             dest='padding',
             help=
             'integer indicating the number of bases to extend each target region up and down-stream. '
             'Default is ' + str(defaults['coverage_reports']['padding']),
             type='int')),
        (['--no-dedup'],
         dict(dest='no_dedup', action='store_true', help=SUPPRESS_HELP)),
        (['--downsample-to'],
         dict(dest='downsample_to', type='int', help=SUPPRESS_HELP)),
        (['--downsampled'],
         dict(dest='downsampled', action='store_true', help=SUPPRESS_HELP)),
        (['--fastqc-dirpath'], dict(dest='fastqc_dirpath', help=SUPPRESS_HELP))
    ],
                             file_keys=['bam', 'l_fpath', 'r_fpath', 'bed'],
                             key_for_sample_name='bam')

    # A padding given on the command line overrides the configured one.
    if cnf.padding:
        cnf.coverage_reports.padding = cnf.padding

    check_system_resources(cnf, required=['bedtools'], optional=[])

    check_genome_resources(cnf)

    # Features BED: the explicit option wins, otherwise the genome default.
    features_bed = adjust_path(cnf.features) if cnf.features else adjust_path(
        cnf.genome.features)
    if features_bed:
        info('Features: ' + features_bed)
        features_bed = verify_file(features_bed)
    else:
        info('No features BED found')

    if cnf.bed:
        cnf.bed = verify_file(cnf.bed, is_critical=True)
        info('Using amplicons/capture panel ' + cnf.bed)
    elif features_bed:
        # No target BED was given, only a features BED.
        info('WGS, taking CDS as target')

    cnf.bam = verify_bam(cnf.bam, is_critical=True)

    reports = process_one(cnf,
                          cnf.output_dir,
                          cnf.bam,
                          features_bed=features_bed,
                          features_no_genes_bed=cnf.features_no_genes)
    # The first two reports are the summary and the per-region report.
    summary_report, gene_report = reports[:2]

    info('')
    info('*' * 70)
    # NOTE(review): summary_report is dereferenced without the truthiness
    # check that gene_report gets below - confirm it can never be None.
    if summary_report.txt_fpath:
        info('Summary report: ' + summary_report.txt_fpath)
    if gene_report:
        if gene_report.txt_fpath:
            info('All regions: ' + gene_report.txt_fpath + ' (' +
                 str(len(gene_report.rows)) + ' regions)')

    # An optional third report holds the flagged regions.
    if len(reports) > 2:
        selected_regions_report = reports[2]
        if selected_regions_report.txt_fpath:
            info('Flagged regions: ' + selected_regions_report.txt_fpath +
                 ' (' + str(len(selected_regions_report.rows)) + ' regions)')

    # Verify result files: only string entries are checked with verify_file;
    # a non-list entry is wrapped into a one-element list first.
    for fpaths in reports:
        if fpaths:
            ok = True
            info('Checking expected results...')
            if not isinstance(fpaths, list):
                fpaths = [fpaths]
            for fpath in fpaths:
                if isinstance(fpath, basestring):
                    if not verify_file(fpath):
                        ok = False
            if ok:
                info('The results are good.')

    # Clean up intermediate files unless asked to keep them.
    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
Exemple #8
0
def finalize_one(cnf, anno_vcf_fpath):
    """Log the location of the final annotated VCF for ``cnf.sample``.

    Nothing is logged when ``anno_vcf_fpath`` is empty. The same information
    is also accumulated in a local summary list (not used further in this
    block).
    """
    summary_lines = ['Annoatation finished for ' + cnf.sample + ':']
    if anno_vcf_fpath:
        vcf_line = 'VCF: ' + anno_vcf_fpath
        summary_lines.append(vcf_line)
        info('Saved final VCF to ' + anno_vcf_fpath)
def main(args):
    """Entry point of the variant filtering post-processing for one sample.

    Reads options and configs, obtains a vcf2txt table (either by running
    vcf2txt on the input VCF or by extracting this sample's rows from an
    existing vcf2txt file), runs vardict2mut on it, writes and indexes the
    filtered VCFs and, when ``cnf.qc`` is set, saves a QC report.

    ``args`` is not referenced in this body.
    """
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')
             ),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')
             ),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')
             ),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)
             ),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)
             ),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)
             ),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    # Default output file: <output_dir>/<caller or "variants">.txt
    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    # Only inputs with a VCF extension are validated as VCF.
    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        # No vcf2txt table was supplied: produce one from the input VCF.
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                # Copy the first line of the file when it is non-blank (the
                # header, presumably) and every later non-blank line whose
                # first tab-separated field equals the sample name.
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    # NOTE(review): the suffix here comes from `variant_filtering`, while
    # `variants_pass_fpath` below uses `source.mut_pass_suffix` - confirm
    # both refer to the same value.
    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath, add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        # NOTE(review): on this path the work directory is not removed.
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

        # Sample object carrying the input and output paths of this run.
        var_s = source.VarSample(cnf.sample, cnf.output_dir)
        var_s.anno_vcf_fpath = cnf.vcf
        var_s.varfilter_dirpath = var_s.dirpath

        # Derive the filtered VCF names from the input name without ".gz";
        # the filtered VCF path itself gets ".gz" appended again.
        ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') else splitext(var_s.anno_vcf_fpath)[0]
        ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
        var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

        var_s.variants_fpath = vcf2txt_res_fpath
        var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

        ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
        var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

        filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
        index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
        # presumably write_vcf also produced the uncompressed "pass" VCF
        # that is indexed here - verify against write_vcf.
        index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

        if cnf.qc:
            report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
            qc_dirpath = join(cnf.output_dir, 'qc')
            safe_mkdir(qc_dirpath)
            qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
            info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
            info('-' * 70)
            info()

        # Clean up intermediate files unless asked to keep them.
        if not cnf['keep_intermediate']:
            shutil.rmtree(cnf['work_dir'])

        info()
        info('*' * 70)
        info('Done filtering ' + var_s.name)