Example #1
def get_chr_len_fpath(cnf):
    chr_len_fpath = join(cnf.work_dir, 'chr_lengths.txt')
    if cnf.reuse_intermediate and file_exists(chr_len_fpath):
        info(chr_len_fpath + ' exists, reusing')
        return chr_len_fpath

    else:
        if not cnf.genome.seq:
            critical('There is no "seq" key in ' + cnf.sys_cnf + ' for "' +
                     cnf.genome.name + '" section')
            return None

        chr_lengths = get_chr_lengths_from_seq(adjust_path(cnf.genome.seq))

        with file_transaction(cnf.work_dir, chr_len_fpath) as tx:
            with open(tx, 'w') as handle:
                for c, l in chr_lengths:
                    handle.write(c + '\t' + str(l) + '\n')
    return chr_len_fpath
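Every snippet here funnels output through file_transaction, assumed to behave like bcbio's helper: hand the caller a temporary path and move it into place only if the with-block finishes cleanly, so half-written files never appear under the final name. A minimal self-contained sketch of that pattern (simple_file_transaction is a hypothetical stand-in, not the real helper):

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def simple_file_transaction(work_dir, final_fpath):
    # Hypothetical stand-in for file_transaction: write to a temp path,
    # promote it to the final name only on success, clean up otherwise.
    fd, tmp_fpath = tempfile.mkstemp(dir=work_dir, suffix='.tx')
    os.close(fd)
    try:
        yield tmp_fpath
        shutil.move(tmp_fpath, final_fpath)  # reached only if the block succeeded
    finally:
        if os.path.exists(tmp_fpath):
            os.remove(tmp_fpath)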
Example #2
def _concat_fastq(cnf, fastq_fpaths, output_fpath):
    if len(fastq_fpaths) == 1:
        if not isfile(output_fpath):
            info('  no need to merge - symlinking ' + fastq_fpaths[0] +
                 ' -> ' + output_fpath)
            if not isdir(dirname(output_fpath)):
                critical('Dir for the symlink ' + dirname(output_fpath) +
                         ' does not exist')
            os.symlink(fastq_fpaths[0], output_fpath)
        return output_fpath
    else:
        info('  merging ' + ', '.join(fastq_fpaths))
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
        else:
            with file_transaction(cnf.work_dir, output_fpath) as tx:
                with open(tx, 'w') as out:
                    for fq_fpath in fastq_fpaths:
                        with open(fq_fpath, 'r') as inp:
                            shutil.copyfileobj(inp, out)
        return output_fpath
Example #3
def sort_bed_by_alphabet(cnf,
                         input_bed_fpath,
                         output_bed_fpath=None,
                         chr_len_fpath=None):
    chr_lengths = get_chr_lengths(cnf, chr_len_fpath)
    chromosomes = set([c for (c, l) in chr_lengths])
    output_bed_fpath = adjust_path(
        output_bed_fpath) if output_bed_fpath else add_suffix(
            input_bed_fpath, 'sorted')

    regions = defaultdict(list)

    info('Sorting regions...')
    chunk_size = 10
    chunk_counter = 0
    with open(input_bed_fpath) as f:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    if chrom not in chromosomes:
                        continue
                    if chunk_counter == chunk_size or not regions[chrom]:
                        chunk_counter = 0
                        regions[chrom].append('')
                    regions[chrom][-1] += l
                    chunk_counter += 1
                for chrom in sorted(regions.keys()):
                    for region in regions[chrom]:
                        out.write(region)

    return output_bed_fpath
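If the BED comfortably fits in memory, a plain sort keyed on chromosome name and start coordinate does the job; the chunking above only exists to cap the number of string objects held at once, and it preserves the original line order within each chromosome. A sketch of the simpler in-memory variant (sort_bed_in_memory is illustrative, not part of the codebase):

def sort_bed_in_memory(input_bed_fpath, output_bed_fpath):
    # Assumes the whole BED fits in memory; sorts by chromosome name, then
    # by start coordinate (unlike the function above, which only groups by
    # chromosome and keeps the original order within each one).
    header_lines, records = [], []
    with open(input_bed_fpath) as f:
        for line in f:
            if not line.strip():
                continue
            if line.startswith('#'):
                header_lines.append(line)
                continue
            fields = line.rstrip('\n').split('\t')
            records.append((fields[0], int(fields[1]), line))
    records.sort(key=lambda r: (r[0], r[1]))
    with open(output_bed_fpath, 'w') as out:
        for h in header_lines:
            out.write(h)
        for _, _, line in records:
            out.write(line)
    return output_bed_fpath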
Example #4
def get_bedgraph_coverage(cnf,
                          bam_fpath,
                          chr_len_fpath=None,
                          output_fpath=None,
                          bed_fpath=None,
                          exit_on_error=True):
    chr_len_fpath = chr_len_fpath or get_chr_len_fpath(cnf)
    dedup_bam = intermediate_fname(cnf, bam_fpath, source.dedup_bam)
    if not verify_bam(dedup_bam, silent=True):
        info('Deduplicating bam file ' + bam_fpath)
        remove_dups(cnf, bam_fpath, dedup_bam)
    else:
        info(dedup_bam + ' exists')
    index_bam(cnf, dedup_bam)
    bam_bed_fpath = bam_to_bed(cnf, dedup_bam, to_gzip=False)
    if getsize(bam_bed_fpath) <= 0:
        info('No coverage for ' + bam_fpath + ', skipping.')
        return None

    sorted_bed_fpath = sort_bed_by_alphabet(cnf,
                                            bam_bed_fpath,
                                            chr_len_fpath=chr_len_fpath)
    if bed_fpath:
        in_bed_fpath = intersect_bed(cnf, sorted_bed_fpath, bed_fpath)
    else:
        in_bed_fpath = sorted_bed_fpath

    if not verify_file(in_bed_fpath, silent=True):
        info('No coverage in ' + in_bed_fpath)
        return None

    bedgraph_fpath = output_fpath or '%s.bedgraph' % splitext(bam_fpath)[0]
    with file_transaction(cnf.work_dir, bedgraph_fpath) as tx_fpath:
        bedtools = get_system_path(cnf, 'bedtools')
        cmdl = '{bedtools} genomecov -bg -split -g {chr_len_fpath} -i {in_bed_fpath}'.format(
            **locals())
        call(cnf, cmdl, exit_on_error=exit_on_error, output_fpath=tx_fpath)
    return bedgraph_fpath
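The coverage track itself comes from bedtools genomecov with -bg (BedGraph output), -split (respect blocked intervals) and -g (chromosome lengths), exactly the flags assembled above. Stripped of the cnf/call plumbing, the same invocation could look like this (bedgraph_from_bed is just an illustrative wrapper):

import subprocess

def bedgraph_from_bed(bed_fpath, chrom_sizes_fpath, bedgraph_fpath):
    # bedtools genomecov with -bg emits BedGraph intervals; -split honours
    # blocked (spliced) features; -g supplies chromosome lengths.
    cmd = ['bedtools', 'genomecov', '-bg', '-split',
           '-g', chrom_sizes_fpath, '-i', bed_fpath]
    with open(bedgraph_fpath, 'w') as out:
        subprocess.check_call(cmd, stdout=out)
    return bedgraph_fpath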
Example #5
def annotate_gene_counts(cnf, counts_fpath, ann_counts_fpath, genes_dict):
    unannotated_fpath = counts_fpath
    if not verify_file(unannotated_fpath):
        critical('Not found counts ' + unannotated_fpath)
    with file_transaction(cnf.work_dir, ann_counts_fpath) as tx:
        with open(tx, 'w') as annotated_f:
            with open(unannotated_fpath) as f:
                for i, l in enumerate(f):
                    if i == 0:
                        header = l.replace('\n', '').split('\t')
                        l = '\t'.join(header + ['HUGO'])
                        annotated_f.write(l + '\n')
                        continue
                    fs = l.replace('\n', '').split('\t')
                    gene_and_exon = fs[0].split(':')
                    gene_id = gene_and_exon[0]
                    if gene_id not in genes_dict:
                        continue
                    gene_symbol = genes_dict[gene_id]
                    l = '\t'.join(fs + [gene_symbol])
                    annotated_f.write(l + '\n')
    if not verify_file(ann_counts_fpath):
        critical('Could not annotate counts ' + unannotated_fpath)
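genes_dict maps gene IDs (the part of the counts row key before ':') to HUGO symbols; the snippet assumes it is prepared elsewhere. A hypothetical helper for building such a dict from a two-column ID<TAB>symbol file:

def read_gene_symbols(mapping_tsv_fpath):
    # Hypothetical two-column TSV: gene_id<TAB>HUGO symbol.
    genes_dict = {}
    with open(mapping_tsv_fpath) as f:
        for line in f:
            if not line.strip() or line.startswith('#'):
                continue
            gene_id, symbol = line.rstrip('\n').split('\t')[:2]
            genes_dict[gene_id] = symbol
    return genes_dict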
Example #6
def parse_svs(cnf, sv_file, out_bed_fpath):
    """
    Parse sv vcf into a bed file
    """
    bp_dict = {}
    vcf_reader = vcf.Reader(filename=sv_file)
    with file_transaction(cnf.work_dir, out_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for record in vcf_reader:
                if record.FILTER is None:
                    record.FILTER = []
                try: # if there is no SVTYPE or MATEID, ignore for now
                    if record.INFO['SVTYPE'] == "BND":
                        if record.INFO['MATEID'][0] not in bp_dict:
                            #Store the record in the dict until its pair is found
                            bp_dict[record.ID] = record
                        else:
                            #If the other BND is in the dict, annotate
                            record2 = bp_dict[record.INFO['MATEID'][0]]
                            try:
                                if record.samples[0]["PR"][1] + record.samples[0]["SR"][1] >= 5:
                                    out.write('\t'.join([str(record.CHROM), str(record.POS), str(record2.CHROM), str(record2.POS) + '\n']))
                            except AttributeError:
                                pass
                            #remove used record from the dict
                            del bp_dict[record.INFO['MATEID'][0]]
                    else:
                        #first check if 'END' is specified
                        if 'END' in record.INFO:
                            try:
                                # require 1Mb difference in coordinates and evidence from 10 reads or more
                                if record.samples[0]["PR"][1] + record.samples[0]["SR"][1] >= 10 and abs(record.INFO['END'] - record.POS) > 1000000:
                                    out.write('\t'.join([str(record.CHROM), str(record.POS), str(record.CHROM), str(record.INFO['END']) + '\n']))
                            except AttributeError:
                                pass
                except KeyError:
                    pass
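The BND branch relies on a standard pairing trick: park each breakend record in a dict keyed by its ID until the record naming it as MATEID arrives. The same idea in isolation, on plain dicts instead of PyVCF records (the input format here is hypothetical):

def pair_breakends(records):
    # records: iterable of dicts with 'ID' and 'MATEID' keys (hypothetical
    # stand-ins for VCF breakend records). Yields (first, mate) once both
    # ends of a breakend pair have been seen.
    pending = {}
    for rec in records:
        mate_id = rec['MATEID']
        if mate_id in pending:
            yield pending.pop(mate_id), rec
        else:
            pending[rec['ID']] = rec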
def write_combined_results(cnf,
                           variants_fpath,
                           samples,
                           vcf2txt_fpaths,
                           freq_in_cohort_by_vark,
                           count_in_cohort_by_vark,
                           suffix=variant_filtering.mut_pass_suffix,
                           do_cohort_filtering=True):
    artefacts_samples = OrderedDefaultDict(list)
    artefacts_data = OrderedDict()

    variants_count = defaultdict(int)
    written_lines_count = 0
    status_col, reason_col, n_samples_col, n_var_col, pcnt_sample_col, ave_af_col, incidentalome_col \
        = None, None, None, None, None, None, None

    with file_transaction(cnf.work_dir, variants_fpath) as tx:
        with open(tx, 'w') as out:
            for sample_i, (sample, vcf2txt_fpath) in enumerate(
                    zip(samples, vcf2txt_fpaths)):
                mut_fpath = add_suffix(vcf2txt_fpath, suffix)
                with file_transaction(cnf.work_dir,
                                      mut_fpath) as fixed_mut_fpath_tx:
                    with open(mut_fpath) as f, open(fixed_mut_fpath_tx,
                                                    'w') as fixed_f_out:
                        for line_i, l in enumerate(f):
                            fs = l.replace('\n', '').split('\t')
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i == 0:
                                fixed_f_out.write(l)
                                if status_col is not None and status_col != fs.index(
                                        'Significance'):
                                    critical(
                                        'Different format in ' + mut_fpath +
                                        ': status_col=' +
                                        str(fs.index('Significance')) +
                                        ', but the first sample was ' +
                                        str(status_col) +
                                        ', please rerun VarFilter from the beginning'
                                    )
                                status_col = fs.index('Significance')
                                reason_col = status_col + 1
                                n_samples_col = fs.index('N_samples')
                                n_var_col = fs.index('N_Var')
                                pcnt_sample_col = fs.index('Pcnt_sample')
                                ave_af_col = fs.index('Ave_AF')
                                if 'Incidentalome' in fs:
                                    incidentalome_col = fs.index(
                                        'Incidentalome')
                            if line_i > 0:
                                fs = l.replace('\n', '').split('\t')
                                chrom, pos, db_id, ref, alt = fs[1:6]
                                vark = ':'.join([chrom, pos, ref, alt])
                                assert len(fs) > reason_col, 'len(fs)=' + str(len(fs)) + ' > reason_col=' + str(reason_col) + \
                                                             ' in ' + sample.name + ', ' + vcf2txt_fpath + ' for line\n' + l

                                freq = freq_in_cohort_by_vark[vark]
                                cnt = count_in_cohort_by_vark[vark]
                                fs[n_samples_col] = str(len(samples))
                                fs[n_var_col] = str(cnt)
                                fs[pcnt_sample_col] = str(freq)
                                fs[ave_af_col] = ''
                                l = '\t'.join(fs) + '\n'

                                if do_cohort_filtering:
                                    if fs[status_col] in ['known', 'likely']:
                                        variants_count['not_filtered'] += 1
                                    elif freq >= cnf.variant_filtering.max_ratio and cnt > cnf.variant_filtering.max_sample_cnt:
                                        artefacts_samples[vark].append(
                                            sample.name)
                                        # if incidentalome_col:
                                        #     fs.remove(fs[incidentalome_col])
                                        artefacts_data[vark] = fs
                                        continue
                                variants_count['good_freq'] += 1
                                fixed_f_out.write(l)
                                out.write(l)
                                written_lines_count += 1
    return artefacts_samples, artefacts_data, variants_count, written_lines_count
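The cohort filter applied above boils down to one rule: 'known' and 'likely' variants are always kept, and anything else is treated as a putative artefact once it recurs in at least max_ratio of the cohort and in more than max_sample_cnt samples. As a standalone predicate (is_cohort_artefact is illustrative only):

def is_cohort_artefact(status, freq_in_cohort, count_in_cohort,
                       max_ratio, max_sample_cnt):
    # 'known'/'likely' variants are never filtered; anything else is treated
    # as a putative artefact when it recurs in >= max_ratio of the cohort
    # and in more than max_sample_cnt samples.
    if status in ('known', 'likely'):
        return False
    return freq_in_cohort >= max_ratio and count_in_cohort > max_sample_cnt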
def combine_results(cnf,
                    samples,
                    vcf2txt_fpaths,
                    variants_fpath,
                    pass_variants_fpath=None,
                    reject_variants_fpath=None):
    info('Combining vcf2txt variants')
    not_existing_snames = []
    if cnf.reuse_intermediate and isfile(variants_fpath) and verify_file(
            variants_fpath):
        info('Combined filtered results ' + variants_fpath +
             ' exist, reusing.')
    else:
        for sample_i, (sample,
                       vcf2txt_fpath) in enumerate(zip(samples,
                                                       vcf2txt_fpaths)):
            if not verify_file(vcf2txt_fpath, description='variants file'):
                not_existing_snames.append(sample.name)
        if not_existing_snames:
            critical('Variants files were not found for samples: ' +
                     ', '.join(not_existing_snames))
        with file_transaction(cnf.work_dir, variants_fpath) as tx:
            with open(tx, 'w') as out:
                for sample_i, (sample, vcf2txt_fpath) in enumerate(
                        zip(samples, vcf2txt_fpaths)):
                    with open(vcf2txt_fpath) as f:
                        for line_i, l in enumerate(f):
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i > 0:
                                out.write(l)
        verify_file(variants_fpath,
                    is_critical=True,
                    description='combined mutation calls')
        info('Saved vcf2txt variants to ' + variants_fpath)

    info()
    info('Combining PASSed mutations')
    pass_variants_fpath = pass_variants_fpath or add_suffix(
        variants_fpath, variant_filtering.mut_pass_suffix)
    reject_variants_fpath = reject_variants_fpath or add_suffix(
        variants_fpath, variant_filtering.mut_reject_suffix)
    not_existing_pass_snames = []
    if cnf.reuse_intermediate and isfile(pass_variants_fpath) and verify_file(pass_variants_fpath)\
            and isfile(reject_variants_fpath) and verify_file(reject_variants_fpath):
        info('Combined PASSed filtered results ' + pass_variants_fpath +
             ' exist, reusing.')
    else:
        for sample_i, (sample,
                       vcf2txt_fpath) in enumerate(zip(samples,
                                                       vcf2txt_fpaths)):
            if not verify_file(add_suffix(vcf2txt_fpath,
                                          variant_filtering.mut_pass_suffix),
                               description='PASS variants file'):
                not_existing_pass_snames.append(sample.name)
        if not_existing_pass_snames:
            critical('PASS variants files were not found for samples: ' +
                     ', '.join(not_existing_pass_snames))
        info('*' * 70)
        if cnf.variant_filtering.max_ratio < 1.0:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio))
        else:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio) +
                 ', i.e. no filter')

        info('Calculating frequencies of variants in the cohort')
        info('*' * 70)
        freq_in_cohort_by_vark, count_in_cohort_by_vark = count_mutations_freq(
            cnf, samples, vcf2txt_fpaths)
        reject_freq_in_cohort_by_vark, reject_count_in_cohort_by_vark = count_mutations_freq(
            cnf,
            samples,
            vcf2txt_fpaths,
            suffix=variant_filtering.mut_reject_suffix)
        info()

        if cnf.variant_filtering.max_ratio < 1.0:
            info('Saving mutations with cohort freq < ' +
                 str(cnf.variant_filtering.max_ratio) + ' to ' +
                 pass_variants_fpath)

        artefacts_samples, artefacts_data, variants_count, written_lines_count = write_combined_results(
            cnf,
            pass_variants_fpath,
            samples,
            vcf2txt_fpaths,
            freq_in_cohort_by_vark,
            count_in_cohort_by_vark,
            suffix=variant_filtering.mut_pass_suffix,
            do_cohort_filtering=True)

        _, _, _, reject_written_lines_count = write_combined_results(
            cnf,
            reject_variants_fpath,
            samples,
            vcf2txt_fpaths,
            reject_freq_in_cohort_by_vark,
            reject_count_in_cohort_by_vark,
            suffix=variant_filtering.mut_reject_suffix,
            do_cohort_filtering=False)

        if len(artefacts_samples.keys()) > 0:
            reason = 'cohort freq > ' + str(cnf.variant_filtering.max_ratio)
            with open(reject_variants_fpath) as f:
                line = f.readline().split()
                reason_col = line.index('Reason') if 'Reason' in line else None
            with open(reject_variants_fpath, 'a') as f:
                for vark, samples in artefacts_samples.items():
                    fs = artefacts_data[vark]
                    if reason_col:
                        fs[reason_col] = reason
                    else:
                        fs.append(reason)
                    f.write('\t'.join(fs) + '\n')

            info('Skipped artefacts with cohort freq > ' +
                 str(cnf.variant_filtering.max_ratio) +
                 ' and sample count > ' +
                 str(cnf.variant_filtering.max_sample_cnt) + ': ' +
                 str(len(artefacts_samples.keys())))
            info('Added artefacts into ' + reject_variants_fpath)

        info('Variants exempt from cohort filtering (known/likely): ' +
             str(variants_count['not_filtered']))
        if len(artefacts_samples.keys()) > 0:
            info('Variants kept after the cohort frequency filter: ' +
                 str(variants_count['good_freq']))

        verify_file(pass_variants_fpath,
                    'PASS variants file',
                    is_critical=True)
        info('Written ' + str(written_lines_count) + ' records to ' +
             pass_variants_fpath)
        info('Written ' +
             str(reject_written_lines_count + len(artefacts_samples.keys())) +
             ' rejected records to ' + reject_variants_fpath)

        variants_fpath = verify_file(variants_fpath, is_critical=True)
        pass_variants_fpath = verify_file(pass_variants_fpath,
                                          is_critical=True)

        if not_existing_snames or not_existing_pass_snames:
            return None, None

    return variants_fpath, pass_variants_fpath
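count_mutations_freq is defined elsewhere; from how its results are used here, it is assumed to return, per variant key chrom:pos:ref:alt, the fraction of cohort samples carrying the variant and the raw sample count. A sketch of that computation under this assumption (cohort_freqs and varks_by_sample are hypothetical names):

from collections import defaultdict

def cohort_freqs(varks_by_sample):
    # varks_by_sample: {sample_name: set of 'chrom:pos:ref:alt' keys} (hypothetical input)
    count_by_vark = defaultdict(int)
    for varks in varks_by_sample.values():
        for vark in varks:
            count_by_vark[vark] += 1
    n_samples = float(len(varks_by_sample))
    freq_by_vark = dict((vark, cnt / n_samples) for vark, cnt in count_by_vark.items())
    return freq_by_vark, count_by_vark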
def postprocess_vcf(cnf, work_dir, var_sample, caller_name, variants,
                    mutations, vcf2txt_res_fpath):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
         ': writing filtered VCFs')

    filter_values = set(variants.values())

    # Saving .anno.filt.vcf.gz and .anno.filt.pass.vcf
    ungz, gz = None, None
    if var_sample.filt_vcf_fpath.endswith('.gz'):
        ungz = splitext(var_sample.filt_vcf_fpath)[0]
        gz = var_sample.filt_vcf_fpath
    else:
        ungz = var_sample.filt_vcf_fpath
        gz = var_sample.filt_vcf_fpath + '.gz'
    if not var_sample.filt_tsv_fpath:
        var_sample.filt_tsv_fpath = splitext(ungz)[0] + '.tsv'

    if cnf.reuse_intermediate \
            and verify_file(var_sample.filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.pass_filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.filt_tsv_fpath, silent=True):
        info(var_sample.filt_vcf_fpath + ' and ' +
             var_sample.pass_filt_vcf_fpath + ' exist; reusing.')

    else:
        safe_mkdir(dirname(var_sample.filt_vcf_fpath))
        safe_mkdir(dirname(var_sample.pass_filt_vcf_fpath))

        with open_gzipsafe(var_sample.anno_vcf_fpath) as vcf_f, \
             file_transaction(work_dir, ungz) as filt_tx, \
             file_transaction(work_dir, var_sample.pass_filt_vcf_fpath) as pass_tx:
            with open(filt_tx, 'w') as filt_f, open(pass_tx, 'w') as pass_f:
                info(var_sample.name +
                     ((', ' + caller_name) if caller_name else '') +
                     ': opened ' + var_sample.anno_vcf_fpath +
                     ', writing to ' + ungz + ' and ' +
                     var_sample.pass_filt_vcf_fpath)

                for l in vcf_f:
                    if l.startswith('#'):
                        if l.startswith('#CHROM'):
                            filt_f.write(
                                '##FILTER=<ID=vcf2txt,Description="Hard-filtered by vcf2txt.pl">\n'
                            )
                            filt_f.write(
                                '##FILTER=<ID=vardict2mut,Description="Hard-filtered by vardict2mut.pl">\n'
                            )
                            for filt_val in filter_values:
                                if filt_val != 'PASS':
                                    filt_f.write('##FILTER=<ID=' + filt_val +
                                                 ',Description="">\n')
                        filt_f.write(l)
                        pass_f.write(l)
                    else:
                        ts = l.split('\t')
                        chrom, pos, alt = ts[0], ts[1], ts[4]
                        if (chrom, pos, alt) in mutations:
                            ts[6] = 'PASS'
                            filt_f.write('\t'.join(ts))
                            pass_f.write('\t'.join(ts))
                        else:
                            if ts[6] in ['', '.', 'PASS']:
                                ts[6] = ''
                                filter_value = variants.get((chrom, pos, alt))
                                if filter_value is None:
                                    ts[6] += 'vcf2txt'
                                elif filter_value == 'TRUE':
                                    ts[6] += 'vardict2mut'
                                else:
                                    ts[6] += filter_value
                            filt_f.write('\t'.join(ts))

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered VCFs to ' + ungz + ' and ' +
             var_sample.pass_filt_vcf_fpath)

        if False:
            info()
            info(var_sample.name +
                 ((', ' + caller_name) if caller_name else '') +
                 ': writing filtered TSVs')
            # Converting to TSV - saving .anno.filt.tsv
            if 'tsv_fields' in cnf.annotation and cnf.tsv:
                tmp_tsv_fpath = make_tsv(cnf, ungz, var_sample.name)
                if not tmp_tsv_fpath:
                    err('TSV conversion didn\'t work')
                else:
                    if isfile(var_sample.filt_tsv_fpath):
                        os.remove(var_sample.filt_tsv_fpath)
                    shutil.copy(tmp_tsv_fpath, var_sample.filt_tsv_fpath)

                info(var_sample.name +
                     ((', ' + caller_name) if caller_name else '') +
                     ': saved filtered TSV to ' + var_sample.filt_tsv_fpath)

    info('Done postprocessing filtered VCF.')
    return ungz
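The per-record work above amounts to rewriting column 7 (FILTER) of each tab-separated VCF data line, either to PASS or to the name of the filter that rejected it. Reduced to a helper (set_vcf_filter is illustrative):

def set_vcf_filter(vcf_line, filter_value):
    # VCF data lines are tab-separated; column 7 (index 6) is FILTER.
    fields = vcf_line.rstrip('\n').split('\t')
    fields[6] = filter_value
    return '\t'.join(fields) + '\n'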
def run_vardict2mut(cnf,
                    vcf2txt_res_fpath,
                    vardict2mut_res_fpath=None,
                    vardict2mut_executable=None):
    cmdline = None
    if vardict2mut_res_fpath is None:
        vardict2mut_res_fpath = add_suffix(vcf2txt_res_fpath,
                                           variant_filtering.mut_pass_suffix)
    vardict2mut_reject_fpath = add_suffix(vcf2txt_res_fpath,
                                          variant_filtering.mut_reject_suffix)

    check_filtering_results(vardict2mut_res_fpath)

    if not vardict2mut_executable:
        # vardict2mut_executable = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
        vardict2mut_executable = 'vardict2mut'

    c = cnf.variant_filtering

    cmdline = '{vardict2mut_executable} {vcf2txt_res_fpath} '
    if vardict2mut_executable.endswith('.pl'):
        cmdline += ' --report_reason '
        if c.min_hotspot_freq is not None and c.min_hotspot_freq != 'default':
            cmdline += ' -F ' + str(c.min_hotspot_freq)
        if c.max_ratio_vardict2mut is not None:
            cmdline += ' -R ' + str(c.max_ratio_vardict2mut)
        if cnf.genome.filter_common_snp:
            cmdline += ' --filter_common_snp {cnf.genome.filter_common_snp} '
        if cnf.genome.filter_common_artifacts:
            cmdline += ' --filter_common_artifacts {cnf.genome.filter_common_artifacts} '
        if cnf.genome.actionable:
            cmdline += ' --actionable {cnf.genome.actionable} '
        if cnf.genome.compendia_ms7_hotspot:
            cmdline += ' --compendia_ms7_hotspot {cnf.genome.compendia_ms7_hotspot} '
        if cnf.snpeffect_export_polymorphic:
            cmdline += ' --snpeffect_export_polymorphic {cnf.snpeffect_export_polymorphic} '
        if cnf.actionable_hotspot:
            cmdline += ' --actionable_hotspot {cnf.actionable_hotspot} '
        if cnf.ruledir: cmdline += ' --ruledir {cnf.ruledir} '
        cmdline = cmdline.format(**locals())
        res = call(cnf, cmdline, vardict2mut_res_fpath, exit_on_error=False)

    else:
        filt_yaml_fpath = join(cnf.work_dir, 'filt_cnf.yaml')
        info('Writing filtering yaml into ' + filt_yaml_fpath)
        with file_transaction(cnf.work_dir, filt_yaml_fpath) as tx, open(
                tx, 'w') as out:
            with open(cnf.run_cnf) as run_cnf:
                lines = []
                met_variant_filtering = False
                for l in run_cnf:
                    if l.startswith('variant_filtering:'):
                        met_variant_filtering = True
                        continue
                    if met_variant_filtering:
                        if l.startswith(' '):
                            out.write(l.lstrip())
                        else:
                            break

        cmdline += ' --filt-cnf ' + filt_yaml_fpath
        cmdline += ' --work-dir ' + cnf.work_dir
        cmdline += (' --debug ' if cnf.debug else '')
        cmdline += ' --genome ' + cnf.genome.name
        cmdline += ' -o ' + vardict2mut_res_fpath
        cmdline += ' --o-reject ' + vardict2mut_reject_fpath

        if cnf.cohort_freqs_fpath:
            cmdline += ' --cohort-freqs ' + cnf.cohort_freqs_fpath

        cmdline = cmdline.format(**locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vardict2mut_res_fpath,
                   stdout_to_outputfile=False)

    if not res:
        return None
    else:
        return res
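The Python branch passes the filtering settings on by copying the indented block under the top-level variant_filtering: key of the run config into a standalone YAML file, dedented. The extraction step on its own might look like this (extract_yaml_section is a hypothetical name):

def extract_yaml_section(yaml_fpath, section_name):
    # Collect the indented lines under a top-level key such as
    # 'variant_filtering:', dedented so they form a standalone YAML document.
    collected = []
    in_section = False
    with open(yaml_fpath) as f:
        for line in f:
            if line.startswith(section_name + ':'):
                in_section = True
                continue
            if in_section:
                if line.startswith(' '):
                    collected.append(line.lstrip())
                elif line.strip():
                    break
    return ''.join(collected)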
Example #11
def downsample(cnf,
               sample_name,
               fastq_L_fpath,
               fastq_R_fpath,
               N,
               output_dir,
               suffix=None,
               quick=False):
    """ get N random headers from a fastq file without reading the
    whole thing into memory
    modified from: http://www.biostars.org/p/6544/
    quick=True will just grab the first N reads rather than do a true
    downsampling
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else ''
        for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]

    l_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_L_fpath), suffix or 'subset'))
    r_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_R_fpath), suffix or 'subset'))
    if cnf.reuse_intermediate and verify_file(
            l_out_fpath, silent=True) and verify_file(r_out_fpath,
                                                      silent=True):
        info(l_out_fpath + ' and ' + r_out_fpath + ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        records_num = sum(1 for _ in open_gzipsafe(fastq_L_fpath)) / 4
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' +
                 str(LIMIT) + ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) +
                 ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        else:
            info(sample_name + ': downsampling to ' + str(N))
            rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    info('Opening ' + fastq_R_fpath)
    fh2 = open_gzipsafe(fastq_R_fpath) if fastq_R_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath)

    written_records = 0
    with file_transaction(cnf.work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, basestring):
            tx_out_f1 = tx_out_files
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        sub2 = None
        if r_out_fpath:
            info('Opening ' + str(tx_out_f2) + ' to write')
            sub2 = open_gzipsafe(tx_out_f2, "w")
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            rec_no += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) +
                     ', rec_no ' + str(rec_no))
            if rec_no > records_num:
                info(sample_name + ' reached the limit of ' + str(records_num) +
                     ' read lines, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) +
             ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_R_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath +
         ' and ' + r_out_fpath + ', total ' + str(written_records) +
         ' paired reads written')
    return l_out_fpath, r_out_fpath
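The core of the downsampling is choosing the record indices up front and then streaming the FASTQ once, copying only the selected 4-line records, so nothing is held in memory. A single-file sketch of that technique (sample_fastq is illustrative; the real function also keeps the R1/R2 files in sync):

import random

def sample_fastq(in_fpath, out_fpath, n_records, total_records):
    # Choose the record indices up front, then stream the file once and copy
    # only the selected 4-line FASTQ records; nothing is buffered in memory.
    wanted = set(random.sample(range(total_records), n_records))
    with open(in_fpath) as inp, open(out_fpath, 'w') as out:
        for rec_no in range(total_records):
            record = [inp.readline() for _ in range(4)]
            if rec_no in wanted:
                out.writelines(record)
    return out_fpath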
Example #12
def write_to_csv_file(work_dir,
                      jira_case,
                      project_list_fpath,
                      country_id,
                      project_name,
                      samples_num=None,
                      analysis_dirpath=None,
                      html_report_url=None):
    info('Reading project list ' + project_list_fpath)
    with open(project_list_fpath) as f:
        lines = f.readlines()
    uncom_lines = [l.strip() for l in lines if not l.strip().startswith('#')]

    header = uncom_lines[0].strip()
    info('header: ' + header)
    header_keys = header.split(',')
    # 'Updated By,PID,Name,JIRA URL,HTML report path,Datestamp,Data Hub,Analyses directory UK,Analyses directory US,Type,Division,Department,Sample Number,Reporter,Assignee,Description,IGV,Notes'
    index_of_pid = header_keys.index('PID') if 'PID' in header_keys else 1

    values_by_keys_by_pid = OrderedDict()
    for l in uncom_lines[1:]:
        if l:
            values = map(__unquote, l.split(','))
            pid = values[index_of_pid]
            values_by_keys_by_pid[pid] = OrderedDict(zip(header_keys, values))

    pid = project_name
    with file_transaction(work_dir, project_list_fpath) as tx_fpath:
        if pid not in values_by_keys_by_pid.keys():
            # info(pid + ' not in ' + str(values_by_keys_by_pid.keys()))
            info('Adding new record for ' + pid)
            values_by_keys_by_pid[pid] = OrderedDict(
                zip(header_keys, [''] * len(header_keys)))
        else:
            info('Updating existing record for ' + pid)
        d = values_by_keys_by_pid[pid]
        for k in header_keys:
            if k not in d:
                err('Error: ' + k + ' not in ' + project_list_fpath + ' for ' +
                    pid)

        d['PID'] = pid

        if analysis_dirpath:
            d['Analyses directory ' +
              (country_id if not is_local() else 'US')] = analysis_dirpath
        if project_name and (
                analysis_dirpath or not __unquote(d['Name'])
        ):  # update only if running after bcbio, or no value there at all
            d['Name'] = project_name
        if html_report_url and (
                analysis_dirpath or not __unquote(d['HTML report path'])
        ):  # update only if running after bcbio, or no value there at all
            d['HTML report path'] = html_report_url

        if jira_case:
            d['JIRA URL'] = jira_case.url
            # if 'Updated By' in d and __unquote(d['Updated By']):
            d['Updated By'] = getpass.getuser()
            if jira_case.description:
                d['Description'] = jira_case.summary
            if jira_case.data_hub:
                d['Data Hub'] = jira_case.data_hub
            if jira_case.type:
                d['Type'] = jira_case.type
            if jira_case.department:
                d['Department'] = jira_case.department
            if jira_case.division:
                d['Division'] = jira_case.division
            if jira_case.assignee:
                d['Assignee'] = jira_case.assignee
            if jira_case.reporter:
                d['Reporter'] = jira_case.reporter
        if samples_num:
            d['Sample Number'] = str(samples_num)
        d['Datestamp'] = timestamp()

        new_line = ','.join(
            __requote(d.get(k, '').replace(',', ';').replace('\n', ' | '))
            or '' for k in header_keys)

        with open(tx_fpath, 'w') as f:
            os.umask(0002)
            try:
                os.chmod(tx_fpath, 0774)
            except OSError:
                err(format_exc())
            for l in lines:
                if not l:
                    continue
                if l.startswith('#'):
                    f.write(l)
                else:
                    l = unicode(l, 'utf-8')
                    l_ascii = l.encode('ascii', 'ignore')
                    if ',' + project_name + ',' in l_ascii or ',"' + project_name + '",' in l_ascii:
                        info('Old csv line: ' + l_ascii)
                        # f.write('#' + l)
                    else:
                        f.write(l)
            f.write(new_line + '\n')
        info()
        info('New line: ' + new_line)
        info()
                else:
                    if slept >= limit:
                        return None
                    else:
                        if not silent:
                            err('Waiting ' + str(timeout) + ' seconds...')
                        time.sleep(timeout)
                        slept += timeout
                        if not silent:
                            err('Retrying...')
                            err()
        return res_

    res = None  # = proc or output_fpath
    if output_fpath and not output_is_dir:
        with file_transaction(cnf.work_dir, output_fpath) as tx_out_fpath:
            res = do_handle_oserror(cmdline,
                                    tx_out_fpath,
                                    stderr_dump=stderr_dump,
                                    max_number_of_tries=max_number_of_tries)
    else:
        res = do_handle_oserror(cmdline,
                                stderr_dump=stderr_dump,
                                max_number_of_tries=max_number_of_tries)
        if res is not None:
            clean()
            return res

    clean()

    if res:
Example #14
def _extract_fields(cnf, vcf_fpath, samplename, main_sample_index=0):
    fname, _ = splitext_plus(basename(vcf_fpath))
    tsv_fpath = join(cnf.work_dir, fname + '.tsv')

    if cnf.get('reuse_intermediate'):
        if file_exists(tsv_fpath):
            info(tsv_fpath + ' exists, reusing')
            return tsv_fpath

    manual_tsv_fields = cnf.annotation['tsv_fields']
    if not manual_tsv_fields:
        return None

    all_fields = []
    basic_fields = []
    info_fields = []
    eff_fields = []
    gt_fields = []
    tumor_gt = 'GEN[' + str(main_sample_index) + '].'
    normal_gt = 'GEN[' + str(1 - main_sample_index) + '].'

    lines = []

    with open(vcf_fpath) as inp:
        reader = vcf.Reader(inp)

        info('TSV saver: Building field list')
        for f in [rec.keys()[0] for rec in manual_tsv_fields]:
            if f.startswith('GEN'):
                _f = f.split('.')[1]
                if len(reader.samples) > 0:
                    if _f in reader.formats:
                        gt_fields.append(_f)
                        all_fields.append(f.replace('GEN[*].', tumor_gt))
                        if len(reader.samples) > 1:
                            all_fields.append(f.replace('GEN[*].', normal_gt))
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header FORMAT records')

            elif f in ['CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'QUAL']:
                all_fields.append(f)
                basic_fields.append(f)

            elif any(f.startswith(af) and af in reader.infos for af in ['EFF', 'ANN']):
                all_fields.append(f)
                eff_fields.append(f)

            else:
                if f in reader.infos:
                    info_fields.append(f)
                    all_fields.append(f)
                elif f == 'SAMPLE':
                    all_fields.append(f)
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header INFO records')

        info('TSV saver: Iterating over records...')
        d = OrderedDict()
        for rec in reader:
            for f in basic_fields:
                d[f] = rec.__dict__[f]

            for f in info_fields:
                d[f] = rec.INFO[f] if f in rec.INFO else ''

            if 'SAMPLE' not in d:
                d['SAMPLE'] = samplename

            if eff_fields:
                eff = rec.INFO.get(eff_fields[0][:3])
                if not eff:
                    for f in eff_fields:
                        d[f] = ''
                else:
                    eff_fs = eff[0].split('|')
                    eff_d = dict()
                    for val, header in zip(eff_fs, ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID', 'FEATURE', 'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C', 'HGVS_P', 'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN', 'DISTANCE', 'LOG']):
                        if 'POSLEN' in header:
                            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
                            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
                        else:
                            eff_d[header] = val
                    #ANN=GA |3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|Coding|4/4|c.*173dupT|||||173|;
                    #Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'
                    for f in eff_fields:
                        d[f] = eff_d[f.split('.')[1]]

            if rec.FORMAT:
                for _f in gt_fields:
                    if _f in rec.FORMAT:
                        d[tumor_gt + _f] = rec.samples[main_sample_index][_f]
                        if len(rec.samples) > 1 - main_sample_index:
                            d[normal_gt + _f] = rec.samples[1 - main_sample_index][_f]
                        else:
                            d[normal_gt + _f] = ''
                    else:
                        d[tumor_gt + _f] = ''
                        d[normal_gt + _f] = ''

            fs = []
            for f in all_fields:
                v = d[f]
                fs.append(v if v != '.' else '')
            lines.append(fs)

    info('TSV saver: Adding GEN[*] fields both for sample and for matched normal...')
    field_map = dict()
    for rec in manual_tsv_fields:
        k = rec.keys()[0]
        v = rec.values()[0]
        if k.startswith('GEN[*].'):
            _f = k.split('.')[1]
            field_map[tumor_gt + _f] = v
            field_map[normal_gt + _f] = 'Matched_' + v
        else:
            field_map[k] = v

    info('TSV saver: Writing TSV to ' + tsv_fpath)
    with file_transaction(cnf.work_dir, tsv_fpath) as tx:
        with open(tx, 'w') as out:
            out.write('\t'.join(field_map[f] for f in all_fields) + '\n')
            for fs in lines:
                new_fs = []
                for f in fs:
                    if isinstance(f, list):
                        new_fs.append(','.join(map(str, f)))
                    elif f is None:
                        new_fs.append('')
                    else:
                        new_fs.append(str(f))
                out.write('\t'.join(new_fs) + '\n')

    info('TSV saver: saved ' + tsv_fpath)
    return tsv_fpath
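The EFF/ANN handling splits one SnpEff annotation entry on '|' and maps the pieces onto the header names listed above, splitting the 'pos/length' style fields into separate _POS and _LEN keys. The same parsing as a standalone helper (parse_ann_entry is illustrative):

ANN_FIELDS = ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID', 'FEATURE',
              'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C', 'HGVS_P',
              'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN', 'DISTANCE', 'LOG']

def parse_ann_entry(ann_value):
    # One SnpEff ANN entry is '|'-separated; the 'pos/length' style fields are
    # split into separate _POS and _LEN keys, mirroring the loop above.
    parsed = {}
    for header, val in zip(ANN_FIELDS, ann_value.split('|')):
        if 'POSLEN' in header:
            prefix = header.split('_')[0]
            parsed[prefix + '_POS'] = val.split('/')[0] if val else ''
            parsed[prefix + '_LEN'] = val.split('/')[1] if val and '/' in val else ''
        else:
            parsed[header] = val
    return parsed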
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')
             ),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')
             ),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')
             ),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)
             ),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)
             ),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)
             ),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath, add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

        var_s = source.VarSample(cnf.sample, cnf.output_dir)
        var_s.anno_vcf_fpath = cnf.vcf
        var_s.varfilter_dirpath = var_s.dirpath

        ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') else splitext(var_s.anno_vcf_fpath)[0]
        ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
        var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

        var_s.variants_fpath = vcf2txt_res_fpath
        var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

        ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
        var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

        filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
        index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
        index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

        if cnf.qc:
            report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
            qc_dirpath = join(cnf.output_dir, 'qc')
            safe_mkdir(qc_dirpath)
            qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
            info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
            info('-' * 70)
            info()

        if not cnf['keep_intermediate']:
            shutil.rmtree(cnf['work_dir'])

        info()
        info('*' * 70)
        info('Done filtering ' + var_s.name)
def convert_vardict_txts_to_bcbio_vcfs(cnf,
                                       bs,
                                       sample,
                                       output_dir=None,
                                       pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(
        output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(
            output_vcf_fpath + '.gz') and verify_vcf(pass_output_vcf_fpath +
                                                     '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath +
             '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(
        cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
        file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [
                        filter_descriptions_dict[reason]
                        if reason in filter_descriptions_dict else reason
                        for reason in mut.reason.split(' and ')
                    ]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath +
                     ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' +
             output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' +
         output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath
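All of the VCF rewriting here goes through PyVCF's Reader/Writer pair: read records against an existing header, tweak FILTER/INFO, and write them back out with the reader as the header template. A minimal sketch of that loop, keeping only records with no failing filters (copy_pass_records is illustrative):

import vcf  # PyVCF, as used throughout these snippets

def copy_pass_records(in_vcf_fpath, out_vcf_fpath):
    # Read records against an existing header, keep the ones with no failing
    # filters, and write them back out with the reader as the header template.
    reader = vcf.Reader(filename=in_vcf_fpath)
    out_f = open(out_vcf_fpath, 'w')
    writer = vcf.Writer(out_f, template=reader)
    for record in reader:
        if not record.FILTER:
            writer.write_record(record)
    writer.close()
    return out_vcf_fpath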