def get_vcf_readers(mutations_by_experiment, cur_group_num):
    vcf_readers, filt_vcf_readers = dict(), dict()
    for e, muts in mutations_by_experiment.items():
        if not cur_group_num or get_group_num(e.key) == cur_group_num:
            variant_caller = 'vardict' if 'vardict' in e.sample.variantcallers else 'vardict-java'
            vcf_fpath = e.sample.vcf_by_callername.get(variant_caller)
            if vcf_fpath:
                vcf_readers[e] = vcf.Reader(open_gzipsafe(vcf_fpath, 'r'))
                filt_vcf_fpath = e.sample.find_filt_vcf_by_callername(variant_caller)
                if filt_vcf_fpath:
                    filt_vcf_readers[e] = vcf.Reader(open_gzipsafe(filt_vcf_fpath, 'r'))
    return vcf_readers, filt_vcf_readers
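# For reference, a minimal sketch of the gzip-aware opener used throughout this
# module. The real open_gzipsafe is defined elsewhere in this codebase; this
# hypothetical stand-in only illustrates the assumed contract: transparently
# open either a .gz file or a plain file for line iteration (Python 2 text mode).
import gzip

def _open_gzipsafe_sketch(fpath, mode='r'):
    if fpath.endswith('.gz'):
        return gzip.open(fpath, mode)  # gzip file objects support line iteration too
    return open(fpath, mode)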
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and is not empty')
    vcf_f = open_gzipsafe(vcf_fpath)  # local name must not shadow the vcf module used elsewhere
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf_f, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF: ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf_f)
    except Exception:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical: raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except (IndexError, ValueError) as e:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug(type(e).__name__ + ' parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except Exception:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath
    finally:
        vcf_f.close()
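# Hedged usage sketch: verify_vcf returns the path on success (a header-only
# VCF still counts as valid) and None on failure, so it can gate a pipeline
# step. The path here is hypothetical:
#
#     if not verify_vcf('/path/to/sample1.vcf.gz', silent=True):
#         err('Skipping sample1: input VCF failed verification')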
# Helper lifted from an iterate_vcf-style wrapper (see the sketch below): it
# closes over proc_rec_fun, Record, input_fpath, args and kwargs from its
# original enclosing scope, so it is not runnable standalone. Dedented to
# module level here for readability.
def _convert_vcf(inp_f, out_f):
    max_bunch_size = 100000
    written_records = 0
    bunch = []

    reader = vcf_parser.Reader(inp_f)
    writer = vcf_parser.Writer(out_f, reader)

    i = 0
    while True:
        rec = next(reader, None)
        if rec is None:
            break

        # apply the per-record callback; a falsy result drops the record
        rec = proc_rec_fun(Record(rec, input_fpath, i), *args, **kwargs)
        if rec:
            bunch.append(rec)
            written_records += 1

        # flush in bunches so the whole VCF is never held in memory
        if len(bunch) >= max_bunch_size:
            writer.write_records(bunch)
            info('Written lines: ' + str(written_records))
            bunch = []
        i += 1

    # flush the remaining tail
    writer.write_records(bunch)
    info('Written lines: ' + str(written_records))
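# Hypothetical sketch of the wrapper _convert_vcf was lifted from (names other
# than _convert_vcf are illustrative, not the real API): the helper closes over
# proc_rec_fun, input_fpath, args and kwargs from such an enclosing scope.
#
#     def iterate_vcf(cnf, input_fpath, proc_rec_fun, *args, **kwargs):
#         output_fpath = ...  # derive a work-dir path for the converted VCF
#         def _convert_vcf(inp_f, out_f):
#             ...  # body as above
#         return convert_file(cnf, input_fpath, output_fpath, _convert_vcf)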
def vcf_is_empty(cnf, vcf_fpath):
    # True iff the VCF has no data records (header-only); stops at the first record
    with open_gzipsafe(vcf_fpath) as vcf_f:
        reader = vcf_parser.Reader(vcf_f)
        return next(reader, None) is None
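# Note the contrast with verify_vcf above: vcf_is_empty is True for a
# well-formed, header-only VCF, whereas verify_vcf still returns the path.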
def _get_subs_and_indel_stats(vcf_fpath, chr_lengths, plot_scale):
    reader = vcf.Reader(open_gzipsafe(vcf_fpath, 'r'))

    variants_distribution = dict()
    for chr_name, chr_length in chr_lengths:
        variants_distribution[chr_name] = [0] * max(1, chr_length // plot_scale)
    variants_distribution['OTHER'] = 0
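    # 'OTHER' aggregates variants on contigs absent from chr_lengths (e.g.
    # unplaced scaffolds), so it is a scalar count rather than a per-bin list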

    substitutions = OrderedDict()
    nucleotides = ['A', 'C', 'G', 'T']

    def _add_nuc(nuc):
        substitutions[nuc] = OrderedDict()
        for nuc2 in nucleotides:
            if nuc != nuc2:
                substitutions[nuc][nuc2] = 0

    for nuc in nucleotides:
        _add_nuc(nuc)

    indel_lengths = []
    for rec in reader:
        # for variants distribution plot
        if rec.CHROM not in variants_distribution:
            variants_distribution['OTHER'] += 1
        else:
            region_id = min((rec.POS - 1) // plot_scale, len(variants_distribution[rec.CHROM]) - 1)
            variants_distribution[rec.CHROM][region_id] += 1
        # for substitution and indel plots
        for alt in rec.ALT:
            if rec.is_snp:
                if rec.REF not in substitutions:
                    nucleotides.append(rec.REF)
                    _add_nuc(rec.REF)
                if alt.sequence not in substitutions:
                    nucleotides.append(alt.sequence)
                    _add_nuc(alt.sequence)
                # setdefault guards a REF row created before alt.sequence joined
                # the matrix (row exists but the new column does not)
                substitutions[rec.REF].setdefault(alt.sequence, 0)
                substitutions[rec.REF][alt.sequence] += 1
            elif rec.is_indel:
                if alt is None:
                    indel_lengths.append(-1)
                else:
                    indel_lengths.append(len(alt) - len(rec.REF))

    # the last bin of each chromosome absorbs the remainder (chr_length % plot_scale),
    # so rescale its count to a per-plot_scale density
    for chr_name, chr_length in chr_lengths:
        last_region_length = chr_length % plot_scale + (0 if chr_length < plot_scale else plot_scale)
        variants_distribution[chr_name][-1] = int(variants_distribution[chr_name][-1] * plot_scale /
                                                  float(last_region_length))
    return variants_distribution, substitutions, indel_lengths
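# Worked example of the binning above (numbers are hypothetical): with
# plot_scale = 1000000 and chr_length = 2500000, the chromosome gets
# max(1, 2500000 // 1000000) = 2 bins, and a variant at POS = 1999999 lands in
# bin min((1999999 - 1) // 1000000, 1) = 1. The last bin spans
# 2500000 % 1000000 + 1000000 = 1500000 bp, so its count is rescaled by
# 1000000 / 1500000.0 to keep all bins comparable per plot_scale bases.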
def make_report(cnf, vcf_fpath, sample):
    set_db_versions(cnf)
    step_greetings('Quality control reports')

    total_with_rejected = 0
    total = 0
    snps = 0
    inss = 0
    dels = 0
    dbsnps = 0
    cosmics = 0
    novels = 0
    hets = 0
    homs = 0
    transitions = 0
    transversions = 0

    with open_gzipsafe(vcf_fpath) as f:
        reader = vcf_parser.Reader(f)
        for rec in (vcf_processing.Record(rec, vcf_fpath, i)
                    for i, rec in enumerate(reader)):
            total_with_rejected += 1

            # count only unfiltered records (FILTER empty or the literal 'PASS');
            # anything else contributes to total_with_rejected only
            if not rec.FILTER or rec.FILTER == 'PASS':
                if rec.FILTER:
                    warn('Warn: ' + rec.get_variant() + ' FILTER=' + str(rec.FILTER))

                total += 1

                if rec.is_snp:
                    snps += 1
                    if rec.is_transition:
                        transitions += 1
                    elif len(rec.ALT) == 1:
                        transversions += 1
                elif rec.is_indel:
                    if rec.is_deletion:
                        dels += 1
                    elif len(rec.ALT) == 1:
                        inss += 1

                if not rec.ID:
                    novels += 1
                else:
                    ids = rec.ID
                    if isinstance(ids, basestring):
                        ids = [ids]
                    if any(id_.startswith('COS') for id_ in ids):
                        cosmics += 1
                    if any(id_.startswith('rs') for id_ in ids):
                        dbsnps += 1

                call = rec.samples[0]
                if call.called:
                    if call.gt_type == 1:
                        hets += 1
                    elif call.gt_type == 2:
                        homs += 1

    report = SampleReport(sample, metric_storage=metric_storage)
    report.add_record('Total variants', total)
    report.add_record('SNPs', snps)
    report.add_record('Insertions', inss)
    report.add_record('Deletions', dels)
    report.add_record('Novel', novels)
    report.add_record('Novel, %', 1.0 * novels / total if total else None)
    report.add_record('In dbSNP', dbsnps)
    report.add_record('In dbSNP, %', 1.0 * dbsnps / total if total else None)
    report.add_record('In Cosmic', cosmics)
    report.add_record('In Cosmic, %', 1.0 * cosmics / total if total else None)
    report.add_record('Het/hom', float(hets) / homs if homs != 0 else None)
    report.add_record(
        'Ti/tv',
        float(transitions) / transversions if transversions != 0 else None)
    report.add_record('Total with rejected', total_with_rejected)

    return report
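# Rough sanity checks for the ratios above: Ti/tv is typically around 2.0-2.1
# for human whole-genome callsets and closer to 3.0 for exomes, while a value
# near 0.5 (the random expectation) usually indicates noisy calls.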
def _extract_fields(cnf, vcf_fpath, samplename, main_sample_index=0):
    fname, _ = splitext_plus(basename(vcf_fpath))
    tsv_fpath = join(cnf.work_dir, fname + '.tsv')

    if cnf.get('reuse_intermediate'):
        if file_exists(tsv_fpath):
            info(tsv_fpath + ' exists, reusing')
            return tsv_fpath

    manual_tsv_fields = cnf.annotation['tsv_fields']
    if not manual_tsv_fields:
        return None

    all_fields = []
    basic_fields = []
    info_fields = []
    eff_fields = []
    gt_fields = []
    tumor_gt = 'GEN[' + str(main_sample_index) + '].'
    normal_gt = 'GEN[' + str(1 - main_sample_index) + '].'
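    # SnpSift-style sample addressing: with main_sample_index == 0, tumor_gt is
    # 'GEN[0].' (the main sample) and normal_gt is 'GEN[1].' (its matched
    # normal); GEN[*] placeholders from the field config are rewritten to these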

    lines = []

    with open_gzipsafe(vcf_fpath) as inp:
        reader = vcf.Reader(inp)

        info('TSV saver: Building field list')
        for f in [rec.keys()[0] for rec in manual_tsv_fields]:
            if f.startswith('GEN'):
                _f = f.split('.')[1]
                if len(reader.samples) > 0:
                    if _f in reader.formats:
                        gt_fields.append(_f)
                        all_fields.append(f.replace('GEN[*].', tumor_gt))
                        if len(reader.samples) > 1:
                            all_fields.append(f.replace('GEN[*].', normal_gt))
                    else:
                        warn('TSV Saver: Warning: ' + f + ' is not in VCF header FORMAT records')

            elif f in ['CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'QUAL']:
                all_fields.append(f)
                basic_fields.append(f)

            elif any(f.startswith(af) and af in reader.infos for af in ['EFF', 'ANN']):
                all_fields.append(f)
                eff_fields.append(f)

            else:
                if f in reader.infos:
                    info_fields.append(f)
                    all_fields.append(f)
                elif f == 'SAMPLE':
                    all_fields.append(f)
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header INFO records')

        info('TSV saver: Iterating over records...')
        d = OrderedDict()
        for rec in reader:
            for f in basic_fields:
                d[f] = getattr(rec, f)

            for f in info_fields:
                d[f] = rec.INFO[f] if f in rec.INFO else ''

            if 'SAMPLE' not in d:
                d['SAMPLE'] = samplename

            if eff_fields:
                eff = rec.INFO.get(eff_fields[0][:3])  # INFO key is the 'EFF'/'ANN' prefix of e.g. 'EFF[*].GENE'
                if not eff:
                    for f in eff_fields:
                        d[f] = ''
                else:
                    eff_fs = eff[0].split('|')
                    eff_d = dict()
                    ann_headers = ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID', 'FEATURE',
                                   'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C', 'HGVS_P',
                                   'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN', 'DISTANCE', 'LOG']
                    for val, header in zip(eff_fs, ann_headers):
                        if 'POSLEN' in header:
                            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
                            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
                        else:
                            eff_d[header] = val
                    #ANN=GA |3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|Coding|4/4|c.*173dupT|||||173|;
                    #Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO
                    for f in eff_fields:
                        d[f] = eff_d[f.split('.')[1]]

            if rec.FORMAT:
                for _f in gt_fields:
                    if _f in rec.FORMAT:
                        d[tumor_gt + _f] = rec.samples[main_sample_index][_f]
                        if len(rec.samples) > 1 - main_sample_index:
                            d[normal_gt + _f] = rec.samples[1 - main_sample_index][_f]
                        else:
                            d[normal_gt + _f] = ''
                    else:
                        d[tumor_gt + _f] = ''
                        d[normal_gt + _f] = ''

            fs = []
            for f in all_fields:
                v = d[f]
                fs.append(v if v != '.' else '')
            lines.append(fs)

    info('TSV saver: Adding GEN[*] fields both for sample and for matched normal...')
    field_map = dict()
    for rec in manual_tsv_fields:
        k = rec.keys()[0]
        v = rec.values()[0]
        if k.startswith('GEN[*].'):
            _f = k.split('.')[1]
            field_map[tumor_gt + _f] = v
            field_map[normal_gt + _f] = 'Matched_' + v
        else:
            field_map[k] = v

    info('TSV saver: Writing TSV to ' + tsv_fpath)
    with file_transaction(cnf.work_dir, tsv_fpath) as tx:
        with open(tx, 'w') as out:
            out.write('\t'.join(field_map[f] for f in all_fields) + '\n')
            for fs in lines:
                new_fs = []
                for f in fs:
                    if isinstance(f, list):
                        new_fs.append(','.join(map(str, f)))
                    elif f is None:
                        new_fs.append('')
                    else:
                        new_fs.append(str(f))
                out.write('\t'.join(new_fs) + '\n')

    info('TSV saver: saved ' + tsv_fpath)
    return tsv_fpath
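# The tsv_fields config consumed above is assumed to be a YAML-derived list of
# one-key mappings, hence the rec.keys()[0] / rec.values()[0] access, e.g.:
#
#     annotation:
#       tsv_fields:
#           - CHROM: Chromosome
#           - POS: Position
#           - ID: ID
#           - 'GEN[*].AF': Allele_Frequency
#           - 'EFF[*].GENE': Gene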
def convert_vardict_txts_to_bcbio_vcfs(cnf,
                                       bs,
                                       sample,
                                       output_dir=None,
                                       pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(
        output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(
            output_vcf_fpath + '.gz') and verify_vcf(pass_output_vcf_fpath +
                                                     '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath +
             '.gz exists, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'
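    # note: reuse above is gated on verify_vcf rather than bare existence, so a
    # truncated or header-less .gz left by a previous run is regenerated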

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(
        cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
        file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [
                        filter_descriptions_dict[reason]
                        if reason in filter_descriptions_dict else reason
                        for reason in mut.reason.split(' and ')
                    ]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath +
                     ' for mutation ' + str(mut))

        # close the writers while still inside the transaction so buffered
        # records are flushed before file_transaction moves the files into place
        if vcf_writer:
            vcf_writer.close()
        vcf_pass_writer.close()

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' + output_gzipped_vcf_fpath)
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' + output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath
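# Hedged usage sketch (cnf, bs and sample come from the surrounding pipeline,
# and cnf.caller_name is expected to be set, e.g. to 'vardict'):
#
#     filt_vcf, pass_vcf = convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample)
#     if pass_vcf:
#         info('PASS-only VCF ready for bcbio: ' + pass_vcf)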