def add_backfill_genotypes(vcf):
    """Add the backfill genotype columns to every variant of vcf, in place.

    For each variant, BFILL_REF, BFILL_ALT, BFILL_COV and BFILL_VAF are
    appended to its genotype names when missing, and every sample's
    genotype dict receives a "." placeholder for any of those keys it
    lacks.  A variant is only written back to vcf if something was added.
    """
    from genomicode import vcflib

    # Example of the genotype layout that is being extended:
    # FORMAT      GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
    # <genotype>  0/1:12:28:28:24:4:14.29%:5.5746E-2:37:36:14:10:3:1

    backfill_columns = ["BFILL_REF", "BFILL_ALT", "BFILL_COV", "BFILL_VAF"]

    for var_num in range(vcf.num_variants()):
        variant = vcflib.get_variant(vcf, var_num)
        dirty = False
        for column in backfill_columns:
            # Declare the column on the variant itself.
            if column not in variant.genotype_names:
                variant.genotype_names.append(column)
                dirty = True
            # Give every sample a placeholder value for the column.
            for sample_genotype in variant.sample2genodict.itervalues():
                if column not in sample_genotype:
                    sample_genotype[column] = "."
                    dirty = True
        if dirty:
            vcflib.set_variant(vcf, var_num, variant)
def fix_vcf_file(sample, infile, outfile):
    """Add the FORMAT and <SAMPLE> columns to a JointSNVMix VCF file.

    JointSNVMix produces VCF files that don't have FORMAT and <SAMPLE>
    columns.  This reads infile, builds a DP:RD:AD:FREQ genotype for
    every variant from its call, adds the two columns plus the matching
    ##FORMAT header lines, and writes the result to outfile.

    sample   Name of the sample column to add.
    infile   Path of the VCF file to read.
    outfile  Path of the VCF file to write.

    Raises AssertionError if the file already has a FORMAT column, any
    samples, or a column named like sample.
    """
    from genomicode import vcflib

    vcf = vcflib.read(infile)
    matrix = vcf.matrix

    genotype_names = ["DP", "RD", "AD", "FREQ"]

    # Get the calls for each variant.
    all_genotypes = []  # one formatted genotype string per variant
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)
        # The file has no sample columns yet, so no sample is given.
        call = vcflib.get_call(var, None)
        geno_dict = {
            "DP": call.total_reads,
            "RD": call.num_ref,
            "AD": call.num_alt,
            "FREQ": call.vaf,
        }
        all_genotypes.append(
            vcflib._format_genotype(genotype_names, geno_dict))

    # Add FORMAT.  Every variant shares the same format string.
    FORMAT_STRING = ":".join(genotype_names)
    assert "FORMAT" not in matrix
    matrix.headers.append("FORMAT")
    matrix.headers_h.append("FORMAT")
    matrix.header2annots["FORMAT"] = [FORMAT_STRING] * matrix.num_annots()

    # Add the sample.
    assert not vcf.samples
    assert sample not in matrix
    matrix.headers.append(sample)
    matrix.headers_h.append(sample)
    matrix.header2annots[sample] = all_genotypes
    vcf.samples = [sample]

    # Add the proper header lines.  FREQ is an allele frequency, so it
    # is declared as Float rather than Integer.
    lines = [
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Allelic depth for the ref allele in the tumor sample">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Allelic depth for the alt allele in the tumor sample">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">',
        '##FORMAT=<ID=FREQ,Number=1,Type=Float,Description="Variant allele frequency">',
    ]
    matrix.headerlines.extend(lines)

    vcflib.write(outfile, vcf)
def backfill_vcf(in_file, bf_file, out_file):
    """Backfill read counts from bf_file into the variants of in_file.

    For every sample found in both files, each variant of in_file that
    has a call at the same (chrom, pos) in bf_file gets its BFILL_REF,
    BFILL_ALT, BFILL_COV and BFILL_VAF genotype values set from that
    call.  Variants that occur only in bf_file are appended with their
    filter replaced by "BACKFILL".  The result is written to out_file;
    in_file itself is not modified.

    in_file   Path of the VCF file to annotate.
    bf_file   Path of the VCF file holding the backfill read counts.
    out_file  Path of the VCF file to write.

    Raises AssertionError if bf_file has two called variants at the same
    position for one sample.
    """
    import copy
    from genomicode import vcflib

    in_vcf = vcflib.read(in_file)
    bf_vcf = vcflib.read(bf_file)

    # May have multiple samples, e.g. germline and tumor.  Only samples
    # present in both files can be backfilled.
    SAMPLES = [x for x in in_vcf.samples if x in bf_vcf.samples]

    # Parse out the read counts from the backfill vcf.
    bf_variants = {}  # (sample, chrom, pos) -> (Variant, Call)
    bf_keys = []      # keys of bf_variants, in backfill file order
    for i in range(bf_vcf.num_variants()):
        var = vcflib.get_variant(bf_vcf, i)
        for sample in SAMPLES:
            call = vcflib.get_call(var, sample)
            # Skip calls that carry no read information at all.
            if call.num_ref is None and call.num_alt is None and \
               call.total_reads is None and call.vaf is None:
                continue
            key = sample, var.chrom, var.pos
            assert key not in bf_variants, "Duplicate: %s %s %s" % key
            bf_variants[key] = var, call
            bf_keys.append(key)

    # Find the variants that can be backfilled.
    # Don't worry if the variants (ref/alt) match.  Just want a rough
    # estimate of the coverage at this location.
    matches = []  # list of (in_var_num, key, bf_call)
    for i in range(in_vcf.num_variants()):
        in_var = vcflib.get_variant(in_vcf, i)
        for sample in SAMPLES:
            key = sample, in_var.chrom, in_var.pos
            # Skip if there is no backfill information.
            if key not in bf_variants:
                continue
            bf_var, bf_call = bf_variants[key]
            matches.append((i, key, bf_call))

    # Update the read counts in a copy of the annotated VCF file.
    out_vcf = copy.deepcopy(in_vcf)
    add_backfill_genotypes(out_vcf)

    # Genotype key -> attribute of the backfill call that supplies it.
    mapping = [
        ("BFILL_REF", "num_ref"),
        ("BFILL_ALT", "num_alt"),
        ("BFILL_COV", "total_reads"),
        ("BFILL_VAF", "vaf"),
    ]
    seen = set()  # keys of bf_variants that matched a variant in in_file
    for var_num, key, bf_call in matches:
        sample = key[0]
        seen.add(key)
        var = vcflib.get_variant(out_vcf, var_num)
        GD = var.sample2genodict[sample]
        changed = False
        for gt_key, call_attr in mapping:
            value = getattr(bf_call, call_attr)
            if value is None:
                continue
            if isinstance(value, list):
                # Several values for one call: arbitrarily use max.
                value = max(value)
            GD[gt_key] = vcflib._format_vcf_value(value)
            changed = True
        if changed:
            vcflib.set_variant(out_vcf, var_num, var)

    # Add the variants that are in bf_file, but not in in_file.  A
    # variant is stored under one key per sample, so remember which
    # Variant objects were already added to avoid duplicating them.
    added = set()  # id() of each backfill Variant already added
    for key in bf_keys:
        if key in seen:
            continue
        bf_var, bf_call = bf_variants[key]
        if id(bf_var) in added:
            continue
        added.add(id(bf_var))
        # VarScan sets the filter_ to "PASS" for everything.  Get rid
        # of this.
        bf_var.filter_ = ["BACKFILL"]
        vcflib.add_variant(out_vcf, bf_var)

    vcflib.write(out_file, out_vcf)
def parse_snpeff_file(vcf_filename, out_filename):
    """Write the snpEff annotations of a VCF file as tab-delimited text.

    The annotations are read from the ANN entry of INFO.  The output
    file has the columns:
      Chrom  Pos  Ref  Alt  <snpEff-specific columns>
    where the snpEff-specific column names come from the ANN header
    line.  One row is written per annotation of each variant that has
    an ANN entry.

    Returns without creating out_filename if the VCF file has no ANN
    header line.

    vcf_filename  Path of the snpEff-annotated VCF file to read.
    out_filename  Path of the text file to write.

    Raises AssertionError if there are no header lines, more than one
    distinct ANN header, an ANN header in an unexpected layout, or an
    annotation whose field count differs from the header.
    """
    from genomicode import vcflib

    # ##INFO=<ID=ANN,Number=.,Type=String,
    #   Description="Functional annotations: '
    #   Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID |
    #   Feature_Type | Feature_ID | Transcript_BioType | Rank |
    #   HGVS.c | HGVS.p | cDNA.pos / cDNA.length |
    #   CDS.pos / CDS.length | AA.pos / AA.length | Distance |
    #   ERRORS / WARNINGS / INFO' ">

    vcf = vcflib.read(vcf_filename)

    # Figure out the Functional annotations.
    assert vcf.matrix.headerlines, "No header lines"
    ann_lines = [
        x for x in vcf.matrix.headerlines if x.find("<ID=ANN,") >= 0]
    if not ann_lines:
        return

    # No duplicates.
    # The ANN line can end with:
    #   ERRORS / WARNINGS / INFO'">
    #   ERRORS / WARNINGS / INFO' ">
    # I encountered a VCF file that contained two ANN lines differing
    # by this spacing.  Normalize these lines and make sure there are
    # no duplicates.
    ann_lines = [
        x.replace(
            "ERRORS / WARNINGS / INFO' \">", "ERRORS / WARNINGS / INFO'\">")
        for x in ann_lines
    ]
    unique_lines = list(set(ann_lines))
    assert len(unique_lines) == 1, "Multiple ANN headers: %s" % vcf_filename

    x = unique_lines[0].strip()
    TEXT = "Functional annotations:"
    assert TEXT in x
    x = x[x.index(TEXT) + len(TEXT):]  # Get rid of "Functional annotations:"
    assert x.endswith('">')
    x = x[:-2].strip()                 # No ">
    assert x.startswith("'") and x.endswith("'")
    x = x[1:-1]                        # No ''
    annotations = [name.strip() for name in x.split("|")]

    header = ["Chrom", "Pos", "Ref", "Alt"] + annotations
    # Use a with-block so the file is closed even if an assertion fails.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for i in range(vcf.num_variants()):
            var = vcflib.get_variant(vcf, i)
            if "ANN" not in var.infodict:
                continue
            alt = ",".join(var.alt)

            # Can have multiple annotations if there are more than one
            # allele.
            #   <ALLELE>|...|...|,<ALLELE>|...|...|
            # If this happens, just add them to the file.
            for annot in var.infodict["ANN"].split(","):
                values = [value.strip() for value in annot.split("|")]
                assert len(values) == len(annotations), \
                    "Mismatch annotations %d %d: %s %s %d" % (
                        len(annotations), len(values), vcf_filename,
                        var.chrom, var.pos)
                row = [var.chrom, var.pos, var.ref, alt] + values
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")
def summarize_vcf_file(filename, filestem, header, outfilename, lock):
    """Append one tab-delimited summary row per variant and sample.

    Each row holds: caller name, filestem, sample, chrom, pos, ref,
    alt, source ("DNA" or "RNA"), num_ref, num_alt, total_reads, vaf,
    filter, call and GQ.  Rows are buffered and appended to outfilename
    in batches while holding lock.

    filename     Path of the VCF file to summarize.
    filestem     Name written to each row; also used to restore a
                 sample name that was altered by hashlib.hash_var.
    header       Column names of the output; only its length is used,
                 to check the width of each row.
    outfilename  Path of the file to append to.
    lock         Object with acquire() and release(), held while
                 writing.  Presumably shared with other writers of
                 outfilename -- verify against caller.

    Raises AssertionError if a row does not have len(header) fields.
    """
    from genomicode import hashlib
    from genomicode import vcflib

    MAX_BUFFERED_LINES = 100000

    vcf = vcflib.read(filename)

    lines = []
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)

        caller_name = var.caller.name
        ref = var.ref
        alt = ",".join(var.alt)
        # NOTE(review): the filter comes from vcf.caller while the name
        # comes from var.caller -- confirm these are the same caller.
        filter_str = vcf.caller.get_filter(var)

        for sample in var.samples:
            # If sample begins with an integer, there may be a
            # "X" pre-pended to it.  Try to detect this case
            # and fix it.
            clean_sample = sample
            if sample == hashlib.hash_var(filestem):
                clean_sample = filestem

            source = "DNA"
            if caller_name == "Radia":
                # DNA  <clean_sample>      196B-lung
                # RNA  <clean_sample>_RNA  196B-lung_RNA
                # Figure out whether this is RNA and fix it.
                if clean_sample.endswith("_RNA"):
                    clean_sample = clean_sample[:-4]
                    source = "RNA"

            genodict = var.sample2genodict[sample]
            call = vcflib.get_call(var, sample)

            num_ref = vcflib._format_vcf_value(call.num_ref, None_char="")
            num_alt = vcflib._format_vcf_value(call.num_alt, None_char="")
            total_reads = vcflib._format_vcf_value(
                call.total_reads, None_char="")
            vaf = vcflib._format_vcf_value(call.vaf, None_char="")
            call_str = vcflib._format_vcf_value(call.call, None_char="")
            # A missing GQ is written as an empty field.
            GQ = genodict.get("GQ", "")
            if GQ in [None, "."]:
                GQ = ""

            x = caller_name, filestem, clean_sample, var.chrom, var.pos, \
                ref, alt, source, \
                num_ref, num_alt, total_reads, vaf, filter_str, call_str, GQ
            assert len(x) == len(header)
            lines.append("\t".join(map(str, x)))

            # Flush periodically to bound memory use.
            if len(lines) >= MAX_BUFFERED_LINES:
                _append_lines(outfilename, lock, lines)
                lines = []

    _append_lines(outfilename, lock, lines)


def _append_lines(outfilename, lock, lines):
    """Append lines, newline-terminated, to outfilename while holding lock."""
    # Nothing buffered: writing here would add a stray blank line.
    if not lines:
        return
    text = "\n".join(lines) + "\n"
    lock.acquire()
    try:
        handle = open(outfilename, 'a')
        try:
            handle.write(text)
        finally:
            handle.close()
    finally:
        # Always release, so a failed write cannot block other writers.
        lock.release()