def strelka_indel_af(vcf_file):
    """Print basic info for each indel variant in a Strelka VCF
    and add the indel allele frequency (AF) for each sample.
    """
    if vcf_file == "-":
        vcfreader = vcf.VCFReader(sys.stdin)
    else:
        assert os.path.exists(vcf_file)
        vcfreader = vcf.VCFReader(filename=vcf_file)
    # NOTE: pyvcf swallows the first line, i.e. it expects a header!
    print "CHROM\tPOS\t{}".format('\t'.join(vcfreader.samples))
    for var in vcfreader:
        assert var.is_indel
        # print minimal variant info
        print "{}\t{}".format(var.CHROM, var.POS),
        for s in range(len(var.samples)):
            # TAR/TIR are Strelka's tiered counts of ref- and indel-supporting reads
            tar = sum(int(x) for x in var.samples[s].data.TAR)
            tir = sum(int(x) for x in var.samples[s].data.TIR)
            print "\t{}".format(tir / float(tir + tar)),
        print
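Strelka reports indel support as tiered counts: TAR holds reads supporting the reference allele and TIR reads supporting the indel allele, so the per-sample allele fraction above is TIR/(TIR+TAR). A minimal sketch of that arithmetic with a guard for the zero-depth case the function above would crash on (the helper name is hypothetical):

def _tier_af(tar_counts, tir_counts):
    # AF = TIR / (TIR + TAR); return 0.0 rather than dividing by zero
    tar = sum(int(x) for x in tar_counts)
    tir = sum(int(x) for x in tir_counts)
    return tir / float(tir + tar) if (tir + tar) > 0 else 0.0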
def filter_by_background(in_vcf, full_vcf, background, data):
    """Filter SV calls also present in background samples.

    Skips filtering of inversions, which are not characterized differently
    between cases and controls in test datasets.
    """
    Filter = collections.namedtuple('Filter', ['id', 'desc'])
    back_filter = Filter(id='InBackground',
                         desc='Rejected due to presence in background sample')
    out_file = "%s-filter.vcf" % utils.splitext_plus(in_vcf)[0]
    # also check against the bgzipped output produced on a previous run
    if not utils.file_uptodate(out_file, in_vcf) and not utils.file_uptodate(out_file + ".gz", in_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                reader = vcf.VCFReader(filename=in_vcf)
                # register the new FILTER so it appears in the output header
                reader.filters["InBackground"] = back_filter
                full_reader = vcf.VCFReader(filename=full_vcf)
                writer = vcf.VCFWriter(out_handle, template=reader)
                for out_rec, rec in zip(reader, full_reader):
                    rec_type = rec.genotype(dd.get_sample_name(data)).gt_type
                    if rec_type == 0 or any(rec_type == rec.genotype(dd.get_sample_name(x)).gt_type
                                            for x in background):
                        out_rec.add_filter("InBackground")
                    writer.write_record(out_rec)
    return vcfutils.bgzip_and_index(out_file, data["config"])
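Registering the namedtuple in reader.filters before building the writer is what emits the ##FILTER=<ID=InBackground,...> header line: PyVCF's VCFWriter copies the template reader's header, including its filters, at construction time. A minimal standalone sketch of the same pattern, with hypothetical file names:

import collections
import vcf

Filter = collections.namedtuple('Filter', ['id', 'desc'])
reader = vcf.VCFReader(filename='in.vcf')  # hypothetical input
reader.filters['MyFilter'] = Filter(id='MyFilter', desc='Example filter')
with open('out.vcf', 'w') as out_handle:
    writer = vcf.VCFWriter(out_handle, template=reader)
    for rec in reader:
        rec.add_filter('MyFilter')  # appends to rec.FILTER
        writer.write_record(rec)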
def main_maize(ki11_snps=None, dirs=None):
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
        for r in vcf.VCFReader(open('B73Ki11.q20.vcf')):
            ki11_snps[r.CHROM][r.POS] = r
        print >> sys.stderr, 'Finished reading B73Ki11.q20.vcf.'

    ki11_shortread_cov = defaultdict(lambda: {})  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in sp.MPileUpReader('Ki11.raw.mpileup'):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    print >> sys.stderr, "Finished reading Ki11.raw.mpileup."

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    for r in DictReader(open('B73_RefV4.fa.repeat_list.txt'), delimiter='\t'):
        if r['chrom'] not in repeat_by_chrom:
            repeat_by_chrom[r['chrom']] = IntervalTree()
        repeat_by_chrom[r['chrom']].add(int(r['start0']), int(r['end1']))
    print >> sys.stderr, 'Finished reading B73_RefV4.fa.repeat_list.txt.'

    FIELDS = ['dir', 'chrom', 'pos', 'ref', 'alt_Short', 'alt_PB',
              'in_Short', 'in_PB', 'cov_Short', 'cov_PB', 'genomic_HP']
    out_f = open('evaled.isophase_SNP.txt', 'w')
    writer_f = DictWriter(out_f, FIELDS, delimiter='\t')
    writer_f.writeheader()

    if dirs is None:
        dirs = glob.glob('by_loci/*size*/')

    for d1 in dirs:
        mpileup = os.path.join(d1, 'ccs.mpileup')
        mapfile = os.path.join(d1, 'fake.mapping.txt')
        vcffile = os.path.join(d1, 'phased.partial.vcf')
        nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
        if not os.path.exists(vcffile):
            assert os.path.exists(nosnp)
            print >> sys.stderr, 'Skipping {0} because no SNPs found.'.format(d1)
        else:
            print >> sys.stderr, 'Evaluating {0}.'.format(d1)
            # use a lower min_cov here because of a few close cases where BQ filtering lowered coverage
            good_positions, cov_at_pos = get_positions_to_recover(mapfile, mpileup, ki11_snps, min_cov=30)
            name = d1.split('/')[1]
            eval_isophase(vcffile, ki11_snps, good_positions, cov_at_pos,
                          repeat_by_chrom, ki11_shortread_cov, writer_f, name)
    out_f.close()
    return ki11_snps
def parse(cls, vcf_path):
    if hasattr(vcf_path, "read"):
        h = vcf_path
    else:
        if vcf_path.endswith(".gz"):
            h = gzip.open(vcf_path)
        else:
            h = open(vcf_path)
    try:
        variantes = vcf.VCFReader(h)
        for v in variantes:
            effects = [SnpeffEffect.read(x)
                       for x in (v.INFO["ANN"] if "ANN" in v.INFO else [])]
            intragenic = [(i, x) for i, x in enumerate(effects)
                          if "intragenic_variant" in x.annotation]
            if intragenic:
                i, _ = intragenic[0]
                if (("upstream_gene_variant" in effects[0].annotation) or
                        ("downstream_gene_variant" in effects[0].annotation)):
                    # move the intragenic effect to the front of the list
                    effects = [effects[i]] + effects[:i] + effects[i + 1:]
            yield (v, effects)
    finally:
        h.close()
def main_maize(ki11_snps=None, dirs=None):
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
        for r in vcf.VCFReader(open('B73Ki11.q20.vcf')):
            ki11_snps[r.CHROM][r.POS] = r
        print >> sys.stderr, 'Finished reading B73Ki11.q20.vcf.'

    out_f = open('evaled.isophase_SNP.txt', 'w')
    out_f.write('dir\tchrom\tpos\tref\talt_Short\talt_PB\tin_Short\tin_PB\n')

    if dirs is None:
        dirs = glob.glob('by_loci/*size*/')

    for d1 in dirs:
        mpileup = os.path.join(d1, 'ccs.mpileup')
        mapfile = os.path.join(d1, 'fake.mapping.txt')
        vcffile = os.path.join(d1, 'phased.partial.vcf')
        nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
        if not os.path.exists(vcffile):
            assert os.path.exists(nosnp)
            print >> sys.stderr, 'Skipping {0} because no SNPs found.'.format(d1)
        else:
            print >> sys.stderr, 'Evaluating {0}.'.format(d1)
            # use a lower min_cov here because of a few close cases where BQ filtering lowered coverage
            good_positions = get_positions_to_recover(mapfile, mpileup, ki11_snps, min_cov=30)
            name = d1.split('/')[1]
            eval_isophase(vcffile, ki11_snps, good_positions, out_f, name)
    out_f.close()
    return ki11_snps
def main():
    """main function (shutup pylint)
    """
    assert len(sys.argv) == 2
    f = sys.argv[1]
    assert os.path.exists(f)

    print_vcf_header(sys.stdout)
    vcfreader = vcf.VCFReader(filename=f)
    for v in vcfreader:
        assert len(v.ALT) == len(v.INFO['TYPE'])
        for i in range(len(v.ALT)):
            t = v.INFO['TYPE'][i]
            a = str(v.ALT[i])
            if t == 'snp':
                print_snp(sys.stdout, v.CHROM, v.POS, v.REF, a, v.QUAL, "snp")
            elif t == 'mnp':
                assert len(v.REF) > 1
                assert len(v.REF) == len(a)
                # decompose the MNP into per-base SNPs; use a separate index
                # so we don't shadow the outer ALT loop variable
                for j in range(len(v.REF)):
                    print_snp(sys.stdout, v.CHROM, v.POS + j, v.REF[j], a[j], v.QUAL, "mnp")
            else:
                # FIXME handle indels
                pass
def main_brangus(unzip_snps=None):
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda: {})
        for r in vcf.VCFReader(open('Brangus.unzip.vcf')):
            unzip_snps[r.CHROM][r.POS] = r
        print >> sys.stderr, 'Finished reading Brangus.unzip.vcf.'

    out_f = open('evaled.isophase.txt', 'w')
    out_f.write('dir\tchrom\tpos\tref\talt_g\talt_i\tin_g\tin_i\n')
    dirs = glob.glob('by_loci/*size*/')
    for d1 in dirs:
        mpileup = os.path.join(d1, 'ccs.mpileup')
        mapfile = os.path.join(d1, 'fake.mapping.txt')
        vcffile = os.path.join(d1, 'phased.partial.vcf')
        nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
        if not os.path.exists(vcffile):
            assert os.path.exists(nosnp)
            print >> sys.stderr, 'Skipping {0} because no SNPs found.'.format(d1)
        else:
            print >> sys.stderr, 'Evaluating {0}.'.format(d1)
            good_positions = get_positions_to_recover(mapfile, mpileup, unzip_snps, min_cov=40)
            name = d1.split('/')[1]
            eval_isophase(vcffile, unzip_snps, good_positions, out_f, name)
    out_f.close()
    return
def eval_isophase(isophase_vcf, genome_snp, good_positions, out_f, name='NA'):
    for r in vcf.VCFReader(open(isophase_vcf)):
        r.CHROM = r.CHROM.split('|')[0]
        if (r.CHROM, r.POS) not in good_positions:
            alt_g = 'NA'
            in_g = 'N'
            in_i = 'Y'
        else:
            alt_g = genome_snp[r.CHROM][r.POS].ALT[0]
            in_g = 'Y'
            in_i = 'Y'
            good_positions.remove((r.CHROM, r.POS))
        out_f.write('{name}\t{chrom}\t{pos}\t{ref}\t{alt_g}\t{alt_i}\t{in_g}\t{in_i}\n'.format(
            name=name, chrom=r.CHROM, pos=r.POS, ref=r.REF,
            alt_g=alt_g, alt_i=r.ALT[0], in_g=in_g, in_i=in_i))
    for chrom, pos in good_positions:
        r = genome_snp[chrom][pos]
        out_f.write('{name}\t{chrom}\t{pos}\t{ref}\t{alt_g}\tNA\tY\tN\n'.format(
            name=name, chrom=chrom, pos=pos, ref=r.REF, alt_g=r.ALT[0]))
def _add_reject_flag(in_file, config):
    """Add REJECT flag to all records that aren't flagged somatic (SS=2)"""
    Filter = namedtuple('Filter', ['id', 'desc'])
    reject_filter = Filter(id='REJECT',
                           desc='Rejected as non-SOMATIC or by quality')
    # NOTE: PyVCF will write an uncompressed VCF
    base, ext = utils.splitext_plus(in_file)
    name = "rejectfix"
    out_file = "{0}-{1}{2}".format(base, name, ".vcf")

    if utils.file_exists(in_file):
        reader = vcf.VCFReader(filename=in_file)
        # Add info to the header of the reader
        reader.filters["REJECT"] = reject_filter
        with file_transaction(config, out_file) as tx_out_file:
            with open(tx_out_file, "wb") as handle:
                writer = vcf.VCFWriter(handle, template=reader)
                for record in reader:
                    if "SS" in record.INFO:
                        # VarScan encodes it as a string
                        # TODO: Set it as integer when cleaning
                        if record.INFO["SS"] != "2":
                            record.add_filter("REJECT")
                    writer.write_record(record)
        # Re-compress the file
        out_file = bgzip_and_index(out_file, config)
        move_vcf(in_file, "{0}.orig".format(in_file))
        move_vcf(out_file, in_file)
        with open(out_file, "w") as out_handle:
            out_handle.write("Moved to {0}".format(in_file))
def fp_validation(vcf_fn):
    path = vcf_fn.split(os.sep)
    vcf_id = path[-3] + ":" + path[-1]
    vars_file = vcf.VCFReader(open(vcf_fn))
    for record in vars_file:
        (pb_ao, pb_cov) = validate_variant(record, pacbio_bam, reference_pyfasta)
        (ts_ao, ts_cov) = validate_variant(record, ts_bam, reference_pyfasta)
        (unphased, hap_1, hap_2) = get_phased_counts_variant(record, crg_bam, reference_pyfasta)
        # see tenkit/bio_io.py for some examples of getting stuff out of vcfs
        row = {'vcf.ID': vcf_id,
               'chrom': record.CHROM,
               'pos': record.POS,
               'PacBio_ALT': pb_ao,
               'PacBio_COV': pb_cov}
        row.update({'TruSeq_ALT': ts_ao, 'TruSeq_COV': ts_cov})
        row.update({'Unphased_ALT': unphased[0], 'Unphased_COV': unphased[1]})
        row.update({'Hap1_ALT': hap_1[0], 'Hap1_COV': hap_1[1],
                    'Hap2_ALT': hap_2[0], 'Hap2_COV': hap_2[1]})
        rows.append(row)
def eval_isophase_phaseswitch(isophase_vcf, config_file, out_f, name='NA'):
    _chr, _start, _end, _strand = read_config(config_file)
    reader = vcf.VCFReader(open(isophase_vcf))

    # record the first SNP for each isoform
    prev = {}  # sample -> CallData.GT (ex: '0|1')
    r = next(reader)
    for c in r.samples:
        prev[c.sample] = c.data.GT

    num_switch = 0
    for r in reader:
        for c in r.samples:
            if c.data.GT.find('|') == -1:
                continue  # ignore those with just one allele
            a, b = c.data.GT.split('|')
            if a == b:
                continue  # for now, ignore IsoPhase results that only use one allele
            if prev[c.sample] != c.data.GT:
                num_switch += 1
            prev[c.sample] = c.data.GT

    out_f.write("{name}\t{chrom}\t{start}\t{end}\t{strand}\t{num_iso}\t{num_switch}\n".format(
        name=name, chrom=_chr, start=_start, end=_end, strand=_strand,
        num_iso=len(r.samples), num_switch=num_switch))
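For intuition on the switch counting above: only heterozygous phased genotypes ('a|b' with a != b) are informative, and a switch is recorded whenever a sample's GT differs from its previous informative GT. A tiny standalone version over bare GT strings, assuming phased input:

def count_switches(gts):
    # gts: phased GT strings for one sample, in positional order
    prev = gts[0]
    n = 0
    for gt in gts[1:]:
        a, b = gt.split('|')
        if a == b:
            continue  # homozygous sites carry no phase information
        if gt != prev:
            n += 1
        prev = gt
    return n

assert count_switches(['0|1', '0|1', '1|0']) == 1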
def main_brangus(vcf_filename, out_filename, unzip_snps=None):
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda: {})
        for r in vcf.VCFReader(open(vcf_filename)):
            unzip_snps[r.CHROM][r.POS] = r
        print('Finished reading ' + vcf_filename, file=sys.stderr)

    out_f = open(out_filename, 'w')
    FIELDS = ['dir', 'chrom', 'pos', 'strand', 'ref', 'alt_Short', 'alt_PB',
              'in_Short', 'in_PB', 'cov_Short', 'cov_PB', 'genomic_HP']
    writer = DictWriter(out_f, FIELDS, delimiter='\t')
    writer.writeheader()

    dirs = glob.glob('by_loci/*size*/')
    for d1 in dirs:
        mpileup = os.path.join(d1, 'ccs.mpileup')
        mapfile = os.path.join(d1, 'fake.mapping.txt')
        vcffile = os.path.join(d1, 'phased.partial.vcf')
        config = os.path.join(d1, 'config')
        nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
        if not os.path.exists(vcffile):
            assert os.path.exists(nosnp)
            print('Skipping {0} because no SNPs found.'.format(d1), file=sys.stderr)
        else:
            print('Evaluating {0}.'.format(d1), file=sys.stderr)
            strand = 'NA'
            if os.path.exists(config):
                # find the strand this gene family is on
                for line in open(config):
                    if line.startswith('ref_strand='):
                        strand = line.strip().split('=')[1]
            good_positions, cov_at_pos = get_positions_to_recover(mapfile, mpileup, unzip_snps, min_cov=30)
            name = d1.split('/')[1]
            eval_isophase(vcffile, unzip_snps, good_positions, cov_at_pos, {}, {}, writer, name, strand)
    out_f.close()
    return
def detect_vcf_annotation(filepath):
    """Return the name of the annotation parser to be used on the given file

    Called: In the importer and in the project wizard to display the detected annotations.

    :return: "vep", "snpeff", None
    """
    if cm.is_gz_file(filepath):
        # Open .gz files in binary mode (See #84)
        device = open(filepath, "rb")
    else:
        device = open(filepath, "r")

    std_reader = vcf.VCFReader(device)

    if "VEP" in std_reader.metadata:
        if "CSQ" in std_reader.infos:
            device.close()
            return "vep"

    if "SnpEffVersion" in std_reader.metadata:
        if "ANN" in std_reader.infos:
            device.close()
            return "snpeff"

    # no known annotation scheme detected; close the handle and return None
    device.close()
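The detector relies on two structures PyVCF builds while parsing the header: reader.metadata collects the ##key=value lines (VEP writes a ##VEP=... line, SnpEff a ##SnpEffVersion=... line) and reader.infos maps INFO IDs to their definitions. A quick way to eyeball both on a file of your own (the path is hypothetical):

import vcf

reader = vcf.VCFReader(open('annotated.vcf'))
print(list(reader.metadata.keys()))  # look for 'VEP' or 'SnpEffVersion'
print(list(reader.infos.keys()))     # look for 'CSQ' or 'ANN'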
def vcf2csv(vcffile, csvfile):
    """Write the variants in vcffile as CSV rows to csvfile ('-' for stdout)."""
    vcffh = vcf.VCFReader(filename=vcffile)
    fieldnames = ['Seq', 'Pos', 'Ref.Base', 'Var.Base', 'Qual', 'Allele Freq',
                  'Type', 'Homopolymer Length', 'Depth',
                  'Depth: ref fw and rev, var fw and rev']
    if csvfile == "-":
        csvfile = '/dev/stdout'
    with open(csvfile, 'wb') as csvfh:
        csvw = csv.DictWriter(csvfh, fieldnames=fieldnames)
        csvw.writeheader()
        for var in vcffh:
            assert len(var.ALT) == 1
            fields = [var.CHROM, var.POS, var.REF, var.ALT[0], var.QUAL, var.INFO['AF']]
            if var.is_indel:
                fields.extend(["INDEL", var.INFO['HRUN']])
            else:
                fields.extend(["SNV", "None"])
            fields.append(var.INFO['DP'])
            fields.append(','.join(str(x) for x in var.INFO['DP4']))
            row = dict(zip(fieldnames, fields))
            csvw.writerow(row)
def parse_vep_annotations_from_vcf(vcf_file_obj):
    """
    Iterate through the variants in a VEP annotated VCF, pull out annotation from CSQ field
    """
    r = vcf.VCFReader(vcf_file_obj)
    if "CSQ" not in r.infos:
        raise ValueError("CSQ field not found in %s header" % vcf_file_obj)
    csq_field_names = r.infos["CSQ"].desc.split("Format: ")[1].split("|")
    csq_field_names = map(lambda s: s.lower(), csq_field_names)

    for vcf_row in r:
        vep_annotations = []
        for i, per_transcript_csq_string in enumerate(vcf_row.INFO["CSQ"]):
            csq_values = per_transcript_csq_string.split('|')

            # sanity-check the csq_values
            if len(csq_values) != len(csq_field_names):
                raise ValueError("CSQ per-transcript string %s contains %s values instead of %s:\n%s" % (
                    i, len(csq_values), len(csq_field_names), per_transcript_csq_string))

            vep_annotation = dict(zip(csq_field_names, csq_values))
            vep_annotation['is_nmd'] = "NMD_transcript_variant" in csq_values
            # 2 kinds of 'nc_transcript_variant' label due to name change in Ensembl v77
            vep_annotation['is_nc'] = ("nc_transcript_variant" in csq_values or
                                       "non_coding_transcript_variant" in csq_values)

            variant_consequence_strings = vep_annotation["consequence"].split("&")
            vep_annotation["consequence"] = get_worst_vep_annotation(variant_consequence_strings)
            vep_annotations.append(vep_annotation)

        vcf_fields = [vcf_row.CHROM, vcf_row.POS, vcf_row.ID, vcf_row.REF,
                      ",".join(map(str, vcf_row.ALT))]
        variant_objects = vcf_stuff.get_variants_from_vcf_fields(vcf_fields)
        for variant_obj in variant_objects:
            yield variant_obj, vep_annotations
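The CSQ column names come straight out of the INFO header description, which VEP writes as "...Format: Allele|Consequence|IMPACT|...". A worked example of the split used above, on an abbreviated description:

desc = "Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL"
csq_field_names = desc.split("Format: ")[1].split("|")
assert csq_field_names == ["Allele", "Consequence", "IMPACT", "SYMBOL"]
# each per-transcript CSQ entry is then zipped positionally against these names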
def load_data(file_name):
    vcf_reader = vcf.VCFReader(open(file_name, 'rb'))
    # collect the INFO 'TYPE' annotation (the 8th VCF column) of every record;
    # PyVCF records are not list-indexable, so go through record.INFO
    types = []
    for record in vcf_reader:
        types.append(record.INFO['TYPE'])
    return types
def parse_vcf(vcf_file_name):
    import vcf
    import numpy as np

    vcf_reader = vcf.VCFReader(open(vcf_file_name, 'rb'))
    vcf_sample_matrix = []
    i = 0
    samples_list = []
    for record in vcf_reader:
        i += 1
        samples = [record.POS]
        for call in record.samples:
            # record the sample names once, on the first record
            if i == 1:
                samples_list.append(call.sample)
            # encode missing genotypes as the sentinel 100, otherwise use gt_type
            if call.gt_type is None:
                samples.append(100)
            else:
                samples.append(call.gt_type)
        vcf_sample_matrix.append(samples)

    return (np.asarray(vcf_sample_matrix, dtype=np.float32), samples_list)
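The sentinel 100 stands in for missing genotypes because PyVCF's call.gt_type encodes zygosity as 0 (hom-ref), 1 (het), 2 (hom-alt) and None for uncalled './.' genotypes, and a float matrix needs a numeric placeholder. The mapping in isolation:

def encode_gt(gt_type):
    # PyVCF gt_type: None for './.', 0 hom-ref, 1 het, 2 hom-alt
    return 100 if gt_type is None else gt_type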
def juliet_json_to_vcf(json_filename, vcf_filename, gene_pos_info,
                       ref_name='NC_045512v2', sample_name='UnknownSample'):
    # write a minimal template VCF so PyVCF has a header to build the writer from
    with open('template.vcf', 'w') as f:
        f.write(__VCF_EXAMPLE__ + '\n')
    reader = vcf.VCFReader(open('template.vcf'))
    reader.samples = [sample_name]
    f_vcf = vcf.Writer(open(vcf_filename, 'w'), reader)

    h = open(json_filename)
    data = json.load(h)
    for g in data['genes']:
        for v in g['variant_positions']:
            cov = v['coverage']
            ref_codon = v['ref_codon']
            abs_pos = gene_pos_info[g['name']] + 3 * (v['ref_position'] - 1)
            ind = 0
            for vac in v['variant_amino_acids']:
                for cur in vac['variant_codons']:
                    ind += 1
                    _id = "{g}.{r}.{ind}".format(g=g['name'], r=v['ref_position'], ind=ind)
                    codon = cur['codon']
                    # emit one substitution record per mismatching codon base
                    for codon_offset in range(3):
                        if ref_codon[codon_offset] != codon[codon_offset]:
                            freq = "{0:.6f}".format(cur['frequency'])
                            rec = vcf.model._Record(
                                CHROM=ref_name,
                                POS=abs_pos + codon_offset,
                                ID=_id,
                                REF=ref_codon[codon_offset],
                                ALT=[vcf.model._Substitution(codon[codon_offset])],
                                QUAL='.',
                                FILTER='PASS',
                                INFO={'AF': freq, 'DP': cov},
                                FORMAT="GT",
                                sample_indexes=None)
                            samp_ft = vcf.model.make_calldata_tuple(['GT'])
                            rec.samples.append(
                                vcf.model._Call(rec, sample_name, samp_ft(*["0|1"])))
                            f_vcf.write_record(rec)
    f_vcf.close()
    h.close()
def replaceFunc(fn, sourcevcfpaths, sourceext, outpath, outext):
    # ensure each source path ends with a '/'
    for i in range(len(sourcevcfpaths)):
        if sourcevcfpaths[i][-1] != '/':
            sourcevcfpaths[i] = sourcevcfpaths[i] + '/'
    sourcename = map(lambda l: l.strip('/').split('/')[-1], sourcevcfpaths)
    sourcedata = {}
    sample = getSampleName(fn)
    for i in range(len(sourcevcfpaths)):
        vcffile = vcf.VCFReader(open(sourcevcfpaths[i] + sample + sourceext))
        sourcedata[sourcename[i]] = []
        for r in vcffile:
            sourcedata[sourcename[i]].append([r.CHROM, '%s' % r.POS, r.ALT, '%s' % r.QUAL])

    txt = open(fn).readlines()
    header = filter(lambda l: l[0] == '#', txt)
    txt = filter(lambda l: l[0] != '#', txt)
    for li in range(len(txt)):
        record = txt[li].split('\t')
        found = False
        repstr = []
        for s in sourcename:
            # match on CHROM, POS and at least one shared ALT allele
            match = filter(lambda l: l[0] == record[0] and l[1] == record[1] and
                           len(set(tuple(l[2])).intersection(set(record[4].split(',')))) > 0,
                           sourcedata[s])
            if len(match) > 0:
                found = True
                A = ''
                for fieldidx in range(5):
                    A = A + record[fieldidx] + '\t'
                repstr.append([A + record[5], A + match[0][3]])
                repstr.append([record[7], record[7] + ';qualSource=%s' % s])
                break
        if not found:
            print 'Error: record not found in', fn, 'record', record
            exit()
        for A, B in repstr:
            txt[li] = txt[li].replace(A, B)

    addHeader = False
    fout = open(outpath + sample + outext, 'w')
    for l in header:
        if '##INFO=<ID=' in l and not addHeader:
            fout.write('##INFO=<ID=qualSource,Number=1,Type=String,Description="QUAL Source">\n')
            addHeader = True
        fout.write('%s' % l)
    for l in txt:
        fout.write('%s' % l)
    fout.close()
def run(self):
    with self.input().open() as f:
        variants = list(vcf.VCFReader(f))
    # `maf` is a mutation DataFrame assumed to be defined at module level
    maf['rna_af'] = maf['genome_change'].map(lambda gc: get_af(gc, variants))
    with self.output().open('w') as f:
        maf.to_csv(f, index=False)
def load_project_variants_from_vcf(project_id, vcf_files, mark_as_loaded=True,
                                   start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already

    Args:
        project_id: the project id as a string
        vcf_files: a list of one or more vcf file paths
    """
    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print(date.strftime(datetime.now(),
                        "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_file in vcf_files:
        if not os.path.isfile(vcf_file):
            print("Skipping " + vcf_file)
            continue
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_file)
        mall.get_annotator().add_preannotated_vcf_file(
            vcf_file, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            print("Skipping %(vcf_file)s since it's not in %(vcf_files)s" % locals())
            continue
        print("Loading families for VCF file: " + vcf_file)
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            load_variants_for_family_list(
                project, families[i:i + settings.FAMILY_LOAD_BATCH_SIZE], vcf_file,
                mark_as_loaded=mark_as_loaded,
                start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    print(date.strftime(datetime.now(),
                        "%m/%d/%Y %H:%M:%S -- finished loading project: " + project_id))
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False,
                          ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already
    """
    print "Loading project %s" % project_id
    print(date.strftime(datetime.now(),
                        "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        if not os.path.isfile(vcf_obj.path()):
            print("Skipping " + vcf_obj.path())
            continue
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_obj.path())
        mall.get_annotator().add_preannotated_vcf_file(
            vcf_obj.path(), force=force_load_annotations,
            start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            families = [f for f in families
                        if get_mall(project.project_id).variant_store.get_family_status(
                            project_id, f.family_id) != 'loaded']
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            print(date.strftime(datetime.now(),
                                "%m/%d/%Y %H:%M:%S -- loading project: " + project_id +
                                " - families batch %d - %d families" %
                                (i, len(families[i:i + settings.FAMILY_LOAD_BATCH_SIZE]))))
            load_variants_for_family_list(
                project, families[i:i + settings.FAMILY_LOAD_BATCH_SIZE], vcf_file,
                start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
def eval_isophase(isophase_vcf, genome_snp, good_positions, cov_at_pos,
                  repeat_by_chrom, shortread_cov, writer_f, name='NA', strand='NA'):
    for r in vcf.VCFReader(open(isophase_vcf)):
        out = {'dir': name, 'chrom': 'NA', 'pos': r.POS, 'strand': strand,
               'ref': r.REF, 'alt_Short': 'NA', 'alt_PB': 'NA',
               'in_Short': 'NA', 'in_PB': 'NA',
               'cov_Short': 'NA', 'cov_PB': 'NA', 'genomic_HP': 'NA'}
        r.CHROM = r.CHROM.split('|')[0]
        out['chrom'] = r.CHROM
        out['alt_PB'] = r.ALT[0]
        out['genomic_HP'] = ('Y' if (r.CHROM in repeat_by_chrom and
                                     len(repeat_by_chrom[r.CHROM].find(r.POS, r.POS)) > 0)
                             else 'N')
        try:
            out['cov_Short'] = shortread_cov[r.CHROM][r.POS]
        except KeyError:
            out['cov_Short'] = 0
        out['cov_PB'] = cov_at_pos[r.CHROM, r.POS - 1]
        if (r.CHROM, r.POS) not in good_positions:
            out['alt_Short'] = 'NA'
            out['in_Short'] = 'N'
            out['in_PB'] = 'Y'
        else:
            out['alt_Short'] = genome_snp[r.CHROM][r.POS].ALT[0]
            out['in_Short'] = 'Y'
            out['in_PB'] = 'Y'
            good_positions.remove((r.CHROM, r.POS))
        writer_f.writerow(out)

    # now we write out everything that is only in Shortread
    for chrom, pos in good_positions:
        out = {'dir': name, 'chrom': chrom, 'pos': pos, 'strand': strand,
               'ref': genome_snp[chrom][pos].REF,
               'alt_Short': genome_snp[chrom][pos].ALT[0],
               'alt_PB': 'NA', 'in_Short': 'Y', 'in_PB': 'N',
               'cov_Short': 'NA', 'cov_PB': cov_at_pos[chrom, pos - 1],
               'genomic_HP': ('Y' if (chrom in repeat_by_chrom and
                                      len(repeat_by_chrom[chrom].find(pos, pos)) > 0)
                              else 'N')}
        try:
            out['cov_Short'] = shortread_cov[chrom][pos]
        except KeyError:
            out['cov_Short'] = 0
        writer_f.writerow(out)
def parse_vcf_get_pairwise_n_snp(vcf_file_name, core_list=False):
    import vcf
    import numpy as np

    vcf_reader = vcf.VCFReader(open(vcf_file_name, 'rb'))
    vcf_sample_matrix = []
    i = 0
    samples_list = []
    position2index = {}
    nsnp = {}
    for record in vcf_reader:
        i += 1
        position2index[record.POS] = record.ID
        samples = [record.POS]
        for call in record.samples:
            # initialize the per-sample counters on the first record
            if i == 1:
                samples_list.append(call.sample)
                nsnp[call.sample] = {"identical": 0, "diff": 0, "NA": 0, "snp": []}
            if core_list:
                # only count positions present in the core genome list
                if call.gt_type is None and record.POS in core_list:
                    samples.append(100)
                    nsnp[call.sample]["NA"] += 1
                elif call.gt_type == 0 and record.POS in core_list:
                    nsnp[call.sample]["identical"] += 1
                elif record.POS in core_list:
                    nsnp[call.sample]["diff"] += 1
                    nsnp[call.sample]["snp"].append(record.POS)
                else:
                    continue
            else:
                if call.gt_type is None:
                    samples.append(100)
                    nsnp[call.sample]["NA"] += 1
                elif call.gt_type == 0:
                    nsnp[call.sample]["identical"] += 1
                else:
                    nsnp[call.sample]["diff"] += 1
                    nsnp[call.sample]["snp"].append(record.POS)
        vcf_sample_matrix.append(samples)

    for sample in nsnp:
        print sample, "NA:", nsnp[sample]["NA"], "idem:", nsnp[sample]["identical"], \
            "diff:", nsnp[sample]["diff"], "TOTAL:", \
            nsnp[sample]["NA"] + nsnp[sample]["identical"] + nsnp[sample]["diff"]
def vcf_to_stats(in_handle, target):
    d = collections.defaultdict(list)
    for rec in vcf.VCFReader(in_handle):
        data = rec.samples[0].data
        d["target"] = target
        d["DP"].append(data.DP)
        d["QUAL"].append(rec.QUAL)
        d["GT"].append(data.GT)
        d["AD"].append(percent_ad_deviation(data))
        d["QR_QA"].append(strand_bias(data))
    return pd.DataFrame(d)
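A hedged usage sketch: because vcf_to_stats returns one DataFrame per input, per-target comparisons are a concat away (the file names here are hypothetical):

import pandas as pd

frames = []
for target, fname in [('train', 'train.vcf'), ('test', 'test.vcf')]:
    with open(fname) as in_handle:
        frames.append(vcf_to_stats(in_handle, target))
combined = pd.concat(frames)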
def load_gvcf_allele_dict(self, sample_name_fn=lambda x: x):
    with open(self.gvcf) as h:
        variantes = list(tqdm(vcf.VCFReader(h)))
    for v in tqdm(variantes):
        for sample in v.samples:
            if sample.data.GT != ".":
                self.sample_variants[sample_name_fn(sample.sample)].append(
                    str(v.POS) + ":" + str(v.ALT[int(sample.data.GT) - 1]))
    self.sample_variants = dict(self.sample_variants)
    for x in self.sample_variants:
        self.sample_variants[x] = set(self.sample_variants[x])
def __main__():
    # check arguments
    if len(sys.argv) != 8:
        print(len(sys.argv))
        print('python make_snps_files.py [gff file] [vcf file] [scaf_number] '
              '[distance] [out directory] [all/noncoding] [annotation file]')
        sys.exit()

    # read in the gff
    parseOut = gff_parse.gff_parse(sys.argv[1], int(sys.argv[4]))
    gffList = parseOut['gffList']
    geneDict = parseOut['geneDict']

    # read in the annotation file (only needed in 'noncoding' mode)
    if sys.argv[6] == 'noncoding':
        annot = open(sys.argv[7], 'r')
        annotDic = makeAnnotDic(annot)

    # read in the vcf
    vcf_reader = vcf.VCFReader(open(sys.argv[2], 'r'))
    nameList = vcf_reader.samples
    for record in vcf_reader:
        entry = vcf_parse.vcfParse(record, nameList)
        # see if it's in a gene!
        if len(gffList[entry['pos']]) == 0:
            continue
        # is it in a coding site and are we making noncoding?
        elif sys.argv[6] == 'noncoding':
            if annotDic[entry['pos']] == 1:  # make it an integer
                continue
        # add the snp to each gene
        for gene in gffList[entry['pos']]:
            geneDict[gene].append(entry)

    # write out
    for gene in geneDict:
        out = open(sys.argv[5] + gene + ".scaf" + str(sys.argv[3]) + ".snps", "w")
        # write out first line of individual names
        out.write(gene)
        for ind in nameList:
            out.write(" " + str(ind))
        # write out snp genotypes
        for snp in geneDict[gene]:
            out.write("\n" + str(snp['pos']))
            for ind in nameList:
                out.write(" " + str(snp["genotypes"][ind]))
def _23andme_exome(path):
    if vcf is None:
        raise RuntimeError("PyVCF not available, please 'easy_install' it.")

    for r in vcf.VCFReader(open(path, "r")):
        if not r.is_snp:
            continue  # XXX Is it even possible?
        for sample in r.samples:
            yield SNP(name=r.ID,
                      chromosome=r.CHROM,
                      position=r.POS,
                      genotype=sample.gt_bases.replace("/", ""))
def read_vcf_metrics(in_handle, metrics, format_metrics, target, use_subset=False):
    d = {"target": [], "indel": [], "zygosity": [],
         "QUAL": [], "AD": [], "PL": []}
    zygosity_map = {"0/0": 0, "0/1": 1, "0|1": 1, "1/1": 2, "2/1": 2, "1/2": 2,
                    "0": 0, "1": 2}
    for x in metrics + format_metrics:
        d[x] = []
    if use_subset:
        recs = itertools.islice(vcf.VCFReader(in_handle), 10000)
    else:
        recs = vcf.VCFReader(in_handle)
    for rec in recs:
        d["zygosity"].append(zygosity_map[rec.samples[0].data.GT])
        for x in metrics:
            d[x].append(rec.INFO.get(x, None))
        d["target"] = target  # scalar; pandas broadcasts it across rows
        d["AD"].append(_calc_ad(rec.samples[0].data))
        d["PL"].append(_calc_pl(rec.samples[0].data))
        format_dp = _calc_dp(rec.samples[0].data)
        d["DP"].append(format_dp)
        d["QUAL"].append(rec.QUAL)
        d["indel"].append(int(rec.is_indel))
    return pandas.DataFrame(d)
def main():
    """main function
    """
    vcf_fh = dict()
    parser = cmdline_parser()
    args = parser.parse_args()
    for (k, v) in [('FN', args.vcf_fn),
                   ('normal_rlx', args.vcf_nrlx),
                   ('normal_str', args.vcf_nstr),
                   ('tumor_rlx', args.vcf_trlx),
                   ('tumor_str', args.vcf_tstr),
                   ('somatic_raw', args.vcf_sraw),
                   ('somatic_final', args.vcf_sfinal),
                   ('somatic_final_minus_dbsnp', args.vcf_sfinal_wo_dbsnp)]:
        try:
            vcf_fh[k] = vcf.VCFReader(filename=v)
        except Exception:
            sys.stderr.write("Reading %s failed\n" % v)
            raise

    sys.stderr.write("Analyzing FN %s and friends\n" % vcf_fh['FN'].filename)
    ORDER = ['normal_rlx', 'normal_str', 'tumor_rlx', 'tumor_str',
             'somatic_raw', 'somatic_final', 'somatic_final_minus_dbsnp']
    print("#CHROM\tPOS\tREF\tALT\t%s" % ('\t'.join(ORDER)))
    for fn in vcf_fh['FN']:
        present_in = dict()
        for k in ORDER:
            present_in[k] = 0
            # fetch() uses 0-based, half-open coordinates, hence POS-1..POS
            for t in vcf_fh[k].fetch(fn.CHROM, fn.POS - 1, fn.POS):
                assert len(fn.REF) == len(t.REF)
                assert len(fn.ALT) == 1
                assert len(t.ALT) == 1
                if t.ALT[0] == fn.ALT[0]:
                    q = t.QUAL if t.QUAL else "."
                    try:
                        present_in[k] = "Q=%s;SB=%s;DP=%d;AF=%f" % (
                            q, t.INFO['SB'], t.INFO['DP'], t.INFO['AF'])
                    except KeyError:
                        sys.stderr.write("Key Error. Dropping to debugger\n")
                        import pdb
                        pdb.set_trace()
                    break
        print("%s\t%s\t%s\t%s\t%s" % (
            fn.CHROM, fn.POS, fn.REF, fn.ALT[0],
            '\t'.join(["%s" % present_in[k] for k in ORDER])))
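VCFReader.fetch is backed by tabix, so every VCF queried this way must be bgzip-compressed with a .tbi index alongside it, and the coordinates follow the 0-based, half-open convention, which is why the lookup above asks for (POS-1, POS). A minimal sketch with a hypothetical file:

# shell: bgzip calls.vcf && tabix -p vcf calls.vcf.gz
import vcf

reader = vcf.VCFReader(filename='calls.vcf.gz')
for rec in reader.fetch('chr1', 999, 1000):  # records overlapping position 1000
    print(rec.CHROM, rec.POS, rec.REF, rec.ALT)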