def main(options):
    """Rename the sample columns of a VCF streamed through stdin/stdout.

    The sample map is built by ``load_sample_map`` from the file named by
    ``options.convfile``.  Meta lines (``##``) and data lines are echoed
    unchanged; on the column-header line (single ``#``) every sample column
    returned by ``VCFrecord.get_prfx_sfx`` is replaced by its entry in the map.

    If the map cannot be loaded the error type is printed and the process
    exits.

    Returns ``count``, which is never incremented here and is therefore
    always 0.
    """
    try:
        # The file is only needed while the map is being built, so close it
        # as soon as that is done.
        with open(options.convfile, "r") as fh:
            smap = load_sample_map(fh)
    except Exception:
        # Report-and-quit as before, but no longer intercepting
        # KeyboardInterrupt / SystemExit.
        print("%s %s" % ("Unexpected error:", sys.exc_info()[0]))
        sys.exit()

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('#') and not line.startswith('##'):
            vcfr = VCFrecord(line)
            prfx, sfx = vcfr.get_prfx_sfx()
            # smap is indexed directly: a sample name absent from the map is
            # an error rather than being passed through unchanged.
            renamed = [smap[name] for name in sfx]
            print("\t".join(prfx) + "\t" + "\t".join(renamed))
        else:
            print(line)
    return count
def get_variant_summary_probs(self, rsid, threshold):
    """Return the stored documents for ``rsid`` annotated with summary values.

    Every document yielded by ``self.var_coll.get_variant_data_multi`` has the
    keys ``selected``, ``a_af``, ``b_af``, ``hwe_p`` and ``Missing`` set from
    the result of ``self.var_coll.get_genotype_probs`` on its raw record.

    Returns a ``(variant_array, msg)`` tuple; ``msg`` is the empty string
    unless no document was found.
    """
    summaries = []
    for record in self.var_coll.get_variant_data_multi(rsid):
        # always force chromosome to 2 digits
        chrom = "%.2d" % int(record["chromosome"])
        vcf_path = self.filepaths_coll.get_filepath(record["assaytype"], chrom)
        raw_line = self.var_coll.get_raw_variant_values(
            vcf_path, chrom, record['position'])
        parsed = VCFrecord(raw_line)
        _prefix, genotypes = parsed.get_prfx_sfx()
        prob_index = parsed.get_probidx()
        # Only the category counts, allele frequencies and HWE p-value are
        # used below; the remaining fields are unpacked and ignored.
        (category_counts, _n_samples, _n_genotypes, _maf, freq_a, freq_b,
         hwe_p) = self.var_coll.get_genotype_probs(genotypes, threshold,
                                                   prob_index)
        record['selected'] = 1
        record['a_af'] = freq_a
        record['b_af'] = freq_b
        record['hwe_p'] = hwe_p
        if 'Missing' in category_counts:
            record['Missing'] = category_counts['Missing']
        else:
            record['Missing'] = 0
        summaries.append(record)

    if summaries:
        return (summaries, "")
    return (summaries, "Variant NOT FOUND - %s, " % (rsid))
def process_variant_detail_vcf(self, record, assaytype):
    """Process a VCF variant detail record.

    Builds a JSON-style document (assay type, rsid, chromosome, alleles,
    position and the optional ``ref_maf`` / ``info`` values) from ``record``
    and appends it to ``self.variantbuff``.

    ``ref_maf`` is left out when the ``RefPanelAF`` value cannot be read as a
    float; ``info`` falls back to 1.0 when the ``INFO`` value cannot.
    """
    doc = {}
    doc["assaytype"] = assaytype
    vcfr = VCFrecord(record)
    doc["rsid"] = vcfr.get_varid()
    # always store chromosome as a 2-digit string
    doc["chromosome"] = "%.2d" % (int(vcfr.get_chr()))
    alleleA, alleleB = vcfr.get_alleles()
    doc["alleleA"] = alleleA
    doc["alleleB"] = alleleB
    doc["position"] = vcfr.get_posn_as_int()
    try:
        doc["ref_maf"] = float(vcfr.get_info_value("RefPanelAF"))
    except Exception:
        # Deliberate best effort: the key is simply omitted.  Catching
        # Exception (not everything) lets Ctrl-C and SystemExit propagate.
        pass
    try:
        doc["info"] = float(vcfr.get_info_value("INFO"))
    except Exception:
        doc["info"] = 1.0
    self.variantbuff.append(doc)
def main(options):
    """Load the sample columns of a VCF header (read from stdin) via GoDb.

    Meta lines (``##``) are skipped.  On the column-header line every sample
    column is handed to ``godb.process_sample_detail`` together with its
    zero-based index and ``options.assaytype``; the sample buffer is flushed
    whenever ``godb.get_samples_len()`` exceeds ``flush_at`` (a name defined
    outside this function) and once more at the end.  Reading stops at the
    first line that is not a meta line.

    Returns the number of sample columns processed.
    """
    try:
        godb = GoDb()
    except Exception:
        # Report-and-quit as before, but no longer intercepting
        # KeyboardInterrupt / SystemExit.
        print("%s %s" % ("Unexpected error:", sys.exc_info()[0]))
        sys.exit()

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue
        if line.startswith('#'):
            vcfr = VCFrecord(line)
            _prfx, sfx = vcfr.get_prfx_sfx()
            for idx, field in enumerate(sfx):
                count += 1
                godb.process_sample_detail(field, idx, options.assaytype)
                if godb.get_samples_len() > flush_at:
                    godb.flush_sample_buff()
        # Only the header is of interest; do not read the variant records.
        break
    godb.flush_sample_buff()
    print("")
    return count
def get_combined_array(self, buffer_list, cr_list, assay_list, threshold=0.9):
    """Merge the per-assay genotype columns of one variant into a single row.

    For each non-empty record in ``buffer_list`` (entry ``i`` belongs to
    ``assay_list[i]``) and for each of its sample columns that is not ".":
      1) find the column header from ``self.file_positions[i][j]``
      2) use the column header to find the slot in
         ``self.combined_positions``
      3) place the genotype called at ``threshold`` in that slot

    When the record has no "AT" format field the assay abbreviation is
    appended to the genotype.  If the slot is already occupied,
    ``self.geno_overlap_count`` is incremented and ``self.call_genotype``
    chooses the value to keep.

    ``cr_list`` is accepted but not used yet (TODO - CR check).

    Returns a list with one entry per combined position; slots no record
    filled hold ".".
    """
    combo_array = ["."] * len(self.combined_positions)
    for i, vcf_record in enumerate(buffer_list):
        if len(vcf_record) == 0:
            continue
        vcfr = VCFrecord(vcf_record)
        _prfx, data_list = vcfr.get_prfx_sfx()
        probidx = vcfr.get_probidx()
        has_at = vcfr.has_fmt("AT")
        for j, dataelem in enumerate(data_list):
            if dataelem == ".":
                continue
            cpos = self.combined_positions[self.file_positions[i][j]]
            geno = self.call_geno_for_threshold(dataelem, probidx, threshold)
            # Exact comparison kept on purpose: the return type of has_fmt
            # is not visible from this method.
            if has_at == False:  # noqa: E712
                geno = geno + ":" + self.assay_abbrev[assay_list[i]]
            if combo_array[cpos] != ".":
                self.geno_overlap_count += 1
                geno = self.call_genotype(combo_array[cpos], geno, probidx)
            combo_array[cpos] = geno
    return combo_array
def main():
    """Print the sample column names from the header of a VCF on stdin.

    Meta lines (``##``) are skipped; reading stops at the first other line.
    If that line is the column header (starts with ``#``), each sample column
    returned by ``VCFrecord.get_prfx_sfx`` is printed on its own line.

    Returns ``count``, which is never incremented and is therefore always 0.
    """
    count = 0
    for raw_line in sys.stdin:
        text = raw_line.strip()
        if text.startswith('##'):
            continue
        if text.startswith('#'):
            _prefix, sample_names = VCFrecord(text).get_prfx_sfx()
            for name in sample_names:
                print(name)
        break
    return count
def get_geno_data(self, rsid, sample_id, assaytype_list_posns):
    """Return one sample's raw genotype field for ``rsid``, per assay type.

    ``assaytype_list_posns`` maps an assay type to the sample's column index
    within that assay's genotype columns.  Documents whose assay type is not
    in the mapping are skipped before any file access, so no path lookup or
    record read is spent on them.

    Returns a dict keyed by ``"<sample_id>_<assaytype>"``.
    """
    geno_values = {}
    for doc in self.get_variant_data_multi(rsid):
        assaytype = doc["assaytype"]
        if assaytype not in assaytype_list_posns:
            continue
        # always force chromosome to 2 digits
        chromosome = "%.2d" % int(doc["chromosome"])
        fpath = self.filepaths_coll.get_filepath(assaytype, chromosome)
        fullrec = self.get_raw_variant_values(fpath, chromosome,
                                              doc['position'])
        _prfx, genodata = VCFrecord(fullrec).get_prfx_sfx()
        geno_values[sample_id + "_" + assaytype] = genodata[
            assaytype_list_posns[assaytype]]
    return geno_values
def main(options):
    """Print one sample's genotype field for one rsid, per assay type.

    For every variant document of ``options.rsid`` whose assay type appears
    in the sample's position map (``godb.get_sample_posns``), the variant
    record is read from file and a CSV line
    ``rsid,sampleid,assaytype,position-index,genotype-field`` is printed.
    Assay types that do not contain the sample are skipped before the file
    is read.

    Returns ``count``, which is never incremented and is therefore always 0.
    """
    godb = GoDb()
    count = 0

    # Step 1 - get the list of entries for each rsid - one per assaytype
    vardocs = godb.get_multiple_variants(options.rsid)
    sampposns = godb.get_sample_posns(options.sampleid)
    for doc in vardocs:
        assaytype = doc["assaytype"]
        if assaytype not in sampposns:
            continue
        filepath = godb.get_filepath(assaytype, doc["chromosome"])
        rec = godb.get_variant_file_data(filepath, doc["chromosome"],
                                         doc["position"])
        _prfx, sfx = VCFrecord(rec).get_prfx_sfx()
        posn = sampposns[assaytype]
        print("%s,%s,%s,%d,%s" % (options.rsid, options.sampleid, assaytype,
                                  posn, sfx[posn]))
    return count
def get_next_records(self, key_list, prfx_list, recbuff_list):
    """Advance the file handles that currently hold the lowest key.

    The main rule is that we read from the file handles corresponding to the
    minimum key (as reported by ``self.get_low_key_list``) and replace the
    matching ``key_list``, ``prfx_list`` and ``recbuff_list`` elements.  A
    handle whose next line is empty after stripping is treated as exhausted:
    its prefix and record buffer are emptied and its key is set to
    ``self.high_key``.

    The three lists are updated in place and also returned.
    """
    low_key_list, _low_key_count = self.get_low_key_list(key_list)
    for i, fh in enumerate(self.fh_list):
        if low_key_list[i] == self.empty_key:
            continue
        line = fh.readline().strip()
        if line != "":  # testing for EOF
            self.rec_counts[i] += 1
            vcfr = VCFrecord(line)
            prfx, sfx = vcfr.get_prfx_sfx()
            # The former get_maf_and_cr(data, vcfr) call was removed: `data`
            # is not bound anywhere in this method and the results were
            # never used.
            prfx_list[i] = prfx
            recbuff_list[i] = sfx
            key_list[i] = int(prfx[1])
        else:
            prfx_list[i] = []
            recbuff_list[i] = []
            key_list[i] = self.high_key
    return key_list, prfx_list, recbuff_list
def main(options):
    """Extract the requested rsids and print them as one merged VCF.

    The rsids come from ``options.snpfile`` (via ``load_snpfile_data``) when
    it is set, otherwise from the comma-separated ``options.rsids``.

    Steps:
      1) collect, per rsid, the variant documents of the included assay types
      2) collect the sample id lists per assay type
      3) get the combined column layout from ``Multibuffermerge``
      4) read each variant's record from file, keeping it only when the
         record's id equals the rsid
      5) print the header line and one merged record per rsid; when
         ``options.check == 'Y'`` a record is printed only if the concordancy
         check passes

    Returns the number of merged records printed.
    """
    # Only assay types listed here are merged.  Earlier revisions of this
    # line also used: affy, illumina, metabo, exome, bigtest, biggertest.
    included_assaytypes = {"broad": 1}

    rsids = []
    godb = GoDb()
    try:
        if options.snpfile is not None:
            # Close the SNP file as soon as the ids are loaded.
            with open(options.snpfile, "r") as fh:
                rsids = load_snpfile_data(fh)
        else:
            rsids = options.rsids.split(",")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit()
    except TypeError as e:
        print("%s %s" % ("Missing arguments ", e))
        sys.exit()
    except Exception:
        logging.info("Unexpected error: %s", str(sys.exc_info()))
        sys.exit()

    # Step 0 - instantiate helper objects
    mafh = Mafhelper()

    # Data structures
    atype_list = []
    atype_posns = {}
    rsid_assaytypes = {}
    rsid_dict = {}
    rsid_cr_dict = {}
    hdr_pref = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
                "INFO", "FORMAT"]

    # Step 1 - get the list of entries for each rsid - one per assaytype
    for rsid in rsids:
        docs = godb.get_multiple_variants(rsid)
        if docs.count() > 0:
            rsid_assaytypes[rsid] = []
        else:
            logging.info("RSID %s NOTFOUND", rsid)
        # Step 1a - collect assaytypes and marker documents.
        # At this point we're establishing a list order which must be
        # observed throughout.
        for doc in docs:
            if doc["assaytype"] not in included_assaytypes:
                continue
            if doc["assaytype"] not in atype_list:
                atype_list.append(doc["assaytype"])
            rsid_assaytypes[rsid].append(doc)
    logging.info(str(atype_list))

    # Step 2 - collect lists of prochis (sample ids) by assaytype
    prochi_list = []
    for i, atype in enumerate(atype_list):
        atype_posns[atype] = i
        prochi_list.append(godb.get_samples(atype))
    mm = Multibuffermerge(prochi_list)

    # Step 3 - get combined col_header positions.
    # The returned {posn: colname} mapping is not used here; the call is kept
    # in case it primes state inside Multibuffermerge - TODO confirm.
    mm.get_combined_positions()
    # combocol is a list [colname1, colname2, ...]; its order is kept intact
    combocol = mm.get_combined_columns()

    # Step 4 - for each variant by rsid
    for rsid in rsid_assaytypes:
        if rsid not in rsid_dict:
            # One slot per assay type; slots are replaced by index below, so
            # sharing the initial empty list is harmless.
            rsid_dict[rsid] = [[]] * len(atype_list)
            rsid_cr_dict[rsid] = [[]] * len(atype_list)
        for doc in rsid_assaytypes[rsid]:
            if options.prfx is not None:
                fpath = godb.get_full_filepath(doc["assaytype"],
                                               doc["chromosome"],
                                               options.prfx)
            else:
                fpath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
            logging.info("Assaytype=%s fpath=%s", doc["assaytype"], fpath)
            result = godb.get_variant_file_data(fpath, doc["chromosome"],
                                                doc["position"])
            if result is None:
                continue
            vcfr = VCFrecord(result)
            if vcfr.get_varid() != rsid:
                continue
            maf, ma, cr = mafh.get_maf_and_cr(vcfr)
            # TODO - ALSO check maf, also apply QC filter at individual
            # record level
            slot = atype_posns[doc["assaytype"]]
            rsid_cr_dict[doc["rsid"]][slot] = cr
            rsid_dict[doc["rsid"]][slot] = result
            logging.info("%s (%s), maf=%s, ma=%s, cr=%s" %
                         (doc["rsid"], doc["assaytype"], maf, ma, cr))

    # Step 5 - execute the merge process
    print("\t".join(hdr_pref + combocol))
    count = 0
    concordant = True
    for rsid in rsid_dict:
        records = rsid_dict[rsid]
        # `records` is empty when no document matched an included assay type
        # (atype_list empty); indexing [0] unguarded raised IndexError.
        if not records or len(records[0]) == 0:
            continue
        if options.check == 'Y':
            concordant = mm.check_concordancies(records, atype_list,
                                                options.chipval)
        # Exact comparison kept: the return type of check_concordancies is
        # not visible from here.
        if concordant == True:  # noqa: E712
            comborec = mm.get_combined_array(records, rsid_cr_dict[rsid],
                                             atype_list)
            # The fixed VCF columns are taken from the first assay's record.
            prfx, _sfx = VCFrecord(records[0]).get_prfx_sfx()
            if len(prfx) > 0:
                logging.info("PRFX = %s, for %s", str(prfx), rsid)
                prfx[8] += ":AT"
                print("\t".join(prfx + comborec))
                count += 1
            else:
                logging.info("RSID %s NOTFOUND (2)", rsid)
        else:
            logging.info("Concordancy check fail for - %s" % (rsid))

    chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
    logging.info(
        "CHI test count = %d, Allele discord count = %d, Overlap check count = %d",
        chi_test_count, allele_disc_count, overlap_count)
    return count
def get_rslist_data(self, input_rslist, threshold, download_list):
    """Build the per-SNP summary CSV and the sample data for a list of rsids.

    For every document stored for each rsid in ``input_rslist`` the raw
    record is read, genotype statistics are obtained from
    ``self.var_coll.get_genotype_probs`` and one CSV line is added to the
    summary.  The parsed records are then passed to
    ``self.get_sample_values`` to build the sample data.

    A variant is flagged as imputed when its document has an ``imputed`` key
    or an ``info`` value other than 1.0.  A document without ``info`` is
    reported with Info 1.0, matching that rule.  The call rate is reported as
    0.0 when there are no samples instead of dividing by zero.

    ``download_list`` is accepted but not used here.

    Returns ``(pdata, snpdata, msg)``; ``msg`` is always None.
    """
    msg = None
    rows = [
        "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"
    ]
    data = []
    assaytypelist = []
    rslist = []
    assaytypes = {}
    Afreq = {}
    Bfreq = {}
    data_count = 0
    impDict = {}

    for rsid in input_rslist:
        docs = self.var_coll.get_variant_data_multi(rsid)
        if len(docs) > 0:
            rslist.append(rsid)
        # handling SNPs on multiple platforms
        for doc in docs:
            # always force chromosome to 2 digits
            chromosome = "%.2d" % int(doc["chromosome"])
            # first get filepath
            fpath = self.filepaths_coll.get_filepath(doc["assaytype"],
                                                     chromosome)
            # get raw variant data
            fullrec = self.var_coll.get_raw_variant_values(
                fpath, chromosome, doc['position'])
            vcfr = VCFrecord(fullrec)
            _prfx, sfx = vcfr.get_prfx_sfx()
            probidx = vcfr.get_probidx()
            (_gc_count_dict, sample_count, geno_count, maf, alleleAf,
             alleleBf, p_hwe) = self.var_coll.get_genotype_probs(
                 sfx, threshold, probidx)

            key = doc["rsid"] + "_" + doc["assaytype"]
            Afreq[key] = alleleAf
            Bfreq[key] = alleleBf
            data_count += 1
            hwep = float(p_hwe)
            assaytypelist.append(doc["assaytype"])
            data.append(vcfr)
            if doc["assaytype"] not in assaytypes:
                assaytypes[doc["assaytype"]] = 1

            info = doc["info"] if "info" in doc else 1.0
            imputed = 0
            if "imputed" in doc or info != 1.0:
                imputed = 1
            impDict[key] = imputed

            if sample_count:
                call_rate = float(geno_count) / sample_count
            else:
                call_rate = 0.0
            rows.append(
                "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" %
                (doc["rsid"], doc["assaytype"], doc["chromosome"],
                 doc["position"], doc["alleleA"], alleleAf, doc["alleleB"],
                 alleleBf, maf, imputed, call_rate, hwep, info))

    snpdata = "".join(rows)
    pdata = self.get_sample_values(assaytypelist, data, data_count, rslist,
                                   impDict, assaytypes, Afreq, Bfreq,
                                   threshold)
    return (pdata, snpdata, msg)
def main(options):
    """Convert a VCF on stdin to a sample-per-row CSV of genotype calls.

    Meta lines (``##``) are skipped.  The column-header line defines the
    sample columns; every following record contributes one CSV column headed
    by ``vcfr.get_varid_ukb()``.  A sample's cell is the ``icalls`` value
    for the first ``:``-separated field of its genotype, or empty when the
    genotype is ".".

    Output: a header line ``id,<varid>,...`` followed by one line per sample.

    NOTE: allele flipping is currently disabled (``flip`` is always False).
    The rule it previously keyed on was "minor allele equals REF".

    ``options`` is not used.  Returns the number of sample rows printed.
    """
    hdrData = ["id"]
    sampleDict = {}
    colPosns = {}
    count = 0

    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue
        vcfr = VCFrecord(line)
        _prfx, sfx = vcfr.get_prfx_sfx()
        if line.startswith('#'):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        flip = False
        hdrData.append(vcfr.get_varid_ukb())
        for i, str_geno in enumerate(sfx):
            if str_geno == ".":
                sampleDict[colPosns[i]].append("")
                continue
            i_call = icalls[str_geno.split(":")[0]]
            if flip:
                # Swap hom-ref and hom-alt.  The "0" branch used to read
                # `i_call == "2"`, a comparison with no effect.
                if i_call == "0":
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            sampleDict[colPosns[i]].append(str(i_call))

    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count