def main(options):
    """Rewrite a genotype CSV on stdout with normalised headers and flipped columns.

    The header row of ``options.csvfile`` is rewritten column by column:

    * columns whose name contains ``"ID"`` are left untouched;
    * columns starting with ``"rs"`` are split on ``"_"``; the part before the
      underscore replaces the header, and the column is marked for flipping
      when the part after it equals the ``alleleA`` value of the document
      returned by ``godb.get_one_variant``;
    * any other column is split on ``":"`` and resolved through
      ``get_posn_allele`` and ``get_dbsnp_rsid``; the returned id replaces the
      header, and the column is marked for flipping when its allele equals
      the returned reference allele.

    Each data row is then printed with ``flip_geno`` applied to the marked
    columns.

    Args:
        options: parsed command-line options; ``csvfile``, ``snpsummary`` and
            ``dbsnpfile`` are read.

    Returns:
        A ``(count, hdrlen)`` tuple: the number of data rows written and the
        number of header columns.  ``(0, 0)`` when the CSV has no header row.

    Exits the process with status 1 when the inputs cannot be opened or the
    helper objects cannot be created.
    """
    try:
        csvfile = open(options.csvfile, "r")
        csvreader = csv.reader(csvfile)
        # NOTE(review): the loaded summary is never consulted below; the call
        # is kept in case it is relied on for early failure - confirm and drop.
        with open(options.snpsummary, "r") as fh:
            load_snp_summary(fh)
        dbsnpfile = Dbsnpfile()
        dbsnpfile.set_tabix_file(options.dbsnpfile)
        godb = GoDb()
    except IOError as e:
        logging.fatal("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)
    except TypeError as e:
        # The message needs a placeholder: logging formats lazily with
        # ``msg % args`` and fails when an argument has nowhere to go.
        logging.fatal("Missing arguments %s", e)
        sys.exit(1)
    except Exception:
        logging.fatal("Unexpected error: %s", str(sys.exc_info()))
        sys.exit(1)

    try:
        hdr = next(csvreader, None)
        if hdr is None:
            logging.warning("No header row found in %s", options.csvfile)
            return 0, 0
        hdrlen = len(hdr)

        flipidx = set()
        for i, varid in enumerate(hdr):
            if "ID" in varid:
                continue
            if varid.startswith("rs"):
                # "<rsid>_<allele>": keep the bare rs number as the header and
                # flip the column when the allele matches alleleA.
                var = varid.split("_")
                vardata = godb.get_one_variant(var[0])
                hdr[i] = var[0]
                if var[1] == vardata["alleleA"]:
                    flipidx.add(i)
            else:
                coldata = varid.split(":")
                posn, allele = get_posn_allele(coldata)
                var, ref = get_dbsnp_rsid(dbsnpfile, coldata[0], posn)
                if allele == ref:
                    flipidx.add(i)
                hdr[i] = var
        print(",".join(hdr))

        count = 0
        for row in csvreader:
            count += 1
            for i, genotype in enumerate(row):
                if i in flipidx:
                    row[i] = flip_geno(int(genotype))
            print(",".join(row))
        return count, hdrlen
    finally:
        csvfile.close()
def __init__(self):
    """Create the database handle and the collection helpers built on it.

    The helpers are constructed in dependency order: the variants helper is
    given the filepaths helper, the samples helper and the database handle.
    """
    db = GoDb()
    filepaths = _filepaths(db)
    samples = _samples(db)
    variants = _variants(filepaths, samples, db)

    self.godb = db
    self.filepaths_coll = filepaths
    self.sam_coll = samples
    self.var_coll = variants

    # Bookkeeping defaults.
    self.variant_totals = []
    self.sample_count = -1
    self.call_rates = {}
    # Upper bound on the number of lines accepted from an rs-list file.
    self.maxrslist = 200
def main(options):
    """Load marker detail lines from standard input into the database.

    Lines starting with ``#`` and blank lines are skipped.  A line starting
    with ``alternate`` is taken as the header: it is split on whitespace,
    remembered and echoed.  Every other line is split on whitespace and, when
    it has as many fields as the header, handed to
    ``godb.process_marker_detail`` together with the header and
    ``options.assaytype``; lines with a different field count are reported and
    skipped.  The marker buffer is flushed whenever ``godb.get_markers_len()``
    reaches the module-level ``flush_at`` and once more at the end.

    Args:
        options: parsed command-line options; ``options.assaytype`` is read.

    Returns:
        The number of data lines passed to ``process_marker_detail``.

    Exits the process with status 1 when the database handle cannot be
    created.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        # sys.exit rather than the site-provided exit() helper, which is meant
        # for interactive use and is not guaranteed to exist.
        sys.exit(1)

    hdr = []
    hdrlen = 0
    count = 0
    for line in sys.stdin:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('alternate'):
            hdr = line.split()
            hdrlen = len(hdr)
            print(hdr)
            continue

        data = line.split()
        if len(data) != hdrlen:
            print("UNABLE TO PARSE DETAIL: %s" % (data,))
            continue

        godb.process_marker_detail(hdr, data, options.assaytype)
        count += 1
        if godb.get_markers_len() >= flush_at:
            godb.flush_marker_data()
            print(". %s seconds %s" % (time.time() - start_time, count))

    # Write out whatever is left in the buffer.
    godb.flush_marker_data()
    print("")
    return count
def main(options):
    """Register the sample columns of the first ``#`` header line read from stdin.

    Lines starting with ``##`` are skipped.  The first line starting with a
    single ``#`` is wrapped in a ``VCFrecord``; each element of the suffix
    returned by ``get_prfx_sfx()`` is passed, with its index and
    ``options.assaytype``, to ``godb.process_sample_detail``.  The sample
    buffer is flushed whenever ``godb.get_samples_len()`` exceeds the
    module-level ``flush_at``, and once more at the end.  Reading stops after
    that header line; any other line seen before it is ignored.

    Args:
        options: parsed command-line options; ``options.assaytype`` is read.

    Returns:
        The number of sample fields processed.

    Exits the process with status 1 when the database handle cannot be
    created.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        # sys.exit rather than the site-provided exit() helper, which is meant
        # for interactive use and is not guaranteed to exist.
        sys.exit(1)

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue
        if line.startswith('#'):
            vcfr = VCFrecord(line)
            _prfx, sfx = vcfr.get_prfx_sfx()
            for idx, field in enumerate(sfx):
                count += 1
                godb.process_sample_detail(field, idx, options.assaytype)
                if godb.get_samples_len() > flush_at:
                    godb.flush_sample_buff()
            # Only the header line matters; nothing after it is read.
            break

    godb.flush_sample_buff()
    print("")
    return count
def main(options):
    """Record the ``.vcf.gz`` files found under ``/<prfx>/<sfx>/`` for an assaytype.

    The directory name is built from ``options.prfx`` and ``options.sfx``;
    every entry whose name ends in ``.vcf.gz`` is collected, in the order
    ``os.listdir`` returns them, and the list is passed to
    ``godb.add_filepath_detail`` together with ``options.assaytype``,
    ``options.prfx`` and ``options.sfx``.

    Args:
        options: parsed command-line options; ``assaytype``, ``prfx`` and
            ``sfx`` are read.

    Exits the process with status 1 when the database handle cannot be
    created.  Errors from ``os.listdir`` (for example a missing directory)
    propagate to the caller.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        # sys.exit rather than the site-provided exit() helper, which is meant
        # for interactive use and is not guaranteed to exist.
        sys.exit(1)

    dirname = "/" + options.prfx + "/" + options.sfx + "/"
    filelist = [
        filename
        for filename in os.listdir(dirname)
        if filename.endswith('.vcf.gz')
    ]
    godb.add_filepath_detail(options.assaytype, options.prfx, options.sfx,
                             filelist)
def main():
    """Load gene-map lines from standard input into the database.

    Lines starting with ``#`` are skipped; every other line (after stripping
    surrounding whitespace) is passed to ``godb.process_genemap_detail``.  The
    gene-map buffer is flushed whenever ``godb.get_genemap_len()`` reaches the
    module-level ``flush_at``, and once more at the end.

    Returns:
        The number of lines passed to ``process_genemap_detail``.

    Exits the process with status 1 when the database handle cannot be
    created.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        # sys.exit rather than the site-provided exit() helper, which is meant
        # for interactive use and is not guaranteed to exist.
        sys.exit(1)

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('#'):
            continue
        godb.process_genemap_detail(line)
        count += 1
        if godb.get_genemap_len() >= flush_at:
            godb.flush_genemap_buff()
            print(". %s seconds %s" % (time.time() - start_time, count))

    # Write out whatever is left in the buffer.
    godb.flush_genemap_buff()
    print("")
    return count
def main(options):
    """Print one sample's raw field for a variant, once per assaytype.

    For every variant document returned by
    ``godb.get_multiple_variants(options.rsid)`` whose assaytype has an entry
    in ``godb.get_sample_posns(options.sampleid)``, the matching record is
    read from the assaytype's file and one CSV line is printed:
    ``rsid,sampleid,assaytype,position-of-sample,sample-field``.

    Args:
        options: parsed command-line options; ``rsid`` and ``sampleid`` are
            read.

    Returns:
        The number of lines printed.  (The counter used to be returned without
        ever being incremented, so the result was always 0.)
    """
    godb = GoDb()
    count = 0

    vardocs = godb.get_multiple_variants(options.rsid)
    sampposns = godb.get_sample_posns(options.sampleid)

    for doc in vardocs:
        assaytype = doc["assaytype"]
        if assaytype not in sampposns:
            # Nothing would be printed for this assaytype, so skip the file
            # read entirely.
            continue
        filepath = godb.get_filepath(assaytype, doc["chromosome"])
        rec = godb.get_variant_file_data(filepath, doc["chromosome"],
                                         doc["position"])
        _prfx, sfx = VCFrecord(rec).get_prfx_sfx()
        posn = sampposns[assaytype]
        print("%s,%s,%s,%d,%s" % (options.rsid, options.sampleid, assaytype,
                                  posn, sfx[posn]))
        count += 1
    return count
def main(options):
    """Merge the per-assaytype records of a set of rsids and print them.

    Steps:

    1. For each rsid (from ``options.snpfile`` when given, otherwise the
       comma-separated ``options.rsids``) collect the variant documents whose
       assaytype is in ``included_assaytypes``.  The order in which
       assaytypes are first seen fixes their position for the rest of the run.
    2. Fetch the sample id list of every collected assaytype and hand the
       lists to ``Multibuffermerge``.
    3. Take the combined column list from the merger.
    4. For each rsid read the record from each assaytype's file (using
       ``options.prfx`` as path prefix when given) and keep it, with its call
       rate, when the record's variant id equals the rsid.
    5. Print a header line, then for each rsid that has a record in the first
       assaytype position - and, with ``options.check == 'Y'``, passes
       ``check_concordancies`` - print the merged line.

    Args:
        options: parsed command-line options; ``snpfile``, ``rsids``,
            ``prfx``, ``check`` and ``chipval`` are read.

    Returns:
        The number of merged records printed.

    Exits the process with status 1 when the rsid list cannot be obtained.
    """
    # Only these assaytypes take part in the merge.  Other names that appear
    # in this file's earlier configurations: affy, illumina, metabo, exome,
    # bigtest, biggertest.
    included_assaytypes = {"broad": 1}

    godb = GoDb()
    try:
        if options.snpfile is not None:
            with open(options.snpfile, "r") as fh:
                # Materialise inside the ``with`` so the data survives the
                # file being closed even if the loader is lazy.
                rsids = list(load_snpfile_data(fh))
        else:
            rsids = options.rsids.split(",")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)
    except TypeError as e:
        # Two spaces reproduce the output of the original print statement.
        print("Missing arguments  %s" % (e,))
        sys.exit(1)
    except Exception:
        logging.error("Unexpected error: %s", str(sys.exc_info()))
        sys.exit(1)

    # Step 0 - instantiate helper objects
    mafh = Mafhelper()
    # NOTE(review): the Hwehelper instance is never used in this function;
    # construction is kept in case it has side effects - confirm and remove.
    Hwehelper()

    hdr_pref = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
                "FORMAT"]

    # Step 1 - get the list of entries for each rsid - one per assaytype.
    # The order established in atype_list must be observed throughout.
    atype_list = []
    rsid_assaytypes = {}
    for rsid in rsids:
        docs = godb.get_multiple_variants(rsid)
        if docs.count() > 0:
            rsid_assaytypes[rsid] = []
        else:
            logging.info("RSID %s NOTFOUND", rsid)
        for doc in docs:
            atype = doc["assaytype"]
            if atype not in included_assaytypes:
                continue
            if atype not in atype_list:
                atype_list.append(atype)
            rsid_assaytypes[rsid].append(doc)
    logging.info(str(atype_list))

    # Step 2 - collect lists of sample ids by assaytype
    atype_posns = {}
    prochi_list = []
    for i, atype in enumerate(atype_list):
        atype_posns[atype] = i
        prochi_list.append(godb.get_samples(atype))
    mm = Multibuffermerge(prochi_list)

    # Step 3 - get combined column header positions.
    # NOTE(review): the positions dict itself is unused; the call is kept in
    # case get_combined_columns depends on it having run - confirm.
    mm.get_combined_positions()
    # combocol is a list [colname1, colname2, ...]; its order is kept intact.
    combocol = mm.get_combined_columns()

    # Step 4 - for each variant by rsid, one slot per assaytype position.
    # Separate placeholder lists per slot (``[[]] * n`` would alias one list).
    n_atypes = len(atype_list)
    rsid_dict = {}
    rsid_cr_dict = {}
    for rsid in rsid_assaytypes:
        if rsid not in rsid_dict:
            rsid_dict[rsid] = [[] for _ in range(n_atypes)]
            rsid_cr_dict[rsid] = [[] for _ in range(n_atypes)]
        for doc in rsid_assaytypes[rsid]:
            if options.prfx is not None:
                fpath = godb.get_full_filepath(doc["assaytype"],
                                               doc["chromosome"], options.prfx)
            else:
                fpath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
            logging.info("Assaytype=%s fpath=%s", doc["assaytype"], fpath)
            result = godb.get_variant_file_data(fpath, doc["chromosome"],
                                                doc["position"])
            if result is None:
                continue
            vcfr = VCFrecord(result)
            if vcfr.get_varid() != rsid:
                continue
            maf, ma, cr = mafh.get_maf_and_cr(vcfr)
            # TODO - ALSO check maf, also apply QC filter at individual record level
            slot = atype_posns[doc["assaytype"]]
            rsid_cr_dict[doc["rsid"]][slot] = cr
            rsid_dict[doc["rsid"]][slot] = result
            logging.info("%s (%s), maf=%s, ma=%s, cr=%s", doc["rsid"],
                         doc["assaytype"], maf, ma, cr)

    # Step 5 - execute the merge process
    print("\t".join(hdr_pref + combocol))
    count = 0
    concordant = True
    for rsid in rsid_dict:
        records = rsid_dict[rsid]
        # ``records`` is empty when no included assaytype was collected at
        # all; indexing it unguarded raised IndexError in that case.
        if not records or len(records[0]) == 0:
            continue
        if options.check == 'Y':
            concordant = mm.check_concordancies(records, atype_list,
                                                options.chipval)
        # The return type of check_concordancies is not visible here, so the
        # original equality comparison is kept as is.
        if concordant == True:
            comborec = mm.get_combined_array(records, rsid_cr_dict[rsid],
                                             atype_list)
            prfx, _sfx = VCFrecord(records[0]).get_prfx_sfx()
            if len(prfx) > 0:
                logging.info("PRFX = %s, for %s", str(prfx), rsid)
                prfx[8] += ":AT"
                print("\t".join(prfx + comborec))
                count += 1
            else:
                logging.info("RSID %s NOTFOUND (2)", rsid)
        else:
            logging.info("Concordancy check fail for - %s", rsid)

    chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
    logging.info(
        "CHI test count = %d, Allele discord count = %d, Overlap check count = %d",
        chi_test_count, allele_disc_count, overlap_count)
    return count
class DataStore():
    """Query layer over the variant, sample and filepath collections.

    Builds per-variant summaries and per-sample CSV exports from the records
    returned by the collection helpers created in ``__init__``.
    """

    def __init__(self):
        """Create the database handle and the collection helpers built on it."""
        self.godb = GoDb()
        self.filepaths_coll = _filepaths(self.godb)
        self.sam_coll = _samples(self.godb)
        self.var_coll = _variants(self.filepaths_coll, self.sam_coll,
                                  self.godb)
        self.variant_totals = []
        self.sample_count = -1
        self.call_rates = {}
        # Maximum number of lines accepted by get_variant_data_for_file.
        self.maxrslist = 200

    def get_db_name(self):
        """Return the database name reported by the database handle."""
        return self.godb.get_dbname()

    def make_selection_key(self, varid, assaytype):
        """Return the composite ``<varid>_<assaytype>`` key."""
        return varid + "_" + assaytype

    def _load_variant_record(self, doc, threshold):
        """Read the raw record for *doc* and compute its genotype statistics.

        Returns ``(vcfr, stats)`` where ``vcfr`` wraps the raw record and
        ``stats`` is the 7-tuple returned by
        ``self.var_coll.get_genotype_probs``.
        """
        # always force chromosome to 2 digits
        chromosome = "%.2d" % int(doc["chromosome"])
        fpath = self.filepaths_coll.get_filepath(doc["assaytype"], chromosome)
        fullrec = self.var_coll.get_raw_variant_values(fpath, chromosome,
                                                       doc['position'])
        vcfr = VCFrecord(fullrec)
        _prfx, sfx = vcfr.get_prfx_sfx()
        probidx = vcfr.get_probidx()
        stats = self.var_coll.get_genotype_probs(sfx, threshold, probidx)
        return vcfr, stats

    def _write_call(self, cells, offset, geno_data, threshold, atpidx):
        """Fill the four cells starting at *offset* from one genotype entry.

        The cells receive, in order: the integer call, the call probability,
        the entry's assaytype (``geno_data[0]``) and ``geno_data[4]``.
        """
        dataVals = geno_data[2].split(':')
        probVals = dataVals[atpidx[geno_data[0]]].split(',')
        (_probcall, intcall, outprob) = self.var_coll.get_call(probVals,
                                                               threshold)
        cells[offset] = str(intcall)
        cells[offset + 1] = str(outprob)
        cells[offset + 2] = geno_data[0]
        cells[offset + 3] = geno_data[4]

    def get_variant_data_for_file(self, filepath, threshold):
        """Summarise every rs id listed in *filepath*.

        Lines starting with ``rs`` contribute their first whitespace-separated
        field; any other line is noted in the returned message.  Files with
        more than ``self.maxrslist`` lines are rejected.

        Returns:
            ``(variant_data, msg)``: the summary documents and an accumulated
            message string.  ``([], msg)`` when the file cannot be opened or
            is too long.
        """
        msg = ""
        try:
            f = open(filepath, "r")
        except IOError as e:
            return ([], filepath + ":" + e.strerror)

        rslist = []
        # ``with`` closes the file on the early return below as well.
        with f:
            for count, line in enumerate(f, 1):
                if count > self.maxrslist:
                    msg = "line count for %s gt the limit (%d)" % (
                        filepath, self.maxrslist)
                    return ([], msg)
                # NOTE: lines are not sanitised beyond strip(); a regex
                # substitution that used to sit here discarded its result and
                # so never had any effect.
                line = line.strip()
                if line.startswith("rs"):
                    rslist.append(line.split()[0])
                else:
                    msg += "Removed bad line at %d, " % (count)

        variant_data = []
        for rsid in rslist:
            (variant_docs, tmsg) = self.get_variant_summary_probs(rsid,
                                                                  threshold)
            if tmsg != "":
                msg += tmsg
            variant_data.extend(variant_docs)
        return variant_data, msg

    def get_variant_data_by_range(self, chromosome, start, end, threshold=0.9):
        """Return the collection's result for a chromosome range.

        *threshold* is accepted for interface compatibility and not used.
        """
        return self.var_coll.get_variant_data_by_range(chromosome, start, end)

    def get_range_data(self, chromosome, start, end, threshold, download_list):
        """Return ``get_rslist_data`` output for every rsid in a chromosome range.

        Returns ``([], [], msg)`` when the range holds no documents.  Each
        rsid is passed on once, in the order it is first seen.
        """
        (docs, msg) = self.var_coll.get_variant_data_by_range(
            chromosome, start, end)
        if len(docs) == 0:
            return ([], [], msg)
        seen = set()
        rslist = []
        for doc in docs:
            rsid = doc["rsid"]
            if rsid not in seen:
                seen.add(rsid)
                rslist.append(rsid)
        return self.get_rslist_data(rslist, threshold, download_list)

    def build_csv_data(self, rslist, sampleDict, assaytypes, threshold, atpidx):
        """Build per-assaytype CSV lines, plus a ``combined`` set, for all samples.

        Each rsid occupies four columns: ``<rsid>``, ``<rsid>_c``,
        ``<rsid>_p`` and ``<rsid>_alt``.  In the ``combined`` output the four
        cells come from the entry chosen by ``resolve_geno``; in each
        assaytype's own output they come from that assaytype's entry.

        NOTE: maintaining rslist order is vital - column offsets are derived
        from it.
        NOTE: *assaytypes* is modified in place (a ``'combined'`` key is
        added).

        Returns:
            A dict mapping each assaytype (and ``'combined'``) to a list of
            CSV lines, the first being the header.  A sample gets a line in an
            assaytype's list only when it has data there; ``'combined'`` gets
            a line for every sample.
        """
        rsidx = {rsid: idx for idx, rsid in enumerate(rslist)}
        assaytypes['combined'] = 1

        hdrData = ["sampleId"]
        for rsid in rslist:
            hdrData.extend([rsid, rsid + "_c", rsid + "_p", rsid + "_alt"])
        hdrString = ','.join(hdrData)
        by_platform_data = {}
        for assaytype in assaytypes:
            by_platform_data[assaytype] = [hdrString]

        width = len(rslist) * 4
        for samp in sampleDict:
            output_lines = {}
            for assaytype in assaytypes:
                output_lines[assaytype] = [""] * width
            filled_output_lines = {'combined': True}

            for rsid in sampleDict[samp]:
                entries = sampleDict[samp][rsid]
                # if a sample wasn't genotyped on any platform there might
                # not be data
                if len(entries) == 0:
                    continue
                idxoffset = rsidx[rsid] * 4
                best = self.resolve_geno(entries, rsid, samp, threshold,
                                         atpidx)
                self._write_call(output_lines['combined'], idxoffset, best,
                                 threshold, atpidx)
                for geno_data in entries:
                    self._write_call(output_lines[geno_data[0]], idxoffset,
                                     geno_data, threshold, atpidx)
                    filled_output_lines[geno_data[0]] = True

            for assaytype in assaytypes:
                if assaytype in filled_output_lines:
                    by_platform_data[assaytype].append(
                        samp + "," + ",".join(output_lines[assaytype]))
        return by_platform_data

    def resolve_geno(self, genlist, rsid, samp, threshold, atpidx):
        """Pick the entry of *genlist* with the highest call probability.

        A single entry is returned as is.  With several entries the one whose
        ``get_call`` probability is largest wins; when none exceeds 0.0 the
        first entry is returned.  *rsid* and *samp* are used for error
        reporting only.

        Exits the process (after logging) when an entry's data field does not
        contain the expected probability component.
        """
        if len(genlist) == 1:
            return genlist[0]

        maxprob = 0.0
        maxidx = -1
        for idx, gendata in enumerate(genlist):
            try:
                dataVals = gendata[2].split(':')
                probVals = dataVals[atpidx[gendata[0]]].split(',')
            except IndexError:
                # Say why the process is going away instead of exiting
                # silently.
                logging.error(
                    "resolve_geno: malformed genotype data for %s / %s: %r",
                    rsid, samp, gendata)
                sys.exit()
            (_probcall, _intcall, outprob) = self.var_coll.get_call(
                probVals, threshold)
            if outprob > maxprob:
                maxprob = outprob
                maxidx = idx
        if maxprob > 0.0:
            return genlist[maxidx]
        return genlist[0]

    def get_variant_summary_probs(self, rsid, threshold):
        """Return the variant documents for *rsid*, annotated with summary values.

        Each document gains ``selected``, ``a_af``, ``b_af``, ``hwe_p`` and
        ``Missing`` keys.

        Returns:
            ``(variant_array, msg)``; *msg* is non-empty only when no document
            was found.
        """
        variant_array = []
        msg = ""
        for doc in self.var_coll.get_variant_data_multi(rsid):
            _vcfr, stats = self._load_variant_record(doc, threshold)
            (gc_count_dict, _sample_count, _geno_count, _maf, alleleAf,
             alleleBf, p_hwe) = stats
            doc['selected'] = 1
            doc['a_af'] = alleleAf
            doc['b_af'] = alleleBf
            doc['hwe_p'] = p_hwe
            doc['Missing'] = gc_count_dict.get('Missing', 0) \
                if 'Missing' in gc_count_dict else 0
            variant_array.append(doc)
        if len(variant_array) == 0:
            msg = "Variant NOT FOUND - %s, " % (rsid)
        return (variant_array, msg)

    def get_variant_data(self, variantid, assaytype):
        """Return the collection's document for a variant on one assaytype."""
        return self.var_coll.get_variant_data(variantid, assaytype)

    def get_sample(self, sampleid):
        """Return the sample collection's entry for *sampleid*."""
        return self.sam_coll.get_sample(sampleid)

    def make_zipfile(self, sample_return_data, snp_return_data, uploadDir,
                     zipfilename):
        """Write the export as a zip file and return the file's content.

        The archive ``<uploadDir>/<zipfilename>`` receives
        ``snp_summary.csv`` (from *snp_return_data*) and one
        ``<assaytype>_samples.csv`` per key of *sample_return_data*, whose
        lines are joined with newlines.

        Returns:
            The raw bytes of the zip file, read back in binary mode.
        """
        zipname = uploadDir + "/" + zipfilename
        with ZipFile(zipname, 'w') as resZip:
            resZip.writestr('snp_summary.csv', snp_return_data, ZIP_DEFLATED)
            for assaytype in sample_return_data:
                resZip.writestr(
                    assaytype + '_samples.csv',
                    '\n'.join(sample_return_data[assaytype]) + '\n',
                    ZIP_DEFLATED)
        # A zip archive is binary data; text mode can corrupt it or fail to
        # decode it.
        with open(zipname, 'rb') as f:
            return f.read()

    def get_rslist_file_data(self, filepath, threshold, download_list):
        """Return ``get_rslist_data`` output for the rs ids listed in *filepath*.

        The first whitespace-separated field of every non-blank line is taken
        as an rsid.  Files with more than 500 lines are rejected.

        Returns:
            The ``(pdata, snpdata, msg)`` tuple from ``get_rslist_data``, or
            ``([], [], msg)`` when the file cannot be opened or is too long.
        """
        line_limit = 500
        try:
            f = open(filepath, "r")
        except IOError as e:
            return ([], [], filepath + ":" + e.strerror)

        rslist = []
        # ``with`` closes the file on the early return below as well.
        with f:
            for count, line in enumerate(f, 1):
                if count > line_limit:
                    msg = "line count for %s gt the limit (%d)" % (
                        filepath, line_limit)
                    return ([], [], msg)
                elems = line.split()
                if not elems:
                    # Blank lines used to raise IndexError here.
                    continue
                rslist.append(elems[0])
        logging.info("get_rslist_file_data: %.2f", float(threshold))
        return self.get_rslist_data(rslist, threshold, download_list)

    def get_rslist_data(self, input_rslist, threshold, download_list):
        """Build the SNP summary CSV and the per-sample data for a list of rsids.

        Rsids with no variant documents are dropped.  *download_list* is
        accepted for interface compatibility and not used here.

        Returns:
            ``(pdata, snpdata, msg)``: the ``get_sample_values`` result, the
            summary CSV text (header plus one line per variant document) and
            ``None``.
        """
        msg = None
        snp_lines = [
            "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"
        ]
        data = []
        assaytypelist = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}
        impDict = {}
        for rsid in input_rslist:
            docs = self.var_coll.get_variant_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                vcfr, stats = self._load_variant_record(doc, threshold)
                (_gc_count_dict, sample_count, geno_count, maf, alleleAf,
                 alleleBf, p_hwe) = stats
                key = self.make_selection_key(doc["rsid"], doc["assaytype"])
                Afreq[key] = alleleAf
                Bfreq[key] = alleleBf
                assaytypelist.append(doc["assaytype"])
                data.append(vcfr)
                assaytypes.setdefault(doc["assaytype"], 1)

                imputed = 0
                if "imputed" in doc:
                    imputed = 1
                if "info" in doc and doc["info"] != 1.0:
                    imputed = 1
                impDict[key] = imputed

                # Avoid ZeroDivisionError when no samples were counted.
                call_rate = (float(geno_count) / sample_count
                             if sample_count else 0.0)
                # NOTE(review): doc["info"] is read unconditionally below
                # although its presence is tested above - confirm that every
                # document carries it.
                snp_lines.append(
                    "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" % (
                        doc["rsid"], doc["assaytype"], doc["chromosome"],
                        doc["position"], doc["alleleA"], alleleAf,
                        doc["alleleB"], alleleBf, maf, imputed, call_rate,
                        float(p_hwe), doc["info"]))
        snpdata = "".join(snp_lines)
        pdata = self.get_sample_values(assaytypelist, data, len(data), rslist,
                                       impDict, assaytypes, Afreq, Bfreq,
                                       threshold)
        return (pdata, snpdata, msg)

    def get_sample_values(self, assaytypelist, records, numrecs, rslist,
                          impDict, assaytypes, Afreq, Bfreq, threshold):
        """Collect each sample's field per variant and build the CSV data.

        *records* and *assaytypelist* are parallel lists.  For every record,
        the i-th element of its suffix is attributed to the i-th sample of the
        record's assaytype.  *numrecs* is accepted for interface compatibility
        and not used.

        NOTE: impDict, Afreq and Bfreq are keyed on a composite of
        rsid_platform (see ``make_selection_key``).

        Returns:
            The dict produced by ``build_csv_data``.
        """
        samplesByAt = {}
        for assaytype in assaytypes:
            samplesByAt[assaytype] = self.sam_coll.get_samples(assaytype)

        # A dict of dicts of lists: sample id -> rsid -> genotype entries
        sampleDict = {}
        for assaytype in samplesByAt:
            for samp in samplesByAt[assaytype]:
                if samp not in sampleDict:
                    sampleDict[samp] = {rsid: [] for rsid in rslist}

        atpidx = {}
        for assaytype, vcfr in zip(assaytypelist, records):
            rsid = vcfr.get_varid()
            alleleA, alleleB = vcfr.get_alleles()
            _prfx, sfx = vcfr.get_prfx_sfx()
            atpidx[assaytype] = vcfr.get_probidx()
            if assaytype not in samplesByAt:
                samplesByAt[assaytype] = self.sam_coll.get_samples(assaytype)
            samples = samplesByAt[assaytype]
            key = self.make_selection_key(rsid, assaytype)
            for pos, field in enumerate(sfx):
                sampleId = samples[pos]
                if sampleId in sampleDict:
                    sampleDict[sampleId][rsid].append([
                        assaytype, impDict[key], field, alleleA, alleleB,
                        Afreq[key], Bfreq[key]
                    ])
        return self.build_csv_data(rslist, sampleDict, assaytypes, threshold,
                                   atpidx)