def check_concordancies(self, data_list, assays, chipval):
    """Check that each assay's record for a variant agrees with the first one.

    The first non-empty record in ``data_list`` acts as the baseline: its
    REF/ALT alleles and genotype counts are remembered.  Every later
    non-empty record must carry the same alleles, and its genotype counts
    must not be rejected by a chi-square test against the baseline counts.

    Args:
        data_list: One raw VCF record per assay; empty entries are skipped.
        assays: Assay names, index-aligned with ``data_list``.  Only used in
            log messages (the baseline is always reported as ``assays[0]``).
        chipval: p-value threshold; a chi-square p-value below it is a reject.

    Returns:
        False at the first allele mismatch (``self.allele_discord_count`` is
        incremented) or chi-square rejection (``self.chisq_count`` is
        incremented); True when no record disagrees.
    """
    obs = [0.0] * 3
    exp = [0.0] * 3
    allele_ref_1 = ""
    allele_alt_1 = ""

    for i, vcf_record in enumerate(data_list):
        if len(vcf_record) == 0:
            continue

        vcfr = VCFrecord(vcf_record)
        # The previous code passed an undefined name (`data`) to these
        # accessors and went through `self.vcfr`; the no-argument accessors
        # below are the ones the rest of this file calls on a VCFrecord.
        # NOTE(review): confirm against the VCFrecord class.
        homref_count, het_count, homalt_count, _nc_count, _miss_count = (
            vcfr.get_allele_counts())
        allele_a, allele_b = vcfr.get_alleles()

        # Add 1 to prevent 0-divide
        counts = [homref_count + 1, het_count + 1, homalt_count + 1]

        if allele_ref_1 == "":
            # First usable record: remember it as the baseline.
            allele_ref_1 = allele_a
            allele_alt_1 = allele_b
            obs = counts
            continue

        allele_ref_2 = allele_a
        allele_alt_2 = allele_b
        exp = counts
        varid = vcfr.get_varid()
        posn = vcfr.get_posn_as_int()

        if (allele_ref_1 != allele_ref_2) or (allele_alt_1 != allele_alt_2):
            self.allele_discord_count += 1
            logging.info(
                "Allele discordancy: assay1=%s, assay2=%s, varid=%s, posn=%d, ref1=%s, alt1=%s, ref2=%s, alt2=%s",
                assays[0], assays[i], varid, int(posn), allele_ref_1,
                allele_alt_1, allele_ref_2, allele_alt_2)
            return False

        # NOTE(review): if `chisquare` is scipy.stats.chisquare, recent SciPy
        # releases raise ValueError when sum(obs) != sum(exp) -- confirm that
        # every assay always carries the same number of called samples.
        chi_stat, chi_p_value = chisquare(obs, f_exp=exp)
        if chi_p_value < chipval:
            self.chisq_count += 1
            logging.info(
                "CHI SQ test REJECT: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, chipval=%e, obs=%s, exp=%s, at %d",
                assays[0], assays[i], varid, int(posn), chi_stat, chi_p_value,
                chipval, str(obs), str(exp), i)
            return False

        logging.info(
            "CHI SQ test OK: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, obs=%s, exp=%s, at %d",
            assays[0], assays[i], varid, int(posn), chi_stat, chi_p_value,
            str(obs), str(exp), i)

    return True
def process_variant_detail_vcf(self, record, assaytype):
    """Turn one info-file VCF record into a variant-detail document.

    Builds a JSON-style dict describing the variant and appends it to
    ``self.variantbuff``.

    Args:
        record: Raw VCF record, handed unchanged to ``VCFrecord``.
        assaytype: Stored verbatim under the ``"assaytype"`` key.

    Document keys: ``assaytype``, ``rsid``, ``chromosome`` (zero-padded to
    two digits), ``alleleA``, ``alleleB``, ``position``, ``ref_maf`` (only
    when the ``RefPanelAF`` INFO value converts to float) and ``info`` (the
    ``INFO`` value as float, 1.0 when it is unavailable).

    Raises:
        Whatever ``int()`` raises when the chromosome is not numeric; that
        conversion is not guarded.
    """
    doc = {}
    doc["assaytype"] = assaytype

    vcfr = VCFrecord(record)
    # NOTE(review): the result is unused; the call is kept because it is not
    # visible here whether get_info_value() relies on it having run.
    vcfr.get_prfx_sfx()

    doc["rsid"] = vcfr.get_varid()
    # always store chromosome as a 2-digit string
    doc["chromosome"] = "%.2d" % (int(vcfr.get_chr()))
    alleleA, alleleB = vcfr.get_alleles()
    doc["alleleA"] = alleleA
    doc["alleleB"] = alleleB
    doc["position"] = vcfr.get_posn_as_int()

    # Both INFO lookups are best-effort.  Catch Exception rather than using a
    # bare except so KeyboardInterrupt / SystemExit still propagate.
    try:
        doc["ref_maf"] = float(vcfr.get_info_value("RefPanelAF"))
    except Exception:
        pass  # ref_maf is optional; leave the key out
    try:
        doc["info"] = float(vcfr.get_info_value("INFO"))
    except Exception:
        doc["info"] = 1.0

    self.variantbuff.append(doc)
def get_dbsnp_rsid(dbsnpfile, chrom, posn):
    """Look up the dbSNP variant id and reference allele at a position.

    Args:
        dbsnpfile: Object providing ``get_dbsnp_file_record(file, chrom, pos)``.
        chrom: Chromosome, passed through to the lookup unchanged.
        posn: Position; converted with ``int()`` before the lookup.

    Returns:
        ``(rsid, refallele)``.  Both are empty strings when the lookup
        returns ``None``.
    """
    # NOTE(review): `options` is not a parameter; this relies on a
    # module-level `options` object being defined before the call.
    dbsnprec = dbsnpfile.get_dbsnp_file_record(options.dbsnpfile, chrom,
                                               int(posn))
    if dbsnprec is None:
        return "", ""

    dbvcf = VCFrecord(dbsnprec)
    rsid = dbvcf.get_varid()
    refallele, _altallele = dbvcf.get_alleles()
    return rsid, refallele
def main():
    """Write one CSV row of summary statistics per VCF record read on stdin.

    Lines starting with ``#`` are counted as headers and skipped.  For every
    other line the genotype counts are read from the record, the call rate,
    HWE exact-test value and minor allele / MAF are computed, and a row is
    printed under the header
    ``SNPId,AssayType,chr,pos,REF,ALT,Minor,MAF,CallRate,HWE_pval``.

    A record whose statistics raise ``ZeroDivisionError`` is logged and no
    row is written for it; its counts are still included in the totals.

    Returns:
        Tuple ``(in_count, hdr_count, homr_total, het_total, homa_total,
        virt_nc_total, miss_total)``: lines read, header lines, and the
        genotype counts summed over all data records.
    """
    mafh = Mafhelper()
    hweh = Hwehelper()
    in_count = 0
    hdr_count = 0
    homr_total = 0
    het_total = 0
    homa_total = 0
    virt_nc_total = 0
    miss_total = 0

    print("SNPId,AssayType,chr,pos,REF,ALT,Minor,MAF,CallRate,HWE_pval")
    for line in sys.stdin:
        line = line.strip()
        in_count += 1
        if line.startswith("#"):
            hdr_count += 1
            continue

        vcfr = VCFrecord(line)
        varid = vcfr.get_varid_ukb()
        chromosome = vcfr.get_chr()
        posn = vcfr.get_posn_as_int()
        ref, alt = vcfr.get_alleles()
        homref_count, het_count, homalt_count, virt_nc_count, miss_count = (
            vcfr.get_allele_counts())

        call_count = homref_count + het_count + homalt_count
        # Only virtual no-calls count against the call rate; miss_count is
        # tallied below but left out of the denominator.
        nocall_count = virt_nc_count

        homr_total += homref_count
        het_total += het_count
        homa_total += homalt_count
        virt_nc_total += virt_nc_count
        miss_total += miss_count

        try:
            # Inside the try so a record with no calls at all is logged and
            # skipped instead of aborting the whole run.
            call_rate = float(call_count) / float(call_count + nocall_count)
            hwe = hweh.HWE_exact(het_count, homref_count, homalt_count,
                                 call_count)
            maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count,
                               alt, virt_nc_count)
        except ZeroDivisionError:
            logging.info("DIV 0 error at %d (%d), where hom_r=%d, het=%d, home_a=%d, cc=%d",
                         in_count, posn, homref_count, het_count,
                         homalt_count, call_count)
            # Skip the row: printing here would reuse the previous record's
            # statistics (or hit unbound names on the first record).
            continue

        print("%s,combo,%s,%d,%s,%s,%s,%s,%.3f,%s" % (
            varid, chromosome, posn, ref, alt, ma, maf, call_rate, hwe))

    return (in_count, hdr_count, homr_total, het_total, homa_total,
            virt_nc_total, miss_total)
def main(options):
    """Pivot a VCF read on stdin into a per-sample CSV written to stdout.

    ``##`` lines are ignored.  The single-``#`` header line supplies the
    sample column names.  Each data line contributes one output column: for
    every sample, the first ``:``-separated field of its genotype string is
    mapped through the module-level ``icalls`` table and stored as a string;
    a genotype of ``"."`` is stored as an empty string.

    Output is a header row (``id`` followed by the variant ids in input
    order) and then one row per sample: the sample name followed by its
    calls.

    Args:
        options: Not read by this function; kept for the caller's interface.

    Returns:
        The number of sample rows written.
    """
    hdrData = ["id"]
    sampleDict = {}   # sample name -> list of calls, one per variant
    colPosns = {}     # sample column index -> sample name
    count = 0
    mafh = Mafhelper()

    for line in sys.stdin:
        line = line.strip()
        if line.startswith("##"):
            continue

        vcfr = VCFrecord(line)
        _prfx, sfx = vcfr.get_prfx_sfx()

        if line.startswith("#"):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        varid = vcfr.get_varid_ukb()
        ref, alt = vcfr.get_alleles()
        homref_count, het_count, homalt_count, nc_count, _miss_count = (
            vcfr.get_allele_counts())
        # `ma` feeds the minor-allele flip detection, which is currently
        # switched off (see the commented lines below).
        maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count, alt,
                           nc_count)
        flip = False
        #if ma == ref:
        #    flip = True
        #    logging.info("FLIP for %s, %s, %s", varid, ref, alt)

        hdrData.append(varid)
        for i, str_geno in enumerate(sfx):
            calls = sampleDict[colPosns[i]]
            if str_geno == ".":
                calls.append("")
                continue

            i_call = icalls[str_geno.split(":")[0]]
            if flip:
                # Swap the two homozygous codes.  The "0" branch used to be
                # a comparison (`==`) and therefore never swapped anything.
                if i_call == "0":
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            calls.append(str(i_call))

    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count