コード例 #1
0
    def check_concordancies(self, data_list, assays, chipval):
        """Check that the per-assay records for one variant agree.

        The first non-empty record whose first allele is non-empty becomes
        the baseline. Every later non-empty record must carry the same
        allele pair as the baseline and must not be rejected by a chi-square
        test in which the baseline genotype counts are the observed
        frequencies and the record's own genotype counts are the expected
        frequencies.

        Args:
            data_list: iterable of records for one variant, one per assay;
                entries of length zero are skipped.
            assays: assay names indexed in parallel with ``data_list``; used
                only in log messages (the baseline is always reported as
                ``assays[0]``).
            chipval: p-value threshold; a chi-square p-value below it counts
                as a discordancy.

        Returns:
            False at the first allele mismatch (``self.allele_discord_count``
            is incremented) or chi-square rejection (``self.chisq_count`` is
            incremented); True if no record disagrees with the baseline.
        """
        obs = [0.0] * 3
        allele_ref_1 = ""
        allele_alt_1 = ""
        for i, vcf_record in enumerate(data_list):
            if len(vcf_record) == 0:
                continue

            vcfr = VCFrecord(vcf_record)
            # NOTE(review): the result is not used in this method; the call
            # is retained in case it validates the record - drop it if
            # get_probidx() is a pure getter.
            vcfr.get_probidx()
            # NOTE(review): `data` and `self.vcfr` are not defined anywhere in
            # this method (only the local `vcfr` is). They are left exactly as
            # found; confirm where they are expected to come from.
            homref_count, het_count, homalt_count, _nc_count, _miss_count = (
                self.vcfr.get_allele_counts_from_array(data))
            allele_a, allele_b = vcfr.get_alleles()
            # Add 1 to every genotype count to prevent 0-divide in chisquare.
            counts = [homref_count + 1, het_count + 1, homalt_count + 1]

            if allele_ref_1 == "":
                # No baseline yet: this record becomes the reference point.
                allele_ref_1 = allele_a
                allele_alt_1 = allele_b
                obs = counts
                continue

            exp = counts
            varid = vcfr.get_varid(data)
            posn = vcfr.get_posn(data)

            if (allele_ref_1, allele_alt_1) != (allele_a, allele_b):
                self.allele_discord_count += 1
                logging.info(
                    "Allele discordancy: assay1=%s, assay2=%s, varid=%s, posn=%d, ref1=%s, alt1=%s, ref2=%s, alt2=%s",
                    assays[0], assays[i], varid, int(posn),
                    allele_ref_1, allele_alt_1, allele_a, allele_b)
                return False

            chi_stat, chi_p_value = chisquare(obs, f_exp=exp)
            if chi_p_value < chipval:
                self.chisq_count += 1
                logging.info(
                    "CHI SQ test REJECT: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, chipval=%e, obs=%s, exp=%s, at %d",
                    assays[0], assays[i], varid, int(posn), chi_stat,
                    chi_p_value, chipval, str(obs), str(exp), i)
                return False

            logging.info(
                "CHI SQ test OK: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, obs=%s, exp=%s, at %d",
                assays[0], assays[i], varid, int(posn), chi_stat,
                chi_p_value, str(obs), str(exp), i)

        return True
コード例 #2
0
  def process_variant_detail_vcf(self, record, assaytype):
    """Build a variant-detail document from one record and buffer it.

    Wraps ``record`` in a VCFrecord, assembles a JSON-style dict and appends
    it to ``self.variantbuff``.

    Document keys: ``assaytype``, ``rsid``, ``chromosome`` (zero-padded to
    two digits), ``alleleA``, ``alleleB``, ``position``, ``info`` and,
    when it can be read, ``ref_maf``.
    """
    vcfr = VCFrecord(record)
    # NOTE(review): the result is not used here; the call is retained in case
    # later getters rely on it - drop it if get_prfx_sfx() is a pure getter.
    vcfr.get_prfx_sfx()
    alleleA, alleleB = vcfr.get_alleles()

    doc = {
      "assaytype": assaytype,
      "rsid": vcfr.get_varid(),
      # always store chromosome as a 2-digit string
      "chromosome": "%.2d" % (int(vcfr.get_chr())),
      "alleleA": alleleA,
      "alleleB": alleleB,
      "position": vcfr.get_posn_as_int(),
    }

    # Both INFO-derived values are best effort. `except Exception` keeps that
    # tolerance for missing or non-numeric values without also swallowing
    # KeyboardInterrupt / SystemExit the way a bare `except:` does.
    try:
      doc["ref_maf"] = float(vcfr.get_info_value("RefPanelAF"))
    except Exception:
      # "ref_maf" is simply left out when it cannot be read.
      pass
    try:
      doc["info"] = float(vcfr.get_info_value("INFO"))
    except Exception:
      # Fall back to 1.0 when no usable INFO value is available.
      doc["info"] = 1.0

    self.variantbuff.append(doc)
コード例 #3
0
ファイル: correct_and_flip.py プロジェクト: PhilAppleby/GoDb
def get_dbsnp_rsid(dbsnpfile, chrom, posn):
  """Return the (rsid, ref_allele) pair found at chrom:posn.

  Args:
    dbsnpfile: object providing ``get_dbsnp_file_record(path, chrom, posn)``.
    chrom: chromosome, passed through to the lookup unchanged.
    posn: position; converted with ``int()`` before the lookup.

  Returns:
    ``(rsid, refallele)`` taken from the record that was found, or
    ``("", "")`` when the lookup returns None.

  The path handed to the lookup is read from the module-level
  ``options.dbsnpfile``, not from an argument.
  """
  dbsnprec = dbsnpfile.get_dbsnp_file_record(options.dbsnpfile, chrom, int(posn))
  if dbsnprec is None:
    return "", ""

  dbvcf = VCFrecord(dbsnprec)
  rsid = dbvcf.get_varid()
  # Only the first allele is reported; the second is deliberately ignored.
  refallele, _altallele = dbvcf.get_alleles()
  return rsid, refallele
コード例 #4
0
def main():
  """Write one CSV row of summary statistics per VCF record read from stdin.

  Lines starting with "#" are counted as header lines and skipped. For every
  other line the genotype counts are read through VCFrecord and a row with
  id, chromosome, position, alleles, minor allele, MAF, call rate and HWE
  value is printed to stdout under a fixed header row.

  If the HWE or MAF helper raises ZeroDivisionError the event is logged and
  the affected columns are written as "NA". Previously the row was printed
  with values left over from the preceding variant, or a NameError was
  raised when the very first record failed.

  Returns:
    Tuple ``(in_count, hdr_count, homr_total, het_total, homa_total,
    virt_nc_total, miss_total)``: lines read, header lines seen, and the
    genotype-count totals over all data records.
  """
  mafh = Mafhelper()
  hweh = Hwehelper()
  in_count = 0
  hdr_count = 0
  homr_total = 0
  het_total = 0
  homa_total = 0
  virt_nc_total = 0
  miss_total = 0

  # Single-argument call form prints the same text on Python 2 and Python 3.
  print("SNPId,AssayType,chr,pos,REF,ALT,Minor,MAF,CallRate,HWE_pval")

  for line in sys.stdin:
    line = line.strip()
    in_count += 1
    if line.startswith("#"):
      hdr_count += 1
      continue

    vcfr = VCFrecord(line)
    varid = vcfr.get_varid_ukb()
    chromosome = vcfr.get_chr()
    posn = vcfr.get_posn_as_int()
    ref, alt = vcfr.get_alleles()
    homref_count, het_count, homalt_count, virt_nc_count, miss_count = vcfr.get_allele_counts()

    call_count = homref_count + het_count + homalt_count
    # Only the "virtual" no-calls enter the denominator; miss_count is
    # tracked in the totals but does not affect the call rate.
    nocall_count = virt_nc_count
    attempted = call_count + nocall_count
    # A record with nothing called and nothing no-called gets rate 0.0
    # instead of aborting the whole run with ZeroDivisionError.
    call_rate = float(call_count) / float(attempted) if attempted else 0.0

    homr_total += homref_count
    het_total += het_count
    homa_total += homalt_count
    virt_nc_total += virt_nc_count
    miss_total += miss_count

    # Reset per record so that a failure below can never leak the previous
    # variant's statistics into this row.
    hwe = maf = ma = "NA"
    try:
      hwe = hweh.HWE_exact(het_count, homref_count, homalt_count, call_count)
      maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count, alt, virt_nc_count)
    except ZeroDivisionError:
      logging.info("DIV 0 error at %d (%d), where hom_r=%d, het=%d, home_a=%d, cc=%d", in_count, posn, homref_count, het_count, homalt_count, call_count)

    print("%s,combo,%s,%d,%s,%s,%s,%s,%.3f,%s" % (varid, chromosome, posn, ref, alt, ma, maf, call_rate, hwe))

  return in_count, hdr_count, homr_total, het_total, homa_total, virt_nc_total, miss_total
コード例 #5
0
def main(options):
    """Pivot VCF records from stdin into a sample-per-row CSV on stdout.

    "##" lines are ignored. The single-"#" header line supplies the sample
    column names. For every later record the variant id is added to the CSV
    header and each sample column contributes one call, looked up in the
    module-level ``icalls`` table by the first ":"-separated field of the
    genotype string; a genotype of "." is written as an empty cell.

    The output is a header row ("id" followed by the variant ids) and then
    one row per sample, in ``sampleDict`` iteration order.

    Args:
        options: accepted for interface compatibility; not read here.

    Returns:
        The number of sample rows written.
    """
    hdrData = ["id"]
    sampleDict = {}   # sample name -> list of calls, one per variant
    colPosns = {}     # sample column index -> sample name
    count = 0

    mafh = Mafhelper()

    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue

        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()

        if line.startswith('#'):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        # Allele flipping is currently switched off. It was meant to be
        # enabled when the minor allele reported by Mafhelper equals REF.
        flip = False
        varid = vcfr.get_varid_ukb()
        ref, alt = vcfr.get_alleles()
        probidx = vcfr.get_probidx()
        homref_count, het_count, homalt_count, nc_count, miss_count = (
            vcfr.get_allele_counts())
        # NOTE(review): maf / ma only feed the disabled flip check; the call
        # is retained so that any error it raises still surfaces.
        maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count,
                           alt, nc_count)
        hdrData.append(varid)

        for i, str_geno in enumerate(sfx):
            if str_geno == ".":
                sampleDict[colPosns[i]].append("")
                continue

            geno = str_geno.split(":")
            # NOTE(review): the result is unused; the call is retained in case
            # get_max_prob() validates the genotype fields.
            get_max_prob(geno, probidx)
            i_call = icalls[geno[0]]
            if flip:
                # Swap the two homozygous calls. The original compared
                # (`i_call == "2"`) where an assignment was intended, so
                # "0" was never turned into "2".
                if i_call == "0":
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            sampleDict[colPosns[i]].append(str(i_call))

    # Single-argument call form prints the same text on Python 2 and Python 3.
    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count