def main(options):
  """Rename the sample columns of a VCF stream read from stdin.

  Meta lines ('##') and data lines are echoed unchanged. On the '#' column
  header line every sample name is replaced by its entry in the sample map
  loaded from options.convfile.

  Args:
    options: parsed command-line options; only options.convfile is read.

  Returns:
    count, which is always 0 here because nothing increments it; it is still
    returned so callers that use the value keep working.
  """
  try:
    # 'with' closes the conversion file as soon as the map is loaded.
    with open(options.convfile, "r") as fh:
      smap = load_sample_map(fh)
  except Exception:
    # Not a bare except: KeyboardInterrupt/SystemExit must not be swallowed.
    print "Unexpected error:", sys.exc_info()[0]
    # Non-zero status so a calling shell pipeline can see the failure.
    sys.exit(1)

  count = 0
  for line in sys.stdin:
    line = line.strip()
    if line.startswith('#') and not line.startswith('##'):
      vcfr = VCFrecord(line)
      prfx, sfx = vcfr.get_prfx_sfx()
      # A sample name missing from smap is not handled here and propagates.
      renamed = [smap[elem] for elem in sfx]
      print "\t".join(prfx) + "\t" + "\t".join(renamed)
    else:
      print line

  return count
Exemple #2
0
def main(options):
    """Register the sample names of a VCF header (stdin) in the database.

    Meta lines ('##') and any line before the '#' column-header line are
    ignored. Each sample column of the header line is handed to
    godb.process_sample_detail together with its index and
    options.assaytype; the sample buffer is flushed whenever it grows past
    the module-level flush_at and once more at the end. Reading stops after
    the header line.

    Returns:
        The number of sample columns processed.
    """
    try:
        godb = GoDb()
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must not be swallowed.
        print "Unexpected error:", sys.exc_info()[0]
        # Non-zero status so a calling shell pipeline can see the failure.
        sys.exit(1)

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##') or not line.startswith('#'):
            continue
        vcfr = VCFrecord(line)
        _, sfx = vcfr.get_prfx_sfx()
        for idx, field in enumerate(sfx):
            count += 1
            godb.process_sample_detail(field, idx, options.assaytype)
            if godb.get_samples_len() > flush_at:
                godb.flush_sample_buff()
        # Only the column-header line carries sample names.
        break

    godb.flush_sample_buff()
    print ""
    return count
Exemple #3
0
 def get_variant_summary_probs(self, rsid, threshold):
     """Return the variant documents for rsid, annotated with summary stats.

     Every document returned by self.var_coll.get_variant_data_multi(rsid)
     gets the keys 'selected', 'a_af', 'b_af', 'hwe_p' and 'Missing' filled
     in from self.var_coll.get_genotype_probs, which is run on the sample
     columns of the variant's raw record.

     Returns:
         (documents, msg) where msg is "" when at least one document was
         found and a "Variant NOT FOUND" message otherwise.
     """
     summaries = []
     for entry in self.var_coll.get_variant_data_multi(rsid):
         # the chromosome is always used as a 2-digit string
         chrom = "%.2d" % int(entry["chromosome"])
         path = self.filepaths_coll.get_filepath(entry["assaytype"], chrom)
         raw = self.var_coll.get_raw_variant_values(path, chrom,
                                                    entry['position'])
         record = VCFrecord(raw)
         prfx, sfx = record.get_prfx_sfx()
         stats = self.var_coll.get_genotype_probs(sfx, threshold,
                                                  record.get_probidx())
         (gc_count_dict, sample_count, geno_count, maf, alleleAf, alleleBf,
          p_hwe) = stats
         entry['selected'] = 1
         entry['a_af'] = alleleAf
         entry['b_af'] = alleleBf
         entry['hwe_p'] = p_hwe
         if 'Missing' in gc_count_dict:
             entry['Missing'] = gc_count_dict['Missing']
         else:
             entry['Missing'] = 0
         summaries.append(entry)

     if summaries:
         return (summaries, "")
     return (summaries, "Variant NOT FOUND - %s, " % (rsid))
Exemple #4
0
def get_dbsnp_rsid(dbsnpfile, chrom, posn):
  """Look up the dbSNP id and reference allele at a chromosome position.

  Args:
    dbsnpfile: object providing get_dbsnp_file_record(path, chrom, posn).
    chrom: chromosome identifier, passed through to the lookup.
    posn: position; converted with int() before the lookup.

  Returns:
    (rsid, refallele); both are "" when the lookup returns None.

  NOTE(review): the dbSNP file path comes from a module-level `options`
  object rather than from a parameter - confirm `options` is always set
  before this function is called.
  """
  dbsnprec = dbsnpfile.get_dbsnp_file_record(options.dbsnpfile, chrom, int(posn))
  if dbsnprec is None:
    return "", ""
  dbvcf = VCFrecord(dbsnprec)
  rsid = dbvcf.get_varid()
  # Only the reference allele is returned; the alternate allele is ignored.
  refallele, _ = dbvcf.get_alleles()
  return rsid, refallele
Exemple #5
0
 def __init__(self, db, filedata_coll, sample_coll, gwasdb, probidx=1):
     """Keep the database handle, the helper collections and the call tables.

     Args:
         db: database handle; its `markers` attribute is stored as
             self.markers.
         filedata_coll: stored as self.filedata_coll.
         sample_coll: stored as self.sample_coll.
         gwasdb: stored as self.gwasdb.
         probidx: stored as self.probidx (default 1).
     """
     # Collaborators handed in by the caller.
     self.db, self.gwasdb = db, gwasdb
     self.filedata_coll, self.sample_coll = filedata_coll, sample_coll
     self.markers = db.markers
     self.probidx = probidx
     # Genotype call labels and integer codes
     # (presumably position-matched - TODO confirm).
     self.calls = ["0/0", "0/1", "1/1", "Missing"]
     self.icalls = [0, 1, 2, -9]
     # Record object built without an initial line.
     self.vcfr = VCFrecord()
Exemple #6
0
  def process_variant_detail_vcf(self, record, assaytype):
    """Build a JSON-style variant document from a VCF record and buffer it.

    The document holds the assay type plus the record's id, chromosome,
    alleles and position. "ref_maf" is only added when the RefPanelAF INFO
    value can be read as a float; "info" falls back to 1.0 when the INFO
    value cannot. The finished document is appended to self.variantbuff.

    Args:
      record: raw VCF record handed to VCFrecord.
      assaytype: stored in the document under "assaytype".
    """
    vcfr = VCFrecord(record)
    alleleA, alleleB = vcfr.get_alleles()
    doc = {
      "assaytype": assaytype,
      "rsid": vcfr.get_varid(),
      # always store chromosome as a 2-digit string
      "chromosome": "%.2d" % (int(vcfr.get_chr())),
      "alleleA": alleleA,
      "alleleB": alleleB,
      "position": vcfr.get_posn_as_int(),
    }
    # Both INFO values are optional. Exception (rather than a bare except)
    # keeps the best-effort behaviour without swallowing
    # KeyboardInterrupt/SystemExit.
    try:
      doc["ref_maf"] = float(vcfr.get_info_value("RefPanelAF"))
    except Exception:
      pass
    try:
      doc["info"] = float(vcfr.get_info_value("INFO"))
    except Exception:
      doc["info"] = 1.0

    self.variantbuff.append(doc)
Exemple #7
0
    def check_concordancies(self, data_list, assays, chipval):
        """Check that one variant's records agree across assay platforms.

        The first non-empty record is the reference. Each later non-empty
        record must carry the same REF/ALT alleles and must not differ
        significantly from the reference in its genotype counts (chi-square
        test on the hom-ref / het / hom-alt counts).

        Args:
            data_list: raw VCF records, one slot per assay; empty slots are
                skipped.
            assays: assay names in the same order as data_list (log output).
            chipval: p-value below which the chi-square test rejects.

        Returns:
            False at the first allele or chi-square discordance (after
            incrementing self.allele_discord_count or self.chisq_count),
            otherwise True.
        """
        ref_idx = None      # slot of the reference (first non-empty) record
        ref_alleles = None  # its (REF, ALT) pair
        obs = [0.0] * 3
        for i, vcf_record in enumerate(data_list):
            if len(vcf_record) == 0:
                continue
            # Every value below is read from the record parsed for this slot.
            vcfr = VCFrecord(vcf_record)
            (homref_count, het_count, homalt_count, nc_count,
             miss_count) = vcfr.get_allele_counts()
            # Add 1 to prevent 0-divide
            counts = [homref_count + 1, het_count + 1, homalt_count + 1]
            allele_a, allele_b = vcfr.get_alleles()

            if ref_alleles is None:
                ref_idx = i
                ref_alleles = (allele_a, allele_b)
                obs = counts
                continue

            exp = counts
            varid = vcfr.get_varid()
            posn = vcfr.get_posn_as_int()
            if ref_alleles != (allele_a, allele_b):
                self.allele_discord_count += 1
                logging.info(
                    "Allele discordancy: assay1=%s, assay2=%s, varid=%s, posn=%d, ref1=%s, alt1=%s, ref2=%s, alt2=%s",
                    assays[ref_idx], assays[i], varid, posn, ref_alleles[0],
                    ref_alleles[1], allele_a, allele_b)
                return False

            chi_stat, chi_p_value = chisquare(obs, f_exp=exp)
            if chi_p_value < chipval:
                self.chisq_count += 1
                logging.info(
                    "CHI SQ test REJECT: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, chipval=%e, obs=%s, exp=%s, at %d",
                    assays[ref_idx], assays[i], varid, posn, chi_stat,
                    chi_p_value, chipval, str(obs), str(exp), i)
                return False
            logging.info(
                "CHI SQ test OK: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, obs=%s, exp=%s, at %d",
                assays[ref_idx], assays[i], varid, posn, chi_stat,
                chi_p_value, str(obs), str(exp), i)

        return True
Exemple #8
0
def main():
  count = 0
  for line in sys.stdin:
    line = line.strip()
    if (line.startswith('##')):
      pass
    else:
      if (line.startswith('#')):
        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()
        for samp in sfx:
          print samp
        break


  return count
Exemple #9
0
def main(options):
  hdr = []
  hdrlen = 0
  count = 0
  try:
    fh = open(options.chrommap)
    chrom_map = load_chrom_map(fh)
  except:
    print "Unable to open", options.chrommap
    exit()

  for line in sys.stdin:
    count += 1
    line = line.strip()
    if (line.startswith('#')):
      print line
    else:
      vcfr = VCFrecord(line)
      strchrom = vcfr.get_chr()
      try:
        vcfr.set_chr(chrom_map[strchrom])
      except:
        logging.info("Chromosome not found in map %s, %s" % (options.chrommap, strchrom))
        exit()
      print vcfr.get_record()

  return count
Exemple #10
0
def main(options):
    hdr = []
    hdrlen = 0
    count = 0
    try:
        fh = open(options.chrommap)
        chrom_map = load_chrom_map(fh)
    except:
        print "Unable to open", options.chrommap
        exit()

    dbsnpfile = Dbsnpfile()
    dbsnpfile.set_tabix_file(options.dbsnpfile)
    for line in sys.stdin:
        count += 1
        line = line.strip()
        if (line.startswith('#')):
            print line
        else:
            vcfr = VCFrecord(line)
            posn = vcfr.get_posn_as_int()
            try:
                dbsnprecs = dbsnpfile.get_dbsnp_file_record(
                    options.dbsnpfile, chrom_map[options.chrom], posn)
            except:
                print "Chromosome not found in map", options.chrom
                exit()
            if len(dbsnprecs) > 0:
                vcfr.set_varid(dbsnpfile.get_rsid(dbsnprecs[0]))
            else:
                logging.info("NOT FOUND for %s at %d" % (options.chrom, posn))
            print vcfr.get_record()

    return count
Exemple #11
0
    def get_geno_data(self, rsid, sample_id, assaytype_list_posns):
        """Return one sample's raw genotype field for a variant, per assay.

        Args:
            rsid: variant id passed to self.get_variant_data_multi.
            sample_id: used as the prefix of the result keys.
            assaytype_list_posns: {assaytype: index of the sample within that
                assay's sample columns}.

        Returns:
            {sample_id + "_" + assaytype: genotype field} for every variant
            document whose assay type appears in assaytype_list_posns.
        """
        geno_values = {}
        for doc in self.get_variant_data_multi(rsid):
            assaytype = doc["assaytype"]
            if assaytype not in assaytype_list_posns:
                # The sample is not on this assay: skip before touching the
                # genotype file.
                continue
            # always force chromosome to 2 digits
            chromosome = "%.2d" % int(doc["chromosome"])
            fpath = self.filepaths_coll.get_filepath(assaytype, chromosome)
            fullrec = self.get_raw_variant_values(fpath, chromosome,
                                                  doc['position'])
            prfx, genodata = VCFrecord(fullrec).get_prfx_sfx()
            geno_values[sample_id + "_" + assaytype] = genodata[
                assaytype_list_posns[assaytype]]
        return geno_values
Exemple #12
0
 def __init__(self,
              db,
              dbname,
              projpref="akh",
              get_anochi=False,
              probidx=1):
     """Wire up the collection helpers that sit on top of one database.

     Args:
         db: database handle passed to every helper.
         dbname: stored as self.dbname.
         projpref: passed to _prochi_map (default "akh").
         get_anochi: passed to _prochi_map (default False).
         probidx: stored as self.probidx and passed to _markers.
     """
     # Plain settings and empty bookkeeping containers.
     self.db, self.dbname, self.probidx = db, dbname, probidx
     self.marker_totals = []
     self.call_rates = {}
     self.sample_count = -1

     # Helpers; _markers is built from the three helpers created before it.
     gwasdb = Gwasdb(db)
     filedata = _filedata(db)
     samples = _samples(db)
     self.gwasdb = gwasdb
     self.filedata_coll = filedata
     self.sam_coll = samples
     self.mkr_coll = _markers(db, filedata, samples, gwasdb, probidx)
     self.prochi_coll = _prochi_map(db, projpref, get_anochi)
     self.vcfr = VCFrecord()
def main(options):
    included_assaytypes = {
        "affy": 1,
        "illumina": 1,
        "broad": 1,
        "metabo": 1,
        "exome": 1
    }
    godb = GoDb()

    # Data structures
    atype_list = []
    atype_posns = {}
    marker_list = []
    rsid_assaytypes = {}
    rsid_dict = {}
    rsid_prfx_dict = {}
    rsid_cr_dict = {}
    rsid_info_dict = {}
    count = 0

    # Step 1 - get the list of entries for each rsid - one per assaytype

    vardocs = godb.get_multiple_variants(options.rsid)

    sampposns = godb.get_sample_posns(options.sampleid)

    for doc in vardocs:
        filepath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
        rec = godb.get_variant_file_data(filepath, doc["chromosome"],
                                         doc["position"])
        vcfr = VCFrecord(rec)
        prfx, sfx = vcfr.get_prfx_sfx()
        if doc["assaytype"] in sampposns:
            print "%s,%s,%s,%d,%s" % (
                options.rsid, options.sampleid, doc["assaytype"],
                sampposns[doc["assaytype"]], sfx[sampposns[doc["assaytype"]]])

    return count
Exemple #14
0
  def get_next_records(self, key_list, prfx_list, recbuff_list):
    """Advance every input file that currently holds the lowest key.

    For each file handle whose slot in the low-key list is not
    self.empty_key, the next line is read and the matching entries of
    key_list, prfx_list and recbuff_list are replaced. At end of file the
    prefix and record buffer become [] and the key becomes self.high_key.

    Args:
      key_list: current key (position) per file; updated in place.
      prfx_list: current record prefix per file; updated in place.
      recbuff_list: current sample columns per file; updated in place.

    Returns:
      (key_list, prfx_list, recbuff_list) - the same, updated, lists.
    """
    low_key_list, _ = self.get_low_key_list(key_list)

    for i, fh in enumerate(self.fh_list):
      if low_key_list[i] == self.empty_key:
        continue
      line = fh.readline().strip()
      if line == "":
        # testing for EOF (a blank line ends the file in the same way)
        prfx_list[i] = []
        recbuff_list[i] = []
        key_list[i] = self.high_key
        continue
      self.rec_counts[i] += 1
      prfx, sfx = VCFrecord(line).get_prfx_sfx()
      prfx_list[i] = prfx
      recbuff_list[i] = sfx
      # The second prefix field is used as the merge key.
      key_list[i] = int(prfx[1])
    return key_list, prfx_list, recbuff_list
Exemple #15
0
    def get_combined_array(self,
                           buffer_list,
                           cr_list,
                           assay_list,
                           threshold=0.9):
        """Merge the per-assay sample columns of one variant into one row.

        For each non-empty record in buffer_list, and each of its sample
        columns that is not ".":
        1) look up the column header via self.file_positions[i][j],
        2) map it to its slot through self.combined_positions,
        3) call the genotype for the threshold and place it in that slot.
        When the record has no "AT" format field, ":" plus the assay
        abbreviation is appended to the genotype. If the slot is already
        occupied, self.geno_overlap_count is incremented and
        self.call_genotype picks the value to keep.

        Args:
            buffer_list: raw VCF records, one slot per assay (empty = none).
            cr_list: not used by this method.
            assay_list: assay names in the same order as buffer_list.
            threshold: passed to self.call_geno_for_threshold.

        Returns:
            A list with one entry per combined position; "." where no assay
            supplied a genotype.

        TODO - CR check
        """
        combo_array = ["."] * len(self.combined_positions)
        for i, vcf_record in enumerate(buffer_list):
            if len(vcf_record) == 0:
                continue
            vcfr = VCFrecord(vcf_record)
            prfx, data_list = vcfr.get_prfx_sfx()
            probidx = vcfr.get_probidx()
            hasAT = vcfr.has_fmt("AT")
            for j, dataelem in enumerate(data_list):
                if dataelem == ".":
                    continue
                cpos = self.combined_positions[self.file_positions[i][j]]
                geno = self.call_geno_for_threshold(dataelem, probidx,
                                                    threshold)
                # Explicit comparison kept: the return type of has_fmt is
                # not visible here, so "not hasAT" might behave differently.
                if hasAT == False:
                    geno = geno + ":" + self.assay_abbrev[assay_list[i]]
                if combo_array[cpos] != ".":
                    self.geno_overlap_count += 1
                    geno = self.call_genotype(combo_array[cpos], geno,
                                              probidx)
                combo_array[cpos] = geno

        return combo_array
Exemple #16
0
def main():
  mafh = Mafhelper()
  hweh = Hwehelper()
  in_count = 0
  hdr_count = 0
  homr_total = 0
  het_total = 0
  homa_total = 0
  virt_nc_total = 0
  miss_total = 0

  print "SNPId,AssayType,chr,pos,REF,ALT,Minor,MAF,CallRate,HWE_pval"

  for line in sys.stdin:
    line = line.strip()
    in_count += 1
    if line.startswith("#"):
      hdr_count += 1
      continue


    vcfr = VCFrecord(line)
    varid = vcfr.get_varid_ukb()
    chromosome = vcfr.get_chr()
    posn = vcfr.get_posn_as_int()
    ref, alt = vcfr.get_alleles()
    homref_count, het_count, homalt_count, virt_nc_count, miss_count = vcfr.get_allele_counts()
    call_count = homref_count + het_count + homalt_count
    #nocall_count = virt_nc_count + miss_count
    nocall_count = virt_nc_count
    call_rate = float(call_count) / float(call_count + nocall_count)
    homr_total += homref_count
    het_total += het_count
    homa_total += homalt_count
    virt_nc_total += virt_nc_count
    miss_total += miss_count
    try:
      hwe = hweh.HWE_exact(het_count, homref_count, homalt_count, call_count)
      maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count, alt, virt_nc_count)
    except ZeroDivisionError:
      logging.info("DIV 0 error at %d (%d), where hom_r=%d, het=%d, home_a=%d, cc=%d", in_count, posn, homref_count, het_count, homalt_count, call_count)
    print "%s,combo,%s,%d,%s,%s,%s,%s,%.3f,%s" % (varid, chromosome, posn, ref, alt, ma, maf, call_rate, hwe)
  return in_count, hdr_count, homr_total, het_total, homa_total, virt_nc_total, miss_total
Exemple #17
0
def main(options):
  """Merge the per-assay VCF records of selected variants into combined rows.

  Steps:
    1. Collect the variant documents for every requested rsid, keeping only
       the assay types listed in included_assaytypes.
    2. Load the sample ids of each assay type found and build the merger.
    3. Get the combined column headers from the merger.
    4. Fetch the raw record of each variant per assay type.
    5. Print a header line and one combined record per rsid (after a
       concordance check when options.check == 'Y').

  Args:
    options: parsed command-line options; snpfile, rsids, prfx, check and
      chipval are read.

  Returns:
    The number of combined records written to stdout.
  """
  # Only these assay types are merged. Other names that have been used here:
  # "affy", "illumina", "metabo", "exome", "bigtest", "biggertest".
  included_assaytypes = {"broad":1}
  godb = GoDb()

  try:
    if options.snpfile is not None:
      # 'with' closes the snp file as soon as it has been loaded.
      with open(options.snpfile, "r") as fh:
        rsids = load_snpfile_data(fh)
    else:
      rsids = options.rsids.split(",")
  except IOError as e:
    print "I/O error({0}): {1}".format(e.errno, e.strerror)
    sys.exit(1)
  except (TypeError, AttributeError) as e:
    # options.rsids.split raises AttributeError when rsids is None.
    print "Missing arguments ", e
    sys.exit(1)
  except Exception:
    logging.info("Unexpected error: %s", str(sys.exc_info()))
    sys.exit(1)

  # Step 0 - instantiate helper objects and data structures
  mafh = Mafhelper()
  atype_list = []       # assay types found, in first-seen order
  atype_posns = {}      # assay type -> index in atype_list
  rsid_assaytypes = {}  # rsid -> variant documents kept for it
  rsid_dict = {}        # rsid -> raw record per assay slot ([] = none)
  rsid_cr_dict = {}     # rsid -> call rate per assay slot
  hdr_pref = ["#CHROM",  "POS", "ID",  "REF", "ALT", "QUAL",  "FILTER",  "INFO",  "FORMAT"]

  # Step 1 - get the list of entries for each rsid - one per assaytype
  for rsid in rsids:
    docs = godb.get_multiple_variants(rsid)
    if docs.count() > 0:
      rsid_assaytypes[rsid] = []
    else:
      logging.info("RSID %s NOTFOUND", rsid)

    # Step 1a - collect assaytypes and marker documents
    # This establishes a list order which must be observed throughout.
    for doc in docs:
      if doc["assaytype"] not in included_assaytypes:
        continue
      if doc["assaytype"] not in atype_list:
        atype_list.append(doc["assaytype"])
      rsid_assaytypes[rsid].append(doc)
  logging.info(str(atype_list))

  # Step 2 - collect lists of prochis (sample ids) by assaytype
  prochi_list = []
  for i, atype in enumerate(atype_list):
    atype_posns[atype] = i
    prochi_list.append(godb.get_samples(atype))

  mm = Multibuffermerge(prochi_list)

  # Step 3 - get combined col_header positions
  # The return value is not needed here; the call is kept in case it also
  # prepares state inside the merger - TODO confirm.
  mm.get_combined_positions()
  # combocol is a list [colname1, colname2, ...]; its order is kept intact
  combocol = mm.get_combined_columns()

  # Step 4 - for each variant by rsid, fetch the raw record per assay type
  for rsid in rsid_assaytypes:
    # One independent slot per assay type, in atype_list order.
    rsid_dict[rsid] = [[] for _ in atype_list]
    rsid_cr_dict[rsid] = [[] for _ in atype_list]
    for doc in rsid_assaytypes[rsid]:
      if options.prfx is not None:
        fpath = godb.get_full_filepath(doc["assaytype"], doc["chromosome"], options.prfx)
      else:
        fpath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
      logging.info("Assaytype=%s fpath=%s", doc["assaytype"], fpath)

      result = godb.get_variant_file_data(fpath, doc["chromosome"], doc["position"])
      if result is None:
        continue
      vcfr = VCFrecord(result)
      if vcfr.get_varid() != rsid:
        # The record found at this position belongs to another variant id.
        continue
      maf, ma, cr = mafh.get_maf_and_cr(vcfr)
      # TODO - ALSO check maf, also apply QC filter at individual record level
      slot = atype_posns[doc["assaytype"]]
      rsid_cr_dict[rsid][slot] = cr
      rsid_dict[rsid][slot] = result
      logging.info("%s (%s), maf=%s, ma=%s, cr=%s" % (doc["rsid"], doc["assaytype"], maf, ma, cr))

  # Step 5 - execute the merge process
  print "\t".join(hdr_pref + combocol)
  count = 0
  for rsid in rsid_dict:
    records = rsid_dict[rsid]
    # The first assay slot that actually holds a record supplies the output
    # prefix. Looking only at slot 0 would raise IndexError when no assay
    # type was kept, and would drop variants missing from the first assay.
    first_rec = None
    for candidate in records:
      if len(candidate) > 0:
        first_rec = candidate
        break
    if first_rec is None:
      continue

    concordant = True
    if options.check == 'Y':
      concordant = mm.check_concordancies(records, atype_list, options.chipval)
    if not concordant:
      logging.info("Concordancy check fail for - %s" % (rsid))
      continue

    comborec = mm.get_combined_array(records, rsid_cr_dict[rsid], atype_list)
    prfx, sfx = VCFrecord(first_rec).get_prfx_sfx()
    if len(prfx) > 0:
      logging.info("PRFX = %s, for %s", str(prfx), rsid)
      # The FORMAT column (index 8) gains the AT field.
      prfx[8] += ":AT"
      print "\t".join(prfx + comborec)
      count += 1
    else:
      logging.info("RSID %s NOTFOUND (2)", rsid)

  chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
  logging.info("CHI test count = %d, Allele discord count = %d, Overlap check count = %d",
    chi_test_count, allele_disc_count, overlap_count)

  return count
Exemple #18
0
    def get_rslist_data(self, input_rslist, threshold, download_list):
        """Build the SNP summary CSV and the sample values for a list of ids.

        For every id in input_rslist, each variant document (one per assay
        platform) contributes one CSV line with allele frequencies, MAF,
        imputed flag, call rate, HWE p-value and info score. The parsed
        records are then handed to self.get_sample_values.

        Args:
            input_rslist: variant ids to look up.
            threshold: passed to get_genotype_probs and get_sample_values.
            download_list: not used by this method.

        Returns:
            (pdata, snpdata, msg): the result of self.get_sample_values, the
            CSV text (header line included) and msg, which is always None.
        """
        msg = None
        rows = [
            "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"
        ]

        data = []
        assaytypelist = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}
        impDict = {}
        data_count = 0

        for rsid in input_rslist:
            docs = self.var_coll.get_variant_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                # always force chromosome to 2 digits
                chromosome = "%.2d" % int(doc["chromosome"])
                fpath = self.filepaths_coll.get_filepath(
                    doc["assaytype"], chromosome)
                fullrec = self.var_coll.get_raw_variant_values(
                    fpath, chromosome, doc['position'])
                vcfr = VCFrecord(fullrec)
                prfx, sfx = vcfr.get_prfx_sfx()
                probidx = vcfr.get_probidx()
                (gc_count_dict, sample_count, geno_count, maf, alleleAf,
                 alleleBf, p_hwe) = self.var_coll.get_genotype_probs(
                     sfx, threshold, probidx)

                key = doc["rsid"] + "_" + doc["assaytype"]
                Afreq[key] = alleleAf
                Bfreq[key] = alleleBf
                data_count += 1
                hwep = float(p_hwe)

                assaytypelist.append(doc["assaytype"])
                data.append(vcfr)
                if doc["assaytype"] not in assaytypes:
                    assaytypes[doc["assaytype"]] = 1

                # A document without an "info" score is treated like one
                # with info == 1.0, i.e. not imputed on that account.
                info = doc.get("info", 1.0)
                imputed = 0
                if "imputed" in doc or info != 1.0:
                    imputed = 1
                impDict[key] = imputed

                # Guard against a record without any samples.
                call_rate = 0.0
                if sample_count:
                    call_rate = float(geno_count) / sample_count

                rows.append(
                    "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" %
                    (doc["rsid"], doc["assaytype"], doc["chromosome"],
                     doc["position"], doc["alleleA"], alleleAf,
                     doc["alleleB"], alleleBf, maf, imputed, call_rate, hwep,
                     info))

        snpdata = "".join(rows)
        pdata = self.get_sample_values(assaytypelist, data, data_count, rslist,
                                       impDict, assaytypes, Afreq, Bfreq,
                                       threshold)
        return (pdata, snpdata, msg)
Exemple #19
0
def main(options):
    hdrData = ["id"]
    sampleDict = {}
    colPosns = {}
    RefAlleleDict = {}
    AltAlleleDict = {}
    count = 0

    mafh = Mafhelper()

    for line in sys.stdin:
        line = line.strip()
        if (line.startswith('##')):
            pass
        else:
            vcfr = VCFrecord(line)
            prfx, sfx = vcfr.get_prfx_sfx()
            #print prfx
            if (line.startswith('#')):
                # Parse out the header record.
                for i, col_hdr in enumerate(sfx):
                    colPosns[i] = col_hdr
                    sampleDict[col_hdr] = []
            else:
                flip = False
                varid = vcfr.get_varid_ukb()
                #logging.info("varid=%s", varid)
                ref, alt = vcfr.get_alleles()
                probidx = vcfr.get_probidx()
                hdr_allele = alt
                homref_count, het_count, homalt_count, nc_count, miss_count = vcfr.get_allele_counts(
                )
                call_count = homref_count + het_count + homalt_count
                maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count,
                                   alt, nc_count)
                RefAlleleDict[varid] = ref
                AltAlleleDict[varid] = alt
                #if ma == ref:
                #  flip = True
                #  hdr_allele = ref
                #  logging.info("FLIP for %s, %s, %s", varid, ref, alt)
                hdrData.append(varid)
                for i, str_geno in enumerate(sfx):
                    if str_geno != ".":
                        geno = str_geno.split(":")
                        max_prob, max_idx = get_max_prob(geno, probidx)
                        i_call = icalls[geno[0]]
                        if flip == True:
                            if i_call == "0":
                                i_call == "2"
                            elif i_call == "2":
                                i_call = "0"
                        sampleDict[colPosns[i]].append(str(i_call))
                    else:
                        sampleDict[colPosns[i]].append("")

    print ",".join(hdrData)
    for samp in sampleDict:
        count += 1
        print ",".join([samp] + sampleDict[samp])
    return count
Exemple #20
0
def main(options):
    """Merge the VCF records of options.file1 and stdin by position.

    The merge walks both inputs in step and therefore assumes each is sorted
    by position. A record present in only one input is written with an empty
    partner. Records at the same position are written combined when their
    alleles match; otherwise they are dropped and counted as discordant.

    Relies on the module-level names combined_samples, combined_positions,
    max_key, load_sample_positions, output_combined_hdr and
    output_combined_record.

    Returns:
        (f1_count, f2_count, out_count, discord_count): records read from
        file 1, records read from stdin, records written, and discordant
        pairs.
    """
    try:
        fh1 = open(options.file1, "r")
    except IOError as e:
        logging.info("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)
    except TypeError as e:
        # Format with %s: str + exception object would itself raise TypeError.
        logging.info("Missing arguments %s", e)
        sys.exit(1)
    except Exception:
        logging.info("Unexpected error:%s", sys.exc_info()[0])
        sys.exit(1)
    fh2 = sys.stdin

    vcff = VCFrecord()

    def read_next(fh, current):
        """Return (data, key, got_record) for the next record of fh.

        At end of input the current data is kept and the key is max_key.
        """
        line = fh.readline().strip()
        if line == "":
            return current, max_key, False
        data = vcff.get_data_array(line)
        return data, vcff.get_posn_from_array_as_int(data), True

    out_count = 0
    discord_count = 0
    try:
        f1_hdr, f2_hdr = load_sample_positions(fh1, fh2, vcff)

        for i, sample in enumerate(sorted(combined_samples)):
            combined_positions[sample] = i

        output_combined_hdr(f1_hdr, f2_hdr, vcff)

        data1, key1, got1 = read_next(fh1, [])
        data2, key2, got2 = read_next(fh2, [])
        f1_count = 1 if got1 else 0
        f2_count = 1 if got2 else 0

        # One FORMAT index list serves both inputs: output_combined_record
        # takes a single index argument. It comes from file 1's first record,
        # or from stdin's when file 1 holds no records.
        idxs1 = []
        if got1 or got2:
            fmts = vcff.get_fmts_from_array(data1 if got1 else data2)
            idxs1 = vcff.get_fmt_indices(fmts, ["GT", "GP"])

        while not (key1 == max_key and key2 == max_key):
            if key1 > key2:
                # Record only on stdin.
                output_combined_record([], data2, vcff, idxs1)
                out_count += 1
                data2, key2, got2 = read_next(fh2, data2)
                if got2:
                    f2_count += 1
            elif key2 > key1:
                # Record only in file 1.
                output_combined_record(data1, [], vcff, idxs1)
                out_count += 1
                data1, key1, got1 = read_next(fh1, data1)
                if got1:
                    f1_count += 1
            else:
                # On equality - check for allele concordance
                AlleleA1, AlleleB1 = vcff.get_alleles_from_array(data1)
                AlleleA2, AlleleB2 = vcff.get_alleles_from_array(data2)
                # TODO HWE concordance check - but how do we set thresholds?
                if (AlleleA1 == AlleleA2) and (AlleleB1 == AlleleB2):
                    output_combined_record(data1, data2, vcff, idxs1,
                                           vcff.get_call_rate_from_array(data1),
                                           vcff.get_call_rate_from_array(data2))
                    out_count += 1
                else:
                    discord_count += 1
                data1, key1, got1 = read_next(fh1, data1)
                if got1:
                    f1_count += 1
                data2, key2, got2 = read_next(fh2, data2)
                if got2:
                    f2_count += 1
    finally:
        # Only the file opened here is closed; sys.stdin is left open for
        # the rest of the process.
        fh1.close()

    return f1_count, f2_count, out_count, discord_count
Exemple #21
0
def main(options):
  """Merge the VCF records of options.file1 and stdin by position.

  The merge walks both inputs in step and therefore assumes each is sorted
  by position. A record present in only one input is written with an empty
  partner. Records at the same position are written combined when their
  alleles match; otherwise they are dropped and counted as discordant.

  Relies on the module-level names combined_samples, combined_positions,
  max_key, load_sample_positions, output_combined_hdr and
  output_combined_record.

  Returns:
    (f1_count, f2_count, out_count, discord_count): records read from
    file 1, records read from stdin, records written, and discordant pairs.
  """
  try:
    fh1 = open(options.file1, "r")
  except IOError as e:
    logging.info("I/O error({0}): {1}".format(e.errno, e.strerror))
    sys.exit(1)
  except TypeError as e:
    # Format with %s: str + exception object would itself raise TypeError.
    logging.info("Missing arguments %s", e)
    sys.exit(1)
  except Exception:
    logging.info("Unexpected error:%s", sys.exc_info()[0])
    sys.exit(1)
  fh2 = sys.stdin

  vcff = VCFrecord()

  def read_next(fh, current):
    """Return (data, key, got_record) for the next record of fh.

    At end of input the current data is kept and the key is max_key.
    """
    line = fh.readline().strip()
    if line == "":
      return current, max_key, False
    data = vcff.get_data_array(line)
    return data, vcff.get_posn_from_array_as_int(data), True

  out_count = 0
  discord_count = 0
  try:
    f1_hdr, f2_hdr = load_sample_positions(fh1, fh2, vcff)

    for i, sample in enumerate(sorted(combined_samples)):
      combined_positions[sample] = i

    output_combined_hdr(f1_hdr, f2_hdr, vcff)

    data1, key1, got1 = read_next(fh1, [])
    data2, key2, got2 = read_next(fh2, [])
    f1_count = 1 if got1 else 0
    f2_count = 1 if got2 else 0

    # One FORMAT index list serves both inputs: output_combined_record takes
    # a single index argument. It comes from file 1's first record, or from
    # stdin's when file 1 holds no records.
    idxs1 = []
    if got1 or got2:
      fmts = vcff.get_fmts_from_array(data1 if got1 else data2)
      idxs1 = vcff.get_fmt_indices(fmts, ["GT","GP"])

    while not (key1 == max_key and key2 == max_key):
      if key1 > key2:
        # Record only on stdin.
        output_combined_record([], data2, vcff, idxs1)
        out_count += 1
        data2, key2, got2 = read_next(fh2, data2)
        if got2:
          f2_count += 1
      elif key2 > key1:
        # Record only in file 1.
        output_combined_record(data1, [], vcff, idxs1)
        out_count += 1
        data1, key1, got1 = read_next(fh1, data1)
        if got1:
          f1_count += 1
      else:
        # On equality - check for allele concordance
        AlleleA1, AlleleB1 = vcff.get_alleles_from_array(data1)
        AlleleA2, AlleleB2 = vcff.get_alleles_from_array(data2)
        # TODO HWE concordance check - but how do we set thresholds?
        if (AlleleA1 == AlleleA2) and (AlleleB1 == AlleleB2):
          output_combined_record(data1, data2, vcff, idxs1, vcff.get_call_rate_from_array(data1), vcff.get_call_rate_from_array(data2))
          out_count += 1
        else:
          discord_count += 1
        data1, key1, got1 = read_next(fh1, data1)
        if got1:
          f1_count += 1
        data2, key2, got2 = read_next(fh2, data2)
        if got2:
          f2_count += 1
  finally:
    # Only the file opened here is closed; sys.stdin is left open for the
    # rest of the process.
    fh1.close()

  return f1_count, f2_count, out_count, discord_count
Exemple #22
0
class DataStore():
    def __init__(self,
                 db,
                 dbname,
                 projpref="akh",
                 get_anochi=False,
                 probidx=1):
        """Wire up the collection wrappers that sit on top of ``db``.

        db         -- database handle handed to every collection wrapper
        dbname     -- name reported by get_db_name()
        projpref   -- passed straight through to _prochi_map
        get_anochi -- passed straight through to _prochi_map
        probidx    -- index of the probability field inside a colon-separated
                      sample value (see get_probidx())
        """
        # Plain state.
        self.db = db
        self.dbname = dbname
        self.probidx = probidx
        self.marker_totals = []  # filled on first get_marker_totals() call
        self.sample_count = -1  # -1 means "not fetched yet"
        self.call_rates = {}

        # Collaborators; the marker wrapper needs the three created before it.
        self.gwasdb = Gwasdb(db)
        self.filedata_coll = _filedata(db)
        self.sam_coll = _samples(db)
        self.mkr_coll = _markers(db, self.filedata_coll, self.sam_coll,
                                 self.gwasdb, probidx)
        self.prochi_coll = _prochi_map(db, projpref, get_anochi)
        self.vcfr = VCFrecord()

    def get_db_name(self):
        return self.dbname

    def get_probidx(self):
        return self.probidx

    def make_selection_key(self, varid, assaytype):
        """Return the composite key ``<varid>_<assaytype>``."""
        return "_".join((varid, assaytype))

    def get_rsid_prochi_data(self, rsid, prochi, threshold, filefmt):
        """
      Get  data for the prochi, dict {platform:position_in_list}
      Get marker data for the rsid (up to num of platforms)
    """
        assaytype_list_posns = {}

        sdocs = self.sam_coll.get_sampledata(prochi)

        for sdoc in sdocs:
            #print sdoc["assaytype"]
            assaytype_list_posns[sdoc["assaytype"]] = sdoc["list_posn"]

        genotypes = self.mkr_coll.get_geno_data(rsid, prochi,
                                                assaytype_list_posns)

        for genotype in genotypes:
            print genotype, genotypes[genotype]

    def get_marker_data_for_file(self, filepath, threshold):
        msg = None
        try:
            f = open(filepath, "r")
        except IOError as e:
            msg = filepath + ":" + e.strerror
            return ([], [], msg)
        count = 0

        rslist = []
        for line in f:
            count += 1
            if count > 500:
                msg = "line count for %s gt the limit (%d)" % (filepath, 500)
                return ([], [], msg)
            line = line.strip()
            elems = line.split()
            rslist.append(elems[0])

        f.close()
        marker_data = []
        msg = ""
        for rsid in rslist:
            (marker_docs,
             tmsg) = self.get_marker_summary_probs(rsid, threshold)
            if (tmsg != ""):
                msg += tmsg
            for doc in marker_docs:
                marker_data.append(doc)
        return marker_data, msg

    def get_marker_data_by_range(self, chr, start, end, threshold=0.9):
        rslist = []
        marker_data = []
        msg = ""
        return (self.mkr_coll.get_marker_data_by_range(chr, start, end))

    def get_range_data(self, chr, start, end, threshold, download_list):
        (docs, msg) = self.mkr_coll.get_marker_data_by_range(chr, start, end)
        if len(docs) == 0:
            return ([], [], msg)

        rsdict = {}
        rslist = []
        for doc in docs:
            #print"RANGE rsid", doc["rsid"], doc["assaytype"], doc["position"]
            rsdict[doc["rsid"]] = 1

        for rsid in rsdict:
            rslist.append(rsid)

        #return([], [], "")
        #print "Call get rslist data"
        return (self.get_rslist_data(rslist, threshold, download_list))

    def build_csv_data(self, rslist, sampleDict, assaytypes, threshold):
        # NOTE: maintaining rslist order is vital!
        rsidx = {}
        normalised = False
        idx = 0
        for rsid in rslist:
            rsidx[rsid] = idx
            idx += 1

        assaytypes['combined'] = 1
        by_platform_data = {}
        for assaytype in assaytypes:
            by_platform_data[assaytype] = []
        hdrData = ["sampleId"]
        for rsid in rslist:
            #print "RSID", rsid
            hdrData.append(rsid)
            hdrData.append(rsid + "_c")
            hdrData.append(rsid + "_p")
            hdrData.append(rsid + "_alt")
        hdrString = ','.join(hdrData)

        for assaytype in assaytypes:
            by_platform_data[assaytype].append(hdrString)
        for samp in sampleDict:
            output_lines = {}
            filled_output_lines = {}
            filled_output_lines['combined'] = True
            for assaytype in assaytypes:
                output_lines[assaytype] = ["" for x in range(len(rslist) * 4)]
            for rsid in sampleDict[samp]:
                if len(
                        sampleDict[samp][rsid]
                ) > 0:  # if a sample wasn't genotyped on any platform there might not be data
                    idxoffset = rsidx[rsid] * 4
                    # resolve_geno is at the crux - need to change to test CR?
                    #logging.info("Call resolve_geno %s, %s", samp, str(sampleDict[samp][rsid]))
                    geno_data = self.resolve_geno(sampleDict[samp][rsid], rsid,
                                                  samp, threshold)
                    dataVals = geno_data[2].split(':')
                    probVals = dataVals[self.get_probidx()].split(',')
                    (probcall, intcall,
                     outprob) = self.mkr_coll.get_call(probVals, threshold)
                    #intcall, normalised = self.get_integer_call(intcall, geno_data[5], geno_data[6])
                    output_lines['combined'][idxoffset] = str(intcall)
                    output_lines['combined'][idxoffset + 1] = str(outprob)
                    output_lines['combined'][idxoffset + 2] = geno_data[0]
                    output_lines['combined'][idxoffset + 3] = geno_data[4]
                    for geno_data in sampleDict[samp][rsid]:
                        dataVals = geno_data[2].split(':')
                        probVals = dataVals[self.get_probidx()].split(',')
                        (probcall, intcall, outprob) = self.mkr_coll.get_call(
                            probVals, threshold)
                        #intcall, normalised = self.get_integer_call(intcall, geno_data[5], geno_data[6])
                        output_lines[geno_data[0]][idxoffset] = str(intcall)
                        output_lines[geno_data[0]][idxoffset +
                                                   1] = str(outprob)
                        output_lines[geno_data[0]][idxoffset +
                                                   2] = geno_data[0]
                        output_lines[geno_data[0]][idxoffset +
                                                   3] = geno_data[4]
                        filled_output_lines[geno_data[0]] = True

            for assaytype in assaytypes:
                if assaytype in filled_output_lines:
                    by_platform_data[assaytype].append(
                        samp + "," + ",".join(output_lines[assaytype]))
        return by_platform_data

    def build_gen_data(self, rslist, sampleDict, threshold):
        data = []
        hdrData = ["rsid"]
        for sampleid in sorted(sampleDict):
            hdrData.append(sampleid)
            hdrData.append(sampleid)
            hdrData.append(sampleid)
        hdrString = ' '.join(hdrData)
        data.append(hdrString)
        for rsid in rslist:
            line = rsid + " "
            for samp in sorted(sampleDict):
                geno_data = self.resolve_geno(sampleDict[samp][rsid], rsid,
                                              samp, threshold)
                if len(geno_data) > 2:
                    dataVals = geno_data[2].split(':')
                    probVals = dataVals[self.get_probidx()].split(',')
                    line += ' '.join(probVals) + " "
                else:
                    line += ' '.join(["0", "0", "0"]) + " "

            data.append(line[:-1])
        return data

    def get_integer_call(self, intcall, afreq, bfreq, normalise=False):
        """Optionally flip an integer call so it counts the rarer allele.

        Returns ``(call, normalised)``. The call is only touched when
        ``normalise`` equals True and ``afreq`` is below ``bfreq``: 2 becomes
        0, 0 becomes 2 and anything else is left alone. ``normalised``
        reports whether that flip branch was taken.
        """
        if not (normalise == True and afreq < bfreq):
            return intcall, False

        if intcall == 2:
            flipped = 0
        elif intcall == 0:
            flipped = 2
        else:
            flipped = intcall
        return flipped, True

    def resolve_geno(self, genlist, rsid, samp, threshold):
        maxprob = 0.0
        maxidx = -1
        if len(genlist) == 1:
            return genlist[0]
        elif len(genlist) > 1:
            for idx, gendata in enumerate(genlist):
                #if gendata[1] == 0:
                #  #print "Decided on D Type:", genlist[idx][0], rsid, samp
                #  return genlist[idx]
                dataVals = gendata[2].split(':')
                probVals = dataVals[self.get_probidx()].split(',')
                (probcall, intcall,
                 outprob) = self.mkr_coll.get_call(probVals, threshold)
                if outprob > maxprob:
                    maxprob = outprob
                    maxidx = idx
        if maxprob > 0.0:
            #print "Decided on prob:", maxprob, maxidx, rsid, samp
            return genlist[maxidx]
        return []

    def get_marker_summary_probs(self, rsid, threshold):
        marker_array = []
        msg = ""
        #print "get_marker_summary_probs:", threshold
        docs = self.mkr_coll.get_marker_data_multi(rsid)
        for doc in docs:
            fpath = self.filedata_coll.get_filepath(doc["assaytype"],
                                                    doc['chromosome'])
            # this is not ideal - need to get on top of this chromosome id thing
            chr = "%.2d" % int(doc["chromosome"])
            rec, fullrec = self.mkr_coll.get_raw_marker_values(
                fpath, doc["rsid"], chr, doc['position'])
            #print "REC", rec
            prfx, sfx = self.vcfr.get_prfx_sfx_from_array(rec)
            (gc_count_dict, sample_count, geno_count, maf, alleleAf, alleleBf,
             p_hwe) = self.mkr_coll.get_genotype_probs(sfx, threshold)
            doc['selected'] = 1
            doc['a_af'] = alleleAf
            doc['b_af'] = alleleBf
            doc['hwe_p'] = p_hwe
            doc['Missing'] = 0
            if 'Missing' in gc_count_dict:
                doc['Missing'] = gc_count_dict['Missing']
            marker_array.append(doc)
        if len(marker_array) == 0:
            msg = "Variant NOT FOUND - %s, " % (rsid)
        return (marker_array, msg)

    def get_marker_data(self, markerid, assaytype):
        return self.mkr_coll.get_marker_data(markerid, assaytype)

    def get_sample(self, sampleid):
        return self.sam_coll.get_sample(sampleid)

    def get_marker_totals(self):
        if len(self.marker_totals) == 0:
            self.marker_totals = self.mkr_coll.get_marker_totals()
        return self.marker_totals

    def get_sample_count(self, assaytype):
        if self.sample_count == -1:
            self.sample_count = self.sam_coll.get_count(assaytype)
        return self.sample_count

    def get_all_samples(self):
        return self.sam_coll.get_all_samples()

    def convert_to_prochi(self, cvt_value):
        """
    Convert the supplied value to prochi by whichever method works (or return supplied value if all else fails)
    """
        rtn_value = self.get_prochi_from_mprochi(cvt_value)
        if rtn_value == None:
            rtn_value = self.get_prochi_from_plateid(cvt_value)
        if rtn_value == None:
            rtn_value = cvt_value
        return rtn_value

    def get_prochi_from_mprochi(self, mprochi):
        """
    Get the prochi_maps value for the supplied arg
    """
        return self.prochi_coll.get_anochi_or_prochi_from_mprochi(mprochi)

    def get_prochi_from_plateid(self, plateid):
        """
    Get the prochi_maps value for the supplied arg
    """
        return self.prochi_coll.get_prochi_from_plateid(plateid)

    def get_converted_samples(self):
        return [
            self.convert_to_prochi(samp)
            for samp in self.sam_coll.get_all_samples()
        ]

    def make_zipfile(self, sample_return_data, snp_return_data, uploadDir,
                     zipfilename):
        """Write the result CSVs to a zip archive and return its contents.

        sample_return_data -- {assaytype: [csv line, ...]}; each value
                              becomes ``<assaytype>_samples.csv``
        snp_return_data    -- text stored as ``snp_summary.csv``
        uploadDir          -- directory the archive is written to
        zipfilename        -- file name of the archive inside uploadDir

        Returns the raw content of the archive file. The archive is left on
        disk.
        """
        # Join everything first so a bad value fails before a file is made.
        ares = {}
        for assaytype in sample_return_data:
            ares[assaytype] = '\n'.join(sample_return_data[assaytype])

        zipname = uploadDir + "/" + zipfilename
        with ZipFile(zipname, 'w') as resZip:
            resZip.writestr('snp_summary.csv', snp_return_data, ZIP_DEFLATED)
            for assaytype in ares:
                resZip.writestr(assaytype + '_samples.csv', ares[assaytype],
                                ZIP_DEFLATED)

        # A zip archive is binary data: text mode can translate or fail to
        # decode bytes, so read it back in binary mode.
        with open(zipname, 'rb') as f:
            return f.read()

    def get_rslist_file_data(self, filepath, threshold, download_list):
        msg = None
        try:
            f = open(filepath, "r")
        except IOError as e:
            msg = filepath + ":" + e.strerror
            return ([], [], msg)
        count = 0

        rslist = []
        for line in f:
            count += 1
            if count > 500:
                msg = "line count for %s gt the limit (%d)" % (filepath, 500)
                return ([], [], msg)
            line = line.strip()
            elems = line.split()
            rslist.append(elems[0])

        f.close()
        logging.info("get_rslist_file_data: %.2f", float(threshold))
        return (self.get_rslist_data(rslist, threshold, download_list))

    def get_rslist_data(self, input_rslist, threshold, download_list):
        msg = None
        snpdata = "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"

        data = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}

        data_count = 0
        impDict = {}
        for rsid in input_rslist:
            docs = self.mkr_coll.get_marker_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                #print doc
                # first get filepath
                #select_key = self.make_selection_key(doc["rsid"], doc["assaytype"])
                #if select_key in download_list:
                fpath = self.filedata_coll.get_filepath(
                    doc["assaytype"], doc["chromosome"])
                # this is not ideal - need to get on top of this chromosome id thing
                chr = "%.2d" % int(doc["chromosome"])
                # get raw marker data
                rec, fullrec = self.mkr_coll.get_raw_marker_values(
                    fpath, doc["rsid"], chr, doc['position'])
                geno_count = 0
                sample_count = 0
                hwep = 0.0
                #print "REC", rec, fpath
                prfx, sfx = self.vcfr.get_prfx_sfx_from_array(rec)
                (gc_count_dict, sample_count, geno_count, maf, alleleAf,
                 alleleBf,
                 p_hwe) = self.mkr_coll.get_genotype_probs(sfx, threshold)
                Afreq[doc["rsid"] + "_" + doc["assaytype"]] = alleleAf
                Bfreq[doc["rsid"] + "_" + doc["assaytype"]] = alleleBf
                data_count += 1
                hwep = float(p_hwe)

                # add record to list of all records for the rslist
                data.append(doc["assaytype"] + '\t' + fullrec)
                if doc["assaytype"] not in assaytypes:
                    assaytypes[doc["assaytype"]] = 1

                imputed = 0
                if "imputed" in doc:
                    imputed = 1
                impDict[doc["rsid"] + "_" + doc["assaytype"]] = imputed
                #print doc["rsid"], doc["assaytype"]
                if "cohort_1_hwe" in doc:
                    hwep = doc["cohort_1_hwe"]
                snpdata += "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" % (
                    doc["rsid"], doc["assaytype"], doc["chromosome"],
                    doc["position"], doc["alleleA"],
                    alleleAf, doc["alleleB"], alleleBf, maf, imputed,
                    float(geno_count) / sample_count, hwep, doc["info"])
                #else: # not selected
                #logging.info("Line was unselected: %s", select_key)

        pdata = self.get_sample_values(data, data_count, rslist, impDict,
                                       assaytypes, Afreq, Bfreq, threshold)
        return (pdata, snpdata, msg)

    def get_sample_values(self, records, numrecs, rslist, impDict, platforms,
                          Afreq, Bfreq, threshold):
        """Process vcf records
      NOTE: impDict keyed on a composite of rsid_platform 
       """
        #print 'START 1', len(records)
        #print impDict
        first_sample_idx = 9 + 1  # add 1 due to forcing assaytype in as col 0
        samplesByAt = {}
        for platform in platforms:
            samplesByAt[platform] = [
                self.convert_to_prochi(samp)
                for samp in self.sam_coll.get_samples(platform)
            ]
        # A dict of dicts of tables
        sampleDict = {}
        dupDict = {}
        dupcount = 0
        for platform in samplesByAt:
            for samp in samplesByAt[platform]:
                if samp not in sampleDict:
                    sampleDict[samp] = {}
                    for rsid in rslist:
                        sampleDict[samp][rsid] = []

        #print '2'
        count = 0
        rscount = 0
        values_totals = 0
        for line in records:
            linedata = line.split('\t')
            platform = linedata[0]
            chr = linedata[1]
            pos = linedata[2]
            rsid = linedata[3]
            alleleA = linedata[4]
            alleleB = linedata[5]
            #print "rec", rscount, rsid, platform, pos, len(linedata)
            if platform not in samplesByAt:
                print platform, "not cached"
                samplesByAt[platform] = [
                    self.convert_to_prochi(samp)
                    for samp in self.sam_coll.get_samples(platform)
                ]
            for idx, elem in enumerate(linedata):
                #print idx,
                if idx >= first_sample_idx:
                    arridx = idx - first_sample_idx
                    if samplesByAt[platform][arridx] in sampleDict:
                        sampleId = samplesByAt[platform][arridx]
                        values_totals += 1
                        sampleDict[sampleId][rsid].append([
                            platform, impDict[rsid + "_" + platform],
                            linedata[idx], alleleA, alleleB,
                            Afreq[rsid + "_" + platform],
                            Bfreq[rsid + "_" + platform]
                        ])
            print "x"
            rscount += 1

        print '3'
        by_platform_data = self.build_csv_data(rslist, sampleDict, platforms,
                                               threshold)

        return (by_platform_data)
Exemple #23
0
class _markers():
    def __init__(self, db, filedata_coll, sample_coll, gwasdb, probidx=1):
        """Keep the handles this wrapper needs around ``db.markers``.

        ``calls`` and ``icalls`` are parallel lists indexed by get_call():
        positions 0-2 are the three genotype calls, position 3 is the
        missing call.
        """
        # Handles passed in by the owner.
        self.db = db
        self.markers = db.markers
        self.gwasdb = gwasdb
        self.filedata_coll = filedata_coll
        self.sample_coll = sample_coll
        self.probidx = probidx

        # Call tables.
        self.calls = ["0/0", "0/1", "1/1", "Missing"]
        self.icalls = [0, 1, 2, -9]

        self.vcfr = VCFrecord()

    def get_probidx(self):
        return self.probidx

    def get_marker_data(self, markerid, assaytype):
        """Get the document for a genetic marker / assay platform combination.

        Returns whatever ``find_one`` yields for the rsid/assaytype query, or
        None when the query raises (the exception type is reported on
        stdout).
        """
        query = {}
        query['rsid'] = markerid
        query['assaytype'] = assaytype

        # Bound up front so a failed query returns None instead of hitting
        # an unbound local below.
        doc = None
        try:
            doc = self.markers.find_one(query)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]

        return doc

    def get_marker_data_multi(self, markerid):
        """Get every document for a genetic marker, one per assay platform.

        ``markerid`` is matched against the ``rsid`` field (DBSNP rs number
        or chrn:pos:I|D format). Each returned document gets a
        ``samplecount`` entry with the sample count of its platform.

        Returns [] when nothing matches or when the query raises (the
        exception type is reported on stdout).
        """
        query = {}
        query['rsid'] = markerid
        docs = []

        try:
            cursor = self.markers.find(query)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]
            # No cursor to walk: hand back the empty list.
            return docs

        for doc in cursor:
            doc["samplecount"] = self.sample_coll.get_count(doc["assaytype"])
            docs.append(doc)
        return docs

    def get_marker_data_by_range(self, chr, start, end):
        """
    Get the data for genetic markerwithin a range 
    """
        docs = []
        msg = ""
        start_pos = int(start)
        end_pos = int(end)

        # Some basic sanity checking
        if (end_pos - start_pos) > 250000:
            msg = "Range is too great should be 250Kb or less [%d]" % (
                end_pos - start_pos)
            return (docs, msg)
        if (end_pos - start_pos) < 0:
            msg = "Start pos is greater than End pos"
            return (docs, msg)

        query = {}
        query['chromosome'] = chr = "%.2d" % (int(chr))
        query['position'] = {}
        query['position']['$gte'] = start_pos
        query['position']['$lte'] = end_pos

        print "RANGE QUERY", query

        try:
            cursor = self.markers.find(query)
        except:
            msg = "Unexpected error:" + sys.exc_info()[0]

        for doc in cursor:
            if len(doc["alleleA"]) > 10:
                doc["alleleA"] = doc["alleleA"][0:10] + " ..."
            if len(doc["alleleB"]) > 10:
                doc["alleleB"] = doc["alleleB"][0:10] + " ..."
            doc["samplecount"] = self.sample_coll.get_count(doc["assaytype"])
            docs.append(doc)
        # can return [] if query fails
        if len(docs) == 0:
            msg = "Nothing found in range"
        return (docs, msg)

    def get_marker_totals(self):
        """ Use agg framework to get totals by CHR
    """
        chr_totals = []
        curs = self.db.markers.aggregate([{
            "$group": {
                "_id": {
                    "chr": "$chromosome",
                    "at": "$assaytype"
                },
                "mkrsPerChrom": {
                    "$sum": 1
                }
            }
        }, {
            "$sort": {
                "_id": 1
            }
        }])
        for doc in curs['result']:
            #print doc['_id'], doc['mkrsPerChrom']
            chr_totals.append((doc['_id'], doc['mkrsPerChrom']))
        return chr_totals

    def get_raw_marker_values(self, filepath, variantid, chr, posn):
        """Access a tabix-indexed vcf file to extract the data for one marker.

        filepath  -- path of the tabix-indexed file
        variantid -- id the record must carry
        chr       -- chromosome; values above 22 are looked up as "NA"
        posn      -- integer position the record must carry

        Returns ``(rec, rtn_rec)``: the record split into fields and the raw
        record. When several records match, the last one wins; when none
        does the result is ``([], "")``.
        """
        tabixFile = pysam.Tabixfile(filepath)
        # try/finally so the file handle is released on every path.
        try:
            if int(chr) > 22:
                chr = "NA"
            else:
                chr = str(chr)

            rec = []
            rtn_rec = ""

            try:
                records = tabixFile.fetch(chr, posn - 1, posn)
            except ValueError:
                # Retry without the first character, e.g. "01" -> "1".
                # NOTE(review): presumably the file names its contigs
                # without zero padding - confirm.
                chr = chr[1:]
                records = tabixFile.fetch(chr, posn - 1, posn)

            # The records must be consumed before the file is closed.
            for record in records:
                data = self.vcfr.get_data_array(record)
                dvarid = self.vcfr.get_var_id_from_array(data)
                dposn = self.vcfr.get_posn_from_array(data)
                if (dvarid == variantid) and (int(dposn) == posn):
                    rec = data
                    rtn_rec = record
        finally:
            tabixFile.close()

        return rec, rtn_rec

    def get_genotype_probs(self, sample_values, threshold, has_GP=True):
        """Summarise the sample values of one marker by called genotype.

        Each sample value is split on ':' and its probability field (at
        get_probidx()) is turned into a call with get_call(). ``has_GP`` is
        not used.

        Returns a 7-tuple: the {call: count} dict, the number of samples,
        the number of samples with one of the three genotype calls, and the
        results of maf(), af() for each allele and HWE_exact() computed from
        those three counts.
        TODO: deal with user-supplied threshold
        TODO: what to do when has_GP is false
        """
        sample_count = 0
        genotype_counts = {}
        for sample_value in sample_values:
            sample_count += 1
            fields = sample_value.split(':')
            probVals = fields[self.get_probidx()].split(',')
            (key, ccode, maxprob) = self.get_call(probVals, threshold)
            if key in genotype_counts:
                genotype_counts[key] += 1
            else:
                genotype_counts[key] = 1

        hom1_ct = genotype_counts.get("0/0", 0)
        het_ct = genotype_counts.get("0/1", 0)
        hom2_ct = genotype_counts.get("1/1", 0)
        # Only the three genotype calls count as genotyped.
        geno_count = hom1_ct + het_ct + hom2_ct

        mafr = maf(het_ct, hom1_ct, hom2_ct, geno_count)
        AlleleAfr = af(het_ct, hom1_ct, hom2_ct, geno_count)
        AlleleBfr = af(het_ct, hom2_ct, hom1_ct, geno_count)
        p_hwe = HWE_exact(het_ct, hom1_ct, hom2_ct, geno_count)
        return (genotype_counts, sample_count, geno_count, mafr, AlleleAfr,
                AlleleBfr, p_hwe)

    def get_call(self, probs, threshold):
        max_prob = 0.0
        max_idx = 3

        #print "Probs:", probs
        for idx, prob in enumerate(probs):
            if float(prob) > max_prob:
                max_prob = float(prob)
                max_idx = idx

        if (threshold != 0.0):
            #print 'threshold', threshold, max_prob
            if max_prob < threshold:
                #print 'LT threshold', threshold, max_prob
                max_idx = 3

        return (self.calls[max_idx], self.icalls[max_idx], max_prob)

    def get_geno_data(self, rsid, sample_id, assaytype_list_posns):
        first_sample_idx = 9
        geno_values = {}
        docs = self.get_marker_data_multi(rsid)

        for doc in docs:
            # first get filepath
            fpath = self.filedata_coll.get_filepath(doc["assaytype"],
                                                    doc["chromosome"])
            # this is not ideal - need to get on top of this chromosome id thing
            chr = "%.2d" % int(doc["chromosome"])
            rec, fullrec = self.get_raw_marker_values(fpath, doc["rsid"], chr,
                                                      doc['position'])

            if doc["assaytype"] in assaytype_list_posns:
                prfx, genodata = self.vcfr.get_prfx_sfx_from_array(rec)
                print "list posn", rsid, sample_id, assaytype_list_posns[
                    doc["assaytype"]]
                geno_values[sample_id + "_" + doc["assaytype"]] = genodata[
                    assaytype_list_posns[doc["assaytype"]]]
        return (geno_values)