Example #1
0
def main(options):
  """Rename the sample columns of a VCF stream read from stdin.

  The conversion map is loaded from ``options.convfile`` through
  ``load_sample_map``. Meta lines (``##``) and data lines are echoed to
  stdout unchanged (after stripping surrounding whitespace). On the
  column-header line (``#``) every suffix field returned by
  ``VCFrecord.get_prfx_sfx`` is replaced by its entry in the map.

  If the map cannot be opened or loaded, the exception type is printed and
  the process exits. A header field with no entry in the map propagates the
  lookup error raised by the map (a KeyError if it is a dict).

  Args:
    options: object with a ``convfile`` attribute (path of the map file).

  Returns:
    Always 0; the counter is never incremented.
  """
  try:
    # Context manager so the map file is closed even when loading fails.
    with open(options.convfile, "r") as fh:
      smap = load_sample_map(fh)
  except Exception:
    # "except Exception" (not a bare except) lets Ctrl-C / SystemExit through.
    print("Unexpected error: %s" % (sys.exc_info()[0],))
    sys.exit()

  count = 0
  for line in sys.stdin:
    line = line.strip()
    if line.startswith('#') and not line.startswith('##'):
      # Column-header line: translate every sample column name.
      vcfr = VCFrecord(line)
      prfx, sfx = vcfr.get_prfx_sfx()
      sfx = [smap[elem] for elem in sfx]
      print("\t".join(prfx) + "\t" + "\t".join(sfx))
    else:
      print(line)

  return count
Example #2
0
 def get_variant_summary_probs(self, rsid, threshold):
     """Return the variant documents for ``rsid`` enriched with summary stats.

     Every document returned by ``var_coll.get_variant_data_multi`` is
     updated in place with the keys ``selected``, ``a_af``, ``b_af``,
     ``hwe_p`` and ``Missing``, using the values reported by
     ``var_coll.get_genotype_probs`` for the raw record of that document.

     Returns:
         (documents, msg): ``msg`` is a "NOT FOUND" notice when no
         document was produced, otherwise an empty string.
     """
     summaries = []
     for entry in self.var_coll.get_variant_data_multi(rsid):
         # File lookups use a zero-padded, two-digit chromosome.
         chrom = "%.2d" % int(entry["chromosome"])
         path = self.filepaths_coll.get_filepath(entry["assaytype"], chrom)
         raw = self.var_coll.get_raw_variant_values(path, chrom,
                                                    entry['position'])
         record = VCFrecord(raw)
         _, genotypes = record.get_prfx_sfx()
         prob_index = record.get_probidx()
         stats = self.var_coll.get_genotype_probs(genotypes, threshold,
                                                  prob_index)
         # Only the count dict, both allele frequencies and the HWE p-value
         # are used; the remaining tuple members are ignored.
         counts, _, _, _, freq_a, freq_b, hwe_p = stats
         entry['selected'] = 1
         entry['a_af'] = freq_a
         entry['b_af'] = freq_b
         entry['hwe_p'] = hwe_p
         entry['Missing'] = counts['Missing'] if 'Missing' in counts else 0
         summaries.append(entry)

     notice = ""
     if not summaries:
         notice = "Variant NOT FOUND - %s, " % (rsid)
     return (summaries, notice)
Example #3
0
  def process_variant_detail_vcf(self, record, assaytype):
    """Build a variant-detail document from a VCF record and buffer it.

    The document is a JSON-style dict holding the assay type, variant id,
    chromosome, both alleles, position and the INFO values described
    below; it is appended to ``self.variantbuff``.

    Args:
      record: raw record text handed to ``VCFrecord``.
      assaytype: assay type name stored under ``"assaytype"``.
    """
    vcfr = VCFrecord(record)
    alleleA, alleleB = vcfr.get_alleles()

    doc = {}
    doc["assaytype"] = assaytype
    doc["rsid"] = vcfr.get_varid()
    # always store chromosome as a 2-digit string
    doc["chromosome"] = "%.2d" % (int(vcfr.get_chr()))
    doc["alleleA"] = alleleA
    doc["alleleB"] = alleleB
    doc["position"] = vcfr.get_posn_as_int()

    try:
      doc["ref_maf"] = float(vcfr.get_info_value("RefPanelAF"))
    except Exception:
      # Best effort: "ref_maf" is simply left out when RefPanelAF is missing
      # or not numeric. Narrowed from a bare except so that
      # KeyboardInterrupt / SystemExit are no longer swallowed.
      pass

    try:
      doc["info"] = float(vcfr.get_info_value("INFO"))
    except Exception:
      # No usable INFO score: fall back to 1.0.
      doc["info"] = 1.0

    self.variantbuff.append(doc)
Example #4
0
def main(options):
    """Load the sample columns of a VCF header read from stdin into GoDb.

    Lines are read until the column-header line (starts with ``#`` but not
    ``##``) is found. Each suffix field of that line is passed to
    ``godb.process_sample_detail`` together with its index and
    ``options.assaytype``; the sample buffer is flushed whenever it grows
    beyond ``flush_at`` (a name defined outside this function) and once
    more at the end.

    If ``GoDb()`` cannot be constructed, the exception type is printed and
    the process exits.

    Returns:
        Number of sample fields processed.
    """
    try:
        godb = GoDb()
    except Exception:
        # "except Exception" (not a bare except) lets Ctrl-C / SystemExit through.
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit()

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##') or not line.startswith('#'):
            # Meta lines and anything before the header are ignored.
            continue

        vcfr = VCFrecord(line)
        prf, sfx = vcfr.get_prfx_sfx()
        for idx, field in enumerate(sfx):
            count += 1
            godb.process_sample_detail(field, idx, options.assaytype)
            if godb.get_samples_len() > flush_at:
                godb.flush_sample_buff()
        # Only the header is needed; the rest of the stream is not read.
        break

    godb.flush_sample_buff()
    print("")
    return count
Example #5
0
    def get_combined_array(self,
                           buffer_list,
                           cr_list,
                           assay_list,
                           threshold=0.9):
        """Merge the per-assay genotype fields of one variant into one row.

        For every non-empty record in ``buffer_list`` (index ``i`` pairs it
        with ``assay_list[i]`` and ``self.file_positions[i]``), each
        genotype field that is not "." is:

        1) mapped to its slot via ``self.file_positions[i][j]`` and
           ``self.combined_positions``;
        2) called with ``self.call_geno_for_threshold``;
        3) tagged with the assay abbreviation when the record has no "AT"
           format field;
        4) resolved against an already occupied slot with
           ``self.call_genotype`` (``self.geno_overlap_count`` is bumped).

        ``cr_list`` is accepted but not used here.

        Returns:
            List with one entry per combined position; untouched slots
            hold ".".
        """
        merged = ["."] * len(self.combined_positions)

        for assay_idx, raw_record in enumerate(buffer_list):
            if len(raw_record) == 0:
                continue

            record = VCFrecord(raw_record)
            _, genotypes = record.get_prfx_sfx()
            prob_index = record.get_probidx()
            # The variant id is only of interest for debug output.
            record.get_varid()
            has_at = record.has_fmt("AT")

            for col, cell in enumerate(genotypes):
                if cell == ".":
                    continue
                slot = self.combined_positions[
                    self.file_positions[assay_idx][col]]
                call = self.call_geno_for_threshold(cell, prob_index,
                                                    threshold)
                # Equality test against False is deliberate and unchanged.
                if has_at == False:
                    abbrev = self.assay_abbrev[assay_list[assay_idx]]
                    call = call + ":" + abbrev
                if merged[slot] != ".":
                    # Slot already filled by an earlier assay: resolve.
                    self.geno_overlap_count += 1
                    call = self.call_genotype(merged[slot], call, prob_index)
                merged[slot] = call

        return merged
Example #6
0
def main():
  """Print the suffix fields of the VCF column-header line, one per line.

  Reads stdin until the first line that starts with ``#`` but not ``##``,
  prints every suffix field returned by ``VCFrecord.get_prfx_sfx`` for it
  and stops reading.

  Returns:
    Always 0.
  """
  for raw in sys.stdin:
    text = raw.strip()
    if text.startswith('#') and not text.startswith('##'):
      header = VCFrecord(text)
      _, samples = header.get_prfx_sfx()
      for name in samples:
        print(name)
      # Header handled; nothing else in the stream is needed.
      break

  return 0
Example #7
0
    def get_geno_data(self, rsid, sample_id, assaytype_list_posns):
        """Return one sample's genotype field per assay type for a variant.

        Args:
            rsid: variant id passed to ``self.get_variant_data_multi``.
            sample_id: used as the prefix of the result keys.
            assaytype_list_posns: maps an assay type to the index of the
                wanted field within the record's suffix fields.

        Returns:
            Dict keyed by ``"<sample_id>_<assaytype>"`` holding the selected
            field of the raw record, for every variant document whose assay
            type appears in ``assaytype_list_posns``.
        """
        geno_values = {}

        for doc in self.get_variant_data_multi(rsid):
            assaytype = doc["assaytype"]
            if assaytype not in assaytype_list_posns:
                # Not requested: skip before any file path lookup or file
                # read, which were previously done and then thrown away.
                continue

            # always force chromosome to 2 digits
            chromosome = "%.2d" % int(doc["chromosome"])
            fpath = self.filepaths_coll.get_filepath(assaytype, chromosome)
            fullrec = self.get_raw_variant_values(fpath, chromosome,
                                                  doc['position'])

            vcfr = VCFrecord(fullrec)
            prfx, genodata = vcfr.get_prfx_sfx()
            geno_values[sample_id + "_" + assaytype] = genodata[
                assaytype_list_posns[assaytype]]

        return geno_values
def main(options):
    """Print one sample's genotype field for a variant, per assay type.

    Looks up the variant documents for ``options.rsid`` and the sample
    positions for ``options.sampleid`` in GoDb. For every document whose
    assay type has a sample position, the raw record is fetched and one CSV
    line is printed:

        rsid,sampleid,assaytype,<sample position>,<field at that position>

    Returns:
        Always 0.
    """
    godb = GoDb()

    vardocs = godb.get_multiple_variants(options.rsid)
    sampposns = godb.get_sample_posns(options.sampleid)

    for doc in vardocs:
        assaytype = doc["assaytype"]
        if assaytype not in sampposns:
            # Sample is not on this assay type: skip before touching files.
            continue

        filepath = godb.get_filepath(assaytype, doc["chromosome"])
        rec = godb.get_variant_file_data(filepath, doc["chromosome"],
                                         doc["position"])
        if rec is None:
            # No record available for this variant in the file.
            continue

        prfx, sfx = VCFrecord(rec).get_prfx_sfx()
        posn = sampposns[assaytype]
        print("%s,%s,%s,%d,%s" % (options.rsid, options.sampleid, assaytype,
                                  posn, sfx[posn]))

    return 0
Example #9
0
  def get_next_records(self, key_list, prfx_list, recbuff_list):
    """Read the next record from every file that holds the lowest key.

    ``self.get_low_key_list(key_list)`` decides which files advance: every
    index whose entry differs from ``self.empty_key``. For those files the
    next non-blank line is read and the matching slots of the three lists
    are replaced in place:

      * record read: prefix fields, suffix fields, ``int(prefix[1])`` as
        the new key, and ``self.rec_counts[i]`` is incremented;
      * end of file: empty prefix, empty suffix, ``self.high_key``.

    Returns:
      The same ``key_list``, ``prfx_list`` and ``recbuff_list`` objects.
    """
    low_key_list, _low_key_count = self.get_low_key_list(key_list)

    for i, fh in enumerate(self.fh_list):
      if low_key_list[i] == self.empty_key:
        continue

      line = fh.readline()
      # readline() returns "" only at EOF. Skip whitespace-only lines so a
      # stray blank line is not mistaken for the end of the file.
      while line and not line.strip():
        line = fh.readline()
      line = line.strip()

      if line:
        self.rec_counts[i] += 1
        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()
        # A get_maf_and_cr() call used to sit here; it referenced an
        # undefined name and its results were never used.
        prfx_list[i] = prfx
        recbuff_list[i] = sfx
        key_list[i] = int(prfx[1])
      else:
        # EOF: park this file behind every real key.
        prfx_list[i] = []
        recbuff_list[i] = []
        key_list[i] = self.high_key

    return key_list, prfx_list, recbuff_list
Example #10
0
def main(options):
  """Merge the records of the requested variants across assay types.

  Steps:
    0. Read the variant ids from ``options.snpfile`` (via
       ``load_snpfile_data``) or from the comma-separated ``options.rsids``.
    1. Look every id up in GoDb and keep the documents whose assay type is
       enabled in ``included_assaytypes``. The order in which assay types
       are first seen fixes the list order used by every later step.
    2. Fetch the sample ids per assay type and build a ``Multibuffermerge``.
    3. Get the combined column names.
    4. Per variant and assay type, read the raw record (path optionally
       built with ``options.prfx``) and keep it, with its call rate, when
       the record's variant id matches the requested id.
    5. Print the header line and one merged record per variant, optionally
       gated by a concordance check (``options.check == 'Y'``, using
       ``options.chipval``).

  The process exits if the variant ids cannot be read.

  Returns:
    Number of merged records printed.
  """
  # Only assay types listed here take part in the merge.
  included_assaytypes = {"broad":1}
  godb = GoDb()

  try:
    if options.snpfile is not None:
      # Context manager so the file is closed once the ids are loaded.
      with open(options.snpfile, "r") as fh:
        rsids = load_snpfile_data(fh)
    else:
      rsids = options.rsids.split(",")
  except IOError as e:
    print("I/O error({0}): {1}".format(e.errno, e.strerror))
    sys.exit()
  except (TypeError, AttributeError) as e:
    # A missing options.rsids is None, and None.split() raises
    # AttributeError (not TypeError), so both are reported here.
    print("Missing arguments  %s" % (e,))
    sys.exit()
  except Exception:
    logging.info("Unexpected error: %s", str(sys.exc_info()))
    sys.exit()

  # Step 0 - instantiate helper objects
  mafh = Mafhelper()
  hdr_pref = ["#CHROM",  "POS", "ID",  "REF", "ALT", "QUAL",  "FILTER",  "INFO",  "FORMAT"]

  # Step 1 - get the list of entries for each rsid - one per assaytype
  # This establishes a list order which must be observed throughout.
  atype_list = []
  rsid_assaytypes = {}
  for rsid in rsids:
    docs = godb.get_multiple_variants(rsid)
    if docs.count() > 0:
      rsid_assaytypes[rsid] = []
    else:
      logging.info("RSID %s NOTFOUND", rsid)

    for doc in docs:
      if doc["assaytype"] not in included_assaytypes:
        continue
      if doc["assaytype"] not in atype_list:
        atype_list.append(doc["assaytype"])
      rsid_assaytypes[rsid].append(doc)
  logging.info(str(atype_list))

  # Step 2 - collect lists of prochis (sample ids) by assaytype
  atype_posns = {}
  prochi_list = []
  for i, atype in enumerate(atype_list):
    atype_posns[atype] = i
    prochi_list.append(godb.get_samples(atype))

  mm = Multibuffermerge(prochi_list)

  # Step 3 - get combined col_header positions
  # The returned positions are not used here; the call is kept in case it
  # prepares state inside mm for the merge below - TODO confirm.
  mm.get_combined_positions()
  # combocol is a list [colname1, colname2, ...]; its order is kept intact
  combocol = mm.get_combined_columns()

  # Step 4 - for each variant by rsid, one slot per assay type
  slot_count = len(atype_list)
  rsid_dict = {}
  rsid_cr_dict = {}
  for rsid in rsid_assaytypes:
    # Distinct placeholder lists per slot (no shared inner list).
    rsid_dict[rsid] = [[] for _ in range(slot_count)]
    rsid_cr_dict[rsid] = [[] for _ in range(slot_count)]

    for doc in rsid_assaytypes[rsid]:
      if options.prfx is not None:
        fpath = godb.get_full_filepath(doc["assaytype"], doc["chromosome"], options.prfx)
      else:
        fpath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
      logging.info("Assaytype=%s fpath=%s", doc["assaytype"], fpath)

      result = godb.get_variant_file_data(fpath, doc["chromosome"], doc["position"])
      if result is None:
        continue
      vcfr = VCFrecord(result)
      if vcfr.get_varid() != rsid:
        # The record at that position belongs to another variant.
        continue

      maf, ma, cr = mafh.get_maf_and_cr(vcfr)
      # TODO - ALSO check maf, also apply QC filter at individual record level
      posn = atype_posns[doc["assaytype"]]
      rsid_cr_dict[doc["rsid"]][posn] = cr
      rsid_dict[doc["rsid"]][posn] = result
      logging.info("%s (%s), maf=%s, ma=%s, cr=%s", doc["rsid"], doc["assaytype"], maf, ma, cr)

  # Step 5 - execute the merge process
  print("\t".join(hdr_pref + combocol))
  count = 0
  concordant = True
  for rsid in rsid_dict:
    records = rsid_dict[rsid]
    # "not records" covers the case where no included assay type was found
    # at all (previously an IndexError on records[0]).
    # NOTE(review): only the first assay type's slot is inspected, so a
    # variant present only on a later assay type is skipped - confirm intent.
    if not records or len(records[0]) == 0:
      continue

    if options.check == 'Y':
      concordant = mm.check_concordancies(records, atype_list, options.chipval)

    # Explicit comparison kept: only a result equal to True passes.
    if concordant != True:
      logging.info("Concordancy check fail for - %s", rsid)
      continue

    comborec = mm.get_combined_array(records, rsid_cr_dict[rsid], atype_list)
    vcfr = VCFrecord(records[0])
    prfx, sfx = vcfr.get_prfx_sfx()
    if len(prfx) > 0:
      logging.info("PRFX = %s, for %s", str(prfx), rsid)
      # Index 8 is the FORMAT column (see hdr_pref).
      prfx[8] += ":AT"
      print("\t".join(prfx + comborec))
      count += 1
    else:
      logging.info("RSID %s NOTFOUND (2)", rsid)

  chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
  logging.info("CHI test count = %d, Allele discord count = %d, Overlap check count = %d",
    chi_test_count, allele_disc_count, overlap_count)

  return count
Example #11
0
    def get_rslist_data(self, input_rslist, threshold, download_list):
        """Build the SNP summary CSV and the sample data for a list of ids.

        For every id in ``input_rslist`` the variant documents are fetched
        (one per assay type). For each document the raw record is read,
        genotype statistics are obtained from
        ``var_coll.get_genotype_probs`` and one CSV row is produced.

        Args:
            input_rslist: iterable of variant ids.
            threshold: passed to ``get_genotype_probs`` and
                ``get_sample_values``.
            download_list: accepted but not used by this method.

        Returns:
            (pdata, snpdata, msg): ``pdata`` is the result of
            ``self.get_sample_values``; ``snpdata`` is the CSV text
            including its header line; ``msg`` is always None.
        """
        msg = None
        # Rows are collected and joined once instead of growing a string.
        snp_rows = [
            "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"
        ]

        data = []
        assaytypelist = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}
        impDict = {}
        data_count = 0

        for rsid in input_rslist:
            docs = self.var_coll.get_variant_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                # always force chromosome to 2 digits
                chromosome = "%.2d" % int(doc["chromosome"])
                fpath = self.filepaths_coll.get_filepath(
                    doc["assaytype"], chromosome)
                fullrec = self.var_coll.get_raw_variant_values(
                    fpath, chromosome, doc['position'])

                vcfr = VCFrecord(fullrec)
                prfx, sfx = vcfr.get_prfx_sfx()
                probidx = vcfr.get_probidx()
                (gc_count_dict, sample_count, geno_count, maf, alleleAf,
                 alleleBf, p_hwe) = self.var_coll.get_genotype_probs(
                     sfx, threshold, probidx)

                key = doc["rsid"] + "_" + doc["assaytype"]
                Afreq[key] = alleleAf
                Bfreq[key] = alleleBf
                data_count += 1
                hwep = float(p_hwe)

                assaytypelist.append(doc["assaytype"])
                data.append(vcfr)
                if doc["assaytype"] not in assaytypes:
                    assaytypes[doc["assaytype"]] = 1

                # "info" is optional; 1.0 means "not imputed" below and
                # avoids a KeyError when formatting the row.
                info = doc.get("info", 1.0)
                imputed = 1 if ("imputed" in doc or info != 1.0) else 0
                impDict[key] = imputed

                # Guard against a record without any samples.
                if sample_count:
                    call_rate = float(geno_count) / sample_count
                else:
                    call_rate = 0.0

                snp_rows.append(
                    "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" % (
                        doc["rsid"], doc["assaytype"], doc["chromosome"],
                        doc["position"], doc["alleleA"],
                        alleleAf, doc["alleleB"], alleleBf, maf, imputed,
                        call_rate, hwep, info))

        snpdata = "".join(snp_rows)
        pdata = self.get_sample_values(assaytypelist, data, data_count, rslist,
                                       impDict, assaytypes, Afreq, Bfreq,
                                       threshold)
        return (pdata, snpdata, msg)
Example #12
0
def main(options):
    """Convert a VCF stream on stdin into a sample-by-variant CSV of calls.

    Meta lines (``##``) are skipped. The column-header line (``#``) defines
    the sample columns. For every other line the variant id is added to the
    CSV header, and each sample gets the call looked up in ``icalls`` (a
    table defined outside this function) from the first ``:``-separated
    part of its genotype field, or an empty string when the field is ".".

    Output: a header row ``id,<variant ids...>`` followed by one row per
    sample, in the iteration order of the sample dict.

    Returns:
        Number of sample rows printed.
    """
    hdrData = ["id"]
    sampleDict = {}
    colPosns = {}
    count = 0

    mafh = Mafhelper()

    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue

        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()

        if line.startswith('#'):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        # Flipping calls to the minor allele is currently switched off;
        # maf/ma are still computed so it can be re-enabled with
        # "flip = (ma == ref)".
        flip = False
        varid = vcfr.get_varid_ukb()
        ref, alt = vcfr.get_alleles()
        probidx = vcfr.get_probidx()
        (homref_count, het_count, homalt_count, nc_count,
         miss_count) = vcfr.get_allele_counts()
        maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count,
                           alt, nc_count)

        hdrData.append(varid)
        for i, str_geno in enumerate(sfx):
            if str_geno == ".":
                sampleDict[colPosns[i]].append("")
                continue

            geno = str_geno.split(":")
            # Result is not used for the output; call kept as before.
            max_prob, max_idx = get_max_prob(geno, probidx)
            i_call = icalls[geno[0]]
            if flip:
                # Swap the two homozygous calls. The "0" -> "2" branch was
                # previously a comparison (==) and changed nothing.
                if i_call == "0":
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            sampleDict[colPosns[i]].append(str(i_call))

    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count