Python GoDb Examples, godb.GoDb Python Examples

Example #1

0

Show file

File: correct_and_flip.py Project: PhilAppleby/GoDb

def main(options):
  """Rewrite a genotype CSV with plain rs-number headers, flipping columns where required.

  The CSV named by options.csvfile is read and the transformed CSV is printed
  to stdout. Header cells are handled as follows:
    * cells containing "ID" are left untouched;
    * cells starting with "rs" are split on "_" into rs number and allele, and
      the rs number is looked up with GoDb.get_one_variant;
    * every other cell is split on ":" and resolved to an rs number through
      the tabix-indexed dbSNP file named by options.dbsnpfile.
  A column is marked for flipping when its header allele equals the stored
  alleleA (GoDb lookup) or the dbSNP reference allele. Every value in a marked
  column is replaced by flip_geno(int(value)).

  The process exits with status 1 when the inputs cannot be opened or loaded.

  Args:
    options: object with csvfile, snpsummary and dbsnpfile attributes.

  Returns:
    (count, hdrlen): number of data rows written and number of header columns.
    (0, 0) is returned for an empty CSV file.
  """
  csvfile = None
  try:
    try:
      csvfile = open(options.csvfile, "r")
      csvreader = csv.reader(csvfile)
      # The loaded summary is not used below; the call is kept so that a
      # missing or unreadable summary file is still reported as an error.
      with open(options.snpsummary, "r") as fh:
        load_snp_summary(fh)
      dbsnpfile = Dbsnpfile()
      dbsnpfile.set_tabix_file(options.dbsnpfile)
      godb = GoDb()
    except IOError as e:
      logging.fatal("I/O error({0}): {1}".format(e.errno, e.strerror))
      sys.exit(1)
    except TypeError as e:
      # Typically raised by open(None) when an option was not supplied.
      logging.fatal("Missing arguments %s", e)
      sys.exit(1)
    except Exception:
      logging.fatal("Unexpected error: %s", str(sys.exc_info()))
      sys.exit(1)

    hdr = next(csvreader, None)
    if hdr is None:
      # Empty input: nothing to rewrite.
      return 0, 0
    hdrlen = len(hdr)

    # Column indexes whose genotypes must be flipped.
    flip_columns = set()
    for i, varid in enumerate(hdr):
      if "ID" in varid:
        continue
      if varid.startswith("rs"):
        # "<rsNumber>_<allele>": compare the allele with the stored alleleA
        # and replace the header cell with the raw rs number.
        var = varid.split("_")
        vardata = godb.get_one_variant(var[0])
        hdr[i] = var[0]
        # A header without an "_allele" suffix cannot request a flip.
        if len(var) > 1 and var[1] == vardata["alleleA"]:
          flip_columns.add(i)
      else:
        coldata = varid.split(":")
        posn, allele = get_posn_allele(coldata)
        var, ref = get_dbsnp_rsid(dbsnpfile, coldata[0], posn)
        if allele == ref:
          flip_columns.add(i)
        hdr[i] = var

    print(",".join(hdr))

    count = 0
    for row in csvreader:
      count += 1
      for i, genotype in enumerate(row):
        if i in flip_columns:
          row[i] = flip_geno(int(genotype))
      print(",".join(row))
    return count, hdrlen
  finally:
    if csvfile is not None:
      csvfile.close()

Example #2

0

Show file

File: models.py Project: PhilAppleby/GoDb

 def __init__(self):
     """Create the GoDb handle and the collection helpers built on top of it."""
     db = GoDb()
     self.godb = db

     filepaths = _filepaths(db)
     self.filepaths_coll = filepaths

     samples = _samples(db)
     self.sam_coll = samples

     # The variants helper needs both sibling helpers as well as the handle.
     self.var_coll = _variants(filepaths, samples, db)

     # Fixed starting values for the remaining attributes.
     self.variant_totals = []
     self.sample_count = -1
     self.call_rates = {}
     self.maxrslist = 200

Example #3

0

Show file

File: load_variants_from_info.py Project: PhilAppleby/GoDb

def main(options):
    """Load marker detail records from stdin into GoDb.

    Lines starting with '#' are ignored. A line starting with 'alternate' is
    taken as the header row; it is split on whitespace and echoed to stdout.
    Every other line is split on whitespace and handed to
    GoDb.process_marker_detail together with the header and
    options.assaytype. Buffered markers are flushed whenever the buffer
    length reaches the module-level flush_at, and once more at the end.

    Records whose field count differs from the header, and records that
    appear before any header row has been seen, are reported on stdout and
    skipped.

    The process exits with status 1 when the GoDb handle cannot be created.

    Returns:
        Number of records passed to GoDb.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit(1)

    hdr = []
    hdrlen = 0
    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('#'):
            continue
        if line.startswith('alternate'):
            hdr = line.split()
            hdrlen = len(hdr)
            print(hdr)
            continue

        data = line.split()
        # Without a header the record cannot be interpreted, even when the
        # (empty) field counts happen to match.
        if not hdr or len(data) != hdrlen:
            print("UNABLE TO PARSE DETAIL: %s" % (data,))
            continue

        godb.process_marker_detail(hdr, data, options.assaytype)
        count += 1
        if godb.get_markers_len() >= flush_at:
            godb.flush_marker_data()
            print("%s %s %s %s" % (".", time.time() - start_time, "seconds",
                                   count))

    godb.flush_marker_data()
    print("")
    return count

Example #4

0

Show file

File: load_samples.py Project: PhilAppleby/GoDb

def main(options):
    """Load sample ids from the VCF column-header line on stdin into GoDb.

    Lines starting with '##' are skipped, as are lines that do not start with
    '#'. The first line starting with a single '#' is wrapped in a VCFrecord;
    each field of its suffix part is passed to GoDb.process_sample_detail
    with its index and options.assaytype. Reading stops after that line. The
    sample buffer is flushed whenever its length exceeds the module-level
    flush_at, and once more at the end.

    The process exits with status 1 when the GoDb handle cannot be created.

    Returns:
        Number of sample fields processed.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit(1)

    count = 0
    for line in sys.stdin:
        line = line.strip()
        # Only the single-'#' column-header line is of interest.
        if line.startswith('##') or not line.startswith('#'):
            continue

        vcfr = VCFrecord(line)
        _, sfx = vcfr.get_prfx_sfx()
        for idx, field in enumerate(sfx):
            count += 1
            godb.process_sample_detail(field, idx, options.assaytype)
            if godb.get_samples_len() > flush_at:
                godb.flush_sample_buff()
        break

    godb.flush_sample_buff()
    print("")
    return count

Example #5

0

Show file

def main(options):
    """Register the '.vcf.gz' files of one directory with GoDb.

    The directory is "/<options.prfx>/<options.sfx>/". The names of all
    entries ending in '.vcf.gz' are passed to GoDb.add_filepath_detail
    together with options.assaytype, options.prfx and options.sfx.

    The process exits with status 1 when the GoDb handle cannot be created.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit(1)

    dirname = "/" + options.prfx + "/" + options.sfx + "/"
    filelist = [
        filename for filename in os.listdir(dirname)
        if filename.endswith('.vcf.gz')
    ]

    godb.add_filepath_detail(options.assaytype, options.prfx, options.sfx,
                             filelist)

Example #6

0

Show file

File: load_gene_map.py Project: PhilAppleby/GoDb

def main():
    """Load gene map records from stdin into GoDb.

    Lines starting with '#' are ignored; every other stripped line is passed
    to GoDb.process_genemap_detail. The gene map buffer is flushed whenever
    its length reaches the module-level flush_at (a progress line is printed
    each time), and once more at the end.

    The process exits with status 1 when the GoDb handle cannot be created.

    Returns:
        Number of lines passed to GoDb.
    """
    try:
        godb = GoDb()
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit(1)

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('#'):
            continue
        # NOTE(review): blank lines are also forwarded to GoDb - confirm
        # that process_genemap_detail tolerates an empty string.
        godb.process_genemap_detail(line)
        count += 1
        if godb.get_genemap_len() >= flush_at:
            godb.flush_genemap_buff()
            print("%s %s %s %s" % (".", time.time() - start_time, "seconds",
                                   count))

    godb.flush_genemap_buff()
    print("")
    return count

Example #7

0

Show file

File: get_genos_for_sample_variant.py Project: PhilAppleby/GoDb

def main(options):
    """Print the stored genotype field of one sample for one variant, per assay type.

    For every variant document returned for options.rsid, the matching VCF
    record is fetched from the file registered for the document's assay type
    and chromosome. When options.sampleid has a position on that assay type,
    one line "rsid,sampleid,assaytype,position,field" is printed, where field
    is the entry at that position in the suffix part of the VCF record.

    Returns:
        Number of lines printed.
    """
    godb = GoDb()
    count = 0

    vardocs = godb.get_multiple_variants(options.rsid)
    sampposns = godb.get_sample_posns(options.sampleid)

    for doc in vardocs:
        assaytype = doc["assaytype"]
        # Nothing can be printed for a platform the sample is not on, so
        # skip the file access altogether.
        if assaytype not in sampposns:
            continue

        filepath = godb.get_filepath(assaytype, doc["chromosome"])
        rec = godb.get_variant_file_data(filepath, doc["chromosome"],
                                         doc["position"])
        if rec is None:
            # No record at that position in the file.
            continue

        _, sfx = VCFrecord(rec).get_prfx_sfx()
        posn = sampposns[assaytype]
        print("%s,%s,%s,%d,%s" % (options.rsid, options.sampleid, assaytype,
                                  posn, sfx[posn]))
        count += 1

    return count

Example #8

0

Show file

def main(options):
  """Merge the VCF records of a list of rsids across assay types and print them.

  The rsids come from the file named by options.snpfile when it is set,
  otherwise from the comma-separated options.rsids. Processing steps:
    1. fetch the variant documents of each rsid, keeping those whose assay
       type is in included_assaytypes; the order in which assay types are
       first seen fixes the list order used everywhere below;
    2. fetch the sample id list of every assay type and hand them to
       Multibuffermerge;
    3. obtain the combined column names;
    4. read the VCF record of every kept document and store it, with its
       call rate, at the position of its assay type;
    5. print a tab-separated header followed by one combined record per
       rsid that has a record for the first assay type and (when
       options.check is 'Y') passes the concordance check.

  The process exits with status 1 when the rsid list cannot be loaded.

  Returns:
    Number of combined records printed.
  """
  # Only documents of these assay types take part in the merge.
  included_assaytypes = {"broad":1}
  rsids = []
  godb = GoDb()

  try:
    if options.snpfile is not None:
      with open(options.snpfile, "r") as fh:
        # Materialise inside the with-block so the file can be closed.
        rsids = list(load_snpfile_data(fh))
    else:
      rsids = options.rsids.split(",")
  except IOError as e:
    print("I/O error({0}): {1}".format(e.errno, e.strerror))
    sys.exit(1)
  except TypeError as e:
    print("Missing arguments %s" % (e,))
    sys.exit(1)
  except Exception:
    logging.info("Unexpected error: %s", str(sys.exc_info()))
    sys.exit(1)

  # Step 0 - instantiate helper objects
  mafh = Mafhelper()

  # Data structures
  atype_list = []
  atype_posns = {}
  rsid_assaytypes = {}
  rsid_dict = {}
  rsid_cr_dict = {}
  hdr_pref = ["#CHROM",  "POS", "ID",  "REF", "ALT", "QUAL",  "FILTER",  "INFO",  "FORMAT"]

  # Step 1 - get the list of entries for each rsid - one per assaytype
  for rsid in rsids:
    docs = godb.get_multiple_variants(rsid)
    if docs.count() > 0:
      rsid_assaytypes[rsid] = []
    else:
      logging.info("RSID %s NOTFOUND", rsid)

    # Step 1a - collect assaytypes and marker documents.
    # This establishes a list order which must be observed throughout.
    for doc in docs:
      if doc["assaytype"] not in included_assaytypes:
        continue
      if doc["assaytype"] not in atype_list:
        atype_list.append(doc["assaytype"])
      rsid_assaytypes[rsid].append(doc)
  logging.info(str(atype_list))

  # Step 2 - collect lists of prochis (sample ids) by assaytype
  prochi_list = []
  for i, atype in enumerate(atype_list):
    atype_posns[atype] = i
    prochi_list.append(godb.get_samples(atype))

  mm = Multibuffermerge(prochi_list)

  # Step 3 - get combined column header positions.
  # The positions result is not used here; the call is kept because it may
  # prepare state that get_combined_columns relies on - TODO confirm.
  mm.get_combined_positions()
  # combocol is a list [colname1, colname2, ...]; its order is kept intact.
  combocol = mm.get_combined_columns()

  # Step 4 - for each variant by rsid, one slot per assaytype
  for rsid in rsid_assaytypes:
    if rsid not in rsid_dict:
      # Independent empty slots (no shared list object between positions).
      rsid_dict[rsid] = [[] for _ in atype_list]
      rsid_cr_dict[rsid] = [[] for _ in atype_list]
    for doc in rsid_assaytypes[rsid]:
      if options.prfx is not None:
        fpath = godb.get_full_filepath(doc["assaytype"], doc["chromosome"], options.prfx)
      else:
        fpath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
      logging.info("Assaytype=%s fpath=%s", doc["assaytype"], fpath)

      result = godb.get_variant_file_data(fpath, doc["chromosome"], doc["position"])
      if result is None:
        continue
      vcfr = VCFrecord(result)
      if vcfr.get_varid() != rsid:
        continue
      maf, ma, cr = mafh.get_maf_and_cr(vcfr)
      # TODO - ALSO check maf, also apply QC filter at individual record level
      slot = atype_posns[doc["assaytype"]]
      rsid_cr_dict[doc["rsid"]][slot] = cr
      rsid_dict[doc["rsid"]][slot] = result
      logging.info("%s (%s), maf=%s, ma=%s, cr=%s" % (doc["rsid"], doc["assaytype"], maf, ma, cr))

  # Step 5 - execute the merge process
  print("\t".join(hdr_pref + combocol))
  count = 0
  concordant = True
  for rsid in rsid_dict:
    records = rsid_dict[rsid]
    # No slots at all (no included assay type matched) or no record for the
    # first assay type: nothing to merge for this rsid.
    if not records or len(records[0]) == 0:
      continue

    if options.check == 'Y':
      concordant = mm.check_concordancies(records, atype_list, options.chipval)

    if concordant == True:
      comborec = mm.get_combined_array(records, rsid_cr_dict[rsid], atype_list)
      vcfr = VCFrecord(records[0])
      prfx, sfx = vcfr.get_prfx_sfx()
      if len(prfx) > 0:
        logging.info("PRFX = %s, for %s", str(prfx), rsid)
        # Extend the FORMAT column with the assay-type tag.
        prfx[8] += ":AT"
        outrec = prfx + comborec
        print("\t".join(outrec))
        count += 1
      else:
        logging.info("RSID %s NOTFOUND (2)", rsid)
    else:
      logging.info("Concordancy check fail for - %s" % (rsid))

  chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
  logging.info("CHI test count = %d, Allele discord count = %d, Overlap check count = %d", 
    chi_test_count, allele_disc_count, overlap_count)

  return count

Example #9

0

Show file

File: models.py Project: PhilAppleby/GoDb

class DataStore():
    def __init__(self):
        """Create the GoDb handle and the collection helpers built on top of it."""
        db = GoDb()
        self.godb = db

        filepaths = _filepaths(db)
        self.filepaths_coll = filepaths

        samples = _samples(db)
        self.sam_coll = samples

        # The variants helper needs both sibling helpers as well as the handle.
        self.var_coll = _variants(filepaths, samples, db)

        # Fixed starting values for the remaining attributes.
        self.variant_totals = []
        self.sample_count = -1
        self.call_rates = {}
        # Upper bound on lines accepted by get_variant_data_for_file.
        self.maxrslist = 200

    def get_db_name(self):
        return self.godb.get_dbname()

    def make_selection_key(self, varid, assaytype):
        return varid + "_" + assaytype

    def get_variant_data_for_file(self, filepath, threshold):
        msg = ""
        pattern = re.compile('[\W_]+')

        try:
            f = open(filepath, "r")
        except IOError as e:
            msg = filepath + ":" + e.strerror
            return ([], msg)
        count = 0

        rslist = []
        for line in f:
            count += 1
            if count > self.maxrslist:
                msg = "line count for %s gt the limit (%d)" % (filepath,
                                                               self.maxrslist)
                return ([], msg)
            line = line.strip()
            pattern.sub('', line)
            if line.startswith("rs"):
                elems = line.split()
                rslist.append(elems[0])
            else:
                msg += "Removed bad line at %d, " % (count)

        f.close()
        variant_data = []
        for rsid in rslist:
            (variant_docs,
             tmsg) = self.get_variant_summary_probs(rsid, threshold)
            if (tmsg != ""):
                msg += tmsg
            for doc in variant_docs:
                variant_data.append(doc)
        return variant_data, msg

    def get_variant_data_by_range(self, chromosome, start, end, threshold=0.9):
        rslist = []
        variant_data = []
        msg = ""
        return (self.var_coll.get_variant_data_by_range(
            chromosome, start, end))

    def get_range_data(self, chromosome, start, end, threshold, download_list):
        (docs, msg) = self.var_coll.get_variant_data_by_range(
            chromosome, start, end)
        if len(docs) == 0:
            return ([], [], msg)

        rsdict = {}
        rslist = []
        for doc in docs:
            rsdict[doc["rsid"]] = 1

        for rsid in rsdict:
            rslist.append(rsid)

        return (self.get_rslist_data(rslist, threshold, download_list))

    def build_csv_data(self, rslist, sampleDict, assaytypes, threshold,
                       atpidx):
        # NOTE: maintaining rslist order is vital!
        rsidx = {}
        normalised = False
        idx = 0
        for rsid in rslist:
            rsidx[rsid] = idx
            idx += 1

        assaytypes['combined'] = 1
        by_platform_data = {}
        for assaytype in assaytypes:
            by_platform_data[assaytype] = []
        hdrData = ["sampleId"]
        for rsid in rslist:
            hdrData.append(rsid)
            hdrData.append(rsid + "_c")
            hdrData.append(rsid + "_p")
            hdrData.append(rsid + "_alt")
        hdrString = ','.join(hdrData)

        for assaytype in assaytypes:
            by_platform_data[assaytype].append(hdrString)
        for samp in sampleDict:
            output_lines = {}
            filled_output_lines = {}
            filled_output_lines['combined'] = True
            for assaytype in assaytypes:
                output_lines[assaytype] = ["" for x in range(len(rslist) * 4)]
            for rsid in sampleDict[samp]:
                if len(
                        sampleDict[samp][rsid]
                ) > 0:  # if a sample wasn't genotyped on any platform there might not be data
                    idxoffset = rsidx[rsid] * 4
                    # resolve_geno is at the crux - need to change to test CR?
                    #logging.info("Call resolve_geno %s, %s", samp, str(sampleDict[samp][rsid]))
                    geno_data = self.resolve_geno(sampleDict[samp][rsid], rsid,
                                                  samp, threshold, atpidx)
                    dataVals = geno_data[2].split(':')
                    #print "build_csv_data:", geno_data
                    probVals = dataVals[atpidx[geno_data[0]]].split(',')
                    (probcall, intcall,
                     outprob) = self.var_coll.get_call(probVals, threshold)
                    output_lines['combined'][idxoffset] = str(intcall)
                    output_lines['combined'][idxoffset + 1] = str(outprob)
                    output_lines['combined'][idxoffset + 2] = geno_data[0]
                    output_lines['combined'][idxoffset + 3] = geno_data[4]
                    for geno_data in sampleDict[samp][rsid]:
                        dataVals = geno_data[2].split(':')
                        probVals = dataVals[atpidx[geno_data[0]]].split(',')
                        (probcall, intcall, outprob) = self.var_coll.get_call(
                            probVals, threshold)
                        output_lines[geno_data[0]][idxoffset] = str(intcall)
                        output_lines[geno_data[0]][idxoffset +
                                                   1] = str(outprob)
                        output_lines[geno_data[0]][idxoffset +
                                                   2] = geno_data[0]
                        output_lines[geno_data[0]][idxoffset +
                                                   3] = geno_data[4]
                        filled_output_lines[geno_data[0]] = True

            for assaytype in assaytypes:
                if assaytype in filled_output_lines:
                    by_platform_data[assaytype].append(
                        samp + "," + ",".join(output_lines[assaytype]))
        return by_platform_data

    def resolve_geno(self, genlist, rsid, samp, threshold, atpidx):
        maxprob = 0.0
        maxidx = -1
        if len(genlist) == 1:
            return genlist[0]
        elif len(genlist) > 1:
            for idx, gendata in enumerate(genlist):
                try:
                    dataVals = gendata[2].split(':')
                    probVals = dataVals[atpidx[gendata[0]]].split(',')
                except IndexError:
                    sys.exit()
                (probcall, intcall,
                 outprob) = self.var_coll.get_call(probVals, threshold)
                if outprob > maxprob:
                    maxprob = outprob
                    maxidx = idx
        if maxprob > 0.0:
            return genlist[maxidx]
        return genlist[0]

    def get_variant_summary_probs(self, rsid, threshold):
        variant_array = []
        msg = ""
        docs = self.var_coll.get_variant_data_multi(rsid)
        for doc in docs:
            # always force chromosome to 2 digits
            chromosome = "%.2d" % int(doc["chromosome"])
            fpath = self.filepaths_coll.get_filepath(doc["assaytype"],
                                                     chromosome)
            fullrec = self.var_coll.get_raw_variant_values(
                fpath, chromosome, doc['position'])
            vcfr = VCFrecord(fullrec)
            prfx, sfx = vcfr.get_prfx_sfx()
            probidx = vcfr.get_probidx()
            (gc_count_dict, sample_count, geno_count, maf, alleleAf, alleleBf,
             p_hwe) = self.var_coll.get_genotype_probs(sfx, threshold, probidx)
            doc['selected'] = 1
            doc['a_af'] = alleleAf
            doc['b_af'] = alleleBf
            doc['hwe_p'] = p_hwe
            doc['Missing'] = 0
            if 'Missing' in gc_count_dict:
                doc['Missing'] = gc_count_dict['Missing']
            variant_array.append(doc)
        if len(variant_array) == 0:
            msg = "Variant NOT FOUND - %s, " % (rsid)
        return (variant_array, msg)

    def get_variant_data(self, variantid, assaytype):
        return self.var_coll.get_variant_data(variantid, assaytype)

    def get_sample(self, sampleid):
        return self.sam_coll.get_sample(sampleid)

    def make_zipfile(self, sample_return_data, snp_return_data, uploadDir,
                     zipfilename):
        """
    moved here from views.py
    """
        ares = {}
        for assaytype in sample_return_data:
            ares[assaytype] = '\n'.join(sample_return_data[assaytype]) + '\n'
        zipname = uploadDir + "/" + zipfilename
        with ZipFile(zipname, 'w') as resZip:
            resZip.writestr('snp_summary.csv', snp_return_data, ZIP_DEFLATED)
            for assaytype in ares:
                resZip.writestr(assaytype + '_samples.csv', ares[assaytype],
                                ZIP_DEFLATED)
        with open(zipname, 'r') as f:
            body = f.read()
        return (body)
        response = make_response(body)
        response.headers[
            "Content-Disposition"] = "attachment; filename=" + zipfilename
        return (response)

    def get_rslist_file_data(self, filepath, threshold, download_list):
        msg = None
        try:
            f = open(filepath, "r")
        except IOError as e:
            msg = filepath + ":" + e.strerror
            return ([], [], msg)
        count = 0

        rslist = []
        for line in f:
            count += 1
            if count > 500:
                msg = "line count for %s gt the limit (%d)" % (filepath, 500)
                return ([], [], msg)
            line = line.strip()
            elems = line.split()
            rslist.append(elems[0])

        f.close()
        logging.info("get_rslist_file_data: %.2f", float(threshold))
        return (self.get_rslist_data(rslist, threshold, download_list))

    def get_rslist_data(self, input_rslist, threshold, download_list):
        msg = None
        snpdata = "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"

        data = []
        assaytypelist = []
        probidxlist = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}

        data_count = 0
        impDict = {}
        for rsid in input_rslist:
            docs = self.var_coll.get_variant_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                # always force chromosome to 2 digits
                chromosome = "%.2d" % int(doc["chromosome"])
                # first get filepath
                fpath = self.filepaths_coll.get_filepath(
                    doc["assaytype"], chromosome)
                # get raw variant data
                fullrec = self.var_coll.get_raw_variant_values(
                    fpath, chromosome, doc['position'])
                geno_count = 0
                sample_count = 0
                hwep = 0.0
                vcfr = VCFrecord(fullrec)
                prfx, sfx = vcfr.get_prfx_sfx()
                probidx = vcfr.get_probidx()
                (gc_count_dict, sample_count, geno_count, maf, alleleAf,
                 alleleBf, p_hwe) = self.var_coll.get_genotype_probs(
                     sfx, threshold, probidx)
                Afreq[doc["rsid"] + "_" + doc["assaytype"]] = alleleAf
                Bfreq[doc["rsid"] + "_" + doc["assaytype"]] = alleleBf
                data_count += 1
                hwep = float(p_hwe)

                assaytypelist.append(doc["assaytype"])
                data.append(vcfr)
                if doc["assaytype"] not in assaytypes:
                    assaytypes[doc["assaytype"]] = 1

                imputed = 0
                if "imputed" in doc:
                    imputed = 1
                if "info" in doc:
                    if doc["info"] != 1.0:
                        imputed = 1
                impDict[doc["rsid"] + "_" + doc["assaytype"]] = imputed
                snpdata += "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" % (
                    doc["rsid"], doc["assaytype"], doc["chromosome"],
                    doc["position"], doc["alleleA"],
                    alleleAf, doc["alleleB"], alleleBf, maf, imputed,
                    float(geno_count) / sample_count, hwep, doc["info"])

        pdata = self.get_sample_values(assaytypelist, data, data_count, rslist,
                                       impDict, assaytypes, Afreq, Bfreq,
                                       threshold)
        return (pdata, snpdata, msg)

    def get_sample_values(self, assaytypelist, records, numrecs, rslist,
                          impDict, assaytypes, Afreq, Bfreq, threshold):
        """Process vcf records
      NOTE: impDict keyed on a composite of rsid_platform 
       """
        samplesByAt = {}
        for assaytype in assaytypes:
            samplesByAt[assaytype] = self.sam_coll.get_samples(assaytype)
        # A dict of dicts of tables
        sampleDict = {}
        dupDict = {}
        dupcount = 0
        for assaytype in samplesByAt:
            for samp in samplesByAt[assaytype]:
                if samp not in sampleDict:
                    sampleDict[samp] = {}
                    for rsid in rslist:
                        sampleDict[samp][rsid] = []

        count = 0
        rscount = 0
        atpidx = {}
        values_totals = 0
        for idx, vcfr in enumerate(records):
            assaytype = assaytypelist[idx]
            chromosome = vcfr.get_chr()
            pos = vcfr.get_posn()
            rsid = vcfr.get_varid()
            alleleA, alleleB = vcfr.get_alleles()
            prfx, sfx = vcfr.get_prfx_sfx()
            atpidx[assaytype] = vcfr.get_probidx()

            if assaytype not in samplesByAt:
                samplesByAt[assaytype] = self.sam_coll.get_samples(assaytype)

            for idx, elem in enumerate(sfx):
                if samplesByAt[assaytype][idx] in sampleDict:
                    sampleId = samplesByAt[assaytype][idx]
                    values_totals += 1
                    sampleDict[sampleId][rsid].append([
                        assaytype, impDict[rsid + "_" + assaytype], sfx[idx],
                        alleleA, alleleB, Afreq[rsid + "_" + assaytype],
                        Bfreq[rsid + "_" + assaytype]
                    ])
            rscount += 1

        by_assaytype_data = self.build_csv_data(rslist, sampleDict, assaytypes,
                                                threshold, atpidx)

        return (by_assaytype_data)