def load(db, query=None):
    """Open the on-disk bcolz carrays needed to evaluate *query*.

    Parameters
    ----------
    db : str
        Path to the gemini sqlite database.
    query : str or None
        Only gt_* columns and sample names that appear in this string are
        opened.  ``None`` (the default) loads every column and sample;
        previously the default raised ``TypeError`` from ``x in None``.

    Returns
    -------
    dict
        Maps genotype-column name -> list of opened bcolz carrays, with
        ``None`` place-holders for samples the query does not reference.
    """
    t0 = time.time()
    conn = sqlite3.connect(db)
    cur = conn.cursor()
    gt_cols = get_gt_cols(cur)
    samples = get_samples(cur)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        # skip columns the query never mentions; query=None means "all".
        if query is not None and gtc not in query:
            continue
        carrays[gtc] = []
        for s in samples:
            if query is not None and s not in query \
                    and fix_sample_name(s) not in query:
                # need to add anyway as place-holder so list indices stay
                # aligned with sample indices.
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1
            # NOTE(review): when the rootdir is missing nothing is appended,
            # which shifts the indices of later samples -- confirm callers
            # tolerate this before changing it.
    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n"
                         % (time.time() - t0, n))
    return carrays
def load(db, query=None):
    """Open the on-disk bcolz carrays needed to evaluate *query*.

    Same as the sqlite3-based loader, but obtains schema information
    through the project's ``database`` session/metadata layer.

    Parameters
    ----------
    db : str
        Path to the gemini database.
    query : str or None
        Only gt_* columns and sample names that appear in this string are
        opened.  ``None`` (the default) loads every column and sample;
        previously the default raised ``TypeError`` from ``x in None``.

    Returns
    -------
    dict
        Maps genotype-column name -> list of opened bcolz carrays, with
        ``None`` place-holders for samples the query does not reference.
    """
    # local import to avoid a circular module dependency at load time.
    import database
    t0 = time.time()
    conn, metadata = database.get_session_metadata(db)
    gt_cols = get_gt_cols(metadata)
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        # skip columns the query never mentions; query=None means "all".
        if query is not None and gtc not in query:
            continue
        carrays[gtc] = []
        for s in samples:
            if query is not None and s not in query \
                    and fix_sample_name(s) not in query:
                # need to add anyway as place-holder so list indices stay
                # aligned with sample indices.
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1
            # NOTE(review): when the rootdir is missing nothing is appended,
            # which shifts the indices of later samples -- confirm callers
            # tolerate this before changing it.
    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n"
                         % (time.time() - t0, n))
    return carrays
def __init__(self, db, include_gt_cols=False,
             out_format=DefaultRowFormat(None), variant_id_getter=None):
    """Set up a query object bound to an existing gemini database.

    Parameters
    ----------
    db : str
        Path to the gemini database file; must already exist.
    include_gt_cols : bool
        Whether genotype columns should be included in results.
    out_format : row formatter object
        Supplies the output predicate/format for each row.
        NOTE(review): ``DefaultRowFormat(None)`` is evaluated once at
        definition time and shared across calls -- confirm the formatter
        is stateless before relying on the default.
    variant_id_getter : callable or None
        Optional hook for retrieving variant ids.
    """
    assert os.path.exists(db), "%s does not exist." % db

    self.db = db
    self.query_executed = False
    self.for_browser = False
    self.include_gt_cols = include_gt_cols
    self.variant_id_getter = variant_id_getter

    # try to connect to the provided database; must happen before any of
    # the util.* lookups below, which read self.conn / self.c.
    self._connect_to_database()

    # save the gt_cols in the database and don't hard-code them anywhere.
    self.gt_cols = util.get_gt_cols(self.conn)

    # extract the column names from the sample table.
    # needed for gt-filter wildcard support.
    self._collect_sample_table_columns()

    # list of samples ids for each clause in the --gt-filter
    self.sample_info = collections.defaultdict(list)

    # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
    self.sample_to_idx = util.map_samples_to_indices(self.c)
    # and vice versa. e.g., self.idx_to_sample[323] -> NA20814
    self.idx_to_sample = util.map_indices_to_samples(self.c)
    self.idx_to_sample_object = util.map_indices_to_sample_objects(self.c)
    self.sample_to_sample_object = util.map_samples_to_sample_objects(self.c)

    self.formatter = out_format
    self.predicates = [self.formatter.predicate]

    # which sample-list fields are exposed when showing samples.
    self.sample_show_fields = ["variant_samples", "het_samples",
                               "hom_alt_samples"]
def load(db):
    """Open every existing bcolz carray for all genotype columns.

    Returns a dict mapping each gt column name to the list of carrays
    (one per sample directory found on disk under the db's bcolz path).
    """
    started = time.time()
    cursor = sqlite3.connect(db).cursor()
    root = get_bcolz_dir(db)
    sample_names = get_samples(cursor)

    carrays = {}
    for column in get_gt_cols(cursor):
        candidates = ["%s/%s/%s" % (root, name, column)
                      for name in sample_names]
        carrays[column] = [bcolz.open(p, mode="r")
                           for p in candidates if os.path.exists(p)]

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load arrays\n"
                         % (time.time() - started))
    return carrays
def create(db, cols=None):
    """Index the genotype columns of *db* into per-sample bcolz carrays.

    One writable carray is created per (gt column, sample) pair under the
    db's bcolz directory; variant rows are buffered and flushed to disk in
    chunks.  On any error the partially written rootdirs are removed and
    the exception re-raised.

    Parameters
    ----------
    db : str
        Path to the gemini sqlite database.
    cols : list of str or None
        Genotype columns to index; ``None`` indexes every column except
        'gts' (which is large and must be requested explicitly).
    """
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        # fixed message typo ("execpt" -> "except").
        sys.stderr.write(
            "indexing all columns except 'gts'; to index that column, "
            "run gemini bcolz_index %s --cols gts\n" % db)

    conn = sqlite3.connect(db)
    cur = conn.cursor()
    gt_cols = [x for x in get_gt_cols(cur) if x in cols]
    samples = get_samples(cur)
    bcpath = get_bcolz_dir(db)
    mkdir(bcpath)

    nv = get_n_variants(cur)
    sys.stderr.write("loading %i variants for %i samples into bcolz\n"
                     % (nv, len(samples)))
    if nv == 0 or len(samples) == 0:
        return

    carrays = {}
    tmps = {}
    try:
        # one empty, writable on-disk carray per (column, sample) pair.
        for gtc in gt_cols:
            carrays[gtc] = []
            tmps[gtc] = []
            dt = dict(gt_cols_types)[gtc]
            for s in samples:
                mkdir("%s/%s" % (bcpath, s))
                carrays[gtc].append(bcolz.carray(np.empty(0, dtype=dt),
                                    expectedlen=nv,
                                    rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                    chunklen=16384 * 8,
                                    mode="w"))
                tmps[gtc].append([])

        t0 = time.time()
        # scale the flush interval by sample count to bound memory use
        # (was a fixed 200000, which grows unbounded with cohort size).
        step = max(100, 2000000 // len(samples))
        del gtc
        empty = [-1] * len(samples)
        for i, row in enumerate(cur.execute("select %s from variants"
                                            % ", ".join(gt_cols))):
            for j, gt_col in enumerate(gt_cols):
                vals = decomp(row[j])
                if vals is None or len(vals) == 0:
                    # empty gt_phred_ll: substitute a row of -1 sentinels so
                    # the per-sample indexing below cannot fail.
                    vals = empty
                for isamp, sample in enumerate(samples):
                    tmps[gt_col][isamp].append(vals[isamp])
                    # flush the buffered values every `step` rows, and at
                    # the final variant so nothing is left in memory.
                    if (i > 0 and i % step == 0) or i == nv - 1:
                        carrays[gt_col][isamp].append(tmps[gt_col][isamp])
                        tmps[gt_col][isamp] = []
                        carrays[gt_col][isamp].flush()

            if i % step == 0 and i > 0:
                sys.stderr.write("at %.1fM (%.0f rows / second)\n"
                                 % (i / 1000000.,
                                    i / float(time.time() - t0)))

        t = float(time.time() - t0)
        sys.stderr.write("loaded %d variants at %.1f / second\n"
                         % (len(carrays[gt_col][0]), nv / t))
    except:
        # on error, remove the partially written rootdirs so a later run
        # does not find a corrupt index; then re-raise.
        for k, li in carrays.items():
            for i, ca in enumerate(li):
                if i < 5:
                    sys.stderr.write("removing: %s\n" % ca.rootdir)
                if i == 5:
                    sys.stderr.write(
                        "not reporting further removals for %s\n" % k)
                ca.flush()
                shutil.rmtree(ca.rootdir)
        raise
def create(db, cols=None):
    """Index the genotype columns of *db* into per-sample bcolz carrays.

    One writable carray is created per (gt column, sample) pair under the
    db's bcolz directory; variant rows are buffered in python lists and
    flushed to disk in chunks.  On any error the partially written
    rootdirs are removed and the exception re-raised.

    Parameters
    ----------
    db : str
        Path to the gemini sqlite database.
    cols : list of str or None
        Genotype columns to index; ``None`` indexes every column except
        'gts' (which must be requested explicitly).
    """
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        print >> sys.stderr, (
            "indexing all columns except 'gts'; to index that column, "
            "run gemini bcolz_index %s --cols gts" % db)

    conn = sqlite3.connect(db)
    cur = conn.cursor()
    gt_cols = [x for x in get_gt_cols(cur) if x in cols]
    samples = get_samples(cur)
    bcpath = get_bcolz_dir(db)
    mkdir(bcpath)

    nv = get_n_variants(cur)
    sys.stderr.write("loading %i variants for %i samples into bcolz\n"
                     % (nv, len(samples)))
    if nv == 0 or len(samples) == 0:
        return

    carrays = {}
    tmps = {}
    try:
        # one empty, writable on-disk carray per (column, sample) pair.
        for gtc in gt_cols:
            carrays[gtc] = []
            tmps[gtc] = []
            dt = dict(gt_cols_types)[gtc]
            for s in samples:
                mkdir("%s/%s" % (bcpath, s))
                carrays[gtc].append(
                    bcolz.carray(np.empty(0, dtype=dt),
                                 expectedlen=nv,
                                 rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                 chunklen=16384 * 8,
                                 mode="w"))
                tmps[gtc].append([])

        t0 = time.time()
        # scale step by number of samples to limit memory use.
        step = max(100, 2000000 / len(samples))
        sys.stderr.write("step-size: %i\n" % step)
        del gtc
        empty = [-1] * len(samples)
        for i, row in enumerate(
                cur.execute("select %s from variants" % ", ".join(gt_cols))):
            for j, gt_col in enumerate(gt_cols):
                vals = decomp(row[j])
                if vals is None or len(vals) == 0:
                    # empty gt_phred_ll: substitute -1 sentinels so the
                    # per-sample indexing below cannot fail.
                    vals = empty
                for isamp, sample in enumerate(samples):
                    tmps[gt_col][isamp].append(vals[isamp])
                    # flush the buffer every `step` rows and at the final
                    # variant so nothing is left unwritten.
                    if (i > 0 and i % step == 0) or i == nv - 1:
                        carrays[gt_col][isamp].append(tmps[gt_col][isamp])
                        tmps[gt_col][isamp] = []
                        carrays[gt_col][isamp].flush()

            if i % step == 0 and i > 0:
                print >> sys.stderr, "at %.1fM (%.0f rows / second)" % (
                    i / 1000000., i / float(time.time() - t0))

        t = float(time.time() - t0)
        print >> sys.stderr, "loaded %d variants at %.1f / second" % (len(
            carrays[gt_col][0]), nv / t)
    except:
        # on error, we remove the dirs so we can't have weird problems
        # from a partially written index; the exception is re-raised.
        for k, li in carrays.items():
            for i, ca in enumerate(li):
                # report only the first few removals per column.
                if i < 5:
                    print >> sys.stderr, "removing:", ca.rootdir
                if i == 5:
                    print >> sys.stderr, "not reporting further removals for %s" % k
                ca.flush()
                shutil.rmtree(ca.rootdir)
        raise