def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None,
                       col_types=None, col_ops=None):
    """Generalized annotation of variants with a new column.

    get_val_fn takes a list of annotations in a region and returns
    the value for that region to update the database with.

    Separates selection and identification of values from update,
    to avoid concurrent database access errors from sqlite, especially
    on NFS systems. The retained to_update list is small, but batching
    could help if memory issues emerge.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file. Update the variant row with T/F if overlaps found.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    last_id = 0
    current_id = 0
    total = 0
    CHUNK_SIZE = 100000
    to_update = []

    select_res = cursor.execution_options(stream_results=True).execute(
        '''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    while True:
        for row in select_res.fetchmany(CHUNK_SIZE):
            # update_data starts out as a list of the values that should
            # be used to populate the new columns for the current row.
            # Prefer no pysam parsing over tuple parsing to work around bug in pysam 0.8.0
            # https://github.com/pysam-developers/pysam/pull/44
            if args.anno_file.endswith(('.vcf', '.vcf.gz')):
                update_data = get_val_fn(annotations_in_vcf(row, anno, None, naming, args.region_only, True))
            else:
                update_data = get_val_fn(annotations_in_region(row, anno, None, naming))
                #update_data = get_val_fn(annotations_in_region(row, anno, "tuple", naming))

            # were there any hits for this row?
            if len(update_data) > 0:
                # we add the primary key to update_data for the
                # where clause in the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

            current_id = row["variant_id"]

        if current_id <= last_id:
            break
        else:
            _update_variants(metadata, to_update, col_names, cursor)

            total += len(to_update)
            print "updated", total, "variants"
            last_id = current_id
        to_update = []
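# A minimal sketch of the get_val_fn contract assumed by _annotate_variants()
# above: it receives the iterable of Tabix hits for one variant's region and
# returns the list of values to write to the new column(s); an empty list means
# no update for that variant. The helper name _count_hits is illustrative and
# not part of the original module.
def _count_hits(hits):
    """Return [number_of_overlaps] for a count-style annotation, or [] if none."""
    hits = list(hits)
    return [len(hits)] if hits else []

# e.g. (hypothetical call):
#   _annotate_variants(args, conn, metadata, _count_hits,
#                      col_names=["my_count"], col_types=["integer"])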
def windower(parser, args):
    check_dependencies("windower", [["bedtools", "--version"]])
    conn, metadata = database.get_session_metadata(args.db)

    pid = os.getpid()
    temp_file = ".".join(['.temp', str(pid)])

    make_windows(conn, args, temp_file)
def load(db, query=None):
    import database

    t0 = time.time()

    conn, metadata = database.get_session_metadata(db)
    gt_cols = get_gt_cols(metadata)
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not gtc in query:
            continue
        carrays[gtc] = []
        for s in samples:
            if not s in query and not fix_sample_name(s) in query:
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        print >>sys.stderr, "it took %.2f seconds to load %d arrays" \
            % (time.time() - t0, n)
    return carrays
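# A small usage sketch (the database path and sample name are illustrative):
# once create() has indexed a database, load() returns a dict keyed by genotype
# column; each value is a per-sample list holding a read-only bcolz carray, or
# None as a place-holder for samples that do not appear in the query.
#
#   carrays = load("study.db", query="gt_types__NA12878 == HET")
#   first_sample_gt_types = carrays["gt_types"][0]   # bcolz carray or None
#   if first_sample_gt_types is not None:
#       print len(first_sample_gt_types)             # one entry per variant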
def pathways(parser, args):
    import database
    conn, metadata = database.get_session_metadata(args.db)

    if (not args.lof):
        get_ind_pathways(conn, metadata, args)
    else:
        get_ind_lof_pathways(conn, metadata, args)
def db_info(parser, args):
    conn, metadata = database.get_session_metadata(args.db)

    # column widths for the output
    out_template = "{0:20}{1:30}{2:10}"

    # header
    print out_template.format("table_name", "column_name", "type")
    for table in ['variants', 'variant_impacts', 'samples',
                  'gene_detailed', 'gene_summary']:
        get_table_info(metadata, table, out_template)
def dump(parser, args):
    conn, metadata = database.get_session_metadata(args.db)

    if args.variants:
        get_variants(conn, metadata, args)
    elif args.genotypes:
        get_genotypes(conn, metadata, args)
    elif args.samples:
        get_samples(conn, metadata, args)
    elif args.tfam:
        tfam(args)
def get_families(db, selected_families=None):
    """
    Query the samples table to return a list of Family
    objects that each contain all of the Subjects in a Family.
    """
    conn, metadata = database.get_session_metadata(db)

    families_dict = Family.from_cursor(conn)

    # if the user has specified a set of selected families
    # to which the analysis should be restricted, then
    # first sanity check that the family ids they specified are valid.
    if selected_families is not None:
        for family in selected_families.split(','):
            if family not in families_dict:
                sys.exit("ERROR: family \"%s\" is not a valid family_id\n" % family)

    families = []
    for fam in families_dict:
        if selected_families is None or fam in selected_families:
            families.append(families_dict[fam])
    return families
def get_families(db, selected_families=None):
    """
    Query the samples table to return a list of Family
    objects that each contain all of the Subjects in a Family.
    """
    conn, metadata = database.get_session_metadata(db)

    families_dict = Family.from_cursor(conn)

    # if the user has specified a set of selected families
    # to which the analysis should be restricted, then
    # first sanity check that the family ids they specified are valid.
    if selected_families is not None:
        for family in selected_families.split(','):
            if family not in families_dict:
                raise ValueError("Family \"%s\" is not a valid family_id\n" % family)

    families = []
    for fam in families_dict:
        if selected_families is None or fam in selected_families:
            families.append(families_dict[fam])
    return families
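# A brief usage sketch for get_families() (the database path and family ids
# are illustrative):
#
#   # all families in the database
#   families = get_families("study.db")
#
#   # restrict the analysis to two families; an unrecognized id is rejected
#   families = get_families("study.db", selected_families="fam1,fam2")
#   for family in families:
#       print len(family.subjects)   # Subjects belonging to this Family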
def __init__(self, args, buffer_size=10000, prepare_db=True):
    self.args = args
    self.seen_multi = False

    # create the gemini database
    # create a reader for the VCF file
    self.vcf_reader = self._get_vcf_reader()
    # load sample information

    expected = "consequence,codons,amino_acids,gene,symbol,feature,exon,polyphen,sift,protein_position,biotype,warning".split(",")
    if self.args.anno_type == "VEP":
        self._effect_fields = self._get_vep_csq(self.vcf_reader)
        # tuples of (db_column, CSQ name)
        self._extra_effect_fields = [("vep_%s" % x.lower(), x) for x in
                                     self._effect_fields if not x.lower() in expected]
    else:
        self._effect_fields = []
        self._extra_effect_fields = []

    if not prepare_db:
        self.c, self.metadata = database.get_session_metadata(args.db)
        return

    self._create_db([x[0] for x in self._extra_effect_fields])
    self._extra_empty = dict((x[0], None) for x in self._extra_effect_fields)

    if not self.args.no_genotypes and not self.args.no_load_genotypes:
        # load the sample info from the VCF file.
        self._prepare_samples()
        # initialize genotype counts for each sample
        self._init_sample_gt_counts()
        self.num_samples = len(self.samples)
    else:
        self.num_samples = 0

    self.clinvar_chrom_gene_lookup = load_clinvar(annotations.get_anno_files(self.args)['clinvar'])
    self.buffer_size = buffer_size
    self._get_anno_version()
def stats(parser, args):
    import database

    conn, metadata = database.get_session_metadata(args.db)

    if args.tstv:
        get_tstv(conn, metadata, args)
    elif args.tstv_coding:
        get_tstv_coding(conn, metadata, args)
    elif args.tstv_noncoding:
        get_tstv_noncoding(conn, metadata, args)
    elif args.snp_counts:
        get_snpcounts(conn, metadata, args)
    elif args.sfs:
        get_sfs(conn, metadata, args)
    elif args.variants_by_sample:
        get_variants_by_sample(conn, metadata, args)
    elif args.genotypes_by_sample:
        get_gtcounts_by_sample(conn, metadata, args)
    elif args.mds:
        get_mds(conn, metadata, args)
    elif args.query:
        summarize_query_by_sample(args)
if __name__ == "__main__":
    db = sys.argv[1]
    #create(sys.argv[1])
    #carrays = load(db)
    conn, metadata = database.get_session_metadata(db)
    if len(sys.argv) > 2:
        q = sys.argv[2]
    else:
        q = "gt_types.1094PC0012 == HET and gt_types.1719PC0016 == HET and gts.1094PC0012 == 'A/C'"

    print filter(db, q, user_dict=dict(HET=1, HOM_REF=0, HOM_ALT=3, UNKNOWN=2))
    print "compare to:", ("""gemini query -q "select variant_id, gts.1719PC0016 from variants" """
                          """ --gt-filter "%s" %s""" % (q, db))
def annotate(parser, args):
    check_dependencies("annotate", [["tabix", "-h"],
                                    ["bgzip", "-h"]])

    def _validate_args(args):
        if (args.col_operations or args.col_types or args.col_extracts):
            raise ValueError('You must not specify a column type (-t), op (-o) '
                             'or extract (-e) when using \"-a boolean\" or \"-a count\".\n')

        col_names = args.col_names.split(',')
        if len(col_names) > 1:
            raise ValueError('You may only specify a single column name (-c) '
                             'when using \"-a boolean\" or \"-a count\".\n')

        if not args.anno_file.endswith(('.vcf', '.vcf.gz')) and args.region_only and parser is not None:
            raise ValueError('You may only specify --region-only when annotation is a VCF.')

        return col_names

    def _validate_extract_args(args):
        if args.anno_file.endswith(('.vcf', '.vcf.gz')):
            if not args.col_names:
                args.col_names = args.col_extracts
            elif not args.col_extracts:
                args.col_extracts = args.col_names
        elif args.region_only and parser is not None:
            raise ValueError('You may only specify --region-only when annotation is a VCF.1')

        if not args.col_types:
            raise ValueError('need to give column types ("-t")\n')
        col_ops = args.col_operations.split(',')
        col_idxs = args.col_extracts.split(',')

        col_names = args.col_names.split(',')
        col_types = args.col_types.split(',')

        supported_types = ['text', 'float', 'integer']
        for col_type in col_types:
            if col_type not in supported_types:
                raise ValueError('Column type [%s] not supported.\n' % (col_type))

        supported_ops = op_funcs.keys()
        for col_op in col_ops:
            if col_op not in supported_ops:
                raise ValueError('Column operation [%s] not supported.\n' % (col_op))

        if not (len(col_ops) == len(col_names) ==
                len(col_types) == len(col_idxs)):
            raise ValueError('The number of column names, numbers, types, and '
                             'operations must match: [%s], [%s], [%s], [%s]\n' %
                             (args.col_names, args.col_extracts, args.col_types, args.col_operations))

        return col_names, col_types, col_ops, col_idxs

    if (args.db is None):
        parser.print_help()
        exit(1)
    if not os.path.exists(args.anno_file):
        sys.stderr.write("Error: cannot find annotation file.")
        exit(1)

    conn, metadata = database.get_session_metadata(args.db)

    if args.anno_type == "boolean":
        col_names = _validate_args(args)
        annotate_variants_bool(args, conn, metadata, col_names)
    elif args.anno_type == "count":
        col_names = _validate_args(args)
        annotate_variants_count(args, conn, metadata, col_names)
    elif args.anno_type == "extract":
        if args.col_extracts is None and not args.anno_file.endswith('.vcf.gz'):
            raise RuntimeError("You must specify which column to "
                               "extract from your annotation file.")
        else:
            col_names, col_types, col_ops, col_idxs = _validate_extract_args(args)
            annotate_variants_extract(args, conn, metadata, col_names, col_types,
                                      col_ops, col_idxs)
    else:
        raise RuntimeError("Unknown column type requested. Exiting.")

    conn.close()

    # index on the newly created columns
    for col_name in col_names:
        with database_transaction(args.db) as c:
            c.execute('''drop index if exists %s''' % (col_name + "idx"))
            c.execute('''create index %s on variants(%s)''' % (col_name + "idx", col_name))
def lofgenequery(parser, args):
    conn, metadata = database.get_session_metadata(args.db)
    samples = sample_variants(conn, metadata, args)
    sample_lof_variants(conn, metadata, args, samples)
def lof_sieve(parser, args):
    import database
    conn, metadata = database.get_session_metadata(args.db)
    get_ind_lof(conn, metadata, args)
def filter(db, query, user_dict):
    # these should be translated to a bunch of or/and statements within gemini
    # so they are supported, but must be translated before getting here.
    if query == "False" or query is None or query is False:
        return []
    if "any(" in query or "all(" in query or \
       ("sum(" in query and not query.startswith("sum(") and query.count("sum(") == 1):
        return None
    user_dict['where'] = np.where

    if query.startswith("not "):
        # "~" is not to numexpr.
        query = "~" + query[4:]
    sum_cmp = False
    if query.startswith("sum("):
        assert query[-1].isdigit()
        query, sum_cmp = query[4:].rsplit(")", 1)
        query = "(%s) %s" % (query, sum_cmp)

    query = query.replace(".", "__")
    query = " & ".join("(%s)" % token for token in query.split(" and "))
    query = " | ".join("(%s)" % token for token in query.split(" or "))

    import database
    conn, metadata = database.get_session_metadata(db)
    samples = get_samples(metadata)
    # convert gt_col[index] to gt_col__sample_name
    patt = "(%s)\[(\d+)\]" % "|".join((g[0] for g in gt_cols_types))

    def subfn(x):
        """Turn gt_types[1] into gt_types__sample"""
        field, idx = x.groups()
        return "%s__%s" % (field, fix_sample_name(samples[int(idx)]))

    query = re.sub(patt, subfn, query)
    if os.environ.get('GEMINI_DEBUG') == 'TRUE':
        print >>sys.stderr, query[:250] + "..."

    carrays = load(db, query=query)

    if len(carrays) == 0 or max(len(carrays[c]) for c in carrays) == 0 or \
            any(not any(carrays[c]) for c in carrays):
        # need this 2nd check above because of the place-holders in load()
        raise NoGTIndexException

    # loop through and create a cache of "$gt__$sample"
    for gt_col in carrays:
        if not gt_col in query:
            continue
        for i, sample_array in enumerate(carrays[gt_col]):
            sample = fix_sample_name(samples[i])
            if not sample in query:
                continue
            user_dict["%s__%s" % (gt_col, sample)] = sample_array

    # had to special-case count. it won't quite be as efficient
    if "|count|" in query:
        tokens = query[2:-2].split("|count|")
        icmp = tokens[-1]
        # a list of carrays, so not in memory.
        res = [bcolz.eval(tok, user_dict=user_dict) for tok in tokens[:-1]]
        # in memory after this, but just a single axis array.
        res = np.sum(res, axis=0)
        res = ne.evaluate('res%s' % icmp)
    else:
        res = bcolz.eval(query, user_dict=user_dict)

    try:
        if res.shape[0] == 1 and len(res.shape) > 1:
            res = res[0]
    except AttributeError:
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #                                       vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []
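# A small, self-contained sketch of the indexed-column rewrite performed inside
# filter() above. The sample names and the reduced column alternation are
# illustrative; the real code derives them from gt_cols_types and the samples
# table of the database.
def _demo_indexed_column_rewrite():
    import re
    samples = ["1094PC0012", "1719PC0016"]
    patt = r"(gt_types|gt_depths)\[(\d+)\]"

    def subfn(m):
        # turn e.g. gt_types[0] into gt_types__1094PC0012
        field, idx = m.groups()
        return "%s__%s" % (field, samples[int(idx)])

    q = "(gt_types[0] == HET) & (gt_depths[1] >= 20)"
    # returns "(gt_types__1094PC0012 == HET) & (gt_depths__1719PC0016 >= 20)"
    return re.sub(patt, subfn, q)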
def tag_somatic_mutations(args):

    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)
    depth_string, qual_string, ssc_string, chrom_string = ("", "", "", "")

    if args.min_depth:
        depth_string = " AND depth >= %s" % args.min_depth
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    if args.min_somatic_score:
        ssc_string = " AND (type='sv' \
                     OR somatic_score >= %s)" % args.min_somatic_score
    if args.chrom:
        chrom_string = " AND chrom = '%s'" % args.chrom

    if args.chrom is None:
        query = "SELECT variant_id, chrom, start, end, \
                 ref, alt, gene, impact, gts, gt_types, \
                 gt_ref_depths, gt_alt_depths \
                 FROM variants \
                 WHERE 1 \
                 %s \
                 %s \
                 %s \
                 %s" % (depth_string, qual_string, ssc_string, chrom_string)

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print '\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene'])

    for row in gq:
        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue
            tumor = pair.subjects[0]
            normal = pair.subjects[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic
            # mutations if in this block.
            if (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):

                tum_ref_depth = row['gt_ref_depths'][tum_idx]
                nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

                tum_alt_depth = row['gt_alt_depths'][tum_idx]
                nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

                # total observed depth
                nrm_depth = nrm_alt_depth + nrm_ref_depth
                tum_depth = tum_alt_depth + tum_ref_depth

                if (nrm_depth < args.min_norm_depth or
                        tum_depth < args.min_tumor_depth):
                    continue

                try:
                    tum_alt_freq = float(tum_alt_depth) / \
                        (float(tum_alt_depth) + float(tum_ref_depth))
                except ZeroDivisionError:
                    tum_alt_freq = 'NA'

                try:
                    nrm_alt_freq = float(nrm_alt_depth) / \
                        (float(nrm_alt_depth) + float(nrm_ref_depth))
                except ZeroDivisionError:
                    nrm_alt_freq = 'NA'

                # apply evidence thresholds.
                if ((args.max_norm_alt_freq and nrm_alt_freq > args.max_norm_alt_freq) or
                        (args.max_norm_alt_count and nrm_alt_depth > args.max_norm_alt_count)):
                    continue

                somatic_counter += 1
                somatic_v_ids.append((1, row['variant_id']))

                print '\t'.join(str(s) for s in [tumor.name, tum_gt, tum_alt_freq, tum_alt_depth, tum_depth,
                                                 normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth,
                                                 row['chrom'], row['start'], row['end'], row['ref'], row['alt'],
                                                 row['gene']])

    if not args.dry_run:
        import database
        conn, metadata = database.get_session_metadata(args.db)

        # now set the identified mutations to True.
        update_qry = "UPDATE variants SET is_somatic = 1 "
        update_qry += " WHERE variant_id IN (%s)"
        update_qry %= ",".join(str(x[1]) for x in somatic_v_ids)
        res = conn.execute(update_qry)
        assert res.rowcount == somatic_counter
        print "Identified and set", somatic_counter, "somatic mutations"
        conn.commit()
    else:
        print "Would have identified and set", somatic_counter, "somatic mutations"
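# For illustration, the UPDATE built at the end of tag_somatic_mutations() is a
# plain string expansion; with somatic_v_ids = [(1, 12), (1, 40)] it becomes:
#
#   UPDATE variants SET is_somatic = 1  WHERE variant_id IN (12,40)
#
# Note that an empty somatic_v_ids list would expand to "IN ()", which sqlite
# does not accept; the dry-run path above avoids this by not executing the query.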
def create(db, cols=None):
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        print >>sys.stderr, (
            "indexing all columns except 'gts'; to index that column, "
            "run gemini bcolz_index %s --cols gts" % db)

    conn, metadata = database.get_session_metadata(db)
    gt_cols = [x for x in get_gt_cols(metadata) if x in cols]
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)
    mkdir(bcpath)

    nv = get_n_variants(conn)
    sys.stderr.write("loading %i variants for %i samples into bcolz\n"
                     % (nv, len(samples)))
    if nv == 0 or len(samples) == 0:
        return

    carrays = {}
    tmps = {}
    try:
        for gtc in gt_cols:
            carrays[gtc] = []
            tmps[gtc] = []
            dt = dict(gt_cols_types)[gtc]
            for s in samples:
                mkdir("%s/%s" % (bcpath, s))
                carrays[gtc].append(bcolz.carray(np.empty(0, dtype=dt),
                                                 expectedlen=nv,
                                                 rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                                 chunklen=16384 * 8,
                                                 mode="w"))
                tmps[gtc].append([])

        t0 = time.time()
        # scale step by number of samples to limit memory use.
        step = max(100, 2000000 / len(samples))
        sys.stderr.write("step-size: %i\n" % step)
        del gtc
        decomp = compression.unpack_genotype_blob

        empty = [-1] * len(samples)
        for i, row in enumerate(conn.execute(sql.text("select %s from variants" % ", ".join(gt_cols)))):
            if i == 0:
                try:
                    decomp(row[0])
                except zlib.error:
                    decomp = compression.snappy_unpack_blob
            for j, gt_col in enumerate(gt_cols):
                vals = decomp(row[j])
                if vals is None or len(vals) == 0:  # empty gt_phred_ll
                    vals = empty
                for isamp, sample in enumerate(samples):
                    tmps[gt_col][isamp].append(vals[isamp])
                    if (i > 0 and i % step == 0) or i == nv - 1:
                        carrays[gt_col][isamp].append(tmps[gt_col][isamp])
                        tmps[gt_col][isamp] = []
                        carrays[gt_col][isamp].flush()

            if i % step == 0 and i > 0:
                print >>sys.stderr, "at %.1fM (%.0f rows / second)" % (
                    i / 1000000., i / float(time.time() - t0))

        t = float(time.time() - t0)
        print >>sys.stderr, "loaded %d variants at %.1f / second" % (
            len(carrays[gt_col][0]), nv / t)
    except:
        # on error, we remove the dirs so we can't have weird problems.
        for k, li in carrays.items():
            for i, ca in enumerate(li):
                if i < 5:
                    print >>sys.stderr, "removing:", ca.rootdir
                if i == 5:
                    print >>sys.stderr, "not reporting further removals for %s" % k
                ca.flush()
                shutil.rmtree(ca.rootdir)
        raise