def apply_query_w_genotype_select(c, query, use_header):
    """
    Execute a query whose genotype (gt*) columns appear only in the SELECT.

    This is a generator.  When ``use_header`` is true the first item yielded
    is an OrderedDict mapping every reported column name to itself; every
    following item is an OrderedDict holding the reported values for one
    result row.

    c          -- cursor-like object: it is passed to
                  util.map_samples_to_indicies, must provide execute() and
                  description, and must iterate over rows that can be
                  indexed by column name (row["gts"], ...).
    query      -- the SQL statement supplied by the user.
    use_header -- when true, yield the header mapping before any row.
    """
    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    # the fourth element returned by _split_select is not needed here.
    (select_cols, all_cols_new, all_cols_orig, _) = \
        _split_select(query, sample_to_idx)

    # NOTE(review): lower() is applied to the whole statement, so any quoted
    # literal in the query is lowercased as well -- confirm this is intended.
    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # what are the columns that were actually selected by the user.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        # Drop the wildcard from both column lists.  Leaving it in
        # all_cols_orig lets a literal "*" leak into the header even though
        # no row ever carries a "*" field.  The membership guards avoid a
        # ValueError should either list not contain the wildcard.
        if "*" in all_cols_orig:
            all_cols_orig.remove("*")
        if "*" in all_cols_new:
            all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        header = list(all_query_cols) + \
            [col for col in oset(all_cols_orig) - oset(select_cols)]
        # zip() yields the same pairs as itertools.izip and exists on both
        # Python 2 and Python 3.
        yield OrderedDict(zip(header, header))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four locals look unused but must keep exactly these names:
        # the genotype column expressions eval()'ed below are resolved
        # against this function's local scope.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if not col.startswith(("gt", "GT")):
                fields[col] = row[col]
            else:
                # SECURITY NOTE(review): the column text originates from the
                # user's query and is eval()'ed as Python.  Only use with
                # trusted input.
                fields[col] = eval(col.strip())
        yield fields
def filter_query(c, query, gt_filter, use_header):
    """
    Execute a base SQL query while applying filters on the returned rows
    based on filters applied to the genotype-specific columns.

    For example:
    --gt_filter "(gt_types.1478PC0011 == 1 or gt_types.1478PC0012 == 1)"

    This is a generator.  When ``use_header`` is true the first item yielded
    is an OrderedDict mapping every reported column name to itself; every
    following item is an OrderedDict holding the reported values for one
    result row that satisfied the genotype filter.

    c          -- cursor-like object: it is passed to
                  util.map_samples_to_indicies, must provide execute() and
                  description, and must iterate over rows that can be
                  indexed by column name (row["gts"], ...).
    query      -- the SQL statement supplied by the user.
    gt_filter  -- the raw, sample-name based genotype filter expression.
    use_header -- when true, yield the header mapping before any row.
    """

    def correct_genotype_filter(gt_filter, sample_to_idx):
        """
        Convert a "raw" genotype filter supplied by the user into a filter
        that can be eval()'ed.  Specifically, we must convert a _named_
        genotype index to a _numerical_ genotype index so that the
        appropriate value can be extracted for the sample from the
        unpacked genotype arrays.

        For example, converts:
        --gt-filter "(gt_types.1478PC0011 == 1)"
        to
        (gt_types[11] == 1)
        """
        corrected_gt_filter = []
        # Tokens are delimited by whitespace and by '+', as before, but '+'
        # is now kept as a token of its own.  The previous pattern
        # r"[\s+]+" is a character class, so it consumed '+' as a separator
        # and silently dropped the operator from the expression.
        tokens = re.findall(r"\+|[^\s+]+", gt_filter)
        for token in tokens:
            if "gt" in token or "GT" in token:
                corrected = _correct_genotype_col(token, sample_to_idx)
                corrected_gt_filter.append(corrected)
            else:
                corrected_gt_filter.append(token)
        return " ".join(corrected_gt_filter)

    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)
    gt_filter = correct_genotype_filter(gt_filter, sample_to_idx)

    # the fourth element returned by _split_select is not needed here.
    (select_cols, all_cols_new, all_cols_orig, _) = \
        _split_select(query, sample_to_idx)

    # NOTE(review): lower() is applied to the whole statement, so any quoted
    # literal in the query is lowercased as well -- confirm this is intended.
    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # what are the columns that were actually selected by the user.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        # Drop the wildcard from both column lists so it is never reported.
        # The membership guards avoid a ValueError should either list not
        # contain the wildcard.
        if "*" in all_cols_orig:
            all_cols_orig.remove("*")
        if "*" in all_cols_new:
            all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        header = list(all_query_cols) + \
            [col for col in oset(all_cols_orig) - oset(select_cols)]
        # zip() yields the same pairs as itertools.izip and exists on both
        # Python 2 and Python 3.
        yield OrderedDict(zip(header, header))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four locals look unused but must keep exactly these names:
        # the filter and the genotype column expressions eval()'ed below are
        # resolved against this function's local scope.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        # SECURITY NOTE(review): gt_filter is user-supplied text that is
        # eval()'ed as Python.  Only use with trusted input.
        if not eval(gt_filter):
            continue

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if not col.startswith(("gt", "GT")):
                fields[col] = row[col]
            else:
                # SECURITY NOTE(review): same caveat as the filter above --
                # the column text comes from the user's query.
                fields[col] = eval(col.strip())
        yield fields