Esempio n. 1
0
def apply_query_w_genotype_select(c, query, use_header):
    """
    Execute a query that contains gt* columns only in the SELECT.

    This is a generator.  When ``use_header`` is true, the first item
    yielded is an OrderedDict mapping each reported column name to itself;
    after that, one OrderedDict is yielded per result row.

    Parameters:
        c          -- cursor-like object: must provide execute(),
                      description, and iteration over rows that support
                      lookup by column name (e.g. row["gts"]).
        query      -- the SQL query text supplied by the user.
        use_header -- if true, yield a header record before any rows.
    """
    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    # the fourth value returned by _split_select is not needed here.
    (select_cols, all_cols_new, all_cols_orig, _gt_col_map) = _split_select(query, sample_to_idx)

    # NOTE(review): lower() is applied to the whole query text, so any
    # quoted literals in the query are lower-cased too -- confirm intended.
    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # what are the columns that were actually selected by the user.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        all_cols_new.remove("*")
        # Drop the wildcard from the original column names as well, so the
        # header does not carry a "*" entry that no row ever provides
        # (rows skip "*" below).  Guarded so a missing "*" is not an error.
        if "*" in all_cols_orig:
            all_cols_orig.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = all_query_cols + list(oset(all_cols_orig) - oset(select_cols))
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four locals look unused but are referenced by name from the
        # eval() below; do not rename or remove them.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # SECURITY NOTE: the column expression originates from the
                # user's query and is eval()'ed against the unpacked
                # genotype arrays above.  Only run with trusted queries.
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields
Esempio n. 2
0
def filter_query(c, query, gt_filter, use_header):
    """
    Execute a base SQL query while applying filters on the returned
    rows based on filters applied to the genotype-specific columns.

    For example:
    --gt_filter "(gt_types.1478PC0011 == 1 or gt_types.1478PC0012 == 1)"

    This is a generator.  When ``use_header`` is true, the first item
    yielded is an OrderedDict mapping each reported column name to itself;
    after that, one OrderedDict is yielded for every result row for which
    the genotype filter evaluates to a true value.

    Parameters:
        c          -- cursor-like object: must provide execute(),
                      description, and iteration over rows that support
                      lookup by column name (e.g. row["gts"]).
        query      -- the SQL query text supplied by the user.
        gt_filter  -- the raw genotype filter expression (see above).
        use_header -- if true, yield a header record before any rows.
    """

    def correct_genotype_filter(gt_filter, sample_to_idx):
        """
        This converts a "raw" genotype filter supplied by the user
        to a filter that can be eval()'ed.  Specifically, we must
        convert a _named_ genotype index to a _numerical_
        genotype index so that the appropriate value can be
        extracted for the sample from the genotype numpy arrays.

        For example, converts:
        --gt-filter "(gt_types.1478PC0011 == 1)"
        to
        (gt_types[11] == 1)
        """
        corrected_gt_filter = []
        # Split on whitespace only.  The previous pattern, [\s+]+, was a
        # character class that also matched a literal "+", so arithmetic
        # "+" operators were silently stripped out of the filter.
        for token in re.split(r"\s+", gt_filter):
            if "gt" in token or "GT" in token:
                corrected_gt_filter.append(
                    _correct_genotype_col(token, sample_to_idx))
            else:
                corrected_gt_filter.append(token)
        return " ".join(corrected_gt_filter)

    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    gt_filter = correct_genotype_filter(gt_filter, sample_to_idx)
    # the fourth value returned by _split_select is not needed here.
    (select_cols, all_cols_new, all_cols_orig, _gt_col_map) = _split_select(query, sample_to_idx)

    # NOTE(review): lower() is applied to the whole query text, so any
    # quoted literals in the query are lower-cased too -- confirm intended.
    query = add_gt_cols_to_query(query.lower())

    c.execute(query)

    # what are the columns that were actually selected by the user.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        all_cols_orig.remove("*")
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = all_query_cols + list(oset(all_cols_orig) - oset(select_cols))
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four locals look unused but are referenced by name from the
        # eval() calls below; do not rename or remove them.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        # SECURITY NOTE: gt_filter is user-supplied text that is eval()'ed
        # as a Python expression.  Only run with trusted input.
        if not eval(gt_filter):
            continue

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # genotype columns are expressions over the arrays unpacked
                # above (same eval() caveat as for the filter).
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields