Ejemplo n.º 1
0
    def __init__(self, db):
        """
        Remember the database handle ``db``, connect, and cache the
        sample-name <-> index lookup tables.
        """
        self.db = db
        # Both status flags start out cleared.
        self.query_executed = self.for_browser = False

        # presumably this is what provides self.c, which is read below
        self._connect_to_database()

        # sample name -> index, e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indicies(self.c)
        # index -> sample name, e.g. self.idx_to_sample[323] -> NA20814
        self.idx_to_sample = util.map_indicies_to_samples(self.c)
Ejemplo n.º 2
0
    def __init__(self, db):
        """
        Store ``db``, reset the status flags, connect to the database and
        build the two sample lookup tables.
        """
        self.db = db
        # Nothing has been executed yet and browser mode is off.
        self.query_executed = self.for_browser = False

        # presumably sets self.c, which both lookups below are built from
        self._connect_to_database()

        # forward lookup: self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indicies(self.c)
        # reverse lookup: self.idx_to_sample[323] -> NA20814
        self.idx_to_sample = util.map_indicies_to_samples(self.c)
Ejemplo n.º 3
0
def get_query(args, c):
    """
    Dispatch the user-defined query passed in via the command line
    (``args.query``) to the matching handler.

    A query in which no whitespace-separated token contains "gt" goes to
    apply_basic_query(); every other query goes to
    apply_query_w_genotype_select().
    """
    # NOTE(review): the mapping is not used in this function; the call is
    # kept so the cursor sees the same sequence of operations as before.
    sample_to_idx = util.map_samples_to_indicies(c)

    # A token that starts with "gt" necessarily contains "gt", so the
    # substring test alone decides which branch is taken.
    mentions_genotypes = any("gt" in piece for piece in args.query.split())
    if mentions_genotypes:
        apply_query_w_genotype_select(c, args.query, args.use_header)
    else:
        apply_basic_query(c, args)
Ejemplo n.º 4
0
    def __init__(self, db, include_gt_cols=False):
        """
        Bind this object to the database at path ``db`` and build the
        sample lookup tables.

        ``include_gt_cols`` is stored on the instance unchanged.
        """
        # NOTE(review): assert statements are skipped under ``python -O``,
        # so this existence check disappears in optimized runs.
        assert os.path.exists(db), "%s does not exist." % db

        self.db = db
        # Both status flags start out cleared.
        self.query_executed = self.for_browser = False
        self.include_gt_cols = include_gt_cols

        # presumably this is what provides self.c, which is read below
        self._connect_to_database()

        # sample name -> index, e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indicies(self.c)
        # index -> sample name, e.g. self.idx_to_sample[323] -> NA20814
        self.idx_to_sample = util.map_indicies_to_samples(self.c)
Ejemplo n.º 5
0
def get_query(args, c):
    """
    Dispatch the user-defined query passed in via the command line
    (``args.query``).

    A query in which no whitespace-separated token contains "gt" is run
    with apply_query().  Any other query is first passed through
    refine_sql() and the four pieces it returns are handed to
    apply_refined_query().
    """
    sample_to_idx = util.map_samples_to_indicies(c)

    # A token that starts with "gt" necessarily contains "gt", so the
    # substring test covers both of the original checks.
    if not any("gt" in piece for piece in args.query.split()):
        apply_query(c, args)
        return

    refined = refine_sql(args.query, sample_to_idx)
    tokens, select_cols, main_where, gts_where = refined
    apply_refined_query(c, tokens, select_cols, main_where, gts_where, args)
Ejemplo n.º 6
0
def get_query(args, c):
    """
    Route the command-line query (``args.query``) to the right executor.

    When at least one whitespace-separated token contains "gt", the query
    is split up by refine_sql() and run through apply_refined_query();
    otherwise it is run as-is by apply_query().
    """
    sample_to_idx = util.map_samples_to_indicies(c)

    # "starts with gt" implies "contains gt", so one substring scan is
    # equivalent to the pair of checks it replaces.
    uses_genotype_cols = any("gt" in word for word in args.query.split())
    if not uses_genotype_cols:
        apply_query(c, args)
        return

    pieces = refine_sql(args.query, sample_to_idx)
    (tokens, select_cols, main_where, gts_where) = pieces
    apply_refined_query(c, tokens, select_cols, main_where, gts_where,
                        args)
Ejemplo n.º 7
0
def apply_query_w_genotype_select(c, query, use_header):
    """
    Execute a query that contains gt* columns only in the SELECT.

    This is a generator.

    Parameters:
        c          -- database cursor; the query is executed on it and
                      its rows are then iterated.
        query      -- the SQL text; it is lower-cased before execution.
        use_header -- when true, the first item yielded is a header that
                      maps every output column name to itself.

    Yields:
        One OrderedDict per row, keyed by column name.  Non-genotype
        columns are copied from the row; columns whose name starts with
        "gt"/"GT" are computed by eval()'ing the column expression
        against the unpacked genotype blobs of that row.

    NOTE(security): the genotype column expressions come from the query
    text and are passed to eval().  Only use this with queries from a
    trusted source.
    """
    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = \
        _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # the non-genotype columns actually returned by the executed query.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        # "*" is not removed from all_cols_orig above, and rows never
        # carry a "*" key (see the loop below), so it is filtered out here
        # to keep the header aligned with the rows.
        extra_cols = [col for col in oset(all_cols_orig) - oset(select_cols)
                      if col != "*"]
        h = list(all_query_cols) + extra_cols
        # zip() instead of itertools.izip() so this also runs on Python 3.
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four names look unused but must keep exactly these names
        # and stay in this frame: the eval() below resolves the genotype
        # column expressions against them.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # WARNING: eval() of text derived from the user's query.
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields
Ejemplo n.º 8
0
def filter_query(c, query, gt_filter, use_header):
    """
    Execute a base SQL query while applying filters on the returned
    rows based on filters applied to the genotype-specific columns.

    For example:
    --gt_filter "(gt_types.1478PC0011 == 1 or gt_types.1478PC0012 == 1)"

    This is a generator.

    Parameters:
        c          -- database cursor; the query is executed on it and
                      its rows are then iterated.
        query      -- the SQL text; it is lower-cased before execution.
        gt_filter  -- the genotype filter expression supplied by the user.
        use_header -- when true, the first item yielded is a header that
                      maps every output column name to itself.

    Yields:
        One OrderedDict per row for which the filter evaluates true,
        keyed by column name.

    NOTE(security): both the filter and the genotype column expressions
    are passed to eval().  Only use this with input from a trusted source.
    """

    def correct_genotype_filter(gt_filter, sample_to_idx):
        """
        This converts a "raw" genotype filter supplied by the user
        to a filter that can be eval()'ed.  Specifically, we must
        convert a _named_ genotype index to a _numerical_
        genotype index so that the appropriate value can be
        extracted for the sample from the unpacked genotype data.

        For example, converts:
        --gt-filter "(gt_types.1478PC0011 == 1)"
        to
        (gt_types[11] == 1)
        """
        corrected_gt_filter = []
        # Split on whitespace only.  The previous pattern, [\s+]+, is a
        # character class that also matches a literal "+", so any "+"
        # operator in the filter was silently dropped.
        for token in gt_filter.split():
            if "gt" in token or "GT" in token:
                corrected = _correct_genotype_col(token, sample_to_idx)
                corrected_gt_filter.append(corrected)
            else:
                corrected_gt_filter.append(token)
        return " ".join(corrected_gt_filter)

    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    gt_filter = correct_genotype_filter(gt_filter, sample_to_idx)
    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = \
        _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())

    c.execute(query)

    # the non-genotype columns actually returned by the executed query.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        all_cols_orig.remove("*")
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = list(all_query_cols) + list(oset(all_cols_orig) - oset(select_cols))
        # zip() instead of itertools.izip() so this also runs on Python 3.
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four names look unused but must keep exactly these names
        # and stay in this frame: both eval() calls below resolve the
        # user's genotype expressions against them.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        # WARNING: eval() of the user-supplied filter expression.
        if not eval(gt_filter):
            continue

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # WARNING: eval() of text derived from the user's query.
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields