Exemple #1
0
    def _add_gene_col_to_query(self):
        """
        Append the ``gene`` column to the SELECT list of ``self.query``.

        The rewritten statement is stored back on ``self.query`` and is
        also returned.  The process exits through ``sys.exit`` when the
        query text contains no FROM keyword.
        """
        has_from = "from" in self.query.lower()
        if not has_from:
            sys.exit("Malformed query: expected a FROM keyword.")

        columns, tail = get_select_cols_and_rest(self.query)

        # keep the caller's columns as they are and tack ``gene`` on the end
        widened_select = ",".join(columns) + ", gene "
        rewritten = "select " + widened_select + tail

        self.query = rewritten
        return rewritten
Exemple #2
0
    def _add_gt_cols_to_query(self):
        """
        Rewrite ``self.query`` so that it selects the raw genotype columns.

        Genotype filters (for example ``gts.joe == 1``) can only be enforced
        when the full genotype columns are available, and the user will not
        normally have selected them.  Any genotype-style tokens in the SELECT
        list are therefore dropped and the columns gts, gt_types, gt_phases,
        gt_depths, gt_ref_depths, gt_alt_depths and gt_quals are appended
        in their place.

        The rewritten statement is stored on ``self.query`` and returned.
        The process exits through ``sys.exit`` when the query text contains
        no FROM keyword.
        """
        if "from" not in self.query.lower():
            sys.exit("Malformed query: expected a FROM keyword.")

        (select_tokens, rest_of_query) = get_select_cols_and_rest(self.query)

        # Drop every genotype-style token: those starting with gt / (gt and
        # those containing .gt, in either all-lower or all-upper case.
        gt_prefixes = ("gt", "GT", "(gt", "(GT")
        kept_tokens = [token for token in select_tokens
                       if not token.startswith(gt_prefixes)
                       and ".gt" not in token
                       and ".GT" not in token]

        # Built from adjacent literals (not a backslash continuation inside
        # one literal) so no source indentation ends up in the SQL.  The
        # trailing space separates the column list from the rest of the query.
        gt_clause = ("gts, gt_types, gt_phases, gt_depths, "
                     "gt_ref_depths, gt_alt_depths, gt_quals ")

        if kept_tokens:
            select_clause = ",".join(kept_tokens) + ", " + gt_clause
        else:
            # nothing but genotype columns was selected
            select_clause = gt_clause

        self.query = "select " + select_clause + rest_of_query
        return self.query
Exemple #3
0
    def _add_vcf_cols_to_query(self):
        """
        Add the VCF columns to the list of SELECT'ed columns in a query.

        Each of chrom, start, vcf_id, ref, alt, qual, filter, info, gts,
        gt_types and gt_phases is appended to the SELECT list unless it is
        already mentioned there as a whole identifier.  The rewritten
        statement is stored on ``self.query`` and returned.  The process
        exits through ``sys.exit`` when the query text contains no FROM
        keyword.

        NOTE: Should only be called if using VCFRowFormat()
        """
        import re  # local import: only this method needs it

        if "from" not in self.query.lower():
            sys.exit("Malformed query: expected a FROM keyword.")

        (select_tokens, rest_of_query) = get_select_cols_and_rest(self.query)

        vcf_cols = ['chrom', 'start', 'vcf_id', 'ref', 'alt', 'qual',
                    'filter', 'info', 'gts', 'gt_types', 'gt_phases']

        # Collect the identifiers mentioned anywhere in the SELECT list.
        # Whole identifiers are compared rather than substrings, so that a
        # selected 'gt_ref_depths' or 'gt_quals' is not mistaken for 'ref'
        # or 'qual' and does not suppress adding those columns.
        selected_names = set()
        for token in select_tokens:
            selected_names.update(re.findall(r"\w+", token))

        cols_to_add = [col for col in vcf_cols if col not in selected_names]

        select_clause = ",".join(select_tokens + cols_to_add)
        self.query = "select " + select_clause + rest_of_query
        return self.query
Exemple #4
0
    def _split_select(self):
        """
        Classify the SELECT'ed columns of ``self.query``.

        Every token is recorded, in order, in ``all_columns_orig`` (as
        written) and in ``all_columns_new`` (as it should be evaluated).
        Tokens that do not start with "gt"/"GT" are plain columns: they are
        stored unchanged in both lists and additionally in
        ``select_columns``.  Tokens that do start with "gt"/"GT" are
        genotype columns: they are rewritten by ``_correct_genotype_col``
        and the two ``gt_*_map`` dictionaries translate between the
        original and the rewritten spelling.

        This split controls how results are reported, as the
        genotype-specific columns need to be eval()'ed whereas the others
        do not.

        For example: "SELECT chrom, start, end, gt_types.1478PC0011"
        gives

        select_columns   = ['chrom', 'start', 'end']
        all_columns_orig = ['chrom', 'start', 'end', 'gt_types.1478PC0011']
        all_columns_new  = ['chrom', 'start', 'end', <rewritten column>]

        where the rewritten column is whatever ``_correct_genotype_col``
        returns (e.g. 'gt_types[11]').

        The process exits through ``sys.exit`` when the query text contains
        no FROM keyword; the attributes above are reset before that check.
        """
        self.select_columns, self.all_columns_new, self.all_columns_orig = \
            [], [], []
        self.gt_name_to_idx_map, self.gt_idx_to_name_map = {}, {}

        if not ("from" in self.query.lower()):
            sys.exit("Malformed query: expected a FROM keyword.")

        tokens, _rest = get_select_cols_and_rest(self.query)

        for tok in tokens:
            if tok.startswith(("GT", "gt")):
                # genotype column: remember both spellings and how to
                # translate between them
                rewritten = self._correct_genotype_col(tok)
                self.all_columns_new.append(rewritten)
                self.all_columns_orig.append(tok)
                self.gt_name_to_idx_map[tok] = rewritten
                self.gt_idx_to_name_map[rewritten] = tok
                continue

            # plain column: identical in every list
            for bucket in (self.select_columns,
                           self.all_columns_new,
                           self.all_columns_orig):
                bucket.append(tok)
Exemple #5
0
    def _split_select(self):
        """
        Classify the SELECT'ed columns of ``self.query``.

        Every reported column is recorded, in order, in ``all_columns_orig``
        (the display name) and in ``all_columns_new`` (the form that should
        be evaluated).  Three kinds of token are distinguished:

        * wildcard genotype columns, written "(COLUMN).(WILDCARD)": the
          wildcard is handed to ``_get_matching_sample_ids`` and one
          column is emitted per matching sample;
        * basic genotype columns (they mention "gt"/"GT" but carry no
          wildcard syntax): rewritten by ``_correct_genotype_col``;
        * everything else: stored unchanged, and additionally listed in
          ``select_columns``.

        For both genotype kinds the two ``gt_*_map`` dictionaries translate
        between the display name and the evaluated form.  This split
        controls how results are reported, as the genotype-specific columns
        need to be eval()'ed whereas the others do not.

        For example: "SELECT chrom, start, end, gt_types.1478PC0011"
        gives

        select_columns   = ['chrom', 'start', 'end']
        all_columns_orig = ['chrom', 'start', 'end', 'gt_types.1478PC0011']
        all_columns_new  = ['chrom', 'start', 'end', <rewritten column>]

        where the rewritten column is whatever ``_correct_genotype_col``
        returns (e.g. 'gt_types[11]').

        The process exits through ``sys.exit`` when the query text contains
        no FROM keyword; the attributes above are reset before that check.
        """
        self.select_columns = []
        self.all_columns_new = []
        self.all_columns_orig = []
        self.gt_name_to_idx_map = {}
        self.gt_idx_to_name_map = {}

        # iterate through all of the select columns and
        # distinguish the genotype-specific columns from the base columns
        if "from" not in self.query.lower():
            sys.exit("Malformed query: expected a FROM keyword.")

        (select_tokens, _rest_of_query) = get_select_cols_and_rest(self.query)

        for token in select_tokens:
            mentions_gt = "gt" in token or "GT" in token
            has_wildcard_open = '.(' in token
            has_wildcard_close = ').' in token

            # it is a WILDCARD
            if mentions_gt and has_wildcard_open and has_wildcard_close:
                # break the wildcard into its pieces. That is:
                # (COLUMN).(WILDCARD)
                # Split on the first ".(" only, so that dots inside the
                # wildcard expression (e.g. "age > 30.5") are left alone;
                # this also consumes the wildcard's opening parenthesis.
                (column, _sep, wildcard) = token.partition('.(')

                # remove the syntactic parentheses
                column = column.strip('(').strip(')')
                # Drop only the single closing parenthesis that wraps the
                # wildcard; any further ')' belongs to the expression.
                if wildcard.endswith(')'):
                    wildcard = wildcard[:-1]

                # e.g. "gt_types.(affected==1)" becomes one column per
                # matching sample, such as gt_types[3] and gt_types[9].
                # NOTE(review): each match is read as sample[0] = index used
                # for evaluation, sample[1] = label used for display --
                # confirm against _get_matching_sample_ids.
                for sample in self._get_matching_sample_ids(wildcard):
                    wildcard_display_col = column + '.' + str(sample[1])
                    wildcard_mask_col = column + '[' + str(sample[0]) + ']'

                    self.all_columns_new.append(wildcard_mask_col)
                    self.all_columns_orig.append(wildcard_display_col)
                    self.gt_name_to_idx_map[wildcard_display_col] = \
                        wildcard_mask_col
                    self.gt_idx_to_name_map[wildcard_mask_col] = \
                        wildcard_display_col

            # it is a basic genotype column
            elif (mentions_gt
                  and not has_wildcard_open
                  and not has_wildcard_close):
                new_col = self._correct_genotype_col(token)

                self.all_columns_new.append(new_col)
                self.all_columns_orig.append(token)
                self.gt_name_to_idx_map[token] = new_col
                self.gt_idx_to_name_map[new_col] = token

            # it is neither
            else:
                self.select_columns.append(token)
                self.all_columns_new.append(token)
                self.all_columns_orig.append(token)