def _add_gene_col_to_query(self):
    """
    Make sure the gene column is among the SELECT'ed columns of the query.

    The query is split into its selected columns and the remainder
    (everything get_select_cols_and_rest() returns as the second item).
    When none of the selected columns mentions "gene", ", gene " is
    appended to the column list and the query is rebuilt as
    "select <columns>, gene <remainder>".  When a selected column already
    mentions "gene" the query is left untouched, so the column is not
    returned twice.

    Returns:
        The (possibly rewritten) query, which is also stored in self.query.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    select_tokens, rest_of_query = get_select_cols_and_rest(self.query)

    # Appending unconditionally would yield "select gene, gene ..." for a
    # query that already asks for the column, so only add it when missing.
    # NOTE(review): this is a substring test, so any selected column whose
    # text contains "gene" counts as already present.
    if not any("gene" in column for column in select_tokens):
        # the trailing space separates the column list from the remainder
        select_clause = ",".join(select_tokens) + ", gene "
        self.query = "select " + select_clause + rest_of_query

    return self.query
def _add_gt_cols_to_query(self):
    """
    Rewrite the query so that the full genotype columns are SELECT'ed.

    Genotype filters need the raw genotype columns.  If the user wants to
    limit the rows returned based upon, for example, "gts.joe == 1", the
    full gts BLOB column has to be selected in order to enforce that
    limit.  The user will not have selected it, so the SELECT statement is
    modified here: every genotype-looking column the user did select is
    dropped, and gts, gt_types, gt_phases, gt_depths, gt_ref_depths,
    gt_alt_depths and gt_quals are appended instead.

    Returns:
        The rewritten query, which is also stored in self.query.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    select_tokens, rest_of_query = get_select_cols_and_rest(self.query)

    # The genotype columns that are always selected.  Joining them here
    # (rather than spelling them in a string literal that is continued
    # with a backslash) keeps source indentation out of the SQL text.
    gt_columns = ("gts", "gt_types", "gt_phases", "gt_depths",
                  "gt_ref_depths", "gt_alt_depths", "gt_quals")

    # Remove any GT columns the user asked for: plain ("gt..."),
    # parenthesised ("(gt...") or qualified ("....gt...") forms, in either
    # all-lower or all-upper case.
    kept_tokens = [
        token for token in select_tokens
        if not token.startswith(("gt", "GT", "(gt", "(GT"))
        and ".gt" not in token
        and ".GT" not in token
    ]

    # the trailing space separates the column list from the remainder
    gt_clause = ", ".join(gt_columns) + " "
    if kept_tokens:
        select_clause = ",".join(kept_tokens) + ", " + gt_clause
    else:
        # nothing but genotype columns was selected: no leading comma
        select_clause = gt_clause

    self.query = "select " + select_clause + rest_of_query
    return self.query
def _add_gene_col_to_query(self):
    """
    Append the gene column to the SELECT list unless it is already there.

    A selected column counts as "already there" when its text contains
    the substring "gene".  In that case the query is returned unchanged;
    otherwise it is rebuilt as "select <columns>, gene <remainder>".

    Returns:
        The query held in self.query after the optional rewrite.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    columns, remainder = get_select_cols_and_rest(self.query)

    # nothing to do as soon as one selected column mentions gene
    for column in columns:
        if "gene" in column:
            return self.query

    column_list = ",".join(columns) + ", gene "
    self.query = "select " + column_list + remainder
    return self.query
def _add_vcf_cols_to_query(self):
    """
    Add the VCF columns to the list of SELECT'ed columns in a query.

    Each of chrom, start, vcf_id, ref, alt, qual, filter, info, gts,
    gt_types and gt_phases is appended to the SELECT list unless a
    selected column already mentions it.

    NOTE: Should only be called if using VCFRowFormat()

    Returns:
        The rewritten query, which is also stored in self.query.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    select_tokens, rest_of_query = get_select_cols_and_rest(self.query)

    vcf_columns = ('chrom', 'start', 'vcf_id', 'ref', 'alt', 'qual',
                   'filter', 'info', 'gts', 'gt_types', 'gt_phases')

    # NOTE(review): this is a substring test, so e.g. a selected
    # "gt_alt_depths" also counts as "alt" being present - confirm that
    # this looseness is intended.
    cols_to_add = [
        col for col in vcf_columns
        if not any(col in token for token in select_tokens)
    ]

    select_clause = ",".join(select_tokens + cols_to_add)

    # Keep a space between the last column and the remainder of the query
    # so the two can never fuse into a single word; extra whitespace is
    # harmless in SQL if the remainder already starts with some.
    self.query = "select " + select_clause + " " + rest_of_query
    return self.query
def _split_select(self):
    """
    Classify every SELECT'ed column as genotype-specific or not.

    Columns starting with "gt" or "GT" are genotype-specific: they are
    translated with self._correct_genotype_col() and recorded in both
    name <-> index maps.  All other columns are base columns.

    Populates:
        select_columns      base (non-genotype) columns only
        all_columns_orig    every column as written in the query
        all_columns_new     every column, genotype ones translated
        gt_name_to_idx_map  original genotype column -> translated form
        gt_idx_to_name_map  translated form -> original genotype column

    For example, "SELECT chrom, start, end, gt_types.1478PC0011" gives
    select_columns = ['chrom', 'start', 'end'], with the last column
    translated to something like 'gt_types[11]' in all_columns_new.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    self.select_columns = []
    self.all_columns_new = []
    self.all_columns_orig = []
    self.gt_name_to_idx_map = {}
    self.gt_idx_to_name_map = {}

    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    # only the column list is needed here; the remainder is ignored
    columns, _ = get_select_cols_and_rest(self.query)

    for column in columns:
        is_genotype = column.startswith("GT") or column.startswith("gt")
        translated = (self._correct_genotype_col(column)
                      if is_genotype else column)

        self.all_columns_new.append(translated)
        self.all_columns_orig.append(column)

        if is_genotype:
            self.gt_name_to_idx_map[column] = translated
            self.gt_idx_to_name_map[translated] = column
        else:
            self.select_columns.append(column)
def _split_select(self):
    """
    Build a list of _all_ columns in the SELECT statement and segregate
    the non-genotype specific SELECT columns.

    This is used to control how to report the results, as the
    genotype-specific columns need to be eval()'ed whereas others do not.

    Three kinds of column are recognised:
      * wildcard, "(COLUMN).(WILDCARD)": expanded to one column per
        sample returned by self._get_matching_sample_ids(WILDCARD)
      * basic genotype column: translated with
        self._correct_genotype_col()
      * anything else: a base column

    Populates:
        select_columns      base (non-genotype) columns only
        all_columns_orig    every column under its display name
        all_columns_new     every column, genotype ones in indexed form
        gt_name_to_idx_map  display name -> indexed form
        gt_idx_to_name_map  indexed form -> display name

    For example, "SELECT chrom, start, end, gt_types.1478PC0011" gives
    select_columns = ['chrom', 'start', 'end'], with the last column
    recorded in indexed form (something like 'gt_types[11]') in
    all_columns_new.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    self.select_columns = []
    self.all_columns_new = []
    self.all_columns_orig = []
    self.gt_name_to_idx_map = {}
    self.gt_idx_to_name_map = {}

    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    # only the column list is needed here; the remainder is ignored
    select_tokens, _ = get_select_cols_and_rest(self.query)

    for token in select_tokens:
        # NOTE(review): this matches "gt" anywhere in the token, so a
        # base column such as "aa_length" is also treated as a genotype
        # column - confirm this is intended.
        mentions_gt = "gt" in token or "GT" in token
        has_open = '.(' in token
        has_close = ').' in token

        # it is a WILDCARD
        if mentions_gt and has_open and has_close:
            # break the wildcard into its pieces. That is:
            # (COLUMN).(WILDCARD)
            # Split on the first '.' only: the wildcard expression may
            # itself contain dots (e.g. "age>4.5"), which would otherwise
            # make the two-way unpacking raise ValueError.
            column, wildcard = token.split('.', 1)

            # remove the syntactic parentheses
            wildcard = wildcard.strip('(').strip(')')
            column = column.strip('(').strip(')')

            # Expand "gt_types.(affected==1)" into one column per
            # matching sample.  sample[0] is used as the index into the
            # genotype column and sample[1] as the displayed name
            # (presumably (index, name) pairs - verify against
            # _get_matching_sample_ids).
            for sample in self._get_matching_sample_ids(wildcard):
                display_col = column + '.' + str(sample[1])
                mask_col = column + '[' + str(sample[0]) + ']'

                self.all_columns_new.append(mask_col)
                self.all_columns_orig.append(display_col)
                self.gt_name_to_idx_map[display_col] = mask_col
                self.gt_idx_to_name_map[mask_col] = display_col

        # it is a basic genotype column
        elif mentions_gt and not has_open and not has_close:
            new_col = self._correct_genotype_col(token)
            self.all_columns_new.append(new_col)
            self.all_columns_orig.append(token)
            self.gt_name_to_idx_map[token] = new_col
            self.gt_idx_to_name_map[new_col] = token

        # it is neither
        else:
            self.select_columns.append(token)
            self.all_columns_new.append(token)
            self.all_columns_orig.append(token)
def _split_select(self):
    """
    Build a list of _all_ columns in the SELECT statement and segregate
    the non-genotype specific SELECT columns.

    This is used to control how to report the results, as the
    genotype-specific columns need to be eval()'ed whereas others do not.

    Three kinds of column are recognised:
      * wildcard, "(COLUMN).(WILDCARD)": expanded to one column per
        sample returned by self._get_matching_sample_ids(WILDCARD)
      * basic genotype column: translated with
        self._correct_genotype_col()
      * anything else: a base column

    Populates:
        select_columns      base (non-genotype) columns only
        all_columns_orig    every column under its display name
        all_columns_new     every column, genotype ones in indexed form
        gt_name_to_idx_map  display name -> indexed form
        gt_idx_to_name_map  indexed form -> display name

    For example, "SELECT chrom, start, end, gt_types.1478PC0011" gives
    select_columns = ['chrom', 'start', 'end'], with the last column
    recorded in indexed form (something like 'gt_types[11]') in
    all_columns_new.

    Exits through sys.exit() when the query contains no FROM keyword.
    """
    self.select_columns = []
    self.all_columns_new = []
    self.all_columns_orig = []
    self.gt_name_to_idx_map = {}
    self.gt_idx_to_name_map = {}

    if "from" not in self.query.lower():
        sys.exit("Malformed query: expected a FROM keyword.")

    # only the column list is needed here; the remainder is ignored
    select_tokens, _ = get_select_cols_and_rest(self.query)

    for token in select_tokens:
        # NOTE(review): this matches "gt" anywhere in the token, so a
        # base column such as "aa_length" is also treated as a genotype
        # column - confirm this is intended.
        mentions_gt = "gt" in token or "GT" in token
        has_open = '.(' in token
        has_close = ').' in token

        # it is a WILDCARD
        if mentions_gt and has_open and has_close:
            # break the wildcard into its pieces. That is:
            # (COLUMN).(WILDCARD)
            # Split on the first '.' only: the wildcard expression may
            # itself contain dots (e.g. "age>4.5"), which would otherwise
            # make the two-way unpacking raise ValueError.
            column, wildcard = token.split('.', 1)

            # remove the syntactic parentheses
            wildcard = wildcard.strip('(').strip(')')
            column = column.strip('(').strip(')')

            # Expand "gt_types.(affected==1)" into one column per
            # matching sample.  sample[0] is used as the index into the
            # genotype column and sample[1] as the displayed name
            # (presumably (index, name) pairs - verify against
            # _get_matching_sample_ids).
            for sample in self._get_matching_sample_ids(wildcard):
                display_col = column + '.' + str(sample[1])
                mask_col = column + '[' + str(sample[0]) + ']'

                self.all_columns_new.append(mask_col)
                self.all_columns_orig.append(display_col)
                self.gt_name_to_idx_map[display_col] = mask_col
                self.gt_idx_to_name_map[mask_col] = display_col

        # it is a basic genotype column
        elif mentions_gt and not has_open and not has_close:
            new_col = self._correct_genotype_col(token)
            self.all_columns_new.append(new_col)
            self.all_columns_orig.append(token)
            self.gt_name_to_idx_map[token] = new_col
            self.gt_idx_to_name_map[new_col] = token

        # it is neither
        else:
            self.select_columns.append(token)
            self.all_columns_new.append(token)
            self.all_columns_orig.append(token)