def annotate(self, connection: Connection, genomic_interval: GenomicInterval, attrs: Optional[List[Vocabulary]], assembly) -> Selectable:
    """
    Builds the statement selecting the annotations that overlap a genomic interval.

    :param connection: database connection; stored on the instance and used for statement logging.
    :param genomic_interval: the interval (chrom, start, stop and optional strand) to annotate.
    :param attrs: a list of Vocabulary elements indicating the kind of annotation attributes desired.
        Each one is mapped to a column of the annotation table through self.col_map and labelled
        with the attribute name.
    :param assembly: 'hg19' selects the hg19 annotation item; any other value selects the GRCh38 one.
    :return: a statement that when executed returns the annotation data requested.
    """
    self.connection = connection

    requested_columns = [
        ann_table.c[self.col_map[attr]].label(attr.name) for attr in attrs
    ]

    # an annotation overlaps the interval when it starts before the interval ends
    # and ends after the interval starts, on the same chromosome
    overlaps_interval = (
        (ann_table.c.start <= genomic_interval.stop)
        & (ann_table.c.stop >= genomic_interval.start)
        & (ann_table.c.chrom == genomic_interval.chrom)
    )
    stmt = select(requested_columns).where(overlaps_interval)

    # a strand of None or 0 means that the strand is not constrained
    strand = genomic_interval.strand
    if strand is not None and strand != 0:
        stmt = stmt.where(ann_table.c.strand == strand)

    if assembly == 'hg19':
        item_id_for_assembly = item_id_assembly_hg19
    else:
        item_id_for_assembly = item_id_assembly_grch38
    stmt = stmt.where(ann_table.c.item_id == item_id_for_assembly)

    if self.log_sql_statements:
        utils.show_stmt(connection, stmt, self.logger.debug,
                        'GENCODE_V19_HG19: ANNOTATE REGION/VARIANT')
    return stmt
def view_of_variants_in_interval_or_type(self, select_columns: Optional[list]):
    """
    Creates a database view over the table regions, restricted to the interval in
    region_attrs.with_variants_in_reg and/or to the mutation types in
    region_attrs.with_variants_of_type, and returns it as a reflected Table.

    :param select_columns: the names of the columns of regions to include in the view.
        If None, all the columns are taken.
    :raises ValueError: if neither the interval nor the mutation types are set.
    """
    interval = self.region_attrs.with_variants_in_reg
    mutation_types = self.region_attrs.with_variants_of_type
    if interval is None and mutation_types is None:
        raise ValueError(
            'you called this method without giving any selection criteria')

    if select_columns is None:
        columns = [regions]
    else:
        columns = [regions.c[c_name] for c_name in select_columns]

    stmt_as = select(columns)
    if interval is not None:
        # a variant belongs to the interval when its start position falls inside it
        inside_interval = (
            (regions.c.chrom == interval.chrom)
            & (regions.c.start >= interval.start)
            & (regions.c.start <= interval.stop)
        )
        stmt_as = stmt_as.where(inside_interval)
    if mutation_types is not None:
        stmt_as = stmt_as.where(regions.c.mut_type.in_(mutation_types))

    generated_view_name = utils.random_t_name_w_prefix('mut_of_type_interval')
    stmt = utils.stmt_create_view_as(generated_view_name, stmt_as,
                                     default_schema_to_use_name)
    if self.log_sql_commands:
        title = 'VIEW OF REGIONS IN INTERVAL {} of types {}'.format(
            interval, mutation_types)
        utils.show_stmt(self.connection, stmt, self.logger.debug, title)
    self.connection.execute(stmt)

    return Table(generated_view_name,
                 db_meta,
                 autoload=True,
                 autoload_with=self.connection,
                 schema=default_schema_to_use_name)
def _table_with_any_of_mutations(self, select_columns, only_item_id_in_table: Optional[Table], *mutations: Mutation):
    """
    Materializes, as a new database table, the rows of the table regions that match at
    least one of the given mutations, and returns it as a reflected Table.

    :param select_columns: selects only the column names in this collection. If None, selects
        all the columns from regions.
    :param only_item_id_in_table: forwarded unchanged to _stmt_where_region_is_any_of_mutations.
        NOTE(review): presumably, when not None, the variants not owned by any individual of this
        table are discarded - confirm against that helper.
    :param mutations: the mutations to look for; at least one is required.
    :raises ValueError: if no mutation is given.
    """
    if not mutations:
        raise ValueError('function argument *mutations cannot be empty')

    # name of the table that will hold the result
    t_name = utils.random_t_name_w_prefix('with_any_of_mut')

    if select_columns is None:
        columns = [regions]
    else:
        columns = [regions.c[c_name] for c_name in select_columns]

    stmt_as = self._stmt_where_region_is_any_of_mutations(
        *mutations,
        from_table=regions,
        select_expression=select(columns),
        only_item_id_in_table=only_item_id_in_table)
    stmt_create_table = utils.stmt_create_table_as(
        t_name, stmt_as, default_schema_to_use_name)

    if self.log_sql_commands:
        title = 'CREATE TABLE HAVING ANY OF THE {} MUTATIONS'.format(len(mutations))
        utils.show_stmt(self.connection, stmt_create_table,
                        self.logger.debug, title)
    self.connection.execute(stmt_create_table)

    return Table(t_name,
                 db_meta,
                 autoload=True,
                 autoload_with=self.connection,
                 schema=default_schema_to_use_name)
def _table_without_any_of_mutations(self):
    """
    Materializes, as a new database table, the item_id values of self.my_meta_t that are not
    returned by the query matching the mutations in self.region_attrs.without_variants, and
    returns it as a reflected Table.

    The result is computed as: (item_id in my_meta_t) EXCEPT (item_id of the rows of regions
    matching any of the mutations).

    :raises ValueError: if self.region_attrs.without_variants is None or empty.
    """
    mutations = self.region_attrs.without_variants
    # Truthiness check (as done in table_with_all_of_mutations) so that an unset (None) attribute
    # is reported with a meaningful error instead of a TypeError raised by len(None).
    if not mutations:
        raise ValueError(
            'instance parameter self.region_attrs.without_variants not initialized or empty')

    # name of the table that will hold the result
    t_name = utils.random_t_name_w_prefix('without_any_of_mut')

    # item_id of the rows matching at least one of the unwanted mutations
    query_mutations = self._stmt_where_region_is_any_of_mutations(
        *mutations,
        from_table=regions,
        select_expression=select([regions.c.item_id]),
        only_item_id_in_table=self.my_meta_t)
    # everything in my_meta_t except those
    stmt_as = except_(select([self.my_meta_t.c.item_id]), query_mutations)
    stmt_create_table = utils.stmt_create_table_as(
        t_name, stmt_as, default_schema_to_use_name)

    if self.log_sql_commands:
        utils.show_stmt(
            self.connection, stmt_create_table, self.logger.debug,
            'CREATE TABLE WITHOUT ANY OF THE {} MUTATIONS'.format(
                len(mutations)))
    self.connection.execute(stmt_create_table)

    return Table(t_name,
                 db_meta,
                 autoload=True,
                 autoload_with=self.connection,
                 schema=default_schema_to_use_name)
def table_with_variants_same_c_copy(self, select_columns: Optional[list]):
    """
    Returns a table with the variants listed in RegionAttrs.with_variants_same_c_copy, taken only
    from the individuals that own all of them on the same chromosome copy.

    :param select_columns: the list of column names to select from the result. If None, all the
        columns are taken.
    :raises ValueError: if fewer than two mutations are given.
    """
    variants = self.region_attrs.with_variants_same_c_copy
    if len(variants) < 2:
        raise ValueError(
            'You must provide at least two Mutation instances in order to use this method.'
        )

    # Step 1: collect the given mutations from all the individuals. The presence of all of them
    # in the same individual, on the same chromosome copy, is enforced in step 2.
    if select_columns is None:
        intermediate_column_names = None  # means all columns
    else:
        # the requested columns plus the ones needed by step 2
        intermediate_column_names = set(select_columns)
        intermediate_column_names.update(['item_id', 'al1', 'al2'])
    intermediate_table = self._table_with_any_of_mutations(
        intermediate_column_names, self.my_meta_t, *variants)

    # Step 2: group the intermediate rows by owner and keep only the owners for which sum(al1) or
    # sum(al2) equals the number of the given mutations. That condition implies the presence of
    # all the given mutations in the same individual. For those owners, take all the rows.
    if select_columns is None:
        result_columns = [intermediate_table]  # means all columns
    else:
        result_columns = [
            intermediate_table.c[col_name] for col_name in select_columns
        ]

    n_variants = len(variants)
    # the ( ) around each condition are mandatory
    all_on_one_copy = (
        (func.sum(intermediate_table.c.al1) == n_variants)
        | (func.sum(func.coalesce(intermediate_table.c.al2, 0)) == n_variants)
    )
    owners_of_all = select([intermediate_table.c.item_id]) \
        .group_by(intermediate_table.c.item_id) \
        .having(all_on_one_copy)
    stmt_as = select(result_columns) \
        .where(intermediate_table.c.item_id.in_(owners_of_all))

    target_t_name = utils.random_t_name_w_prefix('with_var_same_c_copy')
    stmt = utils.stmt_create_table_as(target_t_name, stmt_as,
                                      default_schema_to_use_name)
    if self.log_sql_commands:
        utils.show_stmt(
            self.connection, stmt, self.logger.debug,
            'INDIVIDUALS (+ THE GIVEN MUTATIONS) HAVING ALL THE SPECIFIED MUTATIONS ON THE SAME CHROMOSOME COPY'
        )
    self.connection.execute(stmt)

    # the intermediate table is no longer needed
    if self.log_sql_commands:
        self.logger.debug('DROP TABLE ' + intermediate_table.name)
    intermediate_table.drop(self.connection)

    return Table(target_t_name,
                 db_meta,
                 autoload=True,
                 autoload_with=self.connection,
                 schema=default_schema_to_use_name)
def variant_occurrence(self, connection: Connection, by_attributes: list, meta_attrs: MetadataAttrs, region_attrs: RegionAttrs, variant: Mutation) -> Selectable:
    """
    Assembles a query statement that, after execution, returns a table containing, for each
    individual matching the conditions in region_attrs and meta_attrs, the attributes given in
    by_attributes and the number of times the given "variant" occurs in that individual
    (0 when the individual does not carry it).
    """
    # init state
    self.connection = connection
    meta_column_names = [self.meta_col_map[attr] for attr in by_attributes]
    self._set_meta_attributes(meta_attrs)
    self.create_table_of_meta(meta_column_names + ['item_id'])
    self._set_region_attributes(region_attrs)
    self.create_table_of_regions(['item_id'])

    # sample set: the target attributes (+ item_id) from the table of metadata built from meta_attrs
    sample_columns = [
        self.my_meta_t.c[self.meta_col_map[attr]] for attr in by_attributes
    ]
    sample_columns.append(self.my_meta_t.c.item_id)
    stmt_sample_set = select(sample_columns)
    # ... restricted to the individuals in the table of regions built from region_attrs, if any
    if self.my_region_t is not None:
        ids_in_regions = select([self.my_region_t.c.item_id]).distinct()
        stmt_sample_set = stmt_sample_set.where(
            self.my_meta_t.c.item_id.in_(ids_in_regions))
    stmt_sample_set = stmt_sample_set.alias()

    # individuals with "variant" in the table genomes, with the occurrence for each of them
    occurrence_label = Vocabulary.OCCURRENCE.name
    func_occurrence = (genomes.c.al1 +
                       func.coalesce(genomes.c.al2, 0)).label(occurrence_label)
    stmt_samples_w_var = self._stmt_where_region_is_any_of_mutations(
        variant,
        from_table=genomes,
        select_expression=select([genomes.c.item_id, func_occurrence])
    ).alias('samples_w_var')

    # final query: every individual in the sample set with the attributes in "by_attributes" and
    # the occurrence of the given variant; the left outer join keeps the individuals without it
    output_columns = [
        stmt_sample_set.c[self.meta_col_map[attr]].label(attr.name)
        for attr in by_attributes
    ]
    output_columns.append(
        func.coalesce(column(Vocabulary.OCCURRENCE.name), 0)
        .label(Vocabulary.OCCURRENCE.name))
    # TODO test what happens if sample set is empty and it is anyway used in the left join statement
    joined = stmt_sample_set.outerjoin(
        stmt_samples_w_var,
        stmt_sample_set.c.item_id == stmt_samples_w_var.c.item_id)
    stmt = select(output_columns).select_from(joined)

    if self.log_sql_commands:
        utils.show_stmt(connection, stmt, self.logger.debug,
                        'KGENOMES: STMT VARIANT OCCURRENCE')
    return stmt
def table_with_all_of_mutations(self, select_columns: Optional[list]):
    """
    Returns a table with the variants listed in RegionAttrs.with_variants, taken only from the
    individuals that own all of them.

    :param select_columns: the list of column names to select from the result. If None, all the
        columns are taken.
    :raises ValueError: if RegionAttrs.with_variants is None or empty.
    """
    variants = self.region_attrs.with_variants
    if not variants:
        raise ValueError(
            'instance parameter self.with_variants not initialized')

    # with a single mutation, "all of" and "any of" coincide
    if len(variants) == 1:
        return self._table_with_any_of_mutations(
            select_columns, self.my_meta_t, *variants)

    # Step 1: table with the rows matching any of the mutations
    if select_columns is None:
        union_select_column_names = None  # means all columns
    else:
        # the requested columns plus the one needed by step 2
        union_select_column_names = set(select_columns)
        union_select_column_names.add('item_id')
    union_table = self._table_with_any_of_mutations(
        union_select_column_names, self.my_meta_t, *variants)

    # Step 2: keep only the samples having as many rows as the number of mutations
    if select_columns is None:
        result_select_columns = [union_table]  # means all columns
    else:
        result_select_columns = [
            union_table.c[col_name] for col_name in select_columns
        ]
    owners_of_all = select([union_table.c.item_id]) \
        .group_by(union_table.c.item_id) \
        .having(func.count(union_table.c.item_id) == len(variants))
    stmt_as = select(result_select_columns) \
        .where(union_table.c.item_id.in_(owners_of_all))

    target_t_name = utils.random_t_name_w_prefix('with')
    stmt_create_table = utils.stmt_create_table_as(
        target_t_name, stmt_as, default_schema_to_use_name)
    if self.log_sql_commands:
        utils.show_stmt(
            self.connection, stmt_create_table, self.logger.debug,
            'INDIVIDUALS HAVING "ALL" THE {} MUTATIONS (WITH DUPLICATE ITEM_ID)'
            .format(len(variants)))
    self.connection.execute(stmt_create_table)

    # the table of step 1 is no longer needed
    if self.log_sql_commands:
        self.logger.debug('DROP TABLE ' + union_table.name)
    union_table.drop(self.connection)

    return Table(target_t_name,
                 db_meta,
                 autoload=True,
                 autoload_with=self.connection,
                 schema=default_schema_to_use_name)
def donors(self, connection, by_attributes: List[Vocabulary], meta_attrs: MetadataAttrs, region_attrs: RegionAttrs, with_download_urls: bool) -> Selectable:
    """
    Assembles a query statement that, when executed, returns a table containing, for each
    individual matching the requirements in meta_attrs and region_attrs, the attributes in
    "by_attributes" and, if with_download_urls is set, the local url from public_item.
    """
    # init state
    self.connection = connection
    meta_column_names = [self.meta_col_map[attr] for attr in by_attributes]
    self._set_meta_attributes(meta_attrs)
    self.create_table_of_meta(meta_column_names)
    self._set_region_attributes(region_attrs)
    self.create_table_of_regions(['item_id'])

    # every requested attribute except the gender, which needs the special handling below
    columns_of_interest = []
    for attr in by_attributes:
        if attr is Vocabulary.GENDER:
            continue
        columns_of_interest.append(
            self.my_meta_t.c[self.meta_col_map[attr]].label(attr.name))

    # TCGA has 4 gender classes: males/females/not reported/<no gender at all>. This trick merges
    # null gender with not reported. Otherwise, when coordinator does group by cube(gender) we
    # would get 2 times a null gender.
    if Vocabulary.GENDER in by_attributes:
        gender_column = self.my_meta_t.c[self.meta_col_map[Vocabulary.GENDER]]
        columns_of_interest.append(
            func.coalesce(gender_column, 'not reported')
            .label(Vocabulary.GENDER.name))

    if with_download_urls:
        columns_of_interest.append(
            public_item.c.local_url.label(Vocabulary.DOWNLOAD_REGION_URL.name))

    stmt = select(columns_of_interest)
    # restrict to the individuals in the table of regions, when region constraints exist
    if self.my_region_t is not None:
        ids_in_regions = select([self.my_region_t.c.item_id]).distinct()
        stmt = stmt.where(self.my_meta_t.c.item_id.in_(ids_in_regions))
    # link each individual to its public item to read the url
    if with_download_urls:
        stmt = stmt.where(self.my_meta_t.c.item_id == public_item.c.item_id)

    if self.log_sql_commands:
        utils.show_stmt(self.connection, stmt, self.logger.debug,
                        'TCGA: STMT DONORS WITH REQUIRED ATTRIBUTES')
    return stmt
def take_regions_of_common_individuals(self, tables: list):
    """
    Generates a table containing all the rows from all the origin tables, but only for those
    individuals that appear in all the origin tables. Supposing that each origin table reflects
    a characteristic that the final sample set must have, this method puts those characteristics
    in AND relationship by taking only the regions from the individuals that have all of them.

    :param tables: The source tables which must have the same columns in the same order.
    :return: the only source table when a single one is given (nothing is created in that case),
        otherwise the newly created table.
    """
    if len(tables) == 1:
        return tables[0]

    first_table = tables[0]

    def join_on_item_id(joined_so_far, next_table):
        # every table is joined on the item_id of the first one
        return joined_so_far.join(
            next_table, first_table.c.item_id == next_table.c.item_id)

    # join 1st with 2nd with 3rd ... with nth on item_id
    # TODO consider creating temporary tables selecting only the item_id before joining
    stmt_join = reduce(join_on_item_id, tables)

    # union of tables
    # TODO consider selecting from union table only what is needed by the users of this method (parametric choice)
    stmt_union = union(*[select([table_]) for table_ in tables]).alias()

    # select from the union table only the item_id that exists in the join
    item_id_in_join = exists(
        select()
        .select_from(stmt_join)
        .where(stmt_union.c.item_id == first_table.c.item_id))
    stmt_as = select([stmt_union]).where(item_id_in_join)

    target_t_name = utils.random_t_name_w_prefix('intersect')
    stmt_create_table = utils.stmt_create_table_as(
        target_t_name, stmt_as, default_schema_to_use_name)
    if self.log_sql_commands:
        utils.show_stmt(
            self.connection, stmt_create_table, self.logger.debug,
            'SELECT ALL FROM SOURCE TABLES WHERE item_id IS IN ALL SOURCE TABLES'
        )
    self.connection.execute(stmt_create_table)
    # TODO drop partial tables ?

    return Table(target_t_name,
                 db_meta,
                 autoload=True,
                 autoload_with=self.connection,
                 schema=default_schema_to_use_name)
def find_gene_region(self, connection: Connection, gene: Gene, output_attrs: List[Vocabulary], assembly):
    """
    Builds the statement selecting, from the annotation table, the rows of the given gene.

    :param connection: database connection; stored on the instance and used for statement logging.
    :param gene: the gene to look for. The name is always matched; type_ and id_ are matched only
        when they are not None.
    :param output_attrs: Vocabulary elements naming the attributes to return; each one is mapped
        to a column through self.col_map and labelled with the attribute name.
    :param assembly: 'hg19' selects the hg19 annotation item; any other value selects the GRCh38 one.
    :return: the assembled statement (it is not executed here).
    """
    self.connection = connection

    output_columns = [
        ann_table.c[self.col_map[att]].label(att.name) for att in output_attrs
    ]
    stmt = select(output_columns).where(ann_table.c.gene_name == gene.name)

    # optional refinements
    if gene.type_ is not None:
        stmt = stmt.where(ann_table.c.gene_type == gene.type_)
    if gene.id_ is not None:
        stmt = stmt.where(ann_table.c.gene_id == gene.id_)

    if assembly == 'hg19':
        item_id_for_assembly = item_id_assembly_hg19
    else:
        item_id_for_assembly = item_id_assembly_grch38
    stmt = stmt.where(ann_table.c.item_id == item_id_for_assembly)

    if self.log_sql_statements:
        utils.show_stmt(connection, stmt, self.logger.debug,
                        'GENCODE_V19_HG19: FIND GENE')
    return stmt
def get_variant_details(self, connection: Connection, variant: Mutation, which_details: List[Vocabulary], assembly) -> list:
    """
    Looks up a variant in the table genomes and returns the requested details of it.

    The variant is identified by (chrom, start, ref, alt) when variant.chrom is not None,
    otherwise by variant.id. Only the rows whose item_id belongs to the given assembly
    (according to the table metadata) are considered.

    :param connection: database connection; stored on the instance and used to run the query.
    :param variant: the variant to look for.
    :param which_details: Vocabulary elements naming the details to return. The ones without a
        mapping in self.region_col_map are returned as the literal Vocabulary.unknown.name.
    :param assembly: the assembly the variant coordinates refer to.
    :return: the values of the first matching row, in the order of which_details, or an empty
        list when the variant is not found. If more than one distinct row matches, an error is
        logged and the first row is returned anyway.
    """
    self.connection = connection

    select_columns = []
    for att in which_details:
        mapping = self.region_col_map.get(att)
        if mapping is not None:
            select_columns.append(genomes.c[mapping].label(att.name))
        else:
            # detail not available in this source: return a placeholder with the same label
            select_columns.append(
                cast(literal(Vocabulary.unknown.name), types.String).label(att.name))

    stmt = select(select_columns).distinct()
    if variant.chrom is not None:
        stmt = stmt.where((genomes.c.chrom == variant.chrom)
                          & (genomes.c.start == variant.start)
                          & (genomes.c.ref == variant.ref)
                          & (genomes.c.alt == variant.alt))
    else:
        stmt = stmt.where(genomes.c.id == variant.id)
    stmt = stmt.where(
        genomes.c.item_id.in_(
            select([metadata.c.item_id]).where(metadata.c.assembly == assembly)))

    if self.log_sql_commands:
        utils.show_stmt(connection, stmt, self.logger.debug,
                        'GET VARIANT DETAILS')

    result = connection.execute(stmt)
    try:
        # Count the fetched rows instead of relying on rowcount, which is not guaranteed to be
        # meaningful for SELECT statements on every DBAPI.
        rows = result.fetchall()
    finally:
        # release the cursor on every path, including the "not found" one
        result.close()

    if not rows:
        return list()
    if len(rows) > 1:
        self.logger.error(
            f'user searched for variant: chrom {variant.chrom}, start {variant.start}, '
            f'ref {variant.ref}, alt {variant.alt}, id {variant.id} '
            f'but {len(rows)} results were found')
    return rows[0].values()
def donors(self, connection, by_attributes: List[Vocabulary], meta_attrs: MetadataAttrs, region_attrs: RegionAttrs, with_download_urls: bool) -> Selectable:
    """
    Assembles a query statement that, when executed, returns a table containing, for each
    individual matching the requirements in meta_attrs and region_attrs, the attributes in
    "by_attributes" and, if with_download_urls is set, the local url from public_item.
    """
    # init state
    self.connection = connection
    meta_column_names = [self.meta_col_map[attr] for attr in by_attributes]
    self._set_meta_attributes(meta_attrs)
    self.create_table_of_meta(meta_column_names)
    self._set_region_attributes(region_attrs)
    self.create_table_of_regions(['item_id'])

    # output columns, labelled with the name of the requested attribute
    columns_of_interest = []
    for attr in by_attributes:
        meta_column = self.my_meta_t.c[self.meta_col_map[attr]]
        columns_of_interest.append(meta_column.label(attr.name))
    if with_download_urls:
        columns_of_interest.append(
            public_item.c.local_url.label(Vocabulary.DOWNLOAD_REGION_URL.name))

    stmt = select(columns_of_interest)
    # restrict to the individuals in the table of regions, when region constraints exist
    if self.my_region_t is not None:
        ids_in_regions = select([self.my_region_t.c.item_id]).distinct()
        stmt = stmt.where(self.my_meta_t.c.item_id.in_(ids_in_regions))
    # link each individual to its public item to read the url
    if with_download_urls:
        stmt = stmt.where(self.my_meta_t.c.item_id == public_item.c.item_id)

    if self.log_sql_commands:
        utils.show_stmt(self.connection, stmt, self.logger.debug,
                        'KGENOMES: STMT DONORS WITH REQUIRED ATTRIBUTES')
    return stmt
def try_stmt(what, log_function: Optional[Callable], log_title: Optional[str], num_attempts: int = 2) -> ResultProxy:
    """
    Executes a statement on a fresh connection, retrying when a DatabaseError is raised.

    On DatabaseError the engine's connection pool is disposed and, while attempts remain, the
    statement is tried again through a recursive call; once the attempts are exhausted the
    error is re-raised. The connection opened by each call is closed before that call returns.

    :param what: the statement to execute.
    :param log_function: if not None, the statement is shown through db_utils.show_stmt using
        this function.
    :param log_title: title passed to db_utils.show_stmt.
    :param num_attempts: total number of executions to try.
    :return: the result of the execution.
    """
    # following instruction can raise OperationalError if the database is not reachable/not
    # connected but it's caught elsewhere
    connection = db_engine.connect().execution_options(autocommit=True)
    try:
        num_attempts -= 1
        if log_function is not None:
            db_utils.show_stmt(connection, what, log_function, log_title)
        return connection.execute(what)
    except sqlalchemy_exceptions.DatabaseError:
        # pooled database connection has been invalidated/restarted
        logger.debug('Connection has been reset. Invalidate connection pool.')
        db_engine.dispose()
        logger.debug(f'POOL STATUS {db_engine.pool.status()!s}')
        if num_attempts <= 0:
            raise
        logger.debug(f'Attempt {num_attempts} more time(s)')
        return try_stmt(what, log_function, log_title, num_attempts)
    finally:
        connection.close()
def variants_in_region(self, connection: Connection, genomic_interval: GenomicInterval, output_region_attrs: List[Vocabulary], meta_attrs: MetadataAttrs, region_attrs: Optional[RegionAttrs]) -> Selectable:
    """
    Assembles a query statement that, when executed, returns the distinct variants whose start
    position falls inside genomic_interval, taken from the individuals matching meta_attrs and
    region_attrs.

    :param connection: database connection; stored on the instance and used for statement logging.
    :param genomic_interval: the interval (chrom, start, stop) to search.
    :param output_region_attrs: Vocabulary elements naming the attributes to return; each one is
        mapped to a column of regions through self.region_col_map and labelled with its name.
    :param meta_attrs: metadata requirements of the individuals.
    :param region_attrs: region requirements of the individuals.
    :return: the assembled statement (it is not executed here).
    """
    # init state
    self.connection = connection
    self._set_meta_attributes(meta_attrs)
    self.create_table_of_meta(['item_id'])
    self._set_region_attributes(region_attrs)
    self.create_table_of_regions(['item_id'])

    # individuals to consider: the ones in the table of metadata, intersected with the ones in
    # the table of regions when the latter exists
    ids_from_meta = select([self.my_meta_t.c.item_id])
    if self.my_region_t is None:
        only_from_samples = ids_from_meta
    else:
        only_from_samples = intersect(
            ids_from_meta, select([self.my_region_t.c.item_id]))
    only_from_samples = only_from_samples.alias('samples')

    select_columns = [
        regions.c[self.region_col_map[att]].label(att.name)
        for att in output_region_attrs
    ]

    regions_of_samples = regions.join(
        only_from_samples,
        only_from_samples.c.item_id == regions.c.item_id)
    starts_in_interval = (
        (regions.c.start >= genomic_interval.start)
        & (regions.c.start <= genomic_interval.stop)
        & (regions.c.chrom == genomic_interval.chrom)
    )
    stmt = select(select_columns).distinct() \
        .select_from(regions_of_samples) \
        .where(starts_in_interval)

    if self.log_sql_commands:
        title = (f'TCGA: VARIANTS IN REGION {genomic_interval.chrom}'
                 f'-{genomic_interval.start}-{genomic_interval.stop}')
        utils.show_stmt(connection, stmt, self.logger.debug, title)
    return stmt
def rank_variants_by_frequency(self, connection, meta_attrs: MetadataAttrs, region_attrs: RegionAttrs, ascending: bool, freq_threshold: float, limit_result: int, time_estimate_only: bool) -> FromClause:
    """
    Assembles the query ranking, by frequency, the variants of the individuals matching
    meta_attrs and region_attrs.

    The size of the population, split by gender, is computed first by running a query on the
    given connection. When time_estimate_only is set, only messages with the estimates are
    notified and EmptyResult is raised instead of building the ranking.

    :param connection: database connection, used to count the individuals and to log statements.
    :param meta_attrs: metadata requirements of the individuals; its assembly selects the
        database function computing the frequency.
    :param region_attrs: region requirements of the individuals.
    :param ascending: if True, variants are ordered by ascending frequency and occurrence and the
        threshold is a lower bound; otherwise the order is descending and the threshold is an
        upper bound.
    :param freq_threshold: frequency threshold; ignored when falsy.
    :param limit_result: maximum number of rows of the ranking.
    :param time_estimate_only: if True, notify the estimates and raise EmptyResult.
    :return: the ranking statement, aliased as 'TCGA_ranked' (it is not executed here).
    :raises EmptyResult: if no individual matches, or when time_estimate_only is set.
    """
    # init state
    self.connection = connection
    self._set_meta_attributes(meta_attrs)
    self.create_table_of_meta(['item_id', 'gender'])
    self._set_region_attributes(region_attrs)
    self.create_table_of_regions(['item_id'])

    # count the individuals of the sample set by gender
    gender_count_stmt = select([self.my_meta_t.c.gender, func.count()])
    if self.my_region_t is not None:
        gender_count_stmt = gender_count_stmt.where(
            self.my_meta_t.c.item_id.in_(select([self.my_region_t.c.item_id])))
    gender_count_stmt = gender_count_stmt.group_by(self.my_meta_t.c.gender)
    gender_of_individuals = [
        row.values()
        for row in connection.execute(gender_count_stmt).fetchall()
    ]
    if not gender_of_individuals:
        raise EmptyResult('TCGA ')

    # each element is [gender, number of individuals]
    females = next(
        (el[1] for el in gender_of_individuals if el[0] == 'female'), 0)
    males = next(
        (el[1] for el in gender_of_individuals if el[0] == 'male'), 0)
    other_genders = sum(el[1] for el in gender_of_individuals) - males - females
    population_size = males + females + other_genders
    self.logger.debug(
        f'TCGA: request /rank_variants_by_frequency for a population of {population_size} individuals'
    )

    if time_estimate_only:
        self.notify_message(SourceMessage.Type.TIME_TO_FINISH,
                            str(int(0.3 * population_size)))
        self.notify_message(
            SourceMessage.Type.GENERAL_WARNING,
            f'Samples to analyze in TCGA: {population_size}')
        # the 'n' format used below follows the current locale
        locale.setlocale(locale.LC_ALL, '')
        estimated_n_variants = 232 * population_size
        self.notify_message(
            SourceMessage.Type.GENERAL_WARNING,
            f'Estimated number of variants to rank in TCGA: ~{estimated_n_variants:n}'
        )
        raise EmptyResult('TCGA')

    if other_genders > 0:
        self.notify_message(
            SourceMessage.Type.GENERAL_WARNING,
            'Note for TCGA data: Individuals with an undefined gender have been excluded from the population while '
            'calculating the frequency of variants in chromosomes 23 and 24'
        )

    # reduce size of the join with regions table
    genomes_red = select([
        regions.c.item_id,
        regions.c.chrom,
        regions.c.start,
        regions.c.ref,
        regions.c.alt,
        regions.c.al1,
        regions.c.al2,
    ]).alias('variants_few_columns')

    # custom functions
    func_occurrence = (
        func.sum(genomes_red.c.al1)
        + func.sum(func.coalesce(genomes_red.c.al2, 0))
    ).label('occurrence_by_gender')
    func_positive_donors = func.count(
        genomes_red.c.item_id).label('positives_by_gender')

    # Actually, self.my_region_t already contains only the individuals compatible with
    # meta_attrs, but it can contain duplicated item_id. Since we want to join, it's better to
    # remove them.
    sample_set = select([self.my_meta_t.c.item_id, self.my_meta_t.c.gender])
    if self.my_region_t is not None:
        sample_set = sample_set.where(
            self.my_meta_t.c.item_id.in_(select([self.my_region_t.c.item_id])))
    sample_set = sample_set.alias('sample_set')

    # occurrence and positive donors of each variant, by gender
    variants_of_samples = genomes_red.join(
        sample_set, genomes_red.c.item_id == sample_set.c.item_id)
    stmt = select([
        genomes_red.c.chrom,
        genomes_red.c.start,
        genomes_red.c.ref,
        genomes_red.c.alt,
        func_occurrence,
        func_positive_donors,
        sample_set.c.gender,
    ]) \
        .select_from(variants_of_samples) \
        .group_by(genomes_red.c.chrom, genomes_red.c.start, genomes_red.c.ref,
                  genomes_red.c.alt, sample_set.c.gender) \
        .alias('stmt_1')

    # do not count the occurrences and the positiveness of whom who aren't males/females for a
    # variant in chrom 23/24
    not_chrom_23_24 = (stmt.c.chrom < 23) | (stmt.c.chrom > 24)
    males_column = case(
        [(not_chrom_23_24, males + other_genders)],
        else_=males).label('males')
    positives_column = cast(func.sum(column('positives_by_gender')),
                            types.INTEGER).label('positives')
    occurrence_column = cast(func.sum(column('occurrence_by_gender')),
                             types.INTEGER).label('occurrence')
    outer_stmt = select(
        [stmt.c.chrom, stmt.c.start, stmt.c.ref, stmt.c.alt]
        + [males_column, positives_column, occurrence_column]) \
        .where((stmt.c.gender.in_(['male', 'female']))
               | (stmt.c.chrom < 23)
               | (stmt.c.chrom > 24)) \
        .group_by(stmt.c.chrom, stmt.c.start, stmt.c.ref, stmt.c.alt) \
        .alias('stmt_2')

    # other custom function: the database function depends on the assembly
    if meta_attrs.assembly == 'hg19':
        frequency_function = func.rr.mut_frequency_new_hg19
    else:
        frequency_function = func.rr.mut_frequency_new_grch38
    func_frequency_new = frequency_function(
        column('occurrence'), column('males'), females,
        outer_stmt.c.chrom, outer_stmt.c.start
    ).label(Vocabulary.FREQUENCY.name)

    outer_outer_stmt = select([
        outer_stmt.c.chrom.label(Vocabulary.CHROM.name),
        outer_stmt.c.start.label(Vocabulary.START.name),
        outer_stmt.c.ref.label(Vocabulary.REF.name),
        outer_stmt.c.alt.label(Vocabulary.ALT.name),
        (column('males') + females).label(Vocabulary.POPULATION_SIZE.name),
        column('positives').label(Vocabulary.POSITIVE_DONORS.name),
        column('occurrence').label(Vocabulary.OCCURRENCE.name),
        func_frequency_new,
    ])

    # threshold (when given) and ordering depend on the requested direction
    if ascending:
        if freq_threshold:
            outer_outer_stmt = outer_outer_stmt.where(
                func_frequency_new >= freq_threshold)
        outer_outer_stmt = outer_outer_stmt.order_by(
            asc(func_frequency_new), asc(column('occurrence')))
    else:
        if freq_threshold:
            outer_outer_stmt = outer_outer_stmt.where(
                func_frequency_new <= freq_threshold)
        outer_outer_stmt = outer_outer_stmt.order_by(
            desc(func_frequency_new), desc(column('occurrence')))

    outer_outer_stmt = outer_outer_stmt.limit(limit_result).alias('TCGA_ranked')

    if self.log_sql_commands:
        utils.show_stmt(connection, outer_outer_stmt, self.logger.debug,
                        'TCGA: RANKING VARIANTS IN SAMPLE SET')
    return outer_outer_stmt