def test_cube_operators(self):
    """Check SQL rendering of GROUP BY CUBE / ROLLUP / GROUPING SETS."""
    t = table('t', column('value'), column('x'), column('y'), column('z'), column('q'))
    stmt = select([func.sum(t.c.value)])
    prefix = "SELECT sum(t.value) AS sum_1 FROM t GROUP BY "
    # (grouping expression, expected GROUP BY clause) pairs, compiled in order
    cases = [
        (func.cube(t.c.x, t.c.y), "CUBE(t.x, t.y)"),
        (func.rollup(t.c.x, t.c.y), "ROLLUP(t.x, t.y)"),
        (func.grouping_sets(t.c.x, t.c.y), "GROUPING SETS(t.x, t.y)"),
        (
            func.grouping_sets(
                sql.tuple_(t.c.x, t.c.y),
                sql.tuple_(t.c.z, t.c.q),
            ),
            "GROUPING SETS((t.x, t.y), (t.z, t.q))",
        ),
    ]
    for grouping, group_by_sql in cases:
        self.assert_compile(stmt.group_by(grouping), prefix + group_by_sql)
def test_cube_operators(self):
    """Check SQL rendering of GROUP BY CUBE / ROLLUP / GROUPING SETS."""
    t = table(
        "t",
        column("value"),
        column("x"),
        column("y"),
        column("z"),
        column("q"),
    )
    stmt = select(func.sum(t.c.value))
    prefix = "SELECT sum(t.value) AS sum_1 FROM t GROUP BY "
    # (grouping expression, expected GROUP BY clause) pairs, compiled in order
    cases = [
        (func.cube(t.c.x, t.c.y), "CUBE(t.x, t.y)"),
        (func.rollup(t.c.x, t.c.y), "ROLLUP(t.x, t.y)"),
        (func.grouping_sets(t.c.x, t.c.y), "GROUPING SETS(t.x, t.y)"),
        (
            func.grouping_sets(
                sql.tuple_(t.c.x, t.c.y), sql.tuple_(t.c.z, t.c.q)
            ),
            "GROUPING SETS((t.x, t.y), (t.z, t.q))",
        ),
    ]
    for grouping, group_by_sql in cases:
        self.assert_compile(stmt.group_by(grouping), prefix + group_by_sql)
def variant_distribution(self, by_attributes: List[Vocabulary], meta_attrs: MetadataAttrs,
                         region_attrs: RegionAttrs, variant: Mutation) -> dict:
    """
    Compute, per combination of by_attributes, the population size, the number of donors
    carrying `variant`, its total occurrence count and its frequency, querying every
    eligible source in parallel and merging the per-source results with a UNION.

    :param by_attributes: attributes to group the population by (one result row per
        CUBE combination of their values).
    :param meta_attrs: metadata filters constraining the donor population.
    :param region_attrs: region filters; gene names are resolved to intervals first.
    :param variant: the target variant. If its chromosome is unknown it is looked up.
    :return: the result of self.get_as_dictionary on the aggregate statement.
    :raises NoDataFromSources: when every eligible source failed to produce a result.
    """
    region_attrs = self.replace_gene_with_interval(region_attrs, meta_attrs.assembly)
    # keep only the sources able to answer a variant_occurrence request under these constraints
    eligible_sources = [
        source for source in self.use_sources
        if source.can_express_constraint(meta_attrs, region_attrs, source.variant_occurrence)
    ]
    self.logger.warning(f"eligible sources are {eligible_sources}")
    answer_204_if_no_source_can_answer(eligible_sources)
    # sorted copy of ( by_attributes + donor_id ) 'cos we need the same table schema from each source
    # GENDER is always included because the frequency functions below split counts by gender
    by_attributes_copy = set(by_attributes)
    by_attributes_copy.update([Vocabulary.DONOR_ID, Vocabulary.GENDER])
    by_attributes_copy = list(by_attributes_copy)
    by_attributes_copy.sort(key=lambda x: x.name)

    # collect results from individual sources as DONOR_ID | OCCURRENCE | <by_attributes>
    def ask_to_source(source: Type[Source]):
        # Build and run the per-source query; returns None on failure
        # (failures are converted by self.try_catch_source_errors).
        def do():
            obj: Source = source(self.logger)
            available_attributes_in_source = obj.get_available_attributes()
            select_from_source_output = [
            ]  # what we select from the source output (both available and unavailable attributes)
            selectable_attributes: List[Vocabulary] = [
            ]  # what we can ask to the source to give us
            for elem in by_attributes_copy:
                if elem in available_attributes_in_source:
                    selectable_attributes.append(elem)
                    select_from_source_output.append(column(elem.name))
                else:
                    # attribute missing in this source: emit a literal 'unknown' column
                    # so every source yields the same schema and the UNION lines up
                    select_from_source_output.append(
                        cast(literal(Vocabulary.unknown.name), types.String).label(elem.name))
            select_from_source_output.append(
                column(Vocabulary.OCCURRENCE.name))

            def variant_occurrence(a_connection):
                source_stmt = obj.variant_occurrence(a_connection, selectable_attributes, meta_attrs,
                                                     region_attrs, variant)\
                    .alias(source.__name__)
                return \
                    select(select_from_source_output)\
                    .select_from(source_stmt)

            return database.try_py_function(variant_occurrence)
        return self.try_catch_source_errors(do, None)

    # one extra worker for the optional get_region_of_variant lookup submitted below
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(eligible_sources) + 1) as executor:
        from_sources = executor.map(ask_to_source, eligible_sources)
        if variant.chrom is None:
            region_of_variant = executor.submit(
                self.get_region_of_variant, variant, meta_attrs.assembly).result()
        else:
            region_of_variant = [
                variant.chrom, variant.start, variant.start + 1
            ]  # stop is fake but I don't need it anyway
        # remove failures (also forces the lazy executor.map iterator to complete)
        from_sources = [
            result for result in from_sources if result is not None
        ]
        if len(from_sources) == 0:
            raise NoDataFromSources(self.notices)
        else:
            self.warn_if_mixed_germline_somatic_vars(eligible_sources)
            # UNION (not UNION ALL): also removes duplicate rows across sources
            all_sources = union(*from_sources).alias('all_sources')
            chrom = region_of_variant[0]
            start = region_of_variant[1]
            # functions
            func_count_donors = func.count(column(
                Vocabulary.DONOR_ID.name)).label('POPULATION_SIZE')
            # in the following statements 1 is an abbreviation for the column DONOR_ID
            func_count_positive_donors = func.count(1).filter(
                column(Vocabulary.OCCURRENCE.name) > 0).label(
                    'POSITIVE_DONORS')
            # donors with NULL gender are coalesced to '' and counted with the males
            func_count_males_and_na = cast(
                func.count(1).filter(
                    func.coalesce(column(Vocabulary.GENDER.name), '') != 'female'),
                types.Integer)
            func_count_females = cast(
                func.count(1).filter(
                    column(Vocabulary.GENDER.name) == 'female'),
                types.Integer)
            func_count_occurrence = func.sum(column(
                Vocabulary.OCCURRENCE.name)).label(
                    'OCCURRENCE_OF_TARGET_VARIANT')
            # frequency is computed server-side by an assembly-specific DB function
            # (schema 'rr'); presumably it accounts for ploidy at chrom/start — TODO confirm
            if meta_attrs.assembly == 'hg19':
                func_frequency_new = func.rr.mut_frequency_new_hg19(
                    func_count_occurrence, func_count_males_and_na,
                    func_count_females, chrom, start)
            else:
                func_frequency_new = func.rr.mut_frequency_new_grch38(
                    func_count_occurrence, func_count_males_and_na,
                    func_count_females, chrom, start)
            func_frequency_new = func_frequency_new.label(
                Vocabulary.FREQUENCY.name)
            # merge results by union (which removes duplicates) and count
            by_attributes_as_columns = [
                column(att.name) for att in by_attributes
            ]
            stmt = \
                select(by_attributes_as_columns +
                       [func_count_donors, func_count_positive_donors,
                        func_count_occurrence, func_frequency_new]) \
                .select_from(all_sources)
            # 23/24 appear to encode the sex chromosomes X/Y — TODO confirm encoding
            if chrom == 23 or chrom == 24:
                self.notices.append(
                    Notice(
                        'The target variant is located in a non-autosomal chromosome, as such the '
                        'individuals of the selected population having unknown gender have been excluded '
                        'from the frequency computation.'))
                stmt = stmt.where(
                    column(Vocabulary.GENDER.name).in_(['male', 'female']))
            # CUBE produces one row per combination of by_attributes values (incl. subtotals)
            stmt = stmt.group_by(func.cube(*by_attributes_as_columns))
            return self.get_as_dictionary(stmt, 'VARIANT DISTRIBUTION')
def donor_distribution(self, by_attributes: List[Vocabulary], meta_attrs: MetadataAttrs,
                       region_attrs: RegionAttrs) -> dict:
    """
    Count the donors matching meta_attrs/region_attrs, grouped by every CUBE
    combination of by_attributes, querying each eligible source in parallel and
    merging the per-source donor lists with a UNION.

    :param by_attributes: attributes to group the donor counts by.
    :param meta_attrs: metadata filters constraining the donor population.
    :param region_attrs: region filters; gene names are resolved to intervals first.
    :return: the result of self.get_as_dictionary on the aggregate statement.
    :raises NoDataFromSources: when every eligible source failed to produce a result.
    """
    region_attrs = self.replace_gene_with_interval(region_attrs, meta_attrs.assembly)
    # keep only the sources able to answer a donors request under these constraints
    eligible_sources = [
        source for source in self.use_sources
        if source.can_express_constraint(meta_attrs, region_attrs, source.donors)
    ]
    answer_204_if_no_source_can_answer(eligible_sources)
    # sorted copy of ( by_attributes + donor_id ) 'cos we need the same table schema from each source
    by_attributes_copy = by_attributes.copy()
    if Vocabulary.DONOR_ID not in by_attributes_copy:
        by_attributes_copy.append(Vocabulary.DONOR_ID)
    by_attributes_copy.sort(key=lambda x: x.name)

    # collect results from individual sources
    def ask_to_source(source: Type[Source]):
        # Build and run the per-source query; returns None on failure
        # (failures are converted by self.try_catch_source_errors).
        def do():
            obj: Source = source(self.logger)
            available_attributes_in_source = obj.get_available_attributes()
            select_from_source_output = [
            ]  # what we select from the source output (both available and unavailable attributes)
            selectable_attributes: List[Vocabulary] = [
            ]  # what we can ask to the source to give us
            for elem in by_attributes_copy:
                if elem in available_attributes_in_source:
                    selectable_attributes.append(elem)
                    select_from_source_output.append(column(elem.name))
                else:
                    # attribute missing in this source: emit a literal 'unknown' column
                    # so every source yields the same schema and the UNION lines up
                    select_from_source_output.append(
                        cast(literal(Vocabulary.unknown.name), types.String).label(elem.name))

            def donors(a_connection):
                source_stmt = obj.donors(a_connection, selectable_attributes, meta_attrs,
                                         region_attrs, False)\
                    .alias(source.__name__)
                return \
                    select(select_from_source_output) \
                    .select_from(source_stmt)

            return database.try_py_function(donors)
        return self.try_catch_source_errors(do, None)

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(eligible_sources)) as executor:
        from_sources = executor.map(ask_to_source, eligible_sources)
        # remove failures (also forces the lazy executor.map iterator to complete)
        from_sources = [
            result for result in from_sources if result is not None
        ]
        if len(from_sources) == 0:
            raise NoDataFromSources(self.notices)
        else:
            # aggregate the results of all the queries
            self.warn_if_mixed_germline_somatic_vars(eligible_sources)
            by_attributes_as_columns = [
                column(att.name) for att in by_attributes
            ]
            # UNION (not UNION ALL) deduplicates donors reported by multiple sources;
            # CUBE yields one count per combination of by_attributes (incl. subtotals)
            stmt = \
                select(
                    by_attributes_as_columns +
                    [func.count(column(Vocabulary.DONOR_ID.name)).label('DONORS')]
                )\
                .select_from(union(*from_sources).alias("all_sources"))\
                .group_by(func.cube(*by_attributes_as_columns))
            result = self.get_as_dictionary(stmt, 'DONOR DISTRIBUTION')
            return result