Beispiel #1
0
    def test_cube_operators(self):
        """Check how CUBE, ROLLUP and GROUPING SETS compile inside GROUP BY."""
        tbl = table(
            't', *[column(name) for name in ('value', 'x', 'y', 'z', 'q')])
        cols = tbl.c
        total = select([func.sum(cols.value)])

        # Each entry pairs a grouping construct with the SQL string the
        # grouped statement is expected to compile to.
        expectations = [
            (
                func.cube(cols.x, cols.y),
                "SELECT sum(t.value) AS sum_1 FROM t "
                "GROUP BY CUBE(t.x, t.y)",
            ),
            (
                func.rollup(cols.x, cols.y),
                "SELECT sum(t.value) AS sum_1 FROM t "
                "GROUP BY ROLLUP(t.x, t.y)",
            ),
            (
                func.grouping_sets(cols.x, cols.y),
                "SELECT sum(t.value) AS sum_1 FROM t "
                "GROUP BY GROUPING SETS(t.x, t.y)",
            ),
            (
                # Tuples become parenthesised sets inside GROUPING SETS.
                func.grouping_sets(
                    sql.tuple_(cols.x, cols.y),
                    sql.tuple_(cols.z, cols.q),
                ),
                "SELECT sum(t.value) AS sum_1 FROM t "
                "GROUP BY GROUPING SETS((t.x, t.y), (t.z, t.q))",
            ),
        ]

        for grouping, expected_sql in expectations:
            self.assert_compile(total.group_by(grouping), expected_sql)
Beispiel #2
0
    def test_cube_operators(self):
        """Check how CUBE, ROLLUP and GROUPING SETS compile inside GROUP BY."""
        tbl = table(
            "t", *[column(name) for name in ("value", "x", "y", "z", "q")]
        )
        cols = tbl.c
        total = select(func.sum(cols.value))

        # All expected statements share everything up to the GROUP BY clause.
        prefix = "SELECT sum(t.value) AS sum_1 FROM t GROUP BY "

        def check(grouping, rendered_grouping):
            # Compile the aggregate grouped by ``grouping`` and compare it
            # with the shared prefix followed by ``rendered_grouping``.
            self.assert_compile(
                total.group_by(grouping), prefix + rendered_grouping
            )

        check(func.cube(cols.x, cols.y), "CUBE(t.x, t.y)")
        check(func.rollup(cols.x, cols.y), "ROLLUP(t.x, t.y)")
        check(func.grouping_sets(cols.x, cols.y), "GROUPING SETS(t.x, t.y)")
        # Tuples become parenthesised sets inside GROUPING SETS.
        check(
            func.grouping_sets(
                sql.tuple_(cols.x, cols.y), sql.tuple_(cols.z, cols.q)
            ),
            "GROUPING SETS((t.x, t.y), (t.z, t.q))",
        )
Beispiel #3
0
    def variant_distribution(self, by_attributes: List[Vocabulary],
                             meta_attrs: MetadataAttrs,
                             region_attrs: RegionAttrs,
                             variant: Mutation) -> dict:
        """Aggregate occurrence and frequency of ``variant`` per attribute group.

        Each source in ``self.use_sources`` that reports it can express the
        given constraints is asked, through a thread pool, for a select with
        one shared column layout. The non-``None`` selects are combined with
        UNION and grouped with CUBE over ``by_attributes``, producing the
        columns POPULATION_SIZE, POSITIVE_DONORS,
        OCCURRENCE_OF_TARGET_VARIANT and a frequency column named after
        ``Vocabulary.FREQUENCY``.

        :param by_attributes: attributes the result is grouped by.
        :param meta_attrs: metadata constraints; ``assembly`` selects which
            ``rr.mut_frequency_new_*`` database function is used.
        :param region_attrs: region constraints, first passed through
            ``self.replace_gene_with_interval``.
        :param variant: the target variant; when ``variant.chrom`` is
            ``None`` its region is obtained from
            ``self.get_region_of_variant``.
        :return: the value of ``self.get_as_dictionary`` for the aggregate
            statement.
        :raises NoDataFromSources: when every per-source result is ``None``.
        """
        region_attrs = self.replace_gene_with_interval(
            region_attrs, meta_attrs.assembly)

        eligible_sources = []
        for candidate in self.use_sources:
            if candidate.can_express_constraint(
                    meta_attrs, region_attrs, candidate.variant_occurrence):
                eligible_sources.append(candidate)
        self.logger.warning(f"eligible sources are {eligible_sources}")
        answer_204_if_no_source_can_answer(eligible_sources)

        # Every source has to return the same columns in the same order so
        # the selects can be unioned: requested attributes plus DONOR_ID and
        # GENDER, de-duplicated and ordered by attribute name.
        layout = sorted(
            set(by_attributes) | {Vocabulary.DONOR_ID, Vocabulary.GENDER},
            key=lambda attribute: attribute.name)

        # Builds, for one source, a select shaped as
        # <layout attributes> | OCCURRENCE.
        # The nested function names are left as they were because the
        # functions are handed to project helpers not visible here.
        def ask_to_source(source: Type[Source]):
            def do():
                source_instance: Source = source(self.logger)
                offered = source_instance.get_available_attributes()

                # Attributes this source can actually be asked for.
                selectable_attributes: List[Vocabulary] = [
                    attribute for attribute in layout if attribute in offered
                ]
                # One output column per layout attribute; attributes the
                # source does not offer are filled with a constant string.
                select_from_source_output = [
                    column(attribute.name) if attribute in offered
                    else cast(literal(Vocabulary.unknown.name),
                              types.String).label(attribute.name)
                    for attribute in layout
                ]
                select_from_source_output.append(
                    column(Vocabulary.OCCURRENCE.name))

                def variant_occurrence(a_connection):
                    source_stmt = source_instance.variant_occurrence(
                        a_connection, selectable_attributes, meta_attrs,
                        region_attrs, variant)
                    return select(select_from_source_output).select_from(
                        source_stmt.alias(source.__name__))

                return database.try_py_function(variant_occurrence)

            return self.try_catch_source_errors(do, None)

        # One worker per source plus one for the optional region lookup.
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=len(eligible_sources) + 1) as executor:
            outcomes = executor.map(ask_to_source, eligible_sources)
            if variant.chrom is not None:
                # The third element (stop) is a placeholder; only the first
                # two elements are read below.
                region_of_variant = [
                    variant.chrom, variant.start, variant.start + 1
                ]
            else:
                lookup = executor.submit(
                    self.get_region_of_variant, variant, meta_attrs.assembly)
                region_of_variant = lookup.result()

        # Drop the sources that produced nothing.
        from_sources = [
            outcome for outcome in outcomes if outcome is not None
        ]
        if not from_sources:
            raise NoDataFromSources(self.notices)

        self.warn_if_mixed_germline_somatic_vars(eligible_sources)
        all_sources = union(*from_sources).alias('all_sources')
        chrom = region_of_variant[0]
        start = region_of_variant[1]

        def as_column(attribute):
            # Plain column reference named after the vocabulary entry.
            return column(attribute.name)

        # Aggregates. count(1) below counts rows of the unioned table.
        population_size = func.count(
            as_column(Vocabulary.DONOR_ID)).label('POPULATION_SIZE')
        has_occurrence = as_column(Vocabulary.OCCURRENCE) > 0
        positive_donors = func.count(1).filter(has_occurrence).label(
            'POSITIVE_DONORS')
        # A NULL gender is coalesced to '' and therefore counted here too.
        not_female = func.coalesce(
            as_column(Vocabulary.GENDER), '') != 'female'
        males_and_na = cast(
            func.count(1).filter(not_female), types.Integer)
        is_female = as_column(Vocabulary.GENDER) == 'female'
        females = cast(func.count(1).filter(is_female), types.Integer)
        occurrence_total = func.sum(
            as_column(Vocabulary.OCCURRENCE)).label(
                'OCCURRENCE_OF_TARGET_VARIANT')

        # The frequency is computed by an assembly-specific DB function.
        if meta_attrs.assembly == 'hg19':
            frequency_function = func.rr.mut_frequency_new_hg19
        else:
            frequency_function = func.rr.mut_frequency_new_grch38
        frequency = frequency_function(
            occurrence_total, males_and_na, females, chrom, start
        ).label(Vocabulary.FREQUENCY.name)

        # UNION has already removed duplicate rows; now aggregate.
        group_columns = [as_column(attribute) for attribute in by_attributes]
        aggregates = [
            population_size, positive_donors, occurrence_total, frequency
        ]
        stmt = select(group_columns + aggregates).select_from(all_sources)
        if chrom in (23, 24):
            self.notices.append(
                Notice(
                    'The target variant is located in a non-autosomal chromosome, as such the '
                    'individuals of the selected population having unknown gender have been excluded '
                    'from the frequency computation.'))
            stmt = stmt.where(
                as_column(Vocabulary.GENDER).in_(['male', 'female']))
        stmt = stmt.group_by(func.cube(*group_columns))

        return self.get_as_dictionary(stmt, 'VARIANT DISTRIBUTION')
Beispiel #4
0
    def donor_distribution(self, by_attributes: List[Vocabulary],
                           meta_attrs: MetadataAttrs,
                           region_attrs: RegionAttrs) -> dict:
        """Count donors per combination of ``by_attributes`` across sources.

        Each source in ``self.use_sources`` that reports it can express the
        given constraints is asked, through a thread pool, for a select with
        one shared column layout. The non-``None`` selects are combined with
        UNION and the DONOR_ID column is counted as DONORS, grouped with CUBE
        over ``by_attributes``.

        :param by_attributes: attributes the donors are grouped by.
        :param meta_attrs: metadata constraints.
        :param region_attrs: region constraints, first passed through
            ``self.replace_gene_with_interval``.
        :return: the value of ``self.get_as_dictionary`` for the aggregate
            statement.
        :raises NoDataFromSources: when every per-source result is ``None``.
        """
        region_attrs = self.replace_gene_with_interval(
            region_attrs, meta_attrs.assembly)

        eligible_sources = []
        for candidate in self.use_sources:
            if candidate.can_express_constraint(
                    meta_attrs, region_attrs, candidate.donors):
                eligible_sources.append(candidate)
        answer_204_if_no_source_can_answer(eligible_sources)

        # Every source has to return the same columns in the same order so
        # the selects can be unioned: requested attributes plus DONOR_ID
        # (added only if missing), ordered by attribute name.
        layout = sorted(
            by_attributes if Vocabulary.DONOR_ID in by_attributes
            else by_attributes + [Vocabulary.DONOR_ID],
            key=lambda attribute: attribute.name)

        # Builds, for one source, a select over the layout attributes.
        # The nested function names are left as they were because the
        # functions are handed to project helpers not visible here.
        def ask_to_source(source: Type[Source]):
            def do():
                source_instance: Source = source(self.logger)
                offered = source_instance.get_available_attributes()

                # Attributes this source can actually be asked for.
                selectable_attributes: List[Vocabulary] = [
                    attribute for attribute in layout if attribute in offered
                ]
                # One output column per layout attribute; attributes the
                # source does not offer are filled with a constant string.
                select_from_source_output = [
                    column(attribute.name) if attribute in offered
                    else cast(literal(Vocabulary.unknown.name),
                              types.String).label(attribute.name)
                    for attribute in layout
                ]

                def donors(a_connection):
                    source_stmt = source_instance.donors(
                        a_connection, selectable_attributes, meta_attrs,
                        region_attrs, False)
                    return select(select_from_source_output).select_from(
                        source_stmt.alias(source.__name__))

                return database.try_py_function(donors)

            return self.try_catch_source_errors(do, None)

        # One worker per eligible source.
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=len(eligible_sources)) as executor:
            outcomes = executor.map(ask_to_source, eligible_sources)

        # Drop the sources that produced nothing.
        from_sources = [
            outcome for outcome in outcomes if outcome is not None
        ]
        if not from_sources:
            raise NoDataFromSources(self.notices)

        # Aggregate the results of all the per-source queries.
        self.warn_if_mixed_germline_somatic_vars(eligible_sources)
        group_columns = [
            column(attribute.name) for attribute in by_attributes
        ]
        donor_count = func.count(
            column(Vocabulary.DONOR_ID.name)).label('DONORS')
        all_sources = union(*from_sources).alias("all_sources")

        stmt = select(group_columns + [donor_count])
        stmt = stmt.select_from(all_sources)
        stmt = stmt.group_by(func.cube(*group_columns))

        return self.get_as_dictionary(stmt, 'DONOR DISTRIBUTION')