def _build_ordered_similarity(
            self,
            select_from,
            with_clause_as="_with_clause_normalization_factor"):
        """Build a similarity table col_1, col_2, similarity where col_1 < col_2 """
        similarity = SelectQuery()

        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from,
                               alias=self.LEFT_NORMALIZATION_FACTOR_AS)

        join_conditions = [
            Column(self.pivot_column,
                   self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe(
                       Column(self.pivot_column,
                              self.RIGHT_NORMALIZATION_FACTOR_AS))
        ]

        if self.supports_full_outer_join:
            join_conditions += [
                Column(self.based_column,
                       self.LEFT_NORMALIZATION_FACTOR_AS).lt(
                           Column(self.based_column,
                                  self.RIGHT_NORMALIZATION_FACTOR_AS))
            ]
        else:
            join_conditions += [
                Column(self.based_column,
                       self.LEFT_NORMALIZATION_FACTOR_AS).ne(
                           Column(self.based_column,
                                  self.RIGHT_NORMALIZATION_FACTOR_AS))
            ]

        similarity.join(select_from,
                        JoinTypes.INNER,
                        join_conditions,
                        alias=self.RIGHT_NORMALIZATION_FACTOR_AS)

        similarity.group_by(
            Column(self.based_column,
                   table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
        similarity.group_by(
            Column(self.based_column,
                   table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))

        similarity.select(Column(self.based_column,
                                 table_name=self.LEFT_NORMALIZATION_FACTOR_AS),
                          alias=f"{self.based_column}_1")
        similarity.select(Column(
            self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS),
                          alias=f"{self.based_column}_2")

        similarity.select(self._get_similarity_formula(),
                          alias=constants.SIMILARITY_COLUMN_NAME)

        return similarity
def make_full_transform_query(aggregation_queries,
                              dataset,
                              aggregation_params,
                              transform_params,
                              encoding_feature=False):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  #TODO why?! -> avoid dupplicate
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  #+ feature_names:
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}

    for idx, agg_query in enumerate(aggregation_queries):
        agg_query.alias(
            agg_query.get_alias()
            or 'cte_' + str(idx))  #TODO remove, make sure they have ids
        outer.with_cte(agg_query)
        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(
                    Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(
                        aggregation_params.feature_name_mapping.get(col), idx,
                        idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
    def _build_unordered_similarity(
        self,
        select_from,
        left_select_from_as="_left_ordered_similarity",
        right_select_from_as="_right_ordered_similarity",
        with_clause_as="_with_clause_ordered_similarity",
    ):
        """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
        similarity = SelectQuery()

        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from, alias=left_select_from_as)

        join_condition = Constant(1).eq_null_unsafe(Constant(0))

        similarity.join(select_from,
                        JoinTypes.FULL,
                        join_condition,
                        alias=right_select_from_as)

        similarity.select(
            Column(f"{self.based_column}_1",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_2",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_1",
        )
        similarity.select(
            Column(f"{self.based_column}_2",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_1",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_2",
        )
        similarity.select(
            Column("similarity", table_name=left_select_from_as).coalesce(
                Column("similarity", table_name=right_select_from_as)),
            alias=constants.SIMILARITY_COLUMN_NAME,
        )

        return similarity
Esempio n. 4
0
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Having clause
sb = SelectQuery()
sb.select_from('myTable')
sb.having(Column('havingcol').ge(Constant(33)))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# CTE
sb = SelectQuery()
sb.select(Constant(33))
sb.with_cte(sql.SelectQuery().select_from('cte_table').alias('myCTE'))
sb.select_from('myTable')

toSQL(sb, Dialects.GREENPLUM)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Order
sb = SelectQuery()
sb.select_from('myTable')
sb.order_by(Column('orderCol'))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Subquery
q1 = SelectQuery()