def _build_ordered_similarity(
            self,
            select_from,
            with_clause_as="_with_clause_normalization_factor"):
        """Build a similarity table (col_1, col_2, similarity) with col_1 < col_2."""
        left_as = self.LEFT_NORMALIZATION_FACTOR_AS
        right_as = self.RIGHT_NORMALIZATION_FACTOR_AS

        query = SelectQuery()

        # Register the input as a CTE when the backend supports WITH, so the
        # self-join below can reference the same relation twice by alias.
        if self.supports_with_clause:
            query.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        query.select_from(select_from, alias=left_as)

        # Pair rows that share the same pivot value across the two aliases
        # (NULL-safe equality).
        pivot_match = Column(self.pivot_column, left_as).eq_null_unsafe(
            Column(self.pivot_column, right_as))

        # NOTE(review): the pair filter depends on FULL OUTER JOIN support —
        # '<' keeps each pair once (ordered), '!=' keeps both orders while
        # dropping self-pairs; presumably a downstream step relies on this.
        if self.supports_full_outer_join:
            pair_filter = Column(self.based_column, left_as).lt(
                Column(self.based_column, right_as))
        else:
            pair_filter = Column(self.based_column, left_as).ne(
                Column(self.based_column, right_as))

        query.join(select_from,
                   JoinTypes.INNER,
                   [pivot_match, pair_filter],
                   alias=right_as)

        # One similarity row per (left, right) pair.
        query.group_by(Column(self.based_column, table_name=left_as))
        query.group_by(Column(self.based_column, table_name=right_as))

        query.select(Column(self.based_column, table_name=left_as),
                     alias=f"{self.based_column}_1")
        query.select(Column(self.based_column, table_name=right_as),
                     alias=f"{self.based_column}_2")

        query.select(self._get_similarity_formula(),
                     alias=constants.SIMILARITY_COLUMN_NAME)

        return query
def make_full_transform_query(aggregation_queries,
                              dataset,
                              aggregation_params,
                              transform_params,
                              encoding_feature=False):
    """Build the full transform SQL for *dataset* from per-feature aggregation CTEs.

    Each aggregation query is attached as a CTE and LEFT-joined back to the
    (pre-filtered) base selection on the effective keys.

    Returns a tuple ``(sql_string, reverse_mapping_dict)`` where the dict maps
    encoded column aliases back to their original column names (empty unless
    ``encoding_feature`` is True and mappings exist).
    """
    # HDFS-backed datasets are exposed through a Hive table; detect them by
    # the presence of 'hiveTableName' in the dataset params.
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params')
    inner = SelectQuery()
    if is_hdfs:
        # Hive table names cannot contain dots: 'proj.table' -> 'proj_table'
        inner.select_from(dataset.name.replace('.', '_'))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  # TODO: why? -> avoid duplicate key rows
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    inner.where(_make_prefilter(aggregation_params, transform_params))

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}

    for idx, agg_query in enumerate(aggregation_queries):
        # Ensure every CTE has a referencable alias.
        # TODO: remove once aggregation queries are guaranteed to carry ids.
        agg_query.alias(agg_query.get_alias() or f'cte_{idx}')
        outer.with_cte(agg_query)

        # LEFT-join each aggregation CTE back on the effective keys,
        # using NULL-safe equality so NULL keys still match.
        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(
                    Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                # Only re-alias columns that have a feature-name mapping;
                # unmapped columns are dropped in encoding mode.
                mapped = aggregation_params.feature_name_mapping.get(col)
                if mapped:
                    new_alias = f'{mapped}_{idx}_{idx2}'
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
    def _build_unordered_similarity(
        self,
        select_from,
        left_select_from_as="_left_ordered_similarity",
        right_select_from_as="_right_ordered_similarity",
        with_clause_as="_with_clause_ordered_similarity",
    ):
        """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
        similarity = SelectQuery()

        # Expose the ordered-similarity input as a CTE when the dialect
        # supports WITH, so it can be referenced twice below.
        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from, alias=left_select_from_as)

        # Always-false join condition: a FULL join on 1=0 matches no rows, so
        # the result is every left-alias row (right columns NULL) plus every
        # right-alias row (left columns NULL) — a UNION ALL of both aliases
        # expressed as a join.
        join_condition = Constant(1).eq_null_unsafe(Constant(0))

        similarity.join(select_from,
                        JoinTypes.FULL,
                        join_condition,
                        alias=right_select_from_as)

        # Left-alias rows keep (col_1, col_2) as-is; right-alias rows fall
        # through the coalesce to the swapped pair (col_2, col_1). Combined,
        # the output contains both orderings of every pair.
        similarity.select(
            Column(f"{self.based_column}_1",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_2",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_1",
        )
        similarity.select(
            Column(f"{self.based_column}_2",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_1",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_2",
        )
        # The similarity value itself is the same whichever side produced it.
        similarity.select(
            Column("similarity", table_name=left_select_from_as).coalesce(
                Column("similarity", table_name=right_select_from_as)),
            alias=constants.SIMILARITY_COLUMN_NAME,
        )

        return similarity
    def _build_sum_of_similarity_scores(
            self,
            top_n,
            normalization_factor,
            top_n_as="_top_n",
            normalization_factor_as="_normalization_factor"):
        """Build the query aggregating similarity scores per (based, pivot) pair."""
        scores = SelectQuery()
        scores.select_from(top_n, alias=top_n_as)

        # Bring in the normalization factor of the paired entity
        # (NULL-safe equality on the based column).
        on_clause = Column(f"{self.based_column}_2",
                           top_n_as).eq_null_unsafe(
                               Column(self.based_column,
                                      normalization_factor_as))
        scores.join(normalization_factor,
                    JoinTypes.INNER,
                    on_clause,
                    alias=normalization_factor_as)

        # One output row per (based, pivot) combination.
        scores.group_by(
            Column(f"{self.based_column}_1", table_name=top_n_as))
        scores.group_by(
            Column(self.pivot_column, table_name=normalization_factor_as))

        scores.select(
            Column(f"{self.based_column}_1", table_name=top_n_as),
            alias=self.based_column)
        scores.select(
            Column(self.pivot_column, table_name=normalization_factor_as))
        scores.select(
            self._get_user_item_similarity_formula(top_n_as,
                                                   normalization_factor_as),
            alias=constants.SCORE_COLUMN_NAME)

        # Order by entity, then best score first.
        scores.order_by(Column(self.based_column))
        scores.order_by(Column(constants.SCORE_COLUMN_NAME),
                        direction="DESC")
        return scores
# Example #5
# 0
columns_right = get_recipe_config()['columns_2']


#############################
# Original recipe
#############################

# Build one NULL-safe equality condition per join-key pair.
# (Replaces the previous globals()['join_cond_<i>'] dynamic-name pattern.)
joins = ['LEFT', 'RIGHT', 'INNER']
join_conds = []
for key_left, key_right in zip(key_a, key_b):
    join_conds.append(
        Expression().and_(
            Column(key_left, input_A_names[0]).eq_null_unsafe(
                Column(key_right, input_B_names[0]))))

# Generate one SQL statement per join type, keyed by the join-type name
# instead of dynamically-created sql_<JOIN> globals.
sql_by_join = {}
for join_type in joins:
    query = SelectQuery()
    query.select_from(input_A_datasets[0], alias=input_A_names[0])
    for col in columns_left:
        query.select(Column(col, input_A_names[0]), alias=col)
    for col in columns_right:
        query.select(Column(col, input_B_names[0]), alias=col)
    query.join(input_B_datasets[0], join_type, join_conds,
               operatorBetweenConditions=operator, alias=input_B_names[0])
    sql_by_join[join_type] = toSQL(query, input_A_datasets[0])


# Run each generated statement into its matching output dataset.
e = SQLExecutor2()
e.exec_recipe_fragment(output_A_datasets[0], sql_by_join['LEFT'])
e.exec_recipe_fragment(output_B_datasets[0], sql_by_join['RIGHT'])
e.exec_recipe_fragment(output_C_datasets[0], sql_by_join['INNER'])