def _build_normalization_factor(self,
                                select_from,
                                select_from_as="_visit_count"):
        """Build the query that computes the per-row normalization factor.

        Also appends the factor alias to ``similarity_computation_columns`` so
        downstream similarity queries select it, and filters out users/items
        with too few visits.
        """
        factor_query = SelectQuery()
        factor_query.select_from(select_from, alias=select_from_as)

        self._select_columns_list(
            factor_query,
            column_names=self.similarity_computation_columns + self.filtering_columns,
            table_name=select_from_as,
        )

        # Explicit feedback: center the rating around its average;
        # implicit feedback: every interaction weighs 1.
        if self.use_explicit:
            rating_column = Column(self.dku_config.ratings_column_name).minus(
                Column(self.RATING_AVERAGE))
        else:
            rating_column = Constant(1)

        factor_query.select(
            self._get_normalization_factor_formula(Column(self.based_column),
                                                   rating_column),
            alias=self.NORMALIZATION_FACTOR_AS,
        )

        # Downstream queries must carry the factor column along.
        self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]

        # Keep only users and items whose visit counts reach the thresholds.
        user_threshold = Constant(self.dku_config.user_visit_threshold)
        item_threshold = Constant(self.dku_config.item_visit_threshold)
        factor_query.where(
            Column(self.NB_VISIT_USER_AS,
                   table_name=select_from_as).ge(user_threshold))
        factor_query.where(
            Column(self.NB_VISIT_ITEM_AS,
                   table_name=select_from_as).ge(item_threshold))
        return factor_query
# --- Esempio n. 2 (example separator left over from scraping) ---
 def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
     # Build a query counting, per user, the number of positive samples
     # (rows where the target column equals 1).
     # NOTE(review): `self` is referenced below but is not a parameter —
     # presumably this function was nested inside a method where `self` was
     # in the enclosing scope; confirm against the original file.
     samples_with_only_positives = SelectQuery()
     samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
     samples_with_only_positives.select(Column(self.dku_config.users_column_name))
     # COUNT(*) per user, aliased for reuse by the negative-sampling query
     samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
     samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
     samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
     return samples_with_only_positives
def make_full_transform_query(aggregation_queries,
                              dataset,
                              aggregation_params,
                              transform_params,
                              encoding_feature=False):
    """Assemble the full SQL transform query joining all aggregation CTEs.

    Args:
        aggregation_queries: iterable of aggregation query objects, each
            exposing ``alias``/``get_alias`` and ``get_columns_alias``.
        dataset: the input dataset; its config decides HDFS vs SQL addressing.
        aggregation_params: provides the effective keys, rolling-window flag
            and (optionally) ``feature_name_mapping``.
        transform_params: forwarded to ``_make_prefilter``.
        encoding_feature: when True, rename mapped feature columns to
            ``<mapped>_<query_idx>_<col_idx>`` and record the reverse mapping.

    Returns:
        tuple: (sql_string, reverse_mapping_dict) where the dict maps each
        encoded alias back to its original column name.
    """
    # HDFS-backed datasets are addressed by their Hive table name,
    # with dots replaced by underscores. (`in` on a dict tests keys.)
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params')
    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        # DISTINCT avoids duplicate key rows when selecting only the keys.
        inner.distinct()
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}

    for idx, agg_query in enumerate(aggregation_queries):
        # Ensure every CTE has an alias; fall back to a positional one.
        # TODO: remove once all aggregation queries are guaranteed an id.
        agg_query.alias(agg_query.get_alias() or 'cte_' + str(idx))
        outer.with_cte(agg_query)

        # LEFT JOIN each CTE on all effective keys (null-safe equality).
        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(
                    Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                # Only select columns that have a feature-name mapping;
                # unmapped columns are dropped in encoding mode.
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(
                        aggregation_params.feature_name_mapping.get(col), idx,
                        idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
# --- Esempio n. 4 (example separator left over from scraping) ---
 def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
     """Drop generated negative samples that already appear in the history."""
     query = SelectQuery()
     query.select_from(select_from, alias=select_from_as)
     selected = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
     self._select_columns_list(select_query=query, column_names=selected)
     # Keep every positive, and only those negatives never seen (score == 0).
     is_positive = Column(constants.TARGET_COLUMN_NAME).eq(Constant(1))
     is_unseen = Column(self.SCORE_SAMPLE).eq(Constant(0))
     query.where(is_positive.or_(is_unseen))
     return query
    def _build_timestamp_filtered(self,
                                  select_from,
                                  select_from_as="_prepared_input_dataset"):
        # Build a query keeping, per based-column value, only the
        # `top_n_most_recent` rows ordered by timestamp (most recent first).

        def _build_timestamp_filtered_row_number(select_from_inner,
                                                 select_from_as_inner):
            # Assign a descending row number per based-column partition,
            # ordered by timestamp then pivot column (both DESC) as tiebreak.
            ts_row_numbers = SelectQuery()
            ts_row_numbers.select_from(select_from_inner,
                                       alias=select_from_as_inner)

            self._select_columns_list(
                ts_row_numbers,
                column_names=self.similarity_computation_columns,
                table_name=select_from_as_inner)

            ts_row_number_expression = (Expression().rowNumber().over(
                Window(
                    partition_by=[
                        Column(self.based_column,
                               table_name=select_from_as_inner)
                    ],
                    order_by=[
                        Column(self.dku_config.timestamps_column_name,
                               table_name=select_from_as_inner),
                        Column(self.pivot_column,
                               table_name=select_from_as_inner),
                    ],
                    order_types=["DESC", "DESC"],
                    mode=None,
                )))
            ts_row_numbers.select(ts_row_number_expression,
                                  alias=self.TIMESTAMP_FILTERED_ROW_NB)
            return ts_row_numbers

        built_ts_row_numbers = _build_timestamp_filtered_row_number(
            select_from, select_from_as)

        # Outer query: keep rows whose row number is within the recency cutoff.
        ts_row_numbers_alias = "_ts_row_numbers"
        timestamp_filtered = SelectQuery()
        timestamp_filtered.select_from(built_ts_row_numbers,
                                       alias=ts_row_numbers_alias)

        self._select_columns_list(
            timestamp_filtered,
            column_names=self.similarity_computation_columns,
            table_name=ts_row_numbers_alias)

        timestamp_filtered.where(
            Column(self.TIMESTAMP_FILTERED_ROW_NB,
                   table_name=ts_row_numbers_alias).le(
                       Constant(self.dku_config.top_n_most_recent)))

        return timestamp_filtered
    def _build_top_n(self, select_from, select_from_as="_row_number_table"):
        """Keep only the top-N most similar pairs using precomputed row numbers."""
        top_n_query = SelectQuery()
        top_n_query.select_from(select_from, alias=select_from_as)

        similarity_columns = [
            f"{self.based_column}_1",
            f"{self.based_column}_2",
            constants.SIMILARITY_COLUMN_NAME,
        ]
        self._select_columns_list(top_n_query,
                                  column_names=similarity_columns,
                                  table_name=select_from_as)

        # Row numbers were assigned upstream; cut at the configured top-N.
        row_number_column = Column(self.ROW_NUMBER_AS, table_name=select_from_as)
        top_n_query.where(
            row_number_column.le(Constant(self.dku_config.top_n_most_similar)))
        return top_n_query
# --- Esempio n. 7 (example separator left over from scraping) ---
        def _build_filtered_samples(inner_select_from, inner_select_from_as):
            """Subsample negatives to match the requested negative percentage.

            Keeps every positive row, and at most
            ceil(nb_positives * r / (1 - r)) negatives per user, where r is
            the configured negative-samples ratio.
            """
            negative_ratio = float(self.dku_config.negative_samples_percentage / 100.0)
            sampled = SelectQuery()
            sampled.select_from(inner_select_from, inner_select_from_as)
            selected_columns = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
            self._select_columns_list(sampled, selected_columns)

            # Maximum negatives allowed per user, derived from the ratio.
            max_negatives = (
                Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
                .times(Constant(negative_ratio))
                .div(Constant(1).minus(Constant(negative_ratio)))
                .ceil()
            )
            is_positive = Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as).eq(Constant(1))
            within_quota = Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(max_negatives)
            sampled.where(is_positive.or_(within_quota))
            return sampled
# --- Esempio n. 8 (example separator left over from scraping) ---
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
# Demonstrates GROUP BY plus a LIMIT clause, rendered for MySQL.
sb = SelectQuery()
sb.select_from('myTable')
sb.group_by(Column('groupCol'))
sb.limit(1000)

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Where clause and select distinct
# Demonstrates SELECT DISTINCT combined with a WHERE filter.
sb = SelectQuery()
sb.distinct()
sb.select_from('myTable')
sb.where(Column('wherecol').le(Constant(33)))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Having clause
# Demonstrates a HAVING filter on the query.
sb = SelectQuery()
sb.select_from('myTable')
sb.having(Column('havingcol').ge(Constant(33)))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# CTE
# NOTE(review): this cell appears truncated by the scrape — the CTE part
# (e.g. with_cte / select_from) is not visible here.
sb = SelectQuery()
sb.select(Constant(33))