def _build_normalization_factor(self,
                                    select_from,
                                    select_from_as="_visit_count"):
        # compute normalization factor
        normalization_factor = SelectQuery()
        normalization_factor.select_from(select_from, alias=select_from_as)

        self._select_columns_list(
            normalization_factor,
            column_names=self.similarity_computation_columns +
            self.filtering_columns,
            table_name=select_from_as,
        )

        rating_column = (Column(self.dku_config.ratings_column_name).minus(
            Column(self.RATING_AVERAGE)) if self.use_explicit else Constant(1))

        normalization_factor.select(
            self._get_normalization_factor_formula(Column(self.based_column),
                                                   rating_column),
            alias=self.NORMALIZATION_FACTOR_AS,
        )

        self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]

        # keep only items and users with enough visits
        normalization_factor.where(
            Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(
                Constant(self.dku_config.user_visit_threshold)))
        normalization_factor.where(
            Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(
                Constant(self.dku_config.item_visit_threshold)))
        return normalization_factor
Esempio n. 2
0
 def _build_cf_scores_without_null(self, select_from, select_from_as="_all_cf_scores_to_filter"):
     null_scores_filtered = SelectQuery()
     null_scores_filtered.select_from(select_from, alias=select_from_as)
     columns_to_select = self.sample_keys + self.dku_config.score_column_names
     self._select_columns_list(select_query=null_scores_filtered, column_names=columns_to_select)
     self._or_condition_columns_list(
         null_scores_filtered, self.dku_config.score_column_names, lambda x: x.is_not_null()
     )
     return null_scores_filtered
Esempio n. 3
0
    def _build_all_cf_scores(
        self,
        select_from,
        samples_for_training,
        samples_for_scores=None,
        select_from_as="_all_cf_scores",
        samples_for_training_as="_samples_for_training",
        samples_for_scores_as="_samples_for_scores",
    ):
        all_cf_scores = SelectQuery()
        all_cf_scores.select_from(select_from, alias=select_from_as)

        self._left_join_samples(
            all_cf_scores, select_from_as, samples_for_training, samples_for_training_as, self.sample_keys
        )
        all_cf_scores.select(Column(self.IS_TRAINING_SAMPLE, table_name=samples_for_training_as))

        if samples_for_scores:
            self._left_join_samples(
                all_cf_scores, select_from_as, samples_for_scores, samples_for_scores_as, self.sample_keys
            )
            all_cf_scores.select(Column(self.IS_SCORE_SAMPLE, table_name=samples_for_scores_as))

        columns_to_select = self.sample_keys + self.dku_config.score_column_names
        self._select_columns_list(all_cf_scores, columns_to_select, table_name=select_from_as)
        return all_cf_scores
Esempio n. 4
0
        def _build_samples_with_all_infos(inner_select_from, join_with, inner_select_from_as=ALL_INFOS_TABLE_NAME):
            samples_with_all_infos = SelectQuery()
            samples_with_all_infos.select_from(inner_select_from, alias=inner_select_from_as)

            columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
            self._select_columns_list(samples_with_all_infos, columns_to_select, table_name=inner_select_from_as)

            row_number_expression = (
                Expression()
                .rowNumber()
                .over(
                    Window(
                        partition_by=[
                            Column(self.dku_config.users_column_name, table_name=inner_select_from_as),
                            Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as),
                        ],
                        order_by=[Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as)],
                        order_types=["DESC"],
                        mode=None,
                    )
                )
            )
            samples_with_all_infos.select(row_number_expression, alias=self.ROW_NUMBER_AS)
            samples_with_all_infos.select(Column(NB_POSITIVE_PER_USER, table_name=ONLY_POSITIVE_TABLE_NAME))

            self._left_join_samples(
                left_select_query=samples_with_all_infos,
                left_table_name=inner_select_from_as,
                right_select_query=join_with,
                right_table_name=ONLY_POSITIVE_TABLE_NAME,
                keys=[self.dku_config.users_column_name],
            )
            return samples_with_all_infos
def make_most_recent_timestamp_query(timestamp_column, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    query = SelectQuery()
    query.select(Column(timestamp_column).max(), alias='most_recent_timestamp')
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(
        query)  #toSQL(query, dialect='Hive')#dataset=dataset)
def make_max_time_interval_query(timestamp_column, resolved_ref_date, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    max_time_interval = Constant(resolved_ref_date).minus(
        Column(timestamp_column)).extract(TimeUnit.DAY).max()
    query = SelectQuery()
    query.select(max_time_interval, alias="max_time_interval")  #TODO naming
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)  #toSQL(query, dataset=dataset)
Esempio n. 7
0
 def _build_all_cf_scores_with_target(self, select_from, select_from_as="_all_cf_scores_with_target"):
     all_cf_scores_with_target = SelectQuery()
     all_cf_scores_with_target.select_from(select_from, alias=select_from_as)
     columns_to_select = self.sample_keys + self.dku_config.score_column_names
     self._select_columns_list(select_query=all_cf_scores_with_target, column_names=columns_to_select)
     all_cf_scores_with_target.select(
         Column(self.IS_TRAINING_SAMPLE).coalesce(0).cast("int"), alias=constants.TARGET_COLUMN_NAME
     )
     if self.has_historical_data:
         all_cf_scores_with_target.select(
             Column(self.IS_SCORE_SAMPLE).coalesce(0).cast("int"), alias=self.SCORE_SAMPLE
         )
     return all_cf_scores_with_target
    def _encoding_feature_name(self, dataset):

        is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
        dataset_schema = dataset.read_schema()

        col_list = []
        new_col_alias = []

        for col_index, col_info in enumerate(dataset_schema):
            col_name = col_info.get('name')
            col_list.append(Column(col_name))
            col_name_mapping = self.params.feature_name_mapping.get(col_name)
            if col_name_mapping:
                col_alias = '{}_{}'.format(col_name_mapping, col_index)
            else:
                col_alias = col_name
            new_col_alias.append(col_alias)

        query_to_rename = SelectQuery().select(col_list, new_col_alias)
        if is_hdfs:
            query_to_rename.select_from('_'.join(dataset.name.split('.')))
        else:
            query_to_rename.select_from(dataset)
        dialect_handler = dialectHandler(dataset)
        dialect_handler.get_executor().exec_recipe_fragment(
            dataset, query=dialect_handler.convertToSQL(
                query_to_rename))  #toSQL(query_to_rename, dataset=dataset))
Esempio n. 9
0
 def _rename_table(self, to_rename, renaming_mapping):
     renamed_table = SelectQuery()
     renamed_table.select_from(to_rename, alias="_renamed")
     for input_column, renamed_column in renaming_mapping.items():
         renamed_table.select(Column(input_column, table_name="_renamed"),
                              alias=renamed_column)
     return renamed_table
Esempio n. 10
0
 def _cast_table(self, to_cast, cast_mapping, alias):
     cast_table = SelectQuery()
     cast_table.select_from(to_cast, alias=alias)
     for input_column, target_type in cast_mapping.items():
         cast_table.select(Column(input_column,
                                  table_name=alias).cast(target_type),
                           alias=input_column)
     return cast_table
Esempio n. 11
0
 def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
     historical_negative_samples_removed = SelectQuery()
     historical_negative_samples_removed.select_from(select_from, alias=select_from_as)
     columns_to_select = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
     self._select_columns_list(select_query=historical_negative_samples_removed, column_names=columns_to_select)
     unseen_samples_condition = (
         Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(Column(self.SCORE_SAMPLE).eq(Constant(0)))
     )
     historical_negative_samples_removed.where(unseen_samples_condition)
     return historical_negative_samples_removed
Esempio n. 12
0
        def _build_filtered_samples(inner_select_from, inner_select_from_as):
            ratio = float(self.dku_config.negative_samples_percentage / 100.0)
            filtered_samples = SelectQuery()
            filtered_samples.select_from(inner_select_from, inner_select_from_as)
            columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
            self._select_columns_list(filtered_samples, columns_to_select)

            nb_negative_threshold_expr = (
                Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
                .times(Constant(ratio))
                .div(Constant(1).minus(Constant(ratio)))
                .ceil()
            )
            filtered_samples.where(
                Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as)
                .eq(Constant(1))
                .or_(Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(nb_negative_threshold_expr))
            )
            return filtered_samples
    def _build_unordered_similarity(
        self,
        select_from,
        left_select_from_as="_left_ordered_similarity",
        right_select_from_as="_right_ordered_similarity",
        with_clause_as="_with_clause_ordered_similarity",
    ):
        """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
        similarity = SelectQuery()

        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from, alias=left_select_from_as)

        join_condition = Constant(1).eq_null_unsafe(Constant(0))

        similarity.join(select_from,
                        JoinTypes.FULL,
                        join_condition,
                        alias=right_select_from_as)

        similarity.select(
            Column(f"{self.based_column}_1",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_2",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_1",
        )
        similarity.select(
            Column(f"{self.based_column}_2",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_1",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_2",
        )
        similarity.select(
            Column("similarity", table_name=left_select_from_as).coalesce(
                Column("similarity", table_name=right_select_from_as)),
            alias=constants.SIMILARITY_COLUMN_NAME,
        )

        return similarity
def make_full_transform_query(aggregation_queries,
                              dataset,
                              aggregation_params,
                              transform_params,
                              encoding_feature=False):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  #TODO why?! -> avoid dupplicate
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  #+ feature_names:
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}

    for idx, agg_query in enumerate(aggregation_queries):
        agg_query.alias(
            agg_query.get_alias()
            or 'cte_' + str(idx))  #TODO remove, make sure they have ids
        outer.with_cte(agg_query)
        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(
                    Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(
                        aggregation_params.feature_name_mapping.get(col), idx,
                        idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
def make_distinct_values_query(column, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    column = Column(column)

    layer_1 = SelectQuery()
    layer_1.select(column)
    layer_1.select(Column('*').count(), 'count')
    if is_hdfs:
        layer_1.select_from('_'.join(dataset.name.split('.')))
    else:
        layer_1.select_from(dataset)
    layer_1.group_by(column)

    count = Column('count')
    layer_2 = SelectQuery()
    layer_2.select(column)
    layer_2.select(
        count.div(count.sum().over(Window())).times(100), 'distribution')
    layer_2.select_from(layer_1, alias='layer_1')

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(
        layer_2)  #toSQL(query, dialect='Hive')#dataset=dataset)
Esempio n. 16
0
 def _build_samples_for_training(self, select_from, select_from_as="_samples_for_training"):
     samples_for_training = SelectQuery()
     samples_for_training.select_from(select_from, alias=select_from_as)
     self._select_columns_list(samples_for_training, self.sample_keys, table_name=select_from_as)
     samples_for_training.select(Constant(1), alias=self.IS_TRAINING_SAMPLE)
     return samples_for_training
Esempio n. 17
0
 def _build_samples_for_scoring(self, select_from, select_from_as="_samples_for_scores"):
     samples_for_scores = SelectQuery()
     samples_for_scores.select_from(select_from, alias=select_from_as)
     self._select_columns_list(samples_for_scores, self.sample_keys, table_name=select_from_as)
     samples_for_scores.select(Constant(1), alias=self.IS_SCORE_SAMPLE)
     return samples_for_scores
Esempio n. 18
0
# It does not replace proper documentation, tutos and samples but for now it will do the trick

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku.sql as sql
import json
from dataiku.sql import JoinTypes, Expression, Column, Constant, InlineSQL, SelectQuery, Table, Dialects, toSQL

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Expression: 1 * 2 * 3
eb = Constant(1).times(Constant(2), Constant(3))

toSQL(eb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Simple select
sb = SelectQuery()
sb.select_from(Table('myTable'))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# join in fluent style
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
Esempio n. 19
0
 def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
     samples_with_only_positives = SelectQuery()
     samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
     samples_with_only_positives.select(Column(self.dku_config.users_column_name))
     samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
     samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
     samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
     return samples_with_only_positives