def _rename_table(self, to_rename, renaming_mapping):
    renamed_table = SelectQuery()
    renamed_table.select_from(to_rename, alias="_renamed")
    for input_column, renamed_column in renaming_mapping.items():
        renamed_table.select(Column(input_column, table_name="_renamed"), alias=renamed_column)
    return renamed_table
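# A minimal standalone sketch of the renaming pattern above, relying only on the
# dataiku.sql builders already used throughout this file. The table name and the
# mapping are hypothetical, not from the original code.
from dataiku.sql import SelectQuery, Column, Table, Dialects, toSQL

demo_rename = SelectQuery()
demo_rename.select_from(Table('events'), alias='_renamed')  # hypothetical table
for old_name, new_name in {'user': 'user_id', 'item': 'item_id'}.items():
    demo_rename.select(Column(old_name, table_name='_renamed'), alias=new_name)
print(toSQL(demo_rename, Dialects.MYSQL))
# Roughly: SELECT _renamed.user AS user_id, _renamed.item AS item_id FROM events _renamed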
def _encoding_feature_name(self, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    dataset_schema = dataset.read_schema()
    col_list = []
    new_col_alias = []
    for col_index, col_info in enumerate(dataset_schema):
        col_name = col_info.get('name')
        col_list.append(Column(col_name))
        col_name_mapping = self.params.feature_name_mapping.get(col_name)
        if col_name_mapping:
            col_alias = '{}_{}'.format(col_name_mapping, col_index)
        else:
            col_alias = col_name
        new_col_alias.append(col_alias)
    query_to_rename = SelectQuery().select(col_list, new_col_alias)
    if is_hdfs:
        query_to_rename.select_from('_'.join(dataset.name.split('.')))
    else:
        query_to_rename.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    dialect_handler.get_executor().exec_recipe_fragment(
        dataset, query=dialect_handler.convertToSQL(query_to_rename))  # toSQL(query_to_rename, dataset=dataset)
def _build_row_numbers(self, select_from, select_from_as="_similarity_matrix"):
    row_numbers = SelectQuery()
    row_numbers.select_from(select_from, alias=select_from_as)
    columns_to_select = [
        f"{self.based_column}_1",
        f"{self.based_column}_2",
        constants.SIMILARITY_COLUMN_NAME,
    ]
    self._select_columns_list(row_numbers, column_names=columns_to_select, table_name=select_from_as)
    row_number_expression = (
        Expression()
        .rowNumber()
        .over(
            Window(
                partition_by=[Column(f"{self.based_column}_1", table_name=select_from_as)],
                order_by=[
                    Column(constants.SIMILARITY_COLUMN_NAME, table_name=select_from_as),
                    Column(f"{self.based_column}_2", table_name=select_from_as),
                ],
                order_types=["DESC", "DESC"],
                mode=None,
            )
        )
    )
    row_numbers.select(row_number_expression, alias=self.ROW_NUMBER_AS)
    return row_numbers
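# For reference, _build_row_numbers roughly renders to the SQL below. This is an
# illustrative sketch assuming based_column == "item"; exact quoting and alias
# names depend on the dialect and on self.ROW_NUMBER_AS.
#
#   SELECT _similarity_matrix.item_1,
#          _similarity_matrix.item_2,
#          _similarity_matrix.similarity,
#          ROW_NUMBER() OVER (PARTITION BY _similarity_matrix.item_1
#                             ORDER BY _similarity_matrix.similarity DESC,
#                                      _similarity_matrix.item_2 DESC) AS row_number
#   FROM (...) _similarity_matrix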
def _build_all_cf_scores(
    self,
    select_from,
    samples_for_training,
    samples_for_scores=None,
    select_from_as="_all_cf_scores",
    samples_for_training_as="_samples_for_training",
    samples_for_scores_as="_samples_for_scores",
):
    all_cf_scores = SelectQuery()
    all_cf_scores.select_from(select_from, alias=select_from_as)
    self._left_join_samples(
        all_cf_scores, select_from_as, samples_for_training, samples_for_training_as, self.sample_keys
    )
    all_cf_scores.select(Column(self.IS_TRAINING_SAMPLE, table_name=samples_for_training_as))
    if samples_for_scores:
        self._left_join_samples(
            all_cf_scores, select_from_as, samples_for_scores, samples_for_scores_as, self.sample_keys
        )
        all_cf_scores.select(Column(self.IS_SCORE_SAMPLE, table_name=samples_for_scores_as))
    columns_to_select = self.sample_keys + self.dku_config.score_column_names
    self._select_columns_list(all_cf_scores, columns_to_select, table_name=select_from_as)
    return all_cf_scores
def _build_timestamp_filtered_row_number(select_from_inner, select_from_as_inner):
    ts_row_numbers = SelectQuery()
    ts_row_numbers.select_from(select_from_inner, alias=select_from_as_inner)
    self._select_columns_list(
        ts_row_numbers,
        column_names=self.similarity_computation_columns,
        table_name=select_from_as_inner,
    )
    ts_row_number_expression = (
        Expression()
        .rowNumber()
        .over(
            Window(
                partition_by=[Column(self.based_column, table_name=select_from_as_inner)],
                order_by=[
                    Column(self.dku_config.timestamps_column_name, table_name=select_from_as_inner),
                    Column(self.pivot_column, table_name=select_from_as_inner),
                ],
                order_types=["DESC", "DESC"],
                mode=None,
            )
        )
    )
    ts_row_numbers.select(ts_row_number_expression, alias=self.TIMESTAMP_FILTERED_ROW_NB)
    return ts_row_numbers
def _build_samples_with_all_infos(inner_select_from, join_with, inner_select_from_as=ALL_INFOS_TABLE_NAME):
    samples_with_all_infos = SelectQuery()
    samples_with_all_infos.select_from(inner_select_from, alias=inner_select_from_as)
    columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
    self._select_columns_list(samples_with_all_infos, columns_to_select, table_name=inner_select_from_as)
    row_number_expression = (
        Expression()
        .rowNumber()
        .over(
            Window(
                partition_by=[
                    Column(self.dku_config.users_column_name, table_name=inner_select_from_as),
                    Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as),
                ],
                order_by=[Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as)],
                order_types=["DESC"],
                mode=None,
            )
        )
    )
    samples_with_all_infos.select(row_number_expression, alias=self.ROW_NUMBER_AS)
    samples_with_all_infos.select(Column(NB_POSITIVE_PER_USER, table_name=ONLY_POSITIVE_TABLE_NAME))
    self._left_join_samples(
        left_select_query=samples_with_all_infos,
        left_table_name=inner_select_from_as,
        right_select_query=join_with,
        right_table_name=ONLY_POSITIVE_TABLE_NAME,
        keys=[self.dku_config.users_column_name],
    )
    return samples_with_all_infos
def _build_normalization_factor(self, select_from, select_from_as="_visit_count"):
    # compute normalization factor
    normalization_factor = SelectQuery()
    normalization_factor.select_from(select_from, alias=select_from_as)
    self._select_columns_list(
        normalization_factor,
        column_names=self.similarity_computation_columns + self.filtering_columns,
        table_name=select_from_as,
    )
    rating_column = (
        Column(self.dku_config.ratings_column_name).minus(Column(self.RATING_AVERAGE))
        if self.use_explicit
        else Constant(1)
    )
    normalization_factor.select(
        self._get_normalization_factor_formula(Column(self.based_column), rating_column),
        alias=self.NORMALIZATION_FACTOR_AS,
    )
    self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]
    # keep only items and users with enough visits
    normalization_factor.where(
        Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(
            Constant(self.dku_config.user_visit_threshold)
        )
    )
    normalization_factor.where(
        Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(
            Constant(self.dku_config.item_visit_threshold)
        )
    )
    return normalization_factor
def _build_ordered_similarity(self, select_from, with_clause_as="_with_clause_normalization_factor"):
    """Build a similarity table col_1, col_2, similarity where col_1 < col_2"""
    similarity = SelectQuery()
    if self.supports_with_clause:
        similarity.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as
    similarity.select_from(select_from, alias=self.LEFT_NORMALIZATION_FACTOR_AS)
    join_conditions = [
        Column(self.pivot_column, self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe(
            Column(self.pivot_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
        )
    ]
    if self.supports_full_outer_join:
        join_conditions += [
            Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).lt(
                Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
            )
        ]
    else:
        join_conditions += [
            Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).ne(
                Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
            )
        ]
    similarity.join(select_from, JoinTypes.INNER, join_conditions, alias=self.RIGHT_NORMALIZATION_FACTOR_AS)
    similarity.group_by(Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
    similarity.group_by(Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
    similarity.select(
        Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS),
        alias=f"{self.based_column}_1",
    )
    similarity.select(
        Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS),
        alias=f"{self.based_column}_2",
    )
    similarity.select(self._get_similarity_formula(), alias=constants.SIMILARITY_COLUMN_NAME)
    return similarity
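# Note on the join conditions above: with full outer join support, the `lt`
# condition keeps each unordered pair exactly once (col_1 < col_2), and
# _build_unordered_similarity (further down) later mirrors the table to recover
# both orderings. Without that support, the `ne` condition keeps both orderings
# directly in this self-join, at the cost of computing each similarity twice.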
def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
    samples_with_only_positives = SelectQuery()
    samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
    samples_with_only_positives.select(Column(self.dku_config.users_column_name))
    samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
    samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
    samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
    return samples_with_only_positives
def _cast_table(self, to_cast, cast_mapping, alias):
    cast_table = SelectQuery()
    cast_table.select_from(to_cast, alias=alias)
    for input_column, target_type in cast_mapping.items():
        cast_table.select(Column(input_column, table_name=alias).cast(target_type), alias=input_column)
    return cast_table
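# Hedged usage sketch for _cast_table; the mapping, types, and upstream query
# below are hypothetical, not from this file. Each listed column is re-selected
# under its own name with a CAST applied; unlisted columns are left out of the
# projection.
#   cast_mapping = {"rating": "double", "user_id": "varchar"}
#   cast_query = self._cast_table(scores_query, cast_mapping, alias="_to_cast")
# Roughly: SELECT CAST(_to_cast.rating AS double) AS rating, ... FROM (...) _to_cast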
def _build_cf_scores_without_null(self, select_from, select_from_as="_all_cf_scores_to_filter"):
    null_scores_filtered = SelectQuery()
    null_scores_filtered.select_from(select_from, alias=select_from_as)
    columns_to_select = self.sample_keys + self.dku_config.score_column_names
    self._select_columns_list(select_query=null_scores_filtered, column_names=columns_to_select)
    self._or_condition_columns_list(
        null_scores_filtered, self.dku_config.score_column_names, lambda x: x.is_not_null()
    )
    return null_scores_filtered
def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
    historical_negative_samples_removed = SelectQuery()
    historical_negative_samples_removed.select_from(select_from, alias=select_from_as)
    columns_to_select = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
    self._select_columns_list(select_query=historical_negative_samples_removed, column_names=columns_to_select)
    unseen_samples_condition = (
        Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(Column(self.SCORE_SAMPLE).eq(Constant(0)))
    )
    historical_negative_samples_removed.where(unseen_samples_condition)
    return historical_negative_samples_removed
def make_most_recent_timestamp_query(timestamp_column, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    query = SelectQuery()
    query.select(Column(timestamp_column).max(), alias='most_recent_timestamp')
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)  # toSQL(query, dialect='Hive') # dataset=dataset)
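# Hedged usage sketch (requires `import dataiku`; the dataset and column names
# are hypothetical):
#   events = dataiku.Dataset("events")
#   sql_query = make_most_recent_timestamp_query("event_timestamp", events)
# Roughly: SELECT MAX(event_timestamp) AS most_recent_timestamp FROM ...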
def _build_timestamp_filtered(self, select_from, select_from_as="_prepared_input_dataset"):
    def _build_timestamp_filtered_row_number(select_from_inner, select_from_as_inner):
        ts_row_numbers = SelectQuery()
        ts_row_numbers.select_from(select_from_inner, alias=select_from_as_inner)
        self._select_columns_list(
            ts_row_numbers,
            column_names=self.similarity_computation_columns,
            table_name=select_from_as_inner,
        )
        ts_row_number_expression = (
            Expression()
            .rowNumber()
            .over(
                Window(
                    partition_by=[Column(self.based_column, table_name=select_from_as_inner)],
                    order_by=[
                        Column(self.dku_config.timestamps_column_name, table_name=select_from_as_inner),
                        Column(self.pivot_column, table_name=select_from_as_inner),
                    ],
                    order_types=["DESC", "DESC"],
                    mode=None,
                )
            )
        )
        ts_row_numbers.select(ts_row_number_expression, alias=self.TIMESTAMP_FILTERED_ROW_NB)
        return ts_row_numbers

    built_ts_row_numbers = _build_timestamp_filtered_row_number(select_from, select_from_as)
    ts_row_numbers_alias = "_ts_row_numbers"
    timestamp_filtered = SelectQuery()
    timestamp_filtered.select_from(built_ts_row_numbers, alias=ts_row_numbers_alias)
    self._select_columns_list(
        timestamp_filtered,
        column_names=self.similarity_computation_columns,
        table_name=ts_row_numbers_alias,
    )
    timestamp_filtered.where(
        Column(self.TIMESTAMP_FILTERED_ROW_NB, table_name=ts_row_numbers_alias).le(
            Constant(self.dku_config.top_n_most_recent)
        )
    )
    return timestamp_filtered
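# Net effect of _build_timestamp_filtered (illustrative): for each value of
# self.based_column, rank rows by timestamp (most recent first, with
# self.pivot_column as a tie-breaker) and keep only the first
# self.dku_config.top_n_most_recent of them.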
def _build_all_cf_scores_with_target(self, select_from, select_from_as="_all_cf_scores_with_target"):
    all_cf_scores_with_target = SelectQuery()
    all_cf_scores_with_target.select_from(select_from, alias=select_from_as)
    columns_to_select = self.sample_keys + self.dku_config.score_column_names
    self._select_columns_list(select_query=all_cf_scores_with_target, column_names=columns_to_select)
    all_cf_scores_with_target.select(
        Column(self.IS_TRAINING_SAMPLE).coalesce(0).cast("int"), alias=constants.TARGET_COLUMN_NAME
    )
    if self.has_historical_data:
        all_cf_scores_with_target.select(
            Column(self.IS_SCORE_SAMPLE).coalesce(0).cast("int"), alias=self.SCORE_SAMPLE
        )
    return all_cf_scores_with_target
def make_max_time_interval_query(timestamp_column, resolved_ref_date, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    max_time_interval = Constant(resolved_ref_date).minus(Column(timestamp_column)).extract(TimeUnit.DAY).max()
    query = SelectQuery()
    query.select(max_time_interval, alias="max_time_interval")  # TODO naming
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)  # toSQL(query, dataset=dataset)
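# Hedged usage sketch (the reference date and column name are hypothetical; how
# the date subtraction and the DAY extraction render is dialect-dependent):
#   sql_query = make_max_time_interval_query("event_timestamp", "2020-01-01", events)
# Roughly: SELECT MAX(EXTRACT(DAY FROM ('2020-01-01' - event_timestamp))) AS max_time_interval FROM ...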
def _build_top_n(self, select_from, select_from_as="_row_number_table"):
    top_n = SelectQuery()
    top_n.select_from(select_from, alias=select_from_as)
    columns_to_select = [
        f"{self.based_column}_1",
        f"{self.based_column}_2",
        constants.SIMILARITY_COLUMN_NAME,
    ]
    self._select_columns_list(top_n, column_names=columns_to_select, table_name=select_from_as)
    top_n.where(
        Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(
            Constant(self.dku_config.top_n_most_similar)
        )
    )
    return top_n
def _build_unordered_similarity(
    self,
    select_from,
    left_select_from_as="_left_ordered_similarity",
    right_select_from_as="_right_ordered_similarity",
    with_clause_as="_with_clause_ordered_similarity",
):
    """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
    similarity = SelectQuery()
    if self.supports_with_clause:
        similarity.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as
    similarity.select_from(select_from, alias=left_select_from_as)
    join_condition = Constant(1).eq_null_unsafe(Constant(0))
    similarity.join(select_from, JoinTypes.FULL, join_condition, alias=right_select_from_as)
    similarity.select(
        Column(f"{self.based_column}_1", table_name=left_select_from_as).coalesce(
            Column(f"{self.based_column}_2", table_name=right_select_from_as)
        ),
        alias=f"{self.based_column}_1",
    )
    similarity.select(
        Column(f"{self.based_column}_2", table_name=left_select_from_as).coalesce(
            Column(f"{self.based_column}_1", table_name=right_select_from_as)
        ),
        alias=f"{self.based_column}_2",
    )
    similarity.select(
        Column("similarity", table_name=left_select_from_as).coalesce(
            Column("similarity", table_name=right_select_from_as)
        ),
        alias=constants.SIMILARITY_COLUMN_NAME,
    )
    return similarity
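# The FULL JOIN on a condition that is never true (1 = 0) matches no rows, so
# every row of each side survives with NULLs on the other side; the COALESCE
# calls then emit each ordered pair twice, once per direction. Illustratively,
# an input row (a, b, s) with a < b yields both (a, b, s) and (b, a, s).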
def _build_filtered_samples(inner_select_from, inner_select_from_as):
    ratio = float(self.dku_config.negative_samples_percentage / 100.0)
    filtered_samples = SelectQuery()
    filtered_samples.select_from(inner_select_from, inner_select_from_as)
    columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
    self._select_columns_list(filtered_samples, columns_to_select)
    nb_negative_threshold_expr = (
        Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
        .times(Constant(ratio))
        .div(Constant(1).minus(Constant(ratio)))
        .ceil()
    )
    filtered_samples.where(
        Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as)
        .eq(Constant(1))
        .or_(Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(nb_negative_threshold_expr))
    )
    return filtered_samples
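# Worked check of the threshold formula above: with P positives for a user and a
# target negative share r = negative_samples_percentage / 100, keeping
# N = ceil(P * r / (1 - r)) negatives gives N / (N + P) ~= r.
# Example: P = 80, r = 0.2  ->  N = ceil(80 * 0.25) = 20, and 20 / 100 = 0.2.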
def _build_visit_count(self, select_from, select_from_as="_filtered_input_dataset"):
    # total user and item visits
    visit_count = SelectQuery()
    visit_count.select_from(select_from, alias=select_from_as)
    self._select_columns_list(
        visit_count,
        column_names=self.similarity_computation_columns + self.filtering_columns,
        table_name=select_from_as,
    )
    visit_count.select(
        Column("*").count().over(
            Window(
                partition_by=[Column(self.dku_config.users_column_name)],
                mode=None,
            )
        ),
        alias=self.NB_VISIT_USER_AS,
    )
    visit_count.select(
        Column("*").count().over(
            Window(
                partition_by=[Column(self.dku_config.items_column_name)],
                mode=None,
            )
        ),
        alias=self.NB_VISIT_ITEM_AS,
    )
    if self.use_explicit:
        visit_count.select(
            Column(self.dku_config.ratings_column_name).avg().over(
                Window(partition_by=[Column(self.based_column)], mode=None)
            ),
            alias=self.RATING_AVERAGE,
        )
        self.similarity_computation_columns += [self.RATING_AVERAGE]
    return visit_count
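# Illustrative rendering of the window aggregates above (column names sketched):
#   COUNT(*) OVER (PARTITION BY users_column)            -> nb_visit_user
#   COUNT(*) OVER (PARTITION BY items_column)            -> nb_visit_item
#   AVG(ratings_column) OVER (PARTITION BY based_column) -> rating_average   (explicit feedback only)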
def make_distinct_values_query(column, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    column = Column(column)

    layer_1 = SelectQuery()
    layer_1.select(column)
    layer_1.select(Column('*').count(), 'count')
    if is_hdfs:
        layer_1.select_from('_'.join(dataset.name.split('.')))
    else:
        layer_1.select_from(dataset)
    layer_1.group_by(column)

    count = Column('count')
    layer_2 = SelectQuery()
    layer_2.select(column)
    layer_2.select(count.div(count.sum().over(Window())).times(100), 'distribution')
    layer_2.select_from(layer_1, alias='layer_1')

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(layer_2)  # toSQL(query, dialect='Hive') # dataset=dataset)
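# Illustrative shape of the generated SQL (identifier quoting is dialect-dependent):
#   SELECT col,
#          count / SUM(count) OVER () * 100 AS distribution
#   FROM (SELECT col, COUNT(*) AS count FROM ... GROUP BY col) layer_1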
def _build_sum_of_similarity_scores(
    self, top_n, normalization_factor, top_n_as="_top_n", normalization_factor_as="_normalization_factor"
):
    cf_scores = SelectQuery()
    cf_scores.select_from(top_n, alias=top_n_as)
    join_condition = Column(f"{self.based_column}_2", top_n_as).eq_null_unsafe(
        Column(self.based_column, normalization_factor_as)
    )
    cf_scores.join(normalization_factor, JoinTypes.INNER, join_condition, alias=normalization_factor_as)
    cf_scores.group_by(Column(f"{self.based_column}_1", table_name=top_n_as))
    cf_scores.group_by(Column(self.pivot_column, table_name=normalization_factor_as))
    cf_scores.select(Column(f"{self.based_column}_1", table_name=top_n_as), alias=self.based_column)
    cf_scores.select(Column(self.pivot_column, table_name=normalization_factor_as))
    cf_scores.select(
        self._get_user_item_similarity_formula(top_n_as, normalization_factor_as),
        alias=constants.SCORE_COLUMN_NAME,
    )
    cf_scores.order_by(Column(self.based_column))
    cf_scores.order_by(Column(constants.SCORE_COLUMN_NAME), direction="DESC")
    return cf_scores
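# Sketch of the aggregation built above (the scoring expression itself comes
# from _get_user_item_similarity_formula, which is not shown here): the INNER
# JOIN lines up each top-n neighbour pair with the neighbour's normalization
# rows, and the GROUP BY on (based_column_1, pivot_column) collapses them into
# one collaborative-filtering score per pair, ordered best-first per based_column.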
def make_full_transform_query(aggregation_queries, dataset, aggregation_params, transform_params, encoding_feature=False):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  # TODO why?! -> avoid duplicates
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  # + feature_names:
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}
    for idx, agg_query in enumerate(aggregation_queries):
        agg_query.alias(agg_query.get_alias() or 'cte_' + str(idx))  # TODO remove, make sure they have ids
        outer.with_cte(agg_query)
        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(Column(key, agg_query.get_alias()))
            )
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(aggregation_params.feature_name_mapping.get(col), idx, idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
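# Illustrative shape of the statement assembled by make_full_transform_query
# (CTE names and keys are sketched; the join uses eq_null_unsafe key equality):
#
#   WITH cte_0 AS (...), cte_1 AS (...)
#   SELECT inner.key1, ..., cte_0.feature_a, cte_1.feature_b
#   FROM (SELECT DISTINCT key1, ... FROM input WHERE <prefilter>) inner
#   LEFT JOIN cte_0 ON inner.key1 = cte_0.key1 ...
#   LEFT JOIN cte_1 ON inner.key1 = cte_1.key1 ...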
columns_right = get_recipe_config()['columns_2']

#############################
# Original recipe
#############################

# Build one null-safe equality condition per join-key pair
joins = ['LEFT', 'RIGHT', 'INNER']
join_conds = []
for key_left, key_right in zip(key_a, key_b):
    join_cond = Expression().and_(
        Column(key_left, input_A_names[0]).eq_null_unsafe(Column(key_right, input_B_names[0])))
    join_conds.append(join_cond)

# Generate and store one SQL statement per join type
sql_by_join = {}
for join_type in joins:
    query = SelectQuery()
    query.select_from(input_A_datasets[0], alias=input_A_names[0])
    for col in columns_left:
        query.select(Column(col, input_A_names[0]), alias=col)
    for col in columns_right:
        query.select(Column(col, input_B_names[0]), alias=col)
    query.join(input_B_datasets[0], join_type, join_conds,
               operatorBetweenConditions=operator, alias=input_B_names[0])
    sql_by_join[join_type] = toSQL(query, input_A_datasets[0])

# Run each join and write it to its matching output dataset
e = SQLExecutor2()
e.exec_recipe_fragment(output_A_datasets[0], sql_by_join['LEFT'])
e.exec_recipe_fragment(output_B_datasets[0], sql_by_join['RIGHT'])
e.exec_recipe_fragment(output_C_datasets[0], sql_by_join['INNER'])
def _build_samples_for_training(self, select_from, select_from_as="_samples_for_training"):
    samples_for_training = SelectQuery()
    samples_for_training.select_from(select_from, alias=select_from_as)
    self._select_columns_list(samples_for_training, self.sample_keys, table_name=select_from_as)
    samples_for_training.select(Constant(1), alias=self.IS_TRAINING_SAMPLE)
    return samples_for_training
def _build_samples_for_scoring(self, select_from, select_from_as="_samples_for_scores"):
    samples_for_scores = SelectQuery()
    samples_for_scores.select_from(select_from, alias=select_from_as)
    self._select_columns_list(samples_for_scores, self.sample_keys, table_name=select_from_as)
    samples_for_scores.select(Constant(1), alias=self.IS_SCORE_SAMPLE)
    return samples_for_scores
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku.sql as sql
import json
from dataiku.sql import JoinTypes, Expression, Column, Constant, InlineSQL, SelectQuery, Table, Dialects, toSQL

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Expression: 1 * 2 * 3
eb = Constant(1).times(Constant(2), Constant(3))
toSQL(eb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Simple select
sb = SelectQuery()
sb.select_from(Table('myTable'))
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Join in fluent style
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
sb = SelectQuery()