def _build_normalization_factor(self, select_from, select_from_as="_visit_count"):
    # compute normalization factor
    normalization_factor = SelectQuery()
    normalization_factor.select_from(select_from, alias=select_from_as)
    self._select_columns_list(
        normalization_factor,
        column_names=self.similarity_computation_columns + self.filtering_columns,
        table_name=select_from_as,
    )
    rating_column = (
        Column(self.dku_config.ratings_column_name).minus(Column(self.RATING_AVERAGE))
        if self.use_explicit
        else Constant(1)
    )
    normalization_factor.select(
        self._get_normalization_factor_formula(Column(self.based_column), rating_column),
        alias=self.NORMALIZATION_FACTOR_AS,
    )
    self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]

    # keep only items and users with enough visits
    normalization_factor.where(
        Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(
            Constant(self.dku_config.user_visit_threshold)
        )
    )
    normalization_factor.where(
        Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(
            Constant(self.dku_config.item_visit_threshold)
        )
    )
    return normalization_factor
def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
    # nested helper: `self`, ONLY_POSITIVE_TABLE_NAME and NB_POSITIVE_PER_USER
    # are captured from the enclosing scope
    # count the number of positive samples per user
    samples_with_only_positives = SelectQuery()
    samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
    samples_with_only_positives.select(Column(self.dku_config.users_column_name))
    samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
    samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
    samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
    return samples_with_only_positives
def make_full_transform_query(aggregation_queries, dataset, aggregation_params, transform_params, encoding_feature=False):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  # TODO why?! -> avoid duplicates
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))

    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  # + feature_names:
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}

    for idx, agg_query in enumerate(aggregation_queries):
        agg_query.alias(agg_query.get_alias() or 'cte_' + str(idx))  # TODO remove, make sure they have ids
        outer.with_cte(agg_query)

        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(Column(key, agg_query.get_alias()))
            )
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(aggregation_params.feature_name_mapping.get(col), idx, idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
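# A minimal, hedged illustration of the CTE / LEFT JOIN pattern used above,
# outside the full aggregation context. with_cte(), alias(), get_alias(),
# join(), JoinTypes.LEFT and eq_null_unsafe() are used exactly as in
# make_full_transform_query(); the table and column names are made up.
def _example_join_with_cte():
    agg = SelectQuery()
    agg.select_from('daily_aggregates')
    agg.select(Column('user_id'))
    agg.select(Column('nb_events'))
    agg.alias('agg_cte')

    inner = SelectQuery()
    inner.select_from('transactions')
    inner.select(Column('user_id'))

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    outer.with_cte(agg)
    outer.join(
        agg.get_alias(),
        JoinTypes.LEFT,
        # same equality operator as used for the join keys above
        Column('user_id', 'inner').eq_null_unsafe(Column('user_id', agg.get_alias())),
    )
    outer.select(Column('user_id', 'inner'))
    outer.select(Column('nb_events', agg.get_alias()))
    return outer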
def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
    historical_negative_samples_removed = SelectQuery()
    historical_negative_samples_removed.select_from(select_from, alias=select_from_as)

    columns_to_select = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
    self._select_columns_list(select_query=historical_negative_samples_removed, column_names=columns_to_select)

    # keep positive samples, plus negative samples with a zero historical score
    # (i.e. never seen in the history)
    unseen_samples_condition = (
        Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(Column(self.SCORE_SAMPLE).eq(Constant(0)))
    )
    historical_negative_samples_removed.where(unseen_samples_condition)
    return historical_negative_samples_removed
def _build_timestamp_filtered(self, select_from, select_from_as="_prepared_input_dataset"):
    def _build_timestamp_filtered_row_number(select_from_inner, select_from_as_inner):
        # rank each row within its `based_column` partition, most recent first
        ts_row_numbers = SelectQuery()
        ts_row_numbers.select_from(select_from_inner, alias=select_from_as_inner)
        self._select_columns_list(
            ts_row_numbers,
            column_names=self.similarity_computation_columns,
            table_name=select_from_as_inner,
        )
        ts_row_number_expression = Expression().rowNumber().over(
            Window(
                partition_by=[Column(self.based_column, table_name=select_from_as_inner)],
                order_by=[
                    Column(self.dku_config.timestamps_column_name, table_name=select_from_as_inner),
                    Column(self.pivot_column, table_name=select_from_as_inner),
                ],
                order_types=["DESC", "DESC"],
                mode=None,
            )
        )
        ts_row_numbers.select(ts_row_number_expression, alias=self.TIMESTAMP_FILTERED_ROW_NB)
        return ts_row_numbers

    built_ts_row_numbers = _build_timestamp_filtered_row_number(select_from, select_from_as)

    ts_row_numbers_alias = "_ts_row_numbers"
    timestamp_filtered = SelectQuery()
    timestamp_filtered.select_from(built_ts_row_numbers, alias=ts_row_numbers_alias)
    self._select_columns_list(
        timestamp_filtered,
        column_names=self.similarity_computation_columns,
        table_name=ts_row_numbers_alias,
    )
    # keep only the top_n_most_recent rows per `based_column`
    timestamp_filtered.where(
        Column(self.TIMESTAMP_FILTERED_ROW_NB, table_name=ts_row_numbers_alias).le(
            Constant(self.dku_config.top_n_most_recent)
        )
    )
    return timestamp_filtered
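# A hedged, self-contained sketch of the ROW_NUMBER()-over-window pattern used
# above. Expression().rowNumber().over(Window(...)) is taken verbatim from
# _build_timestamp_filtered(); the 'events' table and its columns are
# illustrative only.
def _example_row_number_query():
    ranked = SelectQuery()
    ranked.select_from('events', alias='e')
    ranked.select(Column('user_id', table_name='e'))
    # rank each user's events from most recent to oldest
    ranked.select(
        Expression().rowNumber().over(
            Window(
                partition_by=[Column('user_id', table_name='e')],
                order_by=[Column('event_ts', table_name='e')],
                order_types=['DESC'],
                mode=None,
            )
        ),
        alias='event_row_nb',
    )
    return ranked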
def _build_top_n(self, select_from, select_from_as="_row_number_table"):
    top_n = SelectQuery()
    top_n.select_from(select_from, alias=select_from_as)
    columns_to_select = [
        f"{self.based_column}_1",
        f"{self.based_column}_2",
        constants.SIMILARITY_COLUMN_NAME,
    ]
    self._select_columns_list(top_n, column_names=columns_to_select, table_name=select_from_as)
    top_n.where(
        Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(
            Constant(self.dku_config.top_n_most_similar)
        )
    )
    return top_n
def _build_filtered_samples(inner_select_from, inner_select_from_as):
    # nested helper: `self` and `select_from_as` are captured from the enclosing scope
    ratio = self.dku_config.negative_samples_percentage / 100.0
    filtered_samples = SelectQuery()
    filtered_samples.select_from(inner_select_from, inner_select_from_as)

    columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
    self._select_columns_list(filtered_samples, columns_to_select)

    # keep at most ceil(nb_positives * ratio / (1 - ratio)) negatives per user,
    # so negatives make up ~negative_samples_percentage of each user's samples
    nb_negative_threshold_expr = (
        Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
        .times(Constant(ratio))
        .div(Constant(1).minus(Constant(ratio)))
        .ceil()
    )
    filtered_samples.where(
        Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as)
        .eq(Constant(1))
        .or_(Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(nb_negative_threshold_expr))
    )
    return filtered_samples
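# Hedged worked check of the negative-sampling threshold above: with 100
# positives per user and negative_samples_percentage = 25, ratio = 0.25 and
# ceil(100 * 0.25 / (1 - 0.25)) = 34 negatives are kept, so negatives end up
# at 34 / (100 + 34) ≈ 25.4% of that user's samples, close to the requested
# percentage.
import math

assert math.ceil(100 * 0.25 / (1 - 0.25)) == 34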
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Imports for the cells below (assumption: these names are provided by the
# dataiku.sql query-builder module)
from dataiku.sql import SelectQuery, Column, Constant, toSQL, Dialects

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
sb = SelectQuery()
sb.select_from('myTable')
sb.group_by(Column('groupCol'))
sb.limit(1000)
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Where clause and select distinct
sb = SelectQuery()
sb.distinct()
sb.select_from('myTable')
sb.where(Column('wherecol').le(Constant(33)))
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Having clause
sb = SelectQuery()
sb.select_from('myTable')
sb.having(Column('havingcol').ge(Constant(33)))
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# CTE
sb = SelectQuery()
sb.select(Constant(33))
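# The CTE cell above is truncated in the source. A plausible, hedged
# completion using the with_cte()/alias() API seen in
# make_full_transform_query(); 'my_cte' is an illustrative name, and selecting
# from a CTE by its alias string is an assumption.
sb.alias('my_cte')

outer = SelectQuery()
outer.with_cte(sb)
outer.select_from('my_cte')
outer.select(Column('*'))
toSQL(outer, Dialects.MYSQL)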