def _build_ordered_similarity(
    self, select_from, with_clause_as="_with_clause_normalization_factor"
):
    """Build a similarity table (col_1, col_2, similarity) where col_1 < col_2.

    Self-joins ``select_from`` on the pivot column and groups by both sides of
    the based column. When the dialect supports WITH, ``select_from`` is first
    wrapped as a CTE under ``with_clause_as``.
    """
    query = SelectQuery()
    if self.supports_with_clause:
        query.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as
    query.select_from(select_from, alias=self.LEFT_NORMALIZATION_FACTOR_AS)

    left_based = Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS)
    right_based = Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
    # With FULL OUTER JOIN available, one ordering (<) is enough because the
    # unordered step can recover the symmetric pairs; otherwise keep both
    # orderings via (!=) — presumably to compensate, verify against the
    # unordered-similarity builder.
    pair_predicate = (
        left_based.lt(right_based)
        if self.supports_full_outer_join
        else left_based.ne(right_based)
    )
    join_conditions = [
        Column(self.pivot_column, self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe(
            Column(self.pivot_column, self.RIGHT_NORMALIZATION_FACTOR_AS)),
        pair_predicate,
    ]
    query.join(select_from, JoinTypes.INNER, join_conditions,
               alias=self.RIGHT_NORMALIZATION_FACTOR_AS)

    query.group_by(
        Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
    query.group_by(
        Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
    query.select(
        Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS),
        alias=f"{self.based_column}_1")
    query.select(
        Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS),
        alias=f"{self.based_column}_2")
    query.select(self._get_similarity_formula(),
                 alias=constants.SIMILARITY_COLUMN_NAME)
    return query
def make_full_transform_query(aggregation_queries, dataset, aggregation_params,
                              transform_params, encoding_feature=False):
    """Assemble the full transform SQL for ``dataset``.

    Builds an inner query selecting the effective keys (or ``*`` for a rolling
    window) with the prefilter applied, then attaches every aggregation query
    as a CTE LEFT-joined on the effective keys and selects its output columns.

    Returns:
        tuple: ``(sql_string, reverse_mapping_dict)`` where the dict maps the
        generated encoded aliases back to the original column names (empty
        unless ``encoding_feature`` is True and a mapping entry exists).
    """
    # HDFS-backed datasets are addressed by name with dots replaced by
    # underscores — presumably to match the registered Hive table name;
    # TODO confirm against how the table is created.
    is_hdfs = 'hiveTableName' in dataset.get_config()['params']

    inner = SelectQuery()
    if is_hdfs:
        inner.select_from(dataset.name.replace('.', '_'))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  # TODO why? -> avoid duplicate key rows
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  # + feature_names:
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}
    for idx, agg_query in enumerate(aggregation_queries):
        # Guarantee a stable alias for the CTE.
        agg_query.alias(agg_query.get_alias() or f'cte_{idx}')  # TODO remove, make sure they have ids
        outer.with_cte(agg_query)

        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(
                    Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            # Encode only when requested AND a mapping entry exists; otherwise
            # the column is selected under its own name (both former else
            # branches were identical).
            if encoding_feature and aggregation_params.feature_name_mapping.get(col):
                new_alias = '{}_{}_{}'.format(
                    aggregation_params.feature_name_mapping.get(col), idx, idx2)
                outer.select(Column(col, agg_query.get_alias()), new_alias)
                reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
def _build_unordered_similarity(
    self,
    select_from,
    left_select_from_as="_left_ordered_similarity",
    right_select_from_as="_right_ordered_similarity",
    with_clause_as="_with_clause_ordered_similarity",
):
    """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the
    ordered similarity table."""
    query = SelectQuery()
    if self.supports_with_clause:
        query.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as
    query.select_from(select_from, alias=left_select_from_as)

    # 1 = 0 never matches, so the FULL join keeps every row of each side
    # exactly once, padded with NULLs on the other side.
    never_matching = Constant(1).eq_null_unsafe(Constant(0))
    query.join(select_from, JoinTypes.FULL, never_matching,
               alias=right_select_from_as)

    col_1 = f"{self.based_column}_1"
    col_2 = f"{self.based_column}_2"
    # Coalescing left/right with swapped column roles emits each pair in
    # both orders under a single pair of output columns.
    selections = (
        (col_1, col_2, col_1),
        (col_2, col_1, col_2),
        ("similarity", "similarity", constants.SIMILARITY_COLUMN_NAME),
    )
    for left_name, right_name, output_alias in selections:
        query.select(
            Column(left_name, table_name=left_select_from_as).coalesce(
                Column(right_name, table_name=right_select_from_as)),
            alias=output_alias,
        )
    return query
def _build_sum_of_similarity_scores(
    self, top_n, normalization_factor,
    top_n_as="_top_n", normalization_factor_as="_normalization_factor"
):
    """Join the top-N similarities with the normalization factors and emit one
    score row per (based_column, pivot_column), ordered by based column then
    descending score."""
    scores = SelectQuery()
    scores.select_from(top_n, alias=top_n_as)

    # Match each "col_2" neighbour against the normalization-factor table.
    neighbour_key = Column(f"{self.based_column}_2", top_n_as)
    factor_key = Column(self.based_column, normalization_factor_as)
    scores.join(normalization_factor, JoinTypes.INNER,
                neighbour_key.eq_null_unsafe(factor_key),
                alias=normalization_factor_as)

    scores.group_by(Column(f"{self.based_column}_1", table_name=top_n_as))
    scores.group_by(Column(self.pivot_column, table_name=normalization_factor_as))

    scores.select(Column(f"{self.based_column}_1", table_name=top_n_as),
                  alias=self.based_column)
    scores.select(Column(self.pivot_column, table_name=normalization_factor_as))
    scores.select(
        self._get_user_item_similarity_formula(top_n_as, normalization_factor_as),
        alias=constants.SCORE_COLUMN_NAME)

    scores.order_by(Column(self.based_column))
    scores.order_by(Column(constants.SCORE_COLUMN_NAME), direction="DESC")
    return scores
columns_right = get_recipe_config()['columns_2']

#############################
# Original recipe
#############################

# Build one join condition per key pair. A plain list replaces the previous
# globals()['join_cond_<i>'] indirection, which only ever fed this list.
join_conds = []
for key_left, key_right in zip(key_a, key_b):
    join_conds.append(
        Expression().and_(
            Column(key_left, input_A_names[0]).eq_null_unsafe(
                Column(key_right, input_B_names[0]))))

# Generate one SQL statement per join type; a dict replaces the previous
# globals()['sql_<join>'] indirection.
joins = ['LEFT', 'RIGHT', 'INNER']
sql_by_join = {}
for join_type in joins:
    query = SelectQuery()
    query.select_from(input_A_datasets[0], alias=input_A_names[0])
    for col in columns_left:
        query.select(Column(col, input_A_names[0]), alias=col)
    for col in columns_right:
        query.select(Column(col, input_B_names[0]), alias=col)
    query.join(input_B_datasets[0], join_type, join_conds,
               operatorBetweenConditions=operator, alias=input_B_names[0])
    sql_by_join[join_type] = toSQL(query, input_A_datasets[0])

# Each output dataset receives the result of one join flavour.
e = SQLExecutor2()
e.exec_recipe_fragment(output_A_datasets[0], sql_by_join['LEFT'])
e.exec_recipe_fragment(output_B_datasets[0], sql_by_join['RIGHT'])
e.exec_recipe_fragment(output_C_datasets[0], sql_by_join['INNER'])