def _build_ordered_similarity( self, select_from, with_clause_as="_with_clause_normalization_factor"): """Build a similarity table col_1, col_2, similarity where col_1 < col_2 """ similarity = SelectQuery() if self.supports_with_clause: similarity.with_cte(select_from, alias=with_clause_as) select_from = with_clause_as similarity.select_from(select_from, alias=self.LEFT_NORMALIZATION_FACTOR_AS) join_conditions = [ Column(self.pivot_column, self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe( Column(self.pivot_column, self.RIGHT_NORMALIZATION_FACTOR_AS)) ] if self.supports_full_outer_join: join_conditions += [ Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).lt( Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)) ] else: join_conditions += [ Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).ne( Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)) ] similarity.join(select_from, JoinTypes.INNER, join_conditions, alias=self.RIGHT_NORMALIZATION_FACTOR_AS) similarity.group_by( Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS)) similarity.group_by( Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS)) similarity.select(Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS), alias=f"{self.based_column}_1") similarity.select(Column( self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS), alias=f"{self.based_column}_2") similarity.select(self._get_similarity_formula(), alias=constants.SIMILARITY_COLUMN_NAME) return similarity
def make_full_transform_query(aggregation_queries, dataset, aggregation_params, transform_params, encoding_feature=False): is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys() inner = SelectQuery() if is_hdfs: inner.select_from('_'.join(dataset.name.split('.'))) else: inner.select_from(dataset) if aggregation_params.is_rolling_window(): inner.select(Column('*')) else: inner.distinct() #TODO why?! -> avoid dupplicate for key in aggregation_params.get_effective_keys(): inner.select(Column(key)) prefilter = _make_prefilter(aggregation_params, transform_params) inner.where(prefilter) outer = SelectQuery() outer.select_from(inner, alias='inner') if aggregation_params.is_rolling_window(): outer.select(Column('*', 'inner')) else: for col in aggregation_params.get_effective_keys(): #+ feature_names: outer.select(Column(col, 'inner')) reverse_mapping_dict = {} for idx, agg_query in enumerate(aggregation_queries): agg_query.alias( agg_query.get_alias() or 'cte_' + str(idx)) #TODO remove, make sure they have ids outer.with_cte(agg_query) join_cond = Expression() for key in aggregation_params.get_effective_keys(): join_cond = join_cond.and_( Column(key, 'inner').eq_null_unsafe( Column(key, agg_query.get_alias()))) outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond) for idx2, col in enumerate(agg_query.get_columns_alias()): if encoding_feature: if aggregation_params.feature_name_mapping.get(col): new_alias = '{}_{}_{}'.format( aggregation_params.feature_name_mapping.get(col), idx, idx2) outer.select(Column(col, agg_query.get_alias()), new_alias) reverse_mapping_dict[new_alias] = col else: outer.select(Column(col, agg_query.get_alias())) return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
def _build_unordered_similarity( self, select_from, left_select_from_as="_left_ordered_similarity", right_select_from_as="_right_ordered_similarity", with_clause_as="_with_clause_ordered_similarity", ): """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table""" similarity = SelectQuery() if self.supports_with_clause: similarity.with_cte(select_from, alias=with_clause_as) select_from = with_clause_as similarity.select_from(select_from, alias=left_select_from_as) join_condition = Constant(1).eq_null_unsafe(Constant(0)) similarity.join(select_from, JoinTypes.FULL, join_condition, alias=right_select_from_as) similarity.select( Column(f"{self.based_column}_1", table_name=left_select_from_as).coalesce( Column(f"{self.based_column}_2", table_name=right_select_from_as)), alias=f"{self.based_column}_1", ) similarity.select( Column(f"{self.based_column}_2", table_name=left_select_from_as).coalesce( Column(f"{self.based_column}_1", table_name=right_select_from_as)), alias=f"{self.based_column}_2", ) similarity.select( Column("similarity", table_name=left_select_from_as).coalesce( Column("similarity", table_name=right_select_from_as)), alias=constants.SIMILARITY_COLUMN_NAME, ) return similarity
toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # Having clause sb = SelectQuery() sb.select_from('myTable') sb.having(Column('havingcol').ge(Constant(33))) toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # CTE sb = SelectQuery() sb.select(Constant(33)) sb.with_cte(sql.SelectQuery().select_from('cte_table').alias('myCTE')) sb.select_from('myTable') toSQL(sb, Dialects.GREENPLUM) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # Order sb = SelectQuery() sb.select_from('myTable') sb.order_by(Column('orderCol')) toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # Subquery q1 = SelectQuery()