def make_distinct_values_query(column, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    column = Column(column)

    layer_1 = SelectQuery()
    layer_1.select(column)
    layer_1.select(Column('*').count(), 'count')
    if is_hdfs:
        layer_1.select_from('_'.join(dataset.name.split('.')))
    else:
        layer_1.select_from(dataset)
    layer_1.group_by(column)

    count = Column('count')
    layer_2 = SelectQuery()
    layer_2.select(column)
    layer_2.select(
        count.div(count.sum().over(Window())).times(100), 'distribution')
    layer_2.select_from(layer_1, alias='layer_1')

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(
        layer_2)  #toSQL(query, dialect='Hive')#dataset=dataset)
    def _build_ordered_similarity(
            self,
            select_from,
            with_clause_as="_with_clause_normalization_factor"):
        """Build a similarity table col_1, col_2, similarity where col_1 < col_2 """
        similarity = SelectQuery()

        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from,
                               alias=self.LEFT_NORMALIZATION_FACTOR_AS)

        join_conditions = [
            Column(self.pivot_column,
                   self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe(
                       Column(self.pivot_column,
                              self.RIGHT_NORMALIZATION_FACTOR_AS))
        ]

        if self.supports_full_outer_join:
            join_conditions += [
                Column(self.based_column,
                       self.LEFT_NORMALIZATION_FACTOR_AS).lt(
                           Column(self.based_column,
                                  self.RIGHT_NORMALIZATION_FACTOR_AS))
            ]
        else:
            join_conditions += [
                Column(self.based_column,
                       self.LEFT_NORMALIZATION_FACTOR_AS).ne(
                           Column(self.based_column,
                                  self.RIGHT_NORMALIZATION_FACTOR_AS))
            ]

        similarity.join(select_from,
                        JoinTypes.INNER,
                        join_conditions,
                        alias=self.RIGHT_NORMALIZATION_FACTOR_AS)

        similarity.group_by(
            Column(self.based_column,
                   table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
        similarity.group_by(
            Column(self.based_column,
                   table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))

        similarity.select(Column(self.based_column,
                                 table_name=self.LEFT_NORMALIZATION_FACTOR_AS),
                          alias=f"{self.based_column}_1")
        similarity.select(Column(
            self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS),
                          alias=f"{self.based_column}_2")

        similarity.select(self._get_similarity_formula(),
                          alias=constants.SIMILARITY_COLUMN_NAME)

        return similarity
Beispiel #3
0
 def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
     samples_with_only_positives = SelectQuery()
     samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
     samples_with_only_positives.select(Column(self.dku_config.users_column_name))
     samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
     samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
     samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
     return samples_with_only_positives
    def _build_sum_of_similarity_scores(
            self,
            top_n,
            normalization_factor,
            top_n_as="_top_n",
            normalization_factor_as="_normalization_factor"):
        cf_scores = SelectQuery()
        cf_scores.select_from(top_n, alias=top_n_as)

        join_condition = Column(f"{self.based_column}_2",
                                top_n_as).eq_null_unsafe(
                                    Column(self.based_column,
                                           normalization_factor_as))
        cf_scores.join(normalization_factor,
                       JoinTypes.INNER,
                       join_condition,
                       alias=normalization_factor_as)

        cf_scores.group_by(
            Column(f"{self.based_column}_1", table_name=top_n_as))
        cf_scores.group_by(
            Column(self.pivot_column, table_name=normalization_factor_as))

        cf_scores.select(Column(f"{self.based_column}_1", table_name=top_n_as),
                         alias=self.based_column)
        cf_scores.select(
            Column(self.pivot_column, table_name=normalization_factor_as))

        cf_scores.select(self._get_user_item_similarity_formula(
            top_n_as, normalization_factor_as),
                         alias=constants.SCORE_COLUMN_NAME)

        cf_scores.order_by(Column(self.based_column))
        cf_scores.order_by(Column(constants.SCORE_COLUMN_NAME),
                           direction="DESC")
        return cf_scores
Beispiel #5
0
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# join in fluent style
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
sb = SelectQuery()
sb.select_from('myTable')
sb.group_by(Column('groupCol'))
sb.limit(1000)

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Where clause and select distinct
sb = SelectQuery()
sb.distinct()
sb.select_from('myTable')
sb.where(Column('wherecol').le(Constant(33)))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Having clause