Example #1
def pivot(trades, prices):
    """
    Pivot and fill the columns on the event id so that each row contains a
    column for each id + column combination where the value is the most recent
    non-null value for that id. For example, given the above input tables the
    expected output is:

    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | id|    timestamp|  bid|  ask|price|quantity|10_bid|10_ask|10_price|10_quantity|20_bid|20_ask|20_price|20_quantity|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | 10|1546300799000| 37.5|37.51| null|    null|  37.5| 37.51|    null|       null|  null|  null|    null|       null|
    | 10|1546300800000| null| null| 37.5|   100.0|  37.5| 37.51|    37.5|      100.0|  null|  null|    null|       null|
    | 10|1546300801000| null| null|37.51|   100.0|  37.5| 37.51|   37.51|      100.0|  null|  null|    null|       null|
    | 10|1546300802000|37.51|37.52| null|    null| 37.51| 37.52|   37.51|      100.0|  null|  null|    null|       null|
    | 20|1546300804000| null| null|12.67|   300.0| 37.51| 37.52|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300806000| 37.5|37.51| null|    null|  37.5| 37.51|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300807000| null| null| 37.5|   200.0|  37.5| 37.51|    37.5|      200.0|  null|  null|   12.67|      300.0|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+

    :param trades: DataFrame of trade events
    :param prices: DataFrame of price events
    :return: A DataFrame of the combined events and pivoted columns.
    """
    df = fill(trades, prices).groupBy('id', 'timestamp', 'bid', 'ask', 'price', 'quantity').pivot('id').agg(
            func.last('bid').alias('bid'),
            func.last('ask').alias('ask'),
            func.last('price').alias('price'),
            func.last('quantity').alias('quantity')) \
        .orderBy("timestamp")
    return df
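The pivot above relies on a fill(trades, prices) helper that is not shown in this snippet. A minimal sketch of what such a helper could look like, assuming both frames share the id and timestamp columns and carry bid/ask and price/quantity respectively (a hypothetical stand-in, not the original implementation):

def fill(trades, prices):
    # Combine the two event tables into one frame with a common column layout
    # so the pivot/last aggregation above can operate on a single DataFrame.
    return (trades
            .join(prices, ['id', 'timestamp'], 'outer')
            .select('id', 'timestamp', 'bid', 'ask', 'price', 'quantity'))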
Example #2
    def set_d1_state_vector(self):
        """
        returns 1D representation for the distributed state
        :return:
        """

        d1_state_vector_udf = F.udf(d1_state_vector,
                                    ArrayType(ArrayType(FloatType())))
        window = Window.partitionBy(['id', F.to_date('avg_ts')
                                     ]).orderBy('ts').rangeBetween(
                                         -OD_time_frame, OD_time_frame)


        self.df = self.df \
            .withColumn('ts', F.col('avg_ts').cast('long')) \
            .withColumn('first_ts', F.first('avg_ts').over(window)) \
            .withColumn('last_ts', F.last('avg_ts').over(window)) \
            .withColumn('first_lat_idx', F.first('lat_idx').over(window)) \
            .withColumn('last_lat_idx', F.last('lat_idx').over(window)) \
            .withColumn('first_lon_idx', F.first('lon_idx').over(window)) \
            .withColumn('last_lon_idx', F.last('lon_idx').over(window)) \
            .withColumn('d1_states1',
                        d1_state_vector_udf(F.col('first_lon_idx'), F.col('first_lat_idx'), F.lit(width), F.lit(lon_cells), F.lit(lat_cells))
                        ) \
            .withColumn('d1_states2',
                        d1_state_vector_udf(F.col('last_lon_idx'), F.col('last_lat_idx'), F.lit(width), F.lit(lon_cells), F.lit(lat_cells))
                        )
Example #3
    def __generate_target_fill(self, df: DataFrame, partition_cols: List[str],
                               ts_col: str, target_col: str) -> DataFrame:
        """
        Create columns for previous and next value for a specific target column

        :param df: input DataFrame
        :param partition_cols: partition column names
        :param ts_col: timestamp column name
        :param target_col: target column name
        """
        return (df.withColumn(
            f"previous_{target_col}",
            last(df[target_col], ignorenulls=True).over(
                Window.partitionBy(
                    *partition_cols).orderBy(ts_col).rowsBetween(
                        Window.unboundedPreceding, 0)),
        )
                # Handle if subsequent value is null
                .withColumn(
                    f"next_null_{target_col}",
                    last(df[target_col], ignorenulls=True).over(
                        Window.partitionBy(*partition_cols).orderBy(
                            col(ts_col).desc()).rowsBetween(
                                Window.unboundedPreceding, 0)),
                ).withColumn(
                    f"next_{target_col}",
                    lead(df[target_col]).over(
                        Window.partitionBy(*partition_cols).orderBy(ts_col)),
                ))
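The same previous/next fill pattern can be seen end to end on a small frame. A self-contained sketch, assuming an active SparkSession; the column names here are made up for illustration:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, last, lead
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("a", 1, 10.0), ("a", 2, None), ("a", 3, 30.0)],
    ["grp", "ts", "value"])
fwd = Window.partitionBy("grp").orderBy("ts").rowsBetween(Window.unboundedPreceding, 0)
bwd = Window.partitionBy("grp").orderBy(col("ts").desc()).rowsBetween(Window.unboundedPreceding, 0)
(toy.withColumn("previous_value", last("value", ignorenulls=True).over(fwd))
    .withColumn("next_null_value", last("value", ignorenulls=True).over(bwd))
    .withColumn("next_value", lead("value").over(Window.partitionBy("grp").orderBy("ts")))
    .orderBy("ts")
    .show())
# At ts=2 the null resolves to previous_value=10.0 (forward fill) and
# next_null_value=30.0 (backward fill); next_value is simply the next row's value.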
Example #4
def bi_fluent_join(pyData):
    df = spark.createDataFrame(pyData)
    level1 = df \
        .groupBy(df.grp) \
        .agg(
            func.mean(df.C).alias("mean_of_C"),
            func.max(df.D).alias("max_of_D"))
    level2 = df \
        .groupBy(df.grp, df.subgrp) \
        .agg(
            func.variance(df.E).alias("var_of_E"),
            ((func.sum(df.E * df.E)-
              func.sum(df.E) * func.avg(df.E))
             /(func.count(df.E)-1)).alias("var_of_E2")
        )
    level3 = level2 \
        .join(level1, "grp") \
        .groupBy(level1.grp) \
        .agg(
            func.last(level1.mean_of_C).alias("mean_of_C"),
            func.last(level1.max_of_D).alias("max_of_D"),
            func.avg(level2.var_of_E).alias("avg_var_of_E"),
            func.avg(level2.var_of_E2).alias("avg_var_of_E2")
        ) \
        .orderBy(level1.grp)
    # .collect()
    return level3, None
Example #5
 def test_first_last_ignorenulls(self):
     from pyspark.sql import functions
     df = self.spark.range(0, 100)
     df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
     df3 = df2.select(functions.first(df2.id, False).alias('a'),
                      functions.first(df2.id, True).alias('b'),
                      functions.last(df2.id, False).alias('c'),
                      functions.last(df2.id, True).alias('d'))
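     # ids 0..99 with every multiple of 3 nulled: the first row (0) and last row (99)
     # are null, so first/last without ignorenulls yield None, while with
     # ignorenulls they skip to 1 and 98.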
     self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
Example #7
def transform_data_with_udf(clickstream_data, purchase_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')

    clickstream_data = (clickstream_data.withColumn(
        'appOpenFlag',
        app_open_flag_udf(clickstream_data['eventType'])).withColumn(
            'sessionId',
            sum(col('appOpenFlag')).over(window1)).withColumn(
                'attr',
                attributes_udf(
                    clickstream_data['eventType'],
                    clickstream_data['attributes'])).withColumn(
                        'campaign_id',
                        when(
                            get_json_object('attr',
                                            '$.campaign_id').isNotNull(),
                            get_json_object('attr',
                                            '$.campaign_id')).otherwise(None)
                    ).withColumn(
                        'channel_id',
                        when(
                            get_json_object('attr',
                                            '$.channel_id').isNotNull(),
                            get_json_object(
                                'attr',
                                '$.channel_id')).otherwise(None)).withColumn(
                                    'purchase_id',
                                    when(
                                        get_json_object(
                                            'attr',
                                            '$.purchase_id').isNotNull(),
                                        get_json_object(
                                            'attr',
                                            '$.purchase_id')).otherwise(None)).
                        withColumn(
                            'campaignId',
                            last(col('campaign_id'), ignorenulls=True).over(
                                window2.rowsBetween(
                                    Window.unboundedPreceding, 0))).withColumn(
                                        'channelId',
                                        last(col('channel_id'),
                                             ignorenulls=True).over(
                                                 window2.rowsBetween(
                                                     Window.unboundedPreceding,
                                                     0))))

    target_df = clickstream_data.join(
        purchase_data,
        clickstream_data['purchase_id'] == purchase_data['purchaseId'],
        JOIN_TYPE.LEFT)

    return target_df.select(col('purchaseId'), col('purchaseTime'),
                            col('billingCost'), col('isConfirmed'),
                            col('sessionId'), col('campaignId'),
                            col('channelId'))
Example #8
 def _get_distances(self, prob_df: DataFrame, df_cdf_0: DataFrame,
                    df_cdf_1: DataFrame) -> DataFrame:
     window_fill = Window.orderBy(self.probability_col).rowsBetween(
         Window.unboundedPreceding, Window.currentRow)
     df_ks = prob_df.select(self.probability_col) \
         .join(df_cdf_0, on=self.probability_col, how='left') \
         .join(df_cdf_1, on=self.probability_col, how='left') \
         .withColumn(self.EMPIRICAL_CDF_NEG, F.last(self.EMPIRICAL_CDF_NEG, ignorenulls=True).over(window_fill)) \
         .withColumn(self.EMPIRICAL_CDF_POS, F.last(self.EMPIRICAL_CDF_POS, ignorenulls=True).over(window_fill)) \
         .fillna(0) \
         .withColumn(self.DISTANCE, F.abs(F.col(self.EMPIRICAL_CDF_NEG) - F.col(self.EMPIRICAL_CDF_POS)))
     return df_ks
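As a follow-up (not part of the original snippet), the Kolmogorov-Smirnov statistic itself is the largest of the per-row CDF gaps computed above. A self-contained sketch with a hypothetical frame shaped like the returned df_ks, where the gap column is called "distance":

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df_ks = spark.createDataFrame([(0.1, 0.05), (0.5, 0.30), (0.9, 0.12)],
                              ["probability", "distance"])
ks_statistic = df_ks.agg(F.max("distance").alias("ks")).first()["ks"]  # 0.30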
Example #9
    def __getLastRightRow(self, left_ts_col, right_cols, sequence_col,
                          tsPartitionVal):
        """Get the last right value of each right column (incl. the right timestamp) for each self.ts_col value.

        self.ts_col, which is the combined timestamp column of both the left and right dataframes, is dropped at
        the end since it is no longer used in subsequent methods.
        """
        from functools import reduce
        ptntl_sort_keys = [self.ts_col, sequence_col]
        sort_keys = [
            f.col(col_name) for col_name in ptntl_sort_keys if col_name != ''
        ]
        sort_keys.append('rec_ind')

        window_spec = Window.partitionBy(
            self.partitionCols).orderBy(sort_keys).rowsBetween(
                Window.unboundedPreceding, Window.currentRow)

        # splitting off the condition as we want different columns in the reduce if we are implementing the skew AS OF join
        if tsPartitionVal is None:
            df = reduce(
                lambda df, idx: df.withColumn(
                    right_cols[idx],
                    f.last(right_cols[idx], True).over(window_spec)),
                range(len(right_cols)), self.df)
        else:
            df = reduce(
                lambda df, idx: df.withColumn(
                    right_cols[idx],
                    f.last(right_cols[idx], True).over(window_spec)).
                withColumn('non_null_ct' + right_cols[idx],
                           f.count(right_cols[idx]).over(window_spec)),
                range(len(right_cols)), self.df)

        df = (df.filter(f.col(left_ts_col).isNotNull()).drop(
            self.ts_col)).drop('rec_ind')

        # remove the null_ct stats used to record missing values in partitioned as of join
        if tsPartitionVal is not None:
            for column in df.columns:
                if (column.startswith("non_null")):
                    any_blank_vals = (df.agg({
                        column: 'min'
                    }).collect()[0][0] == 0)
                    newCol = column.replace("non_null_ct", "")
                    if any_blank_vals:
                        print(
                            "Column " + newCol +
                            " had no values within the lookback window. Consider using a larger window to avoid missing values. If this is the first record in the data frame, this warning can be ignored."
                        )
                    df = df.drop(column)

        return TSDF(df, left_ts_col, self.partitionCols)
Example #10
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                       symbol=row[0],
                       tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                       price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
            .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
            .mode('append') \
            .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
                .orderBy('tx_time') \
                .groupBy('symbol', 'batch_time') \
                .agg(
                   F.first(data.price).alias('open'),
                   F.max(data.price).alias('high'),
                   F.min(data.price).alias('low'),
                   F.last(data.price).alias('close'),
                   F.first(data.tx_time).alias('open_time'),
                   F.last(data.tx_time).alias('close_time')
                )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
            .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
            .load() \
            .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(existing_ohlc,
                             (ohlc.symbol == existing_ohlc.symbol) &
                             (ohlc.batch_time == existing_ohlc.batch_time),
                             'left'
                           )

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
                .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
                .mode('append') \
                .save()
Example #11
def pivot(trades, prices):
    """
    Pivot and fill the columns on the event id so that each row contains a
    column for each id + column combination where the value is the most recent
    non-null value for that id. For example, given the above input tables the
    expected output is:

    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | id|    timestamp|  bid|  ask|price|quantity|10_bid|10_ask|10_price|10_quantity|20_bid|20_ask|20_price|20_quantity|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | 10|1546300799000| 37.5|37.51| null|    null|  37.5| 37.51|    null|       null|  null|  null|    null|       null|
    | 10|1546300800000| null| null| 37.5|   100.0|  37.5| 37.51|    37.5|      100.0|  null|  null|    null|       null|
    | 10|1546300801000| null| null|37.51|   100.0|  37.5| 37.51|   37.51|      100.0|  null|  null|    null|       null|
    | 10|1546300802000|37.51|37.52| null|    null| 37.51| 37.52|   37.51|      100.0|  null|  null|    null|       null|
    | 20|1546300804000| null| null|12.67|   300.0| 37.51| 37.52|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300806000| 37.5|37.51| null|    null|  37.5| 37.51|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300807000| null| null| 37.5|   200.0|  37.5| 37.51|    37.5|      200.0|  null|  null|   12.67|      300.0|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+

    :param trades: DataFrame of trade events
    :param prices: DataFrame of price events
    :return: A DataFrame of the combined events and pivoted columns.
    """
    trades_prices = trades. \
        join(prices, ['id', 'timestamp'], 'outer'). \
        select('id', 'timestamp', 'bid', 'ask', 'price', 'quantity'). \
        orderBy(asc("timestamp"))
    unique_ids = trades_prices.select('id').distinct().collect()
    result = None
    for row in unique_ids:
        id = str(row.id)
        dyn_columns = trades_prices. \
            withColumn("bid", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('bid')).cast(T.DoubleType()))).\
            withColumn("ask", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('ask')).cast(T.DoubleType()))).\
            withColumn("price", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('price')).cast(T.DoubleType()))).\
            withColumn("quantity", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('quantity')).cast(T.DoubleType()))).\
            withColumn(id+"_id", when(col("id") == row.id, lit(id).cast(T.IntegerType())).otherwise(lit(id).cast(T.IntegerType()))).\
            withColumn(id + "_bid", func.last('bid', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_ask", func.last('ask', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_price", func.last('price', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_quantity", func.last('quantity', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))).\
            drop('bid', 'ask', 'price', 'quantity', id + "_id")
        if result is None:
            result = trades_prices.join(dyn_columns, ['id', 'timestamp'], how='outer')
        else:
            result = result.join(dyn_columns, ['id', 'timestamp'], how='outer')

    return result.orderBy('timestamp')
Example #12
def get_editor_features(tag_user_histories):
    tag_features = tag_user_histories.groupby('event_user_id') \
        .agg(f.last('num_groups').alias('num_groups'),
             f.countDistinct('page_id').alias('num_articles'),
             f.count('revision_id').alias('num_edits'),
             f.last('num_blocks_historical').alias('num_past_blocks'),
             f.last('num_curr_blocks').alias('num_curr_blocks'),
             f.sum(col("is_revert_bool")).alias('num_reverts_by_others'),
             f.sum(col('is_reverted_bool')).alias('num_reverts_of_others'),
             f.last('days_since_registration').alias('time_since_registration'),
             udf_page_talk_ratio(f.collect_list('page_namespace')).alias('talk_article_ratio'),
             udf_contribution_frac(f.collect_list('page_id')).alias('contribution_frac_entropy')
             )

    return tag_features
Example #13
def bi_fluent_window(pyData):
    df = spark.createDataFrame(pyData)
    window = Window \
        .partitionBy(df.grp, df.subgrp) \
        .orderBy(df.id)
    df = df \
        .orderBy(df.grp, df.subgrp, df.id)\
        .withColumn("sub_var_of_E",
                    func.variance(df.E)\
                              .over(window))
    df = df \
        .groupBy(df.grp, df.subgrp)\
        .agg(func.sum(df.C).alias("sub_sum_of_C"),
            func.count(df.C).alias("sub_count"),
            func.max(df.D).alias("sub_max_of_D"),
            func.last(df.sub_var_of_E).alias("sub_var_of_E1"),
            func.variance(df.E).alias("sub_var_of_E2"))
    df \
        .groupBy(df.grp)\
        .agg(
            (func.sum(df.sub_sum_of_C)/
             func.sum(df.sub_count)).alias("mean_of_C"),
            func.max(df.sub_max_of_D).alias("max_of_D"),
            func.avg(df.sub_var_of_E1).alias("avg_var_of_E1"),
            func.avg(df.sub_var_of_E2).alias("avg_var_of_E2"))\
        .orderBy(df.grp)\
        .collect()
Example #14
    def _denoise_marker_column(self, window, start=True) -> Column:
        """Return marker column with noises removed and forward/backwards
         filled.

        Parameters
        ----------
        window: pyspark.sql.Window
            Resembles a window specification according to groupby/order.
        start: bool, optional
            Indicate fill order. If True, forward fill for start markers. If
            False, backwards fill for end markers.

        Returns
        -------
        denoised: pyspark.sql.column.Column
            Return spark column expression with denoised values.

        """

        marker_column = F.col(self.marker_column)

        # remove noise values
        valid_values = [self.marker_start, self.marker_end]
        mask_no_noise = marker_column.isin(valid_values)
        denoised = F.when(mask_no_noise, marker_column)

        # forward fill with remaining start/end markers
        if start:
            ffill_window = window.rowsBetween(Window.unboundedPreceding, 0)
            fill = F.last(denoised, ignorenulls=True).over(ffill_window)
        else:
            bfill_window = window.rowsBetween(0, Window.unboundedFollowing)
            fill = F.first(denoised, ignorenulls=True).over(bfill_window)

        return fill
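A self-contained sketch of the same denoise-and-fill idea on a toy frame; the marker values "start"/"end"/"noise" and the SparkSession are assumptions for illustration:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("g", 1, "start"), ("g", 2, "noise"), ("g", 3, "end"), ("g", 4, "noise")],
    ["grp", "ts", "marker"])
window = Window.partitionBy("grp").orderBy("ts")
denoised = F.when(F.col("marker").isin("start", "end"), F.col("marker"))
ffill = F.last(denoised, ignorenulls=True).over(
    window.rowsBetween(Window.unboundedPreceding, 0))
bfill = F.first(denoised, ignorenulls=True).over(
    window.rowsBetween(0, Window.unboundedFollowing))
df.withColumn("start_marker", ffill).withColumn("end_marker", bfill).show()
# Noise rows pick up the nearest preceding start marker (forward fill) and the
# nearest following end marker (backward fill), where one exists.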
Example #15
def task_4(df):
    window_duplicate_remove = Window.partitionBy('video_id').orderBy(
        col('views').desc())

    task_4_res_df = df.select('channel_title', 'video_id', 'views', 'trending_date',
                              F.row_number().over(window_duplicate_remove).alias('rn')). \
        filter(col('rn') == 1).groupBy("channel_title").agg(
        F.first('trending_date').alias("start_date"),
        F.last('trending_date').alias("end_date"),
        F.sum('views').alias("total_views"),
        F.collect_list('video_id').alias("video_id_list"),
        F.collect_list('views').alias("views_list")
    ).orderBy(col('total_views'), ascending=False).limit(20)
    query_result = task_4_res_df.collect()
    json_result = {
        "channels": [{
            "channel_name":
            row.channel_title,
            "start_date":
            row.start_date,
            "end_date":
            row.end_date,
            "total_views":
            row.total_views,
            "videos_views": [{
                "video_id": row.video_id_list[i],
                "viewes": row.views_list[i]
            } for i in range(len(row.video_id_list))]
        } for row in query_result]
    }
    return json_result
Example #16
def metricDaysPerWeekPerProfileDay(data,
                                   needed_dimension_variables,
                                   feature_col,
                                   sampling_multiplier,
                                   days=7,
                                   include_day_of_week=False):
    all_user_days = data.select("id").distinct().crossJoin(
        data.select("date").distinct())

    data = data.filter(
        col(feature_col) > 0).select(["id", "date", "bucket", feature_col] +
                                     needed_dimension_variables).distinct()

    data = data.alias("intermediate_table")
    all_user_days = all_user_days.alias("all_user_days")

    # Augment activity table to include non-active days
    intermediate_table2 = data.join(
        all_user_days, ['id', 'date'], 'outer').withColumn(
            "n_", F.coalesce("intermediate_table." + feature_col,
                             lit(0))).drop(feature_col).withColumnRenamed(
                                 "n_", feature_col)

    if include_day_of_week:
        intermediate_table2 = intermediate_table2.withColumn(
            feature_col + "_weekend",
            F.when(
                F.date_format('date', 'u').cast(IntegerType()) >= 6,
                col(feature_col)).otherwise(0)).withColumn(
                    feature_col + "_weekday",
                    F.when(
                        F.date_format('date', 'u').cast(IntegerType()) <= 5,
                        col(feature_col)).otherwise(0))

    # Calculate active days per week for each profile-day
    windowSpec = Window.partitionBy([intermediate_table2.id]).orderBy(
        intermediate_table2.date).rowsBetween(1 - days, 0)

    intermediate_table3 = intermediate_table2.withColumn(
        "n_",
        F.sum(intermediate_table2[feature_col]).over(windowSpec)).drop(
            feature_col).withColumnRenamed("n_", feature_col)
    if include_day_of_week:
        intermediate_table3 = intermediate_table3.withColumn(
            "n_",
            F.sum(intermediate_table2[feature_col + "_weekend"]).over(
                windowSpec)).drop(feature_col + "_weekend").withColumnRenamed(
                    "n_", feature_col + "_weekend")

        intermediate_table3 = intermediate_table3.withColumn(
            "n_",
            F.sum(intermediate_table2[feature_col + "_weekday"]).over(
                windowSpec)).drop(feature_col + "_weekday").withColumnRenamed(
                    "n_", feature_col + "_weekday")

    for v in needed_dimension_variables:
        intermediate_table3 = intermediate_table3.withColumn(
            v,
            F.last(v, True).over(windowSpec))
    return intermediate_table3
Example #17
def task_2(df_proper_date, categories_map):
    window_group_WoY = Window.partitionBy("WoY").orderBy(
        col('total_views').desc())
    task_2_res_df = df_proper_date.withColumn('WoY', (
                F.weekofyear(df_proper_date.dateframe) + F.year(df_proper_date.dateframe) * 53)). \
        groupBy('WoY', "category_id", "video_id").agg(
        F.count('video_id').alias("count"),
        F.first('views').alias("start_views"),
        F.last('views').alias("end_views"),
    ).filter(col('count') > 1). \
        withColumn("diff", (col("end_views") - col("start_views"))). \
        groupBy("WoY", "category_id").agg(
        F.sum('diff').alias("total_views"),
        F.collect_list('video_id').alias("video_id_list"),
    ).withColumn("rank", F.row_number().over(window_group_WoY)).filter(col("rank") == 1)
    query_result = task_2_res_df.collect()
    json_result = {
        "weeks": [{
            "start_date":
            date_by_week_n_from(int(row.WoY / 53), row.WoY % 53),
            "end_date":
            date_by_week_n_to(int(row.WoY / 53), row.WoY % 53),
            "category_id":
            row.category_id,
            "category_name":
            categories_map[row.category_id],
            "number_of_videos":
            len(row.video_id_list),
            "total_views":
            row.total_views,
            "video_ids":
            row.video_id_list
        } for row in query_result]
    }
    return json_result
Example #18
def cond_fluent_window(pyData):
    dfData = spark.createDataFrame(pyData)
    dfData = dfData \
        .withColumn("cond", func.when(dfData.E < 0, -1).otherwise( +1))
    dfData = dfData \
        .orderBy(dfData.grp, dfData.subgrp, dfData.cond, dfData.id)
    window = Window \
        .partitionBy(dfData.grp, dfData.subgrp, dfData.cond) \
        .orderBy(dfData.id)\
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    dfData = dfData \
        .withColumn("cond_var_of_E_2_pre1",
                    func.when(dfData.cond < 0,
                            func.variance(dfData.E)\
                            .over(window)))
    dfData = dfData \
        .groupBy(dfData.grp, dfData.subgrp, dfData.cond)\
        .agg(func.sum(dfData.C).alias("sum_of_C_pre"),
            func.count(dfData.C).alias("count_of_C_pre"),
            func.max(dfData.D).alias("max_of_D_pre"),
            func.variance(func.when(dfData.E < 0, dfData.E)).alias("cond_var_of_E_1_pre"),
            func.last(dfData.cond_var_of_E_2_pre1).alias("cond_var_of_E_2_pre2"))

    dfData = dfData \
        .groupBy(dfData.grp, dfData.subgrp)\
        .agg((func.sum(dfData.sum_of_C_pre) \
            / func.sum(dfData.count_of_C_pre)\
            ).alias("mean_of_C"),
            func.max(dfData.max_of_D_pre).alias("max_of_D"),
            func.max(dfData.cond_var_of_E_1_pre).alias("cond_var_of_E_1"),
            func.max(dfData.cond_var_of_E_2_pre2).alias("cond_var_of_E_2"))\
        .orderBy(dfData.grp, dfData.subgrp)\
        .collect()
Example #19
    def forward_fill_dataframe(df, partition_cols, filling_cols):
        forward_fill_window = Window.partitionBy(partition_cols).rowsBetween(-sys.maxsize, 0)
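        # Note: this window specifies a rows frame but no orderBy, so "last"
        # follows whatever row order Spark has within each partition.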
        for column in filling_cols:
            filled_column_values = F.last(df[column], ignorenulls=True).over(forward_fill_window)
            df = df.withColumn(column, filled_column_values)

        return df
Example #20
    def join_stress_streams(self, dataStream, propagation='forward'):
        """
        filter data

        Args:
            columnName (str): name of the column
            operator (str): basic operators (e.g., >, <, ==, !=)
            value (Any): if the columnName is timestamp, please provide python datatime object

        Returns:
            DataStream: this will return a new datastream object with blank metadata
        """
        combined_df = self._data.join(
            dataStream.data,
            on=['user', 'timestamp', 'localtime', 'version'],
            how='full').orderBy('timestamp')
        combined_filled = combined_df.withColumn(
            "data_quality",
            F.last('data_quality', True).over(
                Window.partitionBy('user').orderBy('timestamp').rowsBetween(
                    -sys.maxsize, 0)))
        combined_filled_filtered = combined_filled.filter(
            combined_filled.ecg.isNotNull())

        return DataStream(data=combined_filled_filtered, metadata=Metadata())
Example #21
    def ffill_windows(cls, df, time_col, columns_to_fill):
        """
    Forward filling strategy. This strategy fills empty 
    spots using the last know value of a column

    """
        import sys
        from pyspark.sql import Window
        from pyspark.sql.functions import last

        # define the window (and order it by time)
        window = Window.orderBy(time_col)\
                    .rowsBetween(-sys.maxsize, 0)

        # fill every column and replace columns
        for col_entry in columns_to_fill:
            col_name_to_fill = col_entry[0]
            col_name_new = col_entry[1]

            if (col_name_new is None):
                col_name_new = col_name_to_fill

            df = df.withColumn(
                col_name_new,
                last(df[col_name_to_fill], ignorenulls=True).over(window))

        return df
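A hypothetical usage of ffill_windows, assuming it is exposed as a classmethod on a helper class (called Filler here) and an active SparkSession named spark:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ticks = spark.createDataFrame(
    [(1, 10.0), (2, None), (3, None), (4, 12.5)],
    ["ts", "price"])
# Each entry is (column_to_fill, new_column_name); None keeps the original name.
filled = Filler.ffill_windows(ticks, "ts", [("price", None)])
filled.orderBy("ts").show()
# Rows ts=2 and ts=3 now carry 10.0, the last known price before them.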
Example #22
def extract_itineraries(job):
    find_spark()
    from util import hdfs_fn
    from pyspark.sql.functions import collect_list, first, last

    def make_line(row):
        return '{} {} {} {} {}'.format(
            row.ItinID, row.FirstAirportID,
            ' '.join(map(str, row.OriginAirportIDs)), row.LastAirportID,
            row.LastAirportID)

    with build_spark() as spark:
        df = spark.read.csv(hdfs_fn(job, 'Coupon.csv'),
                            header=True,
                            inferSchema=True)
        col_names = [
            'ItinID', 'SeqNum', 'OriginAirportID', 'Origin', 'DestAirportID',
            'Dest'
        ]
        df_network = df[col_names].repartition('ItinID').sort(
            ['ItinID', 'SeqNum'])
        itins = df_network.groupby(['ItinID']).agg(
            first('OriginAirportID').alias('FirstAirportID'),
            collect_list('OriginAirportID').alias('OriginAirportIDs'),
            last('DestAirportID').alias('LastAirportID'))
        itins.rdd.map(make_line).saveAsTextFile(
            hdfs_fn(job, 'hon_itineraries.txt'))
Example #23
    def _summary(self, name=None):
        """
        Return a summarized representation.

        Parameters
        ----------
        name : str
            name to use in the summary representation

        Returns
        -------
        String with a summarized representation of the index
        """
        head, tail, total_count = self._kdf._sdf.select(
            F.first(self._scol), F.last(self._scol),
            F.count(F.expr("*"))).first()

        if total_count > 0:
            index_summary = ", %s to %s" % (pprint_thing(head),
                                            pprint_thing(tail))
        else:
            index_summary = ""

        if name is None:
            name = type(self).__name__
        return "%s: %s entries%s" % (name, total_count, index_summary)
Example #24
def countTest():
    countDf = spark.read.format('csv').option("header", "true").option(
        "inferSchema", "true").load("../../data/airlines.csv")
    print(countDf.count())
    countDf.select(count('Code')).show()
    countDf.select(countDistinct('Code')).show()
    countDf.select(first('Code'), last('Code')).show()
Example #25
    def _generate_raw_iids_special(self, start_first: bool,
                                   add_negate_shift_col: bool,
                                   reverse=False) -> Column:
        """Create sequence of interval ids in increasing order regardless of
        their validity.

        Parameters
        ----------
        start_first: bool
            Defines if the first start is used for intervals.
        add_negate_shift_col: bool
            True if the shift col have to be negated.
        reverse: bool, optional
            Define order by.

        Returns
        -------
        raw_iids: pyspark.sql.column.Column

        """

        marker_col = F.col(self.marker_column)
        window = self._window_groupby(reverse)

        # generate forward fill depending on interval
        if start_first:
            default = 0
            forward_fill = F.when(marker_col == self.marker_start, 1) \
                .when(marker_col == self.marker_end, 0) \
                .otherwise(None)
        else:
            default = 1
            forward_fill = F.when(marker_col == self.marker_end, 1) \
                .when(marker_col == self.marker_start, 0) \
                .otherwise(None)

        ff_window = window.rowsBetween(Window.unboundedPreceding, 0)
        forward_fill_col = F.last(forward_fill, ignorenulls=True).over(
            ff_window)

        # shifting marker_col forward
        shift_col = F.lag(forward_fill_col, default=default, count=1) \
            .over(window) \
            .cast("integer")

        # compare forward fill col and shifted forward fill col
        end_marker_null_col = F.when(shift_col == forward_fill_col, 0) \
            .otherwise(forward_fill_col)

        if add_negate_shift_col:
            shift_col_negated = F.when(shift_col == 0, 1).otherwise(0)
            add_col = end_marker_null_col + shift_col_negated
        else:
            add_col = end_marker_null_col

        # build cum sum over window
        raw_iids = F.sum(add_col).over(window)

        return raw_iids
Example #26
def get_last_user_event_value(
    target_column: str, user_column: str = "user_id"
) -> Column:
    return F.last(F.col(target_column)).over(
        Window()
        .partitionBy(user_column)
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )
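A hypothetical usage, assuming an events DataFrame with user_id and event_type columns and an active SparkSession. Note that the window above has no orderBy, so "last" means the last row Spark encounters within the user partition rather than a time-ordered last:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("u1", "open"), ("u1", "click"), ("u2", "open")],
    ["user_id", "event_type"])
events.withColumn("last_event_type",
                  get_last_user_event_value("event_type")).show()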
Example #27
 def __fill_nans(self, df):
     # part of the fix related to JIRA ARESPY-20
     window = Window.partitionBy('timestamp').orderBy(
         'timestamp').rowsBetween(-100000, 0)
     for column in df.columns:
         df = df.withColumn('filled_%s' % column,
                            F.last(F.col(column)).over(window).isNotNull())
     return df
Example #28
def create_prediction_df(training_df, prediction_period, other_column=None):
    """
    :param training_df: -- dataframe: for training
    :param prediction_period: -- integer: number of period to predict on
    :param other_column: -- column name to use for prediction other than the overall rank
    :return: -- dataframe and corresponding dates
    """
    if other_column:
        last_date = training_df.select('Date').distinct().orderBy(
            'Date').select(last('Date')).collect()[0][0]
        last_id = training_df.select('id').distinct().orderBy('id').select(
            max('id')).collect()[0][0]

        date_rows = list(
            Row(
                float(last_id + i),
                date(last_date.year, last_date.month, 1) +
                timedelta(days=i * 31))
            for i in range(1, prediction_period + 1))

        date_df = spark.createDataFrame(date_rows, ['id', 'Date'])

        specialized_rows = training_df.select(other_column, other_column +
                                              '_idx').distinct()

        prediction_df = specialized_rows.join(date_df)

        assembler = VectorAssembler(inputCols=['id', other_column + '_idx'],
                                    outputCol='features')
        return assembler.transform(prediction_df)

    else:
        last_date = training_df.orderBy('Date').select(
            last('Date')).collect()[0][0]
        last_id = training_df.orderBy('id').select(
            max('id')).collect()[0][0][0]
        prediction_rows = list(
            Row(
                float(last_id + i),
                date(last_date.year, last_date.month, 1) +
                timedelta(days=i * 31))
            for i in range(1, prediction_period + 1))

        prediction_df = spark.createDataFrame(prediction_rows, ['id', 'Date'])

    return prediction_df
Example #29
def metricRetention(data,
                    needed_dimension_variables,
                    feature_col,
                    sampling_multiplier,
                    activated=False):
    activity_data = data.filter(col(feature_col) > 0).select(
        ["id", "date", feature_col]).distinct()

    pcd_table = data.select(["date", "id", "bucket"] +
                            needed_dimension_variables)
    windowSpec = Window.partitionBy([pcd_table.id] +
                                    needed_dimension_variables).orderBy(
                                        pcd_table.date).rowsBetween(0, 13)
    for v in needed_dimension_variables:
        pcd_table = pcd_table.withColumn(v, F.last(v, True).over(windowSpec))
    pcd_table = pcd_table.filter(col("new_profile") == 1)

    if activated:
        pcd_table = pcd_table.alias("pcd_t").join(
            activity_data.alias("i_t"), (col('pcd_t.id') == col('i_t.id')) &
            (col('i_t.date') >= F.date_add(col('pcd_t.date'), 1)) &
            (col('i_t.date') <= F.date_add(col('pcd_t.date'), 6)),
            "inner").filter(col("i_t." + feature_col) > 0).dropDuplicates([
                'id'
            ]).select([
                col('pcd_t.{}'.format(c))
                for c in ['id', 'bucket', "date"] + needed_dimension_variables
            ])

    intermediate_table3 = pcd_table.alias("pcd_t").join(
        activity_data.alias("i_t"), (col('pcd_t.id') == col('i_t.id')) &
        (col('i_t.date') >= F.date_add(col('pcd_t.date'), 7)) &
        (col('i_t.date') <= F.date_add(col('pcd_t.date'), 13)),
        "outer").select([
            'pcd_t.{}'.format(c)
            for c in ['id', 'date', 'bucket'] + needed_dimension_variables
        ] + [feature_col]).fillna(0, [feature_col]).groupBy([
            'pcd_t.{}'.format(c)
            for c in ['id', 'date', 'bucket'] + needed_dimension_variables
        ], ).agg(F.max(col(feature_col))).drop(feature_col).withColumnRenamed(
            "MAX({})".format(feature_col), feature_col).select([
                col("pcd_t.{}".format(c)).alias(c)
                for c in ['id', 'bucket', 'date'] + needed_dimension_variables
            ] + [feature_col])

    intermediate_table4 = intermediate_table3.groupBy(
        ["date", "bucket"] +
        needed_dimension_variables).mean(feature_col).withColumnRenamed(
            'avg({})'.format(feature_col), feature_col)
    intermediate_table4_allbucket = intermediate_table3.groupBy(
        ["date"] +
        needed_dimension_variables).mean(feature_col).withColumnRenamed(
            'avg({})'.format(feature_col),
            feature_col).withColumn('bucket', lit("ALL"))

    joined_intermediate = intermediate_table4.unionByName(
        intermediate_table4_allbucket)
    return joined_intermediate
Example #30
def fill_activity_na(df):
    # define the window
    window = Window.orderBy('timestamp').rowsBetween(-20, 0)

    # define the forward-filled column
    filled_column = last(df['heart_rate'], ignorenulls=True).over(window)

    df = df.withColumn('heart_rate', filled_column)

    return df
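The (-20, 0) frame above only looks back over the previous 20 rows. A hedged variant, if the entire history should be searched for the last known heart rate, could use an unbounded frame instead (hypothetical helper name):

from pyspark.sql.functions import last
from pyspark.sql.window import Window

def fill_activity_na_unbounded(df):
    # Same forward fill, but the frame starts at the beginning of the ordered data.
    window = Window.orderBy('timestamp').rowsBetween(Window.unboundedPreceding, 0)
    return df.withColumn('heart_rate',
                         last(df['heart_rate'], ignorenulls=True).over(window))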
Example #31
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions
        self.assertEqual((0, u'99'),
                         tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
        self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
        self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
Example #32
    def last(self):
        """
        Compute last of group values.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby
        """
        return self._reduce_for_stat_function(
            lambda col: F.last(col, ignorenulls=True), only_numeric=False)
Example #33
    def last(self):
        """
        Compute last of group values.

        See Also
        --------

        koalas.DataFrame.groupby
        """
        return self._reduce_for_stat_function(lambda col: F.last(col, ignorenulls=True),
                                              only_numeric=False)
Example #34
# COMMAND ----------

from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show() # 4070


# COMMAND ----------

from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364


# COMMAND ----------

from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()


# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show() # 5176450


# COMMAND ----------