def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item.
    The ratios are applied on the timestamp column which is divided accordingly into
    several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            the training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios do not sum to 1, they will be normalized.
            Earlier-indexed splits contain earlier times
            (e.g. the latest time in split[0] <= the earliest time in split[1]).
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
            seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
                "rank"
            )
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits
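
A minimal usage sketch (not from the original source): it assumes a local SparkSession, hypothetical rating data, and that the module-level imports used by spark_timestamp_split (numpy, the pyspark functions, process_split_ratio) are available.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical ratings; column names are passed explicitly so the sketch does not
# depend on the DEFAULT_* constants of the surrounding module.
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 1546300800), (1, 11, 3.0, 1546387200),
     (2, 10, 5.0, 1546473600), (2, 12, 2.0, 1546560000)],
    ["userID", "itemID", "rating", "timestamp"],
)

train, valid, test = spark_timestamp_split(
    ratings, ratio=[0.7, 0.2, 0.1],
    col_user="userID", col_item="itemID", col_timestamp="timestamp",
)
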
Example #2
 def topPlayersOVA(self, data: DataFrame):
     """
     This function finds the top 10 players for each position based on the OVA indicator
     :param data: A dataframe
     :return: A dataframe with the top 10 players for each position
     """
     try:
         df = data.select(F.col(Name), F.col(OVA), F.col(Single_position).alias(Position),
                          F.row_number().over(Window.partitionBy(Single_position)
                                              .orderBy(F.col(OVA).desc())).alias(row_number))\
             .filter(F.col(row_number).isin(top10)).drop(row_number)
         return df
     except Exception as ex:
         print(ex)
Example #3
def to_friday(df, dt):
    '''Convert all days to Fridays.

    Note that this is not a forward mapping, since Saturday and Sunday are mapped backward.
    This is fine because all the data are supposed to be on "business days".
    To be consistent over the weekends, use the timestamps.'''
    cols = df.columns
    # Convert all days to Friday of the week
    df = df.withColumn('friday', F.date_add(F.date_trunc('week', dt), 4))
    # Keep only the last record in each week
    w = Window.partitionBy('friday').orderBy(F.col(dt).desc())
    df = df.withColumn('rn', F.row_number().over(w)).where(F.col('rn') == 1)
    df = df.drop(dt).withColumnRenamed('friday', dt)
    return df.select(*cols)
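
A small usage sketch under stated assumptions (local SparkSession, ISO date strings, and the module imports used by to_friday):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

daily = spark.createDataFrame(
    [("2021-03-01", 1.0), ("2021-03-03", 2.0), ("2021-03-05", 3.0)],
    ["dt", "value"],
).withColumn("dt", F.to_date("dt"))

# All three rows fall in the same week, so only the latest record survives,
# stamped with that week's Friday (2021-03-05).
weekly = to_friday(daily, "dt")
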
Example #4
def create_race_dimention(cities_fact_df):
    try:
        # Select distinct race types
        race_dim_df = cities_fact_df.select("Race").distinct()

        # Assign unique id for each race type as race_type_id
        race_window = Window.orderBy(f.col("Race"))
        race_dim_df = race_dim_df.withColumn("race_type_id",
                                             f.row_number().over(race_window))

        return race_dim_df

    except Exception as e:
        print(str(e))
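
A usage sketch with hypothetical data, assuming the fact DataFrame only needs a string column named "Race" and that the module imports f and Window are in scope:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

cities_fact_df = spark.createDataFrame(
    [("Boston", "Marathon"), ("Berlin", "10K"), ("Boston", "10K")],
    ["City", "Race"],
)

# Two distinct race types -> two rows, each assigned a race_type_id of 1 or 2.
race_dim_df = create_race_dimention(cities_fact_df)
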
Example #5
def find_otp_bus_legs_actual_start_time(otp_legs_df, clean_bus_trips_df):
    w = Window.partitionBy(
        ['date', 'user_trip_id', 'itinerary_id', 'route',
         'from_stop_id']).orderBy(['timediff'])
    return otp_legs_df \
        .withColumn('stopPointId', F.col('from_stop_id')) \
        .join(clean_bus_trips_df, ['date','route','stopPointId'], how='inner') \
        .na.drop(subset=['timestamp']) \
        .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('otp_start_time')))) \
        .drop('otp_duration') \
        .withColumn('rn', F.row_number().over(w)) \
        .where(F.col('rn') == 1) \
        .select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','otp_start_time','timestamp','to_stop_id','otp_end_time']) \
        .withColumnRenamed('timestamp','from_timestamp')
Example #6
def get_source_upvotes(memes_source, upvotes_column, source):
    """Get upvotes_centile and save upvotes to hdfs (for hour predictions)"""

    df = memes_source
    df.select(upvotes_column).write.format("com.databricks.spark.csv").mode(
        "overwrite").save('hdfs:///upvotes/{0}'.format(source))
    n = df.count()
    w = Window.orderBy(upvotes_column)
    df = df.withColumn('upvotes_centile',
                       functions.round(row_number().over(w) / n, 5))
    df = df.select('id',
                   functions.col(upvotes_column).alias('upvotes'),
                   'upvotes_centile', 'timestamp')
    return df
Example #7
def top_10_addons_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the number of users in the past week who have used the top N addons,
        broken down by country.

        Parameters:
        data - The main ping server.
        date - The day for which you want to get the top N addons.
        topN - the number of addons to get.
        period - number of days to use to calculate metric
        country_list - a list of country names in string

        Returns:
        Dataframe containing the number of users using each of the addons:
        submission_date_s3, country, addon_id, addon_name, pct_with_addon
    """
    addon_filter = (~col('addon.is_system')) & (~col('addon.foreign_install')) & \
        (~col('addon.addon_id').isin(NON_MOZ_TP)) & (~col('addon.addon_id').like('%@mozilla%')) &\
        (~col('addon.addon_id').like('%@shield.mozilla%')) &\
        (~col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all.filter((col('submission_date_s3') > begin) &
                          (col('submission_date_s3') <= date))\
        .groupBy('country')\
        .agg(lit(date).alias('submission_date_s3'),
             F.countDistinct('client_id').alias('wau'))

    counts = data_all.select('submission_date_s3', 'country', 'client_id',
                             F.explode('active_addons').alias('addon'))\
        .filter((col('submission_date_s3') > begin) &
                (col('submission_date_s3') <= date))\
        .filter(addon_filter)\
        .select('country', 'client_id', 'addon.addon_id', 'addon.name')\
        .distinct()\
        .groupBy('country', 'addon_id')\
        .agg(F.count('*').alias('number_of_users'), F.last('name').alias('name'))\
        .select('*', lit(date).alias('submission_date_s3'),
                lit(begin).alias('start_date'),
                F.row_number().over(Window.partitionBy('country')
                                    .orderBy(desc('number_of_users'))
                                    .rowsBetween(Window.unboundedPreceding, Window.currentRow))
                              .alias('rank'))\
        .filter(col('rank') <= topN)

    return counts.join(F.broadcast(wau), on=['country'], how='left')\
        .select(lit(date).alias('submission_date_s3'), 'country',
                'addon_id', col('name').alias('addon_name'),
                (100.0 * col('number_of_users') / col('wau')).alias('pct_with_addon'))
Example #8
def mergeCdcData(mergedDfTarget, primaryKeyArray, orderByField):
    """ Combine data and get the latest row out for a primary key based on the change_order_by key
    :param mergeddfTarget: dataframe with old and new records
    :param primary_key: database primary
    :param change_order_by: by time - last updated timestamp
    :return: dataframe - with the merged records
    """
    win = Window().partitionBy(primaryKeyArray).orderBy(F.col(orderByField).desc())
    mergedTargetRanked = mergedDfTarget.withColumn("order_inc_rank", F.row_number().over(win))
    mergeTargetRankedWithoutDuplicates = mergedTargetRanked.dropDuplicates()
    rowsWithLatestUpdatesDf = mergeTargetRankedWithoutDuplicates.filter(
        mergeTargetRankedWithoutDuplicates.order_inc_rank == 1)
    rowsWithLatestUpdatesDf = rowsWithLatestUpdatesDf.drop("order_inc_rank")
    return rowsWithLatestUpdatesDf
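
A small CDC-style usage sketch with hypothetical rows; for each id only the record with the newest updated_at should survive:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

cdc_df = spark.createDataFrame(
    [(1, "alice",  "2021-01-01 10:00:00"),
     (1, "alice2", "2021-01-02 10:00:00"),
     (2, "bob",    "2021-01-01 09:00:00")],
    ["id", "name", "updated_at"],
)

# Keeps (1, "alice2", ...) and (2, "bob", ...).
latest_df = mergeCdcData(cdc_df, ["id"], "updated_at")
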
Example #9
def get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df):
    """ Get data that has been submitted to ListenBrainz but is missing from MusicBrainz.

        Args:
            partial_listens_df (dataframe): dataframe of listens.
            msid_mbid_mapping_df (dataframe): msid->mbid mapping. For columns refer to
                                              msid_mbid_mapping_schema in listenbrainz_spark/schema.py

        Returns:
            missing_musicbrainz_data_itr (iterator): Data missing from MusicBrainz.
    """
    condition = [
        partial_listens_df.track_name_matchable == msid_mbid_mapping_df.msb_recording_name_matchable,
        partial_listens_df.artist_name_matchable == msid_mbid_mapping_df.msb_artist_credit_name_matchable
    ]

    df = partial_listens_df.join(msid_mbid_mapping_df, condition, 'left') \
                           .select('artist_msid',
                                   'artist_name',
                                   'listened_at',
                                   'recording_msid',
                                   'release_msid',
                                   'release_name',
                                   'track_name',
                                   'user_name') \
                           .where(col('msb_recording_name_matchable').isNull() &
                                  col('msb_artist_credit_name_matchable').isNull())

    current_app.logger.info('Number of (artist, recording) pairs missing from mapping: {}'.format(df.count()))
    window = Window.partitionBy('user_name').orderBy(col('listened_at').desc())

    # limiting listens to 200 for each user so that messages don't drop
    # Also, we don't want to overwhelm users with the data that they
    # have submitted to LB and should consider submitting to MB.
    # The data will be sorted on "listened_at"

    missing_musicbrainz_data_itr = df.groupBy('artist_msid',
                                              'artist_name',
                                              'recording_msid',
                                              'release_msid',
                                              'release_name',
                                              'track_name',
                                              'user_name') \
        .agg(func.max('listened_at').alias('listened_at')) \
        .withColumn('rank', row_number().over(window)) \
        .where(col('rank') <= 200) \
        .toLocalIterator()

    return missing_musicbrainz_data_itr
Example #10
def run_ALS(path_prefix, path_results, path_train, nr_predictions=300):
    behaviors_presel_df, behaviors_df, preselection_df = read_data(path_prefix)
    _, behaviors_train_df, preselection_train_df = read_data(path_train)
    max_index_behaviors_df = behaviors_df.groupby().max(
        'index').first().asDict()['max(index)']
    behaviors_train_df = behaviors_train_df.withColumnRenamed('index', 'i2')\
                                           .withColumn('index', F.col('i2') + max_index_behaviors_df)\
                                           .drop('i2')
    behaviors_train_df = behaviors_train_df.cache()
    behaviors_df_train = behaviors_train_df.select(
        'index', 'item_id', 'rating',
        'user').union(behaviors_df.select('index', 'item_id', 'rating',
                                          'user'))

    model, als = train_ALS_model(behaviors_df_train,
                                 rank=100,
                                 alpha=10,
                                 reg_par=0.1,
                                 max_iter=10)

    index_subset = behaviors_df_train.filter(
        F.col('index') <= max_index_behaviors_df)

    index_recs = model.recommendForUserSubset(index_subset, nr_predictions)
    indexitem_recs_df = index_recs.select(
        'index', F.explode(index_recs.recommendations))

    indexitem_df = indexitem_recs_df.select(
        'index',
        F.col('col.item_id').alias('item_id'),
        F.col('col.rating').alias('prediction'))

    indexitem_predictions_df = indexitem_df.join(preselection_df.select(
        'item_id', 'index', 'item').distinct(), ['item_id', 'index'],
                                                 how='inner')

    indexitem_predictions_df = indexitem_predictions_df.withColumn(
        'rank',
        F.row_number().over(
            Window.partitionBy('index').orderBy(F.desc('prediction'))))
    indexitem_predictions_df = indexitem_predictions_df.withColumn('dic', F.create_map(['item', 'rank']))\
                                                       .select('index', 'dic')
    indexitem_predictions_df = indexitem_predictions_df.groupby('index').agg(
        F.collect_list('dic').alias('dic_list'))

    result = get_pred_rank(indexitem_predictions_df, behaviors_presel_df)

    result.repartition(1).write.json(path_results, mode='overwrite')
    return result, model
Example #11
def prepare_client_rows(main_summary):
    """Coalesce client pings into a DataFrame that contains one row for
    each client."""
    in_columns = {
        "client_id", "timestamp",
        "scalar_parent_browser_engagement_total_uri_count",
        "scalar_parent_browser_engagement_unique_domains_count",
        "subsession_length"
    }
    out_columns = (set(main_summary.columns) | {
        "usage_seconds", "total_uri_count", "unique_domains_count_per_profile"
    })
    assert (in_columns <= set(main_summary.columns))

    # Get the newest ping per client and append to original dataframe
    window_spec = (Window.partitionBy(F.col('client_id')).orderBy(
        F.col('timestamp').desc()))
    newest_per_client = (main_summary.withColumn(
        'client_rank',
        F.row_number().over(window_spec)).where(F.col("client_rank") == 1))

    # Compute per client aggregates lost during newest client computation
    select_expr = utils.build_col_expr({
        "client_id":
        None,
        "total_uri_count":
        (F.coalesce("scalar_parent_browser_engagement_total_uri_count",
                    F.lit(0))),
        "unique_domains_count":
        (F.coalesce("scalar_parent_browser_engagement_unique_domains_count",
                    F.lit(0))),
        # Clamp broken subsession values to [0, MAX_SUBSESSION_LENGTH].
        "subsession_length": (F.when(
            F.col('subsession_length') > MAX_SUBSESSION_LENGTH,
            MAX_SUBSESSION_LENGTH).otherwise(
                F.when(F.col('subsession_length') < 0,
                       0).otherwise(F.col('subsession_length'))))
    })

    per_client_aggregates = (main_summary.select(
        *select_expr).groupby('client_id').agg(
            F.sum('subsession_length').alias('usage_seconds'),
            F.sum('total_uri_count').alias('total_uri_count'),
            F.avg('unique_domains_count').alias(
                'unique_domains_count_per_profile')))

    # Join the two intermediate datasets
    return (newest_per_client.join(per_client_aggregates, 'client_id',
                                   'inner').select(*out_columns))
Example #12
    def evaluate_batch(self, test_ratings, weighted_ratings, users):
        total_users = test_ratings.select("user_id").distinct().count()
        weighted_ratings.cache()
        ratingsPredicted = test_ratings.select(
            "user_id", "business_id",
            col("stars").cast("float")).join(weighted_ratings,
                                             ["user_id", "business_id"])
        #     predictedVsActual.orderBy("user_id").show()
        #     print(ratingsPredicted.groupBy("user_id", "business_id")\
        #                         .agg(F.count("*").alias("count")).where(col("count")>1).count())

        total = test_ratings.count()
        predicted = ratingsPredicted.count()
        print("Total {}".format(total), " Predicted {}".format(predicted))
        coverage = predicted * 100 / total

        evaluator = RegressionEvaluator(metricName="mae",
                                        labelCol="stars",
                                        predictionCol="pred_stars")
        mae = evaluator.evaluate(ratingsPredicted)

        precision = {}
        precision_liked = {}
        for i in [5, 10]:
            wi = Window.partitionBy("user_id").orderBy(
                col("pred_stars").desc())
            topi = weighted_ratings.withColumn(
                "rn",
                F.row_number().over(wi)).where(col("rn") <= i).drop("rn")
            #         topi.show(100)
            topratings = topi.join(test_ratings.select("user_id", "business_id", "stars"),
                                   ["user_id", "business_id"]) \
                .join(users.select("user_id", "average_stars"), "user_id")
            topratings = topratings.withColumn("liked", col("stars") >= col("average_stars")) \
                .withColumn("pred_liked",
                            col(
                                "pred_stars") >= col(
                                "average_stars"))
            topratings.cache()
            hits = topratings.count()
            hits_liked = topratings.where(
                col("liked") == col("pred_liked")).count()
            precision[i] = hits / (i * total_users)
            precision_liked[i] = hits_liked / (i * total_users)
            print("Hits : {} ".format(hits),
                  " Hits Liked: {} ".format(hits_liked))
            topratings.unpersist()
        weighted_ratings.unpersist()
        return mae, coverage, precision, precision_liked
def analysis(folder_name):
    tweets = spark.read.load("hdfs:///user/maria_dev/project/data/" + folder_name + "/clean_data.csv",
                    format="csv", sep=",", inferSchema="true", header="true", encoding="utf-8")

    # parse date type
    tweets = tweets.withColumn("date", to_date("date"))
    
    # mention counts per date
    tweets_num = tweets.groupBy("date").count().orderBy("date", ascending=0)
    tweets_num = tweets_num.na.drop()

    # flatten word
    tweets = tweets.withColumn("word", explode(split("word", '\\|')))

    # word count per date
    tweets = tweets.groupBy(["word", "date"]).count().orderBy(["date", "count"], ascending=[0, 0])

    # split into word and pos
    split_col = split(tweets.word, "\\,")
    tweets = tweets.withColumn("split_word", split_col.getItem(0))
    tweets = tweets.withColumn("pos", split_col.getItem(1))

    # determine positive/negative sentiment
    udf_calc = udf(get_sentiment, StringType())
    tweets = tweets.withColumn("sentiment", udf_calc(col("split_word"), col("pos")))

    # calculate positive/negative percentages
    pos_percentage = tweets.groupBy(["date", "sentiment"]).sum().withColumnRenamed("sum(count)", "count")\
                        .withColumn("total", sum("count").over(Window.partitionBy("date")))\
                        .withColumn("percent", (col("count") / col("total")) * 100)\
                        .filter(col("sentiment").isin(["POS", "NEG"]))\
                        .select(["date", "sentiment", "percent"])\
                        .orderBy(["date", "percent"], ascending=[0,0])

    # extract the 15 most frequent words per date
    w = Window().partitionBy("date").orderBy(col("count").desc())
    tweets = tweets.withColumn("rn", row_number().over(w)).where(col("rn") <= 15)\
                    .select("split_word", "date", "count", "pos", "sentiment")\
                    .orderBy(["date", "rn"], ascending=[0,1])

    # append '다' to predicates (verbs/adjectives)
    tweets = tweets.withColumn("split_word",\
                    when((tweets.pos == 'PV') | (tweets.pos == 'PA'), concat(col("split_word"), lit("다"))).otherwise(tweets.split_word))

    # handle tokens the NLP step missed ('김정' -> '김정은')
    tweets = tweets.withColumn("split_word",\
                    when(tweets.split_word == '김정', concat(col("split_word"), lit("은"))).otherwise(tweets.split_word))

    import_data(folder_name, tweets, pos_percentage, tweets_num)
Example #14
def process_song_data(spark, input_data, output_data):
    """Extracts, transforms song data available in s3 and
    loads it into two tables: songs_table, artists_table. The tables
    are stored as parquet files in s3.

    Args:
        spark: SparkSession, the entry point to programming Spark with the Dataset
            and DataFrame API.
        input_data: A path to the song_data
        output_data: the location where the songs and artists tables will be stored
    """

    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file
    df = spark.read.json(song_data, schema)

    # extract columns to create songs table
    songs_table_cols = ["song_id",
                        "title",
                        "artist_id",
                        "year",
                        "duration"]
    songs_table = df[songs_table_cols].na.\
        drop(subset=["song_id"]).dropDuplicates(subset=["song_id"])

    # write songs table to parquet files partitioned by year and artist
    output_songs = output_data+"songs"
    songs_table.write.mode("overwrite").partitionBy("year", "artist_id").\
        parquet(output_songs)

    artists_cols = ["artist_id",
                    "artist_name",
                    "artist_location",
                    "artist_latitude",
                    "artist_longitude"]

    # Rank the artists in order to be able to choose one in case of duplicates
    window = Window.partitionBy("artist_id").orderBy("year")
    ranked_artists = df.withColumn("row_num", row_number().over(window))

    # extract columns to create artists table
    artists_table = ranked_artists.filter(ranked_artists.row_num == 1).\
        select(artists_cols)

    # write artists table to parquet files
    output_artists = output_data+"artists"
    artists_table.write.mode("overwrite").parquet(output_artists)
Example #15
    def transform(self):
        self.df_order_table = self.df_order_table.withColumn(
            'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
        self.df_order_table = self.df_order_table.withColumn(
            'order_month', func.month(col('order_datetime')))

        df_filter_cust = self.df_customer_table.where(col('age') > 18)
        ###inner join
        df_order_customer = self.df_order_table.join(
            df_filter_cust,
            on=(self.df_order_table['customer_id'] ==
                df_filter_cust['customer_id']),
            how='inner').select(df_filter_cust['customer_id'],
                                self.df_order_table['order_id'],
                                self.df_order_table['order_month'],
                                self.df_order_table['amount'])

        # total sales amount for each month of each customer who are greater than age 18
        wind = Window.partitionBy('customer_id', 'order_month')

        df_order_customer = df_order_customer.withColumn(
            'total_sale',
            func.sum(col('amount')).over(wind))

        df_order_customer = df_order_customer.distinct()
        df_order_customer.show()

        ### list the customer_id and the second order_id of customers who placed more than 2 orders in the last 20 days
        ########################
        wind = Window.partitionBy('customer_id', 'order_date').orderBy(
            func.col('order_id').asc())
        df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))

        df_temp = df_temp.withColumn(
            'current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))

        df_temp = df_temp.withColumn(
            'diff_days', func.datediff('current_date', 'order_date'))

        df_temp = df_temp.withColumn(
            "diff",
            when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
        df_temp = df_temp.where(col('diff') == 1)
        wind = Window.partitionBy('customer_id')
        df_temp = df_temp.withColumn('count',
                                     func.count('order_id').over(wind))
        df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))

        df_temp.show()
Example #16
def windowing(df, batch_size):
    """
    Args:
        df: dataframe to perform windowing on
        batch_size: number of rows per batch
    """
    if "timestamp" not in df.columns:
        raise ValueError("timestamp column not found!")
    df = df.withColumn("timestamp_1", F.unix_timestamp(F.col("timestamp")))
    window_spec = Window.orderBy("timestamp_1")
    return df.withColumn(
        "batch_id",
        F.floor(
            (F.row_number().over(window_spec) - F.lit(1)) / int(batch_size)),
    )
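
A usage sketch with hypothetical events, assuming a local SparkSession and the module imports used by windowing:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

events = spark.createDataFrame(
    [("2021-01-01 00:00:00",), ("2021-01-01 00:05:00",), ("2021-01-01 00:10:00",)],
    ["timestamp"],
).withColumn("timestamp", F.to_timestamp("timestamp"))

# With batch_size=2, the first two rows get batch_id 0 and the third gets batch_id 1.
batched = windowing(events, batch_size=2)
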
Example #17
def main_category_data(df, output_path, s3_partition):
    '''
    Creates a dataframe of product main categories to serve as a dimension table (normalized).
    Selects distinct main categories and drops duplicates.
    Creates main_cat_id with a window function.
    Writes a .parquet file to S3.
    '''
    window = Window.orderBy(col('main_cat'))
    main_cat_table = df.select(['main_cat']) \
                        .where(col('main_cat').isNotNull()) \
                        .dropDuplicates() \
                        .withColumn('main_cat_id', row_number().over(window))
    main_cat_table.repartition(5).write.parquet(
        output_path + 'main_category/' + s3_partition, 'overwrite')
    return main_cat_table
    def _remove_duplicate_indexes(self):
        """
        Removes duplicate recipes by randomly selecting one if duplicated.

        :return:
        """

        window = Window \
            .partitionBy([self.index_column]) \
            .orderBy(f.rand())

        self.df_labels = self.df_labels\
            .withColumn('rn', f.row_number().over(window))\
            .filter(f.col('rn') == 1)\
            .drop('rn')
Example #19
def overall_prediction_grouping(csv):
    """
    Grouping dataset by date
    :param csv: -- dataframe: containing all the data
    :return: -- dataframe: grouped
    """
    grouped = csv.groupby('Date').agg({'Date': 'count'})
    grouped_with_date = grouped.withColumn('Date',
                                           change_to_date_func(col('Date')))
    window_row = Window().orderBy('Date')
    grouped_indexed = grouped_with_date.withColumn(
        'id',
        row_number().over(window_row))

    return grouped_indexed.withColumn('id', to_vector(col('id')))
Example #20
    def generate_variables(dataframe, partition_array, order_array):
        """
        Generates new variables to identify recurrent payments. It calculates the difference of days
        (difference_of_days) between transactions of the same amount made in the same commerce;
        the number of such transactions (nu_payments); the amount paid (impoper_f);
        and the order of the transactions of the same kind made more than once.
        It returns a data frame with this information.
        :param dataframe:
        :param partition_array: ['customer_id', 'commerce_name']
        :param order_array: ['operation_date']
        :return: Dataframe identifying recurrent payments according to the order of the transactions.
        """

        utils = AnalyticUtils
        filtered_window = Window.partitionBy(partition_array).orderBy(
            order_array)
        filtered_descending = Window.partitionBy(fields.customer_id, fields.commerce_affiliation_id) \
            .orderBy(desc(fields.operation_date))
        filtered_by_range = Window.partitionBy(partition_array).orderBy(order_array) \
            .rowsBetween(-sys.maxsize, sys.maxsize)
        return dataframe.select(
            fields.customer_id, fields.commerce_affiliation_id,
            fields.transaction_amount, fields.operation_date,
            (utils.subtract_days(
                date_format(c(fields.operation_date), 'YYYY-MM-dd'),
                date_format(
                    lag(fields.operation_date).over(filtered_window),
                    'YYYY-MM-dd'))).alias('difference_of_days'),
            lag(fields.transaction_amount).over(filtered_window).alias(
                'impoper_f'),
            utils.retrieve_day(
                date_format(c(fields.operation_date),
                            'YYYY-MM-dd')).alias('day_month'),
            count('*').over(filtered_by_range).alias('nu_payments'),
            row_number().over(filtered_window).alias('order'),
            row_number().over(filtered_descending).alias('order2'))
Example #21
def fill_auto_increment(
        df_existing:pyspark.sql.DataFrame,
        df_new:pyspark.sql.DataFrame,
        autoincrement_column:str):
    # add primary key from existing to new by shared business key
    # returns: df_new with a new primary key column
    #           for new entries, returns an autoincrement value
    w = Window().orderBy(F.lit('dummy'))
    max_id = get_max_value(df_existing,autoincrement_column)
    
    df_combined = df_new.withColumn(autoincrement_column,
        F.when( F.col(autoincrement_column).isNull(), 
                F.row_number().over(w)+max_id)\
                .otherwise(F.col(autoincrement_column)))
    return df_combined
Example #22
def generate_dataset_m(df_assembled, CONFIG_PREPROCESS):
    m_window_spec = Window.partitionBy('CELL_NUM').orderBy('dt')
    m_feat_cols = ['day0_features0']
    m_days_cols = ['day0_features']

    shuffled_with_cell = df_assembled.withColumn(
        'seq',
        row_number().over(m_window_spec)).cache()
    input_m = shuffled_with_cell.withColumnRenamed('features', m_feat_cols[0])
    vector_udt_metadata = input_m.schema[input_m.schema.fields.index(
        input_m.schema[m_feat_cols[0]])].metadata
    # Generate 1 day data (5min * 10 data)
    for i in range(1, CONFIG_PREPROCESS.INPUT_M_SIZE):
        n_features = lead(col(m_feat_cols[0]), i).over(m_window_spec)
        col_name = 'day{}_features{}'.format(0, i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_feat_cols.append('day{}_features{}'.format(0, i))

    input_m = input_m.dropna()

    input_m = VectorAssembler().setInputCols(m_feat_cols).setOutputCol(
        m_days_cols[0]).transform(input_m)
    vector_udt_metadata = input_m.schema[-1].metadata

    # for DAYS_TO_MEMORY(7) days memory in same time zone
    for i in range(1, CONFIG_PREPROCESS.DAYS_TO_MEMORY):
        n_features = lead(col('day0_features'),
                          int(CONFIG_PREPROCESS.ITEMS_PER_DAY *
                              i)).over(m_window_spec)
        col_name = 'day{}_features'.format(i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_days_cols.append('day{}_features'.format(i))

    m_tail_skip_size = CONFIG_PREPROCESS.ITEMS_PER_DAY  # rows to skip, for 1 Day (X & Y)
    inbound = when(
        input_m['seq'] <= (max(input_m['seq']).over(
            m_window_spec.rangeBetween(-sys.maxsize, sys.maxsize)) -
                           m_tail_skip_size), 1).otherwise(0)
    input_m = input_m.dropna().withColumn('inbound',
                                          inbound).filter('inbound == 1')
    input_m = VectorAssembler().setInputCols(
        m_days_cols).setOutputCol('features').transform(input_m).select([
            'dt', 'CELL_NUM', 'features'
        ])  # assemble DAYS_TO_MEMORY days columns into one ('features')

    return input_m
Example #23
def add_incrementral_id(df: DataFrame, id_column: str,
                        partitionby: str) -> DataFrame:
    """Will create a surrogate key that starts from 1 and is ordered
    since monotonically_increasing_id will ve unique but not ordered
    OVER (PARTITION BY . . . ORDER BY . . .)

    :param df: pyspark Dataframe
    :param id_column: name of column
    :param partitionby: column to partition by
    :return: new column with a unique and consistent surrogate key
    """
    return df.withColumn(
        id_column,
        F.row_number().over(
            Window.partitionBy(partitionby).orderBy(
                F.monotonically_increasing_id())))
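
A usage sketch with hypothetical data; the surrogate key restarts at 1 inside each partition value (assumes a local SparkSession and the module imports used above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("US", "a"), ("US", "b"), ("EU", "c")],
    ["region", "value"],
)

# The surrogate key restarts at 1 within each 'region' partition.
keyed = add_incrementral_id(df, id_column="sk", partitionby="region")
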
def addIdCol1(dataDF, idFieldName="continuousID"):
    '''
        :param dataDF: input data table; DataFrame
        :param idFieldName: name of the generated ID column; String
        :return: DataFrame with a continuous, increasing ID column (starting from 1) appended as the last column
    '''
    numParitions = dataDF.rdd.getNumPartitions()
    data_withindex = dataDF.withColumn("increasing_id_temp",
                                       fn.monotonically_increasing_id())
    data_withindex = data_withindex.withColumn(
        idFieldName,
        fn.row_number().over(Window.orderBy("increasing_id_temp")))
    data_withindex = data_withindex.repartition(numParitions)
    data_withindex = data_withindex.sort("increasing_id_temp")
    data_withindex = data_withindex.drop("increasing_id_temp")
    return data_withindex
Example #25
def _topn(df, c, by=None, n=3):
    cnt = f'{c}##cnt'
    rnk = f'{c}##rnk'

    _gcols = [by] if isinstance(by, str) and by else by or []
    s = df.select(*_gcols, c).groupby(*_gcols,
                                      c).agg(F.count(F.col(c)).alias(cnt))

    # calculate topn
    a = s.select(
        *_gcols, c, cnt,
        F.row_number().over(
            Window.partitionBy(*_gcols).orderBy(
                F.col(cnt).desc())).alias(rnk)).filter(F.col(rnk) <= n)

    return a.select(*_gcols, c, cnt)
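
A usage sketch with hypothetical data, assuming the module imports F and Window used by _topn are in scope:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("fr", "paris"), ("fr", "paris"), ("fr", "lyon"), ("de", "berlin")],
    ["country", "city"],
)

# The two most frequent 'city' values per 'country', together with their counts.
top_cities = _topn(df, c="city", by="country", n=2)
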
Example #26
def getTargetStatsByFactors(df, factors, target):
    stats_names = ['mean', 'median', 'variance']
    output_cols = [*factors, *stats_names]

    w = Window.partitionBy(*factors).orderBy(target)
    rank_df = df.withColumn('rank', F.row_number().over(w))

    result_df = df.groupBy(*factors).agg(F.avg(target).alias('mean'),
                             F.var_pop(target).alias('variance'),
                             F.count(F.lit(1)).alias('count')) \
                        .join(rank_df, [*factors]) \
                        .filter(F.col('rank') == F.expr('(count+1)/2').cast(IntegerType())) \
                        .withColumn('median', F.col(target)) \
                        .orderBy(*factors) \
                        .select(*output_cols)
    return result_df
Example #27
def transform_data(df: DataFrame) -> DataFrame:
    """Apply data deduplication.
    
    params:
    df -- Spark DataFrame
    
    outputs:
    Spark DataFrame
    """
    w = Window.partitionBy('id').orderBy(col('update_date').desc())

    df = df.withColumn('rownum', row_number().over(w)) \
        .where(col('rownum') == 1) \
        .drop('rownum')

    return df
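
A usage sketch with hypothetical records; only the newest row per id is kept:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

records = spark.createDataFrame(
    [(1, "old", "2021-01-01"), (1, "new", "2021-02-01"), (2, "only", "2021-01-15")],
    ["id", "payload", "update_date"],
)

# One row per 'id', keeping the record with the latest 'update_date'.
deduped = transform_data(records)
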
Example #28
def run_pyspark_pipeline(dir_path, spark, cores, out_path):
    """ Reads parquet files from 'dir_path' and parses trec_car_tools.Page object to create protobuf with entity
    linking. """

    print('start preprocessing')
    start_preprocess = time.time()

    # Reads parquet files from 'dir_path' - each row is a TREC CAR pages.
    df_in = spark.read.parquet(dir_path)
    df_in.printSchema()
    num_partitions = df_in.rdd.getNumPartitions()
    print("Number of default partitions: {}".format(num_partitions))

    print('end preprocess')
    end_preprocess = time.time()
    print("*** preprocess time: {:.2f}s ***".format(end_preprocess -
                                                    start_preprocess))

    print('start pyspark_processing job')
    start_pyspark_job = time.time()

    if num_partitions < cores * 4:
        print('repartitioning df')
        df_in = df_in.repartition(cores * 4)
        print("Number of partitions should equal 4*cores --> {}".format(
            df_in.rdd.getNumPartitions()))

    @udf(returnType=BinaryType())
    def parse_udf(page_bytearray):
        # Parses trec_car_tools.Page object to create protobuf with entity linking.
        page = pickle.loads(page_bytearray)
        tp = TrecCarParser()
        doc = tp.parse_page_to_protobuf(page=page)
        doc_bytearray = pickle.dumps(doc.SerializeToString())
        return doc_bytearray

    # Add index to DF.
    df_parse = df_in.withColumn("doc_bytearray", parse_udf("page_bytearray"))
    df_parse = df_parse.withColumn(
        "index",
        row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
    df_parse.write.parquet(out_path)

    print('end pyspark_processing job')
    end_pyspark_job = time.time()
    print("*** pyspark_processing job time: {:.2f}s ***".format(
        end_pyspark_job - start_pyspark_job))
Example #29
def spark_app():

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    sql_context = SQLContext(spark)

    # loading *.CSV files as DataFrames
    team_info = sql_context.read.csv("src/team_info.csv", header=True, sep=",")
    game_skater_stats = sql_context.read.csv("src/game_skater_stats.csv",
                                             header=True,
                                             sep=",")
    player_info = sql_context.read.csv("src/player_info.csv",
                                       header=True,
                                       sep=",")

    # joining tables
    data_frame = team_info. \
        join(game_skater_stats, game_skater_stats.team_id == team_info.team_id). \
        join(player_info, game_skater_stats.player_id == player_info.player_id)

    # first task:
    data_frame.groupby('shortName', 'teamName'). \
        agg((f.sum(f.when(data_frame.primaryPosition == 'D', data_frame.goals).otherwise(0)) / f.sum('goals')).alias('METRIC')). \
        sort('METRIC', ascending=False).limit(5).show()

    # additional task
    game = sql_context.read.csv("src/game.csv", header=True, sep=",")
    data_frame = data_frame.join(game,
                                 game_skater_stats.game_id == game.game_id)

    data_frame_2 = data_frame. \
        groupby('shortName', 'teamName', f.year(data_frame.date_time).alias('years')). \
        agg((f.sum(f.when(data_frame.primaryPosition == 'D', data_frame.goals).otherwise(0)) / f.sum('goals')).alias('METRIC'))

    window = Window.partitionBy(f.col('years')).orderBy(
        (f.col('METRIC')).desc())
    data_frame_2.select(f.col('shortName'), f.col('teamName'), f.col('years'),
                        f.col('METRIC'),
                        f.row_number().over(window).alias('row_number')).where(
                            f.col('row_number') <= 5).show()

    spark.stop()
Example #30
    def attach_default_index(sdf, default_index_type=None):
        """
        This method attaches a default index to a Spark DataFrame. Spark does not have the
        index notion, so a corresponding column should be generated.
        There are several types of default index, which can be configured by `compute.default_index_type`.

        >>> spark_frame = ks.range(10).to_spark()
        >>> spark_frame
        DataFrame[id: bigint]

        It adds the default index column '__index_level_0__'.

        >>> spark_frame = InternalFrame.attach_default_index(spark_frame)
        >>> spark_frame
        DataFrame[__index_level_0__: int, id: bigint]

        It throws an exception if the given column name already exists.

        >>> InternalFrame.attach_default_index(spark_frame)
        ... # doctest: +ELLIPSIS
        Traceback (most recent call last):
          ...
        AssertionError: '__index_level_0__' already exists...
        """
        index_column = SPARK_DEFAULT_INDEX_NAME
        assert (index_column not in sdf.columns
                ), "'%s' already exists in the Spark column names '%s'" % (
                    index_column, sdf.columns)

        if default_index_type is None:
            default_index_type = get_option("compute.default_index_type")

        scols = [scol_for(sdf, column) for column in sdf.columns]
        if default_index_type == "sequence":
            sequential_index = (F.row_number().over(
                Window.orderBy(F.monotonically_increasing_id())) - 1)
            return sdf.select(sequential_index.alias(index_column), *scols)
        elif default_index_type == "distributed-sequence":
            return InternalFrame.attach_distributed_sequence_column(
                sdf, column_name=index_column)
        elif default_index_type == "distributed":
            return InternalFrame.attach_distributed_column(
                sdf, column_name=index_column)
        else:
            raise ValueError(
                "'compute.default_index_type' should be one of 'sequence',"
                " 'distributed-sequence' and 'distributed'")
Example #31
def process_song_data(spark: SparkSession, input_data: str,
                      output_data: str) -> None:
    """
    Extracts raw song data from data lake, transforms it to songs and artists table and loads these tables back
    to the data lake.

    :param spark: instance of SparkSession
    :param input_data: data lake path to input data
    :param output_data: data lake path to output data
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # read song data file
    df = spark.read.json(song_data)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration') \
                    .where(col('song_id').isNotNull()) \
                    .drop_duplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    # we need some extra logic to clean up duplicated artist_id records with distinct artist_name; we want to:
    #  1. Remove "featured" names (e.g. "Elton John feat. Sting" or "Elton John featuring Sting");
    #  2. From the remaining records, choose the one with the shortest artist name and try to get a record with
    #     non-null location, latitude and longitude.
    artist_cleanup_ordering = row_number().over(
        Window.partitionBy('artist_id').orderBy(
            length('name'),
            col('location').asc_nulls_last(),
            col('latitude').asc_nulls_last(),
            col('longitude').asc_nulls_last()))
    artists_table = df.select(col('artist_id'), col('artist_name').alias('name'),
                              col('artist_location').alias('location'), col('artist_latitude').alias('latitude'),
                              col('artist_longitude').alias('longitude')) \
                      .where(~col('name').like('%feat.%') & ~col('name').like('%featuring%')) \
                      .withColumn('rn', artist_cleanup_ordering) \
                      .where(col('rn') == 1) \
                      .drop('rn')

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists'))
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # note: row_number() breaks ties arbitrarily, so items with the same rating value are ordered non-deterministically.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
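
A usage sketch with hypothetical ratings; the column names are passed explicitly so the sketch does not depend on the DEFAULT_* constants assumed by the signature:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

ratings = spark.createDataFrame(
    [(1, 10, 5.0), (1, 11, 4.0), (1, 12, 3.0), (2, 10, 3.0)],
    ["userID", "itemID", "rating"],
)

# Top-2 rated items per user, collected into a single prediction column.
top2 = _get_top_k_items(
    ratings, col_user="userID", col_item="itemID",
    col_rating="rating", col_prediction="prediction", k=2,
)
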
Example #33
 def test_window_functions_without_partitionBy(self):
     df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
     w = Window.orderBy("key", df.value)
     from pyspark.sql import functions as F
     sel = df.select(df.value, df.key,
                     F.max("key").over(w.rowsBetween(0, 1)),
                     F.min("key").over(w.rowsBetween(0, 1)),
                     F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
                     F.row_number().over(w),
                     F.rank().over(w),
                     F.dense_rank().over(w),
                     F.ntile(2).over(w))
     rs = sorted(sel.collect())
     expected = [
         ("1", 1, 1, 1, 4, 1, 1, 1, 1),
         ("2", 1, 1, 1, 4, 2, 2, 2, 1),
         ("2", 1, 2, 1, 4, 3, 2, 2, 2),
         ("2", 2, 2, 2, 4, 4, 4, 3, 2)
     ]
     for r, ex in zip(rs, expected):
         self.assertEqual(tuple(r), ex[:len(r)])
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
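
A usage sketch with hypothetical ratings, again passing column names explicitly rather than relying on the DEFAULT_* constants:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

ratings = spark.createDataFrame(
    [(1, 10, 4.0, 100), (1, 11, 5.0, 200), (1, 12, 3.0, 300), (2, 10, 2.0, 150)],
    ["userID", "itemID", "rating", "timestamp"],
)

# The two most recent items per user, collected into the prediction column.
relevant = _get_relevant_items_by_timestamp(
    ratings, col_user="userID", col_item="itemID", col_rating="rating",
    col_timestamp="timestamp", col_prediction="prediction", k=2,
)
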
Example #35
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]);

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark, a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
def spark_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark chronological splitter
    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes proportions of ratings which are specified by the split ratio(s).
    The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            the training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios do not sum to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_timestamp: "count"})
        .withColumnRenamed("count(" + col_timestamp + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )

        splits.append(rating_split)

    return splits
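
A usage sketch with hypothetical ratings, assuming a local SparkSession and the module-level helpers (process_split_ratio, broadcast, numpy) used by spark_chrono_split:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

ratings = spark.createDataFrame(
    [(1, 10, 4.0, 1), (1, 11, 3.0, 2), (1, 12, 5.0, 3),
     (2, 10, 4.0, 1), (2, 13, 2.0, 2)],
    ["userID", "itemID", "rating", "timestamp"],
)

# Per-user chronological 75/25 split; earlier interactions land in the first set.
train, test = spark_chrono_split(
    ratings, ratio=0.75, filter_by="user",
    col_user="userID", col_item="itemID", col_timestamp="timestamp",
)
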
def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count // 500
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0,        'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0,        'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0,        'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0,        'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()