def get_cdf(df, variable, col_name):
    """Return the empirical CDF of `variable` as a distinct two-column DataFrame."""
    cdf = (
        df.select(variable)
        .na.drop()
        .withColumn(
            col_name,
            funcs.cume_dist().over(Window.orderBy(variable))
        )
        .distinct()
    )

    return cdf
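
# A minimal usage sketch, assuming `funcs` is pyspark.sql.functions, an active
# SparkSession named `spark`, and a hypothetical toy column `x`:
from pyspark.sql import functions as funcs
from pyspark.sql.window import Window

toy_df = spark.createDataFrame([(1.0,), (2.0,), (2.0,), (5.0,)], ["x"])
get_cdf(toy_df, "x", "cdf_x").show()
# x=1.0 -> 0.25, x=2.0 -> 0.75 (three of four rows are at or below 2.0), x=5.0 -> 1.0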
Example 2
def fit_distribution(df):
    raw_impressions = [
        547788492, 113155690, 357229507, 353837519, 231243807, 343536283,
        204197454, 298400019, 154702872, 101595992, 173078606, 245617500,
        210661722, 94400758, 100694511, 104621562, 47302450, 254680986,
        38653660, 118198547, 167100705, 484483594, 69681730, 230778513,
        109614764, 86169134, 125825905, 478045002, 476281884, 155086936,
        100034338, 140623283, 132801777, 150906980, 108772611, 2165682,
        41589670, 327990489, 85909446, 349940682, 8776893, 33502930, 282401283,
        82297276, 385473488, 219411532, 154739307, 51048940, 192605283,
        114587775, 230422182, 41743162, 103709711, 171397519, 158854098,
        105911195, 118981954, 78965914, 91906274, 158792685, 63487656,
        54706539, 111072455, 92442258, 150615474, 79697857, 108585993,
        112360549, 262424361, 494715712, 1152693549, 1035303850, 324325907,
        921851042, 390727201, 1257338071, 392629713, 778974819, 129782245,
        1683290505, 811910155, 1997598872
    ]
    total_impressions = sum(raw_impressions)
    region_percentage = {
        i + 1: float(x) / float(total_impressions)
        for i, x in enumerate(raw_impressions)
    }
    cum_percentage = list(np.cumsum(list(region_percentage.values())))
    cum_pairs = list(zip([0] + cum_percentage[:-1], cum_percentage))
    cum_map = {x: i + 1 for i, x in enumerate(cum_pairs)}
    df1 = df.withColumn('uid', fn.monotonically_increasing_id())
    w = Window.partitionBy('uckey').orderBy('uid')
    df2 = df1.withColumn('cum_id', fn.cume_dist().over(w))

    def lookup_ipl(value):
        for pair in cum_pairs:
            low, high = pair
            if low <= value < high or value == 1.0:
                return cum_map[pair]
        return -1

    _udf_1 = fn.udf(lookup_ipl, t.IntegerType())
    df3 = df2.withColumn('index', _udf_1(fn.col('cum_id')))
    df3 = df3.drop(df3.uid).drop(df3.cum_id)
    return df3
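
# A minimal usage sketch, assuming an active SparkSession named `spark`,
# `fn` = pyspark.sql.functions, `t` = pyspark.sql.types, numpy imported as `np`,
# and a hypothetical DataFrame carrying the expected `uckey` column:
toy = spark.createDataFrame([('u1',), ('u1',), ('u1',), ('u2',)], ['uckey'])
fit_distribution(toy).show()
# each row gains an `index` column: the region ID whose cumulative impression
# share interval contains that row's cume_dist value within its uckey partition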
Example 3
def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = max(total_count // 500, 1)  # integer step; avoid a zero modulus for small datasets
    ws = Window.orderBy('bucket')
    cdf = cdf.select(cdf['bucket'],
                     cume_dist().over(ws).alias('c'),
                     row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for d in collapsed_data:
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0,        'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0,        'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0,        'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0,        'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()
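
# A small standalone sketch of the c -> p differencing above, using the example
# cumulative values from the docstring:
c_values = [0.00126056, 0.00372313, 0.00430616]
p_values = [c - prev for prev, c in zip([0.0] + c_values[:-1], c_values)]
# p_values ~= [0.00126056, 0.00246257, 0.00058303], matching the `p` column up to rounding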
import numpy as np
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist, collect_list

sc = sparkSession.sparkContext
sqlContext = SQLContext(sc)

np.random.seed(1)

keys = ["foo"] * 10 + ["bar"] * 100
values = np.hstack([np.random.normal(0, 1, 10), np.random.normal(10, 1, 100)])

df = sqlContext.createDataFrame([{
    "k": k,
    "v": round(float(v), 3)
} for k, v in zip(keys, values)])

w = Window.partitionBy(df.k).orderBy(df.v)

df2 = df.select("k", "v", cume_dist().over(w).alias("cume_dists"))
df2.show()

# collect_list as a window function: running list of `v` per key, ordered by `v`
df3 = df.select("k", "v", collect_list("v").over(w).alias("vs"))

df = sqlContext.createDataFrame([
    ('1', '02', '3', '[6]'),
    ('1', '02', '3.1', '[6]'),
    ('1', '02', '3.2', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
], ['c1', 'c2', 'c3', 'c4'])  # placeholder column names
# ## Examples of various window functions

# Create a simple DataFrame:
data = [(100, ), (95, ), (95, ), (88, ), (73, ), (73, )]
df = spark.createDataFrame(data, ["score"])
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
ws = Window.orderBy(desc("score"))

from pyspark.sql.functions import row_number, cume_dist, ntile
df.select("score", row_number().over(ws).alias("row_number")).show()
df.select("score", cume_dist().over(ws).alias("cume_dist")).show()
df.select("score", ntile(2).over(ws).alias("ntile(2)")).show()

from pyspark.sql.functions import rank, dense_rank, percent_rank
df.select("score",
          rank().over(ws).alias("rank"),
          dense_rank().over(ws).alias("dense_rank")).show()
df.select("score", percent_rank().over(ws).alias("percent_rank")).show()

from pyspark.sql.functions import lag, lead
df.select("score",
          lag("score", count=1).over(ws).alias("lag"),
          lead("score", count=2).over(ws).alias("lead")).show()

# ## Compute mean star rating over last five rides
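# A minimal sketch of the computation named above, assuming a hypothetical rides
# DataFrame with `driver_id`, `ride_ts` and `star_rating` columns (all names are assumptions):
from pyspark.sql.functions import avg

rides = spark.createDataFrame(
    [("d1", 1, 5.0), ("d1", 2, 4.0), ("d1", 3, 3.0),
     ("d1", 4, 5.0), ("d1", 5, 4.0), ("d1", 6, 2.0)],
    ["driver_id", "ride_ts", "star_rating"])
# window covering the current ride and the four preceding ones, per driver
ws_last5 = (Window.partitionBy("driver_id")
            .orderBy("ride_ts")
            .rowsBetween(-4, Window.currentRow))
rides.select("driver_id", "ride_ts", "star_rating",
             avg("star_rating").over(ws_last5).alias("mean_last_5")).show()
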
Example 6
    clusters = KMeans.train(df_stores_rdd.map(
        lambda p: [p[index] for index in store_group_col_index]),
                            cluster_number,
                            maxIterations=200)
    df_stores_rdd = df_stores_rdd.zip(
        clusters.predict(
            df_stores_rdd.map(
                lambda p: [p[index] for index in store_group_col_index]))).map(
                    lambda p: p[0] + [p[1]])
    df_stores = spark.createDataFrame(
        df_stores_rdd,
        StructType(df_stores.schema.fields +
                   [StructField('cluster_val', IntegerType(), True)]))
    df_stores = df_stores.withColumn('rand', F.rand())
    window = Window.partitionBy('cluster_val').orderBy('rand')
    df_stores = df_stores.withColumn('cum_dist', F.cume_dist().over(window))
    ## only clusters with at least 2 stores are valid groups to sample from
    valid_groups = df_stores.groupBy([
        'cluster_val'
    ]).count().filter('count>=2').select('cluster_val').collect()
    valid_groups = [i[0] for i in valid_groups]
    df_stores = spark.createDataFrame(
        df_stores.rdd.map(define_group),
        StructType(df_stores.schema.fields +
                   [StructField('group_val', StringType(), True)]))
    header_stores = ['store_id', 'avg_hhi', 'avg_traffic']
    df_stores.repartition(1).write.parquet(df_stores_loc, mode='overwrite')
    df_stores_join = df_stores.select(header_stores + ['group_val'])
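
    # A hypothetical illustration of the cume_dist-based sampling prepared above
    # (not the original `define_group` helper, which is defined elsewhere):
    # keep a random half of every valid cluster by thresholding `cum_dist`.
    sampled_stores = df_stores.filter(
        F.col('cluster_val').isin(valid_groups) & (F.col('cum_dist') <= 0.5))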

def fit_distribution(df):
    raw_impressions = [
        547788492, 113155690, 357229507, 353837519, 231243807, 343536283,
        204197454, 298400019, 154702872, 101595992, 173078606, 245617500,
        210661722, 94400758, 100694511, 104621562, 47302450, 254680986,
        38653660, 118198547, 167100705, 484483594, 69681730, 230778513,
        109614764, 86169134, 125825905, 478045002, 476281884, 155086936,
        100034338, 140623283, 132801777, 150906980, 108772611, 2165682,
        41589670, 327990489, 85909446, 349940682, 8776893, 33502930, 282401283,
        82297276, 385473488, 219411532, 154739307, 51048940, 192605283,
        114587775, 230422182, 41743162, 103709711, 171397519, 158854098,
        105911195, 118981954, 78965914, 91906274, 158792685, 63487656,
        54706539, 111072455, 92442258, 150615474, 79697857, 108585993,
        112360549, 262424361, 494715712, 1152693549, 1035303850, 324325907,
        921851042, 390727201, 1257338071, 392629713, 778974819, 129782245,
        1683290505, 811910155, 1997598872
    ]
    total_impressions = sum(raw_impressions)
    # calculate the region percentage and store it in a dict of
    # region ID --> percentage of number of impressions in that region ID
    region_percentage = {
        i + 1: float(x) / float(total_impressions)
        for i, x in enumerate(raw_impressions)
    }
    # cumulatively sum up the region percentage's value
    # so that cum_percentage is
    # [region_percentage[0], region_percentage[0] + region_percentage[1], ...]
    cum_percentage = list(np.cumsum(list(region_percentage.values())))
    # form a list of half-open interval tuples [lower_end, upper_end)
    cum_pairs = list(zip([0] + cum_percentage[:-1], cum_percentage))
    # for reverse lookup, map of
    # [lower_end, upper_end) --> region ID
    cum_map = {x: i + 1 for i, x in enumerate(cum_pairs)}
    # add a new column with uid
    # pyspark.sql.functions.monotonically_increasing_id()
    # A column that generates monotonically increasing 64-bit integers.
    # The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. The current
    # implementation puts the partition ID in the upper 31 bits, and the record number within each partition in the
    # lower 33 bits. The assumption is that the data frame has less than 1 billion partitions, and each partition has
    # less than 8 billion records.
    # As an example, consider a DataFrame with two partitions, each with 3 records. This expression would return the
    # following IDs: 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
    df1 = df.withColumn('uid', fn.monotonically_increasing_id())
    # create a window function to partition the df by uckey, and order within each partition by uid
    w = Window.partitionBy('uckey').orderBy('uid')
    # pyspark.sql.functions.cume_dist()
    # Window function: returns the cumulative distribution of values within a window partition, i.e. the fraction of
    # rows that are below the current row.
    # Because the window is ordered by uid, which is unique within each partition, every row is strictly
    # below its successors, so cume_dist becomes a cumulative fraction of the row count.
    # E.g. for a partition of 10 rows, cume_dist is [0.1, 0.2, 0.3, ...], i.e. i / number of rows in the
    # partition for i from 1 to the number of rows.
    # In any partition, the PDF of a single row is 1 / rows because uid is unique,
    # so the CDF of the partition becomes [1 / rows, 2 / rows, ...].
    df2 = df1.withColumn('cum_id', fn.cume_dist().over(w))

    def lookup_ipl(value):
        # naive algorithm, O(n)
        # for each (lower_end, upper_end) pair in the map, find out if the given 'value' is in the range of
        # lower_end and upper_end. If yes, return that region ID
        for pair in cum_pairs:
            low, high = pair
            if low <= value < high or value == 1.0:
                return cum_map[pair]
        return -1

    _udf_1 = fn.udf(lookup_ipl, t.IntegerType())
    df3 = df2.withColumn('index', _udf_1(fn.col('cum_id')))
    df3 = df3.drop(df3.uid).drop(df3.cum_id)
    return df3
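
# A small worked example of the interval lookup above, with toy numbers rather
# than the real impression shares: cumulative shares [0.2, 0.5, 1.0] give the
# intervals [0, 0.2), [0.2, 0.5) and [0.5, 1.0), mapped to region IDs 1, 2 and 3,
# so a cume_dist value of 0.37 falls in the second interval and maps to region 2.
toy_cum = [0.2, 0.5, 1.0]
toy_pairs = list(zip([0] + toy_cum[:-1], toy_cum))
toy_map = {pair: i + 1 for i, pair in enumerate(toy_pairs)}
assert [toy_map[p] for p in toy_pairs if p[0] <= 0.37 < p[1]] == [2]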
Example 8
df.withColumn("percent_rank",percent_rank().over(window_spec)) \
    .show()


#ntile Window Function
'''
ntile(n) distributes the rows of a window partition into n roughly equal buckets
and returns the bucket number. In the example below, 2 is passed to ntile, so
each row is assigned either 1 or 2.
'''
df.withColumn("ntile", ntile(2).over(window_spec)) \
    .show()


#PySpark Window Analytic functions
#cume_dist Window Function
'''
cume_dist() returns the cumulative distribution of values within a window
partition, i.e. the fraction of rows with values at or below the current row's value.
'''
df.withColumn("cume_dist", cume_dist().over(window_spec)) \
    .show()

#lag Window Function
df.withColumn("lag", lag("salary", 2).over(window_spec)) \
    .show()

#lead Window Function
df.withColumn("lead", lead("salary", 2).over(window_spec)) \
    .show()

#PySpark Window Aggregate Functions
windowSpecAgg  = Window.partitionBy("department")
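
# A minimal sketch of aggregate functions evaluated over windowSpecAgg, assuming
# the same employee DataFrame (with `department` and `salary` columns) and the
# `window_spec` window used in the examples above:
from pyspark.sql.functions import col, avg, sum, min, max, row_number

df.withColumn("row", row_number().over(window_spec)) \
    .withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
    .withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
    .withColumn("min", min(col("salary")).over(windowSpecAgg)) \
    .withColumn("max", max(col("salary")).over(windowSpecAgg)) \
    .where(col("row") == 1) \
    .select("department", "avg", "sum", "min", "max") \
    .show()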