def get_cdf(df, variable, col_name):
    cdf = df.select(variable).na.drop().\
        withColumn(
            col_name,
            funcs.cume_dist().over(Window.orderBy(variable))
        ).distinct()
    return cdf
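# A minimal usage sketch for get_cdf(), assuming the aliases it relies on
# (`funcs` for pyspark.sql.functions and `Window`) and an invented sample
# DataFrame; this driver is not part of the original snippet.
from pyspark.sql import SparkSession, functions as funcs
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
scores = spark.createDataFrame([(10.0,), (20.0,), (20.0,), (40.0,)], ["score"])
# Each distinct score is paired with the fraction of rows at or below it.
get_cdf(scores, "score", "score_cdf").orderBy("score").show()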
def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count // 500  # keep roughly 500 evenly spaced points
    ws = Window.orderBy('bucket')
    cdf = cdf.select(cdf['bucket'],
                     cume_dist().over(ws).alias('c'),
                     row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0, 'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0, 'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0, 'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0, 'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()
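# A hypothetical call sketch for collect_numeric_metric(); the metric
# descriptor shape is inferred from the lookups the function performs
# (metric['src'], metric['id']), while telemetry_df and the module-level
# context it expects (cursor, conn, dataset_id, DEBUG_SQL) are assumed,
# not taken from the source.
DEBUG_SQL = True                             # print the INSERTs instead of executing them
dataset_id = 0                               # placeholder dataset primary key
metric = {'id': 1, 'src': 'active_hours'}    # invented metric descriptor
collect_numeric_metric(metric, telemetry_df, population='All')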
import numpy as np
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import cume_dist
from pyspark.sql.window import Window

sc = sparkSession.sparkContext
sqlContext = SQLContext(sc)

np.random.seed(1)
keys = ["foo"] * 10 + ["bar"] * 10
values = np.hstack([np.random.normal(0, 1, 10), np.random.normal(10, 1, 100)])
# zip() truncates to the 20 keys, so "foo" rows get N(0, 1) draws and
# "bar" rows get the first ten N(10, 1) draws.
df = sqlContext.createDataFrame([{
    "k": k,
    "v": round(float(v), 3)
} for k, v in zip(keys, values)])

w = Window.partitionBy(df.k).orderBy(df.v)
df2 = df.select("k", "v", cume_dist().over(w).alias("cume_dists"))
df2.show()

# collect_list() is used here as a grouped aggregate, not a window expression,
# so it belongs inside agg() rather than being chained with .over(w).
df3 = df.groupBy("k").agg(F.collect_list("v").alias("vs"))

df = sqlContext.createDataFrame([
    ('1', '02', '3', '[6]'),
    ('1', '02', '3.1', '[6]'),
    ('1', '02', '3.2', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
    ('1', '02', '3', '[6]'),
])
# ## Examples of various window functions

# Create a simple DataFrame:
data = [(100, ), (95, ), (95, ), (88, ), (73, ), (73, )]
df = spark.createDataFrame(data, ["score"])
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
ws = Window.orderBy(desc("score"))

from pyspark.sql.functions import row_number, cume_dist, ntile
df.select("score", row_number().over(ws).alias("row_number")).show()
df.select("score", cume_dist().over(ws).alias("cume_dist")).show()
df.select("score", ntile(2).over(ws).alias("ntile(2)")).show()

from pyspark.sql.functions import rank, dense_rank, percent_rank
df.select("score",
          rank().over(ws).alias("rank"),
          dense_rank().over(ws).alias("dense_rank")).show()
df.select("score", percent_rank().over(ws).alias("percent_rank")).show()

from pyspark.sql.functions import lag, lead
df.select("score",
          lag("score", 1).over(ws).alias("lag"),
          lead("score", 2).over(ws).alias("lead")).show()

# ## Compute mean star rating over last five rides
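# The heading above has no code in this excerpt; below is a minimal sketch of
# one way to do it, assuming a hypothetical `rides` DataFrame with
# `driver_id`, `ride_ts`, and `stars` columns (all invented names).
from pyspark.sql.functions import avg

last_five = (Window.partitionBy("driver_id")
                   .orderBy("ride_ts")
                   .rowsBetween(-4, Window.currentRow))  # current ride plus the 4 before it
rides_with_avg = rides.withColumn("mean_stars_last_5", avg("stars").over(last_five))
rides_with_avg.show()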
clusters = KMeans.train(
    df_stores_rdd.map(lambda p: [p[index] for index in store_group_col_index]),
    cluster_number,
    maxIterations=200)

# Attach the predicted cluster ID to each store row.
df_stores_rdd = df_stores_rdd.zip(
    clusters.predict(
        df_stores_rdd.map(
            lambda p: [p[index] for index in store_group_col_index]))).map(
                lambda p: p[0] + [p[1]])
df_stores = spark.createDataFrame(
    df_stores_rdd,
    StructType(df_stores.schema.fields +
               [StructField('cluster_val', IntegerType(), True)]))

# Random order within each cluster, then the cumulative distribution of that
# random rank, which later drives the group assignment.
df_stores = df_stores.withColumn('rand', F.rand())
window = Window.partitionBy('cluster_val').orderBy('rand')
df_stores = df_stores.withColumn('cum_dist', F.cume_dist().over(window))

# Only clusters with at least 2 stores are valid groups to sample from.
valid_groups = df_stores.groupBy([
    'cluster_val'
]).count().filter('count>=2').select('cluster_val').collect()
valid_groups = [i[0] for i in valid_groups]

df_stores = spark.createDataFrame(
    df_stores.rdd.map(define_group),
    StructType(df_stores.schema.fields +
               [StructField('group_val', StringType(), True)]))

header_stores = ['store_id', 'avg_hhi', 'avg_traffic']
df_stores.repartition(1).write.parquet(df_stores_loc, mode='overwrite')
df_stores_join = df_stores.select(header_stores + ['group_val'])
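# define_group() is not shown in this excerpt. The sketch below is a purely
# hypothetical stand-in, illustrating how the cum_dist column could drive a
# within-cluster group assignment; it is not the original implementation.
def define_group(row):
    # Hypothetical: split each cluster 50/50 by the cume_dist of its random order.
    group = 'test' if row['cum_dist'] <= 0.5 else 'control'
    return list(row) + [group]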
def fit_distribution(df):
    raw_impressions = [
        547788492, 113155690, 357229507, 353837519, 231243807, 343536283,
        204197454, 298400019, 154702872, 101595992, 173078606, 245617500,
        210661722, 94400758, 100694511, 104621562, 47302450, 254680986,
        38653660, 118198547, 167100705, 484483594, 69681730, 230778513,
        109614764, 86169134, 125825905, 478045002, 476281884, 155086936,
        100034338, 140623283, 132801777, 150906980, 108772611, 2165682,
        41589670, 327990489, 85909446, 349940682, 8776893, 33502930,
        282401283, 82297276, 385473488, 219411532, 154739307, 51048940,
        192605283, 114587775, 230422182, 41743162, 103709711, 171397519,
        158854098, 105911195, 118981954, 78965914, 91906274, 158792685,
        63487656, 54706539, 111072455, 92442258, 150615474, 79697857,
        108585993, 112360549, 262424361, 494715712, 1152693549, 1035303850,
        324325907, 921851042, 390727201, 1257338071, 392629713, 778974819,
        129782245, 1683290505, 811910155, 1997598872
    ]
    total_impressions = sum(raw_impressions)

    # Calculate the region percentage and store it in a dict of
    # region ID --> percentage of impressions in that region.
    region_percentage = {
        i + 1: float(x) / float(total_impressions)
        for i, x in enumerate(raw_impressions)
    }

    # Cumulatively sum the region percentages so that cum_percentage is
    # [region_percentage[1], region_percentage[1] + region_percentage[2], ...]
    cum_percentage = list(np.cumsum(list(region_percentage.values())))

    # Form a list of (lower_end, upper_end) tuples, one half-open interval
    # [lower_end, upper_end) per region.
    cum_pairs = list(zip([0] + cum_percentage[:-1], cum_percentage))

    # For reverse lookup: map of (lower_end, upper_end) --> region ID.
    cum_map = {x: i + 1 for i, x in enumerate(cum_pairs)}

    # Add a new column with a uid.
    # pyspark.sql.functions.monotonically_increasing_id():
    # A column that generates monotonically increasing 64-bit integers.
    # The generated ID is guaranteed to be monotonically increasing and unique,
    # but not consecutive. The current implementation puts the partition ID in
    # the upper 31 bits, and the record number within each partition in the
    # lower 33 bits. The assumption is that the data frame has fewer than
    # 1 billion partitions, and each partition has fewer than 8 billion records.
    # As an example, consider a DataFrame with two partitions, each with 3
    # records. This expression would return the following IDs:
    # 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
    df1 = df.withColumn('uid', fn.monotonically_increasing_id())

    # Create a window spec that partitions the df by uckey and orders each
    # partition by uid.
    w = Window.partitionBy('uckey').orderBy('uid')

    # pyspark.sql.functions.cume_dist():
    # Window function: returns the cumulative distribution of values within a
    # window partition, i.e. the fraction of rows at or below the current row.
    # Because the ordering column (uid) is unique within each partition, every
    # row sits strictly below its successors, so cume_dist becomes a cumulative
    # percentage of the row count: for a partition of 10 rows, cum_id is
    # [0.1, 0.2, 0.3, ...], i.e. [i / rows for i from 1 to rows].
    # Each row carries a PDF mass of 1 / rows (uid is unique), so the
    # partition's CDF is [1 / rows, 2 / rows, ...].
    df2 = df1.withColumn('cum_id', fn.cume_dist().over(w))

    def lookup_ipl(value):
        # Naive O(n) lookup: for each (lower_end, upper_end) pair in the map,
        # check whether the given 'value' falls inside the interval.
        # If yes, return that region ID.
        for pair in cum_pairs:
            low, high = pair
            if low <= value < high or value == 1.0:
                return cum_map[pair]
        return -1

    _udf_1 = fn.udf(lookup_ipl, t.IntegerType())
    df3 = df2.withColumn('index', _udf_1(fn.col('cum_id')))
    df3 = df3.drop(df3.uid).drop(df3.cum_id)
    return df3
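# A minimal, hypothetical driver for fit_distribution(); the aliases below
# (np, fn, t, Window) are the ones the function assumes, and the sample
# 'uckey' data is invented for illustration.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.sql import types as t
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [('uckey-a',), ('uckey-a',), ('uckey-a',), ('uckey-b',)], ['uckey'])
# Each row's cume_dist within its uckey partition is mapped to the region ID
# whose cumulative impression share brackets that value.
fit_distribution(sample).show()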
df.withColumn("percent_rank",percent_rank().over(window_spec)) \
    .show()

#ntile Window Function
'''
ntile() window function divides the rows of each window partition into n roughly
equal buckets and returns the bucket number of each row.
In the example below we pass 2 as the argument, so every row is assigned either 1 or 2.
'''
df.withColumn("ntile",ntile(2).over(window_spec)) \
    .show()

#PySpark Window Analytic functions

#cume_dist Window Function
'''
cume_dist() window function returns the cumulative distribution of values within
a window partition, i.e. the fraction of rows at or below the current row.
'''
df.withColumn("cume_dist",cume_dist().over(window_spec)) \
    .show()

#lag Window Function
df.withColumn("lag",lag("salary",2).over(window_spec)) \
    .show()

#lead Window Function
df.withColumn("lead",lead("salary",2).over(window_spec)) \
    .show()

#PySpark Window Aggregate Functions
windowSpecAgg = Window.partitionBy("department")
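# The excerpt stops right after defining windowSpecAgg. The block below is a
# hedged sketch (not the source's own continuation) of how aggregate functions
# can be applied over that partition-only window spec, reusing the df with
# "department" and "salary" columns already used above.
from pyspark.sql.functions import avg, col
from pyspark.sql.functions import max as max_, min as min_, sum as sum_

df.withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
  .withColumn("sum", sum_(col("salary")).over(windowSpecAgg)) \
  .withColumn("min", min_(col("salary")).over(windowSpecAgg)) \
  .withColumn("max", max_(col("salary")).over(windowSpecAgg)) \
  .select("department", "salary", "avg", "sum", "min", "max") \
  .show()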