def test_window_functions_cumulative_sum(self):
    df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # Test cumulative sum
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0)))
    rs = sorted(sel.collect())
    expected = [("one", 1), ("two", 3)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

    # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
    rs = sorted(sel.collect())
    expected = [("one", 1), ("two", 3)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

    # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
    frame_end = Window.unboundedFollowing + 1
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end)))
    rs = sorted(sel.collect())
    expected = [("one", 3), ("two", 2)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
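The boundary clamping that the test exercises can also be seen in a minimal standalone sketch; the SparkSession setup and column names below are illustrative, not taken from the test suite:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local").appName("cumsum-sketch").getOrCreate()
df = spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])

# A frame from the start of the partition to the current row gives a running sum.
# Bounds below Long.MinValue or above Long.MaxValue are clamped to the unbounded
# constants, which is what the test above asserts.
ws = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.select("key", F.sum("value").over(ws).alias("cum_sum")).show()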
def generate_window(order_col, rowrange=None, partitions=None):
    window = Window().orderBy(order_col)
    # for future reference: if we want to use partitions
    # if partitions is not None:
    #     window = window.partitionBy(partitions)
    if rowrange is not None:
        window = window.rowsBetween(*rowrange)
    return window
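A possible way to call this helper; the DataFrame, column names, and the aggregation are assumptions for illustration:

from pyspark.sql import functions as F

# Trailing 3-row frame (two preceding rows plus the current row), ordered by "ts".
w = generate_window("ts", rowrange=(-2, 0))
df = df.withColumn("rolling_sum", F.sum("value").over(w))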
def transform(self, df):
    # This transforms the Spark DataFrame (i.e. the time-series column) and
    # creates a column containing the moving average over the created time window.
    # Requires `avg` from pyspark.sql.functions and `Window` from
    # pyspark.sql.window at module level.
    mywindow = Window.rowsBetween(-self.nLags, 0)
    strMovAvg = self.columnName + '_' + str(self.nLags) + '_MovingAvg'
    df = df.withColumn(strMovAvg, avg(df[self.columnName]).over(mywindow))
    self.FeatureNames.append(strMovAvg)
    return df
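The same moving-average pattern written standalone; the column names and lag count here are assumptions, and an explicit orderBy is added so the row frame is deterministic:

from pyspark.sql.functions import avg
from pyspark.sql.window import Window

n_lags = 3  # assumed lag count
# Average over the current row and the 3 rows before it, ordered by an assumed "date" column.
mov_avg_window = Window.orderBy("date").rowsBetween(-n_lags, 0)
df = df.withColumn("price_3_MovingAvg", avg("price").over(mov_avg_window))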
def ks_2samp(df1, var1, df2, var2):
    ks_stat = get_cdf(df1, var1, CDF_1).\
        join(
            get_cdf(df2, var2, CDF_2),
            on=var1 == var2,
            how='outer'
        ).\
        withColumn(
            FILLED_CDF_1,
            funcs.last(funcs.col(CDF_1), ignorenulls=True).
            over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))
        ).\
        withColumn(
            FILLED_CDF_2,
            funcs.last(funcs.col(CDF_2), ignorenulls=True).
            over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))
        ).\
        select(
            funcs.max(
                funcs.abs(
                    funcs.col(FILLED_CDF_1) - funcs.col(FILLED_CDF_2)
                )
            )
        ).\
        collect()[0][0]

    # Adapted from scipy.stats ks_2samp
    n1 = df1.select(var1).na.drop().count()
    n2 = df2.select(var2).na.drop().count()
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * ks_stat)
    except Exception:
        prob = 1.0
    return ks_stat, prob
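The forward-fill step that drives this KS computation (carrying the last non-null value over an unbounded-preceding frame) can be isolated as below; `joined` and the column names are assumed stand-ins for the outer-joined CDF DataFrame:

from pyspark.sql import functions as funcs
from pyspark.sql.window import Window

w = Window.orderBy("x").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# Carry the most recent non-null CDF value forward so both CDFs are defined on every row.
filled = joined.withColumn("cdf_1_filled", funcs.last("cdf_1", ignorenulls=True).over(w))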
# Create a SparkSession:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("window").getOrCreate()

# Read the enhanced ride data from HDFS:
rides = spark.read.parquet("/duocar/joined/")

# ## Example: Cumulative Count and Sum

# Create a simple DataFrame:
df = spark.range(10)
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
ws = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
type(ws)

# Use the window specification to compute cumulative count and sum:
from pyspark.sql.functions import count, sum
df.select("id", count("id").over(ws).alias("cum_cnt"), sum("id").over(ws).alias("cum_sum")).show()

# **Tip:** Examine the default column name to gain additional insight (if you
# are SQL literate):
df.select("id", count("id").over(ws), sum("id").over(ws)).printSchema()

# ## Example: Compute average days between rides for each rider

# Create window specification:
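The script breaks off at this point. A plausible window specification for the per-rider example, with assumed column names (rider_id, date_time), might look like the following; this is a sketch, not the original continuation:

from pyspark.sql.functions import datediff, lag

# Order each rider's rides by time so lag() can fetch the previous ride.
ws2 = Window.partitionBy("rider_id").orderBy("date_time")
rides.withColumn("days_since_last",
                 datediff("date_time", lag("date_time", 1).over(ws2))).show(5)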
def compute(cls, base: DataFrame, parameters: Dict[str, Any] = None) -> Column:
    return F.count(StudentPerformance.STUDENT_ID).over(
        Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
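An equivalent standalone expression, with an illustrative student_id column: a count over a fully unbounded frame attaches the total row count of the partition to every row.

from pyspark.sql import functions as F
from pyspark.sql.window import Window

total = F.count("student_id").over(
    Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
df = df.withColumn("n_students", total)  # df and the column names are assumptions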
df2 = events(df)
dyf2 = DynamicFrame.fromDF(df2, glueContext, "paradox-events")
glueContext.write_dynamic_frame.from_options(
    frame=dyf2,
    connection_type='s3',
    connection_options={"path": "s3://{}/paradox-events".format(bucket_name)},
    format="json")

# Find activation events vectors
timeout = 60  # seconds
w = Window().orderBy(F.col("timestamp").cast('long'))
begin_column = F.when(
    F.lag('timestamp', 1).over(w).isNull(),
    F.col('timestamp')).otherwise(
    F.when((F.col('timestamp').cast("long") -
            F.lag('timestamp', 1).over(w).cast("long")) > timeout,
           F.col('timestamp')))
df4 = df2.filter(F.col('event').contains('-UP')).withColumn(
    'begin', begin_column)
df4 = df4.withColumn(
    'begin', F.last('begin', True).over(w.rowsBetween(-sys.maxsize, 0)))
df4 = df4.groupBy('begin').agg(F.collect_list("event").alias('vector'))
dyf4 = DynamicFrame.fromDF(df4, glueContext, "paradox-vectors")
glueContext.write_dynamic_frame.from_options(
    frame=dyf4,
    connection_type='s3',
    connection_options={"path": "s3://{}/paradox-vectors".format(bucket_name)},
    format="json")
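The -sys.maxsize lower bound works only because out-of-range frame boundaries are clamped, as exercised by the first test in this section; a clearer, equivalent spelling of that forward-fill line (a sketch, not the original job's code) uses the named constants:

df4 = df4.withColumn(
    'begin',
    F.last('begin', True).over(
        w.rowsBetween(Window.unboundedPreceding, Window.currentRow)))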