Example #1
    def test_window_functions_cumulative_sum(self):
        df = self.spark.createDataFrame([("one", 1), ("two", 2)],
                                        ["key", "value"])
        from pyspark.sql import functions as F
        from pyspark.sql.window import Window

        # Test cumulative sum
        sel = df.select(
            df.key,
            F.sum(df.value).over(
                Window.rowsBetween(Window.unboundedPreceding, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
        sel = df.select(
            df.key,
            F.sum(df.value).over(
                Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
        frame_end = Window.unboundedFollowing + 1
        sel = df.select(
            df.key,
            F.sum(df.value).over(
                Window.rowsBetween(Window.currentRow, frame_end)))
        rs = sorted(sel.collect())
        expected = [("one", 3), ("two", 2)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
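
A minimal standalone sketch of the same cumulative sum outside the test harness (the local SparkSession setup is an assumption added for illustration, not part of the original test):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local").appName("cumsum-sketch").getOrCreate()
df = spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])

# Frame from the first row up to and including the current row.
ws = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.select("key", F.sum("value").over(ws).alias("cum_sum")).show()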
Example #3
from pyspark.sql.window import Window


def generate_window(order_col, rowrange=None, partitions=None):
    window = Window.orderBy(order_col)

    # for future reference: if we want to use partitions
    # if partitions is not None:
    #     window = window.partitionBy(partitions)

    if rowrange is not None:
        window = window.rowsBetween(*rowrange)

    return window
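
A possible usage sketch for generate_window (the df DataFrame and the event_time / value column names are assumptions for illustration):

from pyspark.sql import functions as F

# Trailing 7-row window ordered by event time: the current row plus the six before it.
w = generate_window(F.col("event_time"), rowrange=(-6, 0))
df_with_avg = df.withColumn("rolling_avg", F.avg("value").over(w))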
Example #4
 def transform(self, df):
     # Transform the Spark DataFrame by adding a column with the moving
     # average of the time-series column over the trailing window of
     # nLags preceding rows plus the current row.
     mywindow = Window.rowsBetween(-self.nLags, 0)
     strMovAvg = self.columnName + '_' + str(self.nLags) + '_MovingAvg'
     df = df.withColumn(strMovAvg,
                        avg(df[self.columnName]).over(mywindow))
     self.FeatureNames.append(strMovAvg)
     return df
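
A minimal context sketch for the transform method above (the class name, constructor, and usage are assumptions; the original class definition is not part of the snippet):

from pyspark.sql.functions import avg       # required by transform
from pyspark.sql.window import Window       # required by transform


class MovingAverageFeature:
    def __init__(self, columnName, nLags):
        self.columnName = columnName   # time-series column to average
        self.nLags = nLags             # number of trailing rows in the window
        self.FeatureNames = []         # names of generated feature columns

    # transform(self, df) as shown above would be defined here.


# Hypothetical usage: add a 5-row moving average of a "price" column.
# fe = MovingAverageFeature("price", nLags=5)
# df = fe.transform(df)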
Example #5
import numpy as np
from pyspark.sql import functions as funcs
from pyspark.sql.window import Window
from scipy.stats import distributions


def ks_2samp(df1, var1, df2, var2):
    # Two-sample Kolmogorov-Smirnov statistic computed on Spark DataFrames.
    # get_cdf and the CDF_* / FILLED_CDF_* column names are defined elsewhere
    # in the source module (not shown in this snippet).
    ks_stat = get_cdf(df1, var1, CDF_1).\
        join(
            get_cdf(df2, var2, CDF_2),
            on=var1 == var2,
            how='outer'
        ).\
        withColumn(
            FILLED_CDF_1,
            funcs.last(funcs.col(CDF_1), ignorenulls=True).
            over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))
        ).\
        withColumn(
            FILLED_CDF_2,
            funcs.last(funcs.col(CDF_2), ignorenulls=True).
            over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))
        ).\
        select(
            funcs.max(
                funcs.abs(
                    funcs.col(FILLED_CDF_1) - funcs.col(FILLED_CDF_2)
                )
            )
        ).\
        collect()[0][0]

    # Adapted from scipy.stats ks_2samp
    n1 = df1.select(var1).na.drop().count()
    n2 = df2.select(var2).na.drop().count()
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * ks_stat)
    except Exception:
        prob = 1.0

    return ks_stat, prob
Example #6
# Create a SparkSession:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("window").getOrCreate()

# Read the enhanced ride data from HDFS:
rides = spark.read.parquet("/duocar/joined/")

# ## Example: Cumulative Count and Sum

# Create a simple DataFrame:
df = spark.range(10)
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
ws = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
type(ws)

# Use the window specification to compute cumulative count and sum:
from pyspark.sql.functions import count, sum
df.select("id",
          count("id").over(ws).alias("cum_cnt"),
          sum("id").over(ws).alias("cum_sum")).show()

# **Tip:** Examine the default column name to gain additional insight (if you
# are SQL literate):
df.select("id", count("id").over(ws), sum("id").over(ws)).printSchema()

# ## Example: Compute average days between rides for each rider

# Create window specification:
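
# (The original example stops here. What follows is only a sketch of one
# plausible continuation, assuming the rides DataFrame has rider_id and
# date_time columns; it is not the original code.)
from pyspark.sql.functions import datediff, lag, avg

ws_rider = Window.partitionBy("rider_id").orderBy("date_time")

# Days since each rider's previous ride, then the average per rider:
rides \
  .withColumn("days_since_last", datediff("date_time", lag("date_time", 1).over(ws_rider))) \
  .groupBy("rider_id") \
  .agg(avg("days_since_last").alias("avg_days_between_rides")) \
  .show(5)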
Example #7
 def compute(cls,
             base: DataFrame,
             parameters: Dict[str, Any] = None) -> Column:
     # Count over an unbounded frame (all rows), so every row gets the total count.
     return F.count(StudentPerformance.STUDENT_ID).over(
         Window.rowsBetween(Window.unboundedPreceding,
                            Window.unboundedFollowing))
Example #8
import sys

from awsglue.dynamicframe import DynamicFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# df, events(), glueContext and bucket_name come from earlier in the Glue job
# (not shown in this snippet).
df2 = events(df)
dyf2 = DynamicFrame.fromDF(df2, glueContext, "paradox-events")
glueContext.write_dynamic_frame.from_options(
    frame=dyf2,
    connection_type='s3',
    connection_options={"path": "s3://{}/paradox-events".format(bucket_name)},
    format="json")

# Find activation event vectors
timeout = 60  # seconds

w = Window.orderBy(F.col("timestamp").cast('long'))

# A row starts a new activation sequence when it has no predecessor or when
# the gap to the previous event exceeds the timeout; other rows stay null.
begin_column = F.when(
    F.lag('timestamp', 1).over(w).isNull(), F.col('timestamp')).otherwise(
        F.when((F.col('timestamp').cast("long") -
                F.lag('timestamp', 1).over(w).cast("long")) > timeout,
               F.col('timestamp')))

# Keep the '-UP' events and forward-fill the sequence start timestamp onto
# every row of the sequence.
df4 = df2.filter(F.col('event').contains('-UP')).withColumn(
    'begin', begin_column)
df4 = df4.withColumn(
    'begin',
    F.last('begin', True).over(w.rowsBetween(-sys.maxsize, 0)))
df4 = df4.groupBy('begin').agg(F.collect_list("event").alias('vector'))

dyf4 = DynamicFrame.fromDF(df4, glueContext, "paradox-vectors")
glueContext.write_dynamic_frame.from_options(
    frame=dyf4,
    connection_type='s3',
    connection_options={"path": "s3://{}/paradox-vectors".format(bucket_name)},
    format="json")