def test_window_functions_cumulative_sum(self):
    df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # Test cumulative sum
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0)))
    rs = sorted(sel.collect())
    expected = [("one", 1), ("two", 3)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

    # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
    rs = sorted(sel.collect())
    expected = [("one", 1), ("two", 3)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

    # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
    frame_end = Window.unboundedFollowing + 1
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end)))
    rs = sorted(sel.collect())
    expected = [("one", 3), ("two", 2)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
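The boundary clamping that the test exercises can also be seen in a minimal standalone sketch; the SparkSession setup and column names below are illustrative, not taken from the test suite:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local").appName("cumsum-sketch").getOrCreate()
df = spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])

# A frame from the start of the partition to the current row gives a running sum.
# Bounds below Long.MinValue or above Long.MaxValue are clamped to the unbounded
# constants, which is what the test above asserts.
ws = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.select("key", F.sum("value").over(ws).alias("cum_sum")).show()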
def generate_window(order_col, rowrange=None, partitions=None):
    window = Window().orderBy(order_col)
    # for future reference: if we want to use partitions
    # if partitions is not None:
    #     window = window.partitionBy(partitions)
    if rowrange is not None:
        window = window.rowsBetween(*rowrange)
    return window
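A possible way to call this helper; the DataFrame, column names, and the aggregation are assumptions for illustration:

from pyspark.sql import functions as F

# Trailing 3-row frame (two preceding rows plus the current row), ordered by "ts".
w = generate_window("ts", rowrange=(-2, 0))
df = df.withColumn("rolling_sum", F.sum("value").over(w))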
def transform(self, df):
    # This transforms the Spark DataFrame (i.e. the time-series column) and
    # creates a column containing the moving average over the created time window.
    # Requires `avg` from pyspark.sql.functions and `Window` from
    # pyspark.sql.window at module level.
    mywindow = Window.rowsBetween(-self.nLags, 0)
    strMovAvg = self.columnName + '_' + str(self.nLags) + '_MovingAvg'
    df = df.withColumn(strMovAvg, avg(df[self.columnName]).over(mywindow))
    self.FeatureNames.append(strMovAvg)
    return df
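The same moving-average pattern written standalone; the column names and lag count here are assumptions, and an explicit orderBy is added so the row frame is deterministic:

from pyspark.sql.functions import avg
from pyspark.sql.window import Window

n_lags = 3  # assumed lag count
# Average over the current row and the 3 rows before it, ordered by an assumed "date" column.
mov_avg_window = Window.orderBy("date").rowsBetween(-n_lags, 0)
df = df.withColumn("price_3_MovingAvg", avg("price").over(mov_avg_window))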
def ks_2samp(df1, var1, df2, var2):
    ks_stat = get_cdf(df1, var1, CDF_1).\
        join(
            get_cdf(df2, var2, CDF_2),
            on=var1 == var2,
            how='outer'
        ).\
        withColumn(
            FILLED_CDF_1,
            funcs.last(funcs.col(CDF_1), ignorenulls=True).
            over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))
        ).\
        withColumn(
            FILLED_CDF_2,
            funcs.last(funcs.col(CDF_2), ignorenulls=True).
            over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))
        ).\
        select(
            funcs.max(
                funcs.abs(
                    funcs.col(FILLED_CDF_1) - funcs.col(FILLED_CDF_2)
                )
            )
        ).\
        collect()[0][0]

    # Adapted from scipy.stats ks_2samp
    n1 = df1.select(var1).na.drop().count()
    n2 = df2.select(var2).na.drop().count()
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * ks_stat)
    except Exception:
        prob = 1.0
    return ks_stat, prob
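The forward-fill step that drives this KS computation (carrying the last non-null value over an unbounded-preceding frame) can be isolated as below; `joined` and the column names are assumed stand-ins for the outer-joined CDF DataFrame:

from pyspark.sql import functions as funcs
from pyspark.sql.window import Window

w = Window.orderBy("x").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# Carry the most recent non-null CDF value forward so both CDFs are defined on every row.
filled = joined.withColumn("cdf_1_filled", funcs.last("cdf_1", ignorenulls=True).over(w))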
# Create a SparkSession:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("window").getOrCreate()

# Read the enhanced ride data from HDFS:
rides = spark.read.parquet("/duocar/joined/")

# ## Example: Cumulative Count and Sum

# Create a simple DataFrame:
df = spark.range(10)
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
ws = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
type(ws)

# Use the window specification to compute cumulative count and sum:
from pyspark.sql.functions import count, sum
df.select("id", count("id").over(ws).alias("cum_cnt"), sum("id").over(ws).alias("cum_sum")).show()

# **Tip:** Examine the default column name to gain additional insight (if you
# are SQL literate):
df.select("id", count("id").over(ws), sum("id").over(ws)).printSchema()

# ## Example: Compute average days between rides for each rider

# Create window specification:
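The script breaks off at this point. A plausible window specification for the per-rider example, with assumed column names (rider_id, date_time), might look like the following; this is a sketch, not the original continuation:

from pyspark.sql.functions import datediff, lag

# Order each rider's rides by time so lag() can fetch the previous ride.
ws2 = Window.partitionBy("rider_id").orderBy("date_time")
rides.withColumn("days_since_last",
                 datediff("date_time", lag("date_time", 1).over(ws2))).show(5)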
def compute(cls, base: DataFrame, parameters: Dict[str, Any] = None) -> Column:
    return F.count(StudentPerformance.STUDENT_ID).over(
        Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
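An equivalent standalone expression, with an illustrative student_id column: a count over a fully unbounded frame attaches the total row count of the partition to every row.

from pyspark.sql import functions as F
from pyspark.sql.window import Window

total = F.count("student_id").over(
    Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
df = df.withColumn("n_students", total)  # df and the column names are assumptions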
df2 = events(df)
dyf2 = DynamicFrame.fromDF(df2, glueContext, "paradox-events")
glueContext.write_dynamic_frame.from_options(
    frame=dyf2,
    connection_type='s3',
    connection_options={"path": "s3://{}/paradox-events".format(bucket_name)},
    format="json")

# Find activation events vectors
timeout = 60  # seconds
w = Window().orderBy(F.col("timestamp").cast('long'))
begin_column = F.when(
    F.lag('timestamp', 1).over(w).isNull(),
    F.col('timestamp')).otherwise(
    F.when((F.col('timestamp').cast("long") -
            F.lag('timestamp', 1).over(w).cast("long")) > timeout,
           F.col('timestamp')))
df4 = df2.filter(F.col('event').contains('-UP')).withColumn(
    'begin', begin_column)
df4 = df4.withColumn(
    'begin', F.last('begin', True).over(w.rowsBetween(-sys.maxsize, 0)))
df4 = df4.groupBy('begin').agg(F.collect_list("event").alias('vector'))
dyf4 = DynamicFrame.fromDF(df4, glueContext, "paradox-vectors")
glueContext.write_dynamic_frame.from_options(
    frame=dyf4,
    connection_type='s3',
    connection_options={"path": "s3://{}/paradox-vectors".format(bucket_name)},
    format="json")
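The -sys.maxsize lower bound works only because out-of-range frame boundaries are clamped, as exercised by the first test in this section; a clearer, equivalent spelling of that forward-fill line (a sketch, not the original job's code) uses the named constants:

df4 = df4.withColumn(
    'begin',
    F.last('begin', True).over(
        w.rowsBetween(Window.unboundedPreceding, Window.currentRow)))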