Example #1
import sys
from pyspark.sql import types

schema = types.StructType([ # commented-out fields won't be read
    #types.StructField('created_utc', types.StringType(), False),
    #types.StructField('distinguished', types.StringType(), False),
    #types.StructField('downs', types.LongType(), False),
    #types.StructField('edited', types.StringType(), False),
    #types.StructField('gilded', types.LongType(), False),
    #types.StructField('id', types.StringType(), False),
    #types.StructField('link_id', types.StringType(), False),
    #types.StructField('name', types.StringType(), False),
    #types.StructField('parent_id', types.StringType(), True),
    #types.StructField('retrieved_on', types.LongType(), False),
    #types.StructField('score', types.LongType(), False),
    #types.StructField('score_hidden', types.BooleanType(), False),
    #types.StructField('subreddit', types.StringType(), False),
    #types.StructField('subreddit_id', types.StringType(), False),
    #types.StructField('ups', types.LongType(), False),
    types.StructField('lang', types.StringType(), False),
    types.StructField('title', types.StringType(), False),
    types.StructField('visit_count', types.LongType(), False),
    types.StructField('data_size', types.LongType(), False),
])


def pathToTime(path):
    start = path.rfind("pagecounts-") + 11
    end = path.rfind(".") - 4
    return path[start:end]
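
# Illustration (the filename below is hypothetical), assuming the usual
# pagecounts-YYYYMMDD-HHMMSS naming: the slicing above keeps the date and hour,
# e.g. pathToTime('pagecounts-20160801-120000.gz') returns '20160801-12'.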


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
Example #2
    def sqlType(cls):
        # NB: this is actually an instance method in practice O_O !
        return types.StructType([
            types.StructField("np_bytes", types.BinaryType(), False)
        ])
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()

assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
assert spark.version >= '2.1' # make sure we have Spark 2.1+


schema = types.StructType([
    types.StructField('id', types.IntegerType(), False),
    types.StructField('x', types.FloatType(), False),
    types.StructField('y', types.FloatType(), False),
    types.StructField('z', types.FloatType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    #xyz.show(); return

    # Create a DF with what we need: x, (soon y,) and id%10 which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        # TODO: also the y values
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
Example #4
    def __init__(self, tpe):
        # Field names cannot be specified here, so default names `c0, c1, ..., cn` are used.
        self.tpe = types.StructType([
            types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))
        ])  # type: types.StructType
import helloworld
import pytest
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Create some example data
data = [
    ('Batmobile', 7, 5.0),
    ('Catmobile', 3, 9.0),
    ('', 4, 21.0),
]

custom_schema = types.StructType()
custom_schema.add(types.StructField("Vehicle", types.StringType()))
custom_schema.add(types.StructField("wheels", types.IntegerType()))
custom_schema.add(types.StructField("speed", types.FloatType()))

df = spark.createDataFrame(data, custom_schema)


def test_add():
    assert (helloworld.add(1, 1) == 2)


def test_data_count_check():
    assert (df.count() == 3)
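

# For reference (not in the original test): the incremental .add(...) calls above
# build the same schema as passing the StructFields straight to the constructor.
equivalent_schema = types.StructType([
    types.StructField("Vehicle", types.StringType()),
    types.StructField("wheels", types.IntegerType()),
    types.StructField("speed", types.FloatType()),
])
assert equivalent_schema == custom_schema  # same fields, same order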
Example #6
from pyspark.sql import SparkSession, types, functions
from pyspark.sql.functions import *
import pandas as pd
import elevation_grid as eg
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from pyspark.ml import PipelineModel
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName('weather_plot').getOrCreate()
assert spark.version >= '2.3'  # make sure we have Spark 2.3+
spark.sparkContext.setLogLevel('WARN')

tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.FloatType()),
])

elevation_schema = types.StructType([
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('date', types.DateType()),
    types.StructField('tmax', types.FloatType())
])
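
# The example is cut off here; a plausible next step (the input paths are
# assumptions, not part of the original) would be to apply the schemas on read:
tmax = spark.read.csv('tmax-data/', schema=tmax_schema)
elevation = spark.read.csv('elevation-data/', schema=elevation_schema)
tmax.show(5)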
Example #7
    def test_exclude_key_columns(self):
        df = self.spark.createDataFrame(
            data=[
                ('k1', 'k14', [1, 14, 141]),
                ('k1', 'k12', [1, 12, 121]),
                ('k1', 'k11', [1, 11, 111]),
                ('k1', 'k13', [1, 13, 131]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        redis_client = redis.StrictRedis('redis.docker')

        # simple key
        df.write_ext.redis(
            key_by=['key_2'],
            key_prefix='hello',
            exclude_key_columns=True,
            host='redis.docker',
        )

        self.assertRowsEqual(
            redis_client.keys(),
            [b'hello.k11', b'hello.k12', b'hello.k13', b'hello.k14'],
            ignore_order=True,
        )

        written_data = [
            json.loads(redis_client.get(key))
            for key in ['hello.k11', 'hello.k12', 'hello.k13', 'hello.k14']
        ]

        expected = [
            {'key_1': 'k1', 'aux_data': [1, 11, 111]},
            {'key_1': 'k1', 'aux_data': [1, 12, 121]},
            {'key_1': 'k1', 'aux_data': [1, 13, 131]},
            {'key_1': 'k1', 'aux_data': [1, 14, 141]},
        ]

        self.assertEqual(written_data, expected)

        redis_client.flushdb()

        # composite key
        df.write_ext.redis(
            key_by=['key_1', 'key_2'],
            key_prefix='hello',
            exclude_key_columns=True,
            host='redis.docker',
        )

        self.assertRowsEqual(
            redis_client.keys(),
            [b'hello.k1.k11', b'hello.k1.k12', b'hello.k1.k13', b'hello.k1.k14'],
            ignore_order=True,
        )

        written_data = [
            json.loads(redis_client.get(key))
            for key in ['hello.k1.k11', 'hello.k1.k12', 'hello.k1.k13', 'hello.k1.k14']
        ]

        expected = [
            {'aux_data': [1, 11, 111]},
            {'aux_data': [1, 12, 121]},
            {'aux_data': [1, 13, 131]},
            {'aux_data': [1, 14, 141]},
        ]

        self.assertEqual(written_data, expected)
# pylint: disable = invalid-name
import pyspark.sql.types as t
from datalakebundle.table.schema.TableSchemaGenerator import TableSchemaGenerator

schema = t.StructType([
    t.StructField("FIELD1", t.IntegerType()),
    t.StructField("FIELD2", t.DoubleType()),
    t.StructField("FIELD3", t.DoubleType()),
    t.StructField(
        "STRUCT1",
        t.StructType([
            t.StructField("NESTED_FIELD1", t.StringType()),
            t.StructField(
                "STRUCT2",
                t.StructType([
                    t.StructField("NESTED_FIELD2", t.StringType()),
                ], ),
            ),
        ], ),
    ),
], )

expected_result = """def get_schema():
    return dp.TableSchema(
        [
            t.StructField("FIELD1", t.IntegerType()),
            t.StructField("FIELD2", t.DoubleType()),
            t.StructField("FIELD3", t.DoubleType()),
            t.StructField(
                "STRUCT1",
                t.StructType(
Example #9
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType()),
        types.StructField('author', types.StringType()),
        types.StructField('author_flair_css_class', types.StringType()),
        types.StructField('author_flair_text', types.StringType()),
        types.StructField('body', types.StringType()),
        types.StructField('controversiality', types.LongType()),
        types.StructField('created_utc', types.StringType()),
        types.StructField('distinguished', types.StringType()),
        types.StructField('downs', types.LongType()),
        types.StructField('edited', types.StringType()),
        types.StructField('gilded', types.LongType()),
        types.StructField('id', types.StringType()),
        types.StructField('link_id', types.StringType()),
        types.StructField('name', types.StringType()),
        types.StructField('parent_id', types.StringType()),
        types.StructField('retrieved_on', types.LongType()),
        types.StructField('score', types.LongType()),
        types.StructField('score_hidden', types.BooleanType()),
        types.StructField('subreddit', types.StringType()),
        types.StructField('subreddit_id', types.StringType()),
        types.StructField('ups', types.LongType()),
        # types.StructField('year', types.IntegerType()),
        # types.StructField('month', types.IntegerType()),
    ])

    df = spark.read.json(inputs, schema=comments_schema)
    averages = df.groupBy(df['subreddit']).agg(
        functions.avg(df['score']).alias('average_score'))
    averages.write.csv(output, mode='overwrite')
    averages.explain()
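
# The snippet defines main(inputs, output) but is cut before the driver code; a
# typical entry point, following the other examples here (assumes the usual
# `import sys` and SparkSession setup), would be:
if __name__ == '__main__':
    inputs = sys.argv[1]
    output = sys.argv[2]
    main(inputs, output)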
#  Row(id=2, name='Melanie', dob='1963-10-21', chelsea_fan=False)]
dataA = [
    Row(id=1, name='Alan', dob='1962-11-25', chelsea_fan=True),
    Row(id=2, name='Melanie', dob='1963-10-21', chelsea_fan=False)
]
dfA = spark.createDataFrame(dataA)
print('dfA is ...')
print(dfA)
print('dfA.collect() is ...')
print(dfA.collect())

# produce a Dataframe from a list of Row objects (schema is supplied)
# schema has 3rd argument saying if field can contain nulls
# DataFrame[people_id: int, name: string, dob: date, chelsea_fan: boolean]
schema = T.StructType([
    T.StructField("person_id", T.IntegerType(), True),
    T.StructField("name", T.StringType(), True),
    T.StructField("dob", T.DateType(), True),
    T.StructField("chelsea_fan", T.BooleanType(), True),
])
data = [
    Row(person_id=1, name='Alan', dob=date(1962, 11, 25), chelsea_fan=True),
    Row(person_id=2, name='Melanie', dob=date(1963, 10, 21), chelsea_fan=False)
]
dfPersons1 = spark.createDataFrame(data, schema)
print('dfPersons1 is ...')
print(dfPersons1)

# produce a Dataframe from a list of Dictionaries (schema is supplied)
# DataFrame[people_id: int, name: string, dob: date, chelsea_fan: boolean]
schema = T.StructType([
Example #11
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit relative scores').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
assert spark.version >= '2.3' # make sure we have Spark 2.3+

comments_schema = types.StructType([
    types.StructField('archived', types.BooleanType()),
    types.StructField('author', types.StringType()),
    types.StructField('author_flair_css_class', types.StringType()),
    types.StructField('author_flair_text', types.StringType()),
    types.StructField('body', types.StringType()),
    types.StructField('controversiality', types.LongType()),
    types.StructField('created_utc', types.StringType()),
    types.StructField('distinguished', types.StringType()),
    types.StructField('downs', types.LongType()),
    types.StructField('edited', types.StringType()),
    types.StructField('gilded', types.LongType()),
    types.StructField('id', types.StringType()),
    types.StructField('link_id', types.StringType()),
    types.StructField('name', types.StringType()),
    types.StructField('parent_id', types.StringType()),
    types.StructField('retrieved_on', types.LongType()),
    types.StructField('score', types.LongType()),
    types.StructField('score_hidden', types.BooleanType()),
    types.StructField('subreddit', types.StringType()),
    types.StructField('subreddit_id', types.StringType()),
    types.StructField('ups', types.LongType()),
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types, Row
spark = SparkSession.builder.appName(
    'OSM point of interest extracter').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
spark.conf.set("spark.sql.session.timeZone", "UTC")

from lxml import etree
import dateutil.parser
#import datetime

amenity_schema = types.StructType([
    types.StructField('lat', types.DoubleType(), nullable=False),
    types.StructField('lon', types.DoubleType(), nullable=False),
    types.StructField('unix_time', types.DoubleType(), nullable=False),
    #types.StructField('timestamp', types.TimestampType(), nullable=False),
    types.StructField('amenity', types.StringType(), nullable=False),
    types.StructField('name', types.StringType(), nullable=True),
    types.StructField('tags',
                      types.MapType(types.StringType(), types.StringType()),
                      nullable=False),
])


def get_amenities(line):
    root = etree.fromstring(line)
    if root.tag != 'node':
        return
Example #13
import os
import sys
from pyspark import SparkConf
from pyspark.sql import SparkSession, types

os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"

cluster_seeds = ['199.60.17.171', '199.60.17.188']

conf = SparkConf().setAppName('example code') \
    .set('spark.cassandra.connection.host', ','.join(cluster_seeds))

spark = SparkSession.builder.appName('Big Data Project').getOrCreate()
sc = spark.sparkContext
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.2'  # make sure we have Spark 2.2+

schema = types.StructType([
    types.StructField('county_code', types.StringType(), True),
    types.StructField('month', types.StringType(), True),
    types.StructField('year', types.StringType(), True),
    types.StructField('observation_count', types.DoubleType(), True),
    types.StructField('observation_percent', types.DoubleType(), True),
    types.StructField('max_value', types.DoubleType(), True),
    types.StructField('max_hour', types.DoubleType(), True),
    types.StructField('arithmetic_mean', types.DoubleType(), True),
    types.StructField('am_wind', types.DoubleType(), True),
    types.StructField('am_temp', types.DoubleType(), True),
    types.StructField('am_rh', types.DoubleType(), True),
    types.StructField('am_press', types.DoubleType(), True)
])

itr = 0
for j in ['44201', '42401']:
Example #14
import sys, re, uuid
from datetime import datetime
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types

cluster_seeds = ['199.60.17.32', '199.60.17.65']
spark = SparkSession.builder.appName('Spark Cassandra example') \
    .config('spark.cassandra.connection.host', ','.join(cluster_seeds)).getOrCreate()
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

line_re = re.compile(
    r'^(\S+) - - \[(\S+ [+-]\d+)\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$')
schema = types.StructType([
    #types.StructField('id', types.StringType()),
    types.StructField('host', types.StringType()),
    types.StructField('datetime', types.TimestampType()),
    types.StructField('path', types.StringType()),
    types.StructField('bytes', types.IntegerType())
])


def read_line(line):
    m = line_re.match(line)
    if m is None:
        return None
    return (m.group(1), datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S %z'),
            m.group(3), int(m.group(4)))
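
# Quick sanity check for read_line, using a made-up line in the access-log
# format the regex above expects:
#   read_line('in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] '
#             '"GET /index.html HTTP/1.0" 200 1839')
#   -> ('in24.inetnebr.com', datetime(1995, 8, 1, 0, 0, 1, tzinfo=...), '/index.html', 1839)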


def main(input_dir, keyspace, table):
Example #15
import sys
import datetime as dt
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions, types

assert sys.version_info >= (3, 5)   # make sure we have Python 3.5+

conf = SparkConf().setAppName('example code')
sc = SparkContext(conf=conf)
spark = SparkSession.builder.appName('example code').getOrCreate()
assert spark.version >= '2.3'     # make sure we have Spark 2.3+


####################### aggregate transformation: reduce an RDD to a scalar

schema = [
      ('Date', types.StringType())
    , ('Region', types.StringType())
    , ('Rep',  types.StringType())
    , ('Item', types.StringType())
    , ('Units', types.IntegerType())
    , ('Unit Cost', types.DoubleType())
    , ('total', types.DoubleType())
]

schema_sales = types.StructType([types.StructField(e[0],e[1], False) for e in schema])

sales_df = spark.read.csv('sales.csv',header=True,schema=schema_sales)

sales_df.select('Region','Rep').distinct().orderBy(sales_df.Region,sales_df.Rep).show()
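
# The header comment above mentions aggregation; one possible aggregate over the
# same DataFrame (illustrative, not part of the original) is:
sales_df.groupBy('Region') \
    .agg(functions.sum('total').alias('region_total')) \
    .orderBy('Region') \
    .show()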
Example #16
import os
import sys
from pyspark import SparkConf
from pyspark.sql import SparkSession, types

os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"

cluster_seeds = ['199.60.17.171', '199.60.17.188']

conf = SparkConf().setAppName('example code') \
    .set('spark.cassandra.connection.host', ','.join(cluster_seeds))

spark = SparkSession.builder.appName('Big Data Project').getOrCreate()
sc = spark.sparkContext
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.2'  # make sure we have Spark 2.2+

schema = types.StructType([
    types.StructField('county_code', types.IntegerType(), True),
    types.StructField('month', types.IntegerType(), True),
    types.StructField('year', types.IntegerType(), True),
    types.StructField('am_wind', types.DoubleType(), True)
])

train_final = spark.createDataFrame(sc.emptyRDD(), schema=schema)

for year in range(2013, 2018):
    support = spark.read.csv(
        "/home/ldua/Desktop/BigDataProject/support/daily_WIND_" + str(year) +
        ".csv",
        header=True)

    support_f = support.select(
        'County Code', 'Date Local',
Example #17
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('weather ETL').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

observation_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('observation', types.StringType()),
    types.StructField('value', types.IntegerType()),
    types.StructField('mflag', types.StringType()),
    types.StructField('qflag', types.StringType()),
    types.StructField('sflag', types.StringType()),
    types.StructField('obstime', types.StringType()),
])


def main(in_directory, out_directory):

    weather = spark.read.csv(in_directory, schema=observation_schema)

    # TODO: finish here.
    weather = weather.filter(weather.qflag.isNull())
    weather = weather.filter(weather.station.startswith('CA'))
    weather = weather.filter(weather.observation == 'TMAX')

    cleaned_data = weather.select(weather['station'], weather['date'],
Example #18
def preprocessing(spark: SparkSession, pppath: Path, datadir: Path):
    def prepro(s5: DataFrame) -> DataFrame:
        stages = []
        catvars = ['dept_id', 'item_id', 'store_id', 'wday']
        for v in catvars:
            stages += [StringIndexer(inputCol=v,
                                     outputCol=f"i{v}")]
        stages += [OneHotEncoderEstimator(inputCols=[f"i{v}" for v in catvars],
                                          outputCols=[f"v{v}" for v in catvars])]
        stages += [VectorAssembler(inputCols=['vwday', 'vitem_id', 'vdept_id', 'vstore_id', 'flag_ram',
                                              'snap', 'dn', 'month', 'year'],
                                   outputCol='features')]

        pip: Pipeline = Pipeline(stages=stages)
        pipm = pip.fit(s5)
        df: DataFrame = pipm.transform(s5)
        return df.drop('idept_id', 'iitem_id', 'istore_id', 'iwday', 'vdept_id', 'vtem_id', 'vstore_id', 'vwday')

    print("--- preprocessing -----------------------")

    schema = t.StructType([
        t.StructField('year', t.IntegerType(), True),
        t.StructField('month', t.IntegerType(), True),
        t.StructField('dn', t.IntegerType(), True),
        t.StructField('wday', t.IntegerType(), True),
        t.StructField('snap', t.IntegerType(), True),
        t.StructField('dept_id', t.StringType(), True),
        t.StructField('item_id', t.StringType(), True),
        t.StructField('store_id', t.StringType(), True),
        t.StructField('sales', t.DoubleType(), True),
        t.StructField('flag_ram', t.IntegerType(), True),
        t.StructField('Sales_Pred', t.DoubleType(), True)
    ])

    csv_path = datadir / "Sales5_Ab2011_InklPred.csv"
    print(f"--- Reading: '{csv_path}'")

    sales5: DataFrame = spark.read.csv(str(csv_path), header='true', schema=schema) \
        .withColumn("label", f.col('sales'))

    ppdf = prepro(sales5)
    print(f"--- Writing: '{pppath}'")
    ppdf.write \
        .format("parquet") \
        .mode("overwrite") \
        .save(str(pppath))
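
# The function above is shown without its module header; it appears to assume
# imports along these lines (OneHotEncoderEstimator is the Spark 2.x name; later
# releases renamed it to OneHotEncoder):
from pathlib import Path

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as f, types as t
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler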
Example #19
    def test_invalid_input(self):
        df = self.spark.createDataFrame(
            data=[],
            schema=T.StructType([T.StructField('key_1', T.StringType())]),
        )

        with six.assertRaisesRegex(
                self,
                AssertionError,
                'redis: url must define keyBy columns to construct redis key',
        ):
            df.write_ext.by_url('redis://redis.docker')

        with six.assertRaisesRegex(
                self,
                ValueError,
                r'redis: true and false \(default\) are the only supported groupByKey values',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&groupByKey=tru')

        with six.assertRaisesRegex(
                self,
                ValueError,
                r'redis: true and false \(default\) are the only supported excludeKeyColumns '
                'values',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&excludeKeyColumns=tru')

        with six.assertRaisesRegex(
                self,
                ValueError,
                'redis: expire must be positive',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&expire=0')

        with six.assertRaisesRegex(
                self,
                ValueError,
                'redis: expire must be a base 10, positive integer',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&expire=0x11')

        with six.assertRaisesRegex(
                self,
                ValueError,
                'redis: bzip2, gzip and zlib are the only supported compression codecs',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&compression=snappy')

        with six.assertRaisesRegex(
                self,
                ValueError,
                'redis: max pipeline size must be positive',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&maxPipelineSize=0')

        with six.assertRaisesRegex(
                self,
                ValueError,
                'redis: maxPipelineSize must be a base 10, positive integer',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&maxPipelineSize=0x11')

        with six.assertRaisesRegex(
                self,
                ValueError,
                r'redis: only append \(default\), ignore and overwrite modes are supported',
        ):
            df.write_ext.by_url('redis://redis.docker?keyBy=key_1&mode=error')
Example #20
import sys
import string, re
from pyspark.sql import SparkSession, types
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, desc, udf, to_json
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.types import IntegerType


spark = SparkSession.builder.appName('reddit averages').getOrCreate()

assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
assert spark.version >= '2.1' # make sure we have Spark 2.1+


schema = types.StructType([ # commented-out fields won't be read
    types.StructField('body', types.StringType(), False),
    types.StructField('subreddit', types.StringType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    
    comments.cache() 
   
    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation + '0123456789'),)

    # NLP processing code adapted from https://spark.apache.org/docs/latest/ml-features.html
Example #21
    def test_mode(self):
        redis_client = redis.StrictRedis('redis.docker')
        redis_client.set('k11', '"hey!"')
        redis_client.set('k13', '"you!"')
        redis_client.set('k14', '"brick!"')

        df = self.spark.createDataFrame(
            data=[
                ('k1', 'k14', [1, 14, 141]),
                ('k1', 'k12', [1, 12, 121]),
                ('k1', 'k11', [1, 11, 111]),
                ('k1', 'k13', [1, 13, 131]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        # test ignore
        df.write_ext.redis(
            key_by=['key_2'],
            mode='ignore',
            host='redis.docker',
        )

        self.assertRowsEqual(
            redis_client.keys(),
            [b'k11', b'k12', b'k13', b'k14'],
            ignore_order=True,
        )

        written_data = [
            json.loads(redis_client.get(key)) for key in ['k11', 'k12', 'k13', 'k14']
        ]

        expected = [
            'hey!',
            {'key_1': 'k1', 'key_2': 'k12', 'aux_data': [1, 12, 121]},
            'you!',
            'brick!',
        ]

        self.assertEqual(written_data, expected)

        # test append
        df.write_ext.redis(
            key_by=['key_2'],
            mode='append',
            host='redis.docker',
        )

        self.assertRowsEqual(
            redis_client.keys(),
            [b'k11', b'k12', b'k13', b'k14'],
            ignore_order=True,
        )

        written_data = [
            json.loads(redis_client.get(key)) for key in ['k11', 'k12', 'k13', 'k14']
        ]

        expected = [
            {'key_1': 'k1', 'key_2': 'k11', 'aux_data': [1, 11, 111]},
            {'key_1': 'k1', 'key_2': 'k12', 'aux_data': [1, 12, 121]},
            {'key_1': 'k1', 'key_2': 'k13', 'aux_data': [1, 13, 131]},
            {'key_1': 'k1', 'key_2': 'k14', 'aux_data': [1, 14, 141]},
        ]

        self.assertEqual(written_data, expected)

        # test overwrite
        df.where(F.col('key_2') == 'k11').write_ext.redis(
            key_by=['key_2'],
            mode='overwrite',
            host='redis.docker',
        )

        self.assertEqual(redis_client.keys(), [b'k11'])

        written_data = [json.loads(redis_client.get('k11'))]

        expected = [
            {'key_1': 'k1', 'key_2': 'k11', 'aux_data': [1, 11, 111]},
        ]

        self.assertEqual(written_data, expected)
Example #22
def infer_return_type(
        f: Callable
) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    tpe = get_type_hints(f).get("return", None)

    if tpe is None:
        raise ValueError("A return value is required for the input function")

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Tuple has _name but other types have __name__
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    # Check if the name is Tuple.
    if name == "Tuple":
        tuple_type = tpe
        parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters
            if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(
            data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(
                    index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name if index_name is not None else
                            SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    ))
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(pandas_on_spark_type(p.tpe) if isclass(p)
              and issubclass(p, NameTypeHolder) else pandas_on_spark_type(p)
              for p in data_parameters))
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
                zip(data_names, data_dtypes, data_spark_types)):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                ))

        return DataFrameType(index_fields=index_fields,
                             data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    average_func = {'score': 'avg'}
    comments_average = comments.groupby(
        comments['subreddit']).agg(average_func)
    averages = comments_average.sort(comments['subreddit'], ascending=True)
    averages.write.csv(output, mode='overwrite')
Example #24
def prepare_align_udf(dsalign_args, alphabet_path, max_length_ms,
                      max_silence_length_ms):
    args = dsalign_args
    ALIGN_RETURN_TYPE = T.StructType([
        T.StructField("start_ms", T.ArrayType(T.LongType())),
        T.StructField("end_ms", T.ArrayType(T.LongType())),
        T.StructField("label", T.ArrayType(T.StringType())),
        T.StructField("cer", T.ArrayType(T.FloatType())),
        T.StructField("wer", T.ArrayType(T.FloatType())),
        T.StructField("hypotheses", T.ArrayType(T.StringType())),
        # T.StructField("sws", T.ArrayType(T.FloatType())),
        # T.StructField("levenshtein", T.ArrayType(T.FloatType())),
    ])

    @F.pandas_udf(ALIGN_RETURN_TYPE)
    def align_table(
        name_series: pd.Series,
        audio_name_series: pd.Series,
        transcript_series: pd.Series,
        ctm_content_series: pd.Series,
    ) -> pd.DataFrame:
        alphabet = Alphabet(alphabet_path)
        silence_words = frozenset(["<unk>", "[laughter]", "[noise]"])
        result_dict = {
            "start_ms": [],
            "end_ms": [],
            "label": [],
            "cer": [],
            "wer": [],
            "hypotheses": [],
            # "sws": [],
            # "levenshtein": []
        }
        for name, audio_name, ctm_content, transcript in zip(
                name_series, audio_name_series, ctm_content_series,
                transcript_series):
            print(f"GALVEZ:name={name}")
            print(f"GALVEZ:audio_name={audio_name}")
            fragments = join_fragments(
                parse_ctm(ctm_content, silence_words),
                max_length_ms,
                max_silence_length_ms,
                audio_name,
            )
            # timeout after 200 seconds
            output = timeout(align, (args, fragments, transcript, alphabet),
                             timeout_duration=200)
            if output is None:
                print(
                    f"GALVEZ: timed out for name={name} audio_name={audio_name}"
                )
            if output is not None:
                _, _, _, aligned_results = output
                start_times = []
                end_times = []
                labels = []
                cers = []
                wers = []
                hypotheses = []
                # swses = []
                # levenshteins = []
                for result in aligned_results:
                    start_times.append(result["start"])
                    end_times.append(result["end"])
                    labels.append(result["aligned"])
                    hypotheses.append(result["transcript"])

                    cers.append(result["cer"])
                    wers.append(result["wer"])
                    # swses.append(result['sws'])
                    # levenshteins.append(result['levenshtein'])
                    # aligned-raw includes tokens that are not part of the alphabet.
                    # We would like to exclude those.
                    # labels.append(result['aligned-raw'])
                result_dict["start_ms"].append(start_times)
                result_dict["end_ms"].append(end_times)
                result_dict["label"].append(labels)
                result_dict["cer"].append(cers)
                result_dict["wer"].append(wers)
                result_dict["hypotheses"].append(hypotheses)
                # result_dict["sws"].append(swses)
                # result_dict["levenshtein"].append(levenshteins)
            else:
                result_dict["start_ms"].append([])
                result_dict["end_ms"].append([])
                result_dict["label"].append([])
                result_dict["cer"].append([])
                result_dict["wer"].append([])
                result_dict["hypotheses"].append([])
                # result_dict["sws"].append([])
                # result_dict["levenshtein"].append([])
        return pd.DataFrame(result_dict)

    return align_table
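
# prepare_align_udf returns a pandas UDF; hypothetically (the DataFrame and
# column names below are assumptions, not from the original), it would be
# applied along these lines:
#   align_udf = prepare_align_udf(dsalign_args, 'alphabet.txt', 15000, 2000)
#   aligned = df.withColumn(
#       'alignment',
#       align_udf('name', 'audio_name', 'transcript', 'ctm_content'))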
Example #25
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_field
    InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_field
    InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_field
    InternalField(dtype=category,struct_field=StructField(index,LongType,true))

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_field
    InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_DEFAULT_INDEX_NAME
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    spec = getfullargspec(f)
    tpe = spec.annotations.get("return", None)
    if isinstance(tpe, str):
        # This type hint can happen when given hints are string to avoid forward reference.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__") and (
        tpe.__origin__ == ps.DataFrame or tpe.__origin__ == ps.Series
    ):
        # When the Python version is lower than 3.7, unwrap it to a Tuple/SeriesType type hint.
        tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that DataFrame type hints will create a Tuple.
    # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        if hasattr(tuple_type, "__tuple_params__"):
            # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
            # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")

        index_parameters = [p for p in parameters if issubclass(p, IndexNameTypeHolder)]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        if len(index_parameters) == 1:
            index_name = index_parameters[0].name
            index_dtype, index_spark_type = pandas_on_spark_type(index_parameters[0].tpe)
            index_field = InternalField(
                dtype=index_dtype,
                struct_field=types.StructField(
                    name=index_name if index_name is not None else SPARK_DEFAULT_INDEX_NAME,
                    dataType=index_spark_type,
                ),
            )
        else:
            assert len(index_parameters) == 0
            # No type hint for index.
            index_field = None

        data_dtypes, data_spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in data_parameters
            )
        )
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types)
        ):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name) if data_name is not None else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                )
            )

        return DataFrameType(index_field=index_field, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
Example #26
def main():
    LOG.info('Begin execution')
    spark = SparkSession.builder.appName('FlightDelaysETL').getOrCreate()
    datahub_airports_schema = T.StructType([
        T.StructField("continent", T.StringType(), True),
        T.StructField("coordinates", T.StringType(), True),
        T.StructField("elevation_ft", T.FloatType(), True),
        T.StructField("gps_code", T.StringType(), True),
        T.StructField("iata_code", T.StringType(), True),
        T.StructField("ident", T.StringType(), True),
        T.StructField("iso_country", T.StringType(), True),
        T.StructField("iso_region", T.StringType(), True),
        T.StructField("local_code", T.StringType(), True),
        T.StructField("municipality", T.StringType(), True),
        T.StructField("name", T.StringType(), True),
        T.StructField("type", T.StringType(), True)
    ])

    datahub_airports = (
        spark.read.format('json').schema(datahub_airports_schema).load(
            's3://{}/dend/airport-codes.json'.format(S3_BUCKET)))

    LOG.info('# datahub airport entries: %d', datahub_airports.count())

    scsg_schema = T.StructType([
        T.StructField('iata', T.StringType(), True),
        T.StructField('airport', T.StringType(), True),
        T.StructField('city', T.StringType(), True),
        T.StructField('state', T.StringType(), True),
        T.StructField('country', T.StringType(), True),
        T.StructField('lat', T.FloatType(), True),
        T.StructField('long', T.FloatType(), True)
    ])
    scsg_airports = (spark.read.format('csv').schema(scsg_schema).option(
        'header',
        True).load('s3://{}/dend/scsg-airports.csv'.format(S3_BUCKET)))
    LOG.info('# scsg airport entries: %d', scsg_airports.count())

    scsg_airports.limit(5).show()

    # join the two airport-related tables on the IATA code
    airports = (scsg_airports.join(
        datahub_airports, scsg_airports.iata == datahub_airports.iata_code,
        'right').drop('airport', 'iata_code', 'gps_code'))

    (airports.write.mode('overwrite').parquet(
        's3://{}/dend/pq_mart/airports'.format(S3_BUCKET)))
    LOG.info('# merged airport entries: %d', airports.count())

    delays_schema = T.StructType([
        T.StructField("Year", T.IntegerType(), True),
        T.StructField("Month", T.IntegerType(), True),
        T.StructField("DayofMonth", T.IntegerType(), True),
        T.StructField("DayOfWeek", T.IntegerType(), True),
        T.StructField("DepTime", T.IntegerType(), True),
        T.StructField("CRSDepTime", T.IntegerType(), True),
        T.StructField("ArrTime", T.IntegerType(), True),
        T.StructField("CRSArrTime", T.IntegerType(), True),
        T.StructField("UniqueCarrier", T.StringType(), True),
        T.StructField("FlightNum", T.StringType(), True),
        T.StructField("TailNum", T.StringType(), True),
        T.StructField("ActualElapsedTime", T.StringType(), True),
        T.StructField("CRSElapsedTime", T.StringType(), True),
        T.StructField("AirTime", T.IntegerType(), True),
        T.StructField("ArrDelay", T.IntegerType(), True),
        T.StructField("DepDelay", T.IntegerType(), True),
        T.StructField("Origin", T.StringType(), True),
        T.StructField("Dest", T.StringType(), True),
        T.StructField("Distance", T.IntegerType(), True),
        T.StructField("TaxiIn", T.IntegerType(), True),
        T.StructField("TaxiOut", T.IntegerType(), True),
        T.StructField("Cancelled", T.IntegerType(), True),
        T.StructField("CancellationCode", T.IntegerType(), True),
        T.StructField("Diverted", T.IntegerType(), True),
        T.StructField("CarrierDelay", T.IntegerType(), True),
        T.StructField("WeatherDelay", T.IntegerType(), True),
        T.StructField("NASDelay", T.IntegerType(), True),
        T.StructField("SecurityDelay", T.IntegerType(), True),
        T.StructField("LateAircraftDelay", T.IntegerType(), True)
    ])

    delays = (spark.read.format('csv').schema(delays_schema).option(
        'header', True).option('nullValue', 'NA').load(
            's3://{}/dend/flights/1988*'.format(S3_BUCKET)).withColumnRenamed(
                'DayofMonth', 'DayOfMonth'))
    LOG.info('# flight delay entries: %d', delays.count())

    # - delays: fill missing delay components with 0 and compute a TotalDelay column
    # - join in region info from the merged airports dimension table

    delay_columns = [
        'ArrDelay', 'DepDelay', 'TaxiIn', 'TaxiOut', 'CarrierDelay',
        'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'
    ]

    delays = (delays.fillna(0, subset=delay_columns).withColumn(
        'TotalDelay', sum([F.col(col) for col in delay_columns])))

    # join in the `iso_region` field from the dimension table so that downstream
    # business users can analyze how flight delays relate to origin and destination regions
    delays = (delays.join(
        airports.select('iata', 'iso_region'),
        delays.Origin == airports.iata, 'left').withColumnRenamed(
            'iso_region', 'OriginRegion').drop('iata').join(
                airports.select('iata',
                                'iso_region'), delays.Dest == airports.iata,
                'left').withColumnRenamed('iso_region',
                                          'DestRegion').drop('iata'))
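
A side note on the TotalDelay line above: Python's built-in sum() folds a list of Column expressions with +, because 0 + Column dispatches to Column.__radd__. A toy sketch, with column names invented for illustration:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1, 2, 3), (4, 0, 6)], ['a', 'b', 'c'])
parts = ['a', 'b', 'c']
# builds a single nested `+` expression over the listed columns
toy = toy.withColumn('total', sum(F.col(c) for c in parts))
# total is 6 for the first row and 10 for the second
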
        assert (len(one_hot_row) == len(enumerated_labels))
        return one_hot_row

    # Write the one-hot-encoded questions to S3 as a parquet file
    one_hot_questions = questions_tags.rdd.map(lambda x: Row(
        _Body=x._Body, _Tags=one_hot_encode(x._Tags, enumerated_labels)))
    if DEBUG is True:
        print(one_hot_questions.take(10))
        # Verify we have multiple labels present
        print(
            one_hot_questions.sortBy(lambda x: sum(x._Tags),
                                     ascending=False).take(10))

    # Create a DataFrame for persisting as Parquet format
    schema = T.StructType([
        T.StructField("_Body", T.ArrayType(T.StringType())),
        T.StructField("_Tags", T.ArrayType(T.IntegerType()))
    ])

    one_hot_df = spark.createDataFrame(one_hot_questions, schema)
    one_hot_df.show()
    one_hot_df.write.mode('overwrite').parquet(
        PATHS['one_hot'][PATH_SET].format(tag_limit))
    one_hot_df = spark.read.parquet(
        PATHS['one_hot'][PATH_SET].format(tag_limit))

    def create_schema(one_row):
        schema_list = [
            T.StructField("_Body", T.ArrayType(T.StringType())),
        ]
        for i, val in list(enumerate(one_row._Tags)):
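
The create_schema helper is cut off above; what follows is a hedged sketch (not the original implementation) of the general pattern of building one IntegerType field per one-hot label, plus the _Body column. The field naming scheme label_{i} is an assumption.

from pyspark.sql import Row
from pyspark.sql import types as T

def create_schema_sketch(one_row):
    schema_list = [T.StructField("_Body", T.ArrayType(T.StringType()))]
    for i, _ in enumerate(one_row._Tags):
        # hypothetical field name; the original naming scheme is not shown above
        schema_list.append(T.StructField("label_{}".format(i), T.IntegerType()))
    return T.StructType(schema_list)

# e.g. create_schema_sketch(Row(_Body=['hello', 'world'], _Tags=[0, 1, 1]))
# returns a StructType with _Body plus three integer label fields
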
import pyspark.sql.functions as F
from pyspark.ml.feature import Bucketizer
from pyspark.sql import types
from pyspark.sql import Window

import model_utils as mu


DAYS_FROM_EULA_BINS = \
    [float('-Inf'), 6, 12, 21, 30, 45, 60, 80, 110, 140, 180, 250, 350, float('Inf')]
INT_TO_CHAR_BASELINE = 97

convert_to_char = F.udf(lambda x: chr(x), types.StringType())

TRIAL_SUCCESS_PAIR = types.StructType([
    types.StructField('trial', types.FloatType(), False),
    types.StructField('success', types.FloatType(), False)
])
pair_trial_success = F.udf(lambda t, s: (t, s), TRIAL_SUCCESS_PAIR)

calc_prob = F.udf(
    lambda trial, success, alpha, beta: (success + alpha) /
    (trial + alpha + beta), types.FloatType())


def get_table(sqlContext):
    return sqlContext.table('l2_sprint.mixpanel_home')


def load_received_notifications(events, start_date, end_date):
    # datetime objects are serializable only from spark 2.2.1
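
A short usage sketch of the UDFs defined above on a toy DataFrame; the column names trials/successes and the alpha/beta values are invented for illustration, and the sketch assumes pair_trial_success and calc_prob from this module are in scope.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(10.0, 3.0), (4.0, 4.0)], ['trials', 'successes'])
toy = (toy
       .withColumn('pair', pair_trial_success(F.col('trials'), F.col('successes')))
       .withColumn('prob', calc_prob(F.col('trials'), F.col('successes'),
                                     F.lit(1.0), F.lit(1.0))))
# 'pair' is a struct<trial:float, success:float> column;
# 'prob' is the smoothed success rate (successes + alpha) / (trials + alpha + beta)
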
Example #29
def get_dataset(dataset_type,
                data,
                schemas=None,
                profiler=ColumnsExistProfiler,
                caching=True):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note: these are just names used in our internal schemas to build datasets *for internal tests*.
                # Further, changes in pandas internals around how datetimes are created mean that, to support
                # pandas pre-0.25, we need to explicitly specify when we want a timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        from sqlalchemy import create_engine

        engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + "".join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "postgresql":
        from sqlalchemy import create_engine

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + "".join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mysql":
        from sqlalchemy import create_engine

        engine = create_engine("mysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + "".join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them
        # natively, so we rebuild the rows from the raw data rather than using the pandas df here.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # the first approach sometimes causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
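
The SparkDFDataset branch above looks up Spark type constructors by name before building a StructType; here is a minimal standalone sketch of that pattern, where the SPARK_TYPES mapping and the schema dict values are assumptions mirroring the code above.

import pyspark.sql.types as sparktypes

SPARK_TYPES = {
    "StringType": sparktypes.StringType,
    "IntegerType": sparktypes.IntegerType,
    "DoubleType": sparktypes.DoubleType,
}

schema = {"name": "StringType", "age": "IntegerType", "score": "DoubleType"}
spark_schema = sparktypes.StructType([
    sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
    for column in schema
])
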
import pyspark.sql.types as t

expected_schema = t.StructType([
    t.StructField("FIELD1", t.IntegerType()),
    t.StructField("FIELD2", t.DateType()),
    t.StructField(
        "STRUCT1",
        t.StructType([
            t.StructField("NESTED_FIELD1", t.StringType()),
            t.StructField(
                "STRUCT2",
                t.StructType([
                    t.StructField("NESTED_FIELD2", t.StringType()),
                ]),
            ),
        ]),
    ),
])
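
One common way to use such an expected schema in a test is a direct equality check against a DataFrame's schema; a minimal hedged sketch, with toy data invented for illustration and expected_schema from above assumed to be in scope:

import datetime
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, datetime.date(2020, 1, 1), ("a", ("b",)))],
    schema=expected_schema)
assert df.schema == expected_schema
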