#types.StructField('created_utc', types.StringType(), False),
#types.StructField('distinguished', types.StringType(), False),
#types.StructField('downs', types.LongType(), False),
#types.StructField('edited', types.StringType(), False),
#types.StructField('gilded', types.LongType(), False),
#types.StructField('id', types.StringType(), False),
#types.StructField('link_id', types.StringType(), False),
#types.StructField('name', types.StringType(), False),
#types.StructField('parent_id', types.StringType(), True),
#types.StructField('retrieved_on', types.LongType(), False),
#types.StructField('score', types.LongType(), False),
#types.StructField('score_hidden', types.BooleanType(), False),
#types.StructField('subreddit', types.StringType(), False),
#types.StructField('subreddit_id', types.StringType(), False),
#types.StructField('ups', types.LongType(), False),
    types.StructField('lang', types.StringType(), False),
    types.StructField('title', types.StringType(), False),
    types.StructField('visit_count', types.LongType(), False),
    types.StructField('data_size', types.LongType(), False),
])


def pathToTime(path):
    start = path.rfind("pagecounts-") + 11
    end = path.rfind(".") - 4
    return path[start:end]


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
def sqlType(cls):
    # NB: this is actually an instance method in practice O_O !
    return types.StructType([
        types.StructField("np_bytes", types.BinaryType(), False)
    ])
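# Context sketch (not from the original source): a sqlType() like the one above
# usually lives on a pyspark.sql.types.UserDefinedType subclass. The class name
# 'NumpyArrayUDT' and the float64 dtype below are assumptions for illustration.
import numpy as np
from pyspark.sql import types


class NumpyArrayUDT(types.UserDefinedType):  # hypothetical wrapper for a NumPy array
    @classmethod
    def sqlType(cls):
        return types.StructType([
            types.StructField("np_bytes", types.BinaryType(), False)
        ])

    @classmethod
    def module(cls):
        return "__main__"

    def serialize(self, obj):
        # store the array as its raw buffer; shape/dtype are assumed to be fixed
        return (obj.tobytes(),)

    def deserialize(self, datum):
        return np.frombuffer(datum[0], dtype=np.float64)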
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('id', types.IntegerType(), False),
    types.StructField('x', types.FloatType(), False),
    types.StructField('y', types.FloatType(), False),
    types.StructField('z', types.FloatType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    #xyz.show(); return

    # Create a DF with what we need: x, y, and id%10 which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
def __init__(self, tpe):
    # Seems we cannot specify field names. I currently gave some default names
    # `c0, c1, ... cn`.
    self.tpe = types.StructType([
        types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))
    ])  # type: types.StructType
import helloworld
import pytest
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Create some example data
data = [
    ('Batmobile', 7, 5.0),
    ('Catmobile', 3, 9.0),
    ('', 4, 21.0),
]

custom_schema = types.StructType()
custom_schema.add(types.StructField("Vehicle", types.StringType()))
custom_schema.add(types.StructField("wheels", types.IntegerType()))
custom_schema.add(types.StructField("speed", types.FloatType()))

df = spark.createDataFrame(data, custom_schema)


def test_add():
    assert (helloworld.add(1, 1) == 2)


def test_data_count_check():
    assert (df.count() == 3)
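# A possible refinement (a sketch, not part of the original tests): build the
# SparkSession in a session-scoped pytest fixture so it is created once per test
# run and stopped afterwards, instead of at import time.
@pytest.fixture(scope="session")
def spark_session():
    session = SparkSession.builder.appName("tests").getOrCreate()
    yield session
    session.stop()


def test_data_count_with_fixture(spark_session):
    sample = spark_session.createDataFrame(data, custom_schema)
    assert sample.count() == 3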
from pyspark.sql import SparkSession, types, functions
from pyspark.sql.functions import *
import pandas as pd
import elevation_grid as eg
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from pyspark.ml import PipelineModel
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName('weather_plot').getOrCreate()
assert spark.version >= '2.3'  # make sure we have Spark 2.3+
spark.sparkContext.setLogLevel('WARN')

tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.FloatType()),
])

elevation_schema = types.StructType([
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('date', types.DateType()),
    types.StructField('tmax', types.FloatType())
])
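# A hedged sketch of how these schemas are typically used later in this script:
# read the tmax data and score it with a previously trained pipeline model.
# The 'weather-model' and 'tmax' paths are assumptions, not from the original.
def load_and_predict(model_path='weather-model', data_path='tmax'):
    data = spark.read.csv(data_path, schema=tmax_schema)
    model = PipelineModel.load(model_path)
    return model.transform(data)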
def test_exclude_key_columns(self): df = self.spark.createDataFrame( data=[ ('k1', 'k14', [1, 14, 141]), ('k1', 'k12', [1, 12, 121]), ('k1', 'k11', [1, 11, 111]), ('k1', 'k13', [1, 13, 131]), ], schema=T.StructType([ T.StructField('key_1', T.StringType()), T.StructField('key_2', T.StringType()), T.StructField('aux_data', T.ArrayType(T.IntegerType())), ]) ) redis_client = redis.StrictRedis('redis.docker') # simple key df.write_ext.redis( key_by=['key_2'], key_prefix='hello', exclude_key_columns=True, host='redis.docker', ) self.assertRowsEqual( redis_client.keys(), [b'hello.k11', b'hello.k12', b'hello.k13', b'hello.k14'], ignore_order=True, ) written_data = [ json.loads(redis_client.get(key)) for key in ['hello.k11', 'hello.k12', 'hello.k13', 'hello.k14'] ] expected = [ {'key_1': 'k1', 'aux_data': [1, 11, 111]}, {'key_1': 'k1', 'aux_data': [1, 12, 121]}, {'key_1': 'k1', 'aux_data': [1, 13, 131]}, {'key_1': 'k1', 'aux_data': [1, 14, 141]}, ] self.assertEqual(written_data, expected) redis_client.flushdb() # composite key df.write_ext.redis( key_by=['key_1', 'key_2'], key_prefix='hello', exclude_key_columns=True, host='redis.docker', ) self.assertRowsEqual( redis_client.keys(), [b'hello.k1.k11', b'hello.k1.k12', b'hello.k1.k13', b'hello.k1.k14'], ignore_order=True, ) written_data = [ json.loads(redis_client.get(key)) for key in ['hello.k1.k11', 'hello.k1.k12', 'hello.k1.k13', 'hello.k1.k14'] ] expected = [ {'aux_data': [1, 11, 111]}, {'aux_data': [1, 12, 121]}, {'aux_data': [1, 13, 131]}, {'aux_data': [1, 14, 141]}, ] self.assertEqual(written_data, expected)
# pylint: disable = invalid-name import pyspark.sql.types as t from datalakebundle.table.schema.TableSchemaGenerator import TableSchemaGenerator schema = t.StructType([ t.StructField("FIELD1", t.IntegerType()), t.StructField("FIELD2", t.DoubleType()), t.StructField("FIELD3", t.DoubleType()), t.StructField( "STRUCT1", t.StructType([ t.StructField("NESTED_FIELD1", t.StringType()), t.StructField( "STRUCT2", t.StructType([ t.StructField("NESTED_FIELD2", t.StringType()), ], ), ), ], ), ), ], ) expected_result = """def get_schema(): return dp.TableSchema( [ t.StructField("FIELD1", t.IntegerType()), t.StructField("FIELD2", t.DoubleType()), t.StructField("FIELD3", t.DoubleType()), t.StructField( "STRUCT1", t.StructType(
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType()),
        types.StructField('author', types.StringType()),
        types.StructField('author_flair_css_class', types.StringType()),
        types.StructField('author_flair_text', types.StringType()),
        types.StructField('body', types.StringType()),
        types.StructField('controversiality', types.LongType()),
        types.StructField('created_utc', types.StringType()),
        types.StructField('distinguished', types.StringType()),
        types.StructField('downs', types.LongType()),
        types.StructField('edited', types.StringType()),
        types.StructField('gilded', types.LongType()),
        types.StructField('id', types.StringType()),
        types.StructField('link_id', types.StringType()),
        types.StructField('name', types.StringType()),
        types.StructField('parent_id', types.StringType()),
        types.StructField('retrieved_on', types.LongType()),
        types.StructField('score', types.LongType()),
        types.StructField('score_hidden', types.BooleanType()),
        types.StructField('subreddit', types.StringType()),
        types.StructField('subreddit_id', types.StringType()),
        types.StructField('ups', types.LongType()),
        # types.StructField('year', types.IntegerType()),
        # types.StructField('month', types.IntegerType()),
    ])

    df = spark.read.json(inputs, schema=comments_schema)
    averages = df.groupBy(df['subreddit']).agg(
        functions.avg(df['score']).alias('average_score'))
    averages.write.csv(output, mode='overwrite')
    averages.explain()
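# The function above references a module-level `spark` session; a typical entry
# point for such a script (an assumption, not shown in the snippet, with the
# usual `sys` and `pyspark.sql` imports at the top of the file) would be:
if __name__ == '__main__':
    spark = SparkSession.builder.appName('reddit averages').getOrCreate()
    main(sys.argv[1], sys.argv[2])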
# Row(id=2, name='Melanie', dob='1963-10-21', chelsea_fan=False)] dataA = [ Row(id=1, name='Alan', dob='1962-11-25', chelsea_fan=True), Row(id=2, name='Melanie', dob='1963-10-21', chelsea_fan=False) ] dfA = spark.createDataFrame(dataA) print('dfA is ...') print(dfA) print('dfA.collect() is ...') print(dfA.collect()) # produce a Dataframe from a list of Row objects (schema is supplied) # schema has 3rd argument saying if field can contain nulls # DataFrame[people_id: int, name: string, dob: date, chelsea_fan: boolean] schema = T.StructType([ T.StructField("person_id", T.IntegerType(), True), T.StructField("name", T.StringType(), True), T.StructField("dob", T.DateType(), True), T.StructField("chelsea_fan", T.BooleanType(), True), ]) data = [ Row(person_id=1, name='Alan', dob=date(1962, 11, 25), chelsea_fan=True), Row(person_id=2, name='Melanie', dob=date(1963, 10, 21), chelsea_fan=False) ] dfPersons1 = spark.createDataFrame(data, schema) print('dfPersons1 is ...') print(dfPersons1) # produce a Dataframe from a list of Dictionaries (schema is supplied) # DataFrame[people_id: int, name: string, dob: date, chelsea_fan: boolean] schema = T.StructType([
import sys from pyspark.sql import SparkSession, functions, types spark = SparkSession.builder.appName('reddit relative scores').getOrCreate() spark.sparkContext.setLogLevel('WARN') assert sys.version_info >= (3, 5) # make sure we have Python 3.5+ assert spark.version >= '2.3' # make sure we have Spark 2.3+ comments_schema = types.StructType([ types.StructField('archived', types.BooleanType()), types.StructField('author', types.StringType()), types.StructField('author_flair_css_class', types.StringType()), types.StructField('author_flair_text', types.StringType()), types.StructField('body', types.StringType()), types.StructField('controversiality', types.LongType()), types.StructField('created_utc', types.StringType()), types.StructField('distinguished', types.StringType()), types.StructField('downs', types.LongType()), types.StructField('edited', types.StringType()), types.StructField('gilded', types.LongType()), types.StructField('id', types.StringType()), types.StructField('link_id', types.StringType()), types.StructField('name', types.StringType()), types.StructField('parent_id', types.StringType()), types.StructField('retrieved_on', types.LongType()), types.StructField('score', types.LongType()), types.StructField('score_hidden', types.BooleanType()), types.StructField('subreddit', types.StringType()), types.StructField('subreddit_id', types.StringType()), types.StructField('ups', types.LongType()),
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types, Row

spark = SparkSession.builder.appName(
    'OSM point of interest extracter').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
spark.conf.set("spark.sql.session.timeZone", "UTC")

from lxml import etree
import dateutil.parser
#import datetime

amenity_schema = types.StructType([
    types.StructField('lat', types.DoubleType(), nullable=False),
    types.StructField('lon', types.DoubleType(), nullable=False),
    types.StructField('unix_time', types.DoubleType(), nullable=False),
    #types.StructField('timestamp', types.TimestampType(), nullable=False),
    types.StructField('amenity', types.StringType(), nullable=False),
    types.StructField('name', types.StringType(), nullable=True),
    types.StructField('tags', types.MapType(types.StringType(), types.StringType()), nullable=False),
])


def get_amenities(line):
    root = etree.fromstring(line)
    if root.tag != 'node':
        return
os.environ["PYSPARK_DRIVER_PYTHON"] = "python3" cluster_seeds = ['199.60.17.171', '199.60.17.188'] cluster_seeds = ['199.60.17.171', '199.60.17.188'] conf = SparkConf().setAppName('example code') \ .set('spark.cassandra.connection.host', ','.join(cluster_seeds)) spark = SparkSession.builder.appName('Big Data Project').getOrCreate() sc = spark.sparkContext assert sys.version_info >= (3, 4) # make sure we have Python 3.4+ assert spark.version >= '2.2' # make sure we have Spark 2.2+ schema = types.StructType([ types.StructField('county_code', types.StringType(), True), types.StructField('month', types.StringType(), True), types.StructField('year', types.StringType(), True), types.StructField('observation_count', types.DoubleType(), True), types.StructField('observation_percent', types.DoubleType(), True), types.StructField('max_value', types.DoubleType(), True), types.StructField('max_hour', types.DoubleType(), True), types.StructField('arithmetic_mean', types.DoubleType(), True), types.StructField('am_wind', types.DoubleType(), True), types.StructField('am_temp', types.DoubleType(), True), types.StructField('am_rh', types.DoubleType(), True), types.StructField('am_press', types.DoubleType(), True) ]) itr = 0 for j in ['44201', '42401']:
import sys, re, uuid
from datetime import datetime
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types

cluster_seeds = ['199.60.17.32', '199.60.17.65']
spark = SparkSession.builder.appName('Spark Cassandra example') \
    .config('spark.cassandra.connection.host', ','.join(cluster_seeds)).getOrCreate()
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

line_re = re.compile(
    r'^(\S+) - - \[(\S+ [+-]\d+)\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$')

schema = types.StructType([
    #types.StructField('id', types.StringType()),
    types.StructField('host', types.StringType()),
    types.StructField('datetime', types.TimestampType()),
    types.StructField('path', types.StringType()),
    types.StructField('bytes', types.IntegerType())
])


def read_line(line):
    m = line_re.match(line)
    if m is None:
        return None
    return (m.group(1), datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S %z'),
            m.group(3), int(m.group(4)))


def main(input_dir, keyspace, table):
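    # Sketch of how this main() likely continues (assumed, not the original body):
    # parse each log line, drop lines that do not match, build a DataFrame with
    # the schema above, and append it to Cassandra via the spark-cassandra-connector.
    rows = sc.textFile(input_dir).map(read_line).filter(lambda r: r is not None)
    log_df = spark.createDataFrame(rows, schema=schema)
    log_df.write.format("org.apache.spark.sql.cassandra") \
        .options(keyspace=keyspace, table=table) \
        .mode("append").save()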
import sys
import datetime as dt

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions, types

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

conf = SparkConf().setAppName('example code')
sc = SparkContext(conf=conf)
spark = SparkSession.builder.appName('example code').getOrCreate()
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

####################### aggregate transformation: reduce scalar in rdd
schema = [
    ('Date', types.StringType()),
    ('Region', types.StringType()),
    ('Rep', types.StringType()),
    ('Item', types.StringType()),
    ('Units', types.IntegerType()),
    ('Unit Cost', types.DoubleType()),
    ('total', types.DoubleType()),
]
schema_sales = types.StructType([types.StructField(e[0], e[1], False) for e in schema])

sales_df = spark.read.csv('sales.csv', header=True, schema=schema_sales)
sales_df.select('Region', 'Rep').distinct().orderBy(sales_df.Region, sales_df.Rep).show()
os.environ["PYSPARK_DRIVER_PYTHON"] = "python3" cluster_seeds = ['199.60.17.171', '199.60.17.188'] cluster_seeds = ['199.60.17.171', '199.60.17.188'] conf = SparkConf().setAppName('example code') \ .set('spark.cassandra.connection.host', ','.join(cluster_seeds)) spark = SparkSession.builder.appName('Big Data Project').getOrCreate() sc = spark.sparkContext assert sys.version_info >= (3, 4) # make sure we have Python 3.4+ assert spark.version >= '2.2' # make sure we have Spark 2.2+ schema = types.StructType([ types.StructField('county_code', types.IntegerType(), True), types.StructField('month', types.IntegerType(), True), types.StructField('year', types.IntegerType(), True), types.StructField('am_wind', types.DoubleType(), True) ]) train_final = spark.createDataFrame(sc.emptyRDD(), schema=schema) for year in range(2013, 2018): support = spark.read.csv( "/home/ldua/Desktop/BigDataProject/support/daily_WIND_" + str(year) + ".csv", header=True) support_f = support.select( 'County Code', 'Date Local',
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('weather ETL').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

observation_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('observation', types.StringType()),
    types.StructField('value', types.IntegerType()),
    types.StructField('mflag', types.StringType()),
    types.StructField('qflag', types.StringType()),
    types.StructField('sflag', types.StringType()),
    types.StructField('obstime', types.StringType()),
])


def main(in_directory, out_directory):
    weather = spark.read.csv(in_directory, schema=observation_schema)

    # TODO: finish here.
    weather = weather.filter(weather.qflag.isNull())
    weather = weather.filter(weather.station.startswith('CA'))
    weather = weather.filter(weather.observation == 'TMAX')
    cleaned_data = weather.select(weather['station'], weather['date'],
def preprocessing(spark: SparkSession, pppath: Path, datadir: Path):

    def prepro(s5: DataFrame) -> DataFrame:
        stages = []
        catvars = ['dept_id', 'item_id', 'store_id', 'wday']
        for v in catvars:
            stages += [StringIndexer(inputCol=v, outputCol=f"i{v}")]
        stages += [OneHotEncoderEstimator(inputCols=[f"i{v}" for v in catvars],
                                          outputCols=[f"v{v}" for v in catvars])]
        stages += [VectorAssembler(inputCols=['vwday', 'vitem_id', 'vdept_id', 'vstore_id',
                                              'flag_ram', 'snap', 'dn', 'month', 'year'],
                                   outputCol='features')]
        pip: Pipeline = Pipeline(stages=stages)
        pipm = pip.fit(s5)
        df: DataFrame = pipm.transform(s5)
        return df.drop('idept_id', 'iitem_id', 'istore_id', 'iwday',
                       'vdept_id', 'vitem_id', 'vstore_id', 'vwday')

    print("--- preprocessing -----------------------")

    schema = t.StructType([
        t.StructField('year', t.IntegerType(), True),
        t.StructField('month', t.IntegerType(), True),
        t.StructField('dn', t.IntegerType(), True),
        t.StructField('wday', t.IntegerType(), True),
        t.StructField('snap', t.IntegerType(), True),
        t.StructField('dept_id', t.StringType(), True),
        t.StructField('item_id', t.StringType(), True),
        t.StructField('store_id', t.StringType(), True),
        t.StructField('sales', t.DoubleType(), True),
        t.StructField('flag_ram', t.IntegerType(), True),
        t.StructField('Sales_Pred', t.DoubleType(), True)
    ])

    csv_path = datadir / "Sales5_Ab2011_InklPred.csv"
    print(f"--- Reading: '{csv_path}'")
    sales5: DataFrame = spark.read.csv(str(csv_path), header='true', schema=schema) \
        .withColumn("label", f.col('sales'))
    ppdf = prepro(sales5)
    print(f"--- Writing: '{pppath}'")
    ppdf.write \
        .format("parquet") \
        .mode("overwrite") \
        .save(str(pppath))
def test_invalid_input(self): df = self.spark.createDataFrame( data=[], schema=T.StructType([T.StructField('key_1', T.StringType())]), ) with six.assertRaisesRegex( self, AssertionError, 'redis: url must define keyBy columns to construct redis key', ): df.write_ext.by_url('redis://redis.docker') with six.assertRaisesRegex( self, ValueError, 'redis: true and false \(default\) are the only supported groupByKey values', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&groupByKey=tru') with six.assertRaisesRegex( self, ValueError, 'redis: true and false \(default\) are the only supported excludeKeyColumns ' 'values', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&excludeKeyColumns=tru') with six.assertRaisesRegex( self, ValueError, 'redis: expire must be positive', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&expire=0') with six.assertRaisesRegex( self, ValueError, 'redis: expire must be a base 10, positive integer', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&expire=0x11') with six.assertRaisesRegex( self, ValueError, 'redis: bzip2, gzip and zlib are the only supported compression codecs', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&compression=snappy') with six.assertRaisesRegex( self, ValueError, 'redis: max pipeline size must be positive', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&maxPipelineSize=0') with six.assertRaisesRegex( self, ValueError, 'redis: maxPipelineSize must be a base 10, positive integer', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&maxPipelineSize=0x11') with six.assertRaisesRegex( self, ValueError, 'redis: only append \(default\), ignore and overwrite modes are supported', ): df.write_ext.by_url('redis://redis.docker?keyBy=key_1&mode=error')
import sys
import string, re

from pyspark.sql import SparkSession, types
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, desc, udf, to_json
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    # commented-out fields won't be read
    types.StructField('body', types.StringType(), False),
    types.StructField('subreddit', types.StringType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    comments.cache()

    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation + '0123456789'),)

    # NLP processing code adapted from https://spark.apache.org/docs/latest/ml-features.html
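    # Sketch of the tokenize/stop-word step the comment above refers to (the
    # column names 'words' and 'filtered' are assumptions for illustration).
    tokenizer = RegexTokenizer(inputCol='body', outputCol='words', pattern=wordbreak)
    words = tokenizer.transform(comments)
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    filtered = remover.transform(words)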
def test_mode(self): redis_client = redis.StrictRedis('redis.docker') redis_client.set('k11', '"hey!"') redis_client.set('k13', '"you!"') redis_client.set('k14', '"brick!"') df = self.spark.createDataFrame( data=[ ('k1', 'k14', [1, 14, 141]), ('k1', 'k12', [1, 12, 121]), ('k1', 'k11', [1, 11, 111]), ('k1', 'k13', [1, 13, 131]), ], schema=T.StructType([ T.StructField('key_1', T.StringType()), T.StructField('key_2', T.StringType()), T.StructField('aux_data', T.ArrayType(T.IntegerType())), ]) ) # test ignore df.write_ext.redis( key_by=['key_2'], mode='ignore', host='redis.docker', ) self.assertRowsEqual( redis_client.keys(), [b'k11', b'k12', b'k13', b'k14'], ignore_order=True, ) written_data = [ json.loads(redis_client.get(key)) for key in ['k11', 'k12', 'k13', 'k14'] ] expected = [ 'hey!', {'key_1': 'k1', 'key_2': 'k12', 'aux_data': [1, 12, 121]}, 'you!', 'brick!', ] self.assertEqual(written_data, expected) # test append df.write_ext.redis( key_by=['key_2'], mode='append', host='redis.docker', ) self.assertRowsEqual( redis_client.keys(), [b'k11', b'k12', b'k13', b'k14'], ignore_order=True, ) written_data = [ json.loads(redis_client.get(key)) for key in ['k11', 'k12', 'k13', 'k14'] ] expected = [ {'key_1': 'k1', 'key_2': 'k11', 'aux_data': [1, 11, 111]}, {'key_1': 'k1', 'key_2': 'k12', 'aux_data': [1, 12, 121]}, {'key_1': 'k1', 'key_2': 'k13', 'aux_data': [1, 13, 131]}, {'key_1': 'k1', 'key_2': 'k14', 'aux_data': [1, 14, 141]}, ] self.assertEqual(written_data, expected) # test overwrite df.where(F.col('key_2') == 'k11').write_ext.redis( key_by=['key_2'], mode='overwrite', host='redis.docker', ) self.assertEqual(redis_client.keys(), [b'k11']) written_data = [json.loads(redis_client.get('k11'))] expected = [ {'key_1': 'k1', 'key_2': 'k11', 'aux_data': [1, 11, 111]}, ] self.assertEqual(written_data, expected)
def infer_return_type( f: Callable ) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]: """ Infer the return type from the return type annotation of the given function. The returned type class indicates both dtypes (a pandas only dtype object or a numpy dtype object) and its corresponding Spark DataType. >>> def func() -> int: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType() >>> def func() -> ps.Series[int]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType() >>> def func() -> ps.DataFrame[np.float, str]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('<U')] >>> inferred.spark_type StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)]) >>> def func() -> ps.DataFrame[np.float]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64')] >>> inferred.spark_type StructType([StructField('c0', DoubleType(), True)]) >>> def func() -> 'int': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType() >>> def func() -> 'ps.Series[int]': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType() >>> def func() -> 'ps.DataFrame[np.float, str]': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('<U')] >>> inferred.spark_type StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)]) >>> def func() -> 'ps.DataFrame[np.float]': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64')] >>> inferred.spark_type StructType([StructField('c0', DoubleType(), True)]) >>> def func() -> ps.DataFrame['a': np.float, 'b': int]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('int64')] >>> inferred.spark_type StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)]) >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]": ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('int64')] >>> inferred.spark_type StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)]) >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) >>> def func() -> ps.DataFrame[pdf.dtypes]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64')] >>> inferred.spark_type StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)]) >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64')] >>> inferred.spark_type StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)]) >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]}) >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64')] >>> inferred.spark_type StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)]) >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])}) >>> def func() -> ps.DataFrame[pdf.dtypes]: ... 
pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)]) >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)]) >>> def func() -> ps.Series[pdf.b.dtype]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype CategoricalDtype(categories=[3, 4, 5], ordered=False) >>> inferred.spark_type LongType() >>> def func() -> ps.DataFrame[int, [int, int]]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64'), dtype('int64')] >>> inferred.spark_type.simpleString() 'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>' >>> inferred.index_fields [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))] >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type.simpleString() 'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>' >>> inferred.index_fields [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))] >>> def func() -> ps.DataFrame[ ... ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)), ... [("id", int), ("A", int)]]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')] >>> inferred.spark_type.simpleString() 'struct<index:bigint,id:bigint,A:bigint>' >>> inferred.index_fields [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))] >>> def func() -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type.simpleString() 'struct<__index_level_0__:bigint,a:bigint,b:bigint>' >>> inferred.index_fields [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))] """ # We should re-import to make sure the class 'SeriesType' is not treated as a class # within this module locally. See Series.__class_getitem__ which imports this class # canonically. from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder from pyspark.pandas.utils import name_like_string tpe = get_type_hints(f).get("return", None) if tpe is None: raise ValueError("A return value is required for the input function") if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType): tpe = tpe.__args__[0] if issubclass(tpe, NameTypeHolder): tpe = tpe.tpe dtype, spark_type = pandas_on_spark_type(tpe) return SeriesType(dtype, spark_type) # Note that, DataFrame type hints will create a Tuple. # Tuple has _name but other types have __name__ name = getattr(tpe, "_name", getattr(tpe, "__name__", None)) # Check if the name is Tuple. 
if name == "Tuple": tuple_type = tpe parameters = getattr(tuple_type, "__args__") index_parameters = [ p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder) ] data_parameters = [p for p in parameters if p not in index_parameters] assert len( data_parameters) > 0, "Type hints for data must not be empty." index_fields = [] if len(index_parameters) >= 1: for level, index_parameter in enumerate(index_parameters): index_name = index_parameter.name index_dtype, index_spark_type = pandas_on_spark_type( index_parameter.tpe) index_fields.append( InternalField( dtype=index_dtype, struct_field=types.StructField( name=index_name if index_name is not None else SPARK_INDEX_NAME_FORMAT(level), dataType=index_spark_type, ), )) else: # No type hint for index. assert len(index_parameters) == 0 data_dtypes, data_spark_types = zip( *(pandas_on_spark_type(p.tpe) if isclass(p) and issubclass(p, NameTypeHolder) else pandas_on_spark_type(p) for p in data_parameters)) data_names = [ p.name if isclass(p) and issubclass(p, NameTypeHolder) else None for p in data_parameters ] data_fields = [] for i, (data_name, data_dtype, data_spark_type) in enumerate( zip(data_names, data_dtypes, data_spark_types)): data_fields.append( InternalField( dtype=data_dtype, struct_field=types.StructField( name=name_like_string(data_name) if data_name is not None else ("c%s" % i), dataType=data_spark_type, ), )) return DataFrameType(index_fields=index_fields, data_fields=data_fields) tpes = pandas_on_spark_type(tpe) if tpes is None: return UnknownType(tpe) else: return ScalarType(*tpes)
def main(inputs, output): # main logic starts here comments_schema = types.StructType([ # commented-out fields won't be read types.StructField('archived', types.BooleanType(), True), types.StructField('author', types.StringType(), True), types.StructField('author_flair_css_class', types.StringType(), True), types.StructField('author_flair_text', types.StringType(), True), types.StructField('body', types.StringType(), True), types.StructField('controversiality', types.LongType(), True), types.StructField('created_utc', types.StringType(), True), types.StructField('distinguished', types.StringType(), True), types.StructField('downs', types.LongType(), True), types.StructField('edited', types.StringType(), True), types.StructField('gilded', types.LongType(), True), types.StructField('id', types.StringType(), True), types.StructField('link_id', types.StringType(), True), types.StructField('name', types.StringType(), True), types.StructField('parent_id', types.StringType(), True), types.StructField('retrieved_on', types.LongType(), True), types.StructField('score', types.LongType(), True), types.StructField('score_hidden', types.BooleanType(), True), types.StructField('subreddit', types.StringType(), True), types.StructField('subreddit_id', types.StringType(), True), types.StructField('ups', types.LongType(), True), #types.StructField('year', types.IntegerType(), False), #types.StructField('month', types.IntegerType(), False), ]) comments = spark.read.json(inputs, schema=comments_schema) average_func = {'score': 'avg'} comments_average = comments.groupby( comments['subreddit']).agg(average_func) averages = comments_average.sort(comments['subreddit'], ascending=True) averages.write.csv(output, mode='overwrite')
def prepare_align_udf(dsalign_args, alphabet_path, max_length_ms, max_silence_length_ms): args = dsalign_args ALIGN_RETURN_TYPE = T.StructType([ T.StructField("start_ms", T.ArrayType(T.LongType())), T.StructField("end_ms", T.ArrayType(T.LongType())), T.StructField("label", T.ArrayType(T.StringType())), T.StructField("cer", T.ArrayType(T.FloatType())), T.StructField("wer", T.ArrayType(T.FloatType())), T.StructField("hypotheses", T.ArrayType(T.StringType())), # T.StructField("sws", T.ArrayType(T.FloatType())), # T.StructField("levenshtein", T.ArrayType(T.FloatType())), ]) @F.pandas_udf(ALIGN_RETURN_TYPE) def align_table( name_series: pd.Series, audio_name_series: pd.Series, transcript_series: pd.Series, ctm_content_series: pd.Series, ) -> pd.DataFrame: alphabet = Alphabet(alphabet_path) silence_words = frozenset(["<unk>", "[laughter]", "[noise]"]) result_dict = { "start_ms": [], "end_ms": [], "label": [], "cer": [], "wer": [], "hypotheses": [], # "sws": [], # "levenshtein": [] } for name, audio_name, ctm_content, transcript in zip( name_series, audio_name_series, ctm_content_series, transcript_series): print(f"GALVEZ:name={name}") print(f"GALVEZ:audio_name={audio_name}") fragments = join_fragments( parse_ctm(ctm_content, silence_words), max_length_ms, max_silence_length_ms, audio_name, ) # timeout after 200 seconds output = timeout(align, (args, fragments, transcript, alphabet), timeout_duration=200) if output is None: print( f"GALVEZ: timed out for name={name} audio_name={audio_name}" ) if output is not None: _, _, _, aligned_results = output start_times = [] end_times = [] labels = [] cers = [] wers = [] hypotheses = [] # swses = [] # levenshteins = [] for result in aligned_results: start_times.append(result["start"]) end_times.append(result["end"]) labels.append(result["aligned"]) hypotheses.append(result["transcript"]) cers.append(result["cer"]) wers.append(result["wer"]) # swses.append(result['sws']) # levenshteins.append(result['levenshtein']) # aligned-raw includes tokens that are not part of the alphabet. # We would like to exclude those. # labels.append(result['aligned-raw']) result_dict["start_ms"].append(start_times) result_dict["end_ms"].append(end_times) result_dict["label"].append(labels) result_dict["cer"].append(cers) result_dict["wer"].append(wers) result_dict["hypotheses"].append(hypotheses) # result_dict["sws"].append(swses) # result_dict["levenshtein"].append(levenshteins) else: result_dict["start_ms"].append([]) result_dict["end_ms"].append([]) result_dict["label"].append([]) result_dict["cer"].append([]) result_dict["wer"].append([]) result_dict["hypotheses"].append([]) # result_dict["sws"].append([]) # result_dict["levenshtein"].append([]) return pd.DataFrame(result_dict) return align_table
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]: """ Infer the return type from the return type annotation of the given function. The returned type class indicates both dtypes (a pandas only dtype object or a numpy dtype object) and its corresponding Spark DataType. >>> def func() -> int: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType >>> def func() -> ps.Series[int]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType >>> def func() -> ps.DataFrame[np.float, str]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('<U')] >>> inferred.spark_type StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true))) >>> def func() -> ps.DataFrame[np.float]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64')] >>> inferred.spark_type StructType(List(StructField(c0,DoubleType,true))) >>> def func() -> 'int': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType >>> def func() -> 'ps.Series[int]': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype dtype('int64') >>> inferred.spark_type LongType >>> def func() -> 'ps.DataFrame[np.float, str]': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('<U')] >>> inferred.spark_type StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true))) >>> def func() -> 'ps.DataFrame[np.float]': ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64')] >>> inferred.spark_type StructType(List(StructField(c0,DoubleType,true))) >>> def func() -> ps.DataFrame['a': np.float, 'b': int]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('int64')] >>> inferred.spark_type StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true))) >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]": ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('float64'), dtype('int64')] >>> inferred.spark_type StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true))) >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) >>> def func() -> ps.DataFrame[pdf.dtypes]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64')] >>> inferred.spark_type StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true))) >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64')] >>> inferred.spark_type StructType(List(StructField(a,LongType,true),StructField(b,LongType,true))) >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]}) >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64')] >>> inferred.spark_type StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true))) >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])}) >>> def func() -> ps.DataFrame[pdf.dtypes]: ... 
pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true))) >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type StructType(List(StructField(a,LongType,true),StructField(b,LongType,true))) >>> def func() -> ps.Series[pdf.b.dtype]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtype CategoricalDtype(categories=[3, 4, 5], ordered=False) >>> inferred.spark_type LongType >>> def func() -> ps.DataFrame[int, [int, int]]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64'), dtype('int64')] >>> inferred.spark_type.simpleString() 'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>' >>> inferred.index_field InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true)) >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type.simpleString() 'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>' >>> inferred.index_field InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true)) >>> def func() -> ps.DataFrame[ ... ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)), ... [("id", int), ("A", int)]]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')] >>> inferred.spark_type.simpleString() 'struct<index:bigint,id:bigint,A:bigint>' >>> inferred.index_field InternalField(dtype=category,struct_field=StructField(index,LongType,true)) >>> def func() -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]: ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] >>> inferred.spark_type.simpleString() 'struct<__index_level_0__:bigint,a:bigint,b:bigint>' >>> inferred.index_field InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true)) """ # We should re-import to make sure the class 'SeriesType' is not treated as a class # within this module locally. See Series.__class_getitem__ which imports this class # canonically. from pyspark.pandas.internal import InternalField, SPARK_DEFAULT_INDEX_NAME from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder from pyspark.pandas.utils import name_like_string spec = getfullargspec(f) tpe = spec.annotations.get("return", None) if isinstance(tpe, str): # This type hint can happen when given hints are string to avoid forward reference. tpe = resolve_string_type_hint(tpe) if hasattr(tpe, "__origin__") and ( tpe.__origin__ == ps.DataFrame or tpe.__origin__ == ps.Series ): # When Python version is lower then 3.7. Unwrap it to a Tuple/SeriesType type hints. 
tpe = tpe.__args__[0] if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType): tpe = tpe.__args__[0] if issubclass(tpe, NameTypeHolder): tpe = tpe.tpe dtype, spark_type = pandas_on_spark_type(tpe) return SeriesType(dtype, spark_type) # Note that, DataFrame type hints will create a Tuple. # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`. # Check if the name is Tuple. name = getattr(tpe, "_name", getattr(tpe, "__name__", None)) if name == "Tuple": tuple_type = tpe if hasattr(tuple_type, "__tuple_params__"): # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead. # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py parameters = getattr(tuple_type, "__tuple_params__") else: parameters = getattr(tuple_type, "__args__") index_parameters = [p for p in parameters if issubclass(p, IndexNameTypeHolder)] data_parameters = [p for p in parameters if p not in index_parameters] assert len(data_parameters) > 0, "Type hints for data must not be empty." if len(index_parameters) == 1: index_name = index_parameters[0].name index_dtype, index_spark_type = pandas_on_spark_type(index_parameters[0].tpe) index_field = InternalField( dtype=index_dtype, struct_field=types.StructField( name=index_name if index_name is not None else SPARK_DEFAULT_INDEX_NAME, dataType=index_spark_type, ), ) else: assert len(index_parameters) == 0 # No type hint for index. index_field = None data_dtypes, data_spark_types = zip( *( pandas_on_spark_type(p.tpe) if isclass(p) and issubclass(p, NameTypeHolder) else pandas_on_spark_type(p) for p in data_parameters ) ) data_names = [ p.name if isclass(p) and issubclass(p, NameTypeHolder) else None for p in data_parameters ] data_fields = [] for i, (data_name, data_dtype, data_spark_type) in enumerate( zip(data_names, data_dtypes, data_spark_types) ): data_fields.append( InternalField( dtype=data_dtype, struct_field=types.StructField( name=name_like_string(data_name) if data_name is not None else ("c%s" % i), dataType=data_spark_type, ), ) ) return DataFrameType(index_field=index_field, data_fields=data_fields) tpes = pandas_on_spark_type(tpe) if tpes is None: return UnknownType(tpe) else: return ScalarType(*tpes)
def main(): LOG.info('Begin execution') spark = SparkSession.builder.appName('FlightDelaysETL').getOrCreate() datahub_airports_schema = T.StructType([ T.StructField("continent", T.StringType(), True), T.StructField("coordinates", T.StringType(), True), T.StructField("elevation_ft", T.FloatType(), True), T.StructField("gps_code", T.StringType(), True), T.StructField("iata_code", T.StringType(), True), T.StructField("ident", T.StringType(), True), T.StructField("iso_country", T.StringType(), True), T.StructField("iso_region", T.StringType(), True), T.StructField("local_code", T.StringType(), True), T.StructField("municipality", T.StringType(), True), T.StructField("name", T.StringType(), True), T.StructField("type", T.StringType(), True) ]) datahub_airports = ( spark.read.format('json').schema(datahub_airports_schema).load( 's3://{}/dend/airport-codes.json'.format(S3_BUCKET))) LOG.info('# datahub airport entries: %d', datahub_airports.count()) scsg_schema = T.StructType([ T.StructField('iata', T.StringType(), True), T.StructField('airport', T.StringType(), True), T.StructField('city', T.StringType(), True), T.StructField('state', T.StringType(), True), T.StructField('country', T.StringType(), True), T.StructField('lat', T.FloatType(), True), T.StructField('long', T.FloatType(), True) ]) scsg_airports = (spark.read.format('csv').schema(scsg_schema).option( 'header', True).load('s3://{}/dend/scsg-airports.csv'.format(S3_BUCKET))) LOG.info('# scsg airport entries: %d', scsg_airports.count()) scsg_airports.limit(5).show() # join the two airport-related tables on the IATA code airports = (scsg_airports.join( datahub_airports, scsg_airports.iata == datahub_airports.iata_code, 'right').drop('airport', 'iata_code', 'gps_code')) (airports.write.mode('overwrite').parquet( 's3://{}/dend/pq_mart/airports' (S3_BUCKET))) LOG.info('# merged airport entries: %d', airports.count()) delays_schema = T.StructType([ T.StructField("Year", T.IntegerType(), True), T.StructField("Month", T.IntegerType(), True), T.StructField("DayofMonth", T.IntegerType(), True), T.StructField("DayOfWeek", T.IntegerType(), True), T.StructField("DepTime", T.IntegerType(), True), T.StructField("CRSDepTime", T.IntegerType(), True), T.StructField("ArrTime", T.IntegerType(), True), T.StructField("CRSArrTime", T.IntegerType(), True), T.StructField("UniqueCarrier", T.StringType(), True), T.StructField("FlightNum", T.StringType(), True), T.StructField("TailNum", T.StringType(), True), T.StructField("ActualElapsedTime", T.StringType(), True), T.StructField("CRSElapsedTime", T.StringType(), True), T.StructField("AirTime", T.IntegerType(), True), T.StructField("ArrDelay", T.IntegerType(), True), T.StructField("DepDelay", T.IntegerType(), True), T.StructField("Origin", T.StringType(), True), T.StructField("Dest", T.StringType(), True), T.StructField("Distance", T.IntegerType(), True), T.StructField("TaxiIn", T.IntegerType(), True), T.StructField("TaxiOut", T.IntegerType(), True), T.StructField("Cancelled", T.IntegerType(), True), T.StructField("CancellationCode", T.IntegerType(), True), T.StructField("Diverted", T.IntegerType(), True), T.StructField("CarrierDelay", T.IntegerType(), True), T.StructField("WeatherDelay", T.IntegerType(), True), T.StructField("NASDelay", T.IntegerType(), True), T.StructField("SecurityDelay", T.IntegerType(), True), T.StructField("LateAircraftDelay", T.IntegerType(), True) ]) delays = (spark.read.format('csv').schema(delays_schema).option( 'header', True).option('nullValue', 'NA').load( 
's3://{}/dend/flights/1988*'.format(S3_BUCKET)).withColumnRenamed( 'DayofMonth', 'DayOfMonth')) LOG.info('# flight delay entries: %d', delays.count()) # - delays: calculate total delay # - join two airport-related tables on the iata code delay_columns = [ 'ArrDelay', 'DepDelay', 'TaxiIn', 'TaxiOut', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay' ] delays = (delays.fillna(0, subset=delay_columns).withColumn( 'TotalDelay', sum([F.col(col) for col in delay_columns]))) # join in the `iso_region` field from the dimension table, so that the downstream business users can # derive results on how the flight delays relate to the flight region delays = (delays.join( airports.select('iata', 'iso_region'), delays.Origin == airports.iata, 'left').withColumnRenamed( 'iso_region', 'OriginRegion').drop('iata').join( airports.select('iata', 'iso_region'), delays.Dest == airports.iata, 'left').withColumnRenamed('iso_region', 'DestRegion').drop('iata'))
assert (len(one_hot_row) == len(enumerated_labels)) return one_hot_row # Write the one-hot-encoded questions to S3 as a parquet file one_hot_questions = questions_tags.rdd.map(lambda x: Row( _Body=x._Body, _Tags=one_hot_encode(x._Tags, enumerated_labels))) if DEBUG is True: print(one_hot_questions.take(10)) # Verify we have multiple labels present print( one_hot_questions.sortBy(lambda x: sum(x._Tags), ascending=False).take(10)) # Create a DataFrame for persisting as Parquet format schema = T.StructType([ T.StructField("_Body", T.ArrayType(T.StringType())), T.StructField("_Tags", T.ArrayType(T.IntegerType())) ]) one_hot_df = spark.createDataFrame(one_hot_questions, schema) one_hot_df.show() one_hot_df.write.mode('overwrite').parquet( PATHS['one_hot'][PATH_SET].format(tag_limit)) one_hot_df = spark.read.parquet( PATHS['one_hot'][PATH_SET].format(tag_limit)) def create_schema(one_row): schema_list = [ T.StructField("_Body", T.ArrayType(T.StringType())), ] for i, val in list(enumerate(one_row._Tags)):
import pyspark.sql.functions as F
from pyspark.ml.feature import Bucketizer
from pyspark.sql import types
from pyspark.sql import Window

import model_utils as mu

DAYS_FROM_EULA_BINS = \
    [float('-Inf'), 6, 12, 21, 30, 45, 60, 80, 110, 140, 180, 250, 350, float('Inf')]
INT_TO_CHAR_BASELINE = 97

convert_to_char = F.udf(lambda x: chr(x), types.StringType())

TRIAL_SUCCESS_PAIR = types.StructType([
    types.StructField('trial', types.FloatType(), False),
    types.StructField('success', types.FloatType(), False)
])
pair_trial_success = F.udf(lambda t, s: (t, s), TRIAL_SUCCESS_PAIR)

calc_prob = F.udf(
    lambda trial, success, alpha, beta: (success + alpha) / (trial + alpha + beta),
    types.FloatType())


def get_table(sqlContext):
    return sqlContext.table('l2_sprint.mixpanel_home')


def load_received_notifications(events, start_date, end_date):
    # datetime objects are serializable only from spark 2.2.1
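# A hedged usage sketch for the UDFs above (the DataFrame `df` with `trial` and
# `success` columns, and the alpha/beta priors, are assumptions): pair the two
# counts into a struct column and compute a smoothed success probability.
def add_success_prob(df, alpha=1.0, beta=1.0):
    return (df
            .withColumn('pair', pair_trial_success(F.col('trial'), F.col('success')))
            .withColumn('prob', calc_prob(F.col('trial'), F.col('success'),
                                          F.lit(alpha), F.lit(beta))))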
def get_dataset(dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=True): """Utility to create datasets for json-formatted tests. """ df = pd.DataFrame(data) if dataset_type == "PandasDataset": if schemas and "pandas" in schemas: schema = schemas["pandas"] pandas_schema = {} for (key, value) in schema.items(): # Note, these are just names used in our internal schemas to build datasets *for internal tests* # Further, some changes in pandas internal about how datetimes are created means to support pandas # pre- 0.25, we need to explicitly specify when we want timezone. # We will use timestamp for timezone-aware (UTC only) dates in our tests if value.lower() in ["timestamp", "datetime64[ns, tz]"]: df[key] = pd.to_datetime(df[key], utc=True) continue elif value.lower() in [ "datetime", "datetime64", "datetime64[ns]" ]: df[key] = pd.to_datetime(df[key]) continue try: type_ = np.dtype(value) except TypeError: type_ = getattr(pd.core.dtypes.dtypes, value) # If this raises AttributeError it's okay: it means someone built a bad test pandas_schema[key] = type_ # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()} df = df.astype(pandas_schema) return PandasDataset(df, profiler=profiler, caching=caching) elif dataset_type == "sqlite": from sqlalchemy import create_engine engine = create_engine("sqlite://") conn = engine.connect() # Add the data to the database as a new table sql_dtypes = {} if (schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect)): schema = schemas["sqlite"] sql_dtypes = { col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: df[col] = pd.to_numeric(df[col]) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) tablename = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for n in range(8) ]) df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "postgresql": from sqlalchemy import create_engine # Create a new database engine = create_engine("postgresql://postgres@localhost/test_ci") conn = engine.connect() sql_dtypes = {} if (schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect)): schema = schemas["postgresql"] sql_dtypes = { col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: df[col] = pd.to_numeric(df[col]) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) tablename = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for n in range(8) ]) df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "mysql": from sqlalchemy import create_engine engine = create_engine("mysql://root@localhost/test_ci") conn = engine.connect() sql_dtypes = {} if (schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect)): schema = 
schemas["mysql"] sql_dtypes = { col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: df[col] = pd.to_numeric(df[col]) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) tablename = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for n in range(8) ]) df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "SparkDFDataset": from pyspark.sql import SparkSession import pyspark.sql.types as sparktypes SPARK_TYPES = { "StringType": sparktypes.StringType, "IntegerType": sparktypes.IntegerType, "LongType": sparktypes.LongType, "DateType": sparktypes.DateType, "TimestampType": sparktypes.TimestampType, "FloatType": sparktypes.FloatType, "DoubleType": sparktypes.DoubleType, "BooleanType": sparktypes.BooleanType, "DataType": sparktypes.DataType, "NullType": sparktypes.NullType, } spark = SparkSession.builder.getOrCreate() # We need to allow null values in some column types that do not support them natively, so we skip # use of df in this case. data_reshaped = list( zip(*[v for _, v in data.items()])) # create a list of rows if schemas and "spark" in schemas: schema = schemas["spark"] # sometimes first method causes Spark to throw a TypeError try: spark_schema = sparktypes.StructType([ sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True) for column in schema ]) # We create these every time, which is painful for testing # However nuance around null treatment as well as the desire # for real datetime support in tests makes this necessary data = copy.deepcopy(data) if "ts" in data: print(data) print(schema) for col in schema: type_ = schema[col] if type_ in ["IntegerType", "LongType"]: # Ints cannot be None...but None can be valid in Spark (as Null) vals = [] for val in data[col]: if val is None: vals.append(val) else: vals.append(int(val)) data[col] = vals elif type_ in ["FloatType", "DoubleType"]: vals = [] for val in data[col]: if val is None: vals.append(val) else: vals.append(float(val)) data[col] = vals elif type_ in ["DateType", "TimestampType"]: vals = [] for val in data[col]: if val is None: vals.append(val) else: vals.append(parse(val)) data[col] = vals # Do this again, now that we have done type conversion using the provided schema data_reshaped = list( zip(*[v for _, v in data.items()])) # create a list of rows spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema) except TypeError: string_schema = sparktypes.StructType([ sparktypes.StructField(column, sparktypes.StringType()) for column in schema ]) spark_df = spark.createDataFrame(data_reshaped, string_schema) for c in spark_df.columns: spark_df = spark_df.withColumn( c, spark_df[c].cast(SPARK_TYPES[schema[c]]())) elif len(data_reshaped) == 0: # if we have an empty dataset and no schema, need to assign an arbitrary type columns = list(data.keys()) spark_schema = sparktypes.StructType([ sparktypes.StructField(column, sparktypes.StringType()) for column in columns ]) spark_df = spark.createDataFrame(data_reshaped, spark_schema) else: # if no schema provided, uses Spark's schema inference columns = list(data.keys()) spark_df = spark.createDataFrame(data_reshaped, columns) return SparkDFDataset(spark_df, 
profiler=profiler, caching=caching) else: raise ValueError("Unknown dataset_type " + str(dataset_type))
import pyspark.sql.types as t

expected_schema = t.StructType([
    t.StructField("FIELD1", t.IntegerType()),
    t.StructField("FIELD2", t.DateType()),
    t.StructField(
        "STRUCT1",
        t.StructType([
            t.StructField("NESTED_FIELD1", t.StringType()),
            t.StructField(
                "STRUCT2",
                t.StructType([
                    t.StructField("NESTED_FIELD2", t.StringType()),
                ]),
            ),
        ]),
    ),
])
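# A hedged example of how such an expected schema is typically used in a test:
# compare it against a schema produced elsewhere (build_schema is hypothetical).
def test_schema_matches():
    actual_schema = build_schema()  # hypothetical function under test
    assert actual_schema == expected_schema
    assert [f.name for f in actual_schema.fields] == \
        [f.name for f in expected_schema.fields]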