def map_parking_tickets_to_centerline_locations_by_hour(
        sc,
        sqlContext,
        ticket_file_path=None,
        centerline_file_path=None,
        output_directory=None):
    default_ticket_path = "hdfs:///projects/group12/ParkingData/Parking_Violations_Issued_-_Fiscal_Year_2014__August_2013___June_2014_.csv"
    if ticket_file_path is None:
        ticket_file_path = default_ticket_path

    default_centerline_path = "hdfs:///projects/group12/StreetData/Centerline.csv"
    if centerline_file_path is None:
        centerline_file_path = default_centerline_path

    default_output_path = "hdfs:///projects/group12/TicketsByHour"
    if output_directory is None:
        output_directory = default_output_path

    # Load in Datasets
    centerline_df = load_centerline_dataset(sc, sqlContext,
                                            centerline_file_path)
    tickets_df = load_parking_dataset(sc, sqlContext, ticket_file_path)

    # Derive two new columns, Hour and AMPM, from ViolationTime, then keep only valid hours and AM/PM flags
    tickets_df = tickets_df.withColumn(
        "AMPM", regexp_replace('ViolationTime', '(\\d+)', ''))
    tickets_df = tickets_df.withColumn(
        "Time", regexp_replace('ViolationTime', '([a-zA-Z]+)', ''))
    tickets_df = tickets_df.withColumn('Hour', tickets_df.Time / 100)
    tickets_df = tickets_df.withColumn('Hour', tickets_df.Hour.cast('int'))
    tickets_df = tickets_df.where(tickets_df.AMPM.isNotNull()
                                  & tickets_df.Hour.isNotNull())
    tickets_df = tickets_df[(tickets_df['Hour'] >= 1)
                            & (tickets_df['Hour'] <= 12)]
    tickets_df = tickets_df[(tickets_df['AMPM'] == 'A') |
                            (tickets_df['AMPM'] == 'P')]

    # Cast the house number boundary columns to be integers
    centerline_df = centerline_df.withColumn(
        "L_LOW_HN", centerline_df["L_LOW_HN"].cast(IntegerType()))
    centerline_df = centerline_df.withColumn(
        "L_HIGH_HN", centerline_df["L_HIGH_HN"].cast(IntegerType()))
    centerline_df = centerline_df.withColumn(
        "R_LOW_HN", centerline_df["R_LOW_HN"].cast(IntegerType()))
    centerline_df = centerline_df.withColumn(
        "R_HIGH_HN", centerline_df["R_HIGH_HN"].cast(IntegerType()))

    # Join by street such that the parking ticket was written within the street segment's house-number range.
    joined_df = tickets_df.join(
        centerline_df, (centerline_df.ST_LABEL == tickets_df['StreetName'])
        & (((tickets_df['HouseNumber'] >= centerline_df.R_LOW_HN) &
            (tickets_df['HouseNumber'] <= centerline_df.R_HIGH_HN))
           | ((tickets_df['HouseNumber'] >= centerline_df.L_LOW_HN) &
              (tickets_df['HouseNumber'] <= centerline_df.L_HIGH_HN))),
        "inner")

    # Group by street segment as well as the hour of the day and AM/PM;
    # this should yield roughly 24 times as many rows as mapping tickets to street segments alone.
    grouped = joined_df.groupBy("PHYSICALID", "L_LOW_HN", "L_HIGH_HN",
                                "R_LOW_HN", "R_HIGH_HN", "ST_LABEL",
                                "the_geom", "Hour", "AMPM").count()
    # Select only the relevant columns:
    # the geometry of the road segment, the ticket count, the hour of the day, and whether it is AM or PM.
    grouped = grouped.select("the_geom", "count", "Hour", "AMPM")
    # Write to .csv file.
    grouped.write.csv(output_directory)
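A minimal usage sketch for the function above, assuming the load_centerline_dataset and load_parking_dataset helpers referenced in it are importable from the same module and that the default HDFS paths are acceptable:

# Usage sketch: the loader helpers and the regexp_replace/IntegerType imports
# are assumed to live in the same module as the function above.
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="TicketsByHour")
sqlContext = SQLContext(sc)

# Uses the default HDFS paths baked into the function; pass explicit
# ticket_file_path / centerline_file_path / output_directory to override.
map_parking_tickets_to_centerline_locations_by_hour(sc, sqlContext)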
Example #2
 def test_udf2(self):
     with self.tempView("test"):
         self.spark.catalog.registerFunction("strlen", lambda string: len(string), IntegerType())
         self.spark.createDataFrame(self.sc.parallelize([Row(a="test")]))\
             .createOrReplaceTempView("test")
         [res] = self.spark.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect()
         self.assertEqual(4, res[0])
Example #3
def mock_input_prep(input_data):
    """
    Apply a set of transformations to the input Spark DataFrame.
    Create a dataset from the result of exercise 2 which can be used
    to answer exercise 3.

    Transformations:

        CREATING a UDF that creates an array of ints
            based on the data from another column

        CREATING a column N on which the UDF n_to_array is applied

        EXPLODING the data in the N column and storing it in the MOCK column
            (the rows are duplicated)

        MULTIPLYING the number of clients (TOTAL_CLIENTS)
            by an ascending integer value

        DROPPING the MOCK and N columns


    Parameters:
    input_data : spark.DataFrame

    root
    |-- NMPTF: string (nullable = true)
    |-- TOTAL_CLIENTS: long (nullable = true)
    |-- TOTAL_CONTRATS: long (nullable = true)
    |-- ECH_1: long (nullable = true)
    |-- ECH_2: long (nullable = true)
    |-- ECH_3: long (nullable = true)
    |-- ECH_4: long (nullable = true)
    |-- ECH_6: long (nullable = true)
    |-- ECH_8: long (nullable = true)
    |-- ECH_9: long (nullable = true)
    |-- ECH_10: long (nullable = true)
    |-- ECH_11: long (nullable = true)
    |-- ECH_12: long (nullable = true)

    return :
    mock_input_data_final : spark.DataFrame

    root
    |-- NMPTF: string (nullable = true)
    |-- TOTAL_CLIENTS: long (nullable = true)
    |-- TOTAL_CONTRATS: long (nullable = true)
    |-- ECH_1: long (nullable = true)
    |-- ECH_2: long (nullable = true)
    |-- ECH_3: long (nullable = true)
    |-- ECH_4: long (nullable = true)
    |-- ECH_6: long (nullable = true)
    |-- ECH_8: long (nullable = true)
    |-- ECH_9: long (nullable = true)
    |-- ECH_10: long (nullable = true)
    |-- ECH_11: long (nullable = true)
    |-- ECH_12: long (nullable = true)


    """

    n_to_array = udf(lambda n: [n] * n, ArrayType(IntegerType()))

    mock_input_data = (input_data.withColumn(
        'N', n_to_array(input_data.TOTAL_CLIENTS)))

    mock_input_data2 = (mock_input_data.withColumn("MOCK",
                                                   explode(mock_input_data.N)))

    mock_input_data3 = (mock_input_data2.withColumn(
        "TOTAL_CLIENTS", (mock_input_data2["TOTAL_CLIENTS"] *
                          monotonically_increasing_id() * 3).cast("int")))

    mock_input_data_final = mock_input_data3.drop("MOCK").drop("N")

    return mock_input_data_final
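A minimal round-trip sketch, assuming a local SparkSession and a toy two-column DataFrame (only NMPTF and TOTAL_CLIENTS are actually touched by the function; the other ECH_* columns simply pass through):

# Round-trip sketch; the function's own module is assumed to import udf,
# explode, monotonically_increasing_id, ArrayType and IntegerType.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("mock_input_prep_demo").getOrCreate()

demo_df = spark.createDataFrame(
    [("portfolio_a", 2), ("portfolio_b", 3)],
    ["NMPTF", "TOTAL_CLIENTS"])

# Each row is duplicated TOTAL_CLIENTS times, then TOTAL_CLIENTS is rescaled
# by monotonically_increasing_id() * 3, so the exact values are non-deterministic.
mock_input_prep(demo_df).show()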
Example #4
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 12:47:58 2020

@author: srishti
"""

from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = SparkSession.builder.appName("TotalSpentByCustomer").master(
    "local[*]").getOrCreate()

# Create schema when reading customer-orders
customerOrderSchema = StructType([ \
                                  StructField("cust_id", IntegerType(), True),
                                  StructField("item_id", IntegerType(), True),
                                  StructField("amount_spent", FloatType(), True)
                                  ])

# Load up the data into spark dataset
customersDF = spark.read.schema(customerOrderSchema).csv(
    "../data/customer-orders.csv")

customersDF.groupBy("cust_id").sum("amount_spent").sort(
    "sum(amount_spent)").show(customersDF.count())
Example #5
 def test_single_udf_with_repeated_argument(self):
     # regression test for SPARK-20685
     self.spark.catalog.registerFunction("add", lambda x, y: x + y, IntegerType())
     row = self.spark.sql("SELECT add(1, 1)").first()
     self.assertEqual(tuple(row), (2, ))
Example #6
def cluster_gps(ds: DataStream,
                epsilon_constant: int = 1000,
                km_per_radian: float = 6371.0088,
                geo_fence_distance: int = 30,
                minimum_points_in_cluster: int = 1,
                latitude_column_name: str = 'latitude',
                longitude_column_name: str = 'longitude'):
    """
    Cluster GPS data - Algorithm used to cluster GPS data is based on DBScan

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        epsilon_constant (int): scale factor (metres per kilometre) used when deriving the DBSCAN epsilon
        km_per_radian (float): Earth's radius in kilometres, used to convert the geo-fence distance to radians
        geo_fence_distance (int): geo-fence radius in metres; points within it are clustered together
        minimum_points_in_cluster (int): minimum number of samples DBSCAN needs to form a cluster
        latitude_column_name (str): name of the latitude column in the input data
        longitude_column_name (str): name of the longitude column in the input data

    Returns:
        DataStream object
    """
    centroid_id_name = 'centroid_id'
    features_list = [
        StructField('centroid_longitude', DoubleType()),
        StructField('centroid_latitude', DoubleType()),
        StructField('centroid_id', IntegerType()),
        StructField('centroid_area', DoubleType())
    ]

    schema = StructType(ds._data._df.schema.fields + features_list)
    column_names = [a.name for a in schema.fields]

    def reproject(latitude, longitude):
        from math import pi, cos, radians
        earth_radius = 6371009  # in meters
        lat_dist = pi * earth_radius / 180.0

        y = [lat * lat_dist for lat in latitude]
        x = [
            long * lat_dist * cos(radians(lat))
            for lat, long in zip(latitude, longitude)
        ]
        return np.column_stack((x, y))

    def get_centermost_point(cluster: np.ndarray) -> object:
        """
        Get center most point of a cluster

        Args:
            cluster (np.ndarray):

        Returns:

        """
        try:
            if cluster.shape[0] >= 3:
                points_project = reproject(cluster[:, 0], cluster[:, 1])
                hull = ConvexHull(points_project)
                area = hull.area
            else:
                area = 1
        except:
            area = 1
        centroid = (MultiPoint(cluster).centroid.x,
                    MultiPoint(cluster).centroid.y)
        centermost_point = min(
            cluster, key=lambda point: great_circle(point, centroid).m)
        return list(centermost_point) + [area]

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('gps--org.md2k.phonesensor--phone', 'gps_clustering',
                 'gps--org.md2k.clusters', ['user', 'timestamp'],
                 ['user', 'timestamp'])
    def gps_clustering(data):
        if data.shape[0] < minimum_points_in_cluster:
            return pd.DataFrame([], columns=column_names)
        elif data.shape[0] < 2:
            data['centroid_area'] = 1
            data['centroid_id'] = 0
            data['centroid_latitude'] = data[latitude_column_name].values[0]
            data['centroid_longitude'] = data[longitude_column_name].values[0]
            return data

        coords = np.float64(data[[latitude_column_name,
                                  longitude_column_name]].values)

        epsilon = geo_fence_distance / (epsilon_constant * km_per_radian)

        db = DBSCAN(eps=epsilon,
                    min_samples=minimum_points_in_cluster,
                    algorithm='ball_tree',
                    metric='haversine').fit(np.radians(coords))

        data[centroid_id_name] = db.labels_
        cluster_labels = db.labels_
        clusters = pd.Series(
            [coords[cluster_labels == n] for n in np.unique(cluster_labels)])

        cluster_names = np.array([n for n in np.unique(cluster_labels)])
        centermost_points = clusters.map(get_centermost_point)
        centermost_points = np.array(centermost_points)

        all_dict = []
        for i, col in enumerate(cluster_names):
            cols = np.array(centermost_points[i])
            all_dict.append([col, cols[0], cols[1], cols[2]])

        temp_df = pd.DataFrame(all_dict,
                               columns=[
                                   centroid_id_name, 'centroid_latitude',
                                   'centroid_longitude', 'centroid_area'
                               ])
        data = pd.merge(data,
                        temp_df,
                        how='left',
                        left_on=[centroid_id_name],
                        right_on=[centroid_id_name])
        return data

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm"
        )

    data = ds._data.apply(gps_clustering)
    results = DataStream(data=data, metadata=Metadata())
    metadata = update_metadata(
        stream_metadata=results.metadata,
        stream_name="gps--org.md2k.clusters",
        stream_desc="GPS clusters computed using DBSCAN algorithm.",
        module_name="cerebralcortex.algorithms.gps.clustering.cluster_gps",
        module_version="1.0.0",
        authors=[{
            "Azim": "*****@*****.**"
        }])
    results.metadata = metadata
    return results
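The core of the gps_clustering UDF above is scikit-learn's DBSCAN over haversine distances; a stripped-down, non-Spark sketch of that step, with made-up coordinates, looks like this:

# Standalone sketch of the clustering step used inside gps_clustering
import numpy as np
from sklearn.cluster import DBSCAN

coords = np.array([[37.7749, -122.4194],   # made-up lat/lon pairs
                   [37.7750, -122.4195],
                   [37.8044, -122.2712]])

geo_fence_distance = 30          # metres
epsilon_constant = 1000          # metres per kilometre
km_per_radian = 6371.0088        # Earth's radius in km
epsilon = geo_fence_distance / (epsilon_constant * km_per_radian)

db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree',
            metric='haversine').fit(np.radians(coords))
print(db.labels_)                # one cluster id per GPS point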
Example #7
# Salted-join example: add a random salt key to the small DF and explode the
# big DF over every possible salt value, then join on the concatenated keys.
# sc and sqlContext are assumed to be provided by the PySpark shell.
from pyspark.sql import Row, functions
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

small = [1, 2, 3, 4]
big = [1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3]

rdd1 = sc.parallelize(small)
row_rdd = rdd1.map(lambda x: Row(x))
Small = sqlContext.createDataFrame(row_rdd, ['numbers_1'])

rdd12 = sc.parallelize(big)
row_rdd2 = rdd12.map(lambda x: Row(x))
Big = sqlContext.createDataFrame(row_rdd2, ['numbers_2'])

Small = Small.withColumn('rand_col',
                         (functions.rand() * 5).cast(IntegerType()))
Small = Small.withColumn('KeyS',
                         functions.concat(col("numbers_1"), col("rand_col")))

Big = Big.withColumn(
    'expld',
    functions.explode(functions.array(lit(0), lit(1), lit(2), lit(3), lit(4))))
Big = Big.withColumn('KeyB', functions.concat(col("numbers_2"), col("expld")))

Big.join(Small, Big.KeyB == Small.KeyS, how='left').show()
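As a sanity check on the salted join above, the distinct matches it produces should agree with a plain inner join on the raw numbers (a sketch reusing the Small and Big DataFrames defined above):

# Sanity check: distinct salted-join matches == distinct plain-join matches
salted_matches = Big.join(Small, Big.KeyB == Small.KeyS) \
                    .select('numbers_2', 'numbers_1').distinct()
plain_matches = Big.join(Small, Big.numbers_2 == Small.numbers_1) \
                   .select('numbers_2', 'numbers_1').distinct()
print(salted_matches.subtract(plain_matches).count(),
      plain_matches.subtract(salted_matches).count())  # expected: 0 0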
Example #8
    parts = lines.map(lambda l: l.split(",", 1))
    parts = parts.map(lambda l: [l[0], l[1].split(",")])
    plantsRDD = parts.map(lambda p: Row(plant=p[0], items=p[1]))

    plantsRDD_result = spark.createDataFrame(plantsRDD)

    plants_withID = plantsRDD_result.orderBy('plant').withColumn(
        "id", monotonically_increasing_id())
    plants_withID.createOrReplaceTempView("plant_states")

    getFrequentItems = plants_withID.select("id", "items")
    fpGrowth = FPGrowth(itemsCol="items",
                        minSupport=float(sys.argv[3]),
                        minConfidence=float(sys.argv[4]))
    model = fpGrowth.fit(getFrequentItems)

    def get_antecedent_length(antecedent):
        return len(antecedent)

    antecedent_length_func = udf(get_antecedent_length, IntegerType())

    freq_item_table = model.associationRules.select(
        "antecedent", "consequent", "confidence",
        antecedent_length_func("antecedent").alias("antecedent_length"))
    #.orderBy("items", "freq").show(int(sys.argv[2]))
    freq_item_table.createOrReplaceTempView("fre_antecedent_result")

    result_rows = sys.argv[2]
    spark.sql(
        "SELECT antecedent, consequent, confidence FROM fre_antecedent_result ORDER BY antecedent_length desc, confidence desc"
    ).show(int(result_rows))
Exemple #9
0
 def _helper(df, feature_name, feature_value):
     ohe_feature = feature_name + '_' + str(feature_value)
     df = df.withColumn(ohe_feature, udf(lambda x: 1 if x ==
                                         feature_value else 0, IntegerType())(df[feature_name]))
     return df
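A hypothetical call (assuming _helper is reachable as a plain function), using an illustrative colour column to show the 0/1 flag it appends:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("red",), ("blue",), ("red",)], ["colour"])

# Adds a colour_red column that is 1 where colour == 'red' and 0 otherwise
_helper(demo, "colour", "red").show()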
Example #10
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.functions import to_date, udf
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, ArrayType

conf = SparkConf().setAppName("Max Temperature").setMaster("local[3]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

my_schema = StructType([
    StructField("id", IntegerType()),
    StructField("type", StringType())
])


# def my_fun(x, y):
#     if y == "tri":
#         return x + 1
#     else:
#         return x
#
#
# def my_fun2(lines):
#     x = 0
#     fields = lines.split(" ")
#     for field in fields:
#         x = x + field
#     return x
#
#
# df1 = spark.read.schema(my_schema).csv(r"D:\pythonProject\tammingBigDataSparkPython\bigdatausergroup")
# # df1.show()
Example #11
def change_to_int(data,col):
    for conv_col in col:
        data = data.withColumn(conv_col, data[conv_col].cast(IntegerType()))
    return(data)
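An assumed usage example for change_to_int, casting two illustrative string columns in one call:

# Illustrative call; 'age' and 'zip' are hypothetical string columns
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("34", "48104"), ("27", "10001")], ["age", "zip"])

df = change_to_int(df, ["age", "zip"])
df.printSchema()   # age and zip are now int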
        (("Satyajeet", "", ""), "2000-05-19", "M", 4000),
        (("Rajeev", "Kumar", "Jha"), "1978-09-05", "M", 4000),
        (("Anshika", "", "Srivastava"), "2000-12-01", "F", 4000),
        (("Yogita", "", "Bhardwaj"), "1990-02-17", "F", -1)]

schema = StructType([
    StructField(
        "name",
        StructType([
            StructField("firstname", StringType(), True),
            StructField("middlename", StringType(), True),
            StructField("lastname", StringType(), True)
        ]), True),
    StructField("dob", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True)
])

spark = (SparkSession.builder.appName("using_withcolumnrenamed").getOrCreate())

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
df.printSchema()

df.withColumnRenamed("dob", "dateofbirth").printSchema()

df2 = (df.withColumnRenamed("dob", "dateofbirth").withColumnRenamed(
    "salary", "sal_amount"))

df2.printSchema()
Example #13
# Carolyn Mason
# 12/12/18
# Big Data Analytics CSCIE-63

# Join the data sets
from pyspark.sql.types import *
from pyspark.sql.functions import expr, desc, col
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType

# Custom schemas
#cylinders,displ,drive,fueltype,make,model,ucity,uhighway,transmission,vclass,year,rn
fields = [
    StructField("cylinders", FloatType(), True),
    StructField("displ", FloatType(), True),
    StructField("drive", StringType(), True),
    StructField("fueltype", StringType(), True),
    StructField("make", StringType(), True),
    StructField("model", StringType(), True),
    StructField("ucity", FloatType(), True),
    StructField("uhighway", FloatType(), True),
    StructField("transmission", StringType(), True),
    StructField("vclass", StringType(), True),
    StructField("year", IntegerType(), True),
    StructField("rn", IntegerType(), True)
]
fuelSchema = StructType(fields)
#title,url,price,address,vin,odometer,condition,cylinders,drive,fuel,paint_color,size,title_status,transmission,type,year,make,model,description
fields2 = [
    StructField("title", StringType(), True),
    StructField("url", StringType(), True),
    StructField("price", FloatType(), True),
    StructField("address", StringType(), True),
    StructField("vin", StringType(), True),
    StructField("odometer", FloatType(), True),
    StructField("condition", StringType(), True),
    StructField("cylinders", FloatType(), True),
    StructField("drive", StringType(), True),
    StructField("fuel", StringType(), True),
    StructField("paint_color", StringType(), True),
    StructField("size", StringType(), True),
    StructField("title_status", StringType(), True),
    StructField("transmission", StringType(), True),
    StructField("type", StringType(), True),
    StructField("year", IntegerType(), True),
    StructField("make", StringType(), True),
    StructField("model", StringType(), True),
    StructField("description", StringType(), True)
]
dataSchema = StructType(fields2)

# Loads csv's to data frames
#df_fuel = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("fuel_simple.csv")
#df_craigslist = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("vans.csv")
df_fuel = spark.read.format("csv").option("header", "true").schema(fuelSchema).load("fuel_simple.csv")
df_craigslist = spark.read.format("csv").option("header", "true").schema(dataSchema).load("vans.csv")


# Create tables to query using SQL
df_fuel.createOrReplaceTempView("fuel")
df_craigslist.createOrReplaceTempView("data")

# Joins
#cylinders,displ,drive,fueltype,make,model,ucity,uhighway,transmission,vclass,year,rn
#title,url,price,address,vin,odometer,condition,cylinders,drive,fuel,paint_color,size,title_status,transmission,type,year,make,model,description
Exemple #14
0
# find spark path
import findspark
findspark.init()

# import necessary packages&methods
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test").getOrCreate()

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# read files
path = "D:\\CMU\\Tasks\\Task15_Big_Data_Analysis\\home_depot_data\\"
# load train data with specified StructType
schema_train = StructType([
    StructField('id', IntegerType(), True),
    StructField('product_uid1', IntegerType(), True),
    StructField('product_title', StringType(), True),
    StructField('search_term', StringType(), True),
    StructField('relevance', FloatType(), True)
])
traindata = spark.read.csv(path + "train.csv", header=True, mode="DROPMALFORMED", schema=schema_train)
traindata = traindata.select("product_uid1", "product_title", "search_term", "relevance")

# load test data 
schema_test = StructType([
    StructField('id', IntegerType(), True),
    StructField('product_uid1', IntegerType(), True),
    StructField('product_title', StringType(), True),
    StructField('search_term', StringType(), True)
])
testdata = spark.read.csv(path + "test.csv", header=True, mode="DROPMALFORMED", schema=schema_test)
testdata = testdata.select("id", "product_uid1", "product_title", "search_term")

# load product description data
schema_desc = StructType([StructField('product_uid2', IntegerType(), True), StructField('product_description', StringType(), True)])
descrdata = spark.read.csv(path + "product_descriptions.csv", header=True, schema=schema_desc).orderBy("product_uid2")

traindata.show(10)
testdata.show(10)
descrdata.show(10)

from pyspark.sql.functions import regexp_replace, col, when
Example #15
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
'''
List the names of all superheroes with only ONE connection.
Compute the actual smallest number of connections in the data set instead of assuming it is one.
'''
spark = SparkSession.builder.appName("MostObscureSuperheroes").getOrCreate()

schema = StructType([ \
                     StructField("id", IntegerType(), True), \
                     StructField("name", StringType(), True)])

names = spark.read.schema(schema).option(
    "sep", " ").csv("file:///SparkCourse/Marvel-names.txt")

lines = spark.read.text("file:///SparkCourse/Marvel-graph.txt")

connections = lines.withColumn("id", func.split(func.col("value"), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.col("value"), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))

minConnectionCount = connections.agg(func.min("connections")).first()[0]

minConnections = connections.filter(
    func.col("connections") == minConnectionCount)

minConnectionsWithNames = minConnections.join(names, "id")

print("The following characters have only " + str(minConnectionCount) +
filename_string = ",".join(filenames)

# Ingest raw and parse
raw = spark.sparkContext.textFile(filename_string)

## TODO: read schema from JSON
parsed = raw.map(lambda line: parse_line(line))

# Establish common event schema
schema = StructType([ \
    StructField('trade_dt', DateType(), True), \
    StructField('rec_type', StringType(), True), \
    StructField('symbol', StringType(), True), \
    StructField('exchange', StringType(), True), \
    StructField('event_tm', TimestampType(), True), \
    StructField('event_seq_nb', IntegerType(), True), \
    StructField('arrival_tm', TimestampType(), True), \
    StructField('trade_pr', DecimalType(17,14), True), \
    StructField('bid_pr', DecimalType(17,14), True), \
    StructField('bid_size', IntegerType(), True), \
    StructField('ask_pr', DecimalType(17,14), True), \
    StructField('ask_size', IntegerType(), True), \
    StructField('partition', StringType(), True) \
])

# Create dataframe with parsed data and schema
df = spark.createDataFrame(parsed, schema)

df.show(10)

df.write.partitionBy("partition").mode("overwrite").parquet("ingest-data")
Example #17
def process_song_data(spark, input_data, output_data, input_song_pattern):
    """
    Extract data for song and artist dimensions from source song json files then insert into parquet files
    Parameters:
      spark - Spark session
      input_data - filepath to source json files
      output_data - filepath to target parquet files
      input_song_pattern - file pattern for input song files
    """

    # get filepath to song data file
    song_data = input_data + input_song_pattern
    print("Processing song source data: " + song_data)

    # this is not necessary but useful as a code sample for future reference
    songSchema = StructType([StructField("artist_id", StringType()),
                             StructField("artist_latitude", DoubleType()),
                             StructField("artist_location", StringType()),
                             StructField("artist_longitude", DoubleType()),
                             StructField("artist_name", StringType()),
                             StructField("duration", DoubleType()),
                             StructField("num_songs", IntegerType()),
                             StructField("song_id", StringType()),
                             StructField("title", StringType()),
                             StructField("year", IntegerType())
                            ])

    # read song/artist source data file
    dfSongSource = spark.read.json(song_data, schema=songSchema)

    # extract columns to create the songs dataframe (proto songs table)
    # using a Spark SQL query over a temp view of the song source data
    dfSongSource.createOrReplaceTempView("staging_songs")

    songs_table = spark.sql(
    """
    SELECT song_id,
           MIN(title) AS title,
           MIN(artist_id) AS artist_id,
           MIN(year) AS year,
           MIN(duration) AS duration
      FROM staging_songs
     GROUP BY song_id
    """)

    # add unknown dummy row to songs dataframe
    unknownSongRow = spark.createDataFrame([('***UNKNOWN_SONG***', '***Unknown Song***', '***UNKNOWN_ARTIST***', 0, 0)])
    songs_table = songs_table.union(unknownSongRow)

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").format("parquet").mode("overwrite").save(output_data + "songs.parquet")
    print("Processed songs dimension")

    # extract columns to create artists table
    # use Spark SQL query to create artists dataframe (proto artists table)
    dfSongSource.createOrReplaceTempView("staging_songs")

    artists_table = spark.sql(
    """
    SELECT artist_id,
           MIN(artist_name) AS name,
           MIN(artist_location) AS location,
           MIN(artist_latitude) AS latitude,
           MIN(artist_longitude) AS longitude
      FROM staging_songs
     GROUP BY artist_id
    """)

    # add unknown dummy row to artists dataframe
    unknownArtistRow = spark.createDataFrame([('***UNKNOWN_ARTIST***', '*** Unknown Artist ***', '', 0.0, 0.0)])
    artists_table = artists_table.union(unknownArtistRow)

    # write artists table to parquet files
    artists_table.write.format("parquet").mode("overwrite").save(output_data + "artists.parquet")
    print("Processed artists dimension")

    # extract columns to create song keys file dataframe (proto song_keys table)
    song_keys_table = dfSongSource.select(["song_id", "title", "duration", "artist_id", "artist_name"]).dropDuplicates()

    # write song keys table to parquet files
    song_keys_table.write.format("parquet").mode("overwrite").save(output_data + "song_keys.parquet")
    print("Processed song keys table")
Example #18
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data))
 
    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])
    
    df_csv = spark.read.csv(path=s3_input_data,
                            sep='\t',
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    # TODO:  Balance
    

#     tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
#     wordsData = tokenizer.transform(df_csv_cleaned)
    
#     hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
#     featurizedData = hashingTF.transform(wordsData)
    
#     # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
#     # 1) compute the IDF vector 
#     # 2) scale the term frequencies by IDF
#     # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
#     featurizedData.cache()

#     # spark.mllib's IDF implementation provides an option for ignoring terms
#     # which occur in less than a minimum number of documents.
#     # In such cases, the IDF for these terms is set to 0.
#     # This feature can be used by passing the minDocFreq value to the IDF constructor.
#     idf = IDF(inputCol='raw_features', outputCol='features') #, minDocFreq=2)
#     idfModel = idf.fit(featurizedData)

#    features_df = idfModel.transform(featurizedData)
#    features_df.select('star_rating', 'features').show()

#     num_features=300
#     pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
#     pca_model = pca.fit(features_df)
#     pca_features_df = pca_model.transform(features_df).select('star_rating', 'pca_features')
#     pca_features_df.show(truncate=False)

#     standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
#     standard_scaler_model = standard_scaler.fit(pca_features_df)
#     standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select('star_rating', 'scaled_pca_features')
#     standard_scaler_features_df.show(truncate=False)

#     expanded_features_df = (standard_scaler_features_df.withColumn('f', to_array(col('scaled_pca_features')))
#         .select(['star_rating'] + [col('f')[i] for i in range(num_features)]))
#     expanded_features_df.show()

    features_df = df_csv_cleaned.select(['star_rating', 'review_body'])

    # TODO:  Convert to TFRecord
    bert_transformer = udf(lambda str: tokenizer.encode_plus(str,
                                                             pad_to_max_length=True,
                                                             max_length=MAX_SEQ_LENGTH),
                           StringType())
    spark.udf.register('bert_transformer', bert_transformer)

    transformed_df = features_df.withColumn('star_rating', bert_transformer('review_body'))

    # TODO:  Split
    train_df, validation_df, test_df = transformed_df.randomSplit([0.9, 0.05, 0.05])

    # TODO:  Potentially use TFRecord Writer from LI
    train_df.write.csv(path=s3_output_train_data,
                       header=None,
                       quote=None)
    print('Wrote to output file:  {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data,
                            header=None,
                            quote=None)
    print('Wrote to output file:  {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data,
                      header=None,
                      quote=None) 
    print('Wrote to output file:  {}'.format(s3_output_test_data))
Example #19
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("Max Transaction").getOrCreate()
schema = StructType([
    StructField("txn_id", IntegerType(), True),
    StructField("txn_date", StringType(), True),
    StructField("txn_code", IntegerType(), True),
    StructField("txn_amt", IntegerType(), True),
    StructField("txn_product", StringType(), True),
    StructField("txn_city", StringType(), True)
])
df = spark.read.csv(sys.argv[1], schema=schema, header=False)
df.createOrReplaceTempView("transactions")
spark.sql("select txn_city, txn_product, max(txn_amt) from transactions group by txn_city, txn_product")    \
     .coalesce(1).write.mode("overwrite").csv(sys.argv[3])

df2.explain()

spark.stop()
Example #20
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Let's see how much maintenance we could have saved if we used this model
def f(actual, predicted, cost):
    if actual==predicted:
        if actual=='Healthy':
          return 0
        elif actual=='Preventive':
          return cost
        elif actual=='Corrective':
          return 30000
    else:
        return cost
    
predictedCost = F.udf(f, IntegerType())
predictedMaintenance = i2s.transform(predictions)\
  .select('date', 'maintenanceType','predictedLabel')\
  .join(maintCosts, 'date')

costSavings = predictedMaintenance.select('date', 'cost', predictedCost('maintenanceType','predictedLabel','cost').alias('predictedCost'))\
  .withColumn('costSavings', F.col('cost')-F.col('predictedCost'))\
  .groupBy(F.date_format('date','yyyyMM').alias('month'))\
  .agg(F.sum('cost').alias('actualCost'), F.sum('predictedCost').alias('predictedCost'), F.sum('costSavings').alias('predictedSavings'))
  
csPD = costSavings.select('month',F.round('actualCost'),F.round('predictedCost')).toPandas()
csPD.plot(kind='line', x='month')
csPD.describe()

print('Total Cost Savings Using This Model')
costSavings.agg(F.sum('actualCost').alias('TotalCost'),
                F.sum('predictedSavings').alias('TotalSavings')).show()
Example #21
 def sqlType(cls):
     return StructType([
         StructField("type", ByteType(), False),
         StructField("size", IntegerType(), True),
         StructField("indices", ArrayType(IntegerType(), False), True),
         StructField("values", ArrayType(DoubleType(), False), True)])
Example #22
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # TASK 1
    # Load the data into PySpark.

    # For the comments:
    if not os.path.exists("./comments.parquet"):
        comments = context.read.json("comments-minimal.json.bz2")
        comments.write.parquet("comments.parquet")

    # For the submissions:
    if not os.path.exists("./submissions.parquet"):
        submissions = context.read.json("submissions.json.bz2")
        submissions.write.parquet("submissions.parquet")
    #submissions.printSchema()

    # For labelled data:
    if not os.path.exists("./labels.parquet"):
        labels = context.read.format('csv').options(
            header='true', inferSchema='true').load("labeled_data.csv")
        labels.write.parquet("labels.parquet")

    # TASK 2
    # Code for Task 2...
    # For task 2, we will join the labels and comments

    commentsParquet = context.read.parquet("comments.parquet")
    commentsParquet.createOrReplaceTempView("comments")

    labelsParquet = context.read.parquet("labels.parquet")
    labelsParquet.createOrReplaceTempView("labels")

    # Now, compute the join:
    if not os.path.exists("./joinedComments.parquet"):
        joinedComments = context.sql(
            "SELECT labels.Input_id, labels.labeldem, labels.labelgop, labels.labeldjt, body FROM comments JOIN labels on id=Input_id"
        )
        joinedComments.write.parquet("joinedComments.parquet")
    joinedComments = context.read.parquet("joinedComments.parquet")
    joinedComments.createOrReplaceTempView("joinedComments")
    #joinedComments.printSchema()

    # TASK 3
    # NOT NEEDED

    # TASK 4
    # Register the user defined function
    context.registerFunction("sanitize", clean_wrapper,
                             ArrayType(StringType()))

    # TASK 5
    if not os.path.exists("./santized.parquet"):
        sanitizedText = context.sql(
            "SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) as body FROM joinedComments"
        )
        sanitizedText.write.parquet("sanitized.parquet")

    # TASK 6A
    sanitizedText = context.read.parquet("sanitized.parquet")
    sanitizedText.createOrReplaceTempView("sanitizedText")
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
    fitted = cv.fit(sanitizedText)
    vector = fitted.transform(sanitizedText)
    # TASK 6B
    vector.createOrReplaceTempView("vector")
    pos = context.sql("SELECT *, if(labeldjt=1, 1, 0) AS label FROM vector")
    neg = context.sql("SELECT *, if(labeldjt=-1, 1, 0) AS label FROM vector")

    # TASK 7
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # TASK 8 and TASK 9
    # Create the submissions and comments tables from the parquets:
    if not os.path.exists("sanitizedJoinedData.parquet"):
        submissions = context.read.parquet("submissions.parquet")
        submissions.createOrReplaceTempView("submissions")

        comments = context.read.parquet("comments.parquet")
        comments.createOrReplaceTempView("comments")
        comments = comments.sample(False, 0.2, None)
        joinedData = context.sql(
            "SELECT comments.link_id AS id, comments.body, comments.created_utc, submissions.title, comments.author_flair_text, submissions.score AS submission_score, comments.score as comments_score FROM comments JOIN submissions ON REPLACE(comments.link_id, 't3_', '')=submissions.id AND comments.body NOT LIKE '%/s%' AND comments.body NOT LIKE '&gt%'"
        )
        #joinedData.show(joinedData.count(), False)
        #print(str(joinedData.count()))

        # Repeating earlier tasks: Tasks 4 and 5
        joinedData.createOrReplaceTempView("joinedData")
        # Re-register temporary function since we are forced to:
        context.registerFunction("sanitize", clean_wrapper,
                                 ArrayType(StringType()))
        print("writing sanitized parquet now")
        sanitizedJoinedData = context.sql(
            "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, sanitize(body) AS body FROM joinedData"
        )
        sanitizedJoinedData.write.parquet("sanitizedJoinedData.parquet")

    sanitizedJoinedData = context.read.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = sanitizedJoinedData.sample(False, 0.2, None)
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
    newVector = fitted.transform(sanitizedJoinedData)

    seenPosModel = CrossValidatorModel.load("project2/pos.model")
    seenNegModel = CrossValidatorModel.load("project2/neg.model")

    posResult = seenPosModel.transform(newVector)
    posResult = posResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "probability as positive_probability")

    cumResult = seenNegModel.transform(posResult)
    cumResult = cumResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "positive_probability",
                                     "probability as negative_probability")

    cumResult.createOrReplaceTempView("cumResult")

    context.registerFunction("positiveFunc", positiveUDF, IntegerType())
    context.registerFunction("negativeFunc", negativeUDF, IntegerType())
    cumResult = context.sql(
        "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, body, features, positiveFunc(positive_probability) AS positive_probability,negativeFunc(negative_probability) AS negative_probability FROM cumResult"
    )
    cumResult.write.parquet("cumResult.parquet")

    # TASK 10

    cumResult = context.read.parquet("cumResult.parquet")
    cumResult.createOrReplaceTempView("cumResult")
    # Actual 10.2

    task10_6 = context.sql(
        "SELECT DATE(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created ORDER BY date_created"
    )
    task10_6.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_6.csv")

    # Top 10 posts:

    if not os.path.exists("./task10_top_pos.csv"):
        task10_top_pos = cumResult.groupBy('title')\
            .agg(
                 (F.sum('positive_probability') / F.count(F.lit(1))).alias('pct_pos'),
                 F.count(F.lit(1)).alias('count')
                 )\
                .orderBy(F.desc('pct_pos'), F.desc('count')).limit(10)\
                .select('title', 'pct_pos')
        task10_top_pos.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_pos.csv")
    if not os.path.exists("./task10_top_neg.csv"):
        task10_top_neg = cumResult.groupBy('title')\
            .agg(
                 (F.sum('negative_probability') / F.count(F.lit(1))).alias('pct_neg'),
                 F.count(F.lit(1)).alias('count')
                 )\
                .orderBy(F.desc('pct_neg'), F.desc('count')).limit(10)\
                .select('title', 'pct_neg')
        task10_top_neg.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_neg.csv")

    # 10.1
    # Get the number of records
    totalRows = cumResult.count()
    # Calculate percentages
    task10_1 = context.sql(
        "SELECT SUM(positive_probability)/ {0} AS pos, SUM(negative_probability)/{1} AS neg FROM cumResult"
        .format(totalRows, totalRows))

    # 10.2
    task10_2 = context.sql(
        "SELECT DAYOFWEEK(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created"
    )

    # 10.3
    context.registerFunction("checkStateWrapper", checkState, BooleanType())
    task10_3 = context.sql(
        "SELECT author_flair_text AS state, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult WHERE(checkStateWrapper(author_flair_text)) GROUP BY author_flair_text"
    )

    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )
    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )
    #    cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")
Example #23
def process_i94_cit_res_data(spark: SparkSession,
                             df_right: DataFrame) -> DataFrame:
    """ Processes mapping file between countries_of_the_world.csv and i94_data.parquet """

    # Filename of input file
    filename = 's3://data-eng-capstone-cf/staging/i94_cit_res_data.csv'

    # Read into a spark dataframe
    df = spark.read.csv(filename, header=True)

    # Cast country_id as an IntegerType()
    df = df.withColumn('country_id', df.country_id.cast(IntegerType()))

    # Make country unique, by appending '(<country_id>)' to string name
    # Only do when country equals INVALID: STATELESS or INVALID: UNITED STATES
    df = df.withColumn(
        'country',
        F.when(df.country.isin('INVALID: STATELESS', 'INVALID: UNITED STATES'),
               F.concat(df.country, F.lit(' ('), df.country_id,
                        F.lit(')'))).otherwise(df.country))

    # Add foreign key column so can join to df_cow
    df = df.withColumn('country_join', F.initcap('country'))

    # Manual adjustments
    df = df.withColumn(
        'country_join',
        F.when(df.country == 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)', 'Mexico')
        .when(df.country == 'ANTIGUA-BARBUDA', 'Antigua & Barbuda')
        .when(df.country == 'BAHAMAS', 'Bahamas, The')
        .when(df.country == 'BOSNIA-HERZEGOVINA', 'Bosnia & Herzegovina')
        .when(df.country == 'BRITISH VIRGIN ISLANDS', 'British Virgin Is.')
        .when(df.country == 'CENTRAL AFRICAN REPUBLIC', 'Central African Rep.')
        .when(df.country == 'GAMBIA', 'Gambia, The')
        .when(df.country == 'GUINEA-BISSAU', 'Guinea-Bissau')
        .when(df.country == 'MAYOTTE (AFRICA - FRENCH)', 'Mayotte')
        .when(df.country == 'MICRONESIA, FED. STATES OF', 'Micronesia, Fed. St.')
        .when(df.country == 'NORTH KOREA', 'Korea, North')
        .when(df.country == 'SOUTH KOREA', 'Korea, South')
        .when(df.country == 'ST. HELENA', 'Saint Helena')
        .when(df.country == 'ST. KITTS-NEVIS', 'Saint Kitts & Nevis')
        .when(df.country == 'ST. LUCIA', 'Saint Lucia')
        .when(df.country == 'ST. PIERRE AND MIQUELON', 'St Pierre & Miquelon')
        .when(df.country == 'ST. VINCENT-GRENADINES', 'Saint Vincent and the Grenadines')
        .when(df.country == 'TRINIDAD AND TOBAGO', 'Trinidad & Tobago')
        .when(df.country == 'TURKS AND CAICOS ISLANDS', 'Turks & Caicos Is')
        .when(df.country == 'WALLIS AND FUTUNA ISLANDS', 'Wallis and Futuna')
        .when(df.country == 'CHINA, PRC', 'China')
        .otherwise(df.country_join))

    # Define country_fk via left outer join
    df = df.join(df_right, df.country_join == df_right.country, how='left') \
           .select('country_id', df.country, df_right.country.alias('country_fk'))

    # Check schema and count
    df.printSchema()
    df.count()

    # Return transformed dataframe
    return df
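A sketch of a possible driver for this function, assuming df_right comes from countries_of_the_world.csv and exposes a country column (the path below is a placeholder):

# Hypothetical driver code; the CSV location is a placeholder
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("capstone_mapping").getOrCreate()
df_cow = spark.read.csv("s3://my-bucket/staging/countries_of_the_world.csv",
                        header=True)
mapping_df = process_i94_cit_res_data(spark, df_cow)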
Example #24
# COMMAND ----------

# MAGIC %md
# MAGIC #### 2. Define schema for source data
# MAGIC Different years have different schemas - fields added/removed

# COMMAND ----------

#Schema for data based on year and month

#2017
yellowTripSchema2017H1 = StructType([
    StructField("vendor_id", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("rate_code_id", IntegerType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("pickup_location_id", IntegerType(), True),
    StructField("dropoff_location_id", IntegerType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("extra", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("improvement_surcharge", DoubleType(), True),
    StructField("total_amount", DoubleType(), True)
])
Example #25
 def test_udf_with_order_by_and_limit(self):
     my_copy = udf(lambda x: x, IntegerType())
     df = self.spark.range(10).orderBy("id")
     res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1)
     self.assertEqual(res.collect(), [Row(id=0, copy=0)])
Example #26
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [
                StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
                for field in at
            ]
        )
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
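A few concrete conversions showing what from_arrow_type returns for common pyarrow types (illustrative calls, assuming pyarrow is installed):

import pyarrow as pa

print(from_arrow_type(pa.int32()))              # IntegerType
print(from_arrow_type(pa.string()))             # StringType
print(from_arrow_type(pa.list_(pa.float64())))  # ArrayType(DoubleType, containsNull=True)
print(from_arrow_type(pa.struct([pa.field("id", pa.int64()),
                                 pa.field("name", pa.string())])))
# StructType with a LongType 'id' field and a StringType 'name' field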
Example #27
# Check for null value
df.where(df.CustomerID.isNull()).count()
df.where(df.Gender.isNull()).count()
df.where(df.Age.isNull()).count()
df.where(df.AI.isNull()).count()
df.where(df.SS.isNull()).count()


# Change Male and Female to integer values 
from pyspark.sql.functions import *
newDf = df.withColumn('Gender', regexp_replace('Gender', 'Male', '1'))
upDf = newDf.withColumn('Gender', regexp_replace('Gender', 'Female', '2'))

# Cast Gender column to integer
from pyspark.sql.types import IntegerType
upDf = upDf.withColumn("Gender", upDf["Gender"].cast(IntegerType()))
upDf.cache()
upDf.printSchema()


from pyspark.ml.feature import VectorAssembler

# set up data for ML
vectorAssembler = VectorAssembler(inputCols = ['Gender', 'Age', 'AI'], outputCol = 'features')
ml_df = vectorAssembler.transform(upDf)
ml_df = ml_df.select(['features', 'SS'])
ml_df.show(3)

# split data, train and test
splits = ml_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]
Example #28
    def _transform_data(df):
        """Transform original dataset.

        :param df: Input DataFrame.
        :return: Transformed DataFrame.
        """
        # Cast key variables and rename headers
        rename_cols = {
            '_c0': 'id_siniestro',
            '_c1': 'id_poliza',
            '_c2': 'id_producto',
            '_c3': 'fecha_apertura',
            '_c4': 'fecha_terminado',
            '_c5': 'nif_o_intm',
            '_c6': 'nombre',
            '_c7': 'nif_pagador',
            '_c8': 'nombre_pagador',
            '_c9': 'iban',
            '_c10': 'id_mediador'
        }

        for old_name, new_name in rename_cols.items():
            df = df.withColumnRenamed(old_name, new_name)

        # Cast claims id
        df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))

        # We save the other participants columns in a list
        others = [
            'id_siniestro', 'id_poliza', 'fecha_apertura', 'fecha_terminado',
            'iban'
        ] + [col for col in df.columns if col.startswith('_c')]
        df_others = df.select(*others)

        # We drop others from df
        df = df.select(df.columns[:11])
        df = df.drop(
            *['nombre', 'nif_pagador', 'nombre_pagador', 'id_producto'])

        # We add column cod_rol and rol
        df = df.withColumn('rol', lit('Tomador'))
        df = df.withColumn('cod_rol', lit(2))

        # We take intermediary separately
        intermediary = df.drop('nif_o_intm')
        intermediary = intermediary.withColumnRenamed('id_mediador',
                                                      'nif_o_intm')
        intermediary = intermediary.withColumn('rol', lit('Intermediario'))
        intermediary = intermediary.withColumn('cod_rol', lit(3))
        intermediary = intermediary.select([
            'id_siniestro', 'id_poliza', 'fecha_apertura', 'fecha_terminado',
            'nif_o_intm', 'iban', 'rol', 'cod_rol'
        ])
        df = df.drop('id_mediador')

        # We concat the two dataframe
        df = df.union(intermediary)

        # We return with the others and rename ('cod_rol', 'rol', 'nif_o_intm')
        for col in range(11, len(df_others.columns), 3):
            df_others_i = df_others.select([
                'id_siniestro', 'id_poliza', 'fecha_apertura',
                'fecha_terminado', '_c' + str(col + 2), 'iban',
                '_c' + str(col + 1), '_c' + str(col)
            ])
            df_others_i = df_others_i.withColumnRenamed(
                '_c' + str(col), 'cod_rol')
            df_others_i = df_others_i.withColumnRenamed(
                '_c' + str(col + 1), 'rol')
            df_others_i = df_others_i.withColumnRenamed(
                '_c' + str(col + 2), 'nif_o_intm')
            df_others_i = df_others_i.dropna(thresh=1, subset='nif_o_intm')

            df = df.union(df_others_i)
        df = df.dropDuplicates()

        return df
Example #29
# Connect to the database
pw_df = spark.read.jdbc("jdbc:postgresql://timescale.lab11.eecs.umich.edu/powerwatch", "pw_dedupe",
        properties={"user": config['user'], "password": config['password'],"driver":"org.postgresql.Driver"})

#read the data that we care about
pw_df = pw_df.select(pw_df['core_id'],pw_df['time'],pw_df['is_powered'],pw_df['product_id'])
pw_df = pw_df.filter("product_id = 7008 OR product_id= 7009")

#now we need to create a window function that looks at the leading/lagging value of is_powered and detects transitions
#then we can filter out all data that is not a transition
def detectTransition(value1, value2):
    if(value1 == value2):
        return 0
    else:
        return 1
udfDetectTransition = udf(detectTransition, IntegerType())
w = Window.partitionBy("core_id").orderBy(asc("time"))
is_powered_lag = lag("is_powered",1).over(w)
pw_df = pw_df.withColumn("transition", udfDetectTransition("is_powered",is_powered_lag))

#filter out all transitions
pw_df = pw_df.filter("transition != 0")

#now count each outage (really restoration)
def countOutage(value1, value2, value3):
    if(value1 == False and value2 == True and value3 == True):
        return 1
    else:
        return 0
udfCountTransition = udf(countOutage, IntegerType())
is_powered_lead = lead("is_powered",1).over(w)
Example #30
# Generate top 10 movie recommendations for a subset of users
users = ratings.select('userId').distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show(truncate=False)

# Generate top 10 user recommendations for a subset of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.show(truncate=False)

# In[8]:
# Generate top 10 movie recommendations for a specified set of users defined by you
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

df = spark.createDataFrame([1, 2, 3], IntegerType())
df.show()
df = df.select(col("value").alias("userId"))
df.show()

userSubsetRecs = model.recommendForUserSubset(df, 10)
userSubsetRecs.show(truncate=False)

# In[9]:
# Generate top 10 user recommendations for a specified set of movies defined by you
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

df = spark.createDataFrame([1, 2, 3], IntegerType())
df = df.select(col("value").alias("movieId"))
df.show()