def map_parking_tickets_to_centerline_locations_by_hour(
        sc, sqlContext, ticket_file_path=None, centerline_file_path=None,
        output_directory=None):
    """Count parking tickets per street segment, clock hour and AM/PM marker.

    Tickets are matched to centerline segments on street name and on the
    ticket's house number falling inside either the left-side or right-side
    house-number range of the segment. The matches are counted per segment,
    ``Hour`` and ``AMPM`` and written out as CSV.

    Args:
        sc: Spark context, forwarded to the dataset loaders.
        sqlContext: SQL context, forwarded to the dataset loaders.
        ticket_file_path: Location of the parking ticket CSV; the project's
            HDFS default is used when ``None``.
        centerline_file_path: Location of the centerline CSV; the project's
            HDFS default is used when ``None``.
        output_directory: Destination of the CSV output; the project's HDFS
            default is used when ``None``.
    """
    # Fall back to the project's HDFS locations for any path left unset.
    if ticket_file_path is None:
        ticket_file_path = "hdfs:///projects/group12/ParkingData/Parking_Violations_Issued_-_Fiscal_Year_2014__August_2013___June_2014_.csv"
    if centerline_file_path is None:
        centerline_file_path = "hdfs:///projects/group12/StreetData/Centerline.csv"
    if output_directory is None:
        output_directory = "hdfs:///projects/group12/TicketsByHour"

    # Load both datasets (centerline first, then tickets).
    segments = load_centerline_dataset(sc, sqlContext, centerline_file_path)
    tickets = load_parking_dataset(sc, sqlContext, ticket_file_path)

    # Split ViolationTime into its letter part (AMPM) and its digit part
    # (Time), then derive Hour as the integer part of Time / 100.
    tickets = (
        tickets
        .withColumn("AMPM", regexp_replace('ViolationTime', '(\\d+)', ''))
        .withColumn("Time", regexp_replace('ViolationTime', '([a-zA-Z]+)', ''))
    )
    tickets = tickets.withColumn('Hour', (tickets.Time / 100).cast('int'))

    # Keep only rows with a usable 12-hour clock value and an A/P marker.
    has_time_parts = tickets['AMPM'].isNotNull() & tickets['Hour'].isNotNull()
    hour_in_range = (tickets['Hour'] >= 1) & (tickets['Hour'] <= 12)
    valid_marker = (tickets['AMPM'] == 'A') | (tickets['AMPM'] == 'P')
    tickets = (
        tickets
        .filter(has_time_parts)
        .filter(hour_in_range)
        .filter(valid_marker)
    )

    # The house-number boundaries must be integers for the range comparison.
    for boundary in ("L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN"):
        segments = segments.withColumn(
            boundary, segments[boundary].cast(IntegerType()))

    # A ticket belongs to a segment when the street matches and the house
    # number lies within the right-side or the left-side range.
    house_number = tickets['HouseNumber']
    same_street = segments.ST_LABEL == tickets['StreetName']
    on_right_side = ((house_number >= segments.R_LOW_HN)
                     & (house_number <= segments.R_HIGH_HN))
    on_left_side = ((house_number >= segments.L_LOW_HN)
                    & (house_number <= segments.L_HIGH_HN))
    joined = tickets.join(
        segments, same_street & (on_right_side | on_left_side), "inner")

    # Count tickets per segment and per hour / AM-PM marker.
    per_segment_hour = joined.groupBy(
        "PHYSICALID", "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN",
        "ST_LABEL", "the_geom", "Hour", "AMPM").count()

    # Only the geometry, the ticket count and the time bucket are exported.
    per_segment_hour = per_segment_hour.select(
        "the_geom", "count", "Hour", "AMPM")
    per_segment_hour.write.csv(output_directory)
def test_udf2(self):
    """Check that a registered Python UDF works in both SELECT and WHERE."""
    with self.tempView("test"):
        self.spark.catalog.registerFunction(
            "strlen", lambda string: len(string), IntegerType())

        # One-row view whose single string column has length 4.
        source_rdd = self.sc.parallelize([Row(a="test")])
        self.spark.createDataFrame(source_rdd).createOrReplaceTempView("test")

        query = "SELECT strlen(a) FROM test WHERE strlen(a) > 1"
        (only_row,) = self.spark.sql(query).collect()
        self.assertEqual(4, only_row[0])
def mock_input_prep(input_data):
    """Inflate a DataFrame into a larger mock dataset.

    Steps:
        1. Build a UDF that turns a value ``n`` into an array holding ``n``
           copies of ``n``.
        2. Apply it to ``TOTAL_CLIENTS`` and store the array in column ``N``.
        3. Explode ``N`` into column ``MOCK``, which duplicates each row once
           per array element.
        4. Overwrite ``TOTAL_CLIENTS`` with
           ``TOTAL_CLIENTS * monotonically_increasing_id() * 3`` cast to int.
        5. Drop the helper columns ``MOCK`` and ``N``.

    Parameters:
        input_data : spark.DataFrame
            Must contain a ``TOTAL_CLIENTS`` column. The original
            documentation lists the expected columns as NMPTF (string),
            TOTAL_CLIENTS, TOTAL_CONTRATS, ECH_1, ECH_2, ECH_3, ECH_4, ECH_6,
            ECH_8, ECH_9, ECH_10, ECH_11 and ECH_12 (all long).

    Returns:
        spark.DataFrame with the same column names as ``input_data``;
        ``TOTAL_CLIENTS`` holds the value computed in step 4.
    """
    def repeat_n_times(n):
        return [n] * n

    n_to_array = udf(repeat_n_times, ArrayType(IntegerType()))

    with_array = input_data.withColumn(
        'N', n_to_array(input_data["TOTAL_CLIENTS"]))
    exploded = with_array.withColumn("MOCK", explode(with_array["N"]))

    scaled_clients = (
        exploded["TOTAL_CLIENTS"] * monotonically_increasing_id() * 3
    ).cast("int")
    mock_input_data_final = exploded.withColumn("TOTAL_CLIENTS", scaled_clients)

    # Remove the helper columns in the same order as before.
    for helper_column in ("MOCK", "N"):
        mock_input_data_final = mock_input_data_final.drop(helper_column)
    return mock_input_data_final
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 12:47:58 2020

@author: srishti

Print the total amount spent per customer, in ascending order of the total.
"""
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = (
    SparkSession.builder
    .appName("TotalSpentByCustomer")
    .master("local[*]")
    .getOrCreate()
)

# Explicit schema for the customer-orders file.
customerOrderSchema = StructType([
    StructField("cust_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount_spent", FloatType(), True),
])

# Load the orders with that schema.
orders_reader = spark.read.schema(customerOrderSchema)
customersDF = orders_reader.csv("../data/customer-orders.csv")

# Sum the spend per customer and sort by the generated sum column.
spend_per_customer = customersDF.groupBy("cust_id").sum("amount_spent")
spend_sorted = spend_per_customer.sort("sum(amount_spent)")

# Passing the order count as the row limit makes show() print every total.
spend_sorted.show(customersDF.count())
def test_single_udf_with_repeated_argument(self):
    """Regression test for SPARK-20685: a UDF called with the same argument twice."""
    self.spark.catalog.registerFunction(
        "add", lambda x, y: x + y, IntegerType())
    result = self.spark.sql("SELECT add(1, 1)")
    row = result.first()
    self.assertEqual(tuple(row), (2, ))
def cluster_gps(ds: DataStream,
                epsilon_constant: int = 1000,
                km_per_radian: float = 6371.0088,
                geo_fence_distance: int = 30,
                minimum_points_in_cluster: int = 1,
                latitude_column_name: str = 'latitude',
                longitude_column_name: str = 'longitude'):
    """
    Cluster GPS data - Algorithm used to cluster GPS data is based on DBScan

    Each group of the grouped DataStream is clustered independently. Every
    input row is returned with four extra columns: ``centroid_id`` (the
    DBSCAN label), ``centroid_latitude`` / ``centroid_longitude`` (the member
    point of the cluster closest to its centroid) and ``centroid_area`` (the
    ``area`` attribute of the convex hull of the reprojected cluster points,
    or 1 when it cannot be computed).

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        epsilon_constant (int): divisor applied to ``geo_fence_distance``
            together with ``km_per_radian`` to obtain the DBSCAN ``eps`` in
            radians (presumably a meters-to-kilometers factor - confirm)
        km_per_radian (float): kilometers per radian on the earth's surface
        geo_fence_distance (int): neighbourhood distance used for ``eps``
            (presumably meters - confirm)
        minimum_points_in_cluster (int): DBSCAN ``min_samples``; groups with
            fewer rows than this produce no output rows
        latitude_column_name (str): name of the latitude column
        longitude_column_name (str): name of the longitude column

    Returns:
        DataStream object

    Raises:
        Exception: if ``ds`` does not wrap grouped data.
    """
    centroid_id_name = 'centroid_id'
    features_list = [
        StructField('centroid_longitude', DoubleType()),
        StructField('centroid_latitude', DoubleType()),
        StructField('centroid_id', IntegerType()),
        StructField('centroid_area', DoubleType())
    ]

    # Output schema = input columns + the centroid columns.
    schema = StructType(ds._data._df.schema.fields + features_list)
    column_names = [a.name for a in schema.fields]

    def reproject(latitude, longitude):
        """Project lat/long degrees onto a local plane measured in meters."""
        from math import pi, cos, radians
        earth_radius = 6371009  # in meters
        lat_dist = pi * earth_radius / 180.0

        y = [lat * lat_dist for lat in latitude]
        x = [
            long * lat_dist * cos(radians(lat))
            for lat, long in zip(latitude, longitude)
        ]
        return np.column_stack((x, y))

    def get_centermost_point(cluster: np.ndarray) -> object:
        """
        Get center most point of a cluster

        Args:
            cluster (np.ndarray): rows of (latitude, longitude)

        Returns:
            list: [latitude, longitude, area] of the member point closest to
            the cluster centroid
        """
        try:
            if cluster.shape[0] >= 3:
                points_project = reproject(cluster[:, 0], cluster[:, 1])
                hull = ConvexHull(points_project)
                area = hull.area
            else:
                area = 1
        except Exception:
            # Best effort: if the hull cannot be computed, fall back to the
            # same placeholder area used for clusters of fewer than 3 points.
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            area = 1

        centroid_geom = MultiPoint(cluster).centroid
        centroid = (centroid_geom.x, centroid_geom.y)
        centermost_point = min(
            cluster, key=lambda point: great_circle(point, centroid).m)
        return list(centermost_point) + [area]

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('gps--org.md2k.phonesensor--phone', 'gps_clustering',
                 'gps--org.md2k.clusters', ['user', 'timestamp'],
                 ['user', 'timestamp'])
    def gps_clustering(data):
        if data.shape[0] < minimum_points_in_cluster:
            # Not enough points in this group: emit no rows.
            return pd.DataFrame([], columns=column_names)
        elif data.shape[0] < 2:
            # Fewer than two rows: the point is treated as its own centroid.
            data['centroid_area'] = 1
            data['centroid_id'] = 0
            data['centroid_latitude'] = data[latitude_column_name].values[0]
            data['centroid_longitude'] = data[longitude_column_name].values[0]
            return data

        coords = np.float64(
            data[[latitude_column_name, longitude_column_name]].values)

        # The haversine metric works on radians, so eps is an angle.
        epsilon = geo_fence_distance / (epsilon_constant * km_per_radian)

        db = DBSCAN(eps=epsilon,
                    min_samples=minimum_points_in_cluster,
                    algorithm='ball_tree',
                    metric='haversine').fit(np.radians(coords))

        data[centroid_id_name] = db.labels_
        cluster_labels = db.labels_
        cluster_names = np.unique(cluster_labels)

        # One array of member coordinates per cluster label.
        clusters = pd.Series(
            [coords[cluster_labels == n] for n in cluster_names])
        centermost_points = clusters.map(get_centermost_point)
        centermost_points = np.array(centermost_points)

        all_dict = []
        for i, label in enumerate(cluster_names):
            cols = np.array(centermost_points[i])
            all_dict.append([label, cols[0], cols[1], cols[2]])

        temp_df = pd.DataFrame(all_dict,
                               columns=[
                                   centroid_id_name, 'centroid_latitude',
                                   'centroid_longitude', 'centroid_area'
                               ])

        # Attach the centroid description of its cluster to every input row.
        data = pd.merge(data,
                        temp_df,
                        how='left',
                        left_on=[centroid_id_name],
                        right_on=[centroid_id_name])
        return data

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm"
        )

    data = ds._data.apply(gps_clustering)
    results = DataStream(data=data, metadata=Metadata())
    metadata = update_metadata(
        stream_metadata=results.metadata,
        stream_name="gps--org.md2k.clusters",
        stream_desc="GPS clusters computed using DBSCAN algorithm.",
        module_name="cerebralcortex.algorithms.gps.clustering.cluster_gps",
        module_version="1.0.0",
        authors=[{
            "Azim": "*****@*****.**"
        }])
    results.metadata = metadata
    return results
# Salted join demo: every row of the big DataFrame is exploded over the salts
# 0..4, every row of the small DataFrame gets one random salt in 0..4, and the
# two are left-joined on the concatenation "<number><salt>".
# NOTE(review): the original header comment described the opposite assignment
# (explode the small DF, random salt on the big DF) - confirm which is intended.
# Expects `sc` and `sqlContext` to be provided by the environment (e.g. shell).
from pyspark.sql import Row, functions
# Star import kept because other code in this file may rely on the names it
# brings in; `lit` below comes from it.
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

# Both input lists are defined before use; previously `small` was referenced
# before its assignment, which raised NameError.
small = [1, 2, 3, 4]
big = [1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3]

rdd1 = sc.parallelize(small)
row_rdd = rdd1.map(lambda x: Row(x))
Small = sqlContext.createDataFrame(row_rdd, ['numbers_1'])

rdd12 = sc.parallelize(big)
row_rdd2 = rdd12.map(lambda x: Row(x))
Big = sqlContext.createDataFrame(row_rdd2, ['numbers_2'])

# Small side: one random salt per row, then the join key.
Small = Small.withColumn('rand_col',
                         (functions.rand() * 5).cast(IntegerType()))
Small = Small.withColumn(
    'KeyS', functions.concat(col("numbers_1"), col("rand_col")))

# Big side: one copy of each row per possible salt value, then the join key.
Big = Big.withColumn(
    'expld',
    functions.explode(
        functions.array(lit(0), lit(1), lit(2), lit(3), lit(4))))
Big = Big.withColumn(
    'KeyB', functions.concat(col("numbers_2"), col("expld")))

Big.join(Small, Big.KeyB == Small.KeyS, how='left').show()
# Each input line is "<plant>,<item>,<item>,...": split off the plant name,
# then split the remainder into the list of items.
parts = (
    lines
    .map(lambda l: l.split(",", 1))
    .map(lambda l: [l[0], l[1].split(",")])
)
plantsRDD = parts.map(lambda p: Row(plant=p[0], items=p[1]))
plantsRDD_result = spark.createDataFrame(plantsRDD)

# Order by plant name and attach a generated id column.
plants_sorted = plantsRDD_result.orderBy('plant')
plants_withID = plants_sorted.withColumn("id", monotonically_increasing_id())
plants_withID.createOrReplaceTempView("plant_states")

getFrequentItems = plants_withID.select("id", "items")

# argv[3] = minimum support, argv[4] = minimum confidence.
min_support = float(sys.argv[3])
min_confidence = float(sys.argv[4])
fpGrowth = FPGrowth(itemsCol="items",
                    minSupport=min_support,
                    minConfidence=min_confidence)
model = fpGrowth.fit(getFrequentItems)


def get_antecedent_length(antecedent):
    """Return the number of items in a rule's antecedent."""
    return len(antecedent)


antecedent_length_func = udf(get_antecedent_length, IntegerType())

association_rules = model.associationRules
antecedent_length = antecedent_length_func("antecedent").alias(
    "antecedent_length")
freq_item_table = association_rules.select(
    "antecedent", "consequent", "confidence", antecedent_length)
#.orderBy("items", "freq").show(int(sys.argv[2]))
freq_item_table.createOrReplaceTempView("fre_antecedent_result")

# argv[2] = number of rules to print, longest antecedent first and then
# highest confidence first.
result_rows = sys.argv[2]
ranked_rules = spark.sql(
    "SELECT antecedent, consequent, confidence FROM fre_antecedent_result ORDER BY antecedent_length desc, confidence desc"
)
ranked_rules.show(int(result_rows))
def _helper(df, feature_name, feature_value):
    """Append a 0/1 indicator column for one value of a feature.

    The new column is named ``<feature_name>_<feature_value>`` and holds 1
    where ``df[feature_name]`` equals ``feature_value`` and 0 otherwise.
    """
    indicator_name = feature_name + '_' + str(feature_value)

    def is_value(x):
        if x == feature_value:
            return 1
        return 0

    indicator = udf(is_value, IntegerType())
    return df.withColumn(indicator_name, indicator(df[feature_name]))
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.functions import to_date, udf
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, ArrayType

# Local session with three worker threads.
conf = SparkConf()
conf.setAppName("Max Temperature")
conf.setMaster("local[3]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Two-column schema: integer id and string type.
my_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("type", StringType()),
    )
])

# Disabled experiments, kept for reference:
# def my_fun(x, y):
#     if y == "tri":
#         return x + 1
#     else:
#         return x
#
#
# def my_fun2(lines):
#     x = 0
#     fields = lines.split(" ")
#     for field in fields:
#         x = x + field
#     return x
#
#
# df1 = spark.read.schema(my_schema).csv(r"D:\pythonProject\tammingBigDataSparkPython\bigdatausergroup")
#
# df1.show()
def change_to_int(data, col):
    """Cast each column named in ``col`` to IntegerType and return the result.

    Args:
        data: DataFrame whose columns are to be converted.
        col: iterable of column names to cast.
    """
    for column_name in col:
        as_integer = data[column_name].cast(IntegerType())
        data = data.withColumn(column_name, as_integer)
    return data
(("Satyajeet", "", ""), "2000-05-19", "M", 4000), (("Rajeev", "Kumar", "Jha"), "1978-09-05", "M", 4000), (("Anshika", "", "Srivastava"), "2000-12-01", "F", 4000), (("Yogita", "", "Bhardwaj"), "1990-02-17", "F", -1)] schema = StructType([ StructField( "name", StructType([ StructField("firstname", StringType(), True), StructField("middlename", StringType(), True), StructField("lastname", StringType(), True) ]), True), StructField("dob", StringType(), True), StructField("gender", StringType(), True), StructField("salary", IntegerType(), True) ]) spark = (SparkSession.builder.appName("using_withcolumnrenamed").getOrCreate()) df = spark.createDataFrame(data=data, schema=schema) df.show(truncate=False) df.printSchema() df.withColumnRenamed("dob", "dateofbirth").printSchema() df2 = (df.withColumnRenamed("dob", "dateofbirth").withColumnRenamed( "salary", "sal_amount")) df2.printSchema()
# Carolyn Mason
# 12/12/18
# Big Data Analytics CSCIE-63
# Join the data sets
from pyspark.sql.types import *
from pyspark.sql.functions import expr, desc, col
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType

# Custom schemas: (column name, Spark type) pairs in file order; every column
# is nullable.
# cylinders,displ,drive,fueltype,make,model,ucity,uhighway,transmission,vclass,year,rn
_FUEL_COLUMNS = [
    ("cylinders", FloatType),
    ("displ", FloatType),
    ("drive", StringType),
    ("fueltype", StringType),
    ("make", StringType),
    ("model", StringType),
    ("ucity", FloatType),
    ("uhighway", FloatType),
    ("transmission", StringType),
    ("vclass", StringType),
    ("year", IntegerType),
    ("rn", IntegerType),
]
fields = [
    StructField(column_name, column_type(), True)
    for column_name, column_type in _FUEL_COLUMNS
]
fuelSchema = StructType(fields)

# title,url,price,address,vin,odometer,condition,cylinders,drive,fuel,paint_color,size,title_status,transmission,type,year,make,model,description
_LISTING_COLUMNS = [
    ("title", StringType),
    ("url", StringType),
    ("price", FloatType),
    ("address", StringType),
    ("vin", StringType),
    ("odometer", FloatType),
    ("condition", StringType),
    ("cylinders", FloatType),
    ("drive", StringType),
    ("fuel", StringType),
    ("paint_color", StringType),
    ("size", StringType),
    ("title_status", StringType),
    ("transmission", StringType),
    ("type", StringType),
    ("year", IntegerType),
    ("make", StringType),
    ("model", StringType),
    ("description", StringType),
]
fields2 = [
    StructField(column_name, column_type(), True)
    for column_name, column_type in _LISTING_COLUMNS
]
dataSchema = StructType(fields2)

# Load the CSVs into data frames using the explicit schemas.
# Earlier variant that inferred the schemas instead:
#df_fuel = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("fuel_simple.csv")
#df_craigslist = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("vans.csv")
csv_with_header = spark.read.format("csv").option("header", "true")
df_fuel = csv_with_header.schema(fuelSchema).load("fuel_simple.csv")
csv_with_header = spark.read.format("csv").option("header", "true")
df_craigslist = csv_with_header.schema(dataSchema).load("vans.csv")

# Create tables to query using SQL
df_fuel.createOrReplaceTempView("fuel")
df_craigslist.createOrReplaceTempView("data")

# Joins
#cylinders,displ,drive,fueltype,make,model,ucity,uhighway,transmission,vclass,year,rn
#title,url,price,address,vin,odometer,condition,cylinders,drive,fuel,paint_color,size,title_status,transmission,type,year,make,model,description
# find spark path
import findspark
findspark.init()

# import necessary packages&methods
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test").getOrCreate()
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# read files
path = "D:\\CMU\\Tasks\\Task15_Big_Data_Analysis\\home_depot_data\\"

# load train data with specified StructType; malformed rows are dropped
schema_train = StructType([
    StructField('id', IntegerType(), True),
    StructField('product_uid1', IntegerType(), True),
    StructField('product_title', StringType(), True),
    StructField('search_term', StringType(), True),
    StructField('relevance', FloatType(), True),
])
train_raw = spark.read.csv(
    path + "train.csv",
    header=True,
    mode="DROPMALFORMED",
    schema=schema_train,
)
traindata = train_raw.select(
    "product_uid1", "product_title", "search_term", "relevance")

# load test data (same layout as train, without the relevance column)
schema_test = StructType([
    StructField('id', IntegerType(), True),
    StructField('product_uid1', IntegerType(), True),
    StructField('product_title', StringType(), True),
    StructField('search_term', StringType(), True),
])
test_raw = spark.read.csv(
    path + "test.csv",
    header=True,
    mode="DROPMALFORMED",
    schema=schema_test,
)
testdata = test_raw.select(
    "id", "product_uid1", "product_title", "search_term")

# load product description data, ordered by product id
schema_desc = StructType([
    StructField('product_uid2', IntegerType(), True),
    StructField('product_description', StringType(), True),
])
descriptions_raw = spark.read.csv(
    path + "product_descriptions.csv", header=True, schema=schema_desc)
descrdata = descriptions_raw.orderBy("product_uid2")

# preview the first rows of each data set
traindata.show(10)
testdata.show(10)
descrdata.show(10)

from pyspark.sql.functions import regexp_replace, col, when
from pyspark.sql import SparkSession from pyspark.sql import functions as func from pyspark.sql.types import StructType, StructField, IntegerType, StringType ''' List the names of all superheroes with only ONE connection compute the actual smallet number of connections in the data set instea of assuing it is one ''' spark = SparkSession.builder.appName("MostObscureSuperheroes").getOrCreate() schema = StructType([ \ StructField("id", IntegerType(), True), \ StructField("name", StringType(), True)]) names = spark.read.schema(schema).option( "sep", " ").csv("file:///SparkCourse/Marvel-names.txt") lines = spark.read.text("file:///SparkCourse/Marvel-graph.txt") connections = lines.withColumn("id", func.split(func.col("value"), " ")[0]) \ .withColumn("connections", func.size(func.split(func.col("value"), " ")) - 1) \ .groupBy("id").agg(func.sum("connections").alias("connections")) minConnectionCount = connections.agg(func.min("connections")).first()[0] minConnections = connections.filter( func.col("connections") == minConnectionCount) minConnectionsWithNames = minConnections.join(names, "id") print("The following characters have only " + str(minConnectionCount) +
# textFile accepts a comma-separated list of paths.
filename_string = ",".join(filenames)

# Ingest raw and parse
raw = spark.sparkContext.textFile(filename_string)
## TODO: read schema from JSON
parsed = raw.map(parse_line)

# Establish common event schema: (column name, Spark type) in record order;
# every column is nullable.
_EVENT_COLUMNS = [
    ('trade_dt', DateType()),
    ('rec_type', StringType()),
    ('symbol', StringType()),
    ('exchange', StringType()),
    ('event_tm', TimestampType()),
    ('event_seq_nb', IntegerType()),
    ('arrival_tm', TimestampType()),
    ('trade_pr', DecimalType(17, 14)),
    ('bid_pr', DecimalType(17, 14)),
    ('bid_size', IntegerType()),
    ('ask_pr', DecimalType(17, 14)),
    ('ask_size', IntegerType()),
    ('partition', StringType()),
]
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in _EVENT_COLUMNS
])

# Create dataframe with parsed data and schema
df = spark.createDataFrame(parsed, schema)
df.show(10)

# Persist as parquet, one directory per value of the "partition" column.
partitioned_writer = df.write.partitionBy("partition").mode("overwrite")
partitioned_writer.parquet("ingest-data")
def process_song_data(spark, input_data, output_data, input_song_pattern):
    """
    Build the song and artist dimensions (plus a song-keys table) from the
    source song JSON files and write each of them as parquet.

    Parameters:
        spark - Spark session
        input_data - filepath to source json files
        output_data - filepath to target parquet files
        input_song_pattern - file pattern for input song files
    """
    # Full path/pattern of the song source files.
    song_data = input_data + input_song_pattern
    print("Processing song source data: " + song_data)

    # An explicit schema is not strictly required here; it is kept as a
    # reference example.
    songSchema = StructType([
        StructField("artist_id", StringType()),
        StructField("artist_latitude", DoubleType()),
        StructField("artist_location", StringType()),
        StructField("artist_longitude", DoubleType()),
        StructField("artist_name", StringType()),
        StructField("duration", DoubleType()),
        StructField("num_songs", IntegerType()),
        StructField("song_id", StringType()),
        StructField("title", StringType()),
        StructField("year", IntegerType()),
    ])

    # Read the song/artist source data.
    dfSongSource = spark.read.json(song_data, schema=songSchema)

    # --- songs dimension: one row per song_id ---
    dfSongSource.createOrReplaceTempView("staging_songs")
    songs_query = (
        " SELECT song_id, MIN(title) AS title, MIN(artist_id) AS artist_id,"
        " MIN(year) AS year, MIN(duration) AS duration"
        " FROM staging_songs GROUP BY song_id "
    )
    songs_table = spark.sql(songs_query)

    # Append the placeholder row used for unknown songs.
    unknown_song = ('***UNKNOWN_SONG***', '***Unknown Song***',
                    '***UNKNOWN_ARTIST***', 0, 0)
    songs_table = songs_table.union(spark.createDataFrame([unknown_song]))

    # Songs are written partitioned by year and artist.
    songs_writer = (
        songs_table.write
        .partitionBy("year", "artist_id")
        .format("parquet")
        .mode("overwrite")
    )
    songs_writer.save(output_data + "songs.parquet")
    print("Processed songs dimension")

    # --- artists dimension: one row per artist_id ---
    dfSongSource.createOrReplaceTempView("staging_songs")
    artists_query = (
        " SELECT artist_id, MIN(artist_name) AS name,"
        " MIN(artist_location) AS location, MIN(artist_latitude) AS latitude,"
        " MIN(artist_longitude) AS longitude"
        " FROM staging_songs GROUP BY artist_id "
    )
    artists_table = spark.sql(artists_query)

    # Append the placeholder row used for unknown artists.
    unknown_artist = ('***UNKNOWN_ARTIST***', '*** Unknown Artist ***', '',
                      0.0, 0.0)
    artists_table = artists_table.union(
        spark.createDataFrame([unknown_artist]))

    artists_writer = artists_table.write.format("parquet").mode("overwrite")
    artists_writer.save(output_data + "artists.parquet")
    print("Processed artists dimension")

    # --- song keys: distinct song/artist identifying columns ---
    key_columns = ["song_id", "title", "duration", "artist_id", "artist_name"]
    song_keys_table = dfSongSource.select(key_columns).dropDuplicates()

    song_keys_writer = song_keys_table.write.format("parquet").mode("overwrite")
    song_keys_writer.save(output_data + "song_keys.parquet")
    print("Processed song keys table")
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data):
    """Read the tab-separated reviews, encode the review text and write train/validation/test splits.

    Args:
        spark: Spark session.
        s3_input_data: path of the tab-separated input file(s), with header.
        s3_output_train_data: output path for the ~90% training split.
        s3_output_validation_data: output path for the ~5% validation split.
        s3_output_test_data: output path for the ~5% test split.

    Relies on the names ``tokenizer`` and ``MAX_SEQ_LENGTH`` being defined
    outside this function.
    """
    # The original format string had two placeholders for four arguments, so
    # the validation and test paths were silently dropped from the message.
    print('Processing {} => {}, {}, {}'.format(s3_input_data,
                                               s3_output_train_data,
                                               s3_output_validation_data,
                                               s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data,
                            sep='\t',
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    # Drop rows without review text, then show that none remain.
    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    # TODO: Balance

    # Disabled TF-IDF / PCA / scaling pipeline, kept for reference:
    # tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    # wordsData = tokenizer.transform(df_csv_cleaned)

    # hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    # featurizedData = hashingTF.transform(wordsData)

    # # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # # 1) compute the IDF vector
    # # 2) scale the term frequencies by IDF
    # # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    # featurizedData.cache()

    # # spark.mllib's IDF implementation provides an option for ignoring terms
    # # which occur in less than a minimum number of documents.
    # # In such cases, the IDF for these terms is set to 0.
    # # This feature can be used by passing the minDocFreq value to the IDF constructor.
    # idf = IDF(inputCol='raw_features', outputCol='features') #, minDocFreq=2)
    # idfModel = idf.fit(featurizedData)
    # features_df = idfModel.transform(featurizedData)
    # features_df.select('star_rating', 'features').show()

    # num_features=300
    # pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    # pca_model = pca.fit(features_df)
    # pca_features_df = pca_model.transform(features_df).select('star_rating', 'pca_features')
    # pca_features_df.show(truncate=False)

    # standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    # standard_scaler_model = standard_scaler.fit(pca_features_df)
    # standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select('star_rating', 'scaled_pca_features')
    # standard_scaler_features_df.show(truncate=False)

    # expanded_features_df = (standard_scaler_features_df.withColumn('f', to_array(col('scaled_pca_features')))
    #     .select(['star_rating'] + [col('f')[i] for i in range(num_features)]))
    # expanded_features_df.show()

    features_df = df_csv_cleaned.select(['star_rating', 'review_body'])

    # TODO: Convert to TFRecord

    # NOTE(review): the UDF is declared to return StringType while it returns
    # whatever tokenizer.encode_plus() produces - confirm the intended
    # serialization. The parameter was renamed so it no longer shadows the
    # builtin ``str``.
    bert_transformer = udf(
        lambda text: tokenizer.encode_plus(text,
                                           pad_to_max_length=True,
                                           max_length=MAX_SEQ_LENGTH),
        StringType())
    spark.udf.register('bert_transformer', bert_transformer)

    # NOTE(review): this overwrites 'star_rating' with the encoded review
    # text, so the original rating is not present in the output - confirm
    # that this is intended before relying on the written files.
    transformed_df = features_df.withColumn('star_rating', bert_transformer('review_body'))

    # TODO: Split
    train_df, validation_df, test_df = transformed_df.randomSplit([0.9, 0.05, 0.05])

    # TODO: Potentially use TFRecord Writer from LI
    train_df.write.csv(path=s3_output_train_data,
                       header=None,
                       quote=None)
    print('Wrote to output file:  {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data,
                            header=None,
                            quote=None)
    print('Wrote to output file:  {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data,
                      header=None,
                      quote=None)
    print('Wrote to output file:  {}'.format(s3_output_test_data))
import sys

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Usage: <script> <input csv> ... <output dir>
#   sys.argv[1] - headerless transactions CSV
#   sys.argv[3] - output directory for the result CSV
# NOTE(review): the output path is read from argv[3] while argv[2] is unused -
# confirm against the way this job is submitted.
spark = SparkSession.builder.appName("Max Transaction").getOrCreate()

schema = StructType([
    StructField("txn_id", IntegerType(), True),
    StructField("txn_date", StringType(), True),
    StructField("txn_code", IntegerType(), True),
    StructField("txn_amt", IntegerType(), True),
    StructField("txn_product", StringType(), True),
    StructField("txn_city", StringType(), True)
])

df = spark.read.csv(sys.argv[1], schema=schema, header=False)
df.createOrReplaceTempView("transactions")

# Largest transaction amount per (city, product). The result is bound to df2
# so the explain() call below has something to act on; previously df2 was
# never defined, so the script died with NameError before reaching
# spark.stop().
df2 = spark.sql("select txn_city, txn_product, max(txn_amt) from transactions group by txn_city, txn_product")

# coalesce(1) so the result is written as a single part file.
df2.coalesce(1).write.mode("overwrite").csv(sys.argv[3])

df2.explain()

spark.stop()
accuracy = evaluator.evaluate(predictions) print("Test Error = %g" % (1.0 - accuracy)) # Let's see how much maintenance we could have saved if we used this model def f(actual, predicted, cost): if actual==predicted: if actual=='Healthy': return 0 elif actual=='Preventive': return cost elif actual=='Corrective': return 30000 else: return cost predictedCost = F.udf(f, IntegerType()) predictedMaintenance = i2s.transform(predictions)\ .select('date', 'maintenanceType','predictedLabel')\ .join(maintCosts, 'date') costSavings = predictedMaintenance.select('date', 'cost', predictedCost('maintenanceType','predictedLabel','cost').alias('predictedCost'))\ .withColumn('costSavings', F.col('cost')-F.col('predictedCost'))\ .groupBy(F.date_format('date','yyyyMM').alias('month'))\ .agg(F.sum('cost').alias('actualCost'), F.sum('predictedCost').alias('predictedCost'), F.sum('costSavings').alias('predictedSavings')) csPD = costSavings.select('month',F.round('actualCost'),F.round('predictedCost')).toPandas() csPD.plot(kind='line', x='month') csPD.describe() print('Total Cost Savings Using This Model') costSavings.agg(F.sum('actualCost').alias('TotalCost'),
def sqlType(cls):
    """Return the StructType describing how this type is stored in SQL.

    Fields: ``type`` (byte, not nullable), ``size`` (int), ``indices``
    (array of non-null ints) and ``values`` (array of non-null doubles).
    """
    index_array = ArrayType(IntegerType(), False)
    value_array = ArrayType(DoubleType(), False)
    storage_fields = [
        StructField("type", ByteType(), False),
        StructField("size", IntegerType(), True),
        StructField("indices", index_array, True),
        StructField("values", value_array, True),
    ]
    return StructType(storage_fields)
def main(context):
    """Main function takes a Spark SQL context.

    Runs the whole pipeline end to end:

    * Tasks 1-2: cache the raw inputs as parquet and join comments to labels.
    * Tasks 4-5: register the ``sanitize`` UDF and cache the sanitized text.
    * Tasks 6-7: vectorize, build positive/negative labels, train and save
      two cross-validated logistic regression models.
    * Tasks 8-9: join comments with submissions, sanitize, and score them
      with both saved models.
    * Task 10: aggregate the scored results and write CSV reports.

    Intermediate parquet files guarded by ``os.path.exists`` are reused on
    later runs. The model saves, ``cumResult.parquet`` and the unguarded
    task10 CSV writes use the writers' default mode and are not guarded.

    :param context: Spark SQL context used for all reads, SQL and UDF
        registration.
    """
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # TASK 1
    # Load the data into PySpark and cache each input as parquet.
    # For the comments:
    if not os.path.exists("./comments.parquet"):
        comments = context.read.json("comments-minimal.json.bz2")
        comments.write.parquet("comments.parquet")

    # For the submissions:
    if not os.path.exists("./submissions.parquet"):
        submissions = context.read.json("submissions.json.bz2")
        submissions.write.parquet("submissions.parquet")

    # For labelled data:
    if not os.path.exists("./labels.parquet"):
        labels = context.read.format('csv').options(
            header='true', inferSchema='true').load("labeled_data.csv")
        labels.write.parquet("labels.parquet")

    # TASK 2
    # Join the labels and comments.
    commentsParquet = context.read.parquet("comments.parquet")
    commentsParquet.createOrReplaceTempView("comments")

    labelsParquet = context.read.parquet("labels.parquet")
    labelsParquet.createOrReplaceTempView("labels")

    # Now, compute the join:
    if not os.path.exists("./joinedComments.parquet"):
        joinedComments = context.sql(
            "SELECT labels.Input_id, labels.labeldem, labels.labelgop, labels.labeldjt, body FROM comments JOIN labels on id=Input_id"
        )
        joinedComments.write.parquet("joinedComments.parquet")

    joinedComments = context.read.parquet("joinedComments.parquet")
    joinedComments.createOrReplaceTempView("joinedComments")

    # TASK 3
    # NOT NEEDED

    # TASK 4
    # Register the user defined function
    context.registerFunction("sanitize", clean_wrapper,
                             ArrayType(StringType()))

    # TASK 5
    # The guard must test the same path that is written below. It used to
    # test "./santized.parquet" (typo), so it was always true and a second
    # run attempted to write to an already existing path.
    if not os.path.exists("./sanitized.parquet"):
        sanitizedText = context.sql(
            "SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) as body FROM joinedComments"
        )
        sanitizedText.write.parquet("sanitized.parquet")

    # TASK 6A
    sanitizedText = context.read.parquet("sanitized.parquet")
    sanitizedText.createOrReplaceTempView("sanitizedText")
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
    fitted = cv.fit(sanitizedText)
    vector = fitted.transform(sanitizedText)

    # TASK 6B
    # Two binary label columns derived from labeldjt: 1 -> positive,
    # -1 -> negative.
    vector.createOrReplaceTempView("vector")
    pos = context.sql("SELECT *, if(labeldjt=1, 1, 0) AS label FROM vector")
    neg = context.sql("SELECT *, if(labeldjt=-1, 1, 0) AS label FROM vector")

    # TASK 7
    # Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to
    # deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # The grid contains a single regParam value (1.0); replace [1.0] with a
    # list of values to actually search.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator.
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Save the trained models so they can be loaded again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # TASK 8 and TASK 9
    # Create the submissions and comments tables from the parquets:
    if not os.path.exists("sanitizedJoinedData.parquet"):
        submissions = context.read.parquet("submissions.parquet")
        submissions.createOrReplaceTempView("submissions")
        comments = context.read.parquet("comments.parquet")
        comments.createOrReplaceTempView("comments")
        # NOTE(review): the "comments" view was registered before this
        # sample is taken and the SQL below reads the view, so this sample
        # does not affect the join. Kept as-is to preserve behavior.
        comments = comments.sample(False, 0.2, None)
        joinedData = context.sql(
            "SELECT comments.link_id AS id, comments.body, comments.created_utc, submissions.title, comments.author_flair_text, submissions.score AS submission_score, comments.score as comments_score FROM comments JOIN submissions ON REPLACE(comments.link_id, 't3_', '')=submissions.id AND comments.body NOT LIKE '%/s%' AND comments.body NOT LIKE '>%'"
        )

        # Repeating earlier tasks: Tasks 4 and 5
        joinedData.createOrReplaceTempView("joinedData")
        # Re-register temporary function since we are forced to:
        context.registerFunction("sanitize", clean_wrapper,
                                 ArrayType(StringType()))
        print("writing sanitized parquet now")
        sanitizedJoinedData = context.sql(
            "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, sanitize(body) AS body FROM joinedData"
        )
        sanitizedJoinedData.write.parquet("sanitizedJoinedData.parquet")

    sanitizedJoinedData = context.read.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = sanitizedJoinedData.sample(False, 0.2, None)
    # Reuse the CountVectorizer model fitted in TASK 6A. (A second, unused
    # CountVectorizer used to be constructed here and has been removed.)
    newVector = fitted.transform(sanitizedJoinedData)

    seenPosModel = CrossValidatorModel.load("project2/pos.model")
    seenNegModel = CrossValidatorModel.load("project2/neg.model")

    # Score with the positive model and keep only its probability column
    # (renamed) before the negative model adds its own output columns.
    posResult = seenPosModel.transform(newVector)
    posResult = posResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "probability as positive_probability")
    cumResult = seenNegModel.transform(posResult)
    cumResult = cumResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "positive_probability",
                                     "probability as negative_probability")

    # Convert both probability columns to integers through the registered
    # UDFs.
    cumResult.createOrReplaceTempView("cumResult")
    context.registerFunction("positiveFunc", positiveUDF, IntegerType())
    context.registerFunction("negativeFunc", negativeUDF, IntegerType())
    cumResult = context.sql(
        "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, body, features, positiveFunc(positive_probability) AS positive_probability,negativeFunc(negative_probability) AS negative_probability FROM cumResult"
    )
    cumResult.write.parquet("cumResult.parquet")

    # TASK 10
    cumResult = context.read.parquet("cumResult.parquet")
    cumResult.createOrReplaceTempView("cumResult")

    # Per-calendar-date averages of the two integer columns.
    task10_6 = context.sql(
        "SELECT DATE(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created ORDER BY date_created"
    )
    task10_6.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_6.csv")

    # Top 10 posts:
    if not os.path.exists("./task10_top_pos.csv"):
        task10_top_pos = cumResult.groupBy('title')\
            .agg(
                (F.sum('positive_probability') / F.count(F.lit(1))).alias('pct_pos'),
                F.count(F.lit(1)).alias('count')
            )\
            .orderBy(F.desc('pct_pos'), F.desc('count')).limit(10)\
            .select('title', 'pct_pos')
        task10_top_pos.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_pos.csv")

    if not os.path.exists("./task10_top_neg.csv"):
        task10_top_neg = cumResult.groupBy('title')\
            .agg(
                (F.sum('negative_probability') / F.count(F.lit(1))).alias('pct_neg'),
                F.count(F.lit(1)).alias('count')
            )\
            .orderBy(F.desc('pct_neg'), F.desc('count')).limit(10)\
            .select('title', 'pct_neg')
        task10_top_neg.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_neg.csv")

    # 10.1
    # Get the number of records
    totalRows = cumResult.count()
    # Calculate percentages
    task10_1 = context.sql(
        "SELECT SUM(positive_probability)/ {0} AS pos, SUM(negative_probability)/{1} AS neg FROM cumResult"
        .format(totalRows, totalRows))

    # 10.2
    task10_2 = context.sql(
        "SELECT DAYOFWEEK(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created"
    )

    # 10.3
    context.registerFunction("checkStateWrapper", checkState, BooleanType())
    task10_3 = context.sql(
        "SELECT author_flair_text AS state, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult WHERE(checkStateWrapper(author_flair_text)) GROUP BY author_flair_text"
    )

    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )

    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )

    # Write each report as a single-partition CSV with a header row.
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")
def process_i94_cit_res_data(spark: SparkSession,
                             df_right: DataFrame) -> DataFrame:
    """
    Processes mapping file between countries_of_the_world.csv and
    i94_data.parquet

    Reads the i94 country mapping CSV, makes the two "INVALID: ..." country
    names unique by appending their id, derives a join key from the country
    name, and left-joins it against ``df_right`` on ``df_right.country``.

    :param spark: Active Spark session used to read the CSV.
    :param df_right: DataFrame with a ``country`` column to join against.
    :return: DataFrame with columns ``country_id``, ``country`` and
        ``country_fk`` (the matching ``df_right.country``; null when the
        left join finds no match).
    """
    # Filename of input file
    filename = 's3://data-eng-capstone-cf/staging/i94_cit_res_data.csv'

    # Read into a spark dataframe
    df = spark.read.csv(filename, header=True)

    # Cast country_id as an IntegerType()
    df = df.withColumn('country_id', df.country_id.cast(IntegerType()))

    # Make country unique, by appending '(<country_id>)' to string name
    # Only do when country equals INVALID: STATELESS or INVALID: UNITED STATES
    df = df.withColumn(
        'country',
        F.when(df.country.isin('INVALID: STATELESS', 'INVALID: UNITED STATES'),
               F.concat(df.country, F.lit(' ('), df.country_id,
                        F.lit(')'))).otherwise(df.country))

    # Add foreign key column so can join to df_cow
    df = df.withColumn('country_join', F.initcap('country'))

    # Manual adjustments: i94 country name -> name to join on. Names that
    # the initcap() default above does not turn into the join target are
    # listed here. The original chain repeated the MICRONESIA entry three
    # times; only the first occurrence could ever match, so one is kept.
    manual_adjustments = {
        'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)': 'Mexico',
        'ANTIGUA-BARBUDA': 'Antigua & Barbuda',
        'BAHAMAS': 'Bahamas, The',
        'BOSNIA-HERZEGOVINA': 'Bosnia & Herzegovina',
        'BRITISH VIRGIN ISLANDS': 'British Virgin Is.',
        'CENTRAL AFRICAN REPUBLIC': 'Central African Rep.',
        'GAMBIA': 'Gambia, The',
        'GUINEA-BISSAU': 'Guinea-Bissau',
        'MAYOTTE (AFRICA - FRENCH)': 'Mayotte',
        'MICRONESIA, FED. STATES OF': 'Micronesia, Fed. St.',
        'NORTH KOREA': 'Korea, North',
        'SOUTH KOREA': 'Korea, South',
        'ST. HELENA': 'Saint Helena',
        'ST. KITTS-NEVIS': 'Saint Kitts & Nevis',
        'ST. LUCIA': 'Saint Lucia',
        'ST. PIERRE AND MIQUELON': 'St Pierre & Miquelon',
        'ST. VINCENT-GRENADINES': 'Saint Vincent and the Grenadines',
        'TRINIDAD AND TOBAGO': 'Trinidad & Tobago',
        'TURKS AND CAICOS ISLANDS': 'Turks & Caicos Is',
        'WALLIS AND FUTUNA ISLANDS': 'Wallis and Futuna',
        'CHINA, PRC': 'China',
    }

    # Fold the table into one when(...).when(...)... expression; rows that
    # match no entry keep the initcap() value computed above.
    adjusted = None
    for i94_name, join_name in manual_adjustments.items():
        matches = df.country == i94_name
        if adjusted is None:
            adjusted = F.when(matches, join_name)
        else:
            adjusted = adjusted.when(matches, join_name)
    df = df.withColumn('country_join', adjusted.otherwise(df.country_join))

    # Define country_fk via left outer join
    df = df.join(df_right, df.country_join == df_right.country, how='left') \
        .select('country_id', df.country, df_right.country.alias('country_fk'))

    # Check schema and count (the count result itself is not used here)
    df.printSchema()
    df.count()

    # Return transformed dataframe
    return df
# COMMAND ----------

# MAGIC %md
# MAGIC #### 2. Define schema for source data
# MAGIC Different years have different schemas - fields added/removed

# COMMAND ----------

#Schema for data based on year and month

#2017
# Built from (column name, data type) pairs; every field is nullable.
yellowTripSchema2017H1 = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("vendor_id", StringType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("passenger_count", IntegerType()),
        ("trip_distance", DoubleType()),
        ("rate_code_id", IntegerType()),
        ("store_and_fwd_flag", StringType()),
        ("pickup_location_id", IntegerType()),
        ("dropoff_location_id", IntegerType()),
        ("payment_type", StringType()),
        ("fare_amount", DoubleType()),
        ("extra", DoubleType()),
        ("mta_tax", DoubleType()),
        ("tip_amount", DoubleType()),
        ("tolls_amount", DoubleType()),
        ("improvement_surcharge", DoubleType()),
        ("total_amount", DoubleType()),
    ]
])
def test_udf_with_order_by_and_limit(self):
    """An identity UDF selected after orderBy and followed by limit(1)
    yields the single row (id=0, copy=0)."""
    identity = udf(lambda value: value, IntegerType())
    ordered = self.spark.range(10).orderBy("id")
    copied = identity(ordered.id).alias("copy")
    first_row = ordered.select(ordered.id, copied).limit(1)
    expected = [Row(id=0, copy=0)]
    self.assertEqual(first_row.collect(), expected)
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type.

    :param at: The pyarrow data type to convert.
    :param prefer_timestamp_ntz: When True, a timestamp type without a time
        zone maps to ``TimestampNTZType`` instead of ``TimestampType``. The
        flag is forwarded to every nested conversion (list values, map keys
        and items, struct fields, dictionary values).
    :return: The corresponding Spark data type.
    :raises TypeError: For unsupported Arrow types: timestamps directly
        inside a list or map, structs nested directly inside a struct, map
        types with pyarrow older than 2.0.0, and any type not handled below.
    """
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
    elif types.is_map(at):
        # Only the major version matters for the "< 2.0.0" check. Comparing
        # it directly avoids distutils.version.LooseVersion, which is no
        # longer available from Python 3.12 on.
        if int(pa.__version__.split(".")[0]) < 2:
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(
            from_arrow_type(at.key_type, prefer_timestamp_ntz),
            from_arrow_type(at.item_type, prefer_timestamp_ntz),
        )
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        spark_type = StructType(
            [
                StructField(
                    field.name,
                    from_arrow_type(field.type, prefer_timestamp_ntz),
                    nullable=field.nullable,
                )
                for field in at
            ]
        )
    elif types.is_dictionary(at):
        # A dictionary-encoded type converts to the type of its values.
        spark_type = from_arrow_type(at.value_type, prefer_timestamp_ntz)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
# Check for null value df.where(df.CustomerID.isNull()).count() df.where(df.Gender.isNull()).count() df.where(df.Age.isNull()).count() df.where(df.AI.isNull()).count() df.where(df.SS.isNull()).count() # Change Male and Female to integer values from pyspark.sql.functions import * newDf = df.withColumn('Gender', regexp_replace('Gender', 'Male', '1')) upDf = newDf.withColumn('Gender', regexp_replace('Gender', 'Female', '2')) # Change Gender type to string from pyspark.sql.types import IntegerType upDf = upDf.withColumn("Gender", upDf["Gender"].cast(IntegerType())) upDf.cache() upDf.printSchema() from pyspark.ml.feature import VectorAssembler # set up data for ML vectorAssembler = VectorAssembler(inputCols = ['Gender', 'Age', 'AI'], outputCol = 'features') ml_df = vectorAssembler.transform(upDf) ml_df = ml_df.select(['features', 'SS']) ml_df.show(3) # split data, train and test splits = ml_df.randomSplit([0.7, 0.3]) train_df = splits[0]
def _transform_data(df):
    """Transform original dataset.

    Reshapes a wide claims table (one row per claim, with a variable number
    of trailing participant columns) into a long table with one row per
    claim participant and the columns ``id_siniestro``, ``id_poliza``,
    ``fecha_apertura``, ``fecha_terminado``, ``nif_o_intm``, ``iban``,
    ``rol``, ``cod_rol``.

    The first 11 columns (``_c0``..``_c10``) are renamed and yield the
    policyholder ('Tomador', code 2) and intermediary ('Intermediario',
    code 3) rows. Every following group of three ``_c`` columns is read as
    (``cod_rol``, ``rol``, ``nif_o_intm``) for one more participant.

    :param df: Input DataFrame.
    :return: Transformed DataFrame.
    """
    # Cast key variables and rename headers
    rename_cols = {
        '_c0': 'id_siniestro',
        '_c1': 'id_poliza',
        '_c2': 'id_producto',
        '_c3': 'fecha_apertura',
        '_c4': 'fecha_terminado',
        '_c5': 'nif_o_intm',
        '_c6': 'nombre',
        '_c7': 'nif_pagador',
        '_c8': 'nombre_pagador',
        '_c9': 'iban',
        '_c10': 'id_mediador'
    }
    for old_name, new_name in rename_cols.items():
        df = df.withColumnRenamed(old_name, new_name)

    # Cast claims id
    df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))

    # Columns still called '_c<N>' after the rename hold the additional
    # participants.
    extra_cols = [name for name in df.columns if name.startswith('_c')]

    # We save the other participants columns in a list
    others = [
        'id_siniestro', 'id_poliza', 'fecha_apertura', 'fecha_terminado',
        'iban'
    ] + extra_cols
    df_others = df.select(*others)

    # We drop others from df
    df = df.select(df.columns[:11])
    df = df.drop(
        *['nombre', 'nif_pagador', 'nombre_pagador', 'id_producto'])

    # We add column cod_rol and rol
    df = df.withColumn('rol', lit('Tomador'))
    df = df.withColumn('cod_rol', lit(2))

    # We take intermediary separately
    intermediary = df.drop('nif_o_intm')
    intermediary = intermediary.withColumnRenamed('id_mediador',
                                                  'nif_o_intm')
    intermediary = intermediary.withColumn('rol', lit('Intermediario'))
    intermediary = intermediary.withColumn('cod_rol', lit(3))
    # union() matches columns by position, so put them in df's order.
    intermediary = intermediary.select([
        'id_siniestro', 'id_poliza', 'fecha_apertura', 'fecha_terminado',
        'nif_o_intm', 'iban', 'rol', 'cod_rol'
    ])

    df = df.drop('id_mediador')

    # We concat the two dataframe
    df = df.union(intermediary)

    # We return with the others and rename ('cod_rol', 'rol', 'nif_o_intm').
    # The extra columns start at index 11, so the loop must run up to
    # 11 + len(extra_cols). The previous bound, len(df_others.columns),
    # equals 5 + len(extra_cols) and therefore skipped the last two
    # participant triples. The "- 2" ignores a trailing incomplete triple
    # instead of selecting a column that does not exist.
    first_extra = 11
    last_start = first_extra + len(extra_cols) - 2
    for start in range(first_extra, last_start, 3):
        code_col = '_c' + str(start)
        role_col = '_c' + str(start + 1)
        nif_col = '_c' + str(start + 2)
        df_others_i = df_others.select([
            'id_siniestro', 'id_poliza', 'fecha_apertura',
            'fecha_terminado', nif_col, 'iban', role_col, code_col
        ])
        df_others_i = df_others_i.withColumnRenamed(code_col, 'cod_rol')
        df_others_i = df_others_i.withColumnRenamed(role_col, 'rol')
        df_others_i = df_others_i.withColumnRenamed(nif_col, 'nif_o_intm')
        # Keep only rows that actually have this participant.
        df_others_i = df_others_i.dropna(thresh=1, subset='nif_o_intm')
        df = df.union(df_others_i)

    df = df.dropDuplicates()

    return df
#connect to the database pw_df = spark.read.jdbc("jdbc:postgresql://timescale.lab11.eecs.umich.edu/powerwatch", "pw_dedupe", properties={"user": config['user'], "password": config['password'],"driver":"org.postgresql.Driver"}) #read the data that we care about pw_df = pw_df.select(pw_df['core_id'],pw_df['time'],pw_df['is_powered'],pw_df['product_id']) pw_df = pw_df.filter("product_id = 7008 OR product_id= 7009") #now we need to created a window function that looks at the leading lagging edge of is powered and detects transitions #then we can filter out all data that is not a transition def detectTransition(value1, value2): if(value1 == value2): return 0 else: return 1 udfDetectTransition = udf(detectTransition, IntegerType()) w = Window.partitionBy("core_id").orderBy(asc("time")) is_powered_lag = lag("is_powered",1).over(w) pw_df = pw_df.withColumn("transition", udfDetectTransition("is_powered",is_powered_lag)) #filter out all transitions pw_df = pw_df.filter("transition != 0") #now count each outage (really restoration) def countOutage(value1, value2, value3): if(value1 == False and value2 == True and value3 == True): return 1 else: return 0 udfCountTransition = udf(countOutage, IntegerType()) is_powered_lead = lead("is_powered",1).over(w)
# Generate top 10 movie recommendations for a subset of users users = ratings.select('userId').distinct().limit(3) userSubsetRecs = model.recommendForUserSubset(users, 10) userSubsetRecs.show(truncate=False) # Generate top 10 user recommendations for a subset of movies movies = ratings.select(als.getItemCol()).distinct().limit(3) movieSubSetRecs = model.recommendForItemSubset(movies, 10) movieSubSetRecs.show(truncate=False) # In[8]: # Generate top 10 movie recommendations for a specified set of users defined by you from pyspark.sql.types import IntegerType from pyspark.sql.functions import col df = spark.createDataFrame([1, 2, 3], IntegerType()) df.show() df = df.select(col("value").alias("userId")) df.show() userSubsetRecs = model.recommendForUserSubset(df, 10) userSubsetRecs.show(truncate=False) # In[9]: # Generate top 10 user recommendations for a specified set of movies defined by you from pyspark.sql.types import IntegerType from pyspark.sql.functions import col df = spark.createDataFrame([1, 2, 3], IntegerType()) df = df.select(col("value").alias("movieId")) df.show()