def test_multiple_udfs(self):
    """
    Test multiple group aggregate pandas UDFs in one agg function.

    The mean, sum and weighted-mean UDFs read from ``self`` are applied in a
    single ``agg`` call per ``id`` group, and the outcome is compared with
    the same grouping aggregated by Spark's ``mean`` and ``sum``.
    """
    from pyspark.sql.functions import sum, mean

    df = self.data
    mean_udf = self.pandas_agg_mean_udf
    sum_udf = self.pandas_agg_sum_udf
    weighted_mean_udf = self.pandas_agg_weighted_mean_udf

    udf_aggs = [mean_udf(df.v), sum_udf(df.v), weighted_mean_udf(df.v, df.w)]
    result1 = df.groupBy('id').agg(*udf_aggs).sort('id').toPandas()

    # The reference for the weighted mean is a plain mean renamed to the
    # column name the UDF produces.
    # NOTE(review): this presumably relies on the weights in self.data being
    # uniform -- confirm against the fixture.
    builtin_aggs = [mean(df.v), sum(df.v), mean(df.v).alias('weighted_mean(v, w)')]
    expected1 = df.groupBy('id').agg(*builtin_aggs).sort('id').toPandas()

    self.assertPandasEqual(expected1, result1)
def test_mixed_udf(self):
    """Nest the mean pandas UDF between other UDFs inside a window expression.

    For each wrapper UDF (``self.python_plus_one`` and
    ``self.pandas_scalar_time_two``) the wrapper is applied both to the input
    of the windowed mean and to its output; the frame built with
    ``self.pandas_agg_mean_udf`` must equal the one built with Spark's
    ``mean``.
    """
    from pyspark.sql.functions import mean

    df = self.data
    w = self.unbounded_window
    wrappers = (self.python_plus_one, self.pandas_scalar_time_two)
    mean_udf = self.pandas_agg_mean_udf

    # Build every (expected, result) pair first, then compare them in order.
    pairs = []
    for wrap in wrappers:
        result = df.withColumn('v2', wrap(mean_udf(wrap(df['v'])).over(w)))
        expected = df.withColumn('v2', wrap(mean(wrap(df['v'])).over(w)))
        pairs.append((expected, result))

    for expected, result in pairs:
        self.assertPandasEqual(expected.toPandas(), result.toPandas())
def test_basic(self):
    """Compare the weighted-mean pandas UDF with ``mean`` in four group-by setups.

    The grouping key is either the ``id`` column or the expression
    ``id + 1``, and the weight argument is either the literal ``1.0`` or the
    ``w`` column. In every case the reference is ``mean(df.v)`` aliased to
    the column name the UDF call produces.
    """
    df = self.data
    weighted_mean_udf = self.pandas_agg_weighted_mean_udf

    # 1) Group by a column, literal weight.
    result1 = (
        df.groupby('id')
        .agg(weighted_mean_udf(df.v, lit(1.0)))
        .sort('id')
    )
    expected1 = (
        df.groupby('id')
        .agg(mean(df.v).alias('weighted_mean(v, 1.0)'))
        .sort('id')
    )
    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    # 2) Group by an expression, literal weight.
    result2 = (
        df.groupby((col('id') + 1))
        .agg(weighted_mean_udf(df.v, lit(1.0)))
        .sort(df.id + 1)
    )
    expected2 = (
        df.groupby((col('id') + 1))
        .agg(mean(df.v).alias('weighted_mean(v, 1.0)'))
        .sort(df.id + 1)
    )
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

    # 3) Group by a column, weight taken from a column.
    result3 = (
        df.groupby('id')
        .agg(weighted_mean_udf(df.v, df.w))
        .sort('id')
    )
    expected3 = (
        df.groupby('id')
        .agg(mean(df.v).alias('weighted_mean(v, w)'))
        .sort('id')
    )
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

    # 4) Group by an aliased expression, weight taken from a column.
    result4 = (
        df.groupby((col('id') + 1).alias('id'))
        .agg(weighted_mean_udf(df.v, df.w))
        .sort('id')
    )
    expected4 = (
        df.groupby((col('id') + 1).alias('id'))
        .agg(mean(df.v).alias('weighted_mean(v, w)'))
        .sort('id')
    )
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
def test_without_partitionBy(self):
    """Run the mean pandas UDF over ``self.unpartitioned_window``.

    Both the ``withColumn`` form and the bare ``select`` form must match the
    same expression written with ``mean``.
    """
    df = self.data
    window = self.unpartitioned_window
    mean_udf = self.pandas_agg_mean_udf

    udf_over_window = mean_udf(df['v']).over(window)
    result1 = df.withColumn('v2', udf_over_window)
    expected1 = df.withColumn('v2', mean(df['v']).over(window))

    result2 = df.select(mean_udf(df['v']).over(window))
    expected2 = df.select(mean(df['v']).over(window))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
def test_simple(self):
    """Run the mean pandas UDF over ``self.unbounded_window``.

    The UDF is checked both as an added ``mean_v`` column and as the only
    selected expression, against the same expressions written with ``mean``.
    """
    df = self.data
    window = self.unbounded_window
    mean_udf = self.pandas_agg_mean_udf

    # Added-column form.
    result1 = df.withColumn('mean_v', mean_udf(df['v']).over(window))
    expected1 = df.withColumn('mean_v', mean(df['v']).over(window))

    # Select-only form.
    result2 = df.select(mean_udf(df['v']).over(window))
    expected2 = df.select(mean(df['v']).over(window))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
def test_shrinking_window(self):
    """Run the mean pandas UDF over the shrinking row and range windows.

    Columns ``m1`` (over ``self.shrinking_row_window``) and ``m2`` (over
    ``self.shrinking_range_window``) must equal the ones computed with
    Spark's ``mean``.
    """
    from pyspark.sql.functions import mean

    df = self.data
    row_window = self.shrinking_row_window
    range_window = self.shrinking_range_window
    mean_udf = self.pandas_agg_mean_udf

    result1 = (
        df.withColumn('m1', mean_udf(df['v']).over(row_window))
        .withColumn('m2', mean_udf(df['v']).over(range_window))
    )
    expected1 = (
        df.withColumn('m1', mean(df['v']).over(row_window))
        .withColumn('m2', mean(df['v']).over(range_window))
    )

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_replace_existing(self):
    """Overwrite the existing ``v`` column with the windowed mean pandas UDF.

    The result must equal overwriting ``v`` with ``mean`` over the same
    window.
    """
    df = self.data
    window = self.unbounded_window

    udf_column = self.pandas_agg_mean_udf(df['v']).over(window)
    result1 = df.withColumn('v', udf_column)
    expected1 = df.withColumn('v', mean(df['v']).over(window))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_alias(self):
    """Alias the output of the grouped mean pandas UDF.

    Grouping by ``id`` and naming the aggregate ``mean_alias`` must give the
    same frame whether the aggregate is the UDF or ``mean``.
    """
    df = self.data
    mean_udf = self.pandas_agg_mean_udf
    grouped = df.groupby('id')

    result1 = grouped.agg(mean_udf(df.v).alias('mean_alias'))
    expected1 = df.groupby('id').agg(mean(df.v).alias('mean_alias'))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_mixed_sql(self):
    """Combine the windowed mean pandas UDF with SQL column arithmetic.

    The input column is doubled before aggregation and ``1`` is added to the
    windowed result; the frame must match the same expression built with
    ``mean``.
    """
    df = self.data
    window = self.unbounded_window
    mean_udf = self.pandas_agg_mean_udf

    doubled = df['v'] * 2
    result1 = df.withColumn('v', mean_udf(doubled).over(window) + 1)
    expected1 = df.withColumn('v', mean(df['v'] * 2).over(window) + 1)

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_alias(self):
    """Alias the output of the grouped mean pandas UDF.

    Grouping by ``id`` and naming the aggregate ``mean_alias`` must give the
    same frame whether the aggregate is the UDF or Spark's ``mean``.
    """
    from pyspark.sql.functions import mean

    df = self.data
    mean_udf = self.pandas_agg_mean_udf

    udf_agg = mean_udf(df.v).alias('mean_alias')
    builtin_agg = mean(df.v).alias('mean_alias')

    result1 = df.groupby('id').agg(udf_agg)
    expected1 = df.groupby('id').agg(builtin_agg)

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def formatItens(firstTime):
    """Format the itinerary data held in the global ``itens`` DataFrame.

    Four columns are rewritten in place through the ``toInt`` / ``toKm``
    converters. When ``firstTime`` is truthy the frame is additionally
    grouped by origin/destination airport, summing ``PASSENGERS`` and
    averaging ``MARKET_MILES_FLOWN`` (stored as ``MARKET_KMS_FLOWN``), and the
    grouped frame is cached.

    # Arguments
        firstTime: truthy to also aggregate per airport pair and cache.
    """
    global itens

    # (column, converter) pairs, applied in this order.
    conversions = (
        ("ORIGIN_AIRPORT_ID", toInt),
        ("DEST_AIRPORT_ID", toInt),
        ("MARKET_MILES_FLOWN", toKm),
        ("PASSENGERS", toInt),
    )
    for column_name, convert in conversions:
        itens = itens.withColumn(column_name, convert(column_name))

    if not firstTime:
        return

    aggArg = (
        sum("PASSENGERS").alias("PASSENGERS"),
        mean("MARKET_MILES_FLOWN").alias("MARKET_KMS_FLOWN"),
    )
    itens = itens.groupBy("ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID").agg(*aggArg).cache()
def test_bounded_mixed(self):
    """Mix bounded and unbounded windows for pandas UDFs in one query.

    ``mean_v`` is computed over the sliding row window, while ``max_v`` and
    ``mean_unbounded_v`` are computed over the unbounded window. The result
    must equal the same columns built with Spark's ``mean`` and ``max``.
    """
    from pyspark.sql.functions import mean, max

    df = self.data
    w1 = self.sliding_row_window
    w2 = self.unbounded_window

    mean_udf = self.pandas_agg_mean_udf
    max_udf = self.pandas_agg_max_udf

    # 'mean_unbounded_v' must use the unbounded window (w2). Using w1 here
    # would only duplicate 'mean_v' and leave the same UDF untested across a
    # bounded and an unbounded frame in one plan.
    result1 = (
        df.withColumn('mean_v', mean_udf(df['v']).over(w1))
        .withColumn('max_v', max_udf(df['v']).over(w2))
        .withColumn('mean_unbounded_v', mean_udf(df['v']).over(w2))
    )

    expected1 = (
        df.withColumn('mean_v', mean(df['v']).over(w1))
        .withColumn('max_v', max(df['v']).over(w2))
        .withColumn('mean_unbounded_v', mean(df['v']).over(w2))
    )

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_multiple_udfs(self):
    """Apply three window pandas UDFs (mean, max, min) in one chain.

    ``mean_v`` and ``max_v`` are derived from column ``v`` and ``min_w`` from
    column ``w``, all over ``self.unbounded_window``; the frame must equal the
    one built with ``mean``, ``max`` and ``min``.
    """
    df = self.data
    window = self.unbounded_window

    result1 = (
        df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(window))
        .withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(window))
        .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(window))
    )
    expected1 = (
        df.withColumn('mean_v', mean(df['v']).over(window))
        .withColumn('max_v', max(df['v']).over(window))
        .withColumn('min_w', min(df['w']).over(window))
    )

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_mixed_udf(self):
    """Nest the mean pandas UDF between other UDFs inside a window expression.

    Each wrapper UDF (``self.python_plus_one`` and
    ``self.pandas_scalar_time_two``) is applied to both the input and the
    output of the windowed mean; frames are compared with
    ``assert_frame_equal``.
    """
    df = self.data
    w = self.unbounded_window
    wrappers = (self.python_plus_one, self.pandas_scalar_time_two)
    mean_udf = self.pandas_agg_mean_udf

    # Build every (expected, result) pair first, then compare them in order.
    pairs = []
    for wrap in wrappers:
        result = df.withColumn('v2', wrap(mean_udf(wrap(df['v'])).over(w)))
        expected = df.withColumn('v2', wrap(mean(wrap(df['v'])).over(w)))
        pairs.append((expected, result))

    for expected, result in pairs:
        assert_frame_equal(expected.toPandas(), result.toPandas())
def handleUIOptions(self, displayColName):
    """Aggregate ``self.entity`` per ``displayColName`` as selected in the options.

    The ``aggregation`` option picks the aggregate: ``SUM``, ``AVG``, ``MIN``,
    ``MAX`` and ``MEAN`` are applied to the ``valueFields`` option, while
    ``COUNT`` and any other value count ``displayColName`` itself.

    Returns the grouped frame converted with ``toPandas()``; the aggregate
    column is named ``agg``.
    """
    # Aggregations that operate on the value field(s).
    value_aggregators = {
        'SUM': F.sum,
        'AVG': F.avg,
        'MIN': F.min,
        'MAX': F.max,
        'MEAN': F.mean,
    }

    agg = self.options.get("aggregation")
    valFields = self.options.get("valueFields")

    aggregator = value_aggregators.get(agg)
    if aggregator is None:
        # 'COUNT' and every unrecognised value fall back to a row count.
        agg_column = F.count(displayColName)
    else:
        agg_column = aggregator(valFields)

    return self.entity.groupBy(displayColName).agg(agg_column.alias("agg")).toPandas()
def transform(self, dataframe):
    """Applies standardization to the specified columns.

    Means and population standard deviations of ``self.columns`` are
    collected to the driver and stored on ``self.means`` / ``self.stddevs``;
    then, for each column in turn, ``self._transform`` is mapped over the
    frame's RDD.

    # Arguments
        dataframe: dataframe. Spark Dataframe.

    # Returns
        The dataframe produced after the last per-column map.
    """
    # Means of the configured columns, gathered in a single select.
    mean_exprs = [mean(name) for name in self.columns]
    mean_row = dataframe.select(mean_exprs).collect()[0]
    self.means = self.clean_mean_keys(mean_row.asDict())

    # Population standard deviations of the same columns.
    stddev_exprs = [stddev_pop(name) for name in self.columns]
    stddev_row = dataframe.select(stddev_exprs).collect()[0]
    self.stddevs = self.clean_stddev_keys(stddev_row.asDict())

    # One pass per column: current_column is set before the map is built so
    # that _transform can read it.
    for column in self.columns:
        self.current_column = column
        dataframe = dataframe.rdd.map(self._transform).toDF()

    return dataframe
def test_invalid_args(self):
    """Check that three invalid aggregations raise ``AnalysisException``.

    Covered cases: a non-aggregate UDF used in ``agg``, a mean UDF nested in
    another mean UDF, and a pandas UDF mixed with ``mean`` in one ``agg``.
    Each error message is matched against a regular expression.
    """
    df = self.data
    plus_one = self.python_plus_one
    mean_udf = self.pandas_agg_mean_udf

    # Non-aggregate UDF where an aggregate is required.
    with QuietTest(self.sc), self.assertRaisesRegexp(
            AnalysisException,
            'nor.*aggregate function'):
        df.groupby(df.id).agg(plus_one(df.v)).collect()

    # Aggregate UDF nested inside another aggregate UDF.
    with QuietTest(self.sc), self.assertRaisesRegexp(
            AnalysisException,
            'aggregate function.*argument.*aggregate function'):
        df.groupby(df.id).agg(mean_udf(mean_udf(df.v))).collect()

    # Pandas UDF aggregate mixed with a regular aggregate function.
    with QuietTest(self.sc), self.assertRaisesRegexp(
            AnalysisException,
            'mixture.*aggregate function.*group aggregate pandas UDF'):
        df.groupby(df.id).agg(mean_udf(df.v), mean(df.v)).collect()
def test_bounded_simple(self):
    """Run count/mean/max/min pandas UDFs over bounded windows.

    ``mean_v`` and ``min_v`` use ``self.sliding_row_window``; ``count_v`` and
    ``max_v`` use ``self.shrinking_range_window``. The mean is taken over
    ``plus_one(v)``. The result must equal the same columns built with
    Spark's built-in aggregates.
    """
    from pyspark.sql.functions import mean, max, min, count

    df = self.data
    row_window = self.sliding_row_window
    range_window = self.shrinking_range_window

    plus_one = self.python_plus_one
    count_udf = self.pandas_agg_count_udf
    mean_udf = self.pandas_agg_mean_udf
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = (
        df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(row_window))
        .withColumn('count_v', count_udf(df['v']).over(range_window))
        .withColumn('max_v', max_udf(df['v']).over(range_window))
        .withColumn('min_v', min_udf(df['v']).over(row_window))
    )
    expected1 = (
        df.withColumn('mean_v', mean(plus_one(df['v'])).over(row_window))
        .withColumn('count_v', count(df['v']).over(range_window))
        .withColumn('max_v', max(df['v']).over(range_window))
        .withColumn('min_v', min(df['v']).over(row_window))
    )

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
#print("sessions %d : uniqueSessions %d" % (sessionCount, uniqueSessions))

# Collapse orders to one row per session: total revenue and order count.
ordersDf = ordersDf.groupby("ssid").agg(
    sum("revenue").alias("revenue"),
    count("*").alias("transactions"),
)

# Derive session attributes from the colon-separated ssid:
# part [2] is turned into startTime via convertUnixToDate, part [1] is siteId.
conversionUdf = udf(convertUnixToDate, StringType())
sessionsDf = sessionsDf.withColumn("unixTime", split(sessionsDf.ssid, ":")[2])
sessionsDf = sessionsDf.withColumn("startTime", conversionUdf("unixTime"))
sessionsDf = sessionsDf.withColumn("siteId", split(sessionsDf.ssid, ":")[1])

# Join sessions with their orders and with the ad assigned in the features.
sessionsAlias = sessionsDf.alias("session")
ordersAlias = ordersDf.alias("order")
featureAlias = featuresDf.select("ssid", "ad").alias("features")
sessionOrders = sessionsAlias.join(ordersAlias, ["ssid"])
joinedData = sessionOrders.join(featureAlias, ["ssid"])

# Session, transaction and revenue totals per start time / site / gr / ad / browser.
groupedData = (
    joinedData
    .groupby("startTime", "siteId", "gr", "ad", "browser",)
    .agg(
        count("*").alias("sessions"),
        sum("transactions").alias("transactions"),
        sum("revenue").alias("revenue"),
    )
)
#groupedData.coalesce(1).write.option("sep","\t").option("header","true").csv("results/target.tsv")

# https://stackoverflow.com/questions/47995188/how-to-calculate-mean-and-standard-deviation-given-a-pyspark-dataframe
# Mean and standard deviation of feature-1 .. feature-4 per (siteId, ad).
meanFeatureExp = [
    mean("feature-{0}".format(n)).alias("feature{0}_mean".format(n))
    for n in range(1, 5)
]
stdDevExp = [
    stddev("feature-{0}".format(n)).alias("feature{0}_std".format(n))
    for n in range(1, 5)
]
siteIdUdf = udf(ssidToSiteId, StringType())
featuresDf = featuresDf.withColumn("siteId", siteIdUdf("ssid"))
adData = featuresDf.groupby("siteId", "ad").agg(*meanFeatureExp, *stdDevExp)

# Append the result as a single tab-separated part file with a header.
(
    adData.coalesce(1)
    .write
    .option("sep", "\t")
    .option("header", "true")
    .mode("append")
    .csv("results/output.txt")
)
from pyspark.sql.functions import year, dayofyear, mean, round
from pyspark.ml.clustering import KMeans
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Columns assembled into the feature vector (wind direction is averaged
# below but is not part of this list).
FEATURES_COL = [
    "DayOfYear",
    "Avg(Temperature)",
    "Avg(Humidity)",
    "Avg(Pressure)",
    "Avg(WindSpeed)",
]

spark = SparkSession.builder.appName("Project").getOrCreate()
df = spark.read.csv("./data.csv", header=True)

# Daily averages for Las Vegas in 2016, ordered by day of year.
spark_df = (
    df.filter((df.city == "Las Vegas") & (year("datetime") == 2016))
    .groupBy(dayofyear("datetime").alias("DayOfYear"))
    .agg(
        mean("temperature").alias("Avg(Temperature)"),
        mean("humidity").alias("Avg(Humidity)"),
        mean("pressure").alias("Avg(Pressure)"),
        mean("wind_speed").alias("Avg(WindSpeed)"),
        mean("wind_direction").alias("Avg(WindDirection)"),
    )
    .orderBy('DayOfYear')
)

# Pack the feature columns into a single vector column.
vecAssembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_kmeans = vecAssembler.transform(spark_df).select('DayOfYear', 'features')

# Min-max scale the feature vector into "scaledFeatures".
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(df_kmeans.select("features"))
scaledData = scalerModel.transform(df_kmeans)
scaledData.show()

k = 5
def _select_tweet_columns(tweets):
    """Project a raw tweet DataFrame onto the columns used downstream.

    When the frame has a ``geo`` column, ``geo.place_id`` is kept as well.
    """
    if has_column(tweets, "geo"):
        return tweets.select("author_id", "created_at", "geo.place_id", "id", "text")
    return tweets.select("author_id", "created_at", "id", "text")


def _merge_with_existing(conn, table_name, latest_spark_df, subset):
    """Merge freshly computed rows with the rows already stored in Redshift.

    If ``table_name`` exists it is read in full, the latest rows are appended
    and duplicates on ``subset`` are dropped keeping the latest; otherwise
    the latest rows are returned as they are. ``table_name`` is always one of
    the hard-coded table names used by ``main``.
    """
    if not redshift_conn.has_table(table_name):
        return latest_spark_df.toPandas()
    existing = pd.read_sql(f""" select * from {table_name};""", conn)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    merged = pd.concat([existing, latest_spark_df.toPandas()])
    return merged.drop_duplicates(subset=subset, keep="last")


def main(username):
    """Build a Twitter user profile for ``username`` and store it in Redshift.

    Steps: fetch all pages of the user's tweets, flatten the user's public
    metrics, compute per-word counts and per-tweet sentiment, then merge each
    result with what is already stored in Redshift (deduplicating on the
    table's key) and write the four tables back.

    Args:
        username: Twitter handle passed to ``create_url``.
    """
    # For verification on the username received and print in console for demo. For actual deployment,
    # can comment away.
    print(f"Received username= {username}")

    # Start the Spark instance
    cnfg = SparkConf().setAppName("TwitterUserProfile").setMaster("local[2]")
    sc = SparkContext(conf=cnfg)
    spark = SparkSession(sc)

    # Initialise the first page of tweets & user (1 page consist of 10 entries)
    url = create_url(target=username)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)

    # Parse the JSON response returned by Twitter, keeping geo.place_id when
    # the response carries geolocation.
    tweet_df = _select_tweet_columns(spark.createDataFrame(json_response['data']))

    # Extracting the user details
    user_df = spark.createDataFrame(json_response['includes']['users'])

    # flatten the public_metrics
    cols = [
        F.col("public_metrics").getItem(metric).alias(metric)
        for metric in ("following_count", "tweet_count", "listed_count", "followers_count")
    ]
    public_metrics = user_df.select(cols)
    user_df = user_df.drop('public_metrics')

    # Merge user_df with public_metrics
    user_df = with_column_index(user_df)
    public_metrics = with_column_index(public_metrics)
    user_df = user_df.join(
        public_metrics,
        user_df.ColumnIndex == public_metrics.ColumnIndex,
        'inner').drop("ColumnIndex")

    # If there are more tweets (next page / next token), append it to tweet_df.
    # user_df is just for a single user, so no need to append. Info will be the same.
    next_token = json_response['meta'].get('next_token')
    while next_token is not None:
        url = create_url(username, next_token)
        json_response = connect_to_endpoint(url, headers)
        new_tweets = _select_tweet_columns(spark.createDataFrame(json_response['data']))

        # to make sure all have the same number of columns
        for column in tweet_df.columns:
            if column not in new_tweets.columns:
                new_tweets = new_tweets.withColumn(column, F.lit(None))
        for column in new_tweets.columns:
            if column not in tweet_df.columns:
                tweet_df = tweet_df.withColumn(column, F.lit(None))

        # union() matches columns by position, so align the new page to the
        # accumulated frame's column order. Reselecting a fixed column list
        # based on the first page broke the union when only a later page
        # carried geolocation.
        new_tweets = new_tweets.select(*tweet_df.columns)
        tweet_df = tweet_df.union(new_tweets)

        next_token = json_response['meta'].get('next_token')

    # Show the df. Can comment away in actual production.
    tweet_df.show(truncate=False)
    user_df.show(truncate=False)

    # Extract geolocation information within the tweets. Currently not in use.
    # Checked on the accumulated columns so pages after the first count too.
    if "place_id" in tweet_df.columns:
        location_df = tweet_df.select("author_id", "id", "place_id").dropna()
        location_df.show(truncate=False)

    # WORD FREQUENCY - to be made into word cloud in Tableau or other visualisation software.
    tweet_only = tweet_df.select("author_id", "text")

    # Remove punctuation, convert to lower case
    df_clean = tweet_only.select(
        "author_id",
        (lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text')))

    # Tokenize text
    tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select(
        'author_id', 'words_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'author_id', 'words_clean')

    # Keep only words of 3 to 13 characters (inclusive)
    filter_length_udf = udf(lambda row: [x for x in row if 3 <= len(x) <= 13],
                            ArrayType(StringType()))
    df_final_words = df_words_no_stopw.withColumn(
        'words', filter_length_udf(col('words_clean')))

    # Printing the word list. Can comment away in actual deployment.
    df_final_words.show(truncate=False)

    word_count = df_final_words.select('author_id', F.explode('words').alias('word')).\
        groupBy('author_id', 'word').\
        count().\
        sort('count', ascending=False)

    # Printing the word list and count. Can comment away in actual deployment.
    word_count.show()

    # SENTIMENT ANALYSIS. Sentiment is in the range of (-1, 1).
    sentiment = udf(lambda x: TextBlob(x).sentiment[0])
    tweet_sentiment = tweet_df.withColumn(
        "sentiment_score", sentiment(tweet_df["text"]).cast("double"))

    classify_sentiment_udf = udf(classify_sentiment)
    tweet_sentiment = tweet_sentiment.withColumn(
        "sentiment", classify_sentiment_udf(tweet_sentiment["sentiment_score"]))

    tweet_sentiment = tweet_sentiment.select('author_id', 'created_at', 'id', 'text',
                                             'sentiment_score', 'sentiment')

    # Can comment away the show statement. Left here to display the progress in console for demo.
    tweet_sentiment.show()

    sentiment_count = tweet_sentiment.groupBy('author_id', 'sentiment').agg(
        F.mean('sentiment_score'),
        F.count('sentiment')).toDF("author_id", "sentiment", "avg_sentiment_score", "count")

    # Can comment away the show statement. Left here to display the progress in console for demo.
    sentiment_count.show()

    # Read in existing data from Amazon RedShift DB. If user already exists, need to merge and
    # deduplicate, then write data back.
    with redshift_conn.connect() as conn, conn.begin():
        # Users: the unique key is id, which is the author_id (Twitter user id).
        user = _merge_with_existing(conn, "user_data", user_df, "id")

        # Sentiment counts: the pair (author_id, sentiment) is used for deduplication.
        senti_df = _merge_with_existing(
            conn, "sentiment_count", sentiment_count, ["author_id", "sentiment"])

        # Word counts: the pair (author_id, word) is used for deduplication.
        word_df = _merge_with_existing(
            conn, "word_count", word_count, ["author_id", "word"])

        # Tweets: deduplicated on the tweet id. All unique tweets are kept, so a tweet
        # that was captured earlier stays in Redshift even if the user later deleted it.
        tweet_db = _merge_with_existing(conn, "tweet_sentiment", tweet_sentiment, "id")

        # Update the data to Redshift.
        user.to_sql('user_data', redshift_conn, index=False, if_exists='replace')
        word_df.to_sql('word_count', redshift_conn, index=False, if_exists='replace')
        senti_df.to_sql('sentiment_count', redshift_conn, index=False, if_exists='replace')
        tweet_db.to_sql('tweet_sentiment', redshift_conn, index=False, if_exists='replace',
                        dtype={
                            'author_id': sqlalchemy.types.VARCHAR(length=255),
                            'created_at': sqlalchemy.types.VARCHAR(length=255),
                            'id': sqlalchemy.types.VARCHAR(length=255),
                            'text': sqlalchemy.types.VARCHAR(length=5000),
                            'sentiment_score': sqlalchemy.types.Float(precision=3, asdecimal=True),
                            'sentiment': sqlalchemy.types.VARCHAR(length=255),
                        })
        # Location information in tweet. Currently not in use.
        # location.to_sql('location_data', redshift_conn, index=False, if_exists='replace')

    # Can comment away print statement for actual deployment. Left here so that status will be
    # printed in console for demo purpose.
    print("Redshift DB updated successfully.")
# Load the current month's DPI partition as CSV with header and inferred schema.
df = spark.read.load(
    "gs://vf-polimi-batch-data/dpi/year=%d/month=%d" % (now.year, now.month),
    format='com.databricks.spark.csv',
    header='true',
    inferSchema='true',
)

# number, signature, usage, timestamp
df = df.withColumn('timestamp', f.to_timestamp('timestamp', 'dd/MM/yyyy-HH:mm:ss'))
df = df.withColumn('hour', f.hour('timestamp'))

# Total usage per (number, signature, day, hour), then the average of those
# totals per (number, signature, hour).
df = (
    df.groupBy('number', 'signature', 'day', 'hour')
    .sum('usage')
    .groupBy('number', 'signature', 'hour')
    .agg(f.mean('sum(usage)'))
    .withColumnRenamed('avg(sum(usage))', 'average_usage')
)

# need to create a dataset first "bq mk vf_polimi_demo_dataset"
print('Computing kpis and writing output to BigQuery')
(
    df.write.format('bigquery')
    .option('table', 'vf_polimi_demo_dataset.batch_kpi%d%d' % (now.year, now.month))
    .option("temporaryGcsBucket", "vf-polimi-batch-data")
    .mode('overwrite')
    .save()
)
print('Finished')

# alternative solution to write output on GCS partitioned by date
#df.write.partitionBy('hour').option('header', 'true').mode('overwrite').csv('gs://vf-polimi-batch-data/dpi-kpi/year=%d/month=%d' % (now.year, now.month))
# A slightly different way to generate the two random columns
df = (
    sqlContext.range(0, 10)
    .withColumn('uniform', rand(seed=10))
    .withColumn('normal', randn(seed=27))
)

# Summary statistics for every column.
#df.describe().show()
display(df.describe())

# COMMAND ----------

# Summary statistics restricted to the two random columns.
#df.describe('uniform', 'normal').show()
display(df.describe('uniform', 'normal'))

# COMMAND ----------

from pyspark.sql.functions import mean, min, max

# Individual aggregates can also be selected directly.
#df.select([mean('uniform'), min('uniform'), max('uniform')]).show()
display(df.select([mean('uniform'), min('uniform'), max('uniform')]))

# COMMAND ----------

# MAGIC %md ### Sample covariance and correlation
# MAGIC
# MAGIC Covariance is a measure of how two variables change with respect to each other. A positive number would mean that there is a tendency that as one variable increases, the other increases as well. A negative number would mean that as one variable increases, the other variable has a tendency to decrease. The sample covariance of two columns of a DataFrame can be calculated as follows:

# COMMAND ----------

from pyspark.sql.functions import rand

df = (
    sqlContext.range(0, 10)
    .withColumn('rand1', rand(seed=10))
    .withColumn('rand2', rand(seed=27))
)

# COMMAND ----------
'Total_Individuals_in_Shelter', 'Single_Adult_Men_in_Shelter', 'Single_Adult_Women_in_Shelter', 'Total_Single_Adults_in_Shelter', 'Families_with_Children_in_Shelter', 'Adults_in_Families_with_Children_in_Shelter', 'Children_in_Families_with_Children_in_Shelter', "Total_Individuals_in_Families_with_Children_in_Shelter_", 'Adult_Families_in_Shelter', 'Individuals_in_Adult_Families_in_Shelter', 'case_count', 'cum_case_count', 'hosp_count', 'cum_hosp_count', 'death_count', 'death_count_prob', 'cum_death_count') # Clean transformed datasets cols = ['Total_Adults_in_Shelter', 'Total_Children_in_Shelter', 'Total_Individuals_in_Shelter'] cvd_dhs = clean_na(cvd_dhs, cols) # Compute rolling averages of homeless sheltler census counts to replace null values window2 = Window.partitionBy().orderBy('date').rowsBetween(Window.currentRow - 8, Window.currentRow - 1) cvd_dhs = cvd_dhs.withColumn('total_adults_rolling_avg', functions.mean(cvd_dhs['Total_Adults_in_Shelter']).over(window2)) cvd_dhs = cvd_dhs.withColumn('total_children_rolling_avg', functions.mean(cvd_dhs['Total_Children_in_Shelter']).over(window2)) cvd_dhs = cvd_dhs.withColumn('total_individuals_rolling_avg', functions.mean(cvd_dhs['Total_Individuals_in_Shelter']).over(window2)) # Computing p-scores for Adults and Children in homeless shelters cvd_dhs.createOrReplaceTempView('cvd_dhs') result = spark.sql("SELECT CD.date, CD.case_count, CD.cum_case_count, CD.hosp_count, " "CD.cum_hosp_count, CD.death_count, CD.death_count_prob, CD.cum_death_count, " "AVG(CD3.adults) as adults_avg, AVG(CD3.children) as children_avg, AVG(CD3.total) as total_avg, " "COALESCE(CD.Total_Adults_in_Shelter, CD.total_adults_rolling_avg) as adults_cvd, " "COALESCE(CD.Total_Children_in_Shelter, CD.total_children_rolling_avg) as children_cvd, " "COALESCE(CD.Total_Individuals_in_Shelter, CD.total_individuals_rolling_avg) as total_cvd, " "((COALESCE(CD.Total_Adults_in_Shelter, CD.total_adults_rolling_avg) - " "AVG(CD3.adults)) / AVG(CD3.adults)) as 
adults_pscore, "
# Part 2 Task 3

# Import modules
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create a spark session
spark = SparkSession.builder.appName("AirBnB").getOrCreate()

# Dataset path
path = 'C:/Users/Ashish/Desktop/Truata/airbnbdata.parquet'

# Output directory path
outputDir = 'C:/Users/Ashish/Desktop/Truata/out/'

# Load parquet file into dataframe
airbnb_Data = spark.read.parquet(path)

# Average number of bathrooms and bedrooms over listings with
# price > 5000 and review_scores_value == 10
bedBathData = (
    airbnb_Data
    .filter((airbnb_Data['price'] > 5000) & (airbnb_Data['review_scores_value'] == 10))
    .select(["bathrooms", "bedrooms"])
    .agg(F.mean('bathrooms'), F.mean('bedrooms'))
)

# Rename the generated aggregate columns
renamedBedBathData = (
    bedBathData
    .withColumnRenamed('avg(bathrooms)', 'avg_bathrooms')
    .withColumnRenamed('avg(bedrooms)', 'avg_bedrooms')
)

# Save as a CSV file without the pandas index
renamedBedBathData.toPandas().to_csv(outputDir+'out_2_3.csv', index=False)
def describe_float_1d(df, column, current_result, nrows):
    """Compute descriptive statistics and histograms for one numeric column.

    Args:
        df: Spark DataFrame holding ``column``.
        column: name of the column to describe.
        current_result: mapping that provides the non-null ``"count"`` of the
            column (used as denominator for the custom moments and the MAD).
        nrows: total number of rows, used for the share of zeros.

    Returns:
        A pandas Series named ``column`` with mean, min, max, variance, std,
        skewness, kurtosis, sum, the 5/25/50/75/95% approximate percentiles,
        range, iqr, cv, mad, type, n_zeros, p_zeros and two histogram
        renderings (``histogram`` as a base64 PNG data URI and
        ``mini_histogram``).
    """
    # Every aggregate below ignores nulls; the projection is lazy, so sharing
    # it does not change what Spark executes.
    non_null = df.select(column).na.drop()

    if spark_version == "1.6+":
        stats_df = non_null.agg(mean(col(column)).alias("mean"),
                                df_min(col(column)).alias("min"),
                                df_max(col(column)).alias("max"),
                                variance(col(column)).alias("variance"),
                                kurtosis(col(column)).alias("kurtosis"),
                                stddev(col(column)).alias("std"),
                                skewness(col(column)).alias("skewness"),
                                df_sum(col(column)).alias("sum")
                                ).toPandas()
    else:
        # Older Spark: the higher moments come from the *_custom helpers,
        # which need the mean and the count computed beforehand.
        stats_df = non_null.agg(mean(col(column)).alias("mean"),
                                df_min(col(column)).alias("min"),
                                df_max(col(column)).alias("max"),
                                df_sum(col(column)).alias("sum")
                                ).toPandas()
        # .iloc replaces the .ix indexer, which was removed in pandas 1.0.
        column_mean = stats_df["mean"].iloc[0]
        count = current_result["count"]
        stats_df["variance"] = non_null.agg(
            variance_custom(col(column), column_mean, count)).toPandas().iloc[0, 0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = non_null.agg(
            skewness_custom(col(column), column_mean, count)).toPandas().iloc[0, 0]
        stats_df["kurtosis"] = non_null.agg(
            kurtosis_custom(col(column), column_mean, count)).toPandas().iloc[0, 0]

    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (non_null
                                    .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                .format(col=column, n=x)).toPandas().iloc[:, 0]
                                    )

    # The aggregate frame has a single row; work on it as a Series.
    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    # Mean absolute deviation around the mean.
    stats["mad"] = (non_null
                    .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                    .agg(df_sum(col("delta"))).toPandas().iloc[0, 0] / float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"],
            hist_data["count"],
            width=hist_data["width"],
            facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    #TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)

    stats['mini_histogram'] = mini_histogram(hist_data)

    return stats
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    spark = (
        SparkSession.builder
        .master("local")
        .appName("pyspark homework")
        .getOrCreate()
    )

    # Bank data as CSV with a header row and inferred column types.
    file_path = "hdfs:///dataset/bank-data.csv"
    df = spark.read.csv(path=file_path, header=True, inferSchema=True)

    # Minimum, maximum and mean income per sex.
    income_stats = (F.min("income"), F.max("income"), F.mean("income"))
    df.groupBy("sex").agg(*income_stats).show()

    # Mean income per region, using the dict form of agg.
    df.groupBy("region").agg({"income": "mean"}).show()
def mean(scol):
    """Windowed mean of ``scol`` that is null until enough rows are seen.

    The mean over ``self._window`` is emitted only where the row number
    within that window is at least ``self._min_periods``; elsewhere the value
    is a null literal.
    """
    enough_rows = F.row_number().over(self._window) >= self._min_periods
    windowed_mean = F.mean(scol).over(self._window)
    return F.when(enough_rows, windowed_mean).otherwise(F.lit(None))
def fn(col):
    """Return the mean of ``col``, windowed when ``kwargs`` carries a window.

    ``kwargs`` comes from the enclosing scope; when it has a ``'window'``
    entry the mean is evaluated over that window, otherwise it is a plain
    aggregate.
    """
    if 'window' not in kwargs:
        return F.mean(col)
    return F.mean(col).over(kwargs['window'])
# resulting column:
(
    flights
    .agg(countDistinct("carrier").alias("num_carriers"))
    .show()
)

# `groupBy()` groups data by the specified columns, so
# aggregations can be computed by group:
from pyspark.sql.functions import mean

(
    flights
    .groupBy("origin")
    .agg(
        count("*").alias("num_departures"),
        mean("dep_delay").alias("avg_dep_delay"),
    )
    .show()
)

# You can chain together multiple DataFrame methods:
(
    flights
    .filter(col("dest") == lit("BOS"))
    .groupBy("origin")
    .agg(
        count("*").alias("num_departures"),
        mean("dep_delay").alias("avg_dep_delay"),
    )
    .orderBy("avg_dep_delay")
    .show()
)
'float'), 2).alias('Low'), format_number(result_desc['Close'].cast( 'float'), 2).alias('Close'), result_desc['Volume'].cast('int').alias('Volume') ).show() hv_ratio = walmartdf.withColumn( "HV ratio", walmartdf['High'] / walmartdf['Volume']) hv_ratio.select('HV Ratio').show() # finding highest value date walmartdf.orderBy(walmartdf['High'].desc()).head(1)[0][0] walmartdf.agg(mean(walmartdf['Close'])).show() walmartdf.select(max(walmartdf['Volume']), min('Volume')).show() walmartdf.filter(walmartdf['Close'] < 60).count() (walmartdf.filter(walmartdf['High'] > 80).count( ) / walmartdf.agg(count(walmartdf['Date'])).head(1)[0][0]) * 100 newdf = walmartdf.withColumn("year", year(walmartdf['Date'])) newdf.groupby("year").max().select('year', 'max(High)').show() newdf2 = walmartdf.withColumn("month", month(walmartdf['Date'])) newdf2.groupBy("month").mean().select(
# immigration: both frames are restricted to the requested monthYear
us_immigrant = (
    spark.read.parquet('{}immigrant/'.format(processed_data_path))
    .filter(F.col('monthYear') == F.lit(monthYear))
)
us_immigration = (
    spark.read.parquet('{}immigration/'.format(processed_data_path))
    .filter(F.col('monthYear') == F.lit(monthYear))
)

# demographics: city-level figures rolled up to one row per state
us_demographics = (
    spark.read.parquet('{}demographics/'.format(processed_data_path))
    .select("median_age", "city_id", "total_population", "foreign_born")
    .join(city.select("state_code", "city_id"), "city_id")
    .drop('city_id')
    .groupBy("state_code")
    .agg(
        F.mean("median_age").alias('median_age'),
        F.sum("total_population").alias("total_population"),
        F.sum("foreign_born").alias("foreign_born"),
    )
)

# process analytics immigration: enrich each immigrant record with the
# origin country name, the state lookup and that state's demographics
analytics_immigration = (
    us_immigrant
    .select('cicid', 'from_country_code', 'age', 'occupation', 'gender', 'monthYear')
    .join(country_code, us_immigrant.from_country_code == country_code.code, 'left')
    .drop('from_country_code', 'code')
    .withColumnRenamed('country', 'from_country')
    .join(us_immigration.select('cicid', 'state_code'), 'cicid', 'left')
    .join(us_state_code, us_immigration.state_code == us_state_code.code, 'left')
    .drop('code')
    .join(us_demographics, 'state_code')
    .drop('state_code')
)
# count = sum of daily counts feat + 'count' + dd: f.sum(f.col(feat + 'count_0d')).over(window), # A few more complicated examples: # mean = weighted mean of daily means feat + 'count' + dd: f.sum(f.col(feat + 'mean_0d') * f.col(feat + 'count_0d')).over(window) / f.sum(f.col(feat + 'count_0d')).over(window), # stddev = sqrt(weighted mean of daily variances) feat + 'stddev' + dd: f.sqrt( f.mean(f.col(feat + 'count_0d') * f.col(feat + 'stddev_0d')**2).over(window) / f.sum(f.col(feat + 'count_0d')).over(window)), } # Loop through the dictionary of new columns and add them to the aggregated # dataframe for col_name, col_obj in new_cols.items(): add = SparkWithColumn(name='add_' + col_name, read_key='df_agg', store_key='df_agg', new_col_name=col_name, new_col=col_obj) lookback_chain.add(add) # STEP 5: Save the results
from pyspark.sql.types import *
import time
from pyspark.sql import SparkSession
import json
import pyspark.sql.functions as f

# Local session using every core, with enlarged driver memory limits.
spark = (SparkSession.builder
         .appName("Spark Benchmarking")
         .master("local[*]")
         .config("spark.driver.memory", "8g")
         .config("spark.driver.maxResultSize", "4g")
         .getOrCreate())

# Every CSV file under ./data/KB, with a header row. No schema is inferred,
# so all columns are read as strings.
big_df = (spark.read.format("csv")
          .option("header", "true")
          .option("delimiter", ",")
          .load("./data/KB/*.csv"))

# One row per key holding the mean of "value".
small_df = big_df.groupby("key").agg(f.mean(f.col("value")))

# `DataFrame.write` is a property returning a DataFrameWriter, so calling it
# as `small_df.write()` raises TypeError. Force the aggregation to execute
# through the built-in "noop" sink instead, which runs the full job and
# discards the rows.
# NOTE(review): the "noop" format needs Spark 3.0+; swap in a real sink
# (e.g. `.parquet(path)`) if the aggregate is meant to be persisted.
small_df.write.format("noop").mode("overwrite").save()

# NOTE(review): both sides of this condition derive from the same `key`
# column of big_df; confirm Spark resolves it as intended.
joined = big_df.join(small_df, small_df.key == big_df.key, how="left")
movies = movies.withColumn("year", get_year_udf(movies.title))
movies.show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC As part of the analysis of this dataset, it would be useful to have the average rating for each movie. In the following cell of code, I aggregate over the ratings table to get the average rating for each movie ID.

# COMMAND ----------

from pyspark.sql.types import FloatType
from pyspark.sql.functions import bround
from pyspark.sql.functions import mean

# Average rating per movie, stored as a float and rounded (half-even) to two
# decimals. The cast is applied directly to the aggregate instead of going
# through a temporary column that is then dropped and renamed.
ratings_agg = (
    ratings
    .groupBy("movieId")
    .agg(mean("rating").cast(FloatType()).alias("avg_rating"))
    .select("movieId", bround("avg_rating", 2).alias("avg_rating"))
)
ratings_agg.show()

# COMMAND ----------

# MAGIC %md
# MAGIC Here we evaluate the average rating by year to identify whether there is a trend in the ratings, either decreasing or increasing over the years. Visually, it is not possible to appreciate such a trend, but it was possible to identify some outlier values in the year column.
# MAGIC
# MAGIC To achieve this, it was necessary to join the aggregated ratings table with the movies table, which includes the year as a column.

# COMMAND ----------

joined_movies = movies.join(ratings_agg, "movieId")
(
    joined_movies
    .select("year", 'avg_rating')
    .groupBy("year")
    .mean()
    .orderBy("year")
    .display()
)
#df1.describe("trip_distance").show()
#df1.describe("total_amount").show()

# Query data
# https://towardsdatascience.com/beginners-guide-to-pyspark-bbe3b553b79f

# Compute monthly avg trip_distance & total_amount.
# A plain groupBy('pickup_mon').mean() / .describe() does not work here:
#   AttributeError: 'GroupedData' object has no attribute 'describe'
# so aggregate explicitly instead, see
# https://stackoverflow.com/questions/51632126/pysparkhow-to-calculate-avg-and-count-in-a-single-groupby
for measure in ('trip_distance', 'total_amount'):
    monthly = df1.groupBy('pickup_mon').agg(
        functions.mean(measure),
        functions.count(measure),
    )
    monthly.show()

# Compute stddev next
# Consider map individual variable and compute summary statistics; it may be faster?

# Delete the temporary files (recursive delete through the Hadoop FileSystem
# API reached via the JVM gateway).
input_path = sc._jvm.org.apache.hadoop.fs.Path(input_directory)
hadoop_fs = input_path.getFileSystem(sc._jsc.hadoopConfiguration())
hadoop_fs.delete(input_path, True)

## Back to Google Cloud, Week 7
## Upload this file to Storage's cs512_trip
def get_average(dataset, column):
    """Return the mean of ``column`` computed over all rows of ``dataset``.

    The aggregation runs without grouping and the value is read from the
    first collected row.
    """
    averaged = dataset.agg(mean(dataset[column]).alias('mean'))
    first_row = averaged.collect()[0]
    return first_row['mean']
def readDataFromES():
    """Train the KMeans model from Elasticsearch data and publish the results.

    Reads every document of the 'netflowrepo' index, builds a Spark DataFrame
    with the three feature columns, fits a 3-cluster KMeans model (on
    standard-scaled features when SCALING_FLAG is set, on raw features
    otherwise) and derives per-cluster maxima, overall mean/stddev and
    distance thresholds.

    Nothing is returned; the results are stored in the module globals
    kmeansModel, clusterCenters, sumOfBytes_stats, uniqDstIPs_stats,
    uniqDstPorts_stats and thresholds.
    """
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    results_gen = elasticsearch.helpers.scan(
        es,
        index='netflowrepo',
        doc_type='entry',
        query={"query": {
            "match_all": {}
        }})
    # Materialise the scan generator so the hits can be iterated below.
    results = list(results_gen)
    id_list = []
    sumOfBytes_list = []
    uniqDstIPs_list = []
    uniqDstPorts_list = []
    for row in results:
        id_list.append(row['_id'])
        sumOfBytes_list.append(row['_source']['sumOfBytes'])
        uniqDstIPs_list.append(row['_source']['uniqDstIPs'])
        uniqDstPorts_list.append(row['_source']['uniqDstPorts'])
    # Convert data to numpy arrays.
    # NOTE: np_ID is not read again in this function; the DataFrame gets a
    # freshly generated "id" column further down.
    np_ID = np.array(id_list)
    np_Bytes = np.array(sumOfBytes_list)
    np_DstIPs = np.array(uniqDstIPs_list)
    np_DstPorts = np.array(uniqDstPorts_list)
    # Convert data into Matrix. Each feature is in a column.
    tmp1 = np.concatenate((np_Bytes.reshape((-1, 1)), np_DstIPs.reshape(
        (-1, 1))),
                          axis=1)
    tmp2 = np.concatenate((tmp1, np_DstPorts.reshape((-1, 1))), axis=1)
    mat = sc.parallelize(tmp2.tolist())
    # Convert to Data Frame.
    df = spark.createDataFrame(mat)
    df = df.toDF('sumOfBytes', 'uniqDstIPs', 'uniqDstPorts')  # Add headers.
    if DEBUGMODE:
        df.show()
    # Add unique numeric ID, and place in first column.
    df = df.withColumn("id", monotonically_increasing_id())
    df = df.select("id", FEATURE_COLS[0], FEATURE_COLS[1], FEATURE_COLS[2])
    if DEBUGMODE:
        df.show()
    # Convert all data columns to float.
    for col in df.columns:
        if col in FEATURE_COLS:
            df = df.withColumn(col, df[col].cast('float'))
    if DEBUGMODE:
        df.show()
    # Need to convert this to a vector for Spark's implementation of KMeans.
    vecAssembler = VectorAssembler(inputCols=FEATURE_COLS,
                                   outputCol="features")
    df_kmeans = vecAssembler.transform(df).select(
        'id', 'features')  # Drop other columns.
    if DEBUGMODE:
        df_kmeans.show()
    if SCALING_FLAG:
        # Scale the data.
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures")
        scaler_model = scaler.fit(df_kmeans)
        df_scaled = scaler_model.transform(df_kmeans)
        if DEBUGMODE:
            df_scaled.show()
        # Train the Machine Learning Model.
        k = 3  # silhouette score of 0.799529809602
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
        model = kmeans.fit(df_scaled)
        centers = model.clusterCenters()
        # NOTE(review): this branch uses print() while the unscaled branch
        # uses printDebugMsg() for the same message.
        print("Cluster Centers: ")
        for center in centers:
            print(center)
        # Assign events to clusters.
        predictions = model.transform(df_scaled).select(
            'id', 'scaledFeatures', 'prediction')
        if DEBUGMODE:
            predictions.show()
        # Extract scaledFeatures column back to FEATURE_COLS
        # (presumably `extract` unpacks the vector into the three trailing
        # columns named here; it is defined outside this function).
        predictions = predictions.rdd.map(extract).toDF([
            "id", "prediction", "scaledFeatures", "sumOfBytes", "uniqDstIPs",
            "uniqDstPorts"
        ])
        # Rename scaledFeatures to features.
        predictions = predictions.withColumnRenamed("scaledFeatures",
                                                    "features")
        df_pred = predictions
        # # # # Find optimal choice for k.
        # # # cost = np.zeros(20)
        # # # for k in range(2,20):
        # # #     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
        # # #     model = kmeans.fit(df_scaled.sample(False,0.1, seed=42))
        # # #     cost[k] = model.computeCost(df_scaled)
        # # # printDebugMsg("Cost =")
        # # # for k in range(2, 20):
        # # #     printDebugMsg("{0}: {1}".format(k, cost[k]))
        # # # sys.exit(1)
    else:
        # Train the Machine Learning Model.
        k = 3  # silhouette score of 0.997791174741 with no scaling.
        # Using "features" has a higher silhouette score of 0.997791174741
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        model = kmeans.fit(df_kmeans)
        centers = model.clusterCenters()
        printDebugMsg("Cluster Centers: ")
        for center in centers:
            printDebugMsg(center)
        # Assign events to clusters.
        predictions = model.transform(df_kmeans).select(
            'id', 'features', 'prediction')
        # # # # Find optimal choice for k.
        # # # cost = np.zeros(20)
        # # # for k in range(2,20):
        # # #     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        # # #     model = kmeans.fit(df_kmeans.sample(False,0.1, seed=42))
        # # #     cost[k] = model.computeCost(df_kmeans)
        # # # printDebugMsg("Cost =")
        # # # for k in range(2, 20):
        # # #     printDebugMsg("{0}: {1}".format(k, cost[k]))
        # # # sys.exit(1)
        rows = predictions.collect()
        # Create prediction dataframe.
        df_pred = spark.createDataFrame(rows)
        # Join prediction with original data.
        df_pred = df_pred.join(df, 'id')
        if DEBUGMODE:
            df_pred.show()
    if DEBUGMODE:
        predictions.show()
    printDebugMsg("Prediction counts for each cluster:")
    if DEBUGMODE:
        predictions.groupBy('prediction').count().show()
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    printDebugMsg(
        "Silhouette with squared euclidean distance = {0}".format(silhouette))
    # Get max, stddev, and mean by cluster.
    # Each row_N holds the three feature maxima of cluster N, in the order
    # sumOfBytes, uniqDstIPs, uniqDstPorts.
    row_0 = df_pred.filter(df_pred['prediction'] == 0).groupBy().max(
        'sumOfBytes', 'uniqDstIPs', 'uniqDstPorts').collect()[0]
    row_1 = df_pred.filter(df_pred['prediction'] == 1).groupBy().max(
        'sumOfBytes', 'uniqDstIPs', 'uniqDstPorts').collect()[0]
    row_2 = df_pred.filter(df_pred['prediction'] == 2).groupBy().max(
        'sumOfBytes', 'uniqDstIPs', 'uniqDstPorts').collect()[0]
    sumOfBytes_0_max = row_0[0]
    uniqDstIPs_0_max = row_0[1]
    uniqDstPorts_0_max = row_0[2]
    sumOfBytes_1_max = row_1[0]
    uniqDstIPs_1_max = row_1[1]
    uniqDstPorts_1_max = row_1[2]
    sumOfBytes_2_max = row_2[0]
    uniqDstIPs_2_max = row_2[1]
    uniqDstPorts_2_max = row_2[2]
    printDebugMsg("sumOfBytes_0_max = {0}".format(sumOfBytes_0_max))
    printDebugMsg("uniqDstIPs_0_max = {0}".format(uniqDstIPs_0_max))
    printDebugMsg("uniqDstPorts_0_max = {0}".format(uniqDstPorts_0_max))
    printDebugMsg("sumOfBytes_1_max = {0}".format(sumOfBytes_1_max))
    printDebugMsg("uniqDstIPs_1_max = {0}".format(uniqDstIPs_1_max))
    printDebugMsg("uniqDstPorts_1_max = {0}".format(uniqDstPorts_1_max))
    printDebugMsg("sumOfBytes_2_max = {0}".format(sumOfBytes_2_max))
    printDebugMsg("uniqDstIPs_2_max = {0}".format(uniqDstIPs_2_max))
    printDebugMsg("uniqDstPorts_2_max = {0}".format(uniqDstPorts_2_max))
    # Get original data stddev. This is for scaling the new input.
    sumOfBytes_Orig_stddev = df.select(stddev('sumOfBytes')).collect()[0][0]
    uniqDstIPs_Orig_stddev = df.select(stddev('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_Orig_stddev = df.select(
        stddev('uniqDstPorts')).collect()[0][0]
    printDebugMsg(
        "sumOfBytes_Orig_stddev = {0}".format(sumOfBytes_Orig_stddev))
    printDebugMsg(
        "uniqDstIPs_Orig_stddev = {0}".format(uniqDstIPs_Orig_stddev))
    printDebugMsg(
        "uniqDstPorts_Orig_stddev = {0}".format(uniqDstPorts_Orig_stddev))
    # Get scaled data stddev for All clusters. This is for determining the threshold.
    # (Computed on df_pred, i.e. over all rows regardless of cluster.)
    sumOfBytes_All_stddev = df_pred.select(
        stddev('sumOfBytes')).collect()[0][0]
    uniqDstIPs_All_stddev = df_pred.select(
        stddev('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_All_stddev = df_pred.select(
        stddev('uniqDstPorts')).collect()[0][0]
    printDebugMsg("sumOfBytes_All_stddev = {0}".format(sumOfBytes_All_stddev))
    printDebugMsg("uniqDstIPs_All_stddev = {0}".format(uniqDstIPs_All_stddev))
    printDebugMsg(
        "uniqDstPorts_All_stddev = {0}".format(uniqDstPorts_All_stddev))
    # Set values to scaled data for each cluster for determining threshold.
    # Every cluster shares the same all-cluster stddev.
    sumOfBytes_0_stddev = sumOfBytes_1_stddev = sumOfBytes_2_stddev = sumOfBytes_All_stddev
    uniqDstIPs_0_stddev = uniqDstIPs_1_stddev = uniqDstIPs_2_stddev = uniqDstIPs_All_stddev
    uniqDstPorts_0_stddev = uniqDstPorts_1_stddev = uniqDstPorts_2_stddev = uniqDstPorts_All_stddev
    # Get original data mean. This is for scaling the new input.
    sumOfBytes_Orig_mean = df.select(mean('sumOfBytes')).collect()[0][0]
    uniqDstIPs_Orig_mean = df.select(mean('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_Orig_mean = df.select(mean('uniqDstPorts')).collect()[0][0]
    printDebugMsg("sumOfBytes_Orig_mean = {0}".format(sumOfBytes_Orig_mean))
    printDebugMsg("uniqDstIPs_Orig_mean = {0}".format(uniqDstIPs_Orig_mean))
    printDebugMsg(
        "uniqDstPorts_Orig_mean = {0}".format(uniqDstPorts_Orig_mean))
    # Get scaled data mean for All clusters. This is for determining the threshold.
    sumOfBytes_All_mean = df_pred.select(mean('sumOfBytes')).collect()[0][0]
    uniqDstIPs_All_mean = df_pred.select(mean('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_All_mean = df_pred.select(
        mean('uniqDstPorts')).collect()[0][0]
    printDebugMsg("sumOfBytes_All_mean = {0}".format(sumOfBytes_All_mean))
    printDebugMsg("uniqDstIPs_All_mean = {0}".format(uniqDstIPs_All_mean))
    printDebugMsg("uniqDstPorts_All_mean = {0}".format(uniqDstPorts_All_mean))
    # Set values to scaled data for each cluster for determining threshold.
    # NOTE: these per-cluster *_mean names are assigned but not read again in
    # this function; the thresholds below use only max and stddev.
    sumOfBytes_0_mean = sumOfBytes_1_mean = sumOfBytes_2_mean = sumOfBytes_All_mean
    uniqDstIPs_0_mean = uniqDstIPs_1_mean = uniqDstIPs_2_mean = uniqDstIPs_All_mean
    uniqDstPorts_0_mean = uniqDstPorts_1_mean = uniqDstPorts_2_mean = uniqDstPorts_All_mean
    # Per-feature upper threshold = cluster max + NUM_STDDEV_ABOVE * stddev.
    upperThreshold_0_Bytes = sumOfBytes_0_max + NUM_STDDEV_ABOVE * sumOfBytes_0_stddev
    printDebugMsg(
        "upperThreshold_0_Bytes = {0}".format(upperThreshold_0_Bytes))
    upperThreshold_1_Bytes = sumOfBytes_1_max + NUM_STDDEV_ABOVE * sumOfBytes_1_stddev
    printDebugMsg(
        "upperThreshold_1_Bytes = {0}".format(upperThreshold_1_Bytes))
    upperThreshold_2_Bytes = sumOfBytes_2_max + NUM_STDDEV_ABOVE * sumOfBytes_2_stddev
    printDebugMsg(
        "upperThreshold_2_Bytes = {0}".format(upperThreshold_2_Bytes))
    upperThreshold_0_DstIPs = uniqDstIPs_0_max + NUM_STDDEV_ABOVE * uniqDstIPs_0_stddev
    printDebugMsg(
        "upperThreshold_0_DstIPs = {0}".format(upperThreshold_0_DstIPs))
    upperThreshold_1_DstIPs = uniqDstIPs_1_max + NUM_STDDEV_ABOVE * uniqDstIPs_1_stddev
    printDebugMsg(
        "upperThreshold_1_DstIPs = {0}".format(upperThreshold_1_DstIPs))
    upperThreshold_2_DstIPs = uniqDstIPs_2_max + NUM_STDDEV_ABOVE * uniqDstIPs_2_stddev
    printDebugMsg(
        "upperThreshold_2_DstIPs = {0}".format(upperThreshold_2_DstIPs))
    upperThreshold_0_DstPorts = uniqDstPorts_0_max + NUM_STDDEV_ABOVE * uniqDstPorts_0_stddev
    printDebugMsg(
        "upperThreshold_0_DstPorts = {0}".format(upperThreshold_0_DstPorts))
    upperThreshold_1_DstPorts = uniqDstPorts_1_max + NUM_STDDEV_ABOVE * uniqDstPorts_1_stddev
    printDebugMsg(
        "upperThreshold_1_DstPorts = {0}".format(upperThreshold_1_DstPorts))
    upperThreshold_2_DstPorts = uniqDstPorts_2_max + NUM_STDDEV_ABOVE * uniqDstPorts_2_stddev
    printDebugMsg(
        "upperThreshold_2_DstPorts = {0}".format(upperThreshold_2_DstPorts))
    # Combined upper threshold is pythagorean in 3 dimensions.
    # s^2 = x^2 + y^2 + z^2
    threshold_0 = sqrt(upperThreshold_0_Bytes**2 +
                       upperThreshold_0_DstIPs**2 +
                       upperThreshold_0_DstPorts**2)
    threshold_1 = sqrt(upperThreshold_1_Bytes**2 +
                       upperThreshold_1_DstIPs**2 +
                       upperThreshold_1_DstPorts**2)
    threshold_2 = sqrt(upperThreshold_2_Bytes**2 +
                       upperThreshold_2_DstIPs**2 +
                       upperThreshold_2_DstPorts**2)
    printDebugMsg("threshold_0 = {0}".format(threshold_0))
    printDebugMsg("threshold_1 = {0}".format(threshold_1))
    printDebugMsg("threshold_2 = {0}".format(threshold_2))
    # Combine everything needed to return values.
    # Each tuple is (cluster max, original-data mean, original-data stddev).
    sumOfBytes_0 = (sumOfBytes_0_max, sumOfBytes_Orig_mean,
                    sumOfBytes_Orig_stddev)
    sumOfBytes_1 = (sumOfBytes_1_max, sumOfBytes_Orig_mean,
                    sumOfBytes_Orig_stddev)
    sumOfBytes_2 = (sumOfBytes_2_max, sumOfBytes_Orig_mean,
                    sumOfBytes_Orig_stddev)
    uniqDstIPs_0 = (uniqDstIPs_0_max, uniqDstIPs_Orig_mean,
                    uniqDstIPs_Orig_stddev)
    uniqDstIPs_1 = (uniqDstIPs_1_max, uniqDstIPs_Orig_mean,
                    uniqDstIPs_Orig_stddev)
    uniqDstIPs_2 = (uniqDstIPs_2_max, uniqDstIPs_Orig_mean,
                    uniqDstIPs_Orig_stddev)
    uniqDstPorts_0 = (uniqDstPorts_0_max, uniqDstPorts_Orig_mean,
                      uniqDstPorts_Orig_stddev)
    uniqDstPorts_1 = (uniqDstPorts_1_max, uniqDstPorts_Orig_mean,
                      uniqDstPorts_Orig_stddev)
    uniqDstPorts_2 = (uniqDstPorts_2_max, uniqDstPorts_Orig_mean,
                      uniqDstPorts_Orig_stddev)
    # Update global variables.
    global kmeansModel
    global clusterCenters
    global sumOfBytes_stats
    global uniqDstIPs_stats
    global uniqDstPorts_stats
    global thresholds
    kmeansModel = model
    clusterCenters = centers
    # The stats lists and the thresholds tuple are indexed by cluster number.
    sumOfBytes_stats = [sumOfBytes_0, sumOfBytes_1, sumOfBytes_2]
    uniqDstIPs_stats = [uniqDstIPs_0, uniqDstIPs_1, uniqDstIPs_2]
    uniqDstPorts_stats = [uniqDstPorts_0, uniqDstPorts_1, uniqDstPorts_2]
    thresholds = (threshold_0, threshold_1, threshold_2)
# Parse the JSON messages in `js` into a DataFrame and expose it to SQL as
# the temporary table "iotmsgsTable". Despite its name, iotmsgsRDD holds the
# result of sqlContext.read.json(), not a raw RDD.
iotmsgsRDD = sqlContext.read.json(js)
iotmsgsRDD.registerTempTable("iotmsgsTable")

# Distinct temperatures cast to float, highest first.
print("JSON converted to DataFrame of casted floating point numbers")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
    from iotmsgsTable order by temperature desc").show()

# Same query, summarised with describe().
print("DataFrame showing automated 'describe' summary of floating points")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
    from iotmsgsTable order by temperature desc").describe().show()

# Same query again, this time kept for explicit aggregate functions.
print("DataFrame of selected SQL dataframe functions")
temperatureDF = sqlContext.sql("select distinct cast(payload.data.temperature \
    as float) from iotmsgsTable order by temperature desc")
# presumably mean/min/max are the pyspark.sql.functions aggregates (min and
# max would then shadow the builtins) -- TODO confirm against the imports
functionsDF = temperatureDF.select([mean('temperature'), min('temperature'), \
    max('temperature')])
print(type(functionsDF))
print(functionsDF)
functionsDF.printSchema()
functionsDF.show()

# Collect a List of Rows of data from the DataFrame
print("Extracted List of Rows of selected SQL dataframe function")
functionsList = temperatureDF.select([mean('temperature'), min('temperature'), \
    max('temperature')]).collect()
print(type(functionsList))
print(functionsList)
print()
# Collect max temperature from Row #1 of the DataFrame
# Read Data from HDFS (every column arrives as a string: inferSchema=False)
df = spark.read.csv('/tmp/fajar/bank-class.csv',
                    inferSchema=False,
                    header=True)

# Column rename and type casting
for int_column in ("LIMIT_BAL", "AGE", "DELAY1", "DELAY2"):
    df = df.withColumn(int_column, df[int_column].cast(IntegerType()))

# Change column name 'TARGET' to 'label' to ease modelling purpose
df = df.withColumnRenamed("TARGET", "label")
df = df.withColumn("label", df["label"].cast(IntegerType()))

for double_column in ("BILLING1", "BILLING2", "PAYMENT1", "PAYMENT2"):
    df = df.withColumn(double_column, df[double_column].cast(DoubleType()))

# Impute missing LIMIT_BAL with the column mean, then drop rows that lack any
# of the categorical fields.
LIMIT_BAL_means = df.select(mean("LIMIT_BAL")).collect()[0][0]
df = df.na.fill({"LIMIT_BAL": LIMIT_BAL_means})
df = df.na.drop(subset=("MARITAL", "EDUCATION", "SEX"))

# String to Index Conversion
MARITALindexer = StringIndexer(
    inputCol="MARITAL", outputCol="MARITALindex")
EDUCATIONindexer = StringIndexer(
    inputCol="EDUCATION", outputCol="EDUCATIONindex")
SEXindexer = StringIndexer(
    inputCol="SEX", outputCol="SEXindex")

# Index to Binary Vector Conversion
MARITALencoder = OneHotEncoder(
    inputCol="MARITALindex", outputCol="MARITALvec")
EDUCATIONencoder = OneHotEncoder(
    inputCol="EDUCATIONindex", outputCol="EDUCATIONvec")
SEXencoder = OneHotEncoder(
    inputCol="SEXindex", outputCol="SEXvec")

# Create features vector
#to find the mode of embarked column titanic_df.groupBy("Embarked").count().orderBy("count", ascending=False).show() mode_embarked = titanic_df.groupBy("Embarked").count().orderBy( "count", ascending=False).first()[0] print(mode_embarked) #to fill the nullvales of embarked column with it.s mode # titanic_df=titanic_df.fillna(mode_embarked,subset=['Embarked']) #to count the missing values # for c in titanic_df.columns: # print(c,titanic_df.filter(col(c).isNull()).count()) #to find the mean of age column titanic_df.select(mean('Age')).show() mean_age = titanic_df.select(mean('Age')).first()[0] print(int(mean_age)) #to fill the null values of age column with mean of age # titanic_df=titanic_df.fillna(mean_age,subset=['Age']) #to count the missing values # for c in titanic_df.columns: # print(c,titanic_df.filter(col(c).isNull()).count()) #APRIL 29TH #0r titanic_df = titanic_df.fillna({ "Age": int(mean_age), "Embarked": mode_embarked
def mapper(word):
    """Split one fixed-width record into its 31 consecutive fields.

    The fields are contiguous character ranges covering positions 0-104 of
    ``word``; a record shorter than 105 characters yields shortened or empty
    trailing fields. Returns a tuple of 31 strings.
    """
    # Start offset of every field, followed by the end offset of the last one.
    bounds = (0, 4, 10, 15, 23, 27, 28, 34, 41, 46, 51, 56, 60, 63, 64, 65,
              69, 70, 75, 76, 77, 78, 84, 85, 86, 87, 92, 93, 98, 99, 104,
              105)
    return tuple(word[lo:hi] for lo, hi in zip(bounds, bounds[1:]))


def reducer(x, y):
    """Combine two values with ``+``."""
    combined = x + y
    return combined


if __name__ == "__main__":
    start_time = time.time()

    sc = SparkContext(appName="PySparkClimate")
    # f = open("1980.txt", "w")
    # for fileName in glob.glob("/home/DATA/NOAA_weather/1980/*.gz"):
    lines = sc.textFile(
        "/home/DATA/NOAA_weather/1980/325960-99999-1980.gz", 1)
    # One tuple of fixed-width fields per input line.
    # (a trailing .reduceByKey(reducer) step is disabled)
    counts = lines.flatMap(lambda x: x.splitlines()).map(mapper)
    output = counts.collect()

    # Only the RDD parse/collect phase above is timed.
    end_time = time.time()
    print("TIME OF PROGRAM: ",end_time - start_time)

    spark = SparkSession.builder \
        .master("local") \
        .appName("Word Count") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Column names, in the same order as the fields produced by mapper().
    df = spark.createDataFrame(output, [
        'VARS', 'USAF_CAT_ID', 'NCDC_WBAN_ID', 'OBSV_DATE', 'OBSV_TIME',
        'OBSV_FLAG', 'OBSV_LAT', 'OBSV_LONG', 'RPT_TYPE', 'OBSV_ELEV',
        'STAT_LET_ID', 'OBSV_QC_PROC', 'OBSV_ANG', 'OBSV_QUAL_CODE',
        'OBSV_TYPE', 'OBSV_SPEED', 'OBSV_SPEED_QUAL_CODE', 'OBSV_CEIL_H',
        'OBSV_CEIL_QUAL_CODE', 'OBSV_CEIL_DET_CODE', 'CAVOK_CODE', 'VIS_D',
        'VIS_D_QUAL', 'VIS_VAR', 'VIS_QUAL_VAR_CODE', 'AIR_TMP', 'AIR_TMP_Q',
        'AIR_TMP_DEW', 'AIR_TMP_DEW_QUAL', 'SEA_PRESS', 'SEA_PRESS_QC',
    ])
    # df.describe(['AIR_TMP']).show()

    # Leave out rows whose AIR_TMP equals 9999 before averaging.
    df = df.where(F.col('AIR_TMP') != 9999)
    stats = df.select(F.mean(F.col('AIR_TMP')).alias('mean')).collect()
    print(stats[0]['mean'])
    # f.write(str(stats[0]['mean']) + ",")
    # f.close()
    sc.stop()
def summary(df, datatypes=None):
    """Build a per-column profile of a Spark DataFrame as a pandas DataFrame.

    Args:
        df: the Spark DataFrame to profile.
        datatypes: optional collection used to restrict the profiled columns;
            a column is kept when its data type object, ``str()`` of it, or
            its ``simpleString()`` is contained in ``datatypes``.

    Returns:
        A pandas DataFrame indexed by column name with the columns
        ``datatype``, ``count``, ``approx_distinct``, ``unique_ratio``,
        ``mean``, ``min``, ``max``, ``null``, ``nan`` and ``empty``.
    """
    spark = df.sql_ctx  # kept from the original; not used below

    schema_types = {field.name: field.dataType for field in list(df.schema)}

    # filter datatypes
    if datatypes is not None:
        schema_types = {
            name: dtype
            for name, dtype in schema_types.items()
            if any([
                spelling in datatypes
                for spelling in [dtype, str(dtype), dtype.simpleString()]
            ])
        }

    def column_stat(expressions, label):
        # Run one aggregation over df and pivot it into a single pandas
        # column named `label`, indexed by the expression aliases.
        frame = df.select(expressions).toPandas().T
        frame.columns = [label]
        frame.index.name = 'index'
        return frame

    def by_type(spark_type, when_matching, otherwise):
        # One aliased expression per profiled column: `when_matching(name)`
        # for columns of `spark_type`, the `otherwise()` placeholder for the
        # rest.
        return [(when_matching(name)
                 if isinstance(dtype, (spark_type)) else otherwise()).alias(name)
                for name, dtype in schema_types.items()]

    res = pd.DataFrame.from_dict(schema_types, orient='index')
    res.columns = ['datatype']

    count = df.count()
    res['count'] = count

    res = res.join(column_stat(
        [F.approx_count_distinct(c).alias(c) for c in df.columns],
        'approx_distinct'))
    res['unique_ratio'] = res['approx_distinct'] / count

    # Mean only makes sense for numeric columns; others get a null.
    res = res.join(column_stat(
        by_type(T.NumericType,
                lambda name: F.mean(name),
                lambda: F.min(F.lit(None))),
        'mean'))

    res = res.join(column_stat(
        [F.min(c).alias(c) for c in df.columns], 'min'))
    res = res.join(column_stat(
        [F.max(c).alias(c) for c in df.columns], 'max'))

    # The second argument of F.when is a plain string here, i.e. a non-null
    # literal, so count() counts exactly the rows where the condition holds.
    res = res.join(column_stat(
        [F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns],
        'null'))

    res = res.join(column_stat(
        by_type(T.NumericType,
                lambda name: F.count(F.when(F.isnan(name), name)),
                lambda: F.min(F.lit(0))),
        'nan'))

    res = res.join(column_stat(
        by_type(T.StringType,
                lambda name: F.count(F.when(F.col(name).isin(''), name)),
                lambda: F.min(F.lit(0))),
        'empty'))

    return res
from pyspark import SparkContext
from csv import reader
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql import functions as F

sc = SparkContext()
sqc = SQLContext(sc)

# Tab-separated export with a header row; column types are inferred.
df = (
    sqc.read
    .options(header='true',
             inferschema='true',
             sep='\t',
             encoding='ISO-8859-1')
    .csv('/user/hm74/NYCOpenData/erm2-nwe9.tsv.gz')
)

# Time between creation and closing of a request, in hours.
# NOTE(review): assumes both date columns match this 24-hour pattern; verify
# against the data.
timeFmt = "MM/dd/yyyy HH:mm:ss"
timeDiff = (
    F.unix_timestamp(col('Closed Date'), format=timeFmt)
    - F.unix_timestamp(col('Created Date'), format=timeFmt)
)
df = df.withColumn("ResponseTimeHrs", timeDiff / 3600.0)

# Mean response time per incident zip code.
(
    df.groupBy("Incident Zip")
    .agg(F.mean('ResponseTimeHrs'))
    .show()
)

# Number of Brooklyn complaints of this type (the value is computed but
# neither stored nor printed).
(
    df.where(col('Borough') == 'BROOKLYN')
    .where(col('Complaint Type') == 'Root/Sewer/Sidewalk Condition')
    .count()
)
# or all boroughs
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal
# distribution. select() returns a new DataFrame, so show that result; the
# original discarded it and displayed the unchanged `df` a second time.
df.select("id",
          rand(seed=10).alias("uniform"),
          randn(seed=27).alias("normal")).show()

# 2. Summary and Descriptive Statistics
df = (sqlcontext.range(0, 10)
      .withColumn('uniform', rand(seed=10))
      .withColumn('normal', randn(seed=27)))
df.describe('uniform', 'normal').show()
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well.
# A negative number would mean that as one variable increases,
# the other variable has a tendency to decrease.
df = (sqlcontext.range(0, 10)
      .withColumn('rand1', rand(seed=10))
      .withColumn('rand2', rand(seed=27)))
# cov()/corr() return plain numbers; print them, otherwise a script run
# computes the values and silently drops them.
print(df.stat.cov('rand1', 'rand2'))
print(df.stat.cov('id', 'id'))

# Correlation is a normalized measure of covariance that is easier to understand,
# as it provides quantitative measurements of the statistical dependence between two random variables.
print(df.stat.corr('rand1', 'rand2'))
print(df.stat.corr('id', 'id'))
"""## Cast the String datatype to Integer/Float""" from pyspark.sql.types import IntegerType, FloatType bankProspectsDF2 = bankProspectsDF1.withColumn( "age", bankProspectsDF1["age"].cast(IntegerType())).withColumn( "salary", bankProspectsDF1["salary"].cast(FloatType())) """## Replace Age and Salary with average values of their respective column import mean from sql.fuctions """ from pyspark.sql.functions import mean """### Calculate "mean" value of the age""" mean_age_val = bankProspectsDF2.select(mean(bankProspectsDF2['age'])).collect() mean_age = mean_age_val[0][0] """### Calculate mean salary value""" mean_salary_val = bankProspectsDF2.select(mean( bankProspectsDF2['salary'])).collect() mean_salary = mean_salary_val[0][0] """### Replace missing age with average value""" bankbankProspectsDF3 = bankProspectsDF2.na.fill(mean_age, ["age"]) """### Replace missing age with salary value""" bankbankProspectsDF4 = bankbankProspectsDF3.na.fill(mean_salary, ["salary"]) """## Write the transformed file to a new csv file"""
def makeMapping(firstTime):
    # Populate the airport graph from the global `routes` DataFrame.
    #
    # firstTime: when truthy, `routes` is first collapsed to one row per
    # (origin, destination, carrier) and the global is replaced by that
    # aggregate.
    #
    # NOTE(review): the nesting below was inferred from the data flow, and
    # the function may continue past the last line shown here -- confirm
    # against the original file.
    global routes
    # Grouping columns; in the aggregated rows they occupy indices 0-6 in
    # this order, followed by PASSENGERS (7), DEPARTURES_PERFORMED (8) and
    # RAMP_TO_RAMP (9).
    grpString = "ORIGIN_AIRPORT_ID","ORIGIN_CITY_NAME","ORIGIN","DEST_AIRPORT_ID","DEST_CITY_NAME","DEST","UNIQUE_CARRIER_NAME"
    if firstTime:
        routes = routes.groupBy(*grpString).agg(sum("PASSENGERS").alias("PASSENGERS"),sum("DEPARTURES_PERFORMED").alias("DEPARTURES_PERFORMED"),mean("RAMP_TO_RAMP").alias("RAMP_TO_RAMP"))
    for i in routes.collect():
        # Create the origin and destination nodes on first sight.
        if not dictAir.get("Airport{}".format(i[0])):
            initNode(i[0],(i[1],i[2]),i[8])
        if not dictAir.get("Airport{}".format(i[3])):
            initNode(i[3],(i[4],i[5]),0)
        # presumably 9876543.21 is a sentinel for "no ramp-to-ramp time";
        # such rows add no connection -- TODO confirm
        if (i[9]!=9876543.21):
            tripTime =i[9]
            getApt(i[0])['depts'] += i[8]
            sourceCNX = getApt(i[0])['cnx']
            # Connection entry: (destination airport id, trip time, carrier).
            sourceCNX.append((int(i[3]),tripTime,i[6]))
import findspark
# Raw string: in a normal literal the "\s" of this Windows path is an invalid
# escape sequence (a warning today, an error in future Python versions).
findspark.init(r'C:\spark-3.0.0-preview2-bin-hadoop2.7')
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("missingdata").getOrCreate()
df = spark.read.csv("ContainsNull.csv", header=True, inferSchema=True)
df.show()

# Drop
df.na.drop().show()
# thresh=2: keep only the rows that have at least 2 non-null values
df.na.drop(thresh=2).show()
df.na.drop(subset=["Sales"]).show()
df.na.drop(how='any').show()
df.na.drop(how='all').show()

# Fill: a string value only fills string columns, a number only numeric ones
df.na.fill('NEW VALUE').show()
df.na.fill(0).show()
df.na.fill('No Name', subset=['Name']).show()

# Fill with mean
from pyspark.sql.functions import mean

# collect() returns a one-row list; the mean is its only cell.
mean_val = df.select(mean(df['Sales'])).collect()
mean_sales = mean_val[0][0]
df.na.fill(mean_sales, ["Sales"]).show()
if not l: return empty_value counts = defaultdict(int) for value in l: counts[value] += 1 counts = [(v, k) for (k, v) in counts.items()] counts.sort() return counts[-1][1] MAIN_SUMMARY_FIELD_AGGREGATORS = [ F.sum('aborts_content').alias('aborts_content_sum'), F.sum('aborts_gmplugin').alias('aborts_gmplugin_sum'), F.sum('aborts_plugin').alias('aborts_plugin_sum'), # active_addons F.mean('active_addons_count').alias('active_addons_count_mean'), F.first('active_experiment_branch').alias('active_experiment_branch'), F.first('active_experiment_id').alias('active_experiment_id'), # active_theme F.sum(F.expr('active_ticks/(3600.0/5)')).alias('active_hours_sum'), F.first('addon_compatibility_check_enabled').alias( 'addon_compatibility_check_enabled'), F.first('app_build_id').alias('app_build_id'), F.first('app_display_version').alias('app_display_version'), F.first('app_name').alias('app_name'), F.first('app_version').alias('app_version'), # attribution F.first('blocklist_enabled').alias('blocklist_enabled'), F.first('channel').alias('channel'), F.first('city').alias('city'), F.first('country').alias('country'),
# In[39]:

# Average of char_38.
df.agg(avg('char_38').alias('Avrage_of_char38')).show()

# In[40]:

from pyspark.sql.functions import format_number

# In[41]:

# Standard deviation of char_38, kept for formatting in the next cell.
stdd = df.agg(stddev('char_38').alias('std'))

# In[42]:

# Display the standard deviation with two decimals.
stdd.select(format_number('std', 2).alias('std')).show()

# In[99]:

#missing data

# In[46]:

from pyspark.sql.functions import mean

mean_val = df.agg(mean(df['char_38'])).collect()

# In[47]:

# Replace missing char_38 values with the column mean (recomputed here).
df.na.fill(
    df.agg(mean(df['char_38'])).collect()[0][0],
    subset=['char_38'],
).show()

# In[ ]:
    # (tail of a column list whose opening call lies before this excerpt;
    # presumably it builds status_joined_df -- TODO confirm)
    status_stations_df.docksAvailable,
    weather_df.date,
    weather_df.time,
    weather_df.temperature,
    weather_df.humidity,
    weather_df.pressure,
    weather_df.visibility,
    weather_df.precipIntensity,
    weather_df.windSpeed
    ))
status_joined_df.show()

# Global (ungrouped) weather statistics: means for temperature, humidity and
# pressure; standard deviations for those three plus visibility,
# precipitation intensity and wind speed.
stats_df = (status_joined_df
    .agg(
        F.mean(status_joined_df.temperature).alias("avgTemp"),
        F.mean(status_joined_df.humidity).alias("avgHumidity"),
        F.mean(status_joined_df.pressure).alias("avgPressure"),
        F.stddev(status_joined_df.temperature).alias("stddevTemp"),
        F.stddev(status_joined_df.humidity).alias("stddevHumidity"),
        F.stddev(status_joined_df.pressure).alias("stddevPressure"),
        F.stddev(status_joined_df.visibility).alias("stddevVisibility"),
        F.stddev(status_joined_df.precipIntensity).alias("stddevPrecipitation"),
        F.stddev(status_joined_df.windSpeed).alias("stddevWindSpeed")))

# Persist the single-row result, replacing any earlier run, then pull the
# row to the driver.
stats_df.write.mode('overwrite').parquet("hdfs://hadoop:9000/models/weather-stats")
stats = stats_df.collect()[0]
# Python 2 print statement; the one-element tuple keeps the Row from being
# unpacked as several %-format arguments.
print "Statistics: %s" % (stats,)
def get_builtin_aggregator_column(agg, ctx):
    """Return the Spark column expression for a built-in aggregate.

    Args:
        agg: aggregate definition; its "aggregator" entry selects the
            aggregator in ``ctx.aggregators``, "input" supplies the input
            values and "name" becomes the alias of the returned column.
        ctx: context providing ``aggregators`` and ``populate_values``.

    Returns:
        The aggregate column, aliased to ``agg["name"]``.

    Raises:
        CortexException: re-raised after being wrapped with "input" (when it
            comes from populating the input) and with the aggregate name.
        ValueError: if the aggregator name is not a known built-in.
    """
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            values = ctx.populate_values(agg["input"],
                                         aggregator["input"],
                                         preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        # Aggregator name -> builder of the un-aliased column. Lambdas keep
        # the lookup of each F.* function lazy, so only the selected one is
        # touched.
        builders = {
            "approx_count_distinct":
                lambda v: F.approxCountDistinct(v["col"], v.get("rsd")),
            "avg": lambda v: F.avg(v),
            "collect_set_int": lambda v: F.collect_set(v),
            "collect_set_float": lambda v: F.collect_set(v),
            "collect_set_string": lambda v: F.collect_set(v),
            "count": lambda v: F.count(v),
            "count_distinct": lambda v: F.countDistinct(*v),
            "covar_pop": lambda v: F.covar_pop(v["col1"], v["col2"]),
            "covar_samp": lambda v: F.covar_samp(v["col1"], v["col2"]),
            "kurtosis": lambda v: F.kurtosis(v),
            "max_int": lambda v: F.max(v),
            "max_float": lambda v: F.max(v),
            "max_string": lambda v: F.max(v),
            "mean": lambda v: F.mean(v),
            "min_int": lambda v: F.min(v),
            "min_float": lambda v: F.min(v),
            "min_string": lambda v: F.min(v),
            "skewness": lambda v: F.skewness(v),
            "stddev": lambda v: F.stddev(v),
            "stddev_pop": lambda v: F.stddev_pop(v),
            "stddev_samp": lambda v: F.stddev_samp(v),
            "sum_int": lambda v: F.sum(v),
            "sum_float": lambda v: F.sum(v),
            "sum_distinct_int": lambda v: F.sumDistinct(v),
            "sum_distinct_float": lambda v: F.sumDistinct(v),
            "var_pop": lambda v: F.var_pop(v),
            "var_samp": lambda v: F.var_samp(v),
            "variance": lambda v: F.variance(v),
        }

        build = builders.get(aggregator["name"])
        if build is None:
            raise ValueError("missing builtin aggregator")  # unexpected
        return build(values).alias(agg["name"])
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
def describe_float_1d(df, column, current_result, nrows):
    """Compute summary statistics and histograms for one numeric column.

    All statistics except the zero count are computed over the non-null
    values of ``column``. ``spark_version`` and ``bins`` are not parameters;
    they are read from the surrounding scope.

    Args:
        df: Spark DataFrame that contains ``column``.
        column: name of the column to describe.
        current_result: mapping providing ``"count"``; it is the divisor of
            the mean absolute deviation and is passed to the custom moment
            helpers (presumably the non-null count of the column -- verify
            against the caller).
        nrows: divisor used for ``p_zeros``.

    Returns:
        A pandas Series named ``column`` holding: ``mean``, ``min``, ``max``,
        ``variance``, ``kurtosis``, ``std``, ``skewness``, ``sum``, one entry
        per quantile (0.05, 0.25, 0.5, 0.75, 0.95) keyed by
        ``pretty_name(q)``, ``range``, ``iqr``, ``cv``, ``mad``, ``type``
        (always ``"NUM"``), ``n_zeros``, ``p_zeros``, ``histogram`` (a
        base64 PNG data URI) and ``mini_histogram``.
    """
    # Build the null-free projection once; Spark DataFrames are lazy, so this
    # only reuses the plan instead of rebuilding it for every statistic.
    non_null = df.select(column).na.drop()
    value = col(column)

    if spark_version == "1.6+":
        stats_df = non_null.agg(
            mean(value).alias("mean"),
            df_min(value).alias("min"),
            df_max(value).alias("max"),
            variance(value).alias("variance"),
            kurtosis(value).alias("kurtosis"),
            stddev(value).alias("std"),
            skewness(value).alias("skewness"),
            df_sum(value).alias("sum"),
        ).toPandas()
    else:
        # Higher moments are computed with the custom helpers, which take the
        # already-computed mean and the count as extra arguments.
        stats_df = non_null.agg(
            mean(value).alias("mean"),
            df_min(value).alias("min"),
            df_max(value).alias("max"),
            df_sum(value).alias("sum"),
        ).toPandas()
        # `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0;
        # every access here is positional (first row / first column).
        column_mean = stats_df["mean"].iloc[0]
        count = current_result["count"]
        stats_df["variance"] = non_null.agg(
            variance_custom(value, column_mean, count)).toPandas().iloc[0, 0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = non_null.agg(
            skewness_custom(value, column_mean, count)).toPandas().iloc[0, 0]
        stats_df["kurtosis"] = non_null.agg(
            kurtosis_custom(value, column_mean, count)).toPandas().iloc[0, 0]

    # One approximate-percentile query per quantile.
    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = non_null.selectExpr(
            "percentile_approx({col},CAST({n} AS DOUBLE))".format(
                col=column, n=x)).toPandas().iloc[:, 0]

    # Collapse the single-row frame into a Series and derive the rest.
    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    # Mean absolute deviation: sum of |x - mean| divided by the count.
    stats["mad"] = (
        non_null.select(df_abs(value - stats["mean"]).alias("delta"))
        .agg(df_sum(col("delta")))
        .toPandas().iloc[0, 0] / float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(value == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    try:
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9,
                                    bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
    finally:
        # Close the figure even when plotting or saving fails, so it does not
        # stay registered with pyplot.
        plt.close(plot.figure)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(
        base64.b64encode(imgdata.getvalue()))
    #TODO Think about writing this to disk instead of caching them in strings

    stats['mini_histogram'] = mini_histogram(hist_data)

    return stats
# store data columnindex_categorical_stores = columnindex(header_stores, fields_categorical_stores) schema_stores = construct_schema(fields_categorical_stores, header_stores) df_stores_rdd = stores_d_file.filter(lambda l: header_stores_original[ 0] not in l).map(lambda p: convert_string_to_numeric_df( p.split(","), columnindex_categorical_stores)) df_stores = spark.createDataFrame(df_stores_rdd, schema_stores) ## split stores into control and treatment group according to store attributes (these attributes should be strongly related to store sales) store_group_col_names = ['avg_hhi', 'avg_traffic'] store_group_col_names_std = [ col_name + '_std' for col_name in store_group_col_names ] for col_name in ['avg_hhi', 'avg_traffic']: col_name_std = col_name + '_std' col_mean = df_stores.agg(F.mean(col(col_name))).collect()[0][0] col_std = df_stores.agg(F.stddev(col(col_name))).collect()[0][0] standarize_udf = udf(lambda x: (x - col_mean) / col_std, DoubleType()) df_stores = df_stores.withColumn(col_name_std, standarize_udf(col(col_name))) store_group_col_index = columnindex(df_stores.columns, store_group_col_names_std) df_stores_rdd = df_stores.rdd.map(list) ## perform the clustering to cluster stores according to the attributes store_number = df_stores.count() if store_number <= 2: cluster_number = 1 else: cluster_number = 3 clusters = KMeans.train(df_stores_rdd.map( lambda p: [p[index] for index in store_group_col_index]),