def test_multiple_udfs(self):
        """
        Test multiple group aggregate pandas UDFs in one agg function.
        """
        from pyspark.sql.functions import sum, mean

        df = self.data
        mean_udf = self.pandas_agg_mean_udf
        sum_udf = self.pandas_agg_sum_udf
        weighted_mean_udf = self.pandas_agg_weighted_mean_udf

        result1 = (df.groupBy('id')
                   .agg(mean_udf(df.v),
                        sum_udf(df.v),
                        weighted_mean_udf(df.v, df.w))
                   .sort('id')
                   .toPandas())
        expected1 = (df.groupBy('id')
                     .agg(mean(df.v),
                          sum(df.v),
                          mean(df.v).alias('weighted_mean(v, w)'))
                     .sort('id')
                     .toPandas())

        self.assertPandasEqual(expected1, result1)

    def test_mixed_udf(self):
        from pyspark.sql.functions import mean

        df = self.data
        w = self.unbounded_window

        plus_one = self.python_plus_one
        time_two = self.pandas_scalar_time_two
        mean_udf = self.pandas_agg_mean_udf

        result1 = df.withColumn(
            'v2',
            plus_one(mean_udf(plus_one(df['v'])).over(w)))
        expected1 = df.withColumn(
            'v2',
            plus_one(mean(plus_one(df['v'])).over(w)))

        result2 = df.withColumn(
            'v2',
            time_two(mean_udf(time_two(df['v'])).over(w)))
        expected2 = df.withColumn(
            'v2',
            time_two(mean(time_two(df['v'])).over(w)))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

    def test_basic(self):
        from pyspark.sql.functions import col, lit, mean

        df = self.data
        weighted_mean_udf = self.pandas_agg_weighted_mean_udf

        # Groupby one column and aggregate one UDF with literal
        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
        expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

        # Groupby one expression and aggregate one UDF with literal
        result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\
            .sort(df.id + 1)
        expected2 = df.groupby((col('id') + 1))\
            .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1)
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

        # Groupby one column and aggregate one UDF without literal
        result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
        expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id')
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

        # Groupby one expression and aggregate one UDF without literal
        result4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(weighted_mean_udf(df.v, df.w))\
            .sort('id')
        expected4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(mean(df.v).alias('weighted_mean(v, w)'))\
            .sort('id')
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())

    def test_without_partitionBy(self):
        from pyspark.sql.functions import mean

        df = self.data
        w = self.unpartitioned_window
        mean_udf = self.pandas_agg_mean_udf

        result1 = df.withColumn('v2', mean_udf(df['v']).over(w))
        expected1 = df.withColumn('v2', mean(df['v']).over(w))

        result2 = df.select(mean_udf(df['v']).over(w))
        expected2 = df.select(mean(df['v']).over(w))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

    def test_simple(self):
        from pyspark.sql.functions import mean

        df = self.data
        w = self.unbounded_window

        mean_udf = self.pandas_agg_mean_udf

        result1 = df.withColumn('mean_v', mean_udf(df['v']).over(w))
        expected1 = df.withColumn('mean_v', mean(df['v']).over(w))

        result2 = df.select(mean_udf(df['v']).over(w))
        expected2 = df.select(mean(df['v']).over(w))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
Example #6
    def test_shrinking_window(self):
        from pyspark.sql.functions import mean

        df = self.data
        w1 = self.shrinking_row_window
        w2 = self.shrinking_range_window

        mean_udf = self.pandas_agg_mean_udf

        result1 = df.withColumn('m1', mean_udf(df['v']).over(w1)) \
            .withColumn('m2', mean_udf(df['v']).over(w2))

        expected1 = df.withColumn('m1', mean(df['v']).over(w1)) \
            .withColumn('m2', mean(df['v']).over(w2))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    def test_replace_existing(self):
        from pyspark.sql.functions import mean

        df = self.data
        w = self.unbounded_window

        result1 = df.withColumn('v', self.pandas_agg_mean_udf(df['v']).over(w))
        expected1 = df.withColumn('v', mean(df['v']).over(w))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    def test_mixed_sql(self):
        from pyspark.sql.functions import mean

        df = self.data
        w = self.unbounded_window
        mean_udf = self.pandas_agg_mean_udf

        result1 = df.withColumn('v', mean_udf(df['v'] * 2).over(w) + 1)
        expected1 = df.withColumn('v', mean(df['v'] * 2).over(w) + 1)

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    def test_alias(self):
        from pyspark.sql.functions import mean

        df = self.data
        mean_udf = self.pandas_agg_mean_udf

        result1 = df.groupby('id').agg(mean_udf(df.v).alias('mean_alias'))
        expected1 = df.groupby('id').agg(mean(df.v).alias('mean_alias'))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
Example #11
def formatItens(firstTime):
    # format itinerary data
    global itens
    itens = itens.withColumn("ORIGIN_AIRPORT_ID", toInt("ORIGIN_AIRPORT_ID"))
    itens = itens.withColumn("DEST_AIRPORT_ID", toInt("DEST_AIRPORT_ID"))
    itens = itens.withColumn("MARKET_MILES_FLOWN", toKm("MARKET_MILES_FLOWN"))
    itens = itens.withColumn("PASSENGERS", toInt("PASSENGERS"))
    if firstTime:
        aggArg = (sum("PASSENGERS").alias("PASSENGERS"),
                  mean("MARKET_MILES_FLOWN").alias("MARKET_KMS_FLOWN"))
        itens = itens.groupBy("ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID").agg(*aggArg).cache()
Example #12
    def test_bounded_mixed(self):
        from pyspark.sql.functions import mean, max

        df = self.data
        w1 = self.sliding_row_window
        w2 = self.unbounded_window

        mean_udf = self.pandas_agg_mean_udf
        max_udf = self.pandas_agg_max_udf

        result1 = df.withColumn('mean_v', mean_udf(df['v']).over(w1)) \
            .withColumn('max_v', max_udf(df['v']).over(w2)) \
            .withColumn('mean_unbounded_v', mean_udf(df['v']).over(w1))

        expected1 = df.withColumn('mean_v', mean(df['v']).over(w1)) \
            .withColumn('max_v', max(df['v']).over(w2)) \
            .withColumn('mean_unbounded_v', mean(df['v']).over(w1))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    def test_multiple_udfs(self):
        from pyspark.sql.functions import mean, max, min

        df = self.data
        w = self.unbounded_window

        result1 = df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(w)) \
                    .withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(w)) \
                    .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w))

        expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) \
                      .withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('min_w', min(df['w']).over(w))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
Example #14
    def test_mixed_udf(self):
        from pandas.testing import assert_frame_equal
        from pyspark.sql.functions import mean

        df = self.data
        w = self.unbounded_window

        plus_one = self.python_plus_one
        time_two = self.pandas_scalar_time_two
        mean_udf = self.pandas_agg_mean_udf

        result1 = df.withColumn(
            'v2',
            plus_one(mean_udf(plus_one(df['v'])).over(w)))
        expected1 = df.withColumn(
            'v2',
            plus_one(mean(plus_one(df['v'])).over(w)))

        result2 = df.withColumn(
            'v2',
            time_two(mean_udf(time_two(df['v'])).over(w)))
        expected2 = df.withColumn(
            'v2',
            time_two(mean(time_two(df['v'])).over(w)))

        assert_frame_equal(expected1.toPandas(), result1.toPandas())
        assert_frame_equal(expected2.toPandas(), result2.toPandas())
Example #15
    def handleUIOptions(self, displayColName):
        agg = self.options.get("aggregation")
        valFields = self.options.get("valueFields")

        if agg == 'COUNT':
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
        elif agg == 'SUM':
            return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
        elif agg == 'AVG':
            return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
        elif agg == 'MIN':
            return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
        elif agg == 'MAX':
            return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
        elif agg == 'MEAN':
            return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
        else:
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
Example #16
    def transform(self, dataframe):
        """Applies standardization to the specified columns.

        # Arguments
            dataframe: dataframe. Spark Dataframe.
        """
        # Compute the means of the specified columns.
        means = [mean(x) for x in self.columns]
        means = dataframe.select(means).collect()[0].asDict()
        self.means = self.clean_mean_keys(means)
        # Compute the standard deviation of the specified columns.
        stddevs = [stddev_pop(x) for x in self.columns]
        stddevs = dataframe.select(stddevs).collect()[0].asDict()
        self.stddevs = self.clean_stddev_keys(stddevs)
        # For every feature, add a new column to the dataframe.
        for column in self.columns:
            self.current_column = column
            dataframe = dataframe.rdd.map(self._transform).toDF()

        return dataframe
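A minimal, self-contained sketch (assumed, not part of the original class) of the standardization this `transform` method sets up, applied to a single column with plain DataFrame operations; the column name `x` and the toy data are illustrative only:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev_pop

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ['x'])
# Collect the column's mean and population stddev, then add a standardized column.
stats = sdf.select(mean('x').alias('mu'), stddev_pop('x').alias('sigma')).collect()[0]
sdf = sdf.withColumn('x_standardized', (col('x') - stats['mu']) / stats['sigma'])
sdf.show()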

    def test_invalid_args(self):
        from pyspark.sql.functions import mean

        df = self.data
        plus_one = self.python_plus_one
        mean_udf = self.pandas_agg_mean_udf

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(
                    AnalysisException,
                    'nor.*aggregate function'):
                df.groupby(df.id).agg(plus_one(df.v)).collect()

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(
                    AnalysisException,
                    'aggregate function.*argument.*aggregate function'):
                df.groupby(df.id).agg(mean_udf(mean_udf(df.v))).collect()

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(
                    AnalysisException,
                    'mixture.*aggregate function.*group aggregate pandas UDF'):
                df.groupby(df.id).agg(mean_udf(df.v), mean(df.v)).collect()
Example #18
    def test_bounded_simple(self):
        from pyspark.sql.functions import mean, max, min, count

        df = self.data
        w1 = self.sliding_row_window
        w2 = self.shrinking_range_window

        plus_one = self.python_plus_one
        count_udf = self.pandas_agg_count_udf
        mean_udf = self.pandas_agg_mean_udf
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count_udf(df['v']).over(w2)) \
            .withColumn('max_v', max_udf(df['v']).over(w2)) \
            .withColumn('min_v', min_udf(df['v']).over(w1))

        expected1 = df.withColumn('mean_v', mean(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count(df['v']).over(w2)) \
            .withColumn('max_v', max(df['v']).over(w2)) \
            .withColumn('min_v', min(df['v']).over(w1))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
#print("sessions %d : uniqueSessions %d" % (sessionCount, uniqueSessions))
 
#Convert orders to grouped values
ordersDf = ordersDf.groupby("ssid").agg(sum("revenue").alias("revenue"), count("*").alias("transactions"))

#Convert sessions to grouped vals
conversionUdf = udf(convertUnixToDate,StringType())
sessionsDf  = sessionsDf.withColumn("unixTime", split(sessionsDf .ssid,":")[2])
sessionsDf  = sessionsDf.withColumn("startTime",conversionUdf("unixTime"))
sessionsDf  = sessionsDf.withColumn("siteId", split(sessionsDf .ssid,":")[1])

# #Join sessions and groups
sessionsAlias = sessionsDf.alias("session")
ordersAlias = ordersDf.alias("order")
featureAlias = featuresDf.select("ssid", "ad").alias("features")
sessionOrders = sessionsAlias.join(ordersAlias, ["ssid"])
joinedData = sessionOrders.join(featureAlias, ["ssid"])

# #Orderby and show values
groupedData = joinedData.groupby("startTime", "siteId", "gr", "ad", "browser").agg(count("*").alias("sessions"), sum("transactions").alias("transactions"), sum("revenue").alias("revenue"))
#groupedData.coalesce(1).write.option("sep","\t").option("header","true").csv("results/target.tsv")

# https://stackoverflow.com/questions/47995188/how-to-calculate-mean-and-standard-deviation-given-a-pyspark-dataframe
# Get matching pairs and calculate the mean and value
meanFeatureExp = [mean("feature-{0}".format(x+1)).alias("feature{0}_mean".format(x+1)) for x in range(4)]
stdDevExp = [stddev("feature-{0}".format(x+1)).alias("feature{0}_std".format(x+1)) for x in range(4)]
siteIdUdf = udf(ssidToSiteId,StringType())
featuresDf = featuresDf.withColumn("siteId",siteIdUdf("ssid"))
adData = featuresDf.groupby("siteId","ad").agg(*meanFeatureExp,*stdDevExp)
adData.coalesce(1).write.option("sep","\t").option("header","true").mode("append").csv("results/output.txt")
Example #20
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, dayofyear, mean, round
from pyspark.ml.clustering import KMeans
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

FEATURES_COL = [
    "DayOfYear", "Avg(Temperature)", "Avg(Humidity)", "Avg(Pressure)",
    "Avg(WindSpeed)"
]

spark = SparkSession.builder.appName("Project").getOrCreate()
df = spark.read.csv("./data.csv", header=True)
spark_df = df.filter((df.city == "Las Vegas") & (year("datetime") == 2016)) \
    .groupBy(dayofyear("datetime").alias("DayOfYear")) \
    .agg(mean("temperature").alias("Avg(Temperature)"), \
    mean("humidity").alias("Avg(Humidity)"), \
    mean("pressure").alias("Avg(Pressure)"), \
    mean("wind_speed").alias("Avg(WindSpeed)"), \
    mean("wind_direction").alias("Avg(WindDirection)")) \
    .orderBy('DayOfYear')

vecAssembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_kmeans = vecAssembler.transform(spark_df).select('DayOfYear', 'features')
scaler = MinMaxScaler(inputCol="features",\
         outputCol="scaledFeatures")
scalerModel = scaler.fit(df_kmeans.select("features"))
scaledData = scalerModel.transform(df_kmeans)
scaledData.show()

k = 5
def main(username):
    # For verification on the username received and print in console for demo. For actual deployment,
    # can comment away.
    print(f"Received username= {username}")

    # Start the Spark instance
    cnfg = SparkConf().setAppName("TwitterUserProfile").setMaster("local[2]")
    sc = SparkContext(conf=cnfg)
    spark = SparkSession(sc)

    # Initialise the first page of tweets & user (1 page consists of 10 entries)
    url = create_url(target=username)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)

    # Parsing the JSON response returned by Twitter
    tweet_df = spark.createDataFrame(json_response['data'])

    # Check if there's geolocation field in the response.
    geo_exist = has_column(tweet_df, "geo")

    # Extracting the geolocation information via geo.place_id
    if geo_exist:
        tweet_df = tweet_df.select("author_id", "created_at", "geo.place_id",
                                   "id", "text")
    else:
        tweet_df = tweet_df.select("author_id", "created_at", "id", "text")

    # Extracting the user details
    user_df = spark.createDataFrame(json_response['includes']['users'])

    # flatten the public_metrics
    cols = list(
        map(lambda f: F.col("public_metrics").getItem(f).alias(str(f)), [
            "following_count", "tweet_count", "listed_count", "followers_count"
        ]))

    public_metrics = user_df.select(cols)
    user_df = user_df.drop('public_metrics')

    # Merge user_df with public_metrics
    user_df = with_column_index(user_df)
    public_metrics = with_column_index(public_metrics)
    user_df = user_df.join(public_metrics,
                           user_df.ColumnIndex == public_metrics.ColumnIndex,
                           'inner').drop("ColumnIndex")

    # If there are more tweets (next page / next token), append it to tweet_df.
    # user_df is just for a single user, so no need to append. Info will be the same.

    if 'next_token' not in json_response['meta']:
        pass
    else:
        next_token = json_response['meta']['next_token']

        while next_token is not None:
            url = create_url(username, next_token)
            json_response = connect_to_endpoint(url, headers)

            new_tweets = spark.createDataFrame(json_response['data'])

            # Check if there's geolocation field in the new tweets
            new_tweet_geo_exist = has_column(new_tweets, "geo")

            if new_tweet_geo_exist:
                new_tweets = new_tweets.select("author_id", "created_at",
                                               "geo.place_id", "id", "text")
            else:
                new_tweets = new_tweets.select("author_id", "created_at", "id",
                                               "text")

            # to make sure all have the same number of columns
            for column in tweet_df.columns:
                if column not in new_tweets.columns:
                    new_tweets = new_tweets.withColumn(column, F.lit(None))

            for column in new_tweets.columns:
                if column not in tweet_df.columns:
                    tweet_df = tweet_df.withColumn(column, F.lit(None))

            # Reordering the column of new_tweets for union function
            if geo_exist:
                new_tweets = new_tweets.select("author_id", "created_at",
                                               "place_id", "id", "text")
            else:
                new_tweets = new_tweets.select("author_id", "created_at", "id",
                                               "text")

            tweet_df = tweet_df.union(new_tweets)

            if 'next_token' not in json_response['meta']:
                next_token = None
            else:
                next_token = json_response['meta']['next_token']

    # Show the df. Can comment away in actual production.
    tweet_df.show(truncate=False)
    user_df.show(truncate=False)

    # Extract geolocation information within the tweets. Currently not in use.
    if geo_exist:
        location_df = tweet_df.select("author_id", "id", "place_id").dropna()
        location_df.show(truncate=False)

    # WORD FREQUENCY - to be made into word cloud in Tableau or other visualisation software.
    tweet_only = tweet_df.select("author_id", "text")

    # Remove punctuation, convert to lower case
    df_clean = tweet_only.select(
        "author_id",
        (lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text')))

    # Tokenize text
    tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select(
        'author_id', 'words_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'author_id', 'words_clean')

    # Filter length word > 3
    filter_length_udf = udf(lambda row: [x for x in row if 3 <= len(x) <= 13],
                            ArrayType(StringType()))
    df_final_words = df_words_no_stopw.withColumn(
        'words', filter_length_udf(col('words_clean')))

    # Printing the word list. Can comment away in actual deployment.
    df_final_words.show(truncate=False)

    word_count = df_final_words.select('author_id', F.explode('words').alias('word')).\
        groupBy('author_id', 'word').\
        count().\
        sort('count', ascending=False)

    # Printing the word list and count. Can comment away in actual deployment.
    word_count.show()

    # SENTIMENT ANALYSIS. Sentiment is in the range of (-1, 1).
    sentiment = udf(lambda x: TextBlob(x).sentiment[0])
    tweet_sentiment = tweet_df.withColumn(
        "sentiment_score",
        sentiment(tweet_df["text"]).cast("double"))

    classify_sentiment_udf = udf(classify_sentiment)

    tweet_sentiment = tweet_sentiment.withColumn(
        "sentiment",
        classify_sentiment_udf(tweet_sentiment["sentiment_score"]))
    tweet_sentiment = tweet_sentiment.select('author_id', 'created_at', 'id',
                                             'text', 'sentiment_score',
                                             'sentiment')
    # Can comment away the show statement. Left here to display the progress in console for demo.
    tweet_sentiment.show()

    sentiment_count = tweet_sentiment.groupBy('author_id', 'sentiment').agg(
        F.mean('sentiment_score'),
        F.count('sentiment')).toDF("author_id", "sentiment",
                                   "avg_sentiment_score", "count")
    # Can comment away the show statement. Left here to display the progress in console for demo.
    sentiment_count.show()

    # Read in existing data from Amazon RedShift DB. If user already exists, need to merge and deduplicate, then write data back.
    with redshift_conn.connect() as conn, conn.begin():

        # Check if Table exists first. If so, read in existing Twitter users that are already in RedShift DB.
        # The unique key is the id, which is the author_id, Twitter user id.
        if redshift_conn.has_table("user_data"):
            user = pd.read_sql("""
               select * from user_data;""", conn)

            # Append latest data retrieved to those in DB and remove duplicates, keeping the latest.
            user = user.append(user_df.toPandas())
            user = user.drop_duplicates(subset="id", keep="last")
        else:
            user = user_df.toPandas()

        # Similarly, check if the Table for sentiment count exists. If so, read in existing sentiment count
        # for existing users in RedShift DB. The pair (author_id, sentiment) is used for deduplication.
        if redshift_conn.has_table("sentiment_count"):
            senti_df = pd.read_sql(
                """
               select * from sentiment_count;""", conn)

            # Append latest data to those in DB and remove duplicates, keeping the latest.
            senti_df = senti_df.append(sentiment_count.toPandas())
            senti_df = senti_df.drop_duplicates(
                subset=["author_id", "sentiment"], keep="last")
        else:
            senti_df = sentiment_count.toPandas()

        # Checking if Table for word_count already exists in RedShift. If so, read in existing word count for
        # existing users in RedShift DB. Distinct pair of author_id and word is used for comparison.
        if redshift_conn.has_table("word_count"):
            word_df = pd.read_sql(
                """
                       select * from word_count;""", conn)

            # Append latest data to those in DB and remove duplicates, keeping the latest.
            word_df = word_df.append(word_count.toPandas())
            word_df = word_df.drop_duplicates(subset=["author_id", "word"],
                                              keep="last")
        else:
            word_df = word_count.toPandas()

        # Check for Table, tweet_sentiment. If exists, read in existing tweet sentiment for existing users in
        # RedShift DB. The unique ID used is the tweet id, which is unique for each tweet. All unique tweets
        # are kept. Thus even if the Twitter user deleted his old tweets, it will still be retained in the
        # Redshift DB if it was previously captured.
        if redshift_conn.has_table("tweet_sentiment"):
            tweet_db = pd.read_sql(
                """
                       select * from tweet_sentiment;""", conn)

            # Append latest data to those in DB and remove duplicates, keeping the latest.
            tweet_db = tweet_db.append(tweet_sentiment.toPandas())
            tweet_db = tweet_db.drop_duplicates(subset="id", keep="last")
        else:
            tweet_db = tweet_sentiment.toPandas()

    # Update the data to Redshift.
    user.to_sql('user_data', redshift_conn, index=False, if_exists='replace')
    word_df.to_sql('word_count',
                   redshift_conn,
                   index=False,
                   if_exists='replace')
    senti_df.to_sql('sentiment_count',
                    redshift_conn,
                    index=False,
                    if_exists='replace')
    tweet_db.to_sql('tweet_sentiment',
                    redshift_conn,
                    index=False,
                    if_exists='replace',
                    dtype={
                        'author_id':
                        sqlalchemy.types.VARCHAR(length=255),
                        'created_at':
                        sqlalchemy.types.VARCHAR(length=255),
                        'id':
                        sqlalchemy.types.VARCHAR(length=255),
                        'text':
                        sqlalchemy.types.VARCHAR(length=5000),
                        'sentiment_score':
                        sqlalchemy.types.Float(precision=3, asdecimal=True),
                        'sentiment':
                        sqlalchemy.types.VARCHAR(length=255),
                    })
    # Location information in tweet. Currently not in use.
    # location.to_sql('location_data', redshift_conn, index=False, if_exists='replace')

    # Can comment away print statement for actual deployment. Left here so that status will be printed in
    # console for demo purpose.
    print("Redshift DB updated successfully.")
Example #22
df = spark.read.load("gs://vf-polimi-batch-data/dpi/year=%d/month=%d" % (now.year, now.month), \
                              format='com.databricks.spark.csv', \
                              header='true', \
                              inferSchema='true')

# number, signature, usage, timestamp

df = df.withColumn('timestamp', f.to_timestamp('timestamp','dd/MM/yyyy-HH:mm:ss'))

df = df.withColumn('hour', f.hour('timestamp'))

# 'day' is not among the source columns listed above; deriving it from the timestamp
# (dayofmonth is an assumption) so the groupBy('number', 'signature', 'day', 'hour') below works.
df = df.withColumn('day', f.dayofmonth('timestamp'))

df = df\
    .groupBy('number', 'signature', 'day','hour')\
    .sum('usage')\
    .groupBy('number', 'signature', 'hour')\
    .agg(f.mean('sum(usage)'))\
    .withColumnRenamed('avg(sum(usage))','average_usage')

# need to create a dataset first "bq mk vf_polimi_demo_dataset"
print('Computing kpis and writing output to BigQuery')
df.write.format('bigquery') \
    .option('table', 'vf_polimi_demo_dataset.batch_kpi%d%d' % (now.year, now.month)) \
    .option("temporaryGcsBucket","vf-polimi-batch-data") \
    .mode('overwrite') \
    .save()

print('Finished')

# alternative solution to write output on GCS partitioned by date
#df.write.partitionBy('hour').option('header', 'true').mode('overwrite').csv('gs://vf-polimi-batch-data/dpi-kpi/year=%d/month=%d' % (now.year, now.month))
# A slightly different way to generate the two random columns
df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
#df.describe().show()
display(df.describe())


# COMMAND ----------

#df.describe('uniform', 'normal').show()
display(df.describe('uniform', 'normal'))

# COMMAND ----------

from pyspark.sql.functions import mean, min, max
#df.select([mean('uniform'), min('uniform'), max('uniform')]).show()
display(df.select([mean('uniform'), min('uniform'), max('uniform')]))

# COMMAND ----------

# MAGIC %md ### Sample covariance and correlation
# MAGIC 
# MAGIC Covariance is a measure of how two variables change with respect to each other. A positive value means that as one variable increases, the other tends to increase as well; a negative value means that as one variable increases, the other tends to decrease. The sample covariance of two columns of a DataFrame can be calculated as follows:

# COMMAND ----------

from pyspark.sql.functions import rand
df = sqlContext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))


# COMMAND ----------
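# A minimal sketch (assumed continuation, not part of the original notebook) of the
# covariance and correlation calls the markdown cell above leads into, using the
# built-in DataFrame.stat API on the two random columns created above:
print(df.stat.cov('rand1', 'rand2'))
print(df.stat.corr('rand1', 'rand2'))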
Example #24
    'Total_Individuals_in_Shelter', 'Single_Adult_Men_in_Shelter',
    'Single_Adult_Women_in_Shelter', 'Total_Single_Adults_in_Shelter',
    'Families_with_Children_in_Shelter', 'Adults_in_Families_with_Children_in_Shelter',
    'Children_in_Families_with_Children_in_Shelter',
    "Total_Individuals_in_Families_with_Children_in_Shelter_", 'Adult_Families_in_Shelter',
    'Individuals_in_Adult_Families_in_Shelter', 'case_count', 'cum_case_count', 'hosp_count', 'cum_hosp_count',
    'death_count', 'death_count_prob', 'cum_death_count')

# Clean transformed datasets
cols = ['Total_Adults_in_Shelter', 'Total_Children_in_Shelter', 'Total_Individuals_in_Shelter']
cvd_dhs = clean_na(cvd_dhs, cols)

# Compute rolling averages of homeless shelter census counts to replace null values
window2 = Window.partitionBy().orderBy('date').rowsBetween(Window.currentRow - 8, Window.currentRow - 1)
cvd_dhs = cvd_dhs.withColumn('total_adults_rolling_avg',
                             functions.mean(cvd_dhs['Total_Adults_in_Shelter']).over(window2))
cvd_dhs = cvd_dhs.withColumn('total_children_rolling_avg',
                             functions.mean(cvd_dhs['Total_Children_in_Shelter']).over(window2))
cvd_dhs = cvd_dhs.withColumn('total_individuals_rolling_avg',
                             functions.mean(cvd_dhs['Total_Individuals_in_Shelter']).over(window2))

# Computing p-scores for Adults and Children in homeless shelters
cvd_dhs.createOrReplaceTempView('cvd_dhs')
result = spark.sql("SELECT CD.date, CD.case_count, CD.cum_case_count, CD.hosp_count, "
                   "CD.cum_hosp_count, CD.death_count, CD.death_count_prob, CD.cum_death_count, "
                   "AVG(CD3.adults) as adults_avg, AVG(CD3.children) as children_avg, AVG(CD3.total) as total_avg, "
                   "COALESCE(CD.Total_Adults_in_Shelter, CD.total_adults_rolling_avg) as adults_cvd, "
                   "COALESCE(CD.Total_Children_in_Shelter, CD.total_children_rolling_avg) as children_cvd, "
                   "COALESCE(CD.Total_Individuals_in_Shelter, CD.total_individuals_rolling_avg) as total_cvd, "
                   "((COALESCE(CD.Total_Adults_in_Shelter, CD.total_adults_rolling_avg) - "
                   "AVG(CD3.adults)) / AVG(CD3.adults)) as adults_pscore, "
Example #25
#Part 2 Task 3

#Import Modules
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

#Create a spark session
spark = SparkSession.builder.appName("AirBnB").getOrCreate()

#Dataset path
path = 'C:/Users/Ashish/Desktop/Truata/airbnbdata.parquet'

#Output Directory Path
outputDir = 'C:/Users/Ashish/Desktop/Truata/out/'

#Load parquet file into dataframe
airbnb_Data = spark.read.parquet(path)

#Query rows with price > 5000 and review_scores_value == 10, then average bathrooms and bedrooms
bedBathData = airbnb_Data.filter((airbnb_Data['price'] > 5000) & (airbnb_Data['review_scores_value'] == 10)).select(["bathrooms", "bedrooms"]).agg(F.mean('bathrooms'), F.mean('bedrooms'))

#Renaming Columns
renamedBedBathData = bedBathData.withColumnRenamed('avg(bathrooms)', 'avg_bathrooms')\
    .withColumnRenamed('avg(bedrooms)', 'avg_bedrooms')

#Saving as CSV file
renamedBedBathData.toPandas().to_csv(outputDir+'out_2_3.csv', index= False)
Example #26
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().ix[:,0]
                                        )
        stats = stats_df.ix[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().ix[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
Example #27
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.master("local").appName("pyspark homework").getOrCreate()
    file_path = "hdfs:///dataset/bank-data.csv"
    df = spark.read.csv(path=file_path, header=True, inferSchema=True)

    df.groupBy("sex").agg(F.min("income"), F.max("income"), F.mean("income")).show()

    df.groupBy("region").agg({"income": "mean"}).show()
Example #28
 def mean(scol):
     return F.when(
         F.row_number().over(self._window) >= self._min_periods,
         F.mean(scol).over(self._window)).otherwise(F.lit(None))
Example #29
 def fn(col):
     if 'window' in kwargs:
         window = kwargs['window']
         return F.mean(col).over(window)
     else:
         return F.mean(col)
Example #30
# resulting column:

flights \
  .agg(countDistinct("carrier").alias("num_carriers")) \
  .show()

# `groupBy()` groups data by the specified columns, so
# aggregations can be computed by group:

from pyspark.sql.functions import mean

flights \
  .groupBy("origin") \
  .agg( \
       count("*").alias("num_departures"), \
       mean("dep_delay").alias("avg_dep_delay") \
  ) \
  .show()

# You can chain together multiple DataFrame methods:

flights \
  .filter(col("dest") == lit("BOS")) \
  .groupBy("origin") \
  .agg( \
       count("*").alias("num_departures"), \
       mean("dep_delay").alias("avg_dep_delay") \
  ) \
  .orderBy("avg_dep_delay") \
  .show()
Example #31
                       'float'), 2).alias('Low'),
                   format_number(result_desc['Close'].cast(
                       'float'), 2).alias('Close'),
                   result_desc['Volume'].cast('int').alias('Volume')
                   ).show()

hv_ratio = walmartdf.withColumn(
    "HV ratio", walmartdf['High'] / walmartdf['Volume'])

hv_ratio.select('HV Ratio').show()

# finding highest value date
walmartdf.orderBy(walmartdf['High'].desc()).head(1)[0][0]


walmartdf.agg(mean(walmartdf['Close'])).show()

walmartdf.select(max(walmartdf['Volume']), min('Volume')).show()

walmartdf.filter(walmartdf['Close'] < 60).count()

(walmartdf.filter(walmartdf['High'] > 80).count(
) / walmartdf.agg(count(walmartdf['Date'])).head(1)[0][0]) * 100

newdf = walmartdf.withColumn("year", year(walmartdf['Date']))

newdf.groupby("year").max().select('year', 'max(High)').show()

newdf2 = walmartdf.withColumn("month", month(walmartdf['Date']))

newdf2.groupBy("month").mean().select(
# immigration
us_immigrant = spark.read.parquet(
    '{}immigrant/'.format(processed_data_path)).filter(
        F.col('monthYear') == F.lit(monthYear))
us_immigration = spark.read.parquet(
    '{}immigration/'.format(processed_data_path)).filter(
        F.col('monthYear') == F.lit(monthYear))

# demographics
us_demographics = spark.read.parquet('{}demographics/'.format(processed_data_path))\
.select("median_age", "city_id", "total_population", "foreign_born")\
.join(city.select("state_code", "city_id"), "city_id")\
.drop('city_id')\
.groupBy("state_code")\
.agg(
    F.mean("median_age").alias('median_age'),
    F.sum("total_population").alias("total_population"),
    F.sum("foreign_born").alias("foreign_born")
)

# process anlaytics immigration
analytics_immigration = us_immigrant\
.select('cicid', 'from_country_code', 'age', 'occupation', 'gender', 'monthYear')\
.join(country_code, us_immigrant.from_country_code == country_code.code, 'left')\
.drop('from_country_code', 'code')\
.withColumnRenamed('country', 'from_country')\
.join(us_immigration.select('cicid','state_code'), 'cicid', 'left')\
.join(us_state_code, us_immigration.state_code == us_state_code.code, 'left')\
.drop('code')\
.join(us_demographics, 'state_code')\
.drop('state_code')
Example #33
        # count = sum of daily counts
        feat + 'count' + dd:
        f.sum(f.col(feat + 'count_0d')).over(window),

        # A few more complicated examples:

        # mean = weighted mean of daily means
        feat + 'mean' + dd:
        f.sum(f.col(feat + 'mean_0d') * f.col(feat + 'count_0d')).over(window)
        / f.sum(f.col(feat + 'count_0d')).over(window),

        # stddev = sqrt(weighted mean of daily variances)
        feat + 'stddev' + dd:
        f.sqrt(
            f.mean(f.col(feat + 'count_0d') *
                   f.col(feat + 'stddev_0d')**2).over(window) /
            f.sum(f.col(feat + 'count_0d')).over(window)),
    }

    # Loop through the dictionary of new columns and add them to the aggregated
    # dataframe
    for col_name, col_obj in new_cols.items():
        add = SparkWithColumn(name='add_' + col_name,
                              read_key='df_agg',
                              store_key='df_agg',
                              new_col_name=col_name,
                              new_col=col_obj)

        lookback_chain.add(add)

# STEP 5: Save the results
Example #34
from pyspark.sql.types import *
import time
from pyspark.sql import SparkSession
import json
import pyspark.sql.functions as f

spark = (SparkSession.builder.appName("Spark Benchmarking").master(
    "local[*]").config("spark.driver.memory",
                       "8g").config("spark.driver.maxResultSize",
                                    "4g").getOrCreate())

big_df = (spark.read.format("csv").option("header", "true").option(
    "delimiter", ",").load("./data/KB/*.csv"))
small_df = big_df.groupby("key").agg(f.mean(f.col("value")))
# df.write is a property returning a DataFrameWriter, so `small_df.write()` would raise
# a TypeError; persisting to a hypothetical local path instead to materialize the result:
small_df.write.mode("overwrite").parquet("./data/small_df_out")

joined = big_df.join(small_df, small_df.key == big_df.key, how="left")
movies = movies.withColumn("year", get_year_udf(movies.title))
movies.show()

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC As part of the analysis of this dataset, it would be useful to have the average rating for each movie. In the following cell of code, I aggregate over the ratings table to get the average rating for each movie ID. 

# COMMAND ----------

from pyspark.sql.types import FloatType
from pyspark.sql.functions import bround
from pyspark.sql.functions import mean

ratings_agg = ratings.groupBy("movieId").agg(mean("rating").alias("avg_rating"))
ratings_agg = ratings_agg.withColumn("average_rating", ratings_agg.avg_rating.cast(FloatType())).drop("avg_rating").withColumnRenamed("average_rating", "avg_rating")
ratings_agg = ratings_agg.select("movieId",bround("avg_rating",2).alias("avg_rating"))
ratings_agg.show()

# COMMAND ----------

# MAGIC %md 
# MAGIC Here we evaluate the average rating by year to identify whether there is a trend in the ratings, either decreasing or increasing over the years. Visually, it is not possible to appreciate such a trend, but it was possible to identify some outlier values in the year column.
# MAGIC 
# MAGIC To achieve this, it was necessary to join the aggregated ratings table with movies table that includes the year as a column. 

# COMMAND ----------

joined_movies = movies.join(ratings_agg,"movieId")
joined_movies.select("year",'avg_rating').groupBy("year").mean().orderBy("year").display()
#df1.describe("trip_distance").show()
#df1.describe("total_amount").show()

# Query data
# https://towardsdatascience.com/beginners-guide-to-pyspark-bbe3b553b79f

# Compute monthly avg trip_distance & total_amount
# df1.select('trip_distance'
#           ).groupBy('pickup_mon')\
#           .mean()\
#           .show()
#AttributeError: 'GroupedData' object has no attribute 'describe'

# Try this instead
# https://stackoverflow.com/questions/51632126/pysparkhow-to-calculate-avg-and-count-in-a-single-groupby
df1.groupBy('pickup_mon').agg(functions.mean('trip_distance'),
                              functions.count('trip_distance')).show()
df1.groupBy('pickup_mon').agg(functions.mean('total_amount'),
                              functions.count('total_amount')).show()

# Compute stddev next

# Consider map individual variable and compute summary statistics; it may be faster?
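# A minimal sketch (assumed, not in the original script) of the "compute stddev next"
# step, reusing the same df1 and `functions` alias as the aggregations above:
df1.groupBy('pickup_mon').agg(functions.stddev('trip_distance'),
                              functions.stddev('total_amount')).show()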

# Delete the temporary files
input_path = sc._jvm.org.apache.hadoop.fs.Path(input_directory)
input_path.getFileSystem(sc._jsc.hadoopConfiguration()).delete(
    input_path, True)

## Back to Google Cloud, Week 7
## Upload this file to Storage's cs512_trip
Example #37
def get_average(dataset, column):
    return dataset.agg(mean(dataset[column]).alias('mean')).collect()[0]['mean']
def readDataFromES():
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    results_gen = elasticsearch.helpers.scan(
        es,
        index='netflowrepo',
        doc_type='entry',
        query={"query": {
            "match_all": {}
        }})

    results = list(results_gen)

    id_list = []
    sumOfBytes_list = []
    uniqDstIPs_list = []
    uniqDstPorts_list = []

    for row in results:
        id_list.append(row['_id'])
        sumOfBytes_list.append(row['_source']['sumOfBytes'])
        uniqDstIPs_list.append(row['_source']['uniqDstIPs'])
        uniqDstPorts_list.append(row['_source']['uniqDstPorts'])

    # Convert data to numpy arrays.
    np_ID = np.array(id_list)
    np_Bytes = np.array(sumOfBytes_list)
    np_DstIPs = np.array(uniqDstIPs_list)
    np_DstPorts = np.array(uniqDstPorts_list)

    # Convert data into Matrix. Each feature is in a column.
    tmp1 = np.concatenate((np_Bytes.reshape((-1, 1)), np_DstIPs.reshape(
        (-1, 1))),
                          axis=1)
    tmp2 = np.concatenate((tmp1, np_DstPorts.reshape((-1, 1))), axis=1)
    mat = sc.parallelize(tmp2.tolist())

    # Convert to Data Frame.
    df = spark.createDataFrame(mat)
    df = df.toDF('sumOfBytes', 'uniqDstIPs', 'uniqDstPorts')  # Add headers.
    if DEBUGMODE: df.show()

    # Add unique numeric ID, and place in first column.
    df = df.withColumn("id", monotonically_increasing_id())
    df = df.select("id", FEATURE_COLS[0], FEATURE_COLS[1], FEATURE_COLS[2])
    if DEBUGMODE: df.show()

    # Convert all data columns to float.
    for col in df.columns:
        if col in FEATURE_COLS:
            df = df.withColumn(col, df[col].cast('float'))
    if DEBUGMODE: df.show()

    # Need to convert this to a vector for Spark's implementation of KMeans.
    vecAssembler = VectorAssembler(inputCols=FEATURE_COLS,
                                   outputCol="features")
    df_kmeans = vecAssembler.transform(df).select(
        'id', 'features')  # Drop other columns.
    if DEBUGMODE: df_kmeans.show()

    if SCALING_FLAG:
        # Scale the data.
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures")
        scaler_model = scaler.fit(df_kmeans)
        df_scaled = scaler_model.transform(df_kmeans)
        if DEBUGMODE: df_scaled.show()

        # Train the Machine Learning Model.
        k = 3  # silhouette score of 0.799529809602
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
        model = kmeans.fit(df_scaled)

        centers = model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)

        # Assign events to clusters.
        predictions = model.transform(df_scaled).select(
            'id', 'scaledFeatures', 'prediction')

        if DEBUGMODE: predictions.show()

        # Extract scaledFeatures column back to FEATURE_COLS
        predictions = predictions.rdd.map(extract).toDF([
            "id", "prediction", "scaledFeatures", "sumOfBytes", "uniqDstIPs",
            "uniqDstPorts"
        ])

        # Rename scaledFeatures to features.
        predictions = predictions.withColumnRenamed("scaledFeatures",
                                                    "features")

        df_pred = predictions

        # # # # Find optimal choice for k.
        # # # cost = np.zeros(20)
        # # # for k in range(2,20):
        # # # kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
        # # # model = kmeans.fit(df_scaled.sample(False,0.1, seed=42))
        # # # cost[k] = model.computeCost(df_scaled)
        # # # printDebugMsg("Cost =")
        # # # for k in range(2, 20):
        # # # printDebugMsg("{0}: {1}".format(k, cost[k]))
        # # # sys.exit(1)

    else:
        # Train the Machine Learning Model.
        k = 3  # silhouette score of 0.997791174741 with no scaling.
        # Using "features" has a higher silhouette score of 0.997791174741
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        model = kmeans.fit(df_kmeans)

        centers = model.clusterCenters()
        printDebugMsg("Cluster Centers: ")
        for center in centers:
            printDebugMsg(center)

        # Assign events to clusters.
        predictions = model.transform(df_kmeans).select(
            'id', 'features', 'prediction')

        # # # # Find optimal choice for k.
        # # # cost = np.zeros(20)
        # # # for k in range(2,20):
        # # # kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        # # # model = kmeans.fit(df_kmeans.sample(False,0.1, seed=42))
        # # # cost[k] = model.computeCost(df_kmeans)
        # # # printDebugMsg("Cost =")
        # # # for k in range(2, 20):
        # # # printDebugMsg("{0}: {1}".format(k, cost[k]))
        # # # sys.exit(1)

        rows = predictions.collect()
        # Create prediction dataframe.
        df_pred = spark.createDataFrame(rows)

        # Join prediction with original data.
        df_pred = df_pred.join(df, 'id')
        if DEBUGMODE: df_pred.show()

    if DEBUGMODE: predictions.show()
    printDebugMsg("Prediction counts for each cluster:")
    if DEBUGMODE: predictions.groupBy('prediction').count().show()

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    printDebugMsg(
        "Silhouette with squared euclidean distance = {0}".format(silhouette))

    # Get max, stddev, and mean by cluster.
    row_0 = df_pred.filter(df_pred['prediction'] == 0).groupBy().max(
        'sumOfBytes', 'uniqDstIPs', 'uniqDstPorts').collect()[0]
    row_1 = df_pred.filter(df_pred['prediction'] == 1).groupBy().max(
        'sumOfBytes', 'uniqDstIPs', 'uniqDstPorts').collect()[0]
    row_2 = df_pred.filter(df_pred['prediction'] == 2).groupBy().max(
        'sumOfBytes', 'uniqDstIPs', 'uniqDstPorts').collect()[0]
    sumOfBytes_0_max = row_0[0]
    uniqDstIPs_0_max = row_0[1]
    uniqDstPorts_0_max = row_0[2]
    sumOfBytes_1_max = row_1[0]
    uniqDstIPs_1_max = row_1[1]
    uniqDstPorts_1_max = row_1[2]
    sumOfBytes_2_max = row_2[0]
    uniqDstIPs_2_max = row_2[1]
    uniqDstPorts_2_max = row_2[2]

    printDebugMsg("sumOfBytes_0_max = {0}".format(sumOfBytes_0_max))
    printDebugMsg("uniqDstIPs_0_max = {0}".format(uniqDstIPs_0_max))
    printDebugMsg("uniqDstPorts_0_max = {0}".format(uniqDstPorts_0_max))
    printDebugMsg("sumOfBytes_1_max = {0}".format(sumOfBytes_1_max))
    printDebugMsg("uniqDstIPs_1_max = {0}".format(uniqDstIPs_1_max))
    printDebugMsg("uniqDstPorts_1_max = {0}".format(uniqDstPorts_1_max))
    printDebugMsg("sumOfBytes_2_max = {0}".format(sumOfBytes_2_max))
    printDebugMsg("uniqDstIPs_2_max = {0}".format(uniqDstIPs_2_max))
    printDebugMsg("uniqDstPorts_2_max = {0}".format(uniqDstPorts_2_max))

    # Get original data stddev.  This is for scaling the new input.
    sumOfBytes_Orig_stddev = df.select(stddev('sumOfBytes')).collect()[0][0]
    uniqDstIPs_Orig_stddev = df.select(stddev('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_Orig_stddev = df.select(
        stddev('uniqDstPorts')).collect()[0][0]
    printDebugMsg(
        "sumOfBytes_Orig_stddev = {0}".format(sumOfBytes_Orig_stddev))
    printDebugMsg(
        "uniqDstIPs_Orig_stddev = {0}".format(uniqDstIPs_Orig_stddev))
    printDebugMsg(
        "uniqDstPorts_Orig_stddev = {0}".format(uniqDstPorts_Orig_stddev))

    # Get scaled data stddev for All clusters.  This is for determining the threshold.
    sumOfBytes_All_stddev = df_pred.select(
        stddev('sumOfBytes')).collect()[0][0]
    uniqDstIPs_All_stddev = df_pred.select(
        stddev('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_All_stddev = df_pred.select(
        stddev('uniqDstPorts')).collect()[0][0]
    printDebugMsg("sumOfBytes_All_stddev = {0}".format(sumOfBytes_All_stddev))
    printDebugMsg("uniqDstIPs_All_stddev = {0}".format(uniqDstIPs_All_stddev))
    printDebugMsg(
        "uniqDstPorts_All_stddev = {0}".format(uniqDstPorts_All_stddev))

    # Set values to scaled data for each cluster for determining threshold.
    sumOfBytes_0_stddev = sumOfBytes_1_stddev = sumOfBytes_2_stddev = sumOfBytes_All_stddev
    uniqDstIPs_0_stddev = uniqDstIPs_1_stddev = uniqDstIPs_2_stddev = uniqDstIPs_All_stddev
    uniqDstPorts_0_stddev = uniqDstPorts_1_stddev = uniqDstPorts_2_stddev = uniqDstPorts_All_stddev

    # Get original data mean.  This is for scaling the new input.
    sumOfBytes_Orig_mean = df.select(mean('sumOfBytes')).collect()[0][0]
    uniqDstIPs_Orig_mean = df.select(mean('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_Orig_mean = df.select(mean('uniqDstPorts')).collect()[0][0]
    printDebugMsg("sumOfBytes_Orig_mean = {0}".format(sumOfBytes_Orig_mean))
    printDebugMsg("uniqDstIPs_Orig_mean = {0}".format(uniqDstIPs_Orig_mean))
    printDebugMsg(
        "uniqDstPorts_Orig_mean = {0}".format(uniqDstPorts_Orig_mean))

    # Get scaled data mean for All clusters.  This is for determining the threshold.
    sumOfBytes_All_mean = df_pred.select(mean('sumOfBytes')).collect()[0][0]
    uniqDstIPs_All_mean = df_pred.select(mean('uniqDstIPs')).collect()[0][0]
    uniqDstPorts_All_mean = df_pred.select(
        mean('uniqDstPorts')).collect()[0][0]
    printDebugMsg("sumOfBytes_All_mean = {0}".format(sumOfBytes_All_mean))
    printDebugMsg("uniqDstIPs_All_mean = {0}".format(uniqDstIPs_All_mean))
    printDebugMsg("uniqDstPorts_All_mean = {0}".format(uniqDstPorts_All_mean))

    # Set values to scaled data for each cluster for determining threshold.
    sumOfBytes_0_mean = sumOfBytes_1_mean = sumOfBytes_2_mean = sumOfBytes_All_mean
    uniqDstIPs_0_mean = uniqDstIPs_1_mean = uniqDstIPs_2_mean = uniqDstIPs_All_mean
    uniqDstPorts_0_mean = uniqDstPorts_1_mean = uniqDstPorts_2_mean = uniqDstPorts_All_mean

    upperThreshold_0_Bytes = sumOfBytes_0_max + NUM_STDDEV_ABOVE * sumOfBytes_0_stddev
    printDebugMsg(
        "upperThreshold_0_Bytes = {0}".format(upperThreshold_0_Bytes))

    upperThreshold_1_Bytes = sumOfBytes_1_max + NUM_STDDEV_ABOVE * sumOfBytes_1_stddev
    printDebugMsg(
        "upperThreshold_1_Bytes = {0}".format(upperThreshold_1_Bytes))

    upperThreshold_2_Bytes = sumOfBytes_2_max + NUM_STDDEV_ABOVE * sumOfBytes_2_stddev
    printDebugMsg(
        "upperThreshold_2_Bytes = {0}".format(upperThreshold_2_Bytes))

    upperThreshold_0_DstIPs = uniqDstIPs_0_max + NUM_STDDEV_ABOVE * uniqDstIPs_0_stddev
    printDebugMsg(
        "upperThreshold_0_DstIPs = {0}".format(upperThreshold_0_DstIPs))

    upperThreshold_1_DstIPs = uniqDstIPs_1_max + NUM_STDDEV_ABOVE * uniqDstIPs_1_stddev
    printDebugMsg(
        "upperThreshold_1_DstIPs = {0}".format(upperThreshold_1_DstIPs))

    upperThreshold_2_DstIPs = uniqDstIPs_2_max + NUM_STDDEV_ABOVE * uniqDstIPs_2_stddev
    printDebugMsg(
        "upperThreshold_2_DstIPs = {0}".format(upperThreshold_2_DstIPs))

    upperThreshold_0_DstPorts = uniqDstPorts_0_max + NUM_STDDEV_ABOVE * uniqDstPorts_0_stddev
    printDebugMsg(
        "upperThreshold_0_DstPorts = {0}".format(upperThreshold_0_DstPorts))

    upperThreshold_1_DstPorts = uniqDstPorts_1_max + NUM_STDDEV_ABOVE * uniqDstPorts_1_stddev
    printDebugMsg(
        "upperThreshold_1_DstPorts = {0}".format(upperThreshold_1_DstPorts))

    upperThreshold_2_DstPorts = uniqDstPorts_2_max + NUM_STDDEV_ABOVE * uniqDstPorts_2_stddev
    printDebugMsg(
        "upperThreshold_2_DstPorts = {0}".format(upperThreshold_2_DstPorts))

    # Combined upper threshold is the Pythagorean (Euclidean) norm in 3 dimensions:
    # s^2 = x^2 + y^2 + z^2, i.e. s = sqrt(x^2 + y^2 + z^2)
    threshold_0 = sqrt(upperThreshold_0_Bytes**2 + upperThreshold_0_DstIPs**2 +
                       upperThreshold_0_DstPorts**2)
    threshold_1 = sqrt(upperThreshold_1_Bytes**2 + upperThreshold_1_DstIPs**2 +
                       upperThreshold_1_DstPorts**2)
    threshold_2 = sqrt(upperThreshold_2_Bytes**2 + upperThreshold_2_DstIPs**2 +
                       upperThreshold_2_DstPorts**2)

    printDebugMsg("threshold_0 = {0}".format(threshold_0))
    printDebugMsg("threshold_1 = {0}".format(threshold_1))
    printDebugMsg("threshold_2 = {0}".format(threshold_2))

    # Combine everything needed to return values.
    sumOfBytes_0 = (sumOfBytes_0_max, sumOfBytes_Orig_mean,
                    sumOfBytes_Orig_stddev)
    sumOfBytes_1 = (sumOfBytes_1_max, sumOfBytes_Orig_mean,
                    sumOfBytes_Orig_stddev)
    sumOfBytes_2 = (sumOfBytes_2_max, sumOfBytes_Orig_mean,
                    sumOfBytes_Orig_stddev)
    uniqDstIPs_0 = (uniqDstIPs_0_max, uniqDstIPs_Orig_mean,
                    uniqDstIPs_Orig_stddev)
    uniqDstIPs_1 = (uniqDstIPs_1_max, uniqDstIPs_Orig_mean,
                    uniqDstIPs_Orig_stddev)
    uniqDstIPs_2 = (uniqDstIPs_2_max, uniqDstIPs_Orig_mean,
                    uniqDstIPs_Orig_stddev)
    uniqDstPorts_0 = (uniqDstPorts_0_max, uniqDstPorts_Orig_mean,
                      uniqDstPorts_Orig_stddev)
    uniqDstPorts_1 = (uniqDstPorts_1_max, uniqDstPorts_Orig_mean,
                      uniqDstPorts_Orig_stddev)
    uniqDstPorts_2 = (uniqDstPorts_2_max, uniqDstPorts_Orig_mean,
                      uniqDstPorts_Orig_stddev)

    # Update global variables.
    global kmeansModel
    global clusterCenters
    global sumOfBytes_stats
    global uniqDstIPs_stats
    global uniqDstPorts_stats
    global thresholds
    kmeansModel = model
    clusterCenters = centers
    sumOfBytes_stats = [sumOfBytes_0, sumOfBytes_1, sumOfBytes_2]
    uniqDstIPs_stats = [uniqDstIPs_0, uniqDstIPs_1, uniqDstIPs_2]
    uniqDstPorts_stats = [uniqDstPorts_0, uniqDstPorts_1, uniqDstPorts_2]
    thresholds = (threshold_0, threshold_1, threshold_2)
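
# A minimal sketch (not from the source) of how the saved stats and thresholds
# might be applied to a new record: scale each feature with the original
# mean/stddev, pick the nearest cluster centre, and flag the record when the
# Euclidean norm of the scaled features exceeds that cluster's threshold.
# The feature order, the shape of clusterCenters and the detection rule itself
# are assumptions here.
def is_anomalous(sumOfBytes, uniqDstIPs, uniqDstPorts):
    # Indices [0][1] and [0][2] are the original mean/stddev, shared by all clusters above.
    scaled = [
        (sumOfBytes - sumOfBytes_stats[0][1]) / sumOfBytes_stats[0][2],
        (uniqDstIPs - uniqDstIPs_stats[0][1]) / uniqDstIPs_stats[0][2],
        (uniqDstPorts - uniqDstPorts_stats[0][1]) / uniqDstPorts_stats[0][2],
    ]
    # Nearest cluster centre by squared Euclidean distance.
    cluster = min(range(len(clusterCenters)),
                  key=lambda k: sum((s - c) ** 2
                                    for s, c in zip(scaled, clusterCenters[k])))
    magnitude = sqrt(sum(s ** 2 for s in scaled))
    return magnitude > thresholds[cluster]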
Example #39
iotmsgsRDD = sqlContext.read.json(js)
iotmsgsRDD.registerTempTable("iotmsgsTable")

print("JSON converted to DataFrame of casted floating point numbers")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
  from iotmsgsTable order by temperature desc").show()

print("DataFrame showing automated 'describe' summary of floating points")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
  from iotmsgsTable order by temperature desc").describe().show()

print("DataFrame of selected SQL dataframe functions")
temperatureDF = sqlContext.sql("select distinct cast(payload.data.temperature \
  as float) from iotmsgsTable order by temperature desc")
functionsDF = temperatureDF.select([mean('temperature'), min('temperature'), \
  max('temperature')])
print(type(functionsDF))
print(functionsDF)
functionsDF.printSchema()
functionsDF.show()

# Collect a List of Rows of data from the DataFrame
print("Extracted List of Rows of selected SQL dataframe function")
functionsList = temperatureDF.select([mean('temperature'), min('temperature'), \
  max('temperature')]).collect()
print(type(functionsList))
print(functionsList)
print()

# Collect max temperature from Row #1 of the DataFrame
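# The snippet is truncated here; a likely continuation (an assumption, not from
# the source, and assuming mean/min/max come from pyspark.sql.functions) would
# read the max value out of the single collected Row, e.g.:
maxTemperature = functionsList[0]['max(temperature)']   # or functionsList[0][2]
print(maxTemperature)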
Example #40
# Read Data from HDFS
df = spark.read.csv('/tmp/fajar/bank-class.csv', inferSchema=False, header=True)

# Column rename and type casting
df = df.withColumn("LIMIT_BAL", df["LIMIT_BAL"].cast(IntegerType()))
df = df.withColumn("AGE", df["AGE"].cast(IntegerType()))
df = df.withColumn("DELAY1", df["DELAY1"].cast(IntegerType()))
df = df.withColumn("DELAY2", df["DELAY2"].cast(IntegerType()))
df = df.withColumnRenamed("TARGET", "label")                        # Change column name 'TARGET' to 'label' to ease modelling purpose
df = df.withColumn("label", df["label"].cast(IntegerType()))
df = df.withColumn("BILLING1", df["BILLING1"].cast(DoubleType()))
df = df.withColumn("BILLING2", df["BILLING2"].cast(DoubleType()))
df = df.withColumn("PAYMENT1", df["PAYMENT1"].cast(DoubleType()))
df = df.withColumn("PAYMENT2", df["PAYMENT2"].cast(DoubleType()))

LIMIT_BAL_means = df.select(mean("LIMIT_BAL")).collect()[0][0]
df = df.na.fill({"LIMIT_BAL": LIMIT_BAL_means})
df = df.na.drop(subset=("MARITAL", "EDUCATION", "SEX"))

# String to Index Conversion
MARITALindexer  = StringIndexer(inputCol="MARITAL", outputCol="MARITALindex")
EDUCATIONindexer= StringIndexer(inputCol="EDUCATION", outputCol="EDUCATIONindex")
SEXindexer      = StringIndexer(inputCol="SEX", outputCol="SEXindex")


# Index to Binary Vector Conversion
MARITALencoder  = OneHotEncoder(inputCol="MARITALindex", outputCol="MARITALvec")
EDUCATIONencoder= OneHotEncoder(inputCol="EDUCATIONindex", outputCol="EDUCATIONvec")
SEXencoder      = OneHotEncoder(inputCol="SEXindex", outputCol="SEXvec")

# Create features vector
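# The example is cut off at the "Create features vector" comment; a typical
# continuation (an assumption, not part of the source) assembles the encoded
# and numeric columns into a single features vector via a Pipeline:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["LIMIT_BAL", "AGE", "DELAY1", "DELAY2",
               "BILLING1", "BILLING2", "PAYMENT1", "PAYMENT2",
               "MARITALvec", "EDUCATIONvec", "SEXvec"],
    outputCol="features")
pipeline = Pipeline(stages=[MARITALindexer, EDUCATIONindexer, SEXindexer,
                            MARITALencoder, EDUCATIONencoder, SEXencoder,
                            assembler])
df_features = pipeline.fit(df).transform(df)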
Example #41
#to find the mode of embarked column

titanic_df.groupBy("Embarked").count().orderBy("count", ascending=False).show()
mode_embarked = titanic_df.groupBy("Embarked").count().orderBy(
    "count", ascending=False).first()[0]
print(mode_embarked)

#to fill the null values of the Embarked column with its mode
# titanic_df=titanic_df.fillna(mode_embarked,subset=['Embarked'])

#to count the missing values
# for c in titanic_df.columns:
#     print(c,titanic_df.filter(col(c).isNull()).count())

#to find the mean of age column
titanic_df.select(mean('Age')).show()
mean_age = titanic_df.select(mean('Age')).first()[0]
print(int(mean_age))

#to fill the null values of age column with mean of age
# titanic_df=titanic_df.fillna(mean_age,subset=['Age'])

#to count the missing values
# for c in titanic_df.columns:
#     print(c,titanic_df.filter(col(c).isNull()).count())

#APRIL 29TH
#or
titanic_df = titanic_df.fillna({
    "Age": int(mean_age),
    "Embarked": mode_embarked
Example #42
def mapper(word):
        return (word[0:4], word[4:10], word[10:15], word[15:23], word[23:27], word[27:28], word[28:34], word[34:41], word[41:46], word[46:51], word[51:56], word[56:60], word[60:63], word[63:64], word[64:65], word[65:69], word[69:70], word[70:75], word[75:76], word[76:77], word[77:78], word[78:84], word[84:85], word[85:86], word[86:87], word[87:92], word[92:93], word[93:98], word[98:99], word[99:104], word[104:105])

def reducer(x,y):
        return x+y

if __name__ == "__main__":
    start_time = time.time()
    sc = SparkContext(appName="PySparkClimate")
#     f = open("1980.txt", "w")
    # for fileName in glob.glob("/home/DATA/NOAA_weather/1980/*.gz"):
    lines = sc.textFile("/home/DATA/NOAA_weather/1980/325960-99999-1980.gz", 1)
    counts = lines.flatMap(lambda x: x.splitlines()) \
                    .map(mapper) # \
                    # .reduceByKey(reducer)
    output = counts.collect()
    end_time = time.time()
    print("TIME OF PROGRAM: ",end_time - start_time)
    spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    df = spark.createDataFrame(output, ['VARS', 'USAF_CAT_ID', 'NCDC_WBAN_ID', 'OBSV_DATE', 'OBSV_TIME', 'OBSV_FLAG', 'OBSV_LAT', 'OBSV_LONG', 'RPT_TYPE', 'OBSV_ELEV', 'STAT_LET_ID', 'OBSV_QC_PROC', 'OBSV_ANG', 'OBSV_QUAL_CODE', 'OBSV_TYPE', 'OBSV_SPEED', 'OBSV_SPEED_QUAL_CODE', 'OBSV_CEIL_H', 'OBSV_CEIL_QUAL_CODE', 'OBSV_CEIL_DET_CODE', 'CAVOK_CODE', 'VIS_D', 'VIS_D_QUAL', 'VIS_VAR', 'VIS_QUAL_VAR_CODE', 'AIR_TMP', 'AIR_TMP_Q', 'AIR_TMP_DEW', 'AIR_TMP_DEW_QUAL', 'SEA_PRESS', 'SEA_PRESS_QC'])
    # df.describe(['AIR_TMP']).show()
    df = df.where(F.col('AIR_TMP') != 9999)
    stats = df.select(F.mean(F.col('AIR_TMP')).alias('mean')).collect()
    print(stats[0]['mean'])
#     f.write(str(stats[0]['mean']) + ",")
#     f.close()
    sc.stop()   
Example #43
def summary(df, datatypes=None):
    spark = df.sql_ctx
    types = {x.name: x.dataType for x in list(df.schema)}

    #filter datatypes
    if datatypes is not None:
        types = {
            k: v
            for k, v in types.items()
            if any([x in datatypes
                    for x in [v, str(v), v.simpleString()]])
        }

    res = pd.DataFrame.from_dict(types, orient='index')
    res.columns = ['datatype']

    count = df.count()
    res['count'] = count

    d = df.select([F.approx_count_distinct(c).alias(c)
                   for c in df.columns]).toPandas().T
    d.columns = ['approx_distinct']
    d.index.name = 'index'
    res = res.join(d)

    res['unique_ratio'] = res['approx_distinct'] / count

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.mean(c).alias(c)]
        else:
            sel += [F.min(F.lit(None)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['mean']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.min(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['min']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.max(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['max']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([
        F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns
    ]).toPandas().T
    d.columns = ['null']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.count(F.when(F.isnan(c), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['nan']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.StringType)):
            sel += [F.count(F.when(F.col(c).isin(''), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['empty']
    d.index.name = 'index'
    res = res.join(d)

    return res
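
# A short usage sketch (not part of the source). The datatypes filter matches
# the DataType object, str(DataType), or DataType.simpleString(), so plain
# names such as 'double' or 'string' work.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([(1.0, 'a'), (float('nan'), '')], ['x', 's'])
print(summary(demo))                          # profile every column
print(summary(demo, datatypes=['double']))    # numeric columns only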
Example #44
from pyspark import SparkContext
from csv import reader
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql import functions as F

sc = SparkContext()
sqc = SQLContext(sc)
df = sqc.read.options(
    header='true', inferschema='true', sep='\t',
    encoding='ISO-8859-1').csv('/user/hm74/NYCOpenData/erm2-nwe9.tsv.gz')
timeFmt = "MM/dd/yyyy HH:mm:ss"
timeDiff = (F.unix_timestamp(col('Closed Date'), format=timeFmt) -
            F.unix_timestamp(col('Created Date'), format=timeFmt))
df = df.withColumn("ResponseTimeHrs", timeDiff / 3600.0)
df.groupBy("Incident Zip").agg(F.mean('ResponseTimeHrs')).show()
df.where(col('Borough') == 'BROOKLYN').where(
    col('Complaint Type') ==
    'Root/Sewer/Sidewalk Condition').count()  # or all boroughs
Example #45
sc = SparkContext(conf = conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')

# Correlation is a normalized measure of covariance that is easier to understand, 
# as it provides quantitative measurements of the statistical dependence between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')
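
# A small check (not from the source) of the relationship the comments describe:
# correlation is covariance normalised by the two standard deviations,
# corr(x, y) = cov(x, y) / (stddev(x) * stddev(y)).
from pyspark.sql.functions import stddev

cov = df.stat.cov('rand1', 'rand2')
std1, std2 = df.select(stddev('rand1'), stddev('rand2')).first()
print(cov / (std1 * std2))   # matches df.stat.corr('rand1', 'rand2')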
Example #46
"""##  Cast the String datatype to Integer/Float"""

from pyspark.sql.types import IntegerType, FloatType

bankProspectsDF2 = bankProspectsDF1.withColumn(
    "age", bankProspectsDF1["age"].cast(IntegerType())).withColumn(
        "salary", bankProspectsDF1["salary"].cast(FloatType()))
"""## Replace Age and Salary with average values of their respective column

import mean from sql.fuctions
"""

from pyspark.sql.functions import mean
"""### Calculate "mean" value of the age"""

mean_age_val = bankProspectsDF2.select(mean(bankProspectsDF2['age'])).collect()

mean_age = mean_age_val[0][0]
"""### Calculate mean salary value"""

mean_salary_val = bankProspectsDF2.select(mean(
    bankProspectsDF2['salary'])).collect()

mean_salary = mean_salary_val[0][0]
"""### Replace missing age with average value"""

bankbankProspectsDF3 = bankProspectsDF2.na.fill(mean_age, ["age"])
"""### Replace missing age with salary value"""

bankbankProspectsDF4 = bankbankProspectsDF3.na.fill(mean_salary, ["salary"])
"""## Write the transformed file to a new csv file"""
Example #47
def makeMapping(firstTime):
    global routes
    grpString = "ORIGIN_AIRPORT_ID","ORIGIN_CITY_NAME","ORIGIN","DEST_AIRPORT_ID","DEST_CITY_NAME","DEST","UNIQUE_CARRIER_NAME"
    if firstTime:
        routes = routes.groupBy(*grpString).agg(sum("PASSENGERS").alias("PASSENGERS"),sum("DEPARTURES_PERFORMED").alias("DEPARTURES_PERFORMED"),mean("RAMP_TO_RAMP").alias("RAMP_TO_RAMP"))
    for i in routes.collect():
        if not dictAir.get("Airport{}".format(i[0])):
            initNode(i[0],(i[1],i[2]),i[8])
        if not dictAir.get("Airport{}".format(i[3])):
            initNode(i[3],(i[4],i[5]),0)
        if (i[9]!=9876543.21):
            tripTime =i[9]
            getApt(i[0])['depts'] += i[8]
            sourceCNX = getApt(i[0])['cnx']
            sourceCNX.append((int(i[3]),tripTime,i[6]))
Example #48
import findspark
findspark.init(r'C:\spark-3.0.0-preview2-bin-hadoop2.7')

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("missingdata").getOrCreate()

df = spark.read.csv("ContainsNull.csv", header=True, inferSchema=True)
df.show()

# Drop
df.na.drop().show()
df.na.drop(
    thresh=2).show()  # keep only rows that have at least 2 non-null values
df.na.drop(subset=["Sales"]).show()
df.na.drop(how='any').show()
df.na.drop(how='all').show()

# Fill with "string"
df.na.fill('NEW VALUE').show()
df.na.fill(0).show()
df.na.fill('No Name', subset=['Name']).show()

# Fill with mean
from pyspark.sql.functions import mean
mean_val = df.select(mean(df['Sales'])).collect()
mean_val[0][0]
mean_sales = mean_val[0][0]
df.na.fill(mean_sales, ["Sales"]).show()
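
# A side note (not from the source): na.fill also accepts a dict mapping column
# names to replacement values, so the string and numeric fills above can be
# combined in one call.
df.na.fill({'Sales': mean_sales, 'Name': 'No Name'}).show()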
Example #49
    if not l:
        return empty_value
    counts = defaultdict(int)
    for value in l:
        counts[value] += 1
    counts = [(v, k) for (k, v) in counts.items()]
    counts.sort()
    return counts[-1][1]


MAIN_SUMMARY_FIELD_AGGREGATORS = [
    F.sum('aborts_content').alias('aborts_content_sum'),
    F.sum('aborts_gmplugin').alias('aborts_gmplugin_sum'),
    F.sum('aborts_plugin').alias('aborts_plugin_sum'),
    # active_addons
    F.mean('active_addons_count').alias('active_addons_count_mean'),
    F.first('active_experiment_branch').alias('active_experiment_branch'),
    F.first('active_experiment_id').alias('active_experiment_id'),
    # active_theme
    F.sum(F.expr('active_ticks/(3600.0/5)')).alias('active_hours_sum'),
    F.first('addon_compatibility_check_enabled').alias(
        'addon_compatibility_check_enabled'),
    F.first('app_build_id').alias('app_build_id'),
    F.first('app_display_version').alias('app_display_version'),
    F.first('app_name').alias('app_name'),
    F.first('app_version').alias('app_version'),
    # attribution
    F.first('blocklist_enabled').alias('blocklist_enabled'),
    F.first('channel').alias('channel'),
    F.first('city').alias('city'),
    F.first('country').alias('country'),
Example #50
# In[39]:

df.select(avg('char_38').alias('Avrage_of_char38')).show()

# In[40]:

from pyspark.sql.functions import format_number

# In[41]:

stdd = df.select(stddev('char_38').alias('std'))

# In[42]:

stdd.select(format_number('std', 2).alias('std')).show()

# In[99]:

#missing data

# In[46]:

from pyspark.sql.functions import mean
mean_val = df.select(mean(df['char_38'])).collect()

# In[47]:

df.na.fill(df.select(mean(df['char_38'])).collect()[0][0], ['char_38']).show()

# In[ ]:
Example #51
        status_stations_df.docksAvailable,
        weather_df.date,
        weather_df.time,
        weather_df.temperature,
        weather_df.humidity,
        weather_df.pressure,
        weather_df.visibility,
        weather_df.precipIntensity,
        weather_df.windSpeed
    ))

status_joined_df.show()

stats_df = (status_joined_df
    .agg(
        F.mean(status_joined_df.temperature).alias("avgTemp"),
        F.mean(status_joined_df.humidity).alias("avgHumidity"),
        F.mean(status_joined_df.pressure).alias("avgPressure"),

        F.stddev(status_joined_df.temperature).alias("stddevTemp"),
        F.stddev(status_joined_df.humidity).alias("stddevHumidity"),
        F.stddev(status_joined_df.pressure).alias("stddevPressure"),
        F.stddev(status_joined_df.visibility).alias("stddevVisibility"),
        F.stddev(status_joined_df.precipIntensity).alias("stddevPrecipitation"),
        F.stddev(status_joined_df.windSpeed).alias("stddevWindSpeed")))

stats_df.write.mode('overwrite').parquet("hdfs://hadoop:9000/models/weather-stats")

stats = stats_df.collect()[0]

print "Statistics: %s" % (stats,)
Example #52
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
Example #53
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                variance(col(column)).alias("variance"),
                kurtosis(col(column)).alias("kurtosis"),
                stddev(col(column)).alias("std"),
                skewness(col(column)).alias("skewness"),
                df_sum(col(column)).alias("sum")).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                df_sum(col(column)).alias("sum")).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(
                variance_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(
                skewness_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(
                kurtosis_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
                "percentile_approx({col},CAST({n} AS DOUBLE))".format(
                    col=column, n=x)).toPandas().ix[:, 0])
        stats = stats_df.ix[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column).na.drop().select(
            df_abs(col(column) - stats["mean"]).alias("delta")).agg(
                df_sum(col("delta"))).toPandas().ix[0, 0] /
                        float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"],
                                     bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.95,
                                    top=0.9,
                                    bottom=0.1,
                                    wspace=0,
                                    hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(
            base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
Example #54
 # store data
 columnindex_categorical_stores = columnindex(header_stores,
                                              fields_categorical_stores)
 schema_stores = construct_schema(fields_categorical_stores, header_stores)
 df_stores_rdd = stores_d_file.filter(lambda l: header_stores_original[
     0] not in l).map(lambda p: convert_string_to_numeric_df(
         p.split(","), columnindex_categorical_stores))
 df_stores = spark.createDataFrame(df_stores_rdd, schema_stores)
 ## split stores into control and treatment group according to store attributes (these attributes should be strongly related to store sales)
 store_group_col_names = ['avg_hhi', 'avg_traffic']
 store_group_col_names_std = [
     col_name + '_std' for col_name in store_group_col_names
 ]
 for col_name in ['avg_hhi', 'avg_traffic']:
     col_name_std = col_name + '_std'
     col_mean = df_stores.agg(F.mean(col(col_name))).collect()[0][0]
     col_std = df_stores.agg(F.stddev(col(col_name))).collect()[0][0]
     standardize_udf = udf(lambda x: (x - col_mean) / col_std, DoubleType())
     df_stores = df_stores.withColumn(col_name_std,
                                      standardize_udf(col(col_name)))
 store_group_col_index = columnindex(df_stores.columns,
                                     store_group_col_names_std)
 df_stores_rdd = df_stores.rdd.map(list)
 ## perform the clustering to cluster stores according to the attributes
 store_number = df_stores.count()
 if store_number <= 2:
     cluster_number = 1
 else:
     cluster_number = 3
 clusters = KMeans.train(df_stores_rdd.map(
     lambda p: [p[index] for index in store_group_col_index]),