Example No. 1
def make_new_user_predictions(user_ratings_df):
    user_factors = get_user_factors(user_ratings_df)

    # print('user_factors: {}'.format(user_factors))

    predictions = np.dot(user_factors, item_factors.T)

    prediction_df = spark.createDataFrame(
        zip(item_ids.tolist(), predictions.tolist()),
        ['item', 'res_prediction'])

    res_prediction_stats_df = (prediction_df.agg(
        F.avg(F.col('res_prediction')).alias('avg_res_prediction'),
        F.stddev_samp(F.col('res_prediction')).alias('stddev_res_prediction')))

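    # Rescale the raw factor predictions to the residuals' scale, then add back
    # the average residual, the global average rating, and the per-item bias.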
    predicted_rating_df = (
        prediction_df.crossJoin(rating_stats_df).crossJoin(
            res_prediction_stats_df).crossJoin(residual_stats_df).join(
                item_bias_df, on='item').withColumn(
                    'prediction',
                    ((F.col('res_prediction') - F.col('avg_res_prediction')) *
                     F.col('stddev_residual') / F.col('stddev_res_prediction')
                     # / 2
                     + F.col('avg_residual') + F.col('avg_rating') +
                     F.col('item_bias'))
                    # * (1 - 1 / F.pow(F.col('count_item_rating'), 0.6))
                )
        # .filter(F.col('prediction') > 0)
    )

    predicted_rating_stats_df = (predicted_rating_df.agg(
        F.avg(F.col('prediction')).alias('avg_prediction'),
        F.stddev_samp(F.col('prediction')).alias('stddev_prediction')))

    # print('prediction_df')
    # prediction_df.show(truncate=False)

    print('predicted_rating_df')
    predicted_rating_df.show(truncate=False)

    print('residual_stats_df')
    residual_stats_df.show(truncate=False)

    print('res_prediction_stats_df')
    res_prediction_stats_df.show(truncate=False)

    print('rating_stats_df')
    rating_stats_df.show(truncate=False)

    print('predicted_rating_stats_df')
    predicted_rating_stats_df.show(truncate=False)

    return predicted_rating_df
Example No. 2
    def _fit(self, data):
        inputCol = self.getInputCol()
        outputCol = self.getOutputCol()

        mean, stddev = data.agg(avg(inputCol), stddev_samp(inputCol)).first()

        return ImputeNormalModel(
            mean=float(mean),
            stddev=float(stddev),
            inputCol=inputCol,
            outputCol=outputCol,
        )
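For context, here is a minimal sketch of what the companion transform could look like (hypothetical; the real ImputeNormalModel is not shown in this snippet), using the fitted mean and stddev to replace nulls with random normal draws:

# Hypothetical helper mirroring ImputeNormalModel: fill nulls in inputCol with
# draws from N(mean, stddev) and leave observed values unchanged.
from pyspark.sql import functions as F

def impute_normal_transform(data, mean, stddev, inputCol, outputCol):
    fill = F.lit(mean) + F.lit(stddev) * F.randn()
    return data.withColumn(outputCol,
                           F.coalesce(F.col(inputCol).cast('double'), fill))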
Example No. 3
    def standardize_column(self, column_name):
        def standardize_column_helper(mean, sd):
            return udf(lambda x: (x - mean) / sd if x is not None else x)

        mean = self._data_frame.select(F.mean(column_name)).collect()[0][0]
        StdDev = self._data_frame.select(
            F.stddev_samp(column_name)).collect()[0][0]
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_standardized",
            standardize_column_helper(mean, StdDev)(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_standardized",
            self._data_frame[column_name + "_fs_standardized"].cast('float'))
        return self._data_frame
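A UDF-free sketch of the same standardization (assuming the same F and col imports used by this class; native Column arithmetic keeps the computation in Spark and passes nulls through automatically):

    def standardize_column_native(self, column_name):
        # Hypothetical variant of standardize_column without a Python UDF.
        # Compute the mean and sample standard deviation in one aggregation.
        mean, sd = self._data_frame.agg(
            F.mean(column_name), F.stddev_samp(column_name)).first()
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_standardized",
            ((col(column_name) - mean) / sd).cast('float'))
        return self._data_frame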
Example No. 4
    def evaluate_demographics(self, target_file=[]):
        cur_demo = self.add_demo()
        from pyspark.sql.functions import udf
        udf_age = udf(lambda x: x.toArray().tolist()[0])
        cur_demo = cur_demo.withColumn("AGE", udf_age("demo_feature"))
        cur_target_file = self.spark.read.parquet(self.out_file_name)

        anal_df = cur_target_file.select("ID").distinct().join(cur_demo, "ID")
        from pyspark.sql.functions import avg, stddev_samp, count
        anal_df.groupBy().agg(avg("AGE"), stddev_samp("AGE")).show()

        self.logger.info(cur_target_file.count())

        cur_death = self.get_hospital_death()
        self.logger.info(anal_df.count())
        anal_df.join(cur_death, "ID").groupBy("IS_DEAD").agg(count("*")).show()
Example No. 5
    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return
Example No. 6
# _imports for the column expressions and statistics used below
from pyspark.sql.functions import col, mean, min, max, sum, stddev_pop, stddev_samp

# _read csv file
flightdata = spark.read.option('inferSchema',
                               'true').option('header',
                                              'true').csv('2015-summary.csv')
flightdata.show(5)
flightdata.printSchema()

# _add a new column using withColumn; show() only prints the updated DataFrame here, so in practice assign the result to a new DataFrame variable
flightdata.withColumn("newCol", col("count") + 10).show(4)

# _using select, we can also mention column names explicitly in place of *
flightdata_mod = flightdata.select("*", (col("count") + 20).alias("newCol2"))
flightdata_mod.show(5)

# _basic statistical functions
flightdata.select(mean("count")).show()
flightdata.select(min("count")).show()
flightdata.select(max("count")).show()
flightdata.select(stddev_pop("count")).show()
flightdata.select(stddev_samp("count")).show()

# _group by and aggregations
flightdata.groupBy("DEST_COUNTRY_NAME").agg(sum('count')).show(5)
dest_count_data = flightdata.groupBy("DEST_COUNTRY_NAME").agg({'count': 'sum'})

# _write the data to csv after coalesce
dest_count_data_merged = dest_count_data.coalesce(1)
dest_count_data_merged.write.format('csv').option('header',
                                                  'true').save('dest_country')
Example No. 7
dailyActivitiesDF.select(min("CaloriesBurned"), max("CaloriesBurned")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can do some basic statistical functions as well using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------
Example No. 8
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show(2)

#first, last, min, max
from pyspark.sql.functions import first, last, min, max
df.select(
    first("StockCode").alias("First_stock"), last("StockCode"),
    min("StockCode"), max("StockCode")).show(2)

#sum,sumDistinct, avg
from pyspark.sql.functions import sum, sumDistinct, avg
df.select(sum("Quantity"), sumDistinct("Quantity"), avg("Quantity")).show(2)

# sample variance, sample standard deviation
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_samp("Quantity"), stddev_samp("Quantity")).show(2)

# population variance, population standard deviation
from pyspark.sql.functions import var_pop, stddev_pop
df.select(var_pop("Quantity"), stddev_pop("Quantity")).show(2)

# skewness, kurtosis
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show(2)

# covariance and correlation
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity"),
          covar_samp("InvoiceNo", "Quantity")).show(2)

# aggregation of complex data types
Example No. 9
 def get_column_stdev(self, column):
     return self.spark_df.select(stddev_samp(col(column))).collect()[0][0]
Example No. 10
    def _transform(self, requests_df):
        '''
        Predicts the rating for requested users and restaurants.

        Parameters
        ==========
        requests_df         (pyspark.sql.DataFrame) Data used to request 
                            predictions of ratings. Columns are 'user' and
                            'item'. Values of 'user' and 'item' must be
                            numeric.

        Returns
        =======
        final_prediction_df (pyspark.sql.DataFrame) Predictions with 'user',
                            'item' and 'prediction'. Prediction will be a 
                            floating point number.

        '''

        # print('Transform starting!')

        start_time = time.monotonic()

        if self.useALS:
            self.prediction_df = self.recommender.transform(requests_df)

            self.prediction_stats_df = (
                self.prediction_df
                .dropna(how='all', subset=['prediction'])
                .agg(
                    F.avg(F.col('prediction')).alias('avg_prediction'),
                    F.stddev_samp(F.col('prediction')).alias('stddev_prediction')
                )
            )

            # print('prediction_df')
            # self.prediction_df.show()

            # print('prediction_stats_df')
            # self.prediction_stats_df.show()

            # print('rating_stats_df')
            # self.rating_stats_df.show()

            # print('residual_stats_df')
            # self.residual_stats_df.show()

            if self.useBias:
                final_prediction_df = (
                    self.prediction_df
                    .crossJoin(self.rating_stats_df)
                    # .crossJoin(self.prediction_stats_df)
                    # .crossJoin(self.residual_stats_df)
                    .join(self.user_bias_df, on='user')
                    .join(self.item_bias_df, on='item')
                    .fillna({
                        'user_bias': 0.0,
                        'item_bias': 0.0
                    })
                    .withColumn(
                        'prediction',
                        (
                            F.coalesce(
                                F.col('prediction')
                                # - F.col('avg_prediction')
                                , F.lit(0.0)
                            )
                            # * F.col('stddev_residual')
                            # / F.col('stddev_prediction')
                            # + F.col('avg_residual')
                            + F.col('avg_rating')
                            + F.col('user_bias')
                            + F.col('item_bias')
                        )
                        # * (1 - (1 / F.pow(F.col('count_item_rating')), self.lambda_3))
                    )
                   .select(
                        'user',
                        'item',
                        'rating',
                        'prediction'
                    )
                )

            else:
                final_prediction_df = (
                    self.prediction_df
                    .dropna(how='all', subset=['prediction'])
                    # .fillna({'prediction': F.col('avg_prediction')})
                    # .crossJoin(self.residual_stats_df)
                    # .crossJoin(self.prediction_stats_df)
                    # .withColumn(
                    #     'prediction',
                    #     (
                    #         F.col('prediction')
                    #         - F.col('avg_prediction')
                    #     )
                    #     * F.col('stddev_residual')
                    #     / F.col('stddev_prediction')
                    #     + F.col('avg_residual')
                    # )
                )
        else:
            final_prediction_df = (
                requests_df
                .crossJoin(self.rating_stats_df)
                .join(self.user_bias_df, on='user')
                .join(self.item_bias_df, on='item')
                .fillna({
                    'user_bias': 0.0,
                    'item_bias': 0.0
                })
                .withColumn(
                    'prediction',
                    F.col('avg_rating')
                    + F.col('user_bias')
                    + F.col('item_bias')
                )
                .select(
                    'user',
                    'item',
                    'rating',
                    'prediction'
                )
            )

        print('Transform done in {} seconds'.format(time.monotonic() - start_time))

        # print('final_prediction_df')
        # final_prediction_df.show()

        return final_prediction_df
Example No. 11
    def _fit(self, ratings_df):
        '''
        Fit ALS model using reviews as training data.

        Parameters
        ==========
        ratings_df      (pyspark.sql.DataFrame) Data used to train recommender
                        model. Columns are 'user', 'item', and 'rating'. Values
                        of user and item must be numeric. Values of rating
                        range from 1 to 5.

        Returns
        =======
        self
        '''
        # avg_rating_df = (
        #     ratings_df
        #     .groupBy()
        #     .avg(self.getRatingCol())
        #     .withColumnRenamed('avg({})'.format(self.getRatingCol()),
        #                        'avg_rating')
        # )

        # print('Fit starting!')

        start_time = time.monotonic()

        # print('ratings_df')
        # ratings_df.show()

        rating_stats_df = (
            ratings_df
            .agg(
                F.avg(self.getRatingCol()).alias('avg_rating'),
                F.stddev_samp(self.getRatingCol()).alias('stddev_rating')
            )
        )

        # print('ratings_stats_df:')
        # rating_stats_df.show()

        # if not self.getUseALS():
        #     self.setLambda_1(0.0)
        #     self.setLambda_2(0.0)

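        # Item bias: each item's average deviation from the global mean rating,
        # shrunk toward zero by dividing by (1 + its lambda_1-regularized
        # standard error), so sparsely rated items get smaller biases.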
        item_bias_df = (
            ratings_df
            .crossJoin(rating_stats_df)
            .withColumn(
                'diffs_item_rating',
                F.col(self.getRatingCol()) - F.col('avg_rating')
            )
            .groupBy(self.getItemCol())
            .agg(
                F.avg(F.col('diffs_item_rating')).alias('avg_diffs_item_rating'),
                F.nanvl(
                    F.stddev_samp(F.col('diffs_item_rating')),
                    F.lit(2.147483647E9)
                ).alias('stddev_diffs_item_rating'),
                F.count("*").alias('count_item_rating')
            )
            .withColumn(
                'stderr_diffs_item_rating',
                (self.getLambda_1() + F.col('stddev_diffs_item_rating'))
                / F.sqrt('count_item_rating')
            )
            .withColumn(
                'item_bias',
                F.col('avg_diffs_item_rating')
                / (1 +  F.col('stderr_diffs_item_rating'))
            )
            .select(
                self.getItemCol(),
                'item_bias',
                'avg_diffs_item_rating',
                'stderr_diffs_item_rating',
                'stddev_diffs_item_rating',
                'count_item_rating'
            )
        )

        # print('item_bias_df:')
        # item_bias_df.show(5)

        # item_bias_df.printSchema()

        # print('item_bias_df NaN')
        # item_bias_df.where(F.isnan("item_bias")).show()

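        # User bias: each user's average deviation from the global mean after
        # removing item bias, shrunk the same way using lambda_2.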
        user_bias_df = (
            ratings_df
            .crossJoin(rating_stats_df)
            .join(item_bias_df, on=self.getItemCol())
            .withColumn(
                'diffs_user_rating',
                F.col(self.getRatingCol()) - F.col('avg_rating') - F.col('item_bias')
            )
            .groupBy(self.getUserCol())
            .agg(
                F.avg(F.col('diffs_user_rating')).alias('avg_diffs_user_rating'),
                F.nanvl(
                    F.stddev_samp(F.col('diffs_user_rating')),
                    F.lit(2.147483647E9)
                ).alias('stddev_diffs_user_rating'),
                F.count("*").alias('count_user_rating')
            )
            .withColumn(
                'stderr_diffs_user_rating',
                (self.getLambda_2() + F.col('stddev_diffs_user_rating'))
                / F.sqrt('count_user_rating')
            )
            .withColumn(
                'user_bias',
                F.col('avg_diffs_user_rating')
                / (1 + F.col('stderr_diffs_user_rating'))
            )
            .select(
                self.getUserCol(),
                'user_bias',
                'avg_diffs_user_rating',
                'stderr_diffs_user_rating',
                'stddev_diffs_user_rating',
                'count_user_rating'
            )
        )

        # print('user_bias_df:')
        # user_bias_df.show(5)

        # print('user_bias_df NaN')
        # user_bias_df.where(F.isnan("user_bias")).show()

        if self.getUseALS():
            if self.getUseBias():
                residual_df = (
                    ratings_df
                    .crossJoin(rating_stats_df)
                    .join(user_bias_df, on=self.getUserCol())
                    .join(item_bias_df, on=self.getItemCol())
                    .withColumn(
                        self.getRatingCol(),
                        F.col(self.getRatingCol())
                        - F.col('avg_rating')
                        - F.col('user_bias')
                        - F.col('item_bias')
                    )
                    .select(
                        self.getUserCol(),
                        self.getItemCol(),
                        self.getRatingCol()
                    )
                )

            else:
                residual_df = ratings_df
                # self.setColdStartStrategy('drop')

            residual_stats_df = (
                residual_df
                .agg(
                    F.avg(F.col(self.getRatingCol())).alias('avg_residual'),
                    F.stddev(F.col(self.getRatingCol())).alias('stddev_residual')
                )
            )

            # print('residual_df')
            # residual_df.show()

            # print('residual_df NaN')
            # residual_df.where(F.isnan("rating")).show()

            # print('residual_stats_df')
            # residual_stats_df.show()

            als_model = ALS(
                rank=self.getRank(),
                maxIter=self.getMaxIter(),
                regParam=self.getRegParam(),
                numUserBlocks=self.getNumUserBlocks(),
                numItemBlocks=self.getNumItemBlocks(),
                implicitPrefs=self.getImplicitPrefs(),
                alpha=self.getAlpha(),
                userCol=self.getUserCol(),
                itemCol=self.getItemCol(),
                ratingCol=self.getRatingCol(),
                nonnegative=self.getNonnegative(),
                checkpointInterval=self.getCheckpointInterval(),
                intermediateStorageLevel=self.getIntermediateStorageLevel(),
                finalStorageLevel=self.getFinalStorageLevel()
            )

            recommender = als_model.fit(residual_df)

        else:
            recommender = None
            residual_stats_df = None

        print('Fit done in {} seconds'.format(time.monotonic() - start_time))

        return(
            RecommenderModel(
                self.getUseALS(), self.getUseBias(), self.getLambda_3(),
                # self.getColdStartStrategy(),
                recommender, rating_stats_df, residual_stats_df,
                user_bias_df, item_bias_df
            )
        )
Example No. 12
Often you will want to compute a metric over a set of values that share a common characteristic, like the average price of a house in a certain region. To achieve this, you would need to group the data by region and compute an aggregate metric on that subgroup of data.

We’ve already seen in the video a couple of these aggregation metrics, on landing/prices.csv. We’ll inspect a few more now and apply them to ~/workspace/mnt/data_lake/landing/purchased.csv. In particular, you’ll use the pyspark.sql aggregation functions avg() to compute the average value of some column in a group, stddev_samp() to compute the standard (sample) deviation, and max() (which we alias as sfmax so as not to shadow Python’s built-in max()) to retrieve the largest value of some column in a group.

Instructions

- Use the .groupBy() method to group the data by the “Country” column.
- In these groups, compute the average of the “Salary” column and name the resulting column “average_salary”.
- Compute the standard deviation of the “Salary” column in each group in the same aggregation.
- Retrieve the largest “Salary” in each group, in the same aggregation, and name the resulting column “highest_salary”.

'''
from pyspark.sql.functions import col, avg, stddev_samp, max as sfmax

aggregated = (purchased
              # Group rows by 'Country'
              .groupBy(col('Country'))
              .agg(
                  # Calculate the average salary per group
                  avg('Salary').alias('average_salary'),
                  # Calculate the standard deviation per group
                  stddev_samp('Salary'),
                  # Retain the highest salary per group
                  sfmax('Salary').alias('highest_salary')
              )
              )

aggregated.show()
Example No. 13
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
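For illustration, a hedged usage sketch (the surrounding Cortex ctx, aggregates list, and df are assumed here, not shown in the snippet): each returned Column is meant to be collected and evaluated in a single agg call.

# Hypothetical call site: build one Column per configured aggregate and run them together.
agg_cols = [get_builtin_aggregator_column(agg, ctx) for agg in aggregates]
results_row = df.agg(*agg_cols).collect()[0]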
Example No. 14
def getVol(df):
    #get volatility by day
    df_std = df.agg(func.stddev_samp(df.percent))
    df_std = df_std.withColumnRenamed("stddev_samp(percent)", "volatility")
    return df_std
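A slightly more defensive variant (a sketch; it aliases the aggregate directly instead of relying on Spark's auto-generated column name):

def get_vol(df):
    # Volatility as the sample standard deviation of the 'percent' column.
    return df.agg(func.stddev_samp(df.percent).alias("volatility"))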
Example No. 15
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"),
          stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()

Example No. 17
                        df['column1'] + df['column2'] * df['column3'])
    df3.show()

    print('--- Aggregations and quick statistics -------')
    # the CSV file has no header row, so we need to supply the column names
    ADULT_COLUMN_NAMES = [
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "sex",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income"
    ]
    # I downloaded the adult.data file from
    # https://archive.ics.uci.edu/ml/datasets/adult
    #  to my data folder and renamed it to adult.data.csv
    csv_df = session.read.csv('data/adult.data.csv',
                              header=False,
                              inferSchema=True)
    # we'll set the column names one by one in this loop
    for new_col, old_col in zip(ADULT_COLUMN_NAMES, csv_df.columns):
        csv_df = csv_df.withColumnRenamed(old_col, new_col)

    # quick descriptive statistics
    csv_df.describe().show()
    # get average work hours per age
    work_hours_df = csv_df.groupBy('age').agg(
        funcs.avg('hours_per_week'),
        funcs.stddev_samp('hours_per_week')).sort('age')
    work_hours_df.show(100)

    print('---- The End :) -----')
Example No. 18
encode = OneHotEncoder(inputCol="studentIdx", outputCol="studentclassVec")

# Let's apply the same procedure to the label(target) variable
# No need to apply onehot encoding to label (only string indexing is required)
label_StrIdx = StringIndexer(inputCol="default", outputCol="label")

# Build the first stages for the pipeline
stages = [strIdx, encode, label_StrIdx]

# For the numerical variables, let's transform them into standard-scaled variables
from pyspark.sql.functions import col, stddev_samp

numCols = ['income', 'balance']
for c in numCols:
    df = df.withColumn(c + "Scaled",
                       col(c) / df.agg(stddev_samp(c)).first()[0])
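Note that dividing by the sample standard deviation only rescales each column; it does not center it. A minimal sketch of full standardization (mean-centering plus scaling), assuming the same df and numCols and reusing the imports above:

from pyspark.sql.functions import avg

for c in numCols:
    # Hypothetical variant: subtract the mean as well, so the scaled column has
    # roughly zero mean and unit variance.
    mean_c, std_c = df.agg(avg(c), stddev_samp(c)).first()
    df = df.withColumn(c + "Scaled", (col(c) - mean_c) / std_c)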

# Finally, you can define the inputs for the model
# In this case, the vector of the categorical variables and the scaled numerical variables were assigned
inputs = ["studentclassVec", "incomeScaled", "balanceScaled"]

# As all input features need to be vectorized, the VectorAssembler function has to be used
assembler = VectorAssembler(inputCols=inputs, outputCol="features")

# Add the assembler to the previous stage
stages += [assembler]

# Put stages to build the Pipeline
# - The stage consists of string indexer, onehot encoder, scaler, and vector assembler
pipeline = Pipeline(stages=stages)
Example No. 19
 def std_dev(data_frame, measure_column_name):
     return data_frame.select(
         FN.stddev_samp(measure_column_name)).collect()[0][0]
Example No. 20
def get_baseline_scores(train_df, val_df, evaluator, eval_name):
    stats_rating_df = (
        train_df
        .agg(
            F.avg('rating').alias('avg_rating'),
            F.stddev_samp('rating').alias('stddev_rating')
        )
    )

    stats_row = stats_rating_df.head()

    print('[plot_scores Train] Avg: {}'.format(stats_row[0]))
    print('[plot_scores Train] Std Dev: {}'.format(stats_row[1]))

    # Naive model: random normal rating centered on average rating and scaled
    # with standard deviation of training data.
    train_predict_df = (
        train_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select(
            'user',
            'item',
            'rating',
            'prediction'
        )
    )

    val_predict_df = (
        val_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select(
            'user',
            'item',
            'rating',
            'prediction'
        )
    )

    naive_score_train = evaluator.evaluate(train_predict_df)
    naive_score_val = evaluator.evaluate(val_predict_df)

    print('Train Naive {} score: {}'.format(eval_name, naive_score_train))
    print('Validation Naive {} score: {}'.format(eval_name, naive_score_val))

    estimator = Recommender(
        lambda_1=0.0,
        lambda_2=0.0,
        lambda_3=0.0,
        useALS=False,
        useBias=True,
        userCol='user',
        itemCol='item',
        ratingCol='rating'
    )

    model = estimator.fit(train_df)
    baseline_score_train = evaluator.evaluate(model.transform(train_df))
    baseline_score_val = evaluator.evaluate(model.transform(val_df))

    print('Train Baseline {} score: {}'.format(eval_name, baseline_score_train))
    print('Validation Baseline {} score: {}'.format(eval_name, baseline_score_val))

    return (
        naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val
    )
Example No. 21
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)
Example No. 22
lh = l3.union(l4)

l1.groupBy('Level_1', 'Sex').count().sort('count',
                                          ascending=False).show(150,
                                                                truncate=False)
l2.groupBy('Level_2', 'Sex').count().sort('count',
                                          ascending=False).show(150,
                                                                truncate=False)
lh.groupBy('Level', 'Sex').count().sort('count',
                                        ascending=False).show(150,
                                                              truncate=False)

l1_t = l1.groupBy('Level_1').agg(
    F.mean('Age_1').alias('Mean'),
    F.count('Age_1').alias('Count'),
    F.stddev_samp('Age_1').alias('StdDev'))
l1_t.sort('Count', ascending=False).show(25, truncate=False)
l2_t = l2.groupBy('Level_2').agg(
    F.mean('Age_2').alias('Mean'),
    F.count('Age_2').alias('Count'),
    F.stddev_samp('Age_2').alias('StdDev'))
l2_t.sort('Count', ascending=False).show(25, truncate=False)
lh_t = lh.groupBy('Level').agg(
    F.mean('Age').alias('Mean'),
    F.count('Age').alias('Count'),
    F.stddev_samp('Age').alias('StdDev'))
lh_t.sort('Count', ascending=False).show(25, truncate=False)
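The three aggregations above differ only in the grouping and age columns; a small helper (a sketch, assuming the same F import) avoids the repetition:

def level_age_stats(df, level_col, age_col):
    # Mean, count, and sample standard deviation of age per level value,
    # sorted by how common each level is.
    return (df.groupBy(level_col)
              .agg(F.mean(age_col).alias('Mean'),
                   F.count(age_col).alias('Count'),
                   F.stddev_samp(age_col).alias('StdDev'))
              .sort('Count', ascending=False))

level_age_stats(l1, 'Level_1', 'Age_1').show(25, truncate=False)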

aRules = fpm.associationRules
associationRules = fpm.associationRules
freqItemsets = fpm.freqItemsets