# Assumed imports for this task (typically provided by the grader template):
#   import pyspark.ml as M
#   from pyspark.ml import Pipeline
#   from pyspark.ml.stat import Summarizer
def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    step1 = product_processed_data[[category_column]]
    # Index the string categories, most frequent category first.
    stringIndexer = M.feature.StringIndexer(inputCol=category_column,
                                            outputCol=categoryIndex_column,
                                            handleInvalid="error",
                                            stringOrderType="frequencyDesc")
    # One-hot encode the indices, keeping the last category (dropLast=False).
    # Note: in Spark 3.x this class is named OneHotEncoder.
    OHencoder = M.feature.OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoryOneHot_column],
        dropLast=False)
    # Reduce the one-hot vectors to 15 principal components.
    pca_ = M.feature.PCA(inputCol=categoryOneHot_column,
                         outputCol=categoryPCA_column,
                         k=15)
    pipeline = Pipeline(stages=[stringIndexer, OHencoder, pca_])
    pipelineFit = pipeline.fit(step1)
    output = pipelineFit.transform(step1)
    # Summarizer.mean computes the element-wise mean of a Vector column.
    meanVector_categoryOneHot = output.select(
        Summarizer.mean(output[categoryOneHot_column])).head()[0]
    meanVector_categoryPCA = output.select(
        Summarizer.mean(output[categoryPCA_column])).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [None, ],
        'meanVector_categoryPCA': [None, ]
    }
    # Modify res:
    res['count_total'] = output.count()
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
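
# The two vector means above run as two separate jobs (plus a count()); a
# Summarizer.metrics aggregator can fetch everything in one pass. A minimal
# sketch, assuming the same `output` DataFrame and output column names as in
# task_6 (the names `summ` and `row` are illustrative):
from pyspark.ml.stat import Summarizer

summ = Summarizer.metrics("mean", "count")
row = output.select(
    summ.summary(output['categoryOneHot']).alias("onehot"),
    summ.summary(output['categoryPCA']).alias("pca")).head()
count_total = row["onehot"]["count"]
meanVector_categoryOneHot = row["onehot"]["mean"]
meanVector_categoryPCA = row["pca"]["mean"]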
def summarize_artist_styles(self):
    # We need to use a `Summarizer` to be able to take
    # the average of a Vector-type column
    songs = self._generate_dataset() \
        .withColumn("artist", F.explode("artists.name")) \
        .groupBy("artist") \
        .agg(Summarizer.mean(F.col("features")).alias("average_song")) \
        .select("artist", "average_song")

    # Only keep track of some of the most popular artists,
    # there's way too many to realistically compare all of them
    dataset = self.spark \
        .read.json(KPOP_ARTISTS, multiLine=True) \
        .withColumnRenamed("name", "artist") \
        .select("artist", "popularity") \
        .join(songs, "artist") \
        .collect()

    for row in dataset:
        self._save_radar_plot(
            row["artist"],
            # DenseVector -> numpy.ndarray -> List[float]
            row["average_song"].toArray().tolist(),
            row["popularity"])
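
# `_save_radar_plot` is not shown above. Purely as a hypothetical sketch of
# what such a helper could look like with matplotlib (the signature and the
# output directory are assumptions, not the original implementation):
import os

import matplotlib.pyplot as plt
import numpy as np


def _save_radar_plot(artist, features, popularity, out_dir="plots"):
    # One polar axis per audio feature; close the polygon by repeating
    # the first point.
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False).tolist()
    values = list(features) + [features[0]]
    angles = angles + [angles[0]]

    fig, ax = plt.subplots(subplot_kw={"polar": True})
    ax.plot(angles, values)
    ax.fill(angles, values, alpha=0.25)
    ax.set_title(f"{artist} (popularity: {popularity})")

    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(os.path.join(out_dir, f"{artist}.png"))
    plt.close(fig)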
from pyspark.sql import SparkSession
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

summarizer = Summarizer.metrics("mean", "count")

df.show()
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)

spark.stop()
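
# Because the second row carries weight 0.0, the weighted variants effectively
# see a single row. Expected results (this is the example shown in the Spark
# docs; the compact notation here is ours):
#   summarizer.summary(df.features, df.weight)  ->  [[1.0,1.0,1.0], 1]
#   summarizer.summary(df.features)             ->  [[1.0,1.5,2.0], 2]
#   Summarizer.mean(df.features, df.weight)     ->  [1.0,1.0,1.0]
#   Summarizer.mean(df.features)                ->  [1.0,1.5,2.0]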
# Assumed setup: the snippet begins mid-definition, so the imports, the
# session, and the opening of `more_data` are reconstructed here.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Summarizer
from pyspark.mllib.stat import Statistics

spark_session = SparkSession.builder.getOrCreate()

more_data = spark_session.sparkContext.parallelize([
    (Vectors.dense([4.0, 5.0, 0.0, 3.0])),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0])),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]))])

# Convert each Vector to a plain tuple of floats so the RDD-based
# mllib Statistics API can consume it.
rdd_data = more_data.map(lambda line: tuple([float(x) for x in line]))

summary = Statistics.colStats(rdd_data)
print("Mean:" + str(summary.mean()))  # a dense vector containing the mean value for each column
print("Variance:" + str(summary.variance()))  # column-wise variance
print("Non zeros:" + str(summary.numNonzeros()))
print("Count:" + str(summary.count()))
print("Min:" + str(summary.min()))
print("Max:" + str(summary.max()))

# Examples with Summarizer.
# `data_frame` is not defined in the original snippet; one plausible way to
# build a DataFrame with a Vector column named "features" from `more_data`:
data_frame = more_data.map(lambda v: (v,)).toDF(["features"])

# compute statistics for multiple metrics
summarizer = Summarizer.metrics("mean", "count", "min", "max", "variance")
data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)

# compute statistics for single metric "mean"
data_frame.select(Summarizer.mean(data_frame.features)).show(truncate=False)

spark_session.stop()
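
# Hand-checked expectations for the three vectors above (colStats reports the
# sample variance, i.e. an n-1 denominator):
#   Mean      ~ [6.33, 4.0, 0.0, 4.0]   # e.g. (4.0 + 6.0 + 9.0) / 3 for column 0
#   Variance  ~ [6.33, 13.0, 0.0, 13.0]
#   Non zeros =  [3.0, 2.0, 0.0, 3.0]
#   Count     =  3
#   Min       =  [4.0, 0.0, 0.0, 1.0]
#   Max       =  [9.0, 7.0, 0.0, 8.0]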
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("SummarizerExample") \
        .getOrCreate()
    sc = spark.sparkContext

    # $example on$
    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

    # create summarizer for multiple metrics "mean" and "count"
    summarizer = Summarizer.metrics("mean", "count")

    # compute statistics for multiple metrics with weight
    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

    # compute statistics for multiple metrics without weight
    df.select(summarizer.summary(df.features)).show(truncate=False)

    # compute statistics for single metric "mean" with weight
    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

    # compute statistics for single metric "mean" without weight
    df.select(Summarizer.mean(df.features)).show(truncate=False)
    # $example off$

    spark.stop()