import pyspark.ml as M
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer


def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    # Keep only the category column; the whole pipeline runs on it
    step1 = product_processed_data[[category_column]]
    stringIndexer = M.feature.StringIndexer(inputCol=category_column,
                                            outputCol=categoryIndex_column,
                                            handleInvalid="error",
                                            stringOrderType="frequencyDesc")
    inputs = stringIndexer.getOutputCol()
    # Spark 2.x API: OneHotEncoderEstimator was renamed OneHotEncoder in Spark 3
    OHencoder = M.feature.OneHotEncoderEstimator(inputCols=[inputs],
                                                 outputCols=[categoryOneHot_column],
                                                 dropLast=False)
    pca_ = M.feature.PCA(inputCol=categoryOneHot_column,
                         outputCol=categoryPCA_column,
                         k=15)
    pipeline = Pipeline(stages=[stringIndexer, OHencoder, pca_])
    pipelineFit = pipeline.fit(step1)
    output = pipelineFit.transform(step1)
    # Summarizer.mean aggregates a Vector column element-wise
    meanVector_categoryOneHot = output.select(
        Summarizer.mean(output[categoryOneHot_column])).head()[0]
    meanVector_categoryPCA = output.select(
        Summarizer.mean(output[categoryPCA_column])).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [
            None,
        ],
        'meanVector_categoryPCA': [
            None,
        ]
    }
    # Modify res:
    res['count_total'] = output.count()
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
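A quick way to smoke-test task_6 locally. Everything below is a hypothetical stand-in for the course harness (which supplies the real data_io and product DataFrame); note that PCA's k=15 requires at least 15 distinct categories in the input:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# 16 distinct categories so the one-hot vectors are wide enough for k=15
products = spark.createDataFrame(
    [("cat_%d" % (i % 16),) for i in range(64)], ["category"])

class StubIO:  # hypothetical stand-in for the grader's data_io
    def save(self, res, name):
        print(name, res["count_total"])

task_6(StubIO(), products)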
Example #2
    # Excerpt from a class: assumes `pyspark.sql.functions as F`,
    # `Summarizer` from pyspark.ml.stat, and the KPOP_ARTISTS JSON path
    # are available in the enclosing module
    def summarize_artist_styles(self):
        # We need to use a `Summarizer` to be able to take
        # the average of a Vector-type column
        songs = self._generate_dataset() \
            .withColumn("artist", F.explode("artists.name")) \
            .groupBy("artist") \
            .agg(Summarizer.mean(F.col("features")).alias("average_song")) \
            .select("artist", "average_song")

        # Only keep track of some of the most popular artists;
        # there are far too many to realistically compare them all
        dataset = self.spark \
            .read.json(KPOP_ARTISTS, multiLine=True) \
            .withColumnRenamed("name", "artist") \
            .select("artist", "popularity") \
            .join(songs, "artist") \
            .collect()

        for row in dataset:
            self._save_radar_plot(
                row["artist"],
                # DenseVector -> numpy.ndarray -> List[float]
                row["average_song"].toArray().tolist(),
                row["popularity"])
Example #3
from pyspark.sql import SparkSession
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sc = spark.sparkContext

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

# create a summarizer for the metrics "mean" and "count"
summarizer = Summarizer.metrics("mean", "count")

df.show()
# summary() computes every requested metric at once, with or without weights;
# a weight scales that row's contribution to the aggregates
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
# single-metric shortcut for "mean", weighted and unweighted
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)
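# To consume the metrics programmatically instead of printing them, unpack the
# summary struct returned by summary(); its fields are named after the
# requested metrics (a sketch using the same df and summarizer as above)
row = df.select(summarizer.summary(df.features).alias("s")).head()
mean_vector, count = row["s"]["mean"], row["s"]["count"]
print(mean_vector, count)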

spark.stop()
Example #4
    # Truncated excerpt: assumes an active `spark_session` plus `Vectors` and
    # `Statistics` from pyspark.mllib imported earlier in the original script
    more_data = spark_session.sparkContext.parallelize([
            (Vectors.dense([4.0, 5.0, 0.0, 3.0])),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0])),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]))])

    # colStats expects an RDD of vector-like rows; plain tuples of floats work
    rdd_data = more_data.map(lambda line: tuple([float(x) for x in line]))
    summary = Statistics.colStats(rdd_data)

    print("Mean:" + str(summary.mean()))  # a dense vector containing the mean value for each column
    print("Variance:" + str(summary.variance()))  # column-wise variance
    print("Non zeros:" + str(summary.numNonzeros()))
    print("Count:" + str(summary.count()))
    print("Min:" + str(summary.min()))
    print("Max:" + str(summary.max()))


    # Examples with Summarizer
    summarizer = Summarizer.metrics("mean", "count", "min", "max", "variance")
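    # `data_frame` is never built in this excerpt; a minimal sketch of what
    # the Summarizer calls below assume: the same vectors re-packed with
    # pyspark.ml.linalg, since Summarizer does not accept mllib vectors
    from pyspark.ml.linalg import Vectors as MLVectors
    data_frame = spark_session.createDataFrame(
        [(MLVectors.dense([4.0, 5.0, 0.0, 3.0]),),
         (MLVectors.dense([6.0, 7.0, 0.0, 8.0]),),
         (MLVectors.sparse(4, [(0, 9.0), (3, 1.0)]),)],
        ["features"])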

    # compute statistics for multiple metrics
    data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)

    # compute statistics for single metric "mean"
    data_frame.select(Summarizer.mean(data_frame.features)).show(truncate=False)

    spark_session.stop()
Example #5
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("SummarizerExample") \
        .getOrCreate()
    sc = spark.sparkContext

    # $example on$
    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

    # create summarizer for multiple metrics "mean" and "count"
    summarizer = Summarizer.metrics("mean", "count")

    # compute statistics for multiple metrics with weight
    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

    # compute statistics for multiple metrics without weight
    df.select(summarizer.summary(df.features)).show(truncate=False)

    # compute statistics for single metric "mean" with weight
    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

    # compute statistics for single metric "mean" without weight
    df.select(Summarizer.mean(df.features)).show(truncate=False)
    # $example off$

    spark.stop()