Example #1
def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    step1 = product_processed_data[[category_column]]
    # Index the category strings by descending frequency
    stringIndexer = M.feature.StringIndexer(inputCol=category_column,
                                            outputCol=categoryIndex_column,
                                            handleInvalid="error",
                                            stringOrderType="frequencyDesc")
    # One-hot encode the category indices, keeping the last category
    OHencoder = M.feature.OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoryOneHot_column],
        dropLast=False)
    # Project the one-hot vectors onto their first 15 principal components
    pca_ = M.feature.PCA(inputCol=categoryOneHot_column,
                         outputCol=categoryPCA_column,
                         k=15)
    pipeline = Pipeline(stages=[stringIndexer, OHencoder, pca_])
    pipelineFit = pipeline.fit(step1)
    output = pipelineFit.transform(step1)
    meanVector_categoryOneHot = output.select(
        Summarizer.mean(output.categoryOneHot)).head()[0]
    meanVector_categoryPCA = output.select(Summarizer.mean(
        output.categoryPCA)).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [
            None,
        ],
        'meanVector_categoryPCA': [
            None,
        ]
    }
    # Modify res:
    res['count_total'] = output.count()
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
Example #2
def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.layers, [[122, 50, 2]]) \
            .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()
    print('Evaluating...')
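    # Split 80/20; cross-validation below is fit on the training split only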
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    # Fit on the training split so the held-out test split stays unseen
    fit_model = crossval.fit(w2v_train_df)
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()
    summarizer = Summarizer.metrics("mean", "count")
    # Summarize the 'pos' vector column for the rows labelled positive
    positives = predictions.filter(predictions.label == 1)
    positives.select(summarizer.summary(positives.pos)).show(truncate=False)
    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # model_evaluator was built with metricName="f1", so this is the F1 score
    accuracy = model_evaluator.evaluate(predictions)
    # Print the parameters of the best model found by cross-validation
    trainingSummary = fit_model.bestModel.stages[-1].extractParamMap()
    print(trainingSummary)

    return accuracy
Example #3
def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
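    # Index categories, one-hot encode the indices, then project to 15 PCA components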
    indexer = M.feature.StringIndexer(inputCol=category_column,
                                      outputCol=categoryIndex_column,
                                      handleInvalid="error")
    indexed_model = indexer.fit(product_processed_data).transform(
        product_processed_data)

    encoder = M.feature.OneHotEncoderEstimator(
        dropLast=False,
        inputCols=[categoryIndex_column],
        outputCols=[categoryOneHot_column])
    encoded_model = encoder.fit(indexed_model).transform(indexed_model)

    pca = M.feature.PCA(k=15,
                        inputCol=categoryOneHot_column,
                        outputCol=categoryPCA_column)
    pca_model = pca.fit(encoded_model).transform(encoded_model)

    summarizer = Summarizer.metrics("mean")

    count_total = pca_model.count()
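    # Column-wise means of the PCA and one-hot vector columns via Summarizer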
    meanVector_categoryPCA = pca_model.select(
        summarizer.summary(pca_model.categoryPCA)).head()[0][0]
    meanVector_categoryOneHot = pca_model.select(
        summarizer.summary(pca_model.categoryOneHot)).head()[0][0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [
            None,
        ],
        'meanVector_categoryPCA': [
            None,
        ]
    }
    # Modify res:
    res['count_total'] = count_total
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
Example #4
def basic_statistics():
    """Basic statistics."""

    df = sql.read.parquet(str(DATA_PARQUET))

    numeric = ['cost', 'call_duration_minutes', 'data_volume_mb']
    assemble = VectorAssembler(inputCols=numeric, outputCol='features')
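    # Drop rows with missing values and assemble the numeric columns into one vector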
    features = assemble.transform(df.dropna(subset=numeric + ['target']))

    # summarize
    summarize = Summarizer.metrics('mean', 'variance', 'count',
                                   'numNonZeros', 'max', 'min', 'normL2',
                                   'normL1')
    features.select(summarize.summary(
        features['features'])).show(truncate=False)

    # correlations
    r1 = Correlation.corr(features, 'features', 'pearson').head()[0]
    small = features.sample(fraction=0.1, seed=100500)
    r2 = Correlation.corr(small, 'features', 'spearman').head()[0]
Example #5
    def match_word_with_word_vector(self, clean_word_no_dup_df,
                                    word_vector_df):
        words_with_vector_df = self.assign_vector_to_words(
            clean_word_no_dup_df, word_vector_df)

        words_with_vector_df.persist()

        mismatched_words_matched_df = self.embed_vector_to_not_matched_words(
            words_with_vector_df, word_vector_df)

        complete_match_df = words_with_vector_df.where(
            col('word_vector').isNotNull()).union(mismatched_words_matched_df)

        # Sum the word vectors within each sentence to build a single sentence vector
        return complete_match_df.groupBy('sentence_id').agg(
            Summarizer.sum(
                col('word_vector')).alias('sentence_vector')).select(
                    'sentence_id', 'sentence_vector')
Example #6
    def summarize_artist_styles(self):
        # We need to use a `Summarizer` to be able to take
        # the average of a Vector-type column
        songs = self._generate_dataset() \
            .withColumn("artist", F.explode("artists.name")) \
            .groupBy("artist") \
            .agg(Summarizer.mean(F.col("features")).alias("average_song")) \
            .select("artist", "average_song")

        # Only keep track of some of the most popular artists;
        # there are far too many to realistically compare all of them
        dataset = self.spark \
            .read.json(KPOP_ARTISTS, multiLine=True) \
            .withColumnRenamed("name", "artist") \
            .select("artist", "popularity") \
            .join(songs, "artist") \
            .collect()

        for row in dataset:
            self._save_radar_plot(
                row["artist"],
                # DenseVector -> numpy.ndarray -> List[float]
                row["average_song"].toArray().tolist(),
                row["popularity"])
Example #7
from pyspark.sql import SparkSession
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sc = spark.sparkContext

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

# Create a summarizer for the metrics "mean" and "count"
summarizer = Summarizer.metrics("mean", "count")

df.show()
# Statistics for both metrics, with and without the weight column
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
# Single metric "mean", with and without the weight column
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)

spark.stop()
Example #8
            (Vectors.dense([4.0, 5.0, 0.0, 3.0])),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0])),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]))])

    rdd_data = more_data.map(lambda line: tuple([float(x) for x in line]))
    summary = Statistics.colStats(rdd_data)

    print("Mean:" + str(summary.mean()))  # a dense vector containing the mean value for each column
    print("Variance:" + str(summary.variance()))  # column-wise variance
    print("Non zeros:" + str(summary.numNonzeros()))
    print("Count:" + str(summary.count()))
    print("Min:" + str(summary.min()))
    print("Max:" + str(summary.max()))


    # Examples with Summarizer
    summarizer = Summarizer.metrics("mean", "count", "min", "max", "variance")

    # compute statistics for multiple metrics
    data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)

    # compute statistics for single metric "mean"
    data_frame.select(Summarizer.mean(data_frame.features)).show(truncate=False)

    spark_session.stop()





Example #9
from pyspark.sql import SparkSession, Row
from pyspark.ml.stat import Summarizer
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("SummarizerExample") \
        .getOrCreate()
    sc = spark.sparkContext

    # $example on$
    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

    # create summarizer for multiple metrics "mean" and "count"
    summarizer = Summarizer.metrics("mean", "count")

    # compute statistics for multiple metrics with weight
    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

    # compute statistics for multiple metrics without weight
    df.select(summarizer.summary(df.features)).show(truncate=False)

    # compute statistics for single metric "mean" with weight
    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

    # compute statistics for single metric "mean" without weight
    df.select(Summarizer.mean(df.features)).show(truncate=False)
    # $example off$

    spark.stop()
Example #10
####### Using correlation and Summarizer

#Select features
features = ["age", "charges", "customer_contacts", "attrition"]

va = VectorAssembler(inputCols=features, outputCol="features")  #Create VectorAssembler
featuresData = va.transform(rawData)  #Transform the original dataset to add the new vector column
featuresData.show(n=2)

#Calculate correlation and display
r1 = Correlation.corr(featuresData, "features", method = 'pearson').head()
print("Pearson correlation matrix:\n" + str(r1[0]))

#Calculate mean statistic for the list of features in order
summarizer = Summarizer.metrics("mean")
featuresData.select(summarizer.summary(featuresData.features)).show(truncate=False)

"""**Split the Spark Dataframe into Train and Test**"""

#Splitting dataframe with randomSplit
splits = rawData.randomSplit(weights=[.7, .3], seed=12345)

print("training obs count: ", splits[0].count())
print("test obs count: ", splits[1].count())

train = splits[0]
test = splits[1]

"""**Feature Engineering & Define Model**"""
Example #11
output.select("features").show(10, truncate=False)

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Summarizer

assembler = VectorAssembler(inputCols=[
    'year', 'population', 'labor_force', 'population_percent',
    'employed_total', 'employed_percent', 'agrictulture_ratio',
    'nonagriculture_ratio', 'unemployed', 'unemployed_percent', 'not_in_labor'
],
                            outputCol="features")

assembled = assembler.transform(employment_df)

# Build a summary Column computing "max" and "mean" over the feature vectors
summary_col = Summarizer.metrics("max", "mean").summary(assembled["features"])

assembled.select(summary_col).show(truncate=False)

# Column-wise variance of the assembled feature vectors
assembled.select(Summarizer.variance(assembled.features)).show(truncate=False)

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

assembler = VectorAssembler(inputCols=[
    "date", "day", "period", "nswprice", "nswdemand", "vicprice", "vicdemand",
    "transfer"
],
                            outputCol="features")

assembled = assembler.transform(electricity_df)