Example no. 1
from pyspark.ml import Pipeline
from pyspark.ml.classification import (FMClassifier, GBTClassifier, LinearSVC,
                                       MultilayerPerceptronClassifier,
                                       RandomForestClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.stat import Summarizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics


# featurize() is defined elsewhere in the original project and is expected to
# return a DataFrame of labelled feature vectors.
def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.layers, [[122, 50, 2]]) \
            .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]])
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()
    else:
        # Fall back to an empty grid so paramGrid is always defined.
        paramGrid = ParamGridBuilder().build()
    print('Evaluating...')
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    fit_model = crossval.fit(w2v_train_df)  # fit on the training split only
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()
    summarizer = Summarizer.metrics("mean", "count")
    # Summarize the 'pos' vector column for the positive class only.
    positives = predictions.filter(predictions.label == 1)
    positives.select(summarizer.summary(positives.pos)).show(truncate=False)
    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    accuracy = model_evaluator.evaluate(predictions)  # the evaluator metric is "f1"
    best_params = fit_model.bestModel.stages[-1].extractParamMap()  # parameters of the best model
    print(best_params)

    return accuracy
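A minimal usage sketch for the function above, with hedged assumptions: the classifier and its featuresCol="w2v" are illustrative, and featurize() is presumed to emit the word2vec vectors under that name. The returned value is the cross-validated F1 score on the held-out split.

from pyspark.ml.classification import RandomForestClassifier

# Illustrative only: featuresCol="w2v" is an assumption about what featurize()
# produces for vectorizer="w2v".
rf = RandomForestClassifier(featuresCol="w2v", labelCol="label")
f1 = evaluate(rf, word_column="words", vectorizer="w2v")
print("Held-out F1: %s" % f1)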
Example no. 2

import pyspark.ml as M
from pyspark.ml.stat import Summarizer


def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    indexer = M.feature.StringIndexer(inputCol=category_column,
                                      outputCol=categoryIndex_column,
                                      handleInvalid="error")
    indexed_model = indexer.fit(product_processed_data).transform(
        product_processed_data)

    encoder = M.feature.OneHotEncoderEstimator(
        dropLast=False,
        inputCols=[categoryIndex_column],
        outputCols=[categoryOneHot_column])
    encoded_model = encoder.fit(indexed_model).transform(indexed_model)

    pca = M.feature.PCA(k=15,
                        inputCol=categoryOneHot_column,
                        outputCol=categoryPCA_column)
    pca_model = pca.fit(encoded_model).transform(encoded_model)

    summarizer = Summarizer.metrics("mean")

    count_total = pca_model.count()
    meanVector_categoryPCA = pca_model.select(
        summarizer.summary(pca_model.categoryPCA)).head()[0][0]
    meanVector_categoryOneHot = pca_model.select(
        summarizer.summary(pca_model.categoryOneHot)).head()[0][0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [
            None,
        ],
        'meanVector_categoryPCA': [
            None,
        ]
    }
    # Modify res:
    res['count_total'] = count_total
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
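A side note on the PCA stage above: chaining pca.fit(...).transform(...) discards the fitted PCAModel, which is the object that reports how much variance the k=15 components retain. A small hedged sketch reusing the variable names from the function above:

# Refit the PCA stage to inspect explained variance (illustrative only).
pca_fitted = pca.fit(encoded_model)
print(pca_fitted.explainedVariance)                  # DenseVector of length 15
print(pca_fitted.explainedVariance.toArray().sum())  # fraction of variance kept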
Example no. 3

import pyspark.ml as M
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer


def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    step1 = product_processed_data[[category_column]]
    stringIndexer = M.feature.StringIndexer(inputCol="category",
                                            outputCol="indexed_category",
                                            handleInvalid="error",
                                            stringOrderType="frequencyDesc")
    inputs = stringIndexer.getOutputCol()
    OHencoder = M.feature.OneHotEncoderEstimator(inputCols=[inputs],
                                                 outputCols=['categoryOneHot'],
                                                 dropLast=False)
    pca_ = M.feature.PCA(inputCol="categoryOneHot",
                         outputCol='categoryPCA',
                         k=15)
    pipeline = Pipeline(stages=[stringIndexer, OHencoder, pca_])
    pipelineFit = pipeline.fit(step1)
    output = pipelineFit.transform(step1)
    sum_mean = Summarizer.metrics("mean")
    meanVector_categoryOneHot = output.select(
        Summarizer.mean(output.categoryOneHot)).head()[0]
    meanVector_categoryPCA = output.select(Summarizer.mean(
        output.categoryPCA)).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [
            None,
        ],
        'meanVector_categoryPCA': [
            None,
        ]
    }
    # Modify res:
    res['count_total'] = output.count()
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
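A quick way to sanity-check the mean vectors from either task_6 variant: with dropLast=False, the mean of the one-hot column is simply the empirical frequency of each category. A hedged sketch that could sit just before the res dict in the second version (it reuses output and meanVector_categoryOneHot from that function):

# Component i of the one-hot mean equals the fraction of rows whose
# StringIndexer index is i (ordered by stringOrderType="frequencyDesc").
total = output.count()
freqs = [row["count"] / total
         for row in output.groupBy("indexed_category").count()
                          .orderBy("indexed_category").collect()]
print(freqs)                       # empirical category frequencies
print(meanVector_categoryOneHot)   # should match, up to floating-point noise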
Example no. 4
from pyspark.sql import SparkSession
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sc = spark.sparkContext

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

summarizer = Summarizer.metrics("mean", "count")

df.show()
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)
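The summary column is a struct, so the same values can also be collected on the driver rather than only shown; a small sketch continuing the example above (the struct fields come back in the order passed to Summarizer.metrics):

# row[0] is the struct produced by summarizer.summary(); its fields follow the
# order of Summarizer.metrics("mean", "count").
row = df.select(summarizer.summary(df.features, df.weight)).head()
mean_vec, count = row[0][0], row[0][1]
print(mean_vec, count)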

spark.stop()
Example no. 5
            (Vectors.dense([4.0, 5.0, 0.0, 3.0])),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0])),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]))])

    rdd_data = more_data.map(lambda line: tuple([float(x) for x in line]))
    summary = Statistics.colStats(rdd_data)

    print("Mean:" + str(summary.mean()))  # a dense vector containing the mean value for each column
    print("Variance:" + str(summary.variance()))  # column-wise variance
    print("Non zeros:" + str(summary.numNonzeros()))
    print("Count:" + str(summary.count()))
    print("Min:" + str(summary.min()))
    print("Max:" + str(summary.max()))


    # Examples with Summarizer
    summarizer = Summarizer.metrics("mean", "count", "min", "max", "variance")

    # compute statistics for multiple metrics
    data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)

    # compute statistics for single metric "mean"
    data_frame.select(Summarizer.mean(data_frame.features)).show(truncate=False)

    spark_session.stop()
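The excerpt above mixes the older RDD-based pyspark.mllib.stat.Statistics API with the DataFrame-based Summarizer. A self-contained sketch of the colStats half, using the same illustrative vectors and assuming an active SparkSession named spark_session:

from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.stat import Statistics

# colStats expects an RDD of pyspark.mllib.linalg vectors (or plain tuples).
vectors_rdd = spark_session.sparkContext.parallelize([
    MLLibVectors.dense([4.0, 5.0, 0.0, 3.0]),
    MLLibVectors.dense([6.0, 7.0, 0.0, 8.0]),
    MLLibVectors.sparse(4, [(0, 9.0), (3, 1.0)]),
])
col_stats = Statistics.colStats(vectors_rdd)
print("Mean: %s  Variance: %s  Count: %s"
      % (col_stats.mean(), col_stats.variance(), col_stats.count()))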





Example no. 6
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("SummarizerExample") \
        .getOrCreate()
    sc = spark.sparkContext

    # $example on$
    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

    # create summarizer for multiple metrics "mean" and "count"
    summarizer = Summarizer.metrics("mean", "count")

    # compute statistics for multiple metrics with weight
    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

    # compute statistics for multiple metrics without weight
    df.select(summarizer.summary(df.features)).show(truncate=False)

    # compute statistics for single metric "mean" with weight
    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

    # compute statistics for single metric "mean" without weight
    df.select(Summarizer.mean(df.features)).show(truncate=False)
    # $example off$

    spark.stop()
Example no. 7

####### Using correlation and Summarizer

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation, Summarizer

#Select features
features = ["age", "charges", "customer_contacts", "attrition"]

va = VectorAssembler(inputCols=features, outputCol="features")  #Create VectorAssembler
featuresData = va.transform(rawData)  #transform original dataset to include a new column of vectors
featuresData.show(n=2)

#Calculate correlation and display
r1 = Correlation.corr(featuresData, "features", method='pearson').head()
print("Pearson correlation matrix:\n" + str(r1[0]))

#Calculate mean statistic for the list of features in order
summarizer = Summarizer.metrics("mean")
featuresData.select(summarizer.summary(featuresData.features)).show(truncate=False)

"""**Split the Spark Dataframe into Train and Test**"""

#Split the dataframe with randomSplit
splits = rawData.randomSplit(weights=[.7, .3], seed=12345)

print("training obs count: ", splits[0].count())
print("test obs count: ", splits[1].count())

train = splits[0]
test = splits[1]

"""**Feature Engineering & Define Model**"""
Example no. 8
output.select("features").show(10, truncate=False)

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Summarizer

assembler = VectorAssembler(inputCols=[
    'year', 'population', 'labor_force', 'population_percent',
    'employed_total', 'employed_percent', 'agrictulture_ratio',
    'nonagriculture_ratio', 'unemployed', 'unemployed_percent', 'not_in_labor'
],
                            outputCol="features")

assembled = assembler.transform(employment_df)

summarizer = Summarizer.metrics("max", "mean").summary(assembled["features"])

assembled.select(summarizer).show(truncate=False)

assembled.select(Summarizer.variance(assembled.features)).show(truncate=False)

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

assembler = VectorAssembler(inputCols=[
    "date", "day", "period", "nswprice", "nswdemand", "vicprice", "vicdemand",
    "transfer"
],
                            outputCol="features")

assembled = assembler.transform(electricity_df)
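The snippet ends here; given the Correlation import above, the next step presumably computes the correlation matrix over the assembled column, along the lines of:

# Hedged continuation: Pearson correlation over the assembled feature vector.
corr = Correlation.corr(assembled, "features", "pearson").head()[0]
print("Pearson correlation matrix:\n" + str(corr))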