from pyspark.ml import Pipeline
from pyspark.ml.classification import (LinearSVC, GBTClassifier,
                                       RandomForestClassifier,
                                       MultilayerPerceptronClassifier,
                                       FMClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.stat import Summarizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics


def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)

    # Pick a (small) hyper-parameter grid based on the classifier type.
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.layers, [[122, 50, 2]]) \
            .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()

    print('Evaluating...')
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")

    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    # Fit on the training split only so the held-out split stays unseen.
    fit_model = crossval.fit(w2v_train_df)
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()

    summarizer = Summarizer.metrics("mean", "count")
    predictions.select(
        summarizer.summary(predictions.filter(
            predictions.label == 1).pos)).show(truncate=False)

    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())

    # Statistics for the positive class (label 1.0)
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    accuracy = model_evaluator.evaluate(predictions)
    trainingSummary = fit_model.bestModel.stages[-1].extractParamMap()
    print(trainingSummary)
    return accuracy
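# A hypothetical driver for evaluate(), included only to illustrate how the
# function above might be called; featurize(), the input data, and this list
# of classifiers are assumptions, not part of the original notebook.
if __name__ == "__main__":
    for clf in [LinearSVC(), RandomForestClassifier(), GBTClassifier()]:
        score = evaluate(clf, word_column="words", vectorizer="w2v")
        print("%s f1 = %s" % (type(clf).__name__, score))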
def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    # Index the string category, one-hot encode the index, then reduce the
    # one-hot vectors to 15 dimensions with PCA.
    indexer = M.feature.StringIndexer(inputCol=category_column,
                                      outputCol=categoryIndex_column,
                                      handleInvalid="error")
    indexed_model = indexer.fit(product_processed_data).transform(
        product_processed_data)

    encoder = M.feature.OneHotEncoderEstimator(
        dropLast=False,
        inputCols=[categoryIndex_column],
        outputCols=[categoryOneHot_column])
    encoded_model = encoder.fit(indexed_model).transform(indexed_model)

    pca = M.feature.PCA(k=15,
                        inputCol=categoryOneHot_column,
                        outputCol=categoryPCA_column)
    pca_model = pca.fit(encoded_model).transform(encoded_model)

    # Column-wise means of the OneHot and PCA vectors via Summarizer
    summarizer = Summarizer.metrics("mean")
    count_total = pca_model.count()
    meanVector_categoryPCA = pca_model.select(
        summarizer.summary(pca_model.categoryPCA)).head()[0][0]
    meanVector_categoryOneHot = pca_model.select(
        summarizer.summary(pca_model.categoryOneHot)).head()[0][0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [None, ],
        'meanVector_categoryPCA': [None, ],
    }
    # Modify res:
    res['count_total'] = count_total
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
def task_6(data_io, product_processed_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    category_column = 'category'
    # Outputs:
    categoryIndex_column = 'categoryIndex'
    categoryOneHot_column = 'categoryOneHot'
    categoryPCA_column = 'categoryPCA'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    # Same task as above, but with the StringIndexer -> OneHotEncoderEstimator
    # -> PCA chain wired together as a single Pipeline.
    step1 = product_processed_data[[category_column]]

    stringIndexer = M.feature.StringIndexer(inputCol="category",
                                            outputCol="indexed_category",
                                            handleInvalid="error",
                                            stringOrderType="frequencyDesc")
    inputs = stringIndexer.getOutputCol()
    OHencoder = M.feature.OneHotEncoderEstimator(inputCols=[inputs],
                                                 outputCols=['categoryOneHot'],
                                                 dropLast=False)
    pca_ = M.feature.PCA(inputCol="categoryOneHot",
                         outputCol='categoryPCA',
                         k=15)

    pipeline = Pipeline(stages=[stringIndexer, OHencoder, pca_])
    pipelineFit = pipeline.fit(step1)
    output = pipelineFit.transform(step1)

    sum_mean = Summarizer.metrics("mean")
    meanVector_categoryOneHot = output.select(
        Summarizer.mean(output.categoryOneHot)).head()[0]
    meanVector_categoryPCA = output.select(
        Summarizer.mean(output.categoryPCA)).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'meanVector_categoryOneHot': [None, ],
        'meanVector_categoryPCA': [None, ],
    }
    # Modify res:
    res['count_total'] = output.count()
    res['meanVector_categoryOneHot'] = meanVector_categoryOneHot
    res['meanVector_categoryPCA'] = meanVector_categoryPCA
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_6')
    return res
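# A small, self-contained sanity check (not part of the assignment code): the
# same StringIndexer -> OneHotEncoderEstimator -> PCA chain run on a toy
# DataFrame. It assumes a Spark 2.x session bound to `spark`, pyspark.ml
# imported as M, and Pipeline imported as in the functions above.
toy = spark.createDataFrame(
    [("books",), ("toys",), ("books",), ("games",)], ["category"])
toy_pipeline = Pipeline(stages=[
    M.feature.StringIndexer(inputCol="category", outputCol="categoryIndex"),
    M.feature.OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryOneHot"],
                                     dropLast=False),
    M.feature.PCA(k=2, inputCol="categoryOneHot", outputCol="categoryPCA"),
])
toy_pipeline.fit(toy).transform(toy).show(truncate=False)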
from pyspark.sql import SparkSession
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

summarizer = Summarizer.metrics("mean", "count")

df.show()
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)

spark.stop()
# the opening assignment is assumed: more_data is an RDD of the vectors below
more_data = spark_session.sparkContext.parallelize([
    (Vectors.dense([4.0, 5.0, 0.0, 3.0])),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0])),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]))])
rdd_data = more_data.map(lambda line: tuple([float(x) for x in line]))

summary = Statistics.colStats(rdd_data)
print("Mean:" + str(summary.mean()))  # a dense vector containing the mean value for each column
print("Variance:" + str(summary.variance()))  # column-wise variance
print("Non zeros:" + str(summary.numNonzeros()))
print("Count:" + str(summary.count()))
print("Min:" + str(summary.min()))
print("Max:" + str(summary.max()))

# Examples with Summarizer
summarizer = Summarizer.metrics("mean", "count", "min", "max", "variance")

# compute statistics for multiple metrics
data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)

# compute statistics for single metric "mean"
data_frame.select(Summarizer.mean(data_frame.features)).show(truncate=False)

spark_session.stop()
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("SummarizerExample") \
        .getOrCreate()
    sc = spark.sparkContext

    # $example on$
    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

    # create summarizer for multiple metrics "mean" and "count"
    summarizer = Summarizer.metrics("mean", "count")

    # compute statistics for multiple metrics with weight
    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

    # compute statistics for multiple metrics without weight
    df.select(summarizer.summary(df.features)).show(truncate=False)

    # compute statistics for single metric "mean" with weight
    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

    # compute statistics for single metric "mean" without weight
    df.select(Summarizer.mean(df.features)).show(truncate=False)
    # $example off$

    spark.stop()
####### Using correlation and Summarizer

# Select features
features = ["age", "charges", "customer_contacts", "attrition"]
va = VectorAssembler(inputCols=features, outputCol="features")  # Create Vector Assembler
featuresData = va.transform(rawData)  # transform original dataset to include new col of vectors
featuresData.show(n=2)

# Calculate correlation and display
r1 = Correlation.corr(featuresData, "features", method='pearson').head()
print("Pearson correlation matrix:\n" + str(r1[0]))

# Calculate mean statistic for the list of features in order
summarizer = Summarizer.metrics("mean")
featuresData.select(summarizer.summary(featuresData.features)).show(truncate=False)

"""**Split the Spark Dataframe into Train and Test**"""

# Splitting dataframe with randomSplit
splits = rawData.randomSplit(weights=[.7, .3], seed=12345)
print("training obs count: ", splits[0].count())
print("test obs count: ", splits[1].count())
train = splits[0]
test = splits[1]

"""**Feature Engineering & Define Model**"""
output.select("features").show(10, truncate=False) from pyspark.ml.feature import VectorAssembler from pyspark.ml.stat import Summarizer assembler = VectorAssembler(inputCols=[ 'year', 'population', 'labor_force', 'population_percent', 'employed_total', 'employed_percent', 'agrictulture_ratio', 'nonagriculture_ratio', 'unemployed', 'unemployed_percent', 'not_in_labor' ], outputCol="features") assembled = assembler.transform(employment_df) summarizer = Summarizer.metrics("max", "mean").summary(assembled["features"]) assembled.select(summarizer).show(truncate=False) assembled.select(Summarizer.variance(assembled.features)).show(truncate=False) from pyspark.ml.feature import VectorAssembler from pyspark.ml.stat import Correlation assembler = VectorAssembler(inputCols=[ "date", "day", "period", "nswprice", "nswdemand", "vicprice", "vicdemand", "transfer" ], outputCol="features") assembled = assembler.transform(electricity_df)