def testLogisticMLPipeline1(self):
     training = sqlCtx.createDataFrame([
         ("a b c d e spark", 1.0),
         ("b d", 2.0),
         ("spark f g h", 1.0),
         ("hadoop mapreduce", 2.0),
         ("b spark who", 1.0),
         ("g d a y", 2.0),
         ("spark fly", 1.0),
         ("was mapreduce", 2.0),
         ("e spark program", 1.0),
         ("a e c l", 2.0),
         ("spark compile", 1.0),
         ("hadoop software", 2.0)
         ], ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
     lr = LogisticRegression()
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([
         ("spark i j k", 1.0),
         ("l m n", 2.0),
         ("mapreduce spark", 1.0),
         ("apache hadoop", 2.0)], ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     self.assertEqual(score, 1.0)
Example #2
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]
    print(lrModel)  # summary only
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print('Accuracy of RandomForest = ', accuracy * 100)
	print("Test Error = ", (1.0 - accuracy) * 100)
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
def sparking_your_interest():
	df = sqlContext.read.json('speeches_dataset.json')  # assumes an existing SQLContext instance named sqlContext
	df_fillna=df.fillna("")
	print(df_fillna.count())
	print(df_fillna.printSchema())

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "3gramsfeatures", "4gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	print(output.show())
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(train_df)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	print(predictions.show())
def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def build_decision_tree(sqlContext, features, interested):
	print('-----------------------------------------')
	data = sqlContext.createDataFrame(
			[Row(label=interested[i], features=Vectors.dense(features[i])) for i in range(len(features))])
	data.printSchema()
	data.show(5)
	print('created data frame')

	# Index the label column & adding metadata.
	labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
	print('created label indexer')

	# Mark the features with < 4 distinct values as categorical
	featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

	# Split the data into training and test sets
	(trainingData, testData) = data.randomSplit([0.8, 0.2])

	# Train a DecisionTree model
	dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

	# Chain the indexers together with DecisionTree
	pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

	# Train the model
	model = pipeline.fit(trainingData)

	# Make predictions
	predictions = model.transform(testData)

	predictions.select("prediction", "indexedLabel", "features").show(5)

	# Select (prediction, true label) & compute test error
	evaluator = MulticlassClassificationEvaluator(
			labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	precision = evaluator.evaluate(predictions)

	treeModel = model.stages[2]
	return (1 - precision, model)
def naiveBayeseian():
    from pyspark.ml.linalg import Vectors  # vectors for the DataFrame-based ML API

    def parseLine(line):
        keys = [float(x) for x in line.split(",")]
        # return a (label, features) pair so the RDD can be converted to a DataFrame
        return (keys[0], Vectors.dense(keys[1:]))
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data = scdata1.map(parseLine).toDF(["label", "features"])
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
Example #9
def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
    """
    Prints evaluation metrics.
    :param model: Used model.
    :param test_df: dataframe containing test data.
    :param labelCol: label column.
    :param featuresCol: features column.
    :return: None; the metrics are printed to stdout.
    """
    predictions = model.transform(test_df)


    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, predictionCol="prediction",)
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    print "Accuracy:", accuracy
    print "f1:", f1
    print "Precision:", weighted_precision
    print "Recall:", weighted_recall
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # handle open data
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
Example #11
def calculate_accuracy_metrics(predictions):

    """
    Calculates accuracy metrics for a Prediction DataFrame

    :param predictions: prediction DataFrame with "indexedLabel" and "prediction" columns
    :return: [precision, recall, positive count, negative count, false positive count, false negative count]
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)

    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)

    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
    ##################### Preprocessing #####################
    # PCA
    pca = PCA(k=d, inputCol="features", outputCol="pca")

    ##################### Decision Tree #####################
    # Train a Random Forest model.
    rf = RandomForestClassifier(labelCol="label", featuresCol="pca", \
                                numTrees=n, seed=1234, maxDepth=30, \
                                minInstancesPerNode=5)

    ##################### Pipelined Model #####################
    pipeline_rf = Pipeline(stages=[pca, rf])

    # build pipelined model with train data
    model_rf = pipeline_rf.fit(train_df)

    ##################### Prediction #####################
    # make predictions
    result_rf = model_rf.transform(test_df)

    ##################### Evaluation #####################
    # compute accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(result_rf)

    print("\n+-------------------+")
    print("| Accuracy = %.2f%% |" % (100 * accuracy))
    print("+-------------------+\n")
data = sc.parallelize(Xtrain)

print("\nSplitting data into 60","%"," training and 40","%","testing")
training_data, testing_data = data.randomSplit([0.6, 0.4], seed=0)
vectorizedData = training_data.toDF()
print("Creating MultilayerPerceptronClassifier...")
MLP = MultilayerPerceptronClassifier(labelCol='indexedLabel', featuresCol='indexedFeatures')
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(vectorizedData)
featureIndexer = VectorIndexer(inputCol='features',
                               outputCol='indexedFeatures',
                               maxCategories=2).fit(data.toDF())
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, MLP])

paramGrid_MLP = ParamGridBuilder().addGrid(MLP.layers,[[3072, neuron, 10] for neuron in [200, 500]]).build()
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                      predictionCol='prediction', metricName='f1')
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                  estimatorParamMaps=paramGrid_MLP,
                  evaluator=evaluator,
                  numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
print(CV_model.bestModel.stages[2])
print('Done on fitting model:%s'%(datetime.datetime.now()-starttime))

print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()

#transformed_data1 = CV_model.transform(vectorizedData)
#print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data1)
                   "hours-per-week")
data.show()

assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()

# Splitting the data into training and data set
training, test = data.select("label", "features").randomSplit([0.70, 0.30])

# Create Random Forest model and fit the model with training dataset
rf = RandomForestClassifier()
model = rf.fit(training)

# Generate prediction from test dataset
pred = model.transform(test)

# Evaluate  the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(pred)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = pred.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F-measure:", metrics.fMeasure())
Example #15
si  = StringIndexer(inputCol="purpose", outputCol="purpose_index")
hot = OneHotEncoder(inputCol="purpose_index", outputCol="purpose_features")
va  = VectorAssembler(inputCols=["loan_amnt", "interest_rate", "employment_length", "home_owner", "income", "verified", "open_accts", "credit_debt", "purpose_features"], outputCol="features")
dtr = DecisionTreeRegressor(featuresCol="features", labelCol="default", predictionCol="prediction", maxDepth=2, varianceCol="variance")
gbr = GBTRegressor(featuresCol="features", labelCol="default", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbc = GBTClassifier(featuresCol="features", labelCol="default", predictionCol="prediction", maxDepth=5, maxIter=20, seed=12345)

pipeline = Pipeline(stages=[si, hot, va, gbc])

model = pipeline.fit(training)
model.write().overwrite().save('hdfs:///tmp/spark_model')

predictions = model.transform(testing)

predictions.select(['default','prediction']).sort(col('prediction').desc()).show(25,False)

#evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="default")
#rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
#r2   = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

#evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="default")
#evaluator.evaluate(predictions)
#evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="default")
evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})


#ZEND
    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")

    # compute the classification error on test data.
    precision = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - precision))
    # $example off$

    spark.stop()
# Fit the pipeline
pipelined_data = pipeline.fit(df)
transformed_data = pipelined_data.transform(df)
training_set, test_set = transformed_data.randomSplit([0.8, 0.2], seed=10)

# Create the model, train and predict
nb = NaiveBayes(smoothing=1.0,
                modelType="multinomial",
                featuresCol='TF',
                labelCol='race')
training_set.cache()
model = nb.fit(training_set)
predictions = model.transform(test_set)

# Evaluate the results
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='race')
result = predictions.select('race', 'prediction')
result_rdd = result.rdd
metrics = MulticlassMetrics(result_rdd)

print("Naive Bayes model evaluation")
print("F score: {}".format(evaluator.evaluate(result)))
print(metrics.confusionMatrix())
for k, v in race_to_number.items():
    print("F score for {}: {}".format(k, metrics.fMeasure(v)))
print("Precision: {}".format(
    evaluator.evaluate(result, {evaluator.metricName: 'precision'})))
print("Contigency table of the prediction results of naive bayes model")
result.stat.crosstab('prediction', 'prediction').show()

print("Predictions for naive bayes: {}".format(
Example #18
# set seed for reproducibility and Split Data in 80-20% for Train and Test Data Set
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))


#Logistic Regression Classification
lr = LogisticRegression(maxIter=25, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0).select("text","index","probability","label","prediction").orderBy("probability", ascending=False).show(n = 10, truncate = 30)



evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Test Error for Logistic Regression :" + str((1.0 - accuracy)*100)+ "%")
print("Test Accuracy for Logistic Regression :" + str((accuracy)*100)+ "%")

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction" ,metricName='f1')
f1 = evaluator.setMetricName("f1").evaluate(predictions)
weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)

print("Test weightedRecall for Logistic Regression :" + str(weightedRecall))
print("Test weightedPrecision for Logistic Regression :" + str(weightedPrecision))
print("Test f1 score for Logistic Regression :" + str(f1))

Example #19
    bst_model_path = model_save_path + "_bst_model"
    train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
    bst_model = train_with_tune(train_df)
    bst_model.write().overwrite().save(bst_model_path)

    # Use the best model found during training to predict on the test data
    # The prediction result has a structure similar to the following:
    #      features = Vectors.dense(...)
    #      label=0,
    #      rawPrediction=DenseVector([0.048, -0.048]),
    #      probability=DenseVector([0.512, 0.488]),
    #      prediction=0.0
    loaded_bst_model = PipelineModel.load(bst_model_path)
    result = loaded_bst_model.transform(train_df)
    predict_result = loaded_bst_model.transform(test_df)
    print("predicted sample :", predict_result.take(3))

    # Evaluate the trained binary classification model
    bin_eval = BinaryClassificationEvaluator()
    predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
    print("trained model test auc metric", predict_metric)

    # Inspect detailed classification metrics; f1 is computed by default
    mm = MulticlassClassificationEvaluator()
    f1 = mm.evaluate(predict_result)
    accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
    precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
    recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
    print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f " \
          % (precision, recall, accuracy, f1))
Example #20
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'engine-type')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="engine-type", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(labelCol="engine-type",featuresCol="features",numTrees = 100,maxDepth = 4,maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(train_df)
predictions = rfModel.transform(test_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="engine-type")
evaluator.evaluate(predictions)

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1,labelCol="engine-type",featuresCol="features")

model = nb.fit(train_df)

predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="engine-type")
evaluator.evaluate(predictions)


Example #21
def transform_predictions(dataframe, spark):
    df_transformed = dataframe.drop("Patient addmited to regular ward (1=yes, 0=no)",
                                    "Patient addmited to semi-intensive unit (1=yes, 0=no)",
                                    "Patient addmited to intensive care unit (1=yes, 0=no)")

    df_transformed_no_missing = dismiss_missing_values(df_transformed)

    # build the dataset to be used as a rf_model base
    outcome_features = ["SARS-Cov-2 exam result"]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets', 'Eosinophils', 'Red blood Cells', 'Lymphocytes',
                         'Leukocytes', 'Basophils', 'Monocytes']

    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)

    # split the dataset into train/test subgroups
    (training_data, test_data) = model_data.randomSplit([0.8, 0.2], seed=2020)

    # Random Forest classifier
    rf = RandomForestClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features', maxDepth=5)
    rf_model = rf.fit(training_data)
    rf_predictions = rf_model.transform(test_data)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    rf_accuracy = multi_evaluator.evaluate(rf_predictions)

    # Decision Tree Classifier
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxDepth=3)
    dt_model = dt.fit(training_data)
    dt_predictions = dt_model.transform(test_data)
    dt_predictions.select(outcome_features + required_features).show(10)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    dt_accuracy = multi_evaluator.evaluate(dt_predictions)

    # Logistic Regression Model
    lr = LogisticRegression(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxIter=10)
    lr_model = lr.fit(training_data)
    lr_predictions = lr_model.transform(test_data)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    lr_accuracy = multi_evaluator.evaluate(lr_predictions)

    # Gradient-boosted Tree classifier Model
    gb = GBTClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features')
    gb_model = gb.fit(training_data)
    gb_predictions = gb_model.transform(test_data)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    gb_accuracy = multi_evaluator.evaluate(gb_predictions)

    rdd = spark.sparkContext.parallelize([rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy])
    predictions_dataframe = spark.createDataFrame(rdd, FloatType())

    return predictions_dataframe
Example #22
                              handleInvalid='error')
indexer = stringIndexer.fit(df_tf_idf)
df_tf_idf_lab = indexer.transform(df_tf_idf).select('features', 'indexed')
df_tf_idf_lab.show()

# Split into training and test sets
splits = df_tf_idf_lab.randomSplit([0.7, 0.3], 123)
train = splits[0]
test = splits[1]

# Define the model
nb = NaiveBayes(featuresCol='features',
                labelCol='indexed',
                predictionCol='prediction',
                probabilityCol='probability',
                rawPredictionCol='rawPrediction',
                smoothing=1.0,
                modelType='multinomial')
# Train the model
model = nb.fit(train)
# Make predictions on the test set
predictions = model.transform(test)
predictions.show()

# Compute accuracy
evaluator = MulticlassClassificationEvaluator(labelCol='indexed',
                                              predictionCol='prediction',
                                              metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy =" + str(accuracy))
Example #23
    indexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkedInd")
    dataset = indexer.fit(dataset).transform(dataset)

    #assemble features
    assembler = VectorAssembler(inputCols=[
        "Age", "Pclass", "SexInd", "SibSp", "Parch", "Fare", "EmbarkedInd"
    ],
                                outputCol="features")

    dataset = assembler.transform(dataset)

    (trainingData, testData) = dataset.randomSplit([0.8, 0.2])

    #MLP
    layers = [7, 8, 4, 2]  #input: 7 features; output: 2 classes
    mlp = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         labelCol="Survived",
                                         featuresCol="features",
                                         blockSize=128,
                                         seed=0)

    model = mlp.fit(trainingData)
    result = model.transform(testData)

    prediction_label = result.select("prediction", "Survived")
    evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    print("MLP test accuracy: " + str(evaluator.evaluate(prediction_label)))
Example #24
data = data.withColumn('length', length(data['question_text']))

# Select all features needed for modeling and assemble them into one vector

assembler = VectorAssembler(inputCols=['question_tfidf', 'length'],
                            outputCol='features')

lgr = LogisticRegression(labelCol="target",
                         featuresCol="features",
                         maxIter=100)

pipeline = Pipeline(
    stages=[tokenizer, remover, ngram, hashingTF, idf, assembler, lgr])

paramGrid = ParamGridBuilder().build()

evaluator = MulticlassClassificationEvaluator(labelCol="target",
                                              metricName='f1')
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)
train = data.filter(data['target'].isNotNull())
(trainX, validation) = train.randomSplit([0.7, 0.3])
test = data.filter(data['target'].isNull())
model = cv.fit(trainX)
results = model.transform(validation).select("qid", "target", "prediction")
f1 = evaluator.evaluate(results)

### f1 = 0.90 ????
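
# Note (sketch, not from the original snippet): the ParamGridBuilder above is built empty,
# so the 5-fold cross-validation just refits the same pipeline. Assuming the `lgr` and
# `hashingTF` stages defined for this pipeline, a non-trivial grid could look like:
paramGrid = ParamGridBuilder() \
    .addGrid(lgr.regParam, [0.01, 0.1, 0.3]) \
    .addGrid(lgr.elasticNetParam, [0.0, 0.5]) \
    .addGrid(hashingTF.numFeatures, [1 << 16, 1 << 18]) \
    .build()  # illustrative values only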
# MAGIC Automated MLflow tracking is enabled by default for:
# MAGIC
# MAGIC   - Databricks Runtime 5.4 ML or above
# MAGIC   - Databricks Runtime 5.4 or above
# MAGIC
# MAGIC To enable it for earlier versions, set the `SparkSession` configuration flag `"spark.databricks.mlflow.trackMLlib.enabled"` to `"true"`.

# COMMAND ----------

#spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

# COMMAND ----------

# Define an evaluation metric.  In this case, use "weightedPrecision", which is equivalent to 0-1 accuracy.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              metricName="weightedPrecision")

# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# COMMAND ----------

grid = ParamGridBuilder() \
  .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
  .addGrid(dtc.maxBins, [2, 4, 8]) \
  .build()

# COMMAND ----------

cv = CrossValidator(estimator=pipeline,
Example #26
def model_accuracy(test_results):
    print("control entered model_accuracy ")
    print(test_results.select('Sentiment','length','stop_tokens','hash_token','idf_token','probability','prediction').show(20))
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    return acc
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into early, on-time, slightly late, very late (0, 1, 2, 3)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features tools in with pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop over the test/train splits for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
# COMMAND ----------

display(spark.createDataFrame(list(zip(list('0123456789'), truePositives)), ["digit", "truePositives"]))  # pair each digit with its true-positive count

# COMMAND ----------

testDF=spark.read.parquet('/mnt/adls/testset.parquet')

# COMMAND ----------

predictions=lrModel.transform(testDF)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# COMMAND ----------

!pip install Keras
#!pip install tensorflow
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

# COMMAND ----------
Example #29
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

print "Grid is build"

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

print "CV Estimator is defined"

cv_model = cv.fit(dfTrain)

print "Model is fitted"

df_test_pred = cv_model.transform(dfTest)
Example #30
labelConverter = IndexToString().\
    setInputCol('prediction').\
    setOutputCol('predictedLabel').\
    setLabels(labelIndexer.labels)

trainingData, testData = data.randomSplit([0.7, 0.3])

# Build the decision tree classification model and set its parameters
dtClassifier = DecisionTreeClassifier().\
    setLabelCol('indexedLabel').\
    setFeaturesCol('indexedFeatures')

# Build the machine learning pipeline (Pipeline) and call fit() to train the model
dtPipeline = Pipeline().\
    setStages([labelIndexer, featureIndexer, dtClassifier, labelConverter])
dtPipelineModel = dtPipeline.fit(trainingData)
dtPredictions = dtPipelineModel.transform(testData)
dtPredictions.select('predictedLabel', 'label', 'features').show(20)

evaluator = MulticlassClassificationEvaluator().\
    setLabelCol('indexedLabel').\
    setPredictionCol('prediction')

dtAccuracy = evaluator.evaluate(dtPredictions)
print('Decision tree model accuracy: {}'.format(dtAccuracy))  # the model's prediction accuracy

# View the structure of the trained decision tree model by calling toDebugString
treeModelClassifier = dtPipelineModel.stages[2]
print('Learned classification tree model:\n' +
      str(treeModelClassifier.toDebugString))
Example #31
fig, axList = prepareSubplot(np.arange(0., 1.1, 0.1), np.arange(0., 1.1, 0.1), figsize=(12., 5.), subplots=(1,2))
ax0, ax1 = axList
ax0.set_title('First Model', color='#999999')
ax1.set_title('Second Model', color='#999999')
generateROC(axList[0], labelsAndScores)
generateROC(axList[1], labelsAndScores2)
display(fig)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

metric = 'precision'

multiclassEval = MulticlassClassificationEvaluator()

multiclassEval.setMetricName(metric)
print('Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions)))
print('Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2)))

# COMMAND ----------

import inspect
print(inspect.getsource(MulticlassClassificationEvaluator))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC  
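# MAGIC  The next cell is a sketch (not from the original notebook) of the RDD-based MLlib
# MAGIC  counterpart: `MulticlassMetrics` computes the same multiclass statistics from an RDD
# MAGIC  of (prediction, label) pairs. It assumes `irisTestPredictions` from above has the
# MAGIC  "prediction" and "label" columns used by the evaluator.

# COMMAND ----------

from pyspark.mllib.evaluation import MulticlassMetrics

# Convert the DataFrame of predictions into an RDD of (prediction, label) pairs
predictionAndLabels = irisTestPredictions.select('prediction', 'label') \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))

mllibMetrics = MulticlassMetrics(predictionAndLabels)
print('MLlib weighted precision: {0:.3f}'.format(mllibMetrics.weightedPrecision))
print('MLlib confusion matrix:\n{0}'.format(mllibMetrics.confusionMatrix()))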
    # $example on$
    # Load training data
    data = spark.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

    # train the model
    model = trainer.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
print(end_time - start_time)

predictionsClassifier = modelClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator().setLabelCol(
    "indexed").setPredictionCol("prediction")
print(
    "accuracy = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "accuracy"}))
print(
    "weightedPrecision = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "weightedPrecision"}))
print(
    "weightedRecall = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "weightedRecall"}))
print("f1 = ",
      evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "f1"}))
(trainingData, testData) = td.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="pcaFeatures", numTrees=100)
rmModel = rmClassifer.fit(trainingData)

#Predict on the test data
predictions = rmModel.transform(testData)
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="accuracy")
evaluator.evaluate(predictions)

#Draw a confusion matrix
predictions.groupBy("indexed", "prediction").count().show()

#Balance data set
from numpy.random import randint
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

RATIO_ADJUST = 2.0  ## ratio of pos to neg in the df_subsample

counts = trainingData.select('indexed').groupBy('indexed').count().collect()
higherBound = counts[0][1]
TRESHOLD_TO_FILTER = int(RATIO_ADJUST * float(counts[1][1]) / counts[0][1] *
Example #35
from pyspark.sql.types import FloatType
#Extracting only the column with probability with column with 1's probability
secondelement=udf(lambda v:float(v[1]),FloatType())
transformed.select(secondelement('probability')).show()

#Dataframe column to list
mvv_count_df.select('mvv').collect()

#-------- creating saving loading model ------------------
rf = RandomForestClassifier(labelCol='label', featuresCol='features',numTrees=20)

paramGrid = ParamGridBuilder().build()#ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01, 0.001, 0.0001]).build()
#lr = LinearRegression()
#paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [500]).addGrid(lr.regParam, [0]).addGrid(lr.elasticNetParam, [1]).build()
pipeline_new = Pipeline(stages=[rf])
evaluator = MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1")  #/setMetricName/ "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
#evaluator = RegressionEvaluator(metricName="mae")
crossval = CrossValidator(estimator=pipeline_new, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
model_new_rf = crossval.fit(trainingData)
model_new_rf.bestModel
model_new_rf.bestModel.save('rf_pipeline_model_saved')
model_new_rf.avgMetrics

#loading a saved model
from pyspark.ml import PipelineModel
loadedModel = PipelineModel.load("rf_pipeline_model_saved")


#Checkpointing is a process of truncating RDD lineage graph and saving it to a reliable distributed (HDFS) or local file system.
sc.setCheckpointDir("hdfs://hadoop-master:9000/data/checkpoint")
df.repartition(100)
Example #36
    # Split the data into training and test sets (30% held out for testing)
    # multi_feat = MultiFeaturizer(spark, [wv, wv_tweet])
    # feat_df = multi_feat.featurize(converted_df)
    # converted_df2 = shape_df(spark, df, 'nagisa', ['補助記号']).drop("age")
    # tfidf = TfidfFeaturizer(spark)
    # feat_df = tfidf.featurize(converted_df2)
    # onehot = OneHotFeaturizer(spark)
    # feat_df = onehot.featurize(converted_df)
    # multi_feat = MultiFeaturizer(spark, [wv_tweet, tfidf], [converted_df, converted_df2])
    # feat_df = multi_feat.featurize()
    (trainingData, testData) = feat_df.randomSplit([0.8, 0.2], seed=3)

    # 3. Call `fit` (just pass in the data frame built beforehand).
    clf = model.fit(trainingData)

    predict_train = clf.transform(trainingData)
    predict_test = clf.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predict_train)
    print("train accuracy: " + str(accuracy))

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predict_test)
    print("test accuracy: " + str(accuracy))
Beispiel #37
0
data1 = output.select("label", "features")
(training, test) = data1.randomSplit([0.8, 0.2], seed = 12345)


#gbt = GBTClassifier(numTrees = 10, maxDepth = 3, maxBins = 64)
gbt = GBTClassifier(maxIter=30, maxDepth=2)  # note: GBTClassifier does not accept a gini impurity setting

#gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
##rf = RandomForestClassifier(numTrees = 25, maxDepth = 4, maxBins = 64)
pipeline = Pipeline(stages=[gbt])
pipelineModel = pipeline.fit(training)

testPredictions = pipelineModel.transform(test)
testPredictions.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")  # metricName defaults to "f1"; use .setMetricName("accuracy") to change it
evaluatorParamMap = {evaluator.metricName: "f1"}
f1Test = evaluator.evaluate(testPredictions, evaluatorParamMap)


from pyspark.ml.tuning import *

paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [1,5]).build()

cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)

cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvF1Test = evaluator.evaluate(cvPredictions, evaluatorParamMap)

print("pipeline Test AUC: %g" % aucTest)
import findspark
findspark.init(r'D:\Spark')
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('MultiLayer').getOrCreate()
data = spark.read.csv('iris.data', inferSchema=True, header=False)
from pyspark.ml.feature import VectorAssembler, StringIndexer
assembler = VectorAssembler(inputCols=['_c0', '_c1', '_c2', '_c3'],
                            outputCol='features')
assembled_data = assembler.transform(data)
# _c4 holds the species name as a string; index it into a numeric label for the classifier
indexer = StringIndexer(inputCol='_c4', outputCol='label')
final_data = indexer.fit(assembled_data).transform(assembled_data)
splits = final_data.randomSplit([0.6, 0.4])
train = splits[0]
test = splits[1]
# input layer of 4 features, two hidden layers of sizes 5 and 4, output layer of 3 classes
layers = [4, 5, 4, 3]
trainer = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         labelCol='label')
model = trainer.fit(train)  # fit on the training split, not the full data set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                              labelCol='label')
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Beispiel #39
0
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of CrossValidatorModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages),
                         len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(
            stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(
            estimator=nested_pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of CrossValidatorModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid,
                         original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(
                loaded_nested_pipeline_model.stages,
                original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
Beispiel #40
0
# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")

treeModel = model.stages[2]
# summary only
print(treeModel)
hashingTF = HashingTF(inputCol="features", outputCol="features")
regParam = 0.3
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
Beispiel #41
0
       	print >> sys.stderr, "%s <input> <model_path> <stop_file> class_num appname" % sys.argv[0] 
        sys.exit(1)

    input_path = sys.argv[1]
    model_path = sys.argv[2]
    stop_file = sys.argv[3]
    class_num = int(sys.argv[4])
    appname = sys.argv[5]

    conf = SparkConf().setAppName(appname)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    data_df = text_to_df(sc, sqlContext, input_path)
    print "*** create data frame ***" 
    splits = data_df.randomSplit([0.8, 0.2], 1234)
    training = splits[0].cache()
    test = splits[1].cache()

    stopwords = get_stopwords(stop_file)
    print "*** load %s stopwords ***" % len(stopwords)
    pipeline = get_pipeline(vector_size=50, class_num=class_num, stopwords=stopwords) 
    model = pipeline.fit(training)
    result = model.transform(test)
  
    pred_label = result.select("prediction", "indexLabel")
    evaluator = MulticlassClassificationEvaluator(metricName="precision", predictionCol="prediction", labelCol="indexLabel")
    print("Precision: " + str(evaluator.evaluate(pred_label)))    


Beispiel #42
0
print(f"Test set length: {test.count()} records")


# Cross validation 

# Cross-validation is a model validation technique for assessing how the results of a statistical analysis will generalize to an independent data set. It is mainly used in settings where the goal is prediction, and one wants to estimate how accurately a predictive model will perform in practice.

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create an initial RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Evaluate model
rfevaluator = MulticlassClassificationEvaluator(metricName="f1")

# Create ParamGrid for Cross Validation
rfparamGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 5, 10, 20, 30])
             .addGrid(rf.maxBins, [10, 20, 40, 80, 100])
             .addGrid(rf.numTrees, [5, 20, 50, 100, 500])
             .build())

# Create 5-fold CrossValidator
rfcv = CrossValidator(estimator = rf,
                      estimatorParamMaps = rfparamGrid,
                      evaluator = rfevaluator,
                      numFolds = 5)

# Run cross validations.
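# A hedged sketch of running the cross-validation defined above; `train` and `test` are assumed
# names for the DataFrames produced by the earlier split.
rfcvModel = rfcv.fit(train)
print(f"Best cross-validated F1: {max(rfcvModel.avgMetrics)}")
print(f"Test F1: {rfevaluator.evaluate(rfcvModel.transform(test))}")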
Beispiel #43
0
                                   metricName="rmse")

logger.info('Regression train RMSE: %g' % lr_evaluator.evaluate(lr_pred_train))
logger.info('Regression test RMSE: %g' % lr_evaluator.evaluate(lr_pred_test))

# save pipeline

lr_model.save('/app/saved_models/lr_model')

# binary classification
# predict if post belongs to AskReddit

bc_pipeline = get_binary_classification_pipeline()
bc_model = bc_pipeline.fit(train_data)
bc_pred = bc_model.transform(test_data)
bc_evaluator_acc = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="label", metricName="accuracy")
bc_evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                    labelCol="label",
                                                    metricName="f1")

logger.info('Binary classification test Accuracy: %g' %
            bc_evaluator_acc.evaluate(bc_pred))
logger.info('Binary classification test F1: %g' %
            bc_evaluator_f1.evaluate(bc_pred))

# save pipeline

bc_model.save('/app/saved_models/bc_model')

# multi-class classification
# predict post's subreddit
Beispiel #44
0
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext(appName="MyFirstApp4_Task_task2")
spark = SparkSession(sc)


df_node18=spark.read.format("parquet").load(path="hdfs://namenode:9000/example4/test.parquet")
model_node21=CrossValidatorModel.load("hdfs://namenode:9000/example4/model_2/")
model_node19=PipelineModel.load("hdfs://namenode:9000/example4/model_1/")
df_node20=model_node19.transform(df_node18)
df_node22=model_node21.transform(df_node20)

evaluator_node23 = MulticlassClassificationEvaluator(labelCol="indexedSurvived", predictionCol="prediction", metricName="accuracy")
score_node23=evaluator_node23.evaluate(df_node22)
df_node23= spark.createDataFrame([(score_node23,)], ["score"])

df_node23.write.format("csv").save(path="hdfs://namenode:9000/example4/EvalResult3.csv")
#applying logistic regression using the  "Text" to predict "Sentiment"
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

# COMMAND ----------

#finding the accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# COMMAND ----------

#applying cross validation
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
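# A hedged sketch of the cross-validation announced above; the grid values are illustrative
# assumptions, while lr, evaluator, trainingData and testData are the objects defined earlier.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=5)
cvModel = cv.fit(trainingData)
print("Cross-validated test F1: %g" % evaluator.evaluate(cvModel.transform(testData)))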
    data = sqlContext.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")
    # Split the data into train and test
    
    data.show() 
    data.printSchema()
    data.select('features').show()
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    print (train.count())
    train.show()
    test = splits[1]
    
    
    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    sc.stop()
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[2]
# summary only
print(treeModel)

# see for more: https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier

# Churn - which customers (of a telecommunications company) are likely to stop using their service
# Churn dataset provided by the UC Irvine machine-learning repository hosted by SGI
# Data from https://www.sgi.com/tech/mlc/db/churn.all
$ wget https://www.sgi.com/tech/mlc/db/churn.all
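# A hedged loading sketch, assuming an active SparkSession named spark: churn.all is a
# header-less CSV, so Spark assigns generic _c0.._cN column names.
churn_df = spark.read.csv("churn.all", inferSchema=True, header=False)
churn_df.printSchema()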

# Classification - Random Forest
Beispiel #48
0
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)


# In[329]:

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)


# In[330]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


# In[331]:

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
grid=(ParamGridBuilder()
     .baseOn([evaluator.metricName,'precision'])
     .addGrid(dt.maxDepth, [10,20])
     .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)


# In[332]:
Beispiel #49
0
print "Done in {} second".format(round(tt,3))


# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on selected features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()

dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
Beispiel #50
0
spark = getSpark()

df = spark.read.load('../dataset/merged/publisher/')
df = df.withColumn('label', df._hyperpartisan.cast('integer'))
testSet = spark.read.load('../dataset/merged/article/')
testSet = testSet.withColumn('label', testSet._hyperpartisan.cast('integer'))

hashingTF = HashingTF(inputCol="words", outputCol="rawfeatures", numFeatures=1000)
idf = IDF(inputCol="rawfeatures", outputCol="features")
#pca = PCA(k=1000, inputCol="rfeatures", outputCol="features")
#lr = LogisticRegression(regParam=0.1, maxIter=20)
lr = RandomForestClassifier(numTrees=20, maxDepth=5, seed=42)
pipeline = Pipeline(stages=[hashingTF, idf, lr])
filename = "TFIDF-RF20-5"

ev = MulticlassClassificationEvaluator(metricName='accuracy')
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=ParamGridBuilder().build(),
                          evaluator=ev,
                          numFolds=10, seed=42)

model = crossval.fit(df)

with open(filename,"a") as f:
    f.write(f"accuracy crossValidation: {max(model.avgMetrics)}\n")
    f.write(f"accuracy testSet        : {ev.evaluate(model.transform(testSet))}\n")
    
    
ev = MulticlassClassificationEvaluator(metricName='weightedPrecision')
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=ParamGridBuilder().build(),
Beispiel #51
0
# instantiate the base classifier.
lr = LogisticRegression(featuresCol='tfidf',
                        weightCol='weight',
                        maxIter=10,
                        tol=1E-6,
                        fitIntercept=True)

# train the multiclass model.
model = lr.fit(train)

# score the model on test data.
predictions = model.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

# compute the classification error on test data.
f1 = evaluator.evaluate(predictions)
print("f1 score = %g" % (f1))

# to show dataframe with predictions and probabilities
#display(predictions)

# to save predictions -- on Azure Databricks
#predictions.write.save("/FileStore/lr_output.parquet")

####################################
# One-vs-All (Logistic Regression) #
####################################
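# A hedged sketch of the One-vs-All setup the banner announces, reusing lr, train, test and
# evaluator from the block above; the featuresCol choice mirrors the base classifier.
from pyspark.ml.classification import OneVsRest
ovr = OneVsRest(classifier=lr, featuresCol='tfidf')
ovrModel = ovr.fit(train)
ovrPredictions = ovrModel.transform(test)
print("One-vs-All f1 score = %g" % evaluator.evaluate(ovrPredictions))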
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("naive_bayes_example")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("sample_libsvm_data.txt")
    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)
    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
Beispiel #53
0
# Select results to view
display(predictions.select("label", "prediction", "probability"))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Model Evaluation
# MAGIC 
# MAGIC To evaluate our model, we use the MulticlassClassificationEvaluator. Note that f1-score is its default metric.

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="precision")
accuracy = evaluator.evaluate(predictions)
print "Model Accuracy: ", accuracy

# COMMAND ----------

# MAGIC %md
# MAGIC The Evaluator is able to use a few metrics such as f1-score, precision, recall, weightedPrecision and weightedRecall.
# MAGIC 
# MAGIC evaluator.setMetricName("insert_metric_here") can be used to change the metric used to evaluate models.

# COMMAND ----------

evaluator.explainParam("metricName")

# COMMAND ----------
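# MAGIC %md
# MAGIC An illustrative sketch: switch the evaluator to another supported metric with setMetricName and re-evaluate the same predictions from above.

# COMMAND ----------

evaluator.setMetricName("f1")
print "Model F1-score: ", evaluator.evaluate(predictions)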
Beispiel #54
0
def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:                # Use pyspark or not? Pyspark makes cross node (HPC) calculation possible.
        from pyspark import SparkContext        # It's slower, manages resources between nodes using HTTP. 
        from pyspark.sql import SQLContext      # So far, it does not include feature importance outputs.
        from pyspark.ml import Pipeline         # I would have to program feature importances myself. May be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go
        
        if settings.pyspark_remake_csv == 1: # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF") # Initiate spark
        
        sclogger=sc._jvm.org.apache.log4j # Initiate spark logging
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label") # rename last column (answers), to label
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")
        
        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*')) # Assemble feature vectos for spark MLA
        
        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))
        
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced) # Index vectors        
        featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) # Set up fitting pipeline
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced) # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr) # Predict
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        yypredict=numpy.array(predictions.select("indexedLabel").collect()) # Pulls all results into numpy arrays to continue program
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        XXpredict=numpy.column_stack((XXpredict,yypredict))
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
    
    else:
        # Run sklearn MLA switch
        MLA = get_function(settings.MLA) # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings') 
        logger.info(clf)
        logger.info('------------')    
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy) # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        score = clf.score
        if 'OvsA' not in ind_run_name:
            if settings.output_all_trees == 1:
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
                    i_tree = i_tree + 1        
            else:
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split cats for RAM management
        numcats = numpy.int64((2*(XXpredict.size/1024/1024)*clf.n_jobs))
        if settings.get_contributions ==1:
            numcats=100
        if numcats < 1:
            numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i,len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat])) # XX is predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat])) # Only take from 0:n_feat because answers are tacked on end
            if 'OvsA' not in ind_run_name:            
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions==1):           
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:            
            if settings.get_contributions == 1: 
                numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1: 
                numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]
        
        # scikit-learn metrics expect (y_true, y_pred)
        accuracy = metrics.accuracy_score(yypredict, result)
        recall = metrics.recall_score(yypredict, result, average=None)
        precision = metrics.precision_score(yypredict, result, average=None)
        score = metrics.f1_score(yypredict, result, average=None)
        
        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')

    logger.info('Recall Score: %s' %recall)
    logger.info('Precision Score: %s' %precision)
    logger.info('Accuracy Score: %s' %accuracy)
    logger.info('F1 Score: %s' %score)
    percentage=(n/predictdatanum)*100
    
    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
#    stats=numpy.array([])
#    stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')

        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")
    
    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
dfE = dfEntrenamiento.select("cat", "tipo").join(dfGlobal, "cat", "inner")

columnas = [
    'area', 'perimeter', 'fd', 'perimeter', 'compact_circle', 'fd', 'B1_sum',
    'B1_mean', 'B1_median', 'B1_stdev', 'B1_min', 'B1_max', 'B1_variance',
    'B2_sum', 'B2_mean', 'B2_median', 'B2_stdev', 'B2_min', 'B2_max',
    'B2_variance', 'B3_sum', 'B3_mean', 'B3_median', 'B3_stdev', 'B3_min',
    'B3_max', 'B3_variance'
]
constructor = VectorAssembler(inputCols=columnas, outputCol="features")

dfEF = constructor.transform(dfE).select("cat", "features", "tipo")

entrena, evalua = dfEF.randomSplit([0.8, 0.2])

rf = RandomForestClassifier(labelCol="tipo")

modelo = rf.fit(entrena)

pred = modelo.transform(evalua)

evaluador = MulticlassClassificationEvaluator(labelCol="tipo",
                                              metricName="accuracy")

evaluador.evaluate(pred)

modeloOk = rf.fit(dfEF)

modeloOk.write().overwrite().save(
    "modelorf")  # remove overwrite() if you do not want to overwrite the saved model
Beispiel #56
0
from pyspark.ml.classification import NaiveBayes
naivebayes = NaiveBayes(featuresCol="features", labelCol="label")

# %% [markdown]
# ### Parameter grid

# %%
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder() \
    .addGrid(naivebayes.smoothing, [0, 1, 2, 4, 8]) \
    .build()

# %% [markdown]
# ### Evaluator

# %%
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator()

# %% [markdown]
# ## Build cross-validation model

# %%
from pyspark.ml.tuning import CrossValidator
crossvalidator = CrossValidator(estimator=naivebayes, estimatorParamMaps=param_grid, evaluator=evaluator)

# %% [markdown]
# ## Fit cross-validation model

# %%
crossvalidation_model = crossvalidator.fit(training)
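
# %%
# An illustrative follow-up sketch, assuming crossvalidation_model from the cell above:
# avgMetrics holds the mean metric per smoothing value, and bestModel is the NaiveBayesModel
# refit on the full training data with the best smoothing.
print(crossvalidation_model.avgMetrics)
best_nb_model = crossvalidation_model.bestModel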

# %% [markdown]
    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    rfModel = model.stages[2]
    print(rfModel)  # summary only
    # $example off$

    spark.stop()


Beispiel #58
0
if __name__ == '__main__':

    #initialize spark session
    spark = SparkSession\
            .builder\
            .appName("Test")\
            .getOrCreate()
    sc = spark.sparkContext

    #reading the train dataframes
    trainingDF = spark.read.load("../data/train_small.parquet")

    #train = trainingDF.withColumn('features',trainingDF.features.cast(VectorUDT()))     
    
    # Split the data into train and test
    splits = trainingDF.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
    
Beispiel #59
0
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NaiveBayesExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")

    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
Beispiel #60
0
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
                featuresCol="features")
dtModel = dtClassifer.fit(trainingData)


#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","species","label").show()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator.evaluate(predictions)      

#Draw a confusion matrix
predictions.groupBy("label","prediction").count().show()


###################################### INSULT as the output
#Split into training and testing data
(trainingData, testData) = INSULTDf.randomSplit([0.75, 0.25])
trainingData.count()
testData.count()
testData.show()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator