Example #1
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.linalg import Vectors


def naive_bayes():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))
    ])

    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(df)
    # model.pi
    # # DenseVector([-0.81..., -0.58...])
    # model.theta
    # # DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)
    test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
    result = model.transform(test0).head()
    # result.prediction
    # # 1.0
    # result.probability
    # # DenseVector([0.32..., 0.67...])
    # result.rawPrediction
    # # DenseVector([-1.72..., -0.99...])
    test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))
                            ]).toDF()
    # model.transform(test1).head().prediction
    # # 1.0
    temp_path = "."
    nb_path = temp_path + "/nb"
    nb.save(nb_path)
    nb2 = NaiveBayes.load(nb_path)
    # nb2.getSmoothing()
    # # 1.0
    model_path = temp_path + "/nb_model"
    model.save(model_path)
    model2 = NaiveBayesModel.load(model_path)
    # model.pi == model2.pi
    # # True
    # model.theta == model2.theta
    # # True
    nb = nb.setThresholds([0.01, 10.00])
    model3 = nb.fit(df)
    result = model3.transform(test0).head()
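One caveat with the two save calls above: `save()` raises an error if the target path already exists, so rerunning the function fails. A minimal sketch of the overwrite variant, using the same `MLWritable` API that Examples #21, #22 and #33 use:

# Overwrite-safe persistence; behaves like save() but replaces an existing path.
nb.write().overwrite().save(nb_path)
model.write().overwrite().save(model_path)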
Example #2
def NaiveBayesCl(train):
    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    # train the model
    nbModel = nb.fit(train)

    return nbModel
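A hedged usage sketch for NaiveBayesCl, assuming an active SparkSession named `spark` and a training DataFrame with the default `label`/`features` columns (the same layout as Example #1):

# Hypothetical call; `spark` is assumed to be an existing SparkSession.
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

train = spark.createDataFrame([
    Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
    Row(label=1.0, features=Vectors.dense([1.0, 0.0])),
])
nbModel = NaiveBayesCl(train)
nbModel.transform(train).select("label", "prediction").show()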
Example #3
    def exec_naive_bayes(self,
                         featuresCol1="features",
                         labelCol1="label",
                         predictionCol1="prediction",
                         smoothing1=1,
                         numClass1=2):
        '''
        Creates the Naive Bayes model pipeline.
        Input: featuresCol1: feature column name, labelCol1: label column name,
               predictionCol1: prediction column name, smoothing1: smoothing
               parameter, numClass1: number of class labels
        Output: None
        '''
        #Initialize NaiveBayes Model with parameters passed
        nb = NaiveBayes(featuresCol=featuresCol1,
                        labelCol=labelCol1,
                        predictionCol=predictionCol1,
                        smoothing=smoothing1)

        #Fit nb model with training data
        nbModel = nb.fit(self.trainingData)

        #Make nb model predictions on testData
        predictions = nbModel.transform(self.testData)

        #Evaluate the results generated by the model prediction
        self.model_evaluator(predictions,
                             modelType="NaiveBayes Model",
                             modelParams=str({'smoothing': smoothing1}),
                             numClass=numClass1)
Example #4
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml import Pipeline
from pyspark.ml.feature import (Tokenizer, StopWordsRemover, CountVectorizer,
                                IDF, StringIndexer, VectorAssembler)
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection",
                          inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1',
                                                                    'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text',
                                  outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                               outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                                      count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
Example #5
def naive_bayes(training, test):  # feature vectors must be non-negative
    testing = test.select("features")
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(training)
    result = model.transform(test)
    accuracy = 1.0 * result.rdd.filter(
        lambda l: l.label == l.prediction).count() / test.count()
    print("Naive Bayes model accuracy:", accuracy)
Example #6
 def training(self, transformed_ddf):
     train_ddf, test_ddf = transformed_ddf.randomSplit([0.7, 0.3])
     nb = NaiveBayes(smoothing=1.0, modelType="multinomial",
                     featuresCol="tfidf_vector",
                     labelCol=self.target_col,
                     predictionCol=self.prediction_col)
     self.model = nb.fit(train_ddf)
     self.evaluation(train_ddf, 'Train')
     self.evaluation(test_ddf, 'Test')
Example #7
def NaiveBayesEvaluation(TransformedDataset):

    nb = NaiveBayes()
    nb.setLabelCol("LabelIndex")
    nb.setPredictionCol("Label_Prediction")
    training, test = TransformedDataset.randomSplit([0.8, 0.2], seed=11)
    nvModel = nb.fit(training)
    prediction = nvModel.transform(test)

    # selected = prediction.select("body", "LabelIndex", "label", "Label_Prediction")
    # for row in selected.collect():
    #     print(row)

    from pyspark.mllib.evaluation import MulticlassMetrics

    predictionAndLabels = prediction.select(
        "Label_Prediction",
        "LabelIndex").rdd.map(lambda r: (float(r[0]), float(r[1])))

    # predictionAndLabels = test.rdd.map(lambda lp: (float(nvModel.predict(lp.features)), lp.label))
    metrics = MulticlassMetrics(predictionAndLabels)

    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = prediction.rdd.map(lambda lp: lp.label).distinct().collect()
    labelIndices = prediction.rdd.map(
        lambda lp: lp.LabelIndex).distinct().collect()
    labelIndicesPairs = prediction.rdd.map(
        lambda lp: (lp.label, lp.LabelIndex)).distinct().collect()

    print("Labels", labels)
    print("Label Indices", labelIndices)
    print("Label Indice Pairs", labelIndicesPairs)

    for label, labelIndex in sorted(labelIndicesPairs):
        print("\n Class %s precision = %s" %
              (label, metrics.precision(labelIndex)))
        print("Class %s recall = %s" % (label, metrics.recall(labelIndex)))
        print(
            "Class %s F1 Measure = %s" %
            (label, metrics.fMeasure(labelIndex, beta=1.0)), "\n")

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          metrics.weightedFalsePositiveRate)
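Note that the no-argument `precision()`, `recall()` and `fMeasure()` calls above were deprecated in Spark 2.0; for multiclass data they all reduce to the same micro-averaged value, which newer versions expose directly. A sketch of the replacement, assuming Spark >= 2.0:

# Single replacement for the deprecated no-argument calls above.
print("Accuracy = %s" % metrics.accuracy)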
Example #8
def naive_bayes_classifier(training_df, testing_df):
    """
    Apply Naive Bayes Classifier to test data for predicting sentiment of Tweets.
    :param training_df: Trained labelled data
    :param testing_df: Test data
    :return: transformed dataframe of predicted labels for tweets
    """
    nb = NaiveBayes()
    model = nb.fit(training_df)
    return model.transform(testing_df).select(["label", "words", "prediction"])
Example #9
def naive_bayes(trainingDataFrame,
                smoothing=1.0,
                modelType="multinomial",
                weightCol="weight"):
    nb = NaiveBayes(smoothing=smoothing,
                    modelType=modelType,
                    weightCol=weightCol)
    nbModel = nb.fit(trainingDataFrame)
    result = {}
    result["model"] = nbModel
    return result
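A short usage sketch, assuming `trainingDataFrame` carries the `label`, `weight` and `features` columns that the default parameters expect (as in Example #1):

# Hypothetical usage; `trainingDataFrame` is assumed to exist.
result = naive_bayes(trainingDataFrame)
predictions = result["model"].transform(trainingDataFrame)
predictions.select("label", "prediction").show()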
Example #10
def pipeline_bayes(file,name):
    print("control entered pipeline_bayes ")
    cleaner = file.fit(name)
    cleaned = cleaner.transform(name)
    training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
    nb = NaiveBayes()
    predictor = nb.fit(training)
    test_results = predictor.transform(testing)
    return test_results
Example #11
 def train2(self):
     print("Training Model\n")
     train_df = self.load_train()
     test_df = self.load_test()
     nb = NaiveBayes()
     nb.setPredictionCol("predict_")
     nb.setFeaturesCol("features")
     nb.setLabelCol("label")
     self.__model = nb.fit(train_df)
     print("Complate\n")
     self.saveModel()
     self.testModel_df(test_df)
Example #12
def naiveBayes_predict(trainingData, testData):

	print('\n************************ Training the NaiveBayes model ************************\n')

	nb = NaiveBayes()
	nbModel = nb.fit(trainingData)

	print('\n**************************** Saving the model ****************************\n')

	nbModel.save("./models/myNaiveBayesModel")
	
	return nbModel.transform(testData)
Example #13
 def train_naive_bayes(self, smoothing=1.0):
     '''
     train dataset on naive bayes algo
     --------
     Parameters
     smoothing = float
     --------
     Returns
     None
     '''
     # create the trainer and set its parameters
     nb = NaiveBayes(smoothing=smoothing, modelType="multinomial")
     self.model = nb.fit(self.train)
Example #14
def naive_bayes(train, test, smoothing, modelType):

  nb = NaiveBayes(smoothing=smoothing, modelType=modelType)

  # Train the model
  model = nb.fit(train)

  predictions = model.transform(test)
  # evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
  # Note: BinaryClassificationEvaluator computes areaUnderROC by default, not accuracy.
  evaluator = BinaryClassificationEvaluator()
  auc = evaluator.evaluate(predictions)

  return auc
Example #15
def naive_bayes(trainingData, testData):
    from pyspark.ml.classification import NaiveBayes

    nb = NaiveBayes(smoothing=1)
    model = nb.fit(trainingData)
    predictions = model.transform(testData)
    predictions.filter(predictions['prediction'] == 0) \
        .select("Descript", "Category", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    return evaluator.evaluate(predictions)
Example #16
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def main(spark, filename):
    df = spark.read.csv(filename, header=False, inferSchema=True)
    # >>> df.show(4)
    # +---+---+---+---+-----------+
    # |_c0|_c1|_c2|_c3|        _c4|
    # +---+---+---+---+-----------+
    # |5.1|3.5|1.4|0.2|Iris-setosa|
    # |4.9|3.0|1.4|0.2|Iris-setosa|
    # |4.7|3.2|1.3|0.2|Iris-setosa|
    # |4.6|3.1|1.5|0.2|Iris-setosa|
    # +---+---+---+---+-----------+
    vector_assembler = VectorAssembler(inputCols=['_c0', '_c1', '_c2', '_c3'],
                                       outputCol='features')
    v_df = vector_assembler.transform(df)

    # >>> v_df.show(4)
    # +---+---+---+---+-----------+-----------------+
    # |_c0|_c1|_c2|_c3|        _c4|         features|
    # +---+---+---+---+-----------+-----------------+
    # |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
    # |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
    # |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
    # |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
    # +---+---+---+---+-----------+-----------------+
    # only showing top 4 rows
    indexer = StringIndexer(inputCol='_c4', outputCol='label')
    i_df = indexer.fit(v_df).transform(v_df)
    #   >>> i_df.show(4)
    # +---+---+---+---+-----------+-----------------+-----+
    # |_c0|_c1|_c2|_c3|        _c4|         features|label|
    # +---+---+---+---+-----------+-----------------+-----+
    # |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
    # |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
    # |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
    # |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
    # +---+---+---+---+-----------+-----------------+-----+
    # only showing top 4 rows
    splits = i_df.randomSplit([0.6, 0.4], 1)
    train_df = splits[0]
    test_df = splits[1]
    nb = NaiveBayes(modelType='multinomial')
    nbmodel = nb.fit(train_df)
    predictions = nbmodel.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')
    nbaccuracy = evaluator.evaluate(predictions)
    print(nbaccuracy)
Example #17
def naive_bayes_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)

    train, test = data.randomSplit([0.7, 0.3], 1234)
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(train)

    predictions = model.transform(test)
    evaluate_classification(predictions)

    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
Example #18
def bayes_classifier(training_data, test_data, validation_data):
    dt = NaiveBayes(featuresCol='scaled_features', labelCol='label', smoothing=0.00001)

    # ROC 0.43
    dtModel = dt.fit(training_data)
    predict_valid = dtModel.transform(validation_data)
    predict_valid.show(10)

    evaluate_metrics(predict_valid)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
                                              metricName="areaUnderROC")

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_valid,
                    data_type="valid_data")
Example #19
def nb_classifier(training, testing):
    # MODEL 1: NAIVE BAYES CLASSIFIER
    from pyspark.ml.classification import NaiveBayes

    #Initialize model
    nb = NaiveBayes(modelType='multinomial')
    #Fit data into model
    nb_model = nb.fit(training)
    #Test model
    nb_predictions = nb_model.transform(testing)

    #Evaluate model
    nb_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    nb_accuracy = nb_evaluator.evaluate(nb_predictions)
    return nb_accuracy
Example #20
def test_naive_bayes():
    df = spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])

    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(df)
    model.setFeaturesCol("features")

    test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
    features = test0.head().features

    result = model.predict(features)

    assert result == 1.0
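`model.predict(features)` is the single-vector prediction API added in Spark 3.0; the related driver-side helpers can be exercised the same way. A sketch, assuming Spark >= 3.0:

# Local prediction helpers, available since Spark 3.0.
raw = model.predictRaw(features)           # raw per-class scores
prob = model.predictProbability(features)  # posterior class probabilities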
Example #21
def load_classifier_model():
    # model = PipelineModel.load("./movie-robot-model")
    # print(model)
    # if model != None:
    #     return model

    data_set = ModelProcessUtil.create_train_vectors()

    df = spark.createDataFrame(data_set)

    df.show()
    nb = NaiveBayes(modelType="bernoulli")
    nb_model = nb.fit(df)
    nb_model.setFeaturesCol("features")
    # nb_model.save("./movie-robot-model")
    nb_model.write().overwrite().save("./movie-robot-model")

    return nb_model
Example #22
def naive_bayes_generator(training_data, deal_id):
    ####In:
    #A training data set, as generated by data_prep()
    #The deal_id you want to generate a model for

    ####Out
    #The model is saved
    #An update message is outputted

    training_data = training_data.withColumnRenamed(deal_id, 'label')
    model = NaiveBayes(smoothing=10, modelType="bernoulli")
    model = model.fit(training_data)
    model.write().overwrite().save(
        f"s3://rtl-databricks-datascience/lpater/naive_bayes/{deal_id}/")
    output_message = "Saved a Naive Bayes model for " + deal_id + "."

    # see also: https://spark.apache.org/docs/latest/ml-classification-regression.html
    return output_message
Example #23
def bayes_cv(business_id):
    """
    Crossvalidation of bayes model
    """
    spark = yelp_lib.spark
    review = yelp_lib.get_parq('review')
    business_df = review.filter(review['business_id'] == business_id)

    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    wordsDataFrame = regexTokenizer.transform(business_df)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned = remover.transform(wordsDataFrame)

    star_mapping = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0}

    cleaned = cleaned.replace(star_mapping, subset=['stars'])
    cleaned = cleaned.withColumn("stars", cleaned["stars"].cast("double"))

    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(cleaned)
    vectorized = model.transform(cleaned)

    vectorized = vectorized.select(
        col('stars').alias('label'), col('features'))

    splits = vectorized.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0)
    # train the model
    nb_model = nb.fit(train)
    # compute accuracy on the test set
    result = nb_model.transform(test)

    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return "Accuracy: " + str(evaluator.evaluate(predictionAndLabels))
Example #24
def filter_detections(spark, resources_folder):
    messages = spark.read.csv(resources_folder + 'SMSSpamCollection',
                              inferSchema=True,
                              sep='\t')
    messages.printSchema()
    messages.show()
    messages = messages.withColumnRenamed('_c0', 'class').withColumnRenamed(
        '_c1', 'text')
    messages.show()

    messages = messages.withColumn('length', length(messages['text']))
    messages.show()
    messages.groupBy('class').mean().show()
    tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
    stop_remover = StopWordsRemover(inputCol='token_text',
                                    outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='count_vec')
    # idf = inverse document frequency
    # tf = term frequency
    idf = IDF(inputCol='count_vec', outputCol='tf_idf')
    ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')

    assembler = VectorAssembler(inputCols=['tf_idf', 'length'],
                                outputCol='features')
    nb = NaiveBayes()

    data_pre_pipeline = Pipeline(stages=[
        ham_spam_to_numeric, tokenizer, stop_remover, count_vec, idf, assembler
    ])

    clean_data = data_pre_pipeline.fit(messages).transform(messages)
    clean_data.show()
    clean_data = clean_data.select('label', 'features')
    training_messages, test_messages = clean_data.randomSplit([0.7, 0.3])
    spam_detector = nb.fit(training_messages)
    test_results = spam_detector.transform(test_messages)
    test_results.show()

    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    acc = acc_eval.evaluate(test_results)
    print("ACC of NB Model")
    print(acc)
Example #25
    def fit(self):

        # Load the pickled training data and build a Spark DataFrame from it
        pkl_file = open('data.pkl', 'rb')
        train_data = pickle.load(pkl_file)

        df = self.spark.createDataFrame([
            Row(label=train_data[j][0],
                weight=0.1,
                features=Vectors.dense(train_data[j][1][i])) for j in range(14)
            for i in range(len(train_data[j][1]))
        ])

        nb = NaiveBayes(smoothing=1.0,
                        modelType="multinomial",
                        weightCol="weight")
        # nb = DecisionTreeClassifier()
        print("训练正在开始-------------->")
        model = nb.fit(df)
        model.save(self.model_path)
Example #26
def naive_bayes(train, test):
    """Naive Bayes model. It uses cross validation to calculate the best smoothing
        value to train the model."""
    nb = NaiveBayes(modelType="multinomial", featuresCol='scaledFeatures')
    grid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.5, 1.0]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=nb,
                        estimatorParamMaps=grid,
                        evaluator=evaluator)
    cv_model = cv.fit(train)
    best_model = cv_model.bestModel
    best_smooth = best_model._java_obj.getSmoothing()
    """Training with the best smoothing value"""
    best_nb = NaiveBayes(smoothing=best_smooth,
                         modelType="multinomial",
                         featuresCol='scaledFeatures')
    nb_model = best_nb.fit(train)
    predictions = nb_model.transform(test)

    return predictions
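The `best_model._java_obj.getSmoothing()` call above reaches through the private JVM handle; recent PySpark exposes the fitted parameter directly on the model. A public-API equivalent, assuming Spark >= 3.0:

# Public-API equivalent of the _java_obj call (Spark >= 3.0).
best_smooth = cv_model.bestModel.getSmoothing()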
Example #27
def naive_bayes(df, seed):
    # Drop preferred_foot because it's the only categorical column, the others are all numerical
    # Use preferred_foot if we have time to implement it
    df = df.drop("preferred_foot")

    labelIndexer = StringIndexer(inputCol="team_position", outputCol="label").fit(df)
    df = labelIndexer.transform(df)
    df = df.drop("team_position")

    list_of_features = df.drop("label").columns  # Get list of all features
    assembler = VectorAssembler(inputCols=list_of_features, outputCol="features")
    df = assembler.transform(df)

    (train_data, test_data) = df.randomSplit([0.8, 0.2], seed)

    n_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")

    model = n_bayes.fit(train_data)  # Training happens here

    predictions = model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    y_true = predictions.select(['label']).collect()
    y_pred = predictions.select(['prediction']).collect()

    print("Classification report and confusion matrix for Naive Bayes:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    confusion_matrix_corrected = [[cm[1][1], cm[1][2], cm[1][0]], [cm[2][1], cm[2][2], cm[2][0]],
                                  [cm[0][1], cm[0][2], cm[0][0]]]
    print("")
    print(confusion_matrix_corrected[0])
    print(confusion_matrix_corrected[1])
    print(confusion_matrix_corrected[2])

    cm = np.array([confusion_matrix_corrected[0], confusion_matrix_corrected[1], confusion_matrix_corrected[2]])

    return accuracy, cm
Example #28
def train(spark):
    sc = spark.sparkContext
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=8000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    srcdf = sc.textFile('part.csv').map(parse_line)
    srcdf = srcdf.toDF()
    training, testing = srcdf.randomSplit([0.9, 0.1])

    wordsData = tokenizer.transform(training)
    featurizedData = hashingTF.transform(wordsData)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.persist()

    trainDF = rescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    naivebayes = NaiveBayes()
    model = naivebayes.fit(trainDF)

    testWordsData = tokenizer.transform(testing)
    testFeaturizedData = hashingTF.transform(testWordsData)
    # Reuse the IDF model fitted on the training set; refitting IDF on the
    # test set would produce inconsistent features (and leak test statistics).
    testRescaledData = idfModel.transform(testFeaturizedData)
    testRescaledData.persist()

    testDF = testRescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    predictions = model.transform(testDF)
    predictions.show()

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy on test-set is " + str(accuracy))
    model.save('Bayes20000')
Example #29
 def nbModel(self, dfTrain, dfTest, seed):
     client = mlflow.tracking.MlflowClient()
     mlflow.set_experiment("gML NB")
     mlflow.end_run()
     mlflow.start_run()
     nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
     model = nb.fit(dfTrain)
     predictions = model.transform(dfTest)
     metrics = ["accuracy", "f1"]
     result = []
     for metric in metrics:
         evaluator = MulticlassClassificationEvaluator(
             labelCol="label",
             predictionCol="prediction",
             metricName=metric)
         v = evaluator.evaluate(predictions)
         mlflow.log_metric(metric, v)
         # print("  {}: {}".format(metric,v))
         temp = [metric, v]
         result.append(temp)
     mlflow.spark.log_model(model, "nbModel")
     return result
Example #30
def driver(takeSample=False):
    data_df, features = feature_eng.preprocess_features2(takeSample=takeSample)
    data_df.cache()
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data_df.randomSplit([0.7, 0.3])
    trainingData = sampling.undersample(trainingData, class_ratio=0.6)

    # create the trainer and set its parameters
    nb = NaiveBayes(labelCol='TARGET',
                    featuresCol='OCCUPATION_TYPE',
                    smoothing=1.0,
                    modelType="multinomial")

    # Train model.  This also runs the indexers.
    model = nb.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)
    predictions.select('TARGET', 'rawPrediction', 'prediction',
                       'probability').show(20)

    return multiple_evaluator(predictions)
Example #31
from pyspark.sql import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NaiveBayesExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")

    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

# COMMAND ----------

# finding the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")
evaluator.evaluate(predictions)

# COMMAND ----------

# applying Naive Bayes using the "Text" column to predict "Sentiment"
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)

predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


# COMMAND ----------

# finding the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")
evaluator.evaluate(predictions)
Example #33
#ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

#CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# NAIVE BAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

#Model training
model = nb.fit(rescaledData)

#Model Saving
model.write().overwrite().save("./NB_model")

#Predictions
pred = model.transform(rescaledData)

# Displaying top 5 prediction values
pred.select('prediction').show(5)
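The example scores the same DataFrame it was trained on but never measures fit quality; a minimal evaluation sketch for the training-set predictions, reusing the evaluator seen in earlier examples:

# Minimal sketch: training-set accuracy of the saved model.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="class_res",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Training-set accuracy:", evaluator.evaluate(pred))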