# Example #1
 def train(self):
     """Train a Naive Bayes classifier on the stored documents.

     Parallelizes ``self.__documents`` (each presumably a dict with
     'label' and 'text' keys — confirm against the loader), builds
     TF vectors, reweights them with IDF, and fits an MLlib NaiveBayes
     model, which is stored in ``self.__model``.
     """
     self.logger.log("info", "Training Model")
     raw_data = self.__sc.parallelize(self.__documents)
     # One LabeledPoint per document: (label, term-frequency vector).
     raw_hashed_tf = raw_data.map(
         lambda dic: LabeledPoint(dic['label'], compTF(dic['text'])))
     raw_hashed_idf = compIDF(raw_hashed_tf)
     raw_hashed_tfidf = compTFIDF(raw_hashed_tf, raw_hashed_idf)
     self.__model = NaiveBayes.train(raw_hashed_tfidf)
     # Fixed typo in log message: was "Complate".
     self.logger.log("info", "Complete")
# Example #2
# Load training data.
sc = SparkContext(appName="PythonNaiveBayesExample")
data = sc.textFile("C:\\PySpark_MLib\\data\\classification\\Immunotherapy.csv").map(parseLine)

# Split the data into train and test (seed 0 for reproducibility).
splits = data.randomSplit([0.6, 0.4], 0)
train = splits[0]
test = splits[1]

# Create the trainer and set its parameters.
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# Train the model.
# NOTE(review): NaiveBayes.fit (pyspark.ml) expects a DataFrame, but `train`
# here is an RDD from textFile().map(parseLine) — confirm parseLine returns
# rows convertible upstream. Removed the stray
# `model = NaiveBayes.train(train, 1.0)` (mllib RDD API) that immediately
# clobbered this fitted model: the mllib NaiveBayesModel has no .transform(),
# so the predictions step below would have failed.
model = nb.fit(train)

# Select example rows to display.
predictions = model.transform(test)
predictions.show()

# Compute accuracy on the test set. (The original had the tail of this call,
# including the closing parenthesis, commented out — a syntax error.)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
            new_string = "1 "
        # NOTE(review): both Iris-virginica and Iris-versicolor map to
        # label "3" here; versicolor was presumably meant to be "2" —
        # confirm the intended class labels.
        elif "Iris-virginica" in line[-1]:
            new_string = "3 "
        elif "Iris-versicolor" in line[-1]:
            new_string = "3 "
        # new_string = line[0] + " "
        # Append the feature columns in LibSVM format:
        # "<label> 1:v1 2:v2 ..." (1-based feature indices).
        count = 1
        for i in line[:-1]:
            new_string += str(count) + ":" + str(i) + " "
            count += 1
        new_string += "\n"
        output_string += new_string
        # print(repr(row))
        # print(repr(new_string))
        # break
    # Write the accumulated LibSVM-formatted rows to disk for loading below.
    output_file = open("iris.txt", "w")
    output_file.write(output_string)
    output_file.close()

    # Train and evaluate an MLlib NaiveBayes model on the LibSVM file
    # written above.
    sc = SparkContext()
    sc.setLogLevel('ERROR')
    data = MLUtils.loadLibSVMFile(sc, "iris.txt")
    training, test = data.randomSplit([0.8, 0.2])
    # 1.0 is the Laplace smoothing parameter.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    # Fixed: `lambda (x, v): x == v` is Python 2 tuple-parameter syntax
    # (removed by PEP 3113) and a SyntaxError in Python 3 — index the
    # (prediction, label) pair instead.
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))