def train(self):
    """Fit a Naive Bayes model on the stored documents and keep it on the instance.

    Each entry of ``self.__documents`` is indexed by ``'label'`` and
    ``'text'``. The text is passed through ``compTF`` and paired with the
    label in a ``LabeledPoint``; the resulting RDD is then fed to ``compIDF``
    and ``compTFIDF`` before ``NaiveBayes.train`` is called on the result.
    The trained model is stored in ``self.__model``; nothing is returned.

    NOTE(review): judging by their names, ``compTF`` / ``compIDF`` /
    ``compTFIDF`` compute hashed TF, IDF and TF-IDF features -- they are
    defined outside this block, confirm against their definitions.
    """
    self.logger.log("info", "Training Model")

    def to_labeled_point(document):
        # One labelled feature vector per input document.
        return LabeledPoint(document['label'], compTF(document['text']))

    # Distribute the in-memory documents through the instance's Spark context.
    documents_rdd = self.__sc.parallelize(self.__documents)
    tf_points = documents_rdd.map(to_labeled_point)

    # The IDF result is derived from the TF points and then combined with them.
    idf_result = compIDF(tf_points)
    tfidf_points = compTFIDF(tf_points, idf_result)

    self.__model = NaiveBayes.train(tfidf_points)
    self.logger.log("info", "Complate")
# Train and evaluate a Naive Bayes classifier on the Immunotherapy data set.
#
# The data is loaded as an RDD (sc.textFile(...).map(parseLine)), so the whole
# script uses the RDD-based pyspark.mllib API. The previous version mixed in
# DataFrame-only calls (fit / transform / show / MulticlassClassificationEvaluator)
# that cannot operate on an RDD, and left the evaluator call unterminated.
#
# NOTE(review): NaiveBayes.train requires an RDD of LabeledPoint, so parseLine
# (defined elsewhere) is assumed to return LabeledPoint objects -- confirm.
# The import is kept next to its use so this script always gets the RDD-based
# NaiveBayes, whichever NaiveBayes the top of the file imports.
from pyspark.mllib.classification import NaiveBayes

# Load training data
sc = SparkContext(appName="PythonNaiveBayesExample")
data = sc.textFile("C:\\PySpark_MLib\\data\\classification\\Immunotherapy.csv").map(parseLine)

# Split the data into train and test (fixed seed 0 for reproducible splits)
splits = data.randomSplit([0.6, 0.4], 0)
train = splits[0]
test = splits[1]

# Train the model; the second argument is the additive smoothing parameter.
model = NaiveBayes.train(train, 1.0)

# Pair every test point's predicted label with its true label.
predictions = test.map(lambda p: (model.predict(p.features), p.label))

# Show example rows (20, like DataFrame.show() did by default).
for predicted_label, true_label in predictions.take(20):
    print("prediction = " + str(predicted_label) + ", label = " + str(true_label))

# Compute accuracy on the test set; guard against an empty test split.
test_count = test.count()
if test_count:
    correct_count = predictions.filter(lambda pair: pair[0] == pair[1]).count()
    accuracy = 1.0 * correct_count / test_count
else:
    accuracy = 0.0
print("Test set accuracy = " + str(accuracy))
new_string = "1 " elif "Iris-virginica" in line[-1]: new_string = "3 " elif "Iris-versicolor" in line[-1]: new_string = "3 " # new_string = line[0] + " " count = 1 for i in line[:-1]: new_string += str(count) + ":" + str(i) + " " count += 1 new_string += "\n" output_string += new_string # print(repr(row)) # print(repr(new_string)) # break output_file = open("iris.txt", "w") output_file.write(output_string) output_file.close() sc = SparkContext() sc.setLogLevel('ERROR') data = MLUtils.loadLibSVMFile(sc, "iris.txt") training, test = data.randomSplit([0.8, 0.2]) model = NaiveBayes.train(training, 1.0) # Make prediction and test accuracy. predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label)) accuracy = 1.0 * predictionAndLabel.filter( lambda (x, v): x == v).count() / test.count() print('model accuracy {}'.format(accuracy))