def evaluate_model(model_type):
    # NOTE: the parameter was renamed from `type`, which shadows the builtin.
    if model_type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif model_type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif model_type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
    # Return the loaded model so callers can run the evaluation.
    return model
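# A minimal usage sketch for the loader above, assuming `sc` is a live
# SparkContext, the model paths exist, and a `parse_lines` helper (like the
# one used in the scoring driver later in this section) maps each input line
# to a LabeledPoint.
labeled_points = sc.textFile("features.txt").map(parse_lines)  # hypothetical input file
model = evaluate_model('tree')
predictions = model.predict(labeled_points.map(lambda p: p.features))
labels_and_preds = labeled_points.map(lambda p: p.label).zip(predictions)
accuracy = labels_and_preds.filter(lambda lp: lp[0] == lp[1]).count() / float(labeled_points.count())
print("Accuracy: {}".format(accuracy))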
def index():
    # NOTE: creating a SparkContext inside a request handler will fail on the
    # second request (only one context may be active per process); load the
    # context and model once at application startup instead.
    conf = SparkConf().setAppName("TaxiWeb")
    sc = SparkContext(conf=conf)
    model = DecisionTreeModel.load(
        sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")
    return render_template("home.html")
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path does not exist!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
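# The if/elif chain above can be collapsed into a dispatch table; a minimal
# alternative sketch, assuming the same MLlib loader classes, that avoids
# growing the chain as model types are added.
LOADERS = {
    'kmeans': KMeansModel,
    'fpgrowth': FPGrowthModel,
    'logistic-regression': LogisticRegressionModel,
    'word2vec': Word2VecModel,
    'decision-tree': DecisionTreeModel,
}

def load_model(sc, model_type, model_path):
    try:
        loader = LOADERS[model_type]
    except KeyError:
        raise Exception("Invalid model type!")
    return loader.load(sc, model_path)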
def loadModel():
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)
    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")
    return clusterModel, classificationModel
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        indices = [int(i) for i in index.collect()]
        return DecisionTreeModel.load(sc, path + 'model-' + file), indices
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # in case the Feature Selection step is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)
        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.
        model = DecisionTree.trainClassifier(data, numberClasses, {})  # maxDepth=5, maxBins=32
        model.save(sc, path + 'model-' + file)
        return model, index
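# path_exist is not defined in this snippet; a plausible helper, sketched here
# under the assumption that the model lives on a Hadoop-compatible filesystem,
# mirrors the FileSystem check used in model_instream above.
def path_exist(p):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    return fs.exists(sc._jvm.org.apache.hadoop.fs.Path(p))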
def saveModel(self):
    # Save the model to the given path.
    self.tree_model.save(self.sc, "trained")
    # Re-load the saved model.
    self.tree_model = DecisionTreeModel.load(self.sc, "trained")
    # Re-evaluate.
    self.evaluate()
def main(sc, filename):
    '''
    The driver for the Spark scoring application; it generates predictions
    for a given file of features and target variables.
    '''
    rawDataRdd = sc.textFile(filename)
    print("Data Size: {}".format(rawDataRdd.count()))

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    # Load models.
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    # Logistic regression predictions.
    labels_and_preds = labeledPointsRdd.map(
        lambda p: (float(logit_model.predict(p.features)), p.label))
    labels_and_preds_collected = labels_and_preds.collect()
    print("\nPredictions: Logistic Regression")
    y_true = []
    y_pred = []
    for pred, actual in labels_and_preds_collected:
        y_true.append(actual)
        y_pred.append(pred)
        # print("predicted: {0} - actual: {1}".format(pred, actual))
    accuracy = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count() / float(labeledPointsRdd.count())
    print_box()
    print("Prediction Accuracy (Logistic): {}".format(round(accuracy, 4)))
    print_box()
    print("")

    # Decision tree predictions.
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    accuracy_dt = labels_and_preds_dt.filter(lambda vp: vp[0] == vp[1]).count() / float(labeledPointsRdd.count())
    print_box()
    print("Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4)))
    print_box()
    print("")

    # Random forest predictions.
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda vp: vp[0] == vp[1]).count() / float(labeledPointsRdd.count())
    print_box()
    print("Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4)))
    print_box()
def process(reviews):
    if reviews.isEmpty():
        return

    model_name = "dt"
    updated_model = "dt0"
    model_path, data_path, metadata_path = '', '', ''

    # Loop to check the availability of the newest model classifier on HDFS.
    for i in range(25, -1, -1):
        model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
        updated_model = model_name + str(i)
        data_path = model_path + "/data/part-r*"
        metadata_path = model_path + "/metadata/part-00000"
        if not patherror(data_path) and not patherror(metadata_path):
            break

    # Load the model classifier.
    model = DecisionTreeModel.load(sc, model_path)

    start = time.time()
    reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

    Words = Row('label', 'words')
    words = reviews.map(lambda r: Words(*r))
    words_df = spark.createDataFrame(words)

    # Review tokenization.
    token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                           inputCol="words", outputCol="token", toLowercase=True)
    token_filtered = token.transform(words_df)

    # Stop-word elimination.
    remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                               caseSensitive=False)
    stopwords_filtered = remover.transform(token_filtered)

    prep_filtered = stopwords_filtered.select('stopwords').rdd.map(lambda x: x[0])

    # TF-IDF calculation.
    tf = HashingTF(numFeatures=numFeatures).transform(
        prep_filtered.map(porter_stem, preservesPartitioning=True))
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    prediction = model.predict(tfidf)
    labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
    metrics = MulticlassMetrics(labeled_prediction)

    output = reviews.zip(prediction)
    filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
    output.saveAsTextFile(filename)
    end = time.time()

    print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
          metrics.precision(0.0), ';', metrics.precision(1.0), ';',
          metrics.recall(0.0), ';', metrics.recall(1.0), ';',
          metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
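# patherror is not defined in this snippet; a plausible helper, assuming the
# intent is "True when the HDFS path cannot be read", probes the path with a
# throwaway read and treats any failure as a missing path.
def patherror(path):
    try:
        sc.textFile(path).first()
        return False
    except Exception:
        return True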
def get_dt_model(sc, train=None):
    model_path = 'dt.model'
    if train is None:
        model = DecisionTreeModel.load(sc, model_path)
    else:
        model = DecisionTree.trainClassifier(train, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=10)
        model.save(sc, model_path)
    return model
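# A short usage sketch for the train-or-load helper above; `training_rdd` is a
# hypothetical RDD of LabeledPoint. The first call trains and persists the
# model, later calls reload it from disk.
model = get_dt_model(sc, train=training_rdd)  # trains and saves to 'dt.model'
model = get_dt_model(sc)                      # later: loads the saved model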
def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print(dtmodel.toDebugString())
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
def init_spark_context():
    global predictionModel
    # Load the Spark context.
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker.
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])
    # Absolute path in HDFS; to run locally, remove the first slash,
    # i.e. 'my_model1', not '/my_model1'.
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')
    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')
    return sc
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")

sc.stop()
    label = clean_line_split[10]
    nonLable = clean_line_split[0:10] + clean_line_split[11:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(testData.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
    # Cancelled becomes the 9th column now, and total columns in the data = 9.
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-W-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-00-08")

sc.stop()
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-N-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-00-08")

sc.stop()
from pyspark import SparkConf, SparkContext
import urllib.request
import urllib
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time
import createLabeledPoint
from createLabeledPoint import *

try:
    sc.stop()
except:
    pass
sc = SparkContext.getOrCreate(SparkConf())

testm = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_model")
testm_portsweep = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_portsweep_model")

test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
typename = test_raw_data.filter(lambda x: 'portsweep' in x)

cur = 0
count = typename.count()
for idx in range(count):
    typename_pd = typename.zipWithIndex().filter(lambda x: x[1] == idx).map(lambda x: x[0])
    test_csv_data = typename_pd.map(lambda x: x.split(","))
    test_data = test_csv_data.map(create_labeled_point)
    predictions = testm_portsweep.predict(test_data.map(lambda p: p.features))
    if str(predictions.take(1)[0]) == "3.0":
        print(typename_pd.collect())
        cur = cur + 1
.setAppName("Mlib") .set("spark.executor.memory", "1g")) sc = SparkContext(conf = conf) dv1 =np.array([1.0,0.0,3.0]) dv2= [1.0,0.0,3.0] sv1 = Vectors.sparse(3,[0,2],[1.0,3.0]) sv2 = sps.csc_matrix((np.array([1.0,3.0]),np.array([0,2]),np.array([0,2])),shape=(3,1)) print sv2 data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt') (trainingData, testData) = data.randomSplit([0.7, 0.3]) model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) # Save and load model model.save(sc, "model_data") sameModel = DecisionTreeModel.load(sc, "model_data")
conf = SparkConf() conf.setAppName("TA") sc = SparkContext(conf=conf) tre = StreamingContext(sc, 10) htf = HashingTF(50000) NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes' NB_model = NaiveBayesModel.load(sc, NB_directory) LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression' LR_model = LogisticRegressionModel.load(sc, LR_directory) DT_output_dir = 'hdfs://master:9000/user/hadoop/DT' DT_model = DecisionTreeModel.load(sc, DT_output_dir) voted_classifier = VoteClassifier(NB_model, LR_model, DT_model) def sentiment(test_sample): sample_data_test = test_sample.split(" ") cli = htf.transform(sample_data_test) return voted_classifier.classify(cli) lines = tre.socketTextStream(socket.gethostbyname(socket.gethostname()), 10000) lines.pprint() tweets = lines.flatMap(lambda text: [(text)]) tweets.pprint()
    # Cancelled becomes the 6th column now, and total columns in the data = 6.
    label = clean_line_split[5]
    nonLable = clean_line_split[0:5]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-N-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-95-08")

sc.stop()
import csv
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# Remove the first (header) line from the csv file.
def clean(x):
    if x[29] != "Amount":
        return x

# Turn the data into a labeled point using 30 dimensions.
def normalize(x):
    return LabeledPoint(float(x[30]), [float(x[0]), float(x[29]) / 25691.16])

sameModel = DecisionTreeModel.load(sc, "./decisiontreefraud")

# Make a Spark conf.
conf = (SparkConf().setMaster("local").setAppName("My app")
        .set("spark.executor.memory", "4g"))

# Files have to be added while running to see the data in the stream.
ssc = StreamingContext(sc, 1)
lines1 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/b")
trainingData = lines1.map(lambda line: LabeledPoint(
    float(line.split(" ")[1]),
    [(line.split(" ")[0]), (line.split(" ")[2])])).cache()
trainingData.pprint()
lines2 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/c")
def getModel(self, path):
    if self.type == 'NaiveBayes':
        return NaiveBayesModel.load(self.sc, path)
    elif self.type == 'DecisionTree':
        return DecisionTreeModel.load(self.sc, path)
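# A hedged refinement sketch: the dispatcher above silently returns None for
# unknown types; raising instead (as model_instream does earlier in this
# section) fails fast and is easier to extend.
def getModel(self, path):
    loaders = {'NaiveBayes': NaiveBayesModel, 'DecisionTree': DecisionTreeModel}
    if self.type not in loaders:
        raise ValueError("Unsupported model type: %s" % self.type)
    return loaders[self.type].load(self.sc, path)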
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        indices = [int(i) for i in index.collect()]
        return DecisionTreeModel.load(sc, path + 'model-' + file), indices
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # in case the Feature Selection step is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)  # RDD-based path, superseded by the DataFrame pipeline below

        # Load CSV data.
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create a vector assembler to produce a feature vector for each record for use in MLlib.
        # The first 45 csv fields are features; the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] + schema.names[3:-1],
                                    outputCol="features")

        # Assemble the feature vector in a new dataframe.
        assembledData = assembler.transform(data2)

        # Create label and feature indexers to speed up categorical columns for the decision tree.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(labelIndexed)

        # Create a DecisionTree model trainer.
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline:
        # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        # model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)

        # model = DecisionTree.trainClassifier(data, numberClasses, {})  # maxDepth=5, maxBins=32
        # model.save(sc, path + 'model-' + file)
        return model, index
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Decision Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# Split the data into training and test sets.
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a decision tree model.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model: ' + model.toDebugString())

# Save and load the model.
model.save(sc, '../model/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeClassificationModel')
sc.stop()
import json
import requests
from flask import Flask, render_template
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from numpy import array

app = Flask(__name__)

conf = SparkConf()
conf.setAppName("Classification")
try:
    sc.stop()
except:
    pass
sc = SparkContext(conf=conf, pyFiles=[
    '/home/ubuntu/project_src/flaskapp/createLabeledPoint.py',
    '/home/ubuntu/project_src/flaskapp/ClassSet.py',
    '/home/ubuntu/project_src/flaskapp/FuncSet.py',
    '/home/ubuntu/project_src/flaskapp/hello.py'])
# testm = DecisionTreeModel.load(sc, "hdfs://*****:*****")

@app.route('/')
def hello_world():
    return 'Hello from python!'

@app.route('/index')
def index():
    return render_template("index.html")

@app.route('/train')
def trainodule():
    pass

@app.route('/getSpkTstCnt')
def runclass():
    # testm = DecisionTreeModel.load(sc, "hdfs://ip-172-31-1-239:9000/home/ubuntu/project_src/tree_model")
    test_data_file = "hdfs://ip-172-31-1-239:9000/user/ubuntu/corrected.gz"
    test_raw_data = sc.textFile(test_data_file)
data = MLUtils.loadLibSVMFile(sc, dataPath)

# Split the dataset into training and test sets.
(trainingData, testData) = data.randomSplit([0.7, 0.3])
print("train data count: " + str(trainingData.count()))
print("test data count : " + str(testData.count()))

# Train a decision tree classifier.
# An empty categoricalFeaturesInfo indicates that all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Predict on the test dataset.
predictions = model.predict(testData.map(lambda x: x.features))
# Zip the true labels with the predictions.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Compute the fraction of misclassified samples.
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Decision Tree Test Error = %5.3f%%' % (testErr * 100))
print("Decision Tree Learned classification tree model : ")
print(model.toDebugString())

# Save and load the trained model.
modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myDecisionTreeClassificationModel"
model.save(sc, modelPath)
sameModel = DecisionTreeModel.load(sc, modelPath)
sc = SparkContext(appName="PythonDecisionTreeRegressionExample") # $example on$ # Load and parse the data file into an RDD of LabeledPoint. data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a DecisionTree model. # Empty categoricalFeaturesInfo indicates all features are continuous. model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\ float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression tree model:') print(model.toDebugString()) # Save and load model model.save(sc, "target/tmp/myDecisionTreeRegressionModel") sameModel = DecisionTreeModel.load( sc, "target/tmp/myDecisionTreeRegressionModel") # $example off$
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# NOTE: a stray module-level sc.stop() was removed here; `sc` is not defined
# until the main guard below, so it raised a NameError at import time.
if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")
    sc.setLogLevel("ERROR")
    model1 = DecisionTreeModel.load(sc, "runs")
    model2 = DecisionTreeModel.load(sc, "wickets")
    batsmen_cluster = {}
    bowler_cluster = {}
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_batsmen.csv') as f:
        for line in f:
            ar = line.split(',')
            a = []
            a.append(int(ar[0]))
            a.append(float(ar[3]))
            a.append(float(ar[4]))
            batsmen_cluster[ar[2]] = a
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_bowler.csv') as f:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
conf = SparkConf().setAppName('Decision Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load data.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# Split the data into training and test sets.
(training, testData) = data.randomSplit([0.7, 0.3])

# Train a decision tree regression model.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda x: x.label).zip(predictions)
testMSE = labelAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(
    testData.count())
print('Test Mean Squared Error: ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, '../model/myDecisionTreeRegressionModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeRegressionModel')
sc.stop()
                .replace('.0', '')\
                .replace('feature ', '')\
                .replace('Predict: ', '#')\
                .replace('not in', '5')\
                .replace(' ', '|')\
                .replace('in', '4')\
                .replace('>=', '0')\
                .replace('<=', '1')\
                .replace('>', '2')\
                .replace('<', '3')\
                .replace('Root:', '')
            print(paths, file=f2)
        else:
            walk(dic['children'], path + dic['name'] + ':')
    f2.close()

if __name__ == "__main__":
    dtModelFile = "output/DTModel"
    dtModelResults = "decisionTreeModel.txt"
    sc = SparkContext("local[20]", "DecisionTreeClassification")
    dtModel = DecisionTreeModel.load(sc, dtModelFile)
    dtree = dtModel.toDebugString()
    print(dtree)
    # tree_C(dtree, dtModelResults)
    tree_json(dtree, dtModelResults)
    # tree_rule(dtree, dtModelResults)
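# For reference, a minimal sketch of the toDebugString() output these rewrites
# operate on (shape only; the actual tree, features, and thresholds depend on
# the trained model):
#   DecisionTreeModel classifier of depth 1 with 3 nodes
#     If (feature 0 <= 0.5)
#      Predict: 0.0
#     Else (feature 0 > 0.5)
#      Predict: 1.0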
# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
sameModel = DecisionTreeModel.load(
    sc, "target/tmp/myDecisionTreeClassificationModel")
# $example off$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(
    sc, 'file:///usr/local/spark/data/mllib/sample_libsvm_data.txt'
)  # The code on the web is wrong; this is the correct path.
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(
    sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
    # Cancelled becomes the 9th column now, and total columns in the data = 9.
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-W-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-95-08")

sc.stop()
print "######################################################\n" print "######################################################\n" print "######### Start!!! #######\n" print "######################################################\n" print "######################################################\n" print "\n\n\n" #stop_rdd = rdd_tweets.coalesce(1) #stop_rdd.saveAsTextFile(output_path) print "****************************************************\n" print "Here is the last step\n" print "****************************************************\n" #Here is the trainning steps. binladen_model = DecisionTreeModel.load(sc, binladen_model_path) # #training_data = MLUtils.loadLibSVMFile(sc, training_path) test_data = rdd_labelFeatures # Evaluate model on test instances and compute test error predictions = binladen_model.predict(test_data.map(lambda x: x.features)) # test the error value labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v!=p).count() / float(test_data.count()) tmp_str = 'Test Error = ' + str(testErr) print(tmp_str) log_write(tmp_str) print "\n\n" #featuresAndPredictions = test_data.flatMap(lambda words: resplit_only_feature(words))\ # .zip(predictions)
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")

sc.stop()
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("TaxiWeb")
sc = SparkContext(conf=conf)
model = DecisionTreeModel.load(sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")

sc.stop()
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import DecisionTreeModel

sc = SparkContext()
spark = SparkSession(sc)

inputDF = spark.read.csv('s3://himaniproject2/ValidationDataset.csv',
                         header='true', inferSchema='true', sep=';')
transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = DecisionTreeModel.load(sc, "s3://himaniproject2/model")

predictions = model.predict(transformed_df.map(lambda x: x.features))
labels_and_predictions = transformed_df.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    transformed_df.count())
print(".........................................................")
print("Model accuracy....................: %.3f%%" % (acc * 100))

# MulticlassMetrics expects (prediction, label) pairs, so swap the zipped order.
metrics = MulticlassMetrics(labels_and_predictions.map(lambda x: (x[1], x[0])))
fscore = metrics.fMeasure()
print(".........................................................")
print("F1 Score.................................. = %s" % fscore)
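# A short follow-on sketch: per-label and weighted variants are available from
# the same MulticlassMetrics object (pyspark.mllib.evaluation), which can be
# more informative than a single aggregate F1 on imbalanced data.
print("Weighted F1......: %s" % metrics.weightedFMeasure())
print("Precision(1.0)...: %s" % metrics.precision(1.0))
print("Recall(1.0)......: %s" % metrics.recall(1.0))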
        .setMaster(master)
        .setAppName(app_name))
sc = SparkContext(conf=conf)

lines = sc.textFile(input)
parsedData = lines.map(parseLine)
(trainingData, testData) = parsedData.randomSplit([0.5, 0.5])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
predictions.foreach(my_print)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions.foreach(my_print)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, output)
sameModel = DecisionTreeModel.load(sc, output)

sc.stop()