def evaluate_model(model_type):
    # NOTE: the parameter was renamed from `type`, which shadows the builtin.
    if model_type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif model_type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif model_type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
    # Return the loaded model so callers can run the evaluation.
    return model
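# A minimal usage sketch for the loader above, assuming `sc` is a live
# SparkContext, the model paths exist, and a `parse_lines` helper (like the
# one used in the scoring driver later in this section) maps each input line
# to a LabeledPoint.
labeled_points = sc.textFile("features.txt").map(parse_lines)  # hypothetical input file
model = evaluate_model('tree')
predictions = model.predict(labeled_points.map(lambda p: p.features))
labels_and_preds = labeled_points.map(lambda p: p.label).zip(predictions)
accuracy = labels_and_preds.filter(lambda lp: lp[0] == lp[1]).count() / float(labeled_points.count())
print("Accuracy: {}".format(accuracy))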
def index():
    # NOTE: creating a SparkContext inside a request handler will fail on the
    # second request (only one context may be active per process); load the
    # context and model once at application startup instead.
    conf = SparkConf().setAppName("TaxiWeb")
    sc = SparkContext(conf=conf)
    model = DecisionTreeModel.load(
        sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")
    return render_template("home.html")
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path does not exist!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
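# The if/elif chain above can be collapsed into a dispatch table; a minimal
# alternative sketch, assuming the same MLlib loader classes, that avoids
# growing the chain as model types are added.
LOADERS = {
    'kmeans': KMeansModel,
    'fpgrowth': FPGrowthModel,
    'logistic-regression': LogisticRegressionModel,
    'word2vec': Word2VecModel,
    'decision-tree': DecisionTreeModel,
}

def load_model(sc, model_type, model_path):
    try:
        loader = LOADERS[model_type]
    except KeyError:
        raise Exception("Invalid model type!")
    return loader.load(sc, model_path)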
def loadModel():
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)
    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")
    return clusterModel, classificationModel
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        indices = [int(i) for i in index.collect()]
        return DecisionTreeModel.load(sc, path + 'model-' + file), indices
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # in case the Feature Selection step is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)
        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.
        model = DecisionTree.trainClassifier(data, numberClasses, {})  # maxDepth=5, maxBins=32
        model.save(sc, path + 'model-' + file)
        return model, index
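# path_exist is not defined in this snippet; a plausible helper, sketched here
# under the assumption that the model lives on a Hadoop-compatible filesystem,
# mirrors the FileSystem check used in model_instream above.
def path_exist(p):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    return fs.exists(sc._jvm.org.apache.hadoop.fs.Path(p))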
def saveModel(self):
    # Save the model to the given path.
    self.tree_model.save(self.sc, "trained")
    # Re-load the saved model.
    self.tree_model = DecisionTreeModel.load(self.sc, "trained")
    # Re-evaluate.
    self.evaluate()
def main(sc, filename):
    '''
    The driver for the Spark scoring application; it generates predictions
    for a given file of features and target variables.
    '''
    rawDataRdd = sc.textFile(filename)
    print("Data Size: {}".format(rawDataRdd.count()))

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    # Load models.
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    # Logistic regression predictions.
    labels_and_preds = labeledPointsRdd.map(
        lambda p: (float(logit_model.predict(p.features)), p.label))
    labels_and_preds_collected = labels_and_preds.collect()
    print("\nPredictions: Logistic Regression")
    y_true = []
    y_pred = []
    for pred, actual in labels_and_preds_collected:
        y_true.append(actual)
        y_pred.append(pred)
        # print("predicted: {0} - actual: {1}".format(pred, actual))
    accuracy = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count() / float(labeledPointsRdd.count())
    print_box()
    print("Prediction Accuracy (Logistic): {}".format(round(accuracy, 4)))
    print_box()
    print("")

    # Decision tree predictions.
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    accuracy_dt = labels_and_preds_dt.filter(lambda vp: vp[0] == vp[1]).count() / float(labeledPointsRdd.count())
    print_box()
    print("Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4)))
    print_box()
    print("")

    # Random forest predictions.
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda vp: vp[0] == vp[1]).count() / float(labeledPointsRdd.count())
    print_box()
    print("Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4)))
    print_box()
def process(reviews):
    if reviews.isEmpty():
        return

    model_name = "dt"
    updated_model = "dt0"
    model_path, data_path, metadata_path = '', '', ''

    # Loop to check the availability of the newest model classifier on HDFS.
    for i in range(25, -1, -1):
        model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
        updated_model = model_name + str(i)
        data_path = model_path + "/data/part-r*"
        metadata_path = model_path + "/metadata/part-00000"
        if not patherror(data_path) and not patherror(metadata_path):
            break

    # Load the model classifier.
    model = DecisionTreeModel.load(sc, model_path)

    start = time.time()
    reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

    Words = Row('label', 'words')
    words = reviews.map(lambda r: Words(*r))
    words_df = spark.createDataFrame(words)

    # Review tokenization.
    token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                           inputCol="words", outputCol="token", toLowercase=True)
    token_filtered = token.transform(words_df)

    # Stop-word elimination.
    remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                               caseSensitive=False)
    stopwords_filtered = remover.transform(token_filtered)

    prep_filtered = stopwords_filtered.select('stopwords').rdd.map(lambda x: x[0])

    # TF-IDF calculation.
    tf = HashingTF(numFeatures=numFeatures).transform(
        prep_filtered.map(porter_stem, preservesPartitioning=True))
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    prediction = model.predict(tfidf)
    labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
    metrics = MulticlassMetrics(labeled_prediction)

    output = reviews.zip(prediction)
    filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
    output.saveAsTextFile(filename)
    end = time.time()

    print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
          metrics.precision(0.0), ';', metrics.precision(1.0), ';',
          metrics.recall(0.0), ';', metrics.recall(1.0), ';',
          metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
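# patherror is not defined in this snippet; a plausible helper, assuming the
# intent is "True when the HDFS path cannot be read", probes the path with a
# throwaway read and treats any failure as a missing path.
def patherror(path):
    try:
        sc.textFile(path).first()
        return False
    except Exception:
        return True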
def get_dt_model(sc, train=None):
    model_path = 'dt.model'
    if train is None:
        model = DecisionTreeModel.load(sc, model_path)
    else:
        model = DecisionTree.trainClassifier(train, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=10)
        model.save(sc, model_path)
    return model
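# A short usage sketch for the train-or-load helper above; `training_rdd` is a
# hypothetical RDD of LabeledPoint. The first call trains and persists the
# model, later calls reload it from disk.
model = get_dt_model(sc, train=training_rdd)  # trains and saves to 'dt.model'
model = get_dt_model(sc)                      # later: loads the saved model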
def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print(dtmodel.toDebugString())
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
def init_spark_context():
    global predictionModel
    # Load the Spark context.
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker.
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])
    # Absolute path in HDFS; to run locally, remove the first slash,
    # i.e. 'my_model1', not '/my_model1'.
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')
    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')
    return sc
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")

sc.stop()
    label = clean_line_split[10]
    nonLable = clean_line_split[0:10] + clean_line_split[11:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(testData.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
    # Cancelled becomes the 9th column now, and total columns in the data = 9.
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-W-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-00-08")

sc.stop()
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-N-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-00-08")

sc.stop()
from pyspark import SparkConf, SparkContext
import urllib.request
import urllib
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time
import createLabeledPoint
from createLabeledPoint import *

try:
    sc.stop()
except:
    pass
sc = SparkContext.getOrCreate(SparkConf())

testm = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_model")
testm_portsweep = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_portsweep_model")

test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
typename = test_raw_data.filter(lambda x: 'portsweep' in x)

cur = 0
count = typename.count()
for idx in range(count):
    typename_pd = typename.zipWithIndex().filter(lambda x: x[1] == idx).map(lambda x: x[0])
    test_csv_data = typename_pd.map(lambda x: x.split(","))
    test_data = test_csv_data.map(create_labeled_point)
    predictions = testm_portsweep.predict(test_data.map(lambda p: p.features))
    if str(predictions.take(1)[0]) == "3.0":
        print(typename_pd.collect())
        cur = cur + 1
.setAppName("Mlib") .set("spark.executor.memory", "1g")) sc = SparkContext(conf = conf) dv1 =np.array([1.0,0.0,3.0]) dv2= [1.0,0.0,3.0] sv1 = Vectors.sparse(3,[0,2],[1.0,3.0]) sv2 = sps.csc_matrix((np.array([1.0,3.0]),np.array([0,2]),np.array([0,2])),shape=(3,1)) print sv2 data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt') (trainingData, testData) = data.randomSplit([0.7, 0.3]) model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) # Save and load model model.save(sc, "model_data") sameModel = DecisionTreeModel.load(sc, "model_data")
conf = SparkConf() conf.setAppName("TA") sc = SparkContext(conf=conf) tre = StreamingContext(sc, 10) htf = HashingTF(50000) NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes' NB_model = NaiveBayesModel.load(sc, NB_directory) LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression' LR_model = LogisticRegressionModel.load(sc, LR_directory) DT_output_dir = 'hdfs://master:9000/user/hadoop/DT' DT_model = DecisionTreeModel.load(sc, DT_output_dir) voted_classifier = VoteClassifier(NB_model, LR_model, DT_model) def sentiment(test_sample): sample_data_test = test_sample.split(" ") cli = htf.transform(sample_data_test) return voted_classifier.classify(cli) lines = tre.socketTextStream(socket.gethostbyname(socket.gethostname()), 10000) lines.pprint() tweets = lines.flatMap(lambda text: [(text)]) tweets.pprint()
    # Cancelled becomes the 6th column now, and total columns in the data = 6.
    label = clean_line_split[5]
    nonLable = clean_line_split[0:5]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-N-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-95-08")

sc.stop()
import csv
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# Remove the first (header) line from the csv file.
def clean(x):
    if x[29] != "Amount":
        return x

# Turn the data into a labeled point using 30 dimensions.
def normalize(x):
    return LabeledPoint(float(x[30]), [float(x[0]), float(x[29]) / 25691.16])

sameModel = DecisionTreeModel.load(sc, "./decisiontreefraud")

# Make a Spark conf.
conf = (SparkConf().setMaster("local").setAppName("My app")
        .set("spark.executor.memory", "4g"))

# Files have to be added while running to see the data in the stream.
ssc = StreamingContext(sc, 1)
lines1 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/b")
trainingData = lines1.map(lambda line: LabeledPoint(
    float(line.split(" ")[1]),
    [(line.split(" ")[0]), (line.split(" ")[2])])).cache()
trainingData.pprint()
lines2 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/c")
def getModel(self, path):
    if self.type == 'NaiveBayes':
        return NaiveBayesModel.load(self.sc, path)
    elif self.type == 'DecisionTree':
        return DecisionTreeModel.load(self.sc, path)
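# A hedged refinement sketch: the dispatcher above silently returns None for
# unknown types; raising instead (as model_instream does earlier in this
# section) fails fast and is easier to extend.
def getModel(self, path):
    loaders = {'NaiveBayes': NaiveBayesModel, 'DecisionTree': DecisionTreeModel}
    if self.type not in loaders:
        raise ValueError("Unsupported model type: %s" % self.type)
    return loaders[self.type].load(self.sc, path)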
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        indices = [int(i) for i in index.collect()]
        return DecisionTreeModel.load(sc, path + 'model-' + file), indices
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # in case the Feature Selection step is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)  # RDD-based path, superseded by the DataFrame pipeline below

        # Load CSV data.
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create a vector assembler to produce a feature vector for each record for use in MLlib.
        # The first 45 csv fields are features; the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] + schema.names[3:-1],
                                    outputCol="features")

        # Assemble the feature vector in a new dataframe.
        assembledData = assembler.transform(data2)

        # Create label and feature indexers to speed up categorical columns for the decision tree.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(labelIndexed)

        # Create a DecisionTree model trainer.
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline:
        # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        # model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)

        # model = DecisionTree.trainClassifier(data, numberClasses, {})  # maxDepth=5, maxBins=32
        # model.save(sc, path + 'model-' + file)
        return model, index
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Decision Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# Split the data into training and test sets.
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a decision tree model.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model: ' + model.toDebugString())

# Save and load the model.
model.save(sc, '../model/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeClassificationModel')
sc.stop()
import json
import requests
from flask import Flask, render_template
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from numpy import array

app = Flask(__name__)

conf = SparkConf()
conf.setAppName("Classification")
try:
    sc.stop()
except:
    pass
sc = SparkContext(conf=conf, pyFiles=[
    '/home/ubuntu/project_src/flaskapp/createLabeledPoint.py',
    '/home/ubuntu/project_src/flaskapp/ClassSet.py',
    '/home/ubuntu/project_src/flaskapp/FuncSet.py',
    '/home/ubuntu/project_src/flaskapp/hello.py'])
# testm = DecisionTreeModel.load(sc, "hdfs://*****:*****")

@app.route('/')
def hello_world():
    return 'Hello from python!'

@app.route('/index')
def index():
    return render_template("index.html")

@app.route('/train')
def trainodule():
    pass

@app.route('/getSpkTstCnt')
def runclass():
    # testm = DecisionTreeModel.load(sc, "hdfs://ip-172-31-1-239:9000/home/ubuntu/project_src/tree_model")
    test_data_file = "hdfs://ip-172-31-1-239:9000/user/ubuntu/corrected.gz"
    test_raw_data = sc.textFile(test_data_file)
data = MLUtils.loadLibSVMFile(sc, dataPath)

# Split the dataset into training and test sets.
(trainingData, testData) = data.randomSplit([0.7, 0.3])
print("train data count: " + str(trainingData.count()))
print("test data count : " + str(testData.count()))

# Train a decision tree classifier.
# An empty categoricalFeaturesInfo indicates that all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Predict on the test dataset.
predictions = model.predict(testData.map(lambda x: x.features))
# Zip the true labels with the predictions.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Compute the fraction of misclassified samples.
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Decision Tree Test Error = %5.3f%%' % (testErr * 100))
print("Decision Tree Learned classification tree model : ")
print(model.toDebugString())

# Save and load the trained model.
modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myDecisionTreeClassificationModel"
model.save(sc, modelPath)
sameModel = DecisionTreeModel.load(sc, modelPath)
sc = SparkContext(appName="PythonDecisionTreeRegressionExample") # $example on$ # Load and parse the data file into an RDD of LabeledPoint. data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a DecisionTree model. # Empty categoricalFeaturesInfo indicates all features are continuous. model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\ float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression tree model:') print(model.toDebugString()) # Save and load model model.save(sc, "target/tmp/myDecisionTreeRegressionModel") sameModel = DecisionTreeModel.load( sc, "target/tmp/myDecisionTreeRegressionModel") # $example off$
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# NOTE: a stray module-level sc.stop() was removed here; `sc` is not defined
# until the main guard below, so it raised a NameError at import time.
if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")
    sc.setLogLevel("ERROR")
    model1 = DecisionTreeModel.load(sc, "runs")
    model2 = DecisionTreeModel.load(sc, "wickets")
    batsmen_cluster = {}
    bowler_cluster = {}
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_batsmen.csv') as f:
        for line in f:
            ar = line.split(',')
            a = []
            a.append(int(ar[0]))
            a.append(float(ar[3]))
            a.append(float(ar[4]))
            batsmen_cluster[ar[2]] = a
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_bowler.csv') as f:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
conf = SparkConf().setAppName('Decision Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load data.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# Split the data into training and test sets.
(training, testData) = data.randomSplit([0.7, 0.3])

# Train a decision tree regression model.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda x: x.label).zip(predictions)
testMSE = labelAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(
    testData.count())
print('Test Mean Squared Error: ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, '../model/myDecisionTreeRegressionModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeRegressionModel')
sc.stop()
                .replace('.0', '')\
                .replace('feature ', '')\
                .replace('Predict: ', '#')\
                .replace('not in', '5')\
                .replace(' ', '|')\
                .replace('in', '4')\
                .replace('>=', '0')\
                .replace('<=', '1')\
                .replace('>', '2')\
                .replace('<', '3')\
                .replace('Root:', '')
            print(paths, file=f2)
        else:
            walk(dic['children'], path + dic['name'] + ':')
    f2.close()

if __name__ == "__main__":
    dtModelFile = "output/DTModel"
    dtModelResults = "decisionTreeModel.txt"
    sc = SparkContext("local[20]", "DecisionTreeClassification")
    dtModel = DecisionTreeModel.load(sc, dtModelFile)
    dtree = dtModel.toDebugString()
    print(dtree)
    # tree_C(dtree, dtModelResults)
    tree_json(dtree, dtModelResults)
    # tree_rule(dtree, dtModelResults)
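# For reference, a minimal sketch of the toDebugString() output these rewrites
# operate on (shape only; the actual tree, features, and thresholds depend on
# the trained model):
#   DecisionTreeModel classifier of depth 1 with 3 nodes
#     If (feature 0 <= 0.5)
#      Predict: 0.0
#     Else (feature 0 > 0.5)
#      Predict: 1.0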
# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
sameModel = DecisionTreeModel.load(
    sc, "target/tmp/myDecisionTreeClassificationModel")
# $example off$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(
    sc, 'file:///usr/local/spark/data/mllib/sample_libsvm_data.txt'
)  # The code on the web is wrong; this is the correct path.
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(
    sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
    # Cancelled becomes the 9th column now, and total columns in the data = 9.
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-W-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-95-08")

sc.stop()
print "######################################################\n" print "######################################################\n" print "######### Start!!! #######\n" print "######################################################\n" print "######################################################\n" print "\n\n\n" #stop_rdd = rdd_tweets.coalesce(1) #stop_rdd.saveAsTextFile(output_path) print "****************************************************\n" print "Here is the last step\n" print "****************************************************\n" #Here is the trainning steps. binladen_model = DecisionTreeModel.load(sc, binladen_model_path) # #training_data = MLUtils.loadLibSVMFile(sc, training_path) test_data = rdd_labelFeatures # Evaluate model on test instances and compute test error predictions = binladen_model.predict(test_data.map(lambda x: x.features)) # test the error value labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v!=p).count() / float(test_data.count()) tmp_str = 'Test Error = ' + str(testErr) print(tmp_str) log_write(tmp_str) print "\n\n" #featuresAndPredictions = test_data.flatMap(lambda words: resplit_only_feature(words))\ # .zip(predictions)
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")

sc.stop()
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("TaxiWeb")
sc = SparkContext(conf=conf)
model = DecisionTreeModel.load(sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = ', (datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")

sc.stop()
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import DecisionTreeModel

sc = SparkContext()
spark = SparkSession(sc)

inputDF = spark.read.csv('s3://himaniproject2/ValidationDataset.csv',
                         header='true', inferSchema='true', sep=';')
transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = DecisionTreeModel.load(sc, "s3://himaniproject2/model")

predictions = model.predict(transformed_df.map(lambda x: x.features))
labels_and_predictions = transformed_df.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    transformed_df.count())
print(".........................................................")
print("Model accuracy....................: %.3f%%" % (acc * 100))

# MulticlassMetrics expects (prediction, label) pairs, so swap the zipped order.
metrics = MulticlassMetrics(labels_and_predictions.map(lambda x: (x[1], x[0])))
fscore = metrics.fMeasure()
print(".........................................................")
print("F1 Score.................................. = %s" % fscore)
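# A short follow-on sketch: per-label and weighted variants are available from
# the same MulticlassMetrics object (pyspark.mllib.evaluation), which can be
# more informative than a single aggregate F1 on imbalanced data.
print("Weighted F1......: %s" % metrics.weightedFMeasure())
print("Precision(1.0)...: %s" % metrics.precision(1.0))
print("Recall(1.0)......: %s" % metrics.recall(1.0))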
        .setMaster(master)
        .setAppName(app_name))
sc = SparkContext(conf=conf)

lines = sc.textFile(input)
parsedData = lines.map(parseLine)
(trainingData, testData) = parsedData.randomSplit([0.5, 0.5])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
predictions.foreach(my_print)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions.foreach(my_print)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, output)
sameModel = DecisionTreeModel.load(sc, output)

sc.stop()