Code Example #1
def loadModel():
	clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
	classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)

	if pv.outputDebugMsg:
		Utils.logMessage("\nLoad cluster & classification model finished")
	return clusterModel, classificationModel
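
A minimal calling sketch for the helper above; sc, pv (the config object) and Utils are assumed to be defined by the surrounding script, and the feature vector is a placeholder.

# Sketch only: sc, pv and Utils are assumptions from the surrounding script.
from pyspark.mllib.linalg import Vectors

clusterModel, classificationModel = loadModel()
sample = Vectors.dense([0.0, 1.0, 2.0])            # placeholder feature vector (an assumption)
print(clusterModel.predict(sample))                # cluster assignment from the KMeans model
print(classificationModel.predict(sample))         # class label from the decision tree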
Code Example #2
def evaluate_model(type):
    if type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
Code Example #3
def predict_proba(rf_model, testRDD):

        trees = rf_model._java_model.trees()
        ntrees = rf_model.numTrees()
        scores_dict = {i: 0 for i in range(0,10)}
        scoresRDD = testRDD.map(lambda x: scores_dict.copy())

        for tree in trees:
                dtm = DecisionTreeModel(tree)
                currentScoreRDD = dtm.predict(testRDD)
                scoresRDD = scoresRDD.zip(currentScoreRDD)

                def reduceTuple(x):
                        x[0][int(x[1])] += 1
                        return x[0]

                scoresRDD = scoresRDD.map(reduceTuple)
        return scoresRDD
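
A hedged usage sketch for the function above; rf_model and testRDD are assumed to exist, and the per-class vote counts it returns are divided by the tree count to obtain class probabilities.

# Sketch only: rf_model (RandomForestModel) and testRDD (RDD of feature vectors) are assumptions.
votes = predict_proba(rf_model, testRDD)           # RDD of {class index: vote count} dicts
ntrees = rf_model.numTrees()
probabilities = votes.map(
    lambda counts: {cls: cnt / float(ntrees) for cls, cnt in counts.items()})
print(probabilities.take(1))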
Code Example #4
    def saveModel(self):
        # save the model to the given path
        self.tree_model.save(self.sc, "trained")

        # re-load the saved model
        self.tree_model = DecisionTreeModel.load(self.sc, "trained")

        # re-evaluate
        self.evaluate()
Code Example #5
def main(sc, filename):
    '''
    The driver for the spark scoring application, it generates predictions for
    a given file of features and target variables
    '''

    rawDataRdd = sc.textFile(filename)
    print "Data Size: {}".format(rawDataRdd.count())

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    #load models
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    #logistic predictions
    labels_and_preds = labeledPointsRdd.map(lambda p: (float(logit_model.predict(p.features)), p.label  ))
    labels_and_preds_collected = labels_and_preds.collect()
    print "\n"
    print "Predictions: Logistic Regression"
    y_true = []
    y_pred = []
    for row in labels_and_preds_collected:
        y_true.append(row[1])
        y_pred.append(row[0])
        # print "predicted: {0} - actual: {1}\n".format(row[0], row[1])


    accuracy = labels_and_preds.filter(lambda (v,p): v == p).count() / float(labeledPointsRdd.count())

    print_box()
    print "Prediction Accuracy (Logistic): {}".format(round(accuracy, 4))
    print_box()
    print "\n"

    #decision tree predictions
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    labels_and_preds_dt_collected = labels_and_preds_dt.collect()


    accuracy_dt = labels_and_preds_dt.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())

    print_box()
    print "Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4))
    print_box()
    print "\n"

    #random forest predictions
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4))
    print_box()
Code Example #6
	def process(reviews):
		if(reviews.isEmpty()):
			pass
		else:
			model_name = "dt"
			updated_model = "dt0"
			model_path, data_path, metadata_path = '','',''
			
			#performing looping process to check the availability of new model classifier
			for i in range(25,-1,-1):
				model_path = "hdfs://VM10-1-0-14:9000/classifier/"+model_name+str(i)
				updated_model = model_name+str(i)
				data_path = model_path+"/data/part-r*"
				metadata_path = model_path+"/metadata/part-00000"
				if(patherror(data_path) == False and patherror(metadata_path) == False):
					break
			
			#load model classifier
			model = DecisionTreeModel.load(sc, model_path)

			start = time.time()
			reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
			
			Words = Row('label', 'words')
			words = reviews.map(lambda r: Words(*r))
			words_df = spark.createDataFrame(words)
			
			#review tokenization
			token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words", outputCol="token", toLowercase=True)
			token_filtered = token.transform(words_df)
			
			#stopwords elimination
			remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
			stopwords_filtered = remover.transform(token_filtered)

			prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
			
			#tf-idf calculation
			tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True))
			idf = IDF().fit(tf)
			tfidf = idf.transform(tf)
			
			prediction = model.predict(tfidf)
			
			labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
			
			metrics = MulticlassMetrics(labeled_prediction)	
			
			output = reviews.zip(prediction)
				
			filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]','',str(datetime.now())) + ".out"
			output.saveAsTextFile(filename)
			
			end = time.time()	
			print(updated_model,';',reviews.count(),';',metrics.accuracy,';',metrics.precision(0.0),';',metrics.precision(1.0),';',metrics.recall(0.0),';',metrics.recall(1.0),';',metrics.fMeasure(0.0),';',metrics.fMeasure(1.0),';',(end-start))
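
A minimal sketch of wiring process() into a streaming job; sc, the socket source, the batch interval, and the (rating, review text) tuple format are assumptions, not part of the original project.

# Sketch only: process() expects an RDD of (rating, text) pairs; sc and the source are assumptions.
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 10)
reviews_stream = ssc.socketTextStream("localhost", 9999) \
    .map(lambda line: line.split('\t')) \
    .map(lambda fields: (float(fields[0]), fields[1]))
reviews_stream.foreachRDD(process)
ssc.start()
ssc.awaitTermination()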
Code Example #7
File: stargalaxy.py Project: bbw7561135/POPREU
def get_probs_classify(model, data):
    # Collect the individual decision trees as JavaArray objects
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data)

    # For each tree, apply its prediction to the entire dataset and zip together the results
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees
    return scores.map(lambda x: x / ntrees)
Code Example #8
File: hw4.py Project: toosyou/big_data_analysis_hw
def get_dt_model(sc, train=None):
    model_path = 'dt.model'
    if train is None:
        model = DecisionTreeModel.load(sc, model_path)
    else:
        model = DecisionTree.trainClassifier(train,
                                             numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini',
                                             maxDepth=10)
        model.save(sc, model_path)

    return model
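
A short usage sketch of the load-or-train helper above; trainingData and testData are assumed LabeledPoint RDDs prepared elsewhere.

# Sketch only: trainingData/testData are assumptions.
model = get_dt_model(sc, train=trainingData)       # first run: train and persist to 'dt.model'
model = get_dt_model(sc)                           # later runs: reload the saved model
predictions = model.predict(testData.map(lambda p: p.features))
print(predictions.take(5))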
Code Example #9
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel. 
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    featsAndPredictions = sc.parallelize([]) #empty RDD
    for i in range(ntrees):
        dtm = DecisionTreeModel(trees[i])
        predictions = dtm.predict(data.map(lambda x: x.features))
        featsAndPredictions=featsAndPredictions.union(data.map(lambda lp: lp.features).zip(predictions))

        #scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        #scores = scores.map(lambda x: x[0] + x[1])
        
    #add up the predictions and divide the accumulated scores over the number of trees
    return featsAndPredictions.reduceByKey(lambda a,b: a+b).map(lambda (key,val): (key,val/ntrees)) #add up the predictions
Code Example #10
File: main.py Project: LoadedCoders/iHear
def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"
             ]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print dtmodel.toDebugString()
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
Code Example #11
File: new-catraca.py Project: gta-ufrj/catraca
def getModel(path,file):
	
	if path_exist(path+'index-'+file):
		index=sc.sparkContext.textFile(path+'index-'+file)
		a=index.collect()
		b=lambda x : [ int(i) for i in x ]
		
		return DecisionTreeModel.load(sc, path+'model-'+file), b(a)

	else:
		vector,classes = dataPreparing(sc.sparkContext.textFile(path+file))
		index=CorrelationFeature(vector) # in case Feature Selection features are needed
		reduced=MatrixReducer(vector,index) 
		data=pass2libsvm(reduced,classes) 
		model = DecisionTree.trainClassifier(data, numberClasses, {})	 #, maxDepth=5, maxBins=32)
		model.save(sc, path+'model-'+file)			

		return	model, index
Code Example #12
File: stargalaxy.py Project: beatriceliang/POPREU
def get_probs_classify (model, data):
    # Collect the individual decision trees as JavaArray objects
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data)

    # For each tree, apply its prediction to the entire dataset and zip together the results
    for i in range(1,ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data))
        scores = scores.map(lambda x: x[0] + x[1])
    
    # Divide the accumulated scores over the number of trees
    return scores.map(lambda x: x/ntrees)
Code Example #13
def predict_proba(model, data):
    '''
    Input: A PySpark RandomForestModel object, RDD of LabeledPoints
    Output: List of probabilities
    This wrapper exposes the probabilities (i.e. confidences) for a given prediction.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(
        data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in xrange(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees
    probabilities = scores.map(lambda x: float(x) / ntrees).collect()
    return probabilities
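
A minimal calling sketch for the wrapper above, assuming model is a trained RandomForestModel and data is the same RDD of LabeledPoints used when calling it.

# Sketch only: model and data are assumptions carried over from the wrapper above.
probs = predict_proba(model, data)                 # one averaged tree score per record, in input order
labels = data.map(lambda lp: lp.label).collect()
for label, prob in zip(labels, probs):
    print("%s %s" % (label, prob))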
Code Example #14
File: server.py Project: IcedNecro/AWO-61-backend
def init_spark_context():

    global predictionModel

    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")

    # IMPORTANT: pass aditional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])

    # absolute path in hdfs
    # to run locally, remove first slash '/' i.e my_model1, not /my_model1

    predictionModel = DecisionTreeModel.load(sc, '/my_model1')
    sc.addFile( 'conv/6.p')
    sc.addFile( 'conv/7.p')
    sc.addFile( 'conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile( 'conv/36.p')

    return sc
Code Example #15
def getModel(path, file):

    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]

        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)

    else:

        vector, classes = dataPreparing(sc.textFile(path + file))

        index = CorrelationFeature(
            vector)  # in case Feature Selection features are needed

        reduced = MatrixReducer(vector, index)

        #data=pass2libsvm(vector)

        data = pass2libsvm(reduced, classes)

        #data=pass2libsvm(vector,classes)

        # for the 5-tuple it should be something like data=pass2libsvm(vector)

        #(trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Train a DecisionTree model.
        #  Empty categoricalFeaturesInfo indicates all features are continuous.

        model = DecisionTree.trainClassifier(data, numberClasses,
                                             {})  #, maxDepth=5, maxBins=32)

        model.save(sc, path + 'model-' + file)

        return model, index
Code Example #16
File: Predict.py Project: abhishek-ch/evolveML
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel.
    '''  # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(
        lambda row: [float(row.SearchID), float(row.AdID), float(row.Position), float(row.ObjectType),
                     float(row.HistCTR)]))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(lambda row : [float(row.SearchID),float(row.AdID),float(row.Position),float(row.ObjectType),float(row.HistCTR)])))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees
    return scores.map(lambda x: x / ntrees)
Code Example #17
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
Code Example #18
File: mllib-test.py Project: Riuchando/Spark
         .setAppName("Mlib")
         .set("spark.executor.memory", "1g"))
sc = SparkContext(conf = conf)



dv1 = np.array([1.0, 0.0, 3.0])
dv2 = [1.0, 0.0, 3.0]
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1))

print sv2

data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)


# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "model_data")
sameModel = DecisionTreeModel.load(sc, "model_data")

Code Example #19
conf = SparkConf()
conf.setAppName("TA")
sc = SparkContext(conf=conf)
tre = StreamingContext(sc, 10)
htf = HashingTF(50000)

NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes'
NB_model = NaiveBayesModel.load(sc, NB_directory)

LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression'
LR_model = LogisticRegressionModel.load(sc, LR_directory)

DT_output_dir = 'hdfs://master:9000/user/hadoop/DT'
DT_model = DecisionTreeModel.load(sc, DT_output_dir)

voted_classifier = VoteClassifier(NB_model, LR_model, DT_model)


def sentiment(test_sample):
    sample_data_test = test_sample.split(" ")
    cli = htf.transform(sample_data_test)
    return voted_classifier.classify(cli)


lines = tre.socketTextStream(socket.gethostbyname(socket.gethostname()), 10000)
lines.pprint()
tweets = lines.flatMap(lambda text: [(text)])
tweets.pprint()
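
A hedged completion sketch: the snippet above builds the stream but stops before scoring it, so this applies sentiment() and starts the StreamingContext.

# Sketch only: continues the snippet above; tweets and tre are defined there.
sentiments = tweets.map(lambda text: (text, sentiment(text)))
sentiments.pprint()

tre.start()
tre.awaitTermination()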
Code Example #20
	#Cancelled becomes the 9th column now, and total columns in the data = 9
	label = clean_line_split[8]
	nonLable = clean_line_split[0:8]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache ()

#start timer at this point
startTime = datetime.now()
#build the model
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testErr = labelsAndPredictions.filter (lambda (v, p): v != p).count() / float(test.count())
print ('Time consumed = '), (datetime.now() - startTime)

print ('Test Error = ' + str (testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

#save and load model
model.save(sc, "DT-Class-W-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-00-08")
sc.stop ()
Code Example #21
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array
sc.stop()
if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")
    sc.setLogLevel("ERROR")
    model1 = DecisionTreeModel.load(sc, "runs")
    model2 = DecisionTreeModel.load(sc, "wickets")
    batsmen_cluster = {}
    bowler_cluster = {}
    with open(
            '/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_batsmen.csv'
    ) as f:
        for line in f:
            ar = line.split(',')
            a = []
            a.append(int(ar[0]))
            a.append(float(ar[3]))
            a.append(float(ar[4]))
            batsmen_cluster[ar[2]] = a

    with open(
            '/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_bowler.csv'
    ) as f:
Code Example #22
import json
import requests
from flask import Flask, render_template
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from numpy import array
app = Flask(__name__)
conf = SparkConf()
conf.setAppName("Classification")
try:
	sc.stop()
except:
	pass
sc = SparkContext(pyFiles=['/home/ubuntu/project_src/flaskapp/createLabeledPoint.py','/home/ubuntu/project_src/flaskapp/ClassSet.py','/home/ubuntu/project_src/flaskapp/FuncSet.py','/home/ubuntu/project_src/flaskapp/hello.py']).getOrCreate(conf=conf)
#testm = DecisionTreeModel.load(sc, "hdfs://*****:*****")
@app.route('/')
def hello_world():
	return 'From python hello!'
@app.route('/index')
def index():
	return render_template("index.html")
@app.route('/train')
def trainodule():
	pass
@app.route('/getSpkTstCnt')
def runclass():
#	testm = DecisionTreeModel.load(sc, "hdfs://ip-172-31-1-239:9000/home/ubuntu/project_src/tree_model")
	test_data_file = "hdfs://ip-172-31-1-239:9000/user/ubuntu/corrected.gz"
	test_raw_data = sc.textFile(test_data_file)
Code Example #23
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData,
                                        categoricalFeaturesInfo={},
                                        impurity='variance',
                                        maxDepth=5,
                                        maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(
        sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
Code Example #24
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
Code Example #25
    def getModel(self, path):
        if self.type == 'NaiveBayes':
            return NaiveBayesModel.load(self.sc, path)
        elif self.type == 'DecisionTree':
            return DecisionTreeModel.load(self.sc, path)
Code Example #26
	
	#Cancelled becomes the 6th column now, and total columns in the data = 6
	label = clean_line_split[5]
	nonLable = clean_line_split[0:5]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache ()

#start timer at this point
startTime = datetime.now()
#build the model
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testErr = labelsAndPredictions.filter (lambda (v, p): v != p).count() / float(test.count())
print ('Time consumed = '), (datetime.now() - startTime)

print ('Test Error = ' + str (testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

#save and load model
model.save(sc, "DT-Class-N-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-95-08")
sc.stop ()
Code Example #27
import csv
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel


# Remove the first line from the csv file
def clean(x):
    if (x[29] != "Amount"):
        return x


#Turn the data into a labeled point using 30 dimensions
def normalize(x):
    return LabeledPoint(float(x[30]), [float(x[0]), float(x[29]) / 25691.16])


sameModel = DecisionTreeModel.load(sc, "./decisiontreefraud")

#make a spark configuration
conf = (SparkConf().setMaster("local").setAppName("My app").set(
    "spark.executor.memory", "4g"))

#files have to be added while running to see the data in the stream
ssc = StreamingContext(sc, 1)
lines1 = ssc.textFileStream(
    "file:///mnt/vdatanodea/datasets/creditcards/credit/b")
trainingData = lines1.map(lambda line: LabeledPoint(float(line.split(" ")[
    1]), [(line.split(" ")[0]), (line.split(" ")[2])])).cache()
trainingData.pprint()

lines2 = ssc.textFileStream(
    "file:///mnt/vdatanodea/datasets/creditcards/credit/c")
Code Example #28
def getModel(path, file):

    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]

        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)

    else:

        vector, classes = dataPreparing(sc.textFile(path + file))

        index = CorrelationFeature(
            vector)  # in case Feature Selection features are needed

        reduced = MatrixReducer(vector, index)

        data = pass2libsvm(reduced, classes)

        # Train a DecisionTree model.
        #  Empty categoricalFeaturesInfo indicates all features are continuous.

        # Load CSV data
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create vector assembler to produce a feature vector for each record for use in MLlib
        # First 45 csv fields are features, the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] +
                                    schema.names[3:-1],
                                    outputCol="features")

        # Assemble feature vector in new dataframe
        assembledData = assembler.transform(data2)

        # Create a label and feature indexers to speed up categorical columns for decision tree
        labelIndexer = StringIndexer(inputCol="label",
                                     outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(
            labelIndexed)

        # Create a DecisionTree model trainer
        dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                    featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline
        #		pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

        # Train model
        #		model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)

        #model = DecisionTree.trainClassifier(data, numberClasses,{})	 #, maxDepth=5, maxBins=32)

        #model.save(sc, path+'model-'+file)

        return model, index
Code Example #29
    def selectNext(self):
        # predict for the rest the datapoints
        self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
            .leftOuterJoin(self.dataset.trainSet) \
            .map(lambda _: (_[0], _[1][1]))

        actualIndices = self.trainDataUnknown.map(lambda _ : _[0])\
            .zipWithIndex()\
            .map(lambda _: (_[1], _[0]))

        myDebugger.TIMESTAMP('zipping indices ')

        rdd = sc.parallelize([])
        ''' these Java objects are not serializable,
         so there is still no support for making an RDD out of them directly
        '''
        for x in self.model._java_model.trees():
            '''
             zipping each prediction from each decision tree
             with individual sample index so that they can be
             added later
            '''
            predX = DecisionTreeModel(x)\
                .predict(self.trainDataUnknown.map(lambda _ : _[1].features))\
                .zipWithIndex()\
                .map(lambda _: (_[1], _[0]))

            predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
            rdd = rdd.union(predX)

        myDebugger.TIMESTAMP('get individual tree predictions')
        ''' add up the number of 1-votes in each sample's predictions; this gives the vote count for class 1 '''
        classPrediction = rdd.groupByKey().mapValues(sum)

        myDebugger.TIMESTAMP('reducing ')

        #  direct self.nEstimators gives error
        totalEstimators = self.nEstimators
        #  predicted probability of class 0
        classPrediction = classPrediction.map(
            lambda _: (_[0], abs(0.5 - (1 - (_[1] / totalEstimators)))))

        myDebugger.TIMESTAMP('mapping')

        # Selecting the index which has the highest uncertainty/ closest to probability 0.5
        selectedIndex1toN = classPrediction.sortBy(lambda _: _[1]).first()[0]

        myDebugger.TIMESTAMP('sorting')

        # takes the selectedIndex from the unknown samples and add it to the known ones
        self.indicesKnown = self.indicesKnown.union(
            sc.parallelize([selectedIndex1toN]))

        myDebugger.TIMESTAMP('update known indices')

        # removing first sample from unlabeled ones(update)
        self.indicesUnknown = self.indicesUnknown.filter(
            lambda _: _ != selectedIndex1toN)

        myDebugger.TIMESTAMP('update unknown indices')

        myDebugger.DEBUG(selectedIndex1toN)
        myDebugger.DEBUG(self.indicesKnown.collect())
        myDebugger.DEBUG(self.indicesUnknown.collect())

        myDebugger.TIMESTAMP('DEBUGGING DONE')
Code Example #30
parsedData = raw_data.map(parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map (lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print('Time consumed = '), (datetime.now() - startTime)

print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

#save and load model
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")
sc.stop()
Code Example #31
File: tests.py Project: HodaAlemi/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Code Example #32
    data = MLUtils.loadLibSVMFile(sc, dataPath)
    # Split the dataset into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    print("train data count: " + str(trainingData.count()))
    print("test data count : " + str(testData.count()))

    # Train a decision tree classifier
    # An empty categoricalFeaturesInfo means all features are continuous
    model = DecisionTree.trainClassifier(trainingData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         impurity='gini',
                                         maxDepth=5,
                                         maxBins=32)

    # Predict on the test dataset
    predictions = model.predict(testData.map(lambda x: x.features))
    # Zip the true labels with the predictions
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Compute the fraction of misclassified samples
    testErr = labelsAndPredictions.filter(
        lambda (v, p): v != p).count() / float(testData.count())
    print('Decision Tree Test Error = %5.3f%%' % (testErr * 100))
    print("Decision Tree Learned classifiction tree model : ")
    print(model.toDebugString())

    # Save and reload the trained model
    modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myDecisionTreeClassificationModel"
    model.save(sc, modelPath)
    sameModel = DecisionTreeModel.load(sc, modelPath)
Code Example #33
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         impurity='gini',
                                         maxDepth=5,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(
        sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
Code Example #34
from pyspark import SparkConf, SparkContext
import urllib.request
import urllib
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time
import createLabeledPoint
from createLabeledPoint import *
try:
	sc.stop()
except:
	pass
sc = SparkContext().getOrCreate(SparkConf())
testm = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_model")
testm_portsweep = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_portsweep_model")
test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)

typename = test_raw_data.filter(lambda x: 'portsweep' in x)
cur=0
idx=0
count = typename.count()
for idx in range(count):
	typename_pd = typename.zipWithIndex().filter(lambda x: x[1]==idx).map(lambda x: x[0])
	test_csv_data = typename_pd.map(lambda x: x.split(","))
	test_data = test_csv_data.map(create_labeled_point)
	predictions = testm_portsweep.predict(test_data.map(lambda p: p.features))
	if str(predictions.take(1)[0]) == "3.0":
		print(typename_pd.collect())
		cur=cur+1
Code Example #35
parsedData = raw_data.map(parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

#start timer at this point
startTime = datetime.now()
#build the model
model = DecisionTree.trainClassifier(training,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=5,
                                     maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(
    test.count())
print('Time consumed = '), (datetime.now() - startTime)

print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

#save and load model
model.save(sc, "DT-Class-N-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-00-08")
sc.stop()
Code Example #36
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Decision Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse the data file into an RDD of LabelPoint
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

#split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

#train a decision tree model
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instance and compute test error
predictions = model.predict(testData.map(lambda x : x.features))
labelAndPredictions = testData.map(lambda lp : lp.label).zip(predictions)
testErr = labelAndPredictions.filter(lambda (v, p) : v != p).count()/float(testData.count())

print('test err' + str(testErr))
print('learned classification tree model :' + str(model.toDebugString()))

# save and load model
model.save(sc, '../model/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeClassificationModel')

sc.stop()
Code Example #37
	label = clean_line_split[10]
	nonLable = clean_line_split[0:10] + clean_line_split[11:]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor (trainingData, categoricalFeaturesInfo={},
                                         impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (testData.map (lambda x: x.features))
labelsAndPredictions = testData.map (lambda lp: lp.label).zip (predictions)
testMSE = labelsAndPredictions.map (lambda (v, p): (v - p) * (v - p)).sum() /\
    float(testData.count())

print ('Time consumed = '), (datetime.now() - startTime)

print ('Test Mean Squared Error = ' + str (testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

#save and load model
model.save (sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load (sc, "DTR-Wide-2008")
Code Example #38
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
Code Example #39
    def selectNext(self):
        # get predictions from individual trees
        self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
            .leftOuterJoin(self.dataset.trainSet) \
            .map(lambda _: (_[0], _[1][1]))

        # zipping actual indices with dummy indices so that they can be traced later
        actualIndices = self.trainDataUnknown.map(lambda _: _[0]) \
            .zipWithIndex() \
            .map(lambda _: (_[1], _[0]))

        # an empty RDD
        rdd = sc.parallelize([])
        ''' these Java objects are not serializable,
         so there is still no support for making an RDD out of them directly
        '''
        for x in self.model._java_model.trees():
            # zipping each prediction from each decision tree with individual sample index so that they can be added later
            predX = DecisionTreeModel(x) \
                .predict(self.trainDataUnknown.map(lambda _: _[1].features)) \
                .zipWithIndex() \
                .map(lambda _: (_[1], _[0]))
            predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
            rdd = rdd.union(predX)
        ''' add up the number of 1-votes in each sample's predictions; this gives the vote count for class 1 '''
        sumScore = rdd.groupByKey().mapValues(sum)
        totalEstimators = self.nEstimators

        # average of the predicted scores
        f_1 = sumScore.map(lambda _: (_[0], _[1] / totalEstimators))

        # standard deviation of predicted scores
        f_2 = sumScore.map(lambda _: getSD(_, totalEstimators))

        # - proportion of positive points
        nLabeled = self.trainDataKnown.count()
        nUnlabeled = self.trainDataUnknown.count()
        proportionPositivePoints = (self.trainDataKnown.map(
            lambda _: _[1].label).reduce(lambda x, y: x + y)) / nLabeled
        f_3 = f_1.map(lambda _: proportionPositivePoints)

        # - estimate variance of forest by looking at average of variance of some predictions
        estimateVariance = (
            f_2.map(lambda _: _[1]).reduce(lambda x, y: x + y)) / nUnlabeled
        f_6 = f_3.map(lambda _: estimateVariance)

        # - number of already labelled datapoints
        f_8 = f_3.map(lambda _: nLabeled)

        myDebugger.TIMESTAMP('features ready for transposing')

        # transposing start
        tempf_1 = f_1.map(lambda _: _[1]).zipWithIndex().map(lambda _:
                                                             (_[1], _[0]))
        tempf_2 = f_2.map(lambda _: _[1]).zipWithIndex().map(lambda _:
                                                             (_[1], _[0]))
        tempf_3 = f_3.zipWithIndex().map(lambda _: (_[1], _[0]))
        tempf_6 = f_6.zipWithIndex().map(lambda _: (_[1], _[0]))
        tempf_8 = f_8.zipWithIndex().map(lambda _: (_[1], _[0]))
        LALDataset = tempf_1\
            .leftOuterJoin(tempf_2)\
            .leftOuterJoin(tempf_3)\
            .leftOuterJoin(tempf_6)\
            .leftOuterJoin(tempf_8)\
            .map(lambda _  : LabeledPoint(_[0] ,
                              [_[1][0][0][0][0],  _[1][0][0][0][1],  _[1][0][0][1], _[1][0][1], _[1][1]]))

        myDebugger.TIMESTAMP('transposing done')

        # # predict the expected reduction in the error by adding the point
        LALprediction = self.lalModel.predict(LALDataset.map(lambda _ : _.features))\
            .zipWithIndex()\
            .map(lambda _ : (_[1],_[0]))
        myDebugger.TIMESTAMP('prediction done')

        # Selecting the index which has the highest uncertainty/ closest to probability 0.5
        selectedIndex1toN = LALprediction.sortBy(lambda _: _[1]).max()[0]

        # takes the selectedIndex from the unknown samples and add it to the known ones
        self.indicesKnown = self.indicesKnown.union(
            sc.parallelize([selectedIndex1toN]))

        # updating unknown indices
        self.indicesUnknown = self.indicesUnknown.filter(
            lambda _: _ != selectedIndex1toN)
        ''' debugging block '''
        myDebugger.TIMESTAMP('update unknown indices')
        myDebugger.DEBUG(selectedIndex1toN)
        myDebugger.DEBUG(self.indicesKnown.collect())
        myDebugger.DEBUG(self.indicesUnknown.collect())
        myDebugger.TIMESTAMP('DEBUGGING DONE')
Code Example #40
File: loadModel.py Project: hims209/cloudcomputing
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.mllib.tree import DecisionTreeModel
sc = SparkContext()
spark = SparkSession(sc)

inputDF = spark.read.csv('s3://himaniproject2/ValidationDataset.csv',
                         header='true',
                         inferSchema='true',
                         sep=';')

transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = DecisionTreeModel.load(sc, "s3://himaniproject2/model")

predictions = model.predict(transformed_df.map(lambda x: x.features))

labels_and_predictions = transformed_df.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    transformed_df.count())
print(".........................................................")
print("Model accuracy....................: %.3f%%" % (acc * 100))

metrics = MulticlassMetrics(labels_and_predictions)

fscore = metrics.fMeasure()
print(".........................................................")
print("F1 Score.................................. = %s" % fscore)
Code Example #41
            .setMaster(master)
            .setAppName(app_name))

    sc = SparkContext(conf=conf)
    lines = sc.textFile(input)
    parsedData = lines.map(parseLine)
    (trainingData, testData) = parsedData.randomSplit([0.5, 0.5])

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    predictions.foreach(my_print)

    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    labelsAndPredictions.foreach(my_print)

    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, output)
    sameModel = DecisionTreeModel.load(sc, output)


    sc.stop()
Code Example #42
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(
    sc, 'file:///usr/local/spark/data/mllib/sample_libsvm_data.txt'
)  # The code on web is wrong, this is correct.
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=5,
                                     maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(
    sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
Code Example #43
	def getModel(self, path):
		if self.type == 'NaiveBayes':
			return NaiveBayesModel.load(self.sc, path)
		elif self.type == 'DecisionTree':
			return DecisionTreeModel.load(self.sc, path)
Code Example #44
	#Cancelled becomes the 9th column now, and total columns in the data = 9
	label = clean_line_split[8]
	nonLable = clean_line_split[0:8]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache ()

#start timer at this point
startTime = datetime.now()
#build the model
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testErr = labelsAndPredictions.filter (lambda (v, p): v != p).count() / float(test.count())
print ('Time consumed = '), (datetime.now() - startTime)

print ('Test Error = ' + str (testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

#save and load model
model.save(sc, "DT-Class-W-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-95-08")
sc.stop ()
Code Example #45
    print "######################################################\n"
    print "######################################################\n"
    print "#########            Start!!!                  #######\n"
    print "######################################################\n"
    print "######################################################\n"
    print "\n\n\n"
    #stop_rdd = rdd_tweets.coalesce(1)
    #stop_rdd.saveAsTextFile(output_path)
    print "****************************************************\n"
    print "Here is the last step\n"
    print "****************************************************\n"



    #Here are the training steps.
    binladen_model = DecisionTreeModel.load(sc, binladen_model_path)
    #
    #training_data = MLUtils.loadLibSVMFile(sc, training_path)
    test_data = rdd_labelFeatures
    # Evaluate model on test instances and compute test error
    predictions = binladen_model.predict(test_data.map(lambda x: x.features))
    # test the error value
    labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v!=p).count() / float(test_data.count())
    tmp_str = 'Test Error = ' + str(testErr)
    print(tmp_str)
    log_write(tmp_str)
    print "\n\n"

    #featuresAndPredictions = test_data.flatMap(lambda words: resplit_only_feature(words))\
    #        .zip(predictions)
Code Example #46
	nonLable = clean_line_split[1:]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor (training, categoricalFeaturesInfo={},
                                         impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testMSE = labelsAndPredictions.map (lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)

print ('Test Mean Squared Error = ' + str (testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

#save and load model
model.save (sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load (sc, "DTR-Wide-2008")
sc.stop ()
Code Example #47
                         scheduled_departure_time=t[1].scheduled_departure_time,
                         actual_departure_time=t[1].actual_departure_time,
                         departure_delay_minutes=t[1].departure_delay_minutes,
                         scheduled_arrival_time=t[1].scheduled_arrival_time,
                         actual_arrival_time=t[1].actual_arrival_time,
                         arrival_delay_minutes=t[1].arrival_delay_minutes,
                         crs_elapsed_flight_minutes=t[1].crs_elapsed_flight_minutes,
                         distance=t[1].distance)


if __name__ == "__main__":
    sc = SparkContext(appName="InsightEdge Python API Demo: prediction job")
    ssc = StreamingContext(sc, 3)
    sqlc = SQLContext(sc)

    zkQuorum = "localhost:2181"
    topic = "flights"

    model = DecisionTreeModel(Utils.load_model_from_grid("DecisionTreeFlightModel", sc))

    carrier_mapping = sc.broadcast(load_mapping("CarrierMap", sqlc))
    origin_mapping = sc.broadcast(load_mapping("OriginMap", sqlc))
    destination_mapping = sc.broadcast(load_mapping("DestinationMap", sqlc))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.foreachRDD(predict_and_save)

    ssc.start()
    ssc.awaitTermination()
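# (hedged addition) predict_and_save is referenced above but its body is not
# part of this snippet. A minimal sketch of the shape such a foreachRDD
# callback could take -- the JSON field names and feature layout below are
# hypothetical placeholders, not the project's real implementation.
import json
from pyspark.mllib.linalg import Vectors

def predict_and_save_sketch(rdd):
    if rdd.isEmpty():
        return

    def to_features(line):
        f = json.loads(line)  # assumes JSON-encoded flight records
        # the broadcast dictionaries map categorical codes to numeric indices
        return Vectors.dense([
            carrier_mapping.value[f["carrier"]],
            origin_mapping.value[f["origin"]],
            destination_mapping.value[f["destination"]],
            float(f["scheduled_departure_time"]),
            float(f["distance"]),
        ])

    predictions = model.predict(rdd.map(to_features))
    # the real job would join the predictions back onto the parsed flights and
    # persist them to the grid; printing keeps the sketch self-contained
    print(predictions.take(5))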
コード例 #48
0
	nonLabel = clean_line_split[1:]
	return LabeledPoint (label, nonLabel)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor (training, categoricalFeaturesInfo={},
                                         impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testMSE = labelsAndPredictions.map (lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = ' + str (datetime.now() - startTime))

print ('Test Mean Squared Error = ' + str (testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

#save and load model
model.save (sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load (sc, "DTR-Narrow-2008")
sc.stop ()
コード例 #49
0
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=n_estimators,
                                         featureSubsetStrategy="auto",
                                         impurity='gini')
    # accuracy test on the test set
    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda _: _[0] != _[1]).count() / float(test.count())

    n_unlabeled = unlabeled_data.count()

    rdd = sc.parallelize([])
    for tree in model._java_model.trees():
        predX = DecisionTreeModel(tree).predict(unlabeled_data.map(lambda _ : _[0].features))\
            .zipWithIndex()\
            .map(lambda _: (_[1], _[0]))
        rdd = rdd.union(predX)

    classPrediction = rdd.groupByKey().mapValues(sum)
    classPrediction = classPrediction.sortByKey()
    entropies = classPrediction.map(lambda _: abs(0.5 -
                                                  (1 - (_[1] / n_estimators))))

    unlabeled_entropies = unlabeled_indices.map(lambda _: _[0])\
        .zipWithIndex()\
        .map(lambda _: (_[1], _[0]))\
        .leftOuterJoin(entropies.zipWithIndex().map(lambda _:(_[1], _[0])))\
        .map(lambda _:_[1])

    sorted_unlabeled_entropies = unlabeled_entropies.sortBy(lambda _: _[1])
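    # (hedged addition, not in the original snippet) The score computed above
    # is |0.5 - fraction of trees voting for class 0|, so values near 0 mean
    # the forest is most split. A minimal uncertainty-sampling step would take
    # the first k entries of the ascending sort as the next batch to label;
    # k and query_indices are names introduced here purely for illustration.
    k = 100
    query_indices = [idx for idx, score in sorted_unlabeled_entropies.take(k)]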
コード例 #50
0
File: test_algorithms.py  Project: Ignalina/spark311
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(),
                         dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(),
                         rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(),
                         gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
コード例 #51
0
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("TaxiWeb")
sc = SparkContext(conf=conf)
model = DecisionTreeModel.load(sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")