def main():
    """Entry point: parse the single JSON CLI argument and load the train/test
    LibSVM splits it describes.

    Expects exactly one argument: a JSON object with keys scene, version,
    model, sample-path, num-features, num-classes.
    """
    if len(sys.argv) != 2:
        print("parameter error")
        sys.exit(1)

    arg = json.loads(sys.argv[1])

    spark = SparkSession.builder \
        .appName("spark_lr") \
        .enableHiveSupport() \
        .getOrCreate()

    scene = arg["scene"]
    version = arg["version"]
    model = arg["model"]
    sample_path = arg["sample-path"]
    num_features = arg["num-features"]
    num_classes = arg["num-classes"]  # NOTE(review): read but never used below

    # sample_path embeds a placeholder pattern; substitute it once per split.
    placeholder = "{scene}.{model}.{version}.[train|test]"
    train_file = sample_path.replace(placeholder,
                                     "%s.%s.%s.train" % (scene, model, version))
    test_file = sample_path.replace(placeholder,
                                    "%s.%s.%s.test" % (scene, model, version))

    train = MLUtils.loadLibSVMFile(spark.sparkContext, train_file, num_features)
    test = MLUtils.loadLibSVMFile(spark.sparkContext, test_file, num_features)

    print(train.count())
    print(test.count())
    print(train.getNumPartitions())
    print(test.getNumPartitions())
def split_data():
    """Load the patient LibSVM data and split it for modeling.

    Returns a dict {'train', 'test', 'for_finding_more'} of DataFrames, or
    None when loading/splitting failed (the traceback is printed to stdout).
    """
    ret_obj = None  # BUG FIX: was unbound (NameError) when the try raised early
    try:
        # toDF() lets us filter on the 'label' column; Naive Bayes later wants
        # an RDD of LabeledPoint, which the caller derives from these frames.
        pat_proc = MLUtils.loadLibSVMFile(
            sc,
            home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*'
        ).toDF()
        print("pat_proc.count() = " + str(pat_proc.count()))  # ~150,127 rows: ['label', 'features']

        anom = pat_proc.filter(pat_proc.label == 1)
        benign = pat_proc.filter(pat_proc.label == 0)
        n_benign = benign.count()

        # Take a random sample of ~50K benign rows. BUG FIX: use true division
        # (under Python 2, 50000/n_benign was integer division yielding 0) and
        # cap at 1.0 so randomSplit never receives a negative weight when
        # fewer than 50K benign rows exist.
        frac = min(1.0, 50000.0 / n_benign)
        (into_model, for_finding_more) = benign.randomSplit([frac, 1 - frac])
        print("into_model.count() = " + str(into_model.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))

        for_modeling = anom.unionAll(into_model)
        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        ret_obj = {'train': train, 'test': test, 'for_finding_more': for_finding_more}
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return ret_obj
def Random_Forest(filename, sc):
    """Train and evaluate a RandomForest classifier on a LibSVM data file.

    filename: path to the LibSVM data (the bundled sample file is used as a
    fallback when empty/None); sc: active SparkContext.
    """
    # BUG FIX: the 'filename' argument was unconditionally overwritten by this
    # hard-coded path, so callers could never choose their own data file.
    if not filename:
        filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # BUG FIX: "lambda (v, p):" tuple-parameter unpacking is Python-2-only
    # syntax (removed by PEP 3113); index the pair instead.
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    #model.save(sc, "target/tmp/myRandomForestClassificationModel")
    #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
def predict():
    """Score INPUT_DATA_PATH with the saved RandomForest model, dump the raw
    predictions to a timestamped text file, and print PR/ROC areas plus the
    plain error rate."""
    testData = MLUtils.loadLibSVMFile(sc, INPUT_DATA_PATH)
    print("[INFO] load complete.")
    model = RandomForestModel.load(sc, TEST_MODEL_PATH)

    # Predict from the features only, then persist every raw prediction.
    predictions = model.predict(testData.map(lambda x: x.features))
    predicted_values = predictions.collect()
    out_name = TEST_PREDICT_PATH + "/" + time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()) + ".txt"
    with open(out_name, 'w') as out:
        for value in predicted_values:
            out.write(str(value) + "\n")

    # Pair binarized truth with binarized prediction for the metrics below.
    labelsAndPredictions = testData.map(lambda lp: tobin(lp.label)).zip(
        predictions.map(lambda lp: tobin(lp)))
    metrics = BinaryClassificationMetrics(labelsAndPredictions)
    # Area under precision-recall curve, then area under ROC curve.
    print("Area under PR = %s" % metrics.areaUnderPR)
    print("Area under ROC = %s" % metrics.areaUnderROC)

    # Fraction of rows where the binarized prediction misses the truth.
    mismatches = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count()
    testErr = mismatches / float(testData.count())
    print('[INFO] Test Error = ' + str(testErr))
def main():
    """Run the RandomForest classification example end to end: load, split,
    train, evaluate, then save the model and load it back."""
    options = parse_args()

    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # LibSVM file -> RDD[LabeledPoint]; hold out 30% for testing.
    data = MLUtils.loadLibSVMFile(sc, options.data_file)
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Empty categoricalFeaturesInfo treats every feature as continuous; use a
    # larger numTrees in practice. featureSubsetStrategy="auto" lets the
    # algorithm choose the per-tree feature subset.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Fraction of held-out points the model gets wrong.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Round-trip the model through disk.
    model.save(sc, options.output_model)
    sameModel = RandomForestModel.load(sc, options.output_model)
def random_forest():
    """Train/evaluate a RandomForest on the bundled sample data, then save the
    model and load it back."""
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    # print("\npyspark version:" + str(sc.version) + "\n")

    data = MLUtils.loadLibSVMFile(sc, './data/sample_libsvm_data.txt')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # BUG FIX: RDD.filter passes each element — a (label, prediction) tuple —
    # as ONE argument; the old two-parameter "lambda v, p:" raised TypeError.
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
        testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # BUG FIX: the model was saved under ".model/..." but loaded from
    # "./model/..."; use a single path for both so the reload succeeds.
    model_path = "./model/myRandomForestClassificationModel"
    model.save(sc, model_path)
    sameModel = RandomForestModel.load(sc, model_path)
def prediction(model_directory, libsvm_file, outputfile):
    """Score a LibSVM file with a saved LogisticRegressionModel and write one
    raw score per line to *outputfile*."""
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    model = LogisticRegressionModel.load(sc, model_directory)

    vectors = MLUtils.loadLibSVMFile(sc, libsvm_file,
                                     numFeatures=model.numFeatures)
    vectors.cache()
    # Emit raw scores rather than thresholded 0/1 labels.
    model.clearThreshold()

    scores = vectors.map(lambda p: model.predict(Vectors.dense(p.features)))
    collected = scores.collect()

    out = open(outputfile, 'w')
    for score in collected:
        out.write(str(score) + '\n')
    out.close()
def train():
    """Train a RandomForest on a 20% sample of TEST_DATA_PATH, print the test
    error and the learned forest, then persist the model to TEST_MODEL_PATH."""
    data = MLUtils.loadLibSVMFile(sc, TEST_DATA_PATH)
    print("[INFO] load complete.")

    # Keep only 20% of the data, then split that into 70% train / 30% test.
    data = data.randomSplit([0.2, 0.8])[0]
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Empty categoricalFeaturesInfo treats every feature as continuous;
    # featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=NUM_OF_CLASSES,
                                         categoricalFeaturesInfo={},
                                         numTrees=NUM_OF_TREES,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=MAXDEPTH,
                                         maxBins=MAXBINS)

    # Error rate on the held-out 30%.
    predicted = model.predict(testData.map(lambda x: x.features))
    paired = testData.map(lambda lp: lp.label).zip(predicted)
    testErr = paired.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('[INFO] Test Error = ' + str(testErr))
    print('[INFO] Learned classification forest model:')
    print(model.toDebugString())

    # Persist the model and verify it loads back.
    model.save(sc, TEST_MODEL_PATH)
    sameModel = RandomForestModel.load(sc, TEST_MODEL_PATH)
def _load_data(self, path):
    """Load a dataset from *path*, honoring the job's configured format.

    'libsvm' -> RDD of LabeledPoint via MLUtils; any other format falls back
    to a cached plain-text RDD.
    """
    if self.job_args.get('dataset_format') == 'libsvm':
        return MLUtils.loadLibSVMFile(self.context, path)
    return self.context.textFile(path).cache()
def LinearRegression(trainFile, testFile, taskid, sc):
    """Train LinearRegressionWithSGD on *trainFile* and print the mean squared
    error of its predictions on *testFile* (both LibSVM format)."""
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)

    # Train the model.
    model = LinearRegressionWithSGD.train(trainData)

    # Evaluate on the held-out file: (truth, prediction) pairs.
    predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
    # BUG FIX: "lambda (v, p):" tuple-parameter unpacking is Python-2-only
    # syntax (PEP 3113); index the pair instead.
    MSE = predictionAndLabels.map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / predictionAndLabels.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
def train_model(filename='final_tip_all.txt', test_portion=0.2,
                cat_var=cat_var_dic, n_tree=250, mode_feature_strat='auto',
                max_deep=5, max_bin=32):
    """Train a RandomForest regressor on a LibSVM file and print its test RMSE.

    filename: LibSVM data file; test_portion: fraction held out for testing;
    cat_var: categoricalFeaturesInfo map (feature index -> arity); remaining
    parameters are passed through to RandomForest.trainRegressor.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    (trainingData, testData) = data.randomSplit([1 - test_portion, test_portion])

    ##### TREAT TEMP AS CONTINUOUS ####
    model = RandomForest.trainRegressor(
        trainingData, categoricalFeaturesInfo=cat_var, numTrees=n_tree,
        featureSubsetStrategy=mode_feature_strat, impurity='variance',
        maxDepth=max_deep, maxBins=max_bin)

    # Evaluate model on test instances and compute test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # BUG FIX: "lambda (v, p):" tuple-parameter unpacking is Python-2-only
    # syntax (PEP 3113); index the pair instead.
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    testRMSE = math.sqrt(testMSE)

    # Convert the prediction RDD to a single-column DataFrame and cache it.
    df_predictions = predictions.map(lambda x: (x, )).toDF()
    df_predictions.cache()

    print('Test Root Mean Squared Error on ' + filename + ' = ' + str(testRMSE))
def npmat_to_rdd_wreadwrite(sc, X, Y, f_name, delete_file=False):
    """Round-trip scikit-style data into a Spark RDD via a LibSVM file.

    Dumps features X (numpy matrix) and labels Y (1-D numpy array) to
    *f_name* in libsvm format, reads the file back with MLUtils into an RDD
    on SparkContext *sc*, and optionally removes the intermediate file.
    """
    sklearn.datasets.dump_svmlight_file(X, Y, f_name, zero_based=False)
    rdd = MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
        os.remove(f_name)
    return rdd
def npmat_to_rdd_wreadwrite(sc, X, Y, f_name, delete_file=False):
    """ Takes a data prepared for scikit model X in numpy matrix format, Y one-dimensional numpy array
    and writes to file in libsvm format with filename string f_name provided (could delete automatically), then reads from file
    directly into spark RDD object (for given Sparkcontext sc)
    """
    # NOTE(review): this duplicates an identically-named helper defined earlier
    # in this file; consider keeping only one definition.
    sklearn.datasets.dump_svmlight_file(X, Y, f_name, zero_based=False)
    read_rdd = MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
        # Best-effort cleanup of the intermediate libsvm file.
        os.remove(f_name)
    return read_rdd
def __init__(self, sc):
    """Init the engine: load the LibSVM training file and fit the model."""
    logger.info("Starting up the GeneLearn Engine: ")
    self.sc = sc

    logger.info("Loading training data...")
    # Hard-coded development data location.
    dataset_path = "/Users/qingpeng/Dropbox/Development/Bitbucket/jgi-genelearn/scripts/Flask"
    training = MLUtils.loadLibSVMFile(
        sc, os.path.join(dataset_path, 'training.svmlib'))
    self.model = LogisticRegressionWithLBFGS.train(training)
def testing_model(model_directory, libsvm, prediction, report, prc_file):
    """Evaluate a saved logistic-regression model on a LibSVM test set and
    write the evaluation report, per-sample predictions, and a PRC plot.

    model_directory: saved LogisticRegressionModel location; libsvm: test
    data; prediction/report/prc_file: output paths.
    """
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    model = LogisticRegressionModel.load(sc, model_directory)
    testing_rdd = MLUtils.loadLibSVMFile(sc, libsvm,
                                         numFeatures=model.numFeatures)
    testing_rdd.cache()
    au_prc, precision, recall, thresholds, y_true, y_scores = evaluate_model(
        testing_rdd, model)
    # BUG FIX: the Python-2-only "print 'x'" statements were rewritten as
    # print() calls, which print the same text and also run on Python 3.
    print('evaluating_model done!\n')
    write_to_report(au_prc, precision, recall, thresholds, report)
    print('write_to_report done!\n')
    write_to_prediction(y_true, y_scores, prediction)
    print('write_to_prediction done!\n')
    draw_prc(precision, recall, prc_file, au_prc)
    print('draw_prc done!\n')
def training(model_directory, libsvm, scaler):
    """Fit a logistic-regression (LBFGS) model on a LibSVM training set —
    standardizing the features first when scaler == '1' — and save it to
    *model_directory*."""
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()

    if scaler == '1':
        labels = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)
        fitted_scaler = StandardScaler().fit(features)
        # Re-attach labels to the standardized features, then wrap each
        # (label, features) pair back into a LabeledPoint.
        scaled_pairs = labels.zip(fitted_scaler.transform(features))
        labeled = scaled_pairs.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(labeled)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)

    model_logistic.save(sc, model_directory)
def predict():
    """Load the saved RandomForest model, score TEST_DATA_PATH, persist the
    raw labels, and return the per-class rate summary."""
    conf = SparkConf().setMaster("local").setAppName("My App")
    sc = SparkContext(conf=conf)

    testData = MLUtils.loadLibSVMFile(sc, TEST_DATA_PATH)
    model = RandomForestModel.load(sc, TEST_MODEL_PATH)

    predicted = model.predict(testData.map(lambda x: x.features))
    predicted_labels = predicted.collect()

    rate_of_each_sort = analyse_result(predicted_labels)
    save(predicted_labels)
    return rate_of_each_sort
def Gradient_BoostedTrees(filename, sc):
    """Train and evaluate a GradientBoostedTrees classifier on a LibSVM file.

    filename: path to the LibSVM data (the bundled sample file is used as a
    fallback when empty/None); sc: active SparkContext.
    """
    # BUG FIX: the 'filename' argument was ignored in favor of a hard-coded
    # path; honor the caller's path and keep the old one only as a fallback.
    if not filename:
        filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # BUG FIX: "lambda (v, p):" tuple-parameter unpacking is Python-2-only
    # syntax (PEP 3113); index the pair instead.
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())
def random_forest():
    """Random-forest smoke test on Spark's bundled mllib sample dataset.

    80% of the rows train the model, 20% test it; prints the error rate,
    saves the model on first run, reloads it, and returns the loaded model.
    """
    data_rdd = MLUtils.loadLibSVMFile(
        sc, '{}/mllib/sample_libsvm_data.txt'.format(current_dir))
    train_rdd, test_rdd = data_rdd.randomSplit([0.8, 0.2])

    model = RandomForest.trainClassifier(train_rdd,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3)

    # Predicted label (0/1) for each test row, zipped with the true label.
    predicted = model.predict(test_rdd.map(lambda x: x.features))
    paired = test_rdd.map(lambda lp: lp.label).zip(predicted)

    # Number, then fraction, of test rows where prediction != truth.
    print(paired.filter(lambda x: x[0] != x[1]).count())
    err_rate = paired.filter(lambda x: x[0] != x[1]).count() / float(
        test_rdd.count())
    print("test error rate:{}".format(err_rate))

    # Persist the trained model once, then load it back.
    model_path = "{}/my_random_forest_model".format(current_dir)
    if not os.path.exists(model_path):
        model.save(sc, model_path)
    trained_model = RandomForestModel.load(
        sc, "{}/my_random_forest_model".format(current_dir))
    print(trained_model.toDebugString())
    return trained_model
accuracy = metrics.accuracy_score(expected, predicted) if i==0: print("Random Forest accuracy is {}".format(accuracy)) else: print("Gradient Boosting accuracy is {}".format(accuracy)) cal_model_accuracy((RFT, GBT)) # In[6]: #IV Use MLlib sc = SparkContext("local", "Ensemble_Tree") # In[7]: data = MLUtils.loadLibSVMFile(sc, '/usr/local/spark/data/mllib/sample_libsvm_data.txt') # In[8]: #Split the training set and test set (trainingData, testData) = data.randomSplit([0.7, 0.3]) # In[9]: #Training model RF_model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=32)
# -*- coding:utf-8 -*-
from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    conf = SparkConf().setAppName("RandomForestForClassification").setMaster("local[2]")
    sc = SparkContext(conf=conf)

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, '../sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest regression model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # BUG FIX: "lambda (v, p):" tuple-parameter unpacking is Python-2-only
    # syntax and a SyntaxError on Python 3; index the pair instead.
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / \
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
# MLlibs
# from pyspark.mllib.classification import SVMWithSGD
from csv import DictWriter  # BUG FIX: DictWriter was used below but never imported
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext, SparkConf
from numpy import array

# Exclude for AWS implementation
conf = SparkConf()
sc = SparkContext(conf=conf)

# 0 128:51 129:159 130:253 131:159 [Label KEY:VALUE]
training_data = MLUtils.loadLibSVMFile(sc, "train_libsvm.txt")
# 231547 128:51 129:159 130:253 131:159 [PhraseId KEY:VALUE]
testing_data = MLUtils.loadLibSVMFile(sc, "test_libsvm.txt")

# Build the model
model = DecisionTree.trainClassifier(training_data, numClasses=5,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=100)

# Evaluating the model on testing_data data.
# NOTE(review): calling model.predict inside a worker-side map can fail for
# JVM-backed models in some PySpark versions — confirm on the target cluster.
labelsAndPreds = testing_data.map(lambda p: (p.label, model.predict(p.features)))

# Map predicted sentiment -> phrase id (the "label" column of the test file).
test = {}
for i in labelsAndPreds.collect():
    test[i[1]] = i[0]

# Write predictions
o = DictWriter(open('pred.csv', 'w'), ['PhraseId', 'Sentiment'])
# Import DecisionTree / DecisionTreeModel
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext

sc = SparkContext("local", "SVM")

# Loading and parsing data into RDD of LabeledPoint
# Sample data provided by Spark 1.3.1 folder
# To run locally
#data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
# To run on hadoop server
data = MLUtils.loadLibSVMFile(sc, 'jingrong/sample_libsvm_data.txt')

# Splits data - Approximately 70% training , 30% testing
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train the decision tree model
# Empty categoricalFeaturesInfo indicates that all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances, compute test error
allPredictions = model.predict(testData.map(lambda x: x.features))
predictionsAndLabels = testData.map(lambda pl: pl.label).zip(allPredictions)
# BUG FIX: "lambda (v, p):" tuple-parameter unpacking and the bare
# "print x, y" statement below are Python-2-only; both rewritten portably.
testMeanSquaredError = predictionsAndLabels.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())

# Printing results
print("Tested Mean Squared Error: ", testMeanSquaredError)
from __future__ import print_function import sys from pyspark import SparkContext from pyspark.mllib.util import MLUtils datapath = '/Data/Spark/MLLib/Sample_Binary_Classification_Data.txt' # fraction of data to sample fraction = 0.1 examples = MLUtils.loadLibSVMFile(sc, datapath) numExamples = examples.count() if numExamples == 0: print("Error: Data file had no samples to load.", file=sys.stderr) exit(1) print('Loaded data with %d examples from file: %s' % (numExamples, datapath)) # Example: RDD.sample() and RDD.takeSample() expectedSampleSize = int(numExamples * fraction) print('Sampling RDD using fraction %g. Expected sample size = %d.'% (fraction, expectedSampleSize)) sampledRDD = examples.sample(withReplacement=True, fraction=fraction) print(' RDD.sample(): sample has %d examples' % sampledRDD.count()) sampledArray = examples.takeSample(withReplacement=True, num=expectedSampleSize) print(' RDD.takeSample(): sample has %d examples' % len(sampledArray))
#-------------------------------------------------------------------------------
# Read the training data and build the model
#-------------------------------------------------------------------------------

# Reading the train dataframes
trainingDF = spark.read.load("../data/train_small.parquet")

# Convert every row to LabeledPoint; labels are shifted down by one so they
# start at 0.
transformedTrainingRDD = (trainingDF.rdd.map(
    lambda row: LabeledPoint(int(row.label) - 1, row.features)))
#print transformedTrainingRDD.show()

# Save the RDD in LibSVM format, as Naive Bayes reads in the same format,
# then load it back as an RDD of LabeledPoint.
MLUtils.saveAsLibSVMFile(transformedTrainingRDD, "trainingLibsvmfile")
training = MLUtils.loadLibSVMFile(sc, "trainingLibsvmfile/*")
# BUG FIX: the Python-2-only "print" statements were rewritten as print()
# calls, which print the same text and also run on Python 3.
print("trainingLibsvmfile created!!")

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(training, numClasses=10,
                                     categoricalFeaturesInfo={},
                                     numTrees=24, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
print("Model built!!")
from __future__ import print_function import sys from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="PythonRandomForestClassificationExample") # $example on$ # Load and parse the data file into an RDD of LabeledPoint. data = MLUtils.loadLibSVMFile(sc, 'deathproject/datanew2.txt') # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a RandomForest model. # Empty categoricalFeaturesInfo indicates all features are continuous. # Note: Use larger numTrees in practice. # Setting featureSubsetStrategy="auto" lets the algorithm choose. print('Starting...') model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={0: 4, 1: 19, 2: 9, 3: 2, 4: 6, 6: 8, 7: 4, 8: 3, 9: 16, 10: 8, 11: 11}, numTrees=8, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32) #model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={1: 4, 1}, #numTrees=10, featureSubsetStrategy="auto", #impurity='gini', maxDepth=4, maxBins=32)
# MAGIC Upon completing this lab you should understand how to read from and write to files in Spark, convert between `RDDs` and `DataFrames`, and build a model using both the ML and MLlib APIs. # COMMAND ---------- # MAGIC %md # MAGIC #### Loading the data # MAGIC # MAGIC First, we need to load data into Spark. We'll use a built-in utility to load a [libSVM file](www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html), which is stored in an S3 bucket on AWS. We'll use `MLUtils.loadLibSVMFile` to load our file. Here are the [Python](http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils.loadLibSVMFile) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) APIs. # COMMAND ---------- from pyspark.mllib.util import MLUtils baseDir = '/mnt/ml-amsterdam/' irisPath = baseDir + 'iris.scale' irisRDD = MLUtils.loadLibSVMFile(sc, irisPath, minPartitions=20).cache() # We get back an RDD of LabeledPoints. Note that the libSVM format uses SparseVectors. irisRDD.take(5) # COMMAND ---------- # MAGIC %md # MAGIC What if we wanted to see the first few lines of the libSVM file to see what the format looks like? # COMMAND ---------- sc.textFile(irisPath).take(5) # COMMAND ----------
from pyspark.mllib.stat import Statistics
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    # Optional single argument: path to a LibSVM data file.
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        sys.exit(-1)  # FIX: sys.exit instead of the site-provided exit() builtin
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'

    # Densify each feature vector so per-feature indexing below works the same
    # for sparse inputs.
    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())

    # Statistics (correlations)
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing).
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: "lambda (v, p):" tuple-parameter unpacking is Python-2-only syntax
# (PEP 3113); index the pair instead.
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

# Full-resolution parameter file, the training file, and three down-scaled
# variants (720/540/360).
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = GradientBoostedTrees.trainRegressor(traindata,
                                            categoricalFeaturesInfo={},
                                            numIterations=5)

# BUG FIX: every "lambda (v, p):" below used Python-2-only tuple-parameter
# unpacking (PEP 3113); rewritten with indexing so the script runs on Python 3.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_rbt")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_rbt")

predictions_540 = model.predict(data_540.map(lambda x: x.features))
labelsandpredictions_540 = data_540.map(lambda lp: lp.label).zip(predictions_540)
MSE_540 = labelsandpredictions_540.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_540.count())
print("training MSE_540 = " + str(MSE_540))
labelsandpredictions_540.saveAsTextFile("/usr/hadoop/hf_540_rbt")

predictions_360 = model.predict(data_360.map(lambda x: x.features))
from pyspark.sql import SQLContext
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import LogisticRegression

from peregrine import descend
from peregrine.objectives import Worker
from peregrine.objectives import Executor

from logreg import logistic_regression
from logreg import logreg_local
from logreg import collect_one

# Benchmark script (Python 2): train l1/l2-regularized logistic regression on
# the a9a dataset three ways — a single-node baseline, peregrine on Spark,
# and (started at the bottom) Spark MLlib.
# NOTE(review): SparkController is used below but not imported in this span —
# presumably defined/imported elsewhere in the original file; confirm.
with SparkController() as sc:
    data_path, npar = './data/a9a', 5
    dataset = MLUtils.loadLibSVMFile(sc, data_path, minPartitions=npar).cache()
    # Materialize the whole RDD on the driver for the single-node baseline.
    local_data = Worker.from_rows(dataset.collect(), dense=False)
    n, d = local_data.n_samples, local_data.n_features
    print '#samples: {n}; #features: {d}'.format(n=n, d=d)

    print 'Baseline: training in single node mode...'
    prob = Executor(local_data, n, d, collect_one, logreg_local,
                    cached=True, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print 'Spark ({} partitions): training using peregrine...'.format(npar)
    prob = logistic_regression(dataset, dense=False, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print 'Spark ({} partitions): training using mllib...'.format(npar)
def logsreg(loadTrainingFilePath, sc): # Load training data in LIBSVM format loadTrainingFilePath = '/Users/Jacob/repository/SparkService/data/sample_libsvm_data.txt' data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath) # Split data into training (60%) and test (40%) traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L) traindata.cache() # Load testing data in LIBSVM format #testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath) # Run training algorithm to build the model model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3) # Compute raw scores on the test set predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label)) Json.generateJson("LogisticRegression", "12345678", traindata, predictionAndLabels); print 'Completed.' # Instantiate metrics object # metrics = MulticlassMetrics(predictionAndLabels) # # Overall statistics # precision = metrics.precision() # recall = metrics.recall() # f1Score = metrics.fMeasure() # #confusion_matrix = metrics.confusionMatrix().toArray() # print("Summary Stats") # print("Precision = %s" % precision) # print("Recall = %s" % recall) # print("F1 Score = %s" % f1Score) # # Statistics by class # labels = traindata.map(lambda lp: lp.label).distinct().collect() # for label in sorted(labels): # print("Class %s precision = %s" % (label, metrics.precision(label))) # print("Class %s recall = %s" % (label, metrics.recall(label))) # print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) # # Weighted stats # print("Weighted recall = %s" % metrics.weightedRecall) # print("Weighted precision = %s" % metrics.weightedPrecision) # print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) # print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) # print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) # #return model parameters # res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)), # ('2','Yes','FP Rate', 
metrics.falsePositiveRate(0.0)), # ('3','Yes','Precision', metrics.precision(0.0)), # ('4','Yes','Recall', metrics.recall(0.0)), # ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)), # ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)), # ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)), # ('3','Yes','Precision', metrics.precision(1.0)), # ('4','Yes','Recall', metrics.recall(1.0)), # ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)), # ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)), # ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)), # ('3','Yes','Precision', metrics.precision(2.0)), # ('4','Yes','Recall', metrics.recall(2.0)), # ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))] # #save output file path as JSON and dump into dumpFilePath # rdd = sc.parallelize(res) # SQLContext.createDataFrame(rdd).collect() # df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value']) #tempDumpFilePath = dumpFilePath + "/part-00000" #if os.path.exists(tempDumpFilePath): # os.remove(tempDumpFilePath) #df.toJSON().saveAsTextFile(hdfsFilePath) #tmpHdfsFilePath = hdfsFilePath + "/part-00000" #subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath]) # Save and load model #clusters.save(sc, "myModel") #sameModel = KMeansModel.load(sc, "myModel")
exit(1) if __name__ == "__main__": if len(sys.argv) > 2: usage() sc = SparkContext(appName="PythonDT") # Load data. dataPath = 'train_svm'# 'data/mllib/sample_libsvm_data.txt' if len(sys.argv) == 2: dataPath = sys.argv[1] if not os.path.isfile(dataPath): sc.stop() usage() points = MLUtils.loadLibSVMFile(sc, dataPath) # Re-index class labels if needed. (reindexedData, origToNewLabels) = reindexClassLabels(points) numClasses = len(origToNewLabels) # Train a classifier. categoricalFeaturesInfo = {} # no categorical features #model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses, # categoricalFeaturesInfo=categoricalFeaturesInfo) model = RandomForest.trainClassifier(reindexedData, numClasses=numClasses,categoricalFeaturesInfo={},numTrees=30,featureSubsetStrategy='auto', impurity='gini', maxDepth=8, maxBins=40, ) # Print learned tree and stats. print origToNewLabels print "Trained DecisionTree for classification:" # print " Model numNodes: %d" % model.numNodes() # print " Model depth: %d" % model.depth() print " Training accuracy: %g" % getAccuracy(model, reindexedData)
""" from __future__ import print_function from pyspark import SparkContext from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils import json from bson import json_util from bson.json_util import dumps if __name__ == "__main__": sc = SparkContext(appName="DecisionTreeClassification") raw_data = MLUtils.loadLibSVMFile(sc, '/home/hechem/spark-campaign-classification/test/data/sample_libsvm_data.txt') (trainingDataSet, testDataSet) = raw_data.randomSplit([0.7, 0.3]) tree = DecisionTree.trainClassifier(trainingDataSet, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=30) predictions = tree.predict(testDataSet.map(lambda x: x.features)) labelsAndPredictions = testDataSet.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testDataSet.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(tree.toDebugString()) tree_to_json = tree.toDebugString() # Parser def parse(lines): block = []
# Tail of a rating-parser function whose `def` line precedes this chunk
# (note it references `line`, which is not visible here).
parts = line.strip().split("::")
return (int(parts[0]),int(parts[1]),float(parts[2]))

if __name__ =="__main__":
    # NOTE(review): usage is printed but the script does not exit, so it will
    # proceed without a valid argument — confirm intended.
    if(len(sys.argv)!=2):
        print "Usage: /path to spark/bin/spark-submit name.py movieDir"
    # step 1 - create spark context
    # NOTE(review): `conf` is built but not passed to SparkContext() below.
    conf = SparkConf().setAppName("KMeans-Content")\
        .set("spark.executor.memory","1g")
    sc = SparkContext()
    # step 2 - load in input file
    data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
    labels = data.map(lambda x:x.label)
    features = data.map(lambda x:x.features)
    # step 3 - standarize the data with unit values and 0 mean
    scaler = StandardScaler(withMean=False,withStd=True).fit(features)
    data2 = labels.zip(scaler.transform(features))
    # Feature count sampled from the first scaled vector.
    numFeatures = len(data2.values().take(10)[0])
    print "Type of data2: ",type(data2) #RDD
    print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
    print "Sample: ",data2.values().take(1)[0]
    # splitting up the data to training, validation and testing models.
# parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) # # Build the model (cluster the data) # clusters = KMeans.train(parsedData, 3, maxIterations=10, runs=30, initializationMode="random") # # WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) # print("Within Set Sum of Squared Error = " + str(WSSSE)) # # res = [('k_means',dumpFilePath, WSSSE)] # rdd = sc.parallelize(res) # SQLContext.createDataFrame(rdd).collect() # df = SQLContext.createDataFrame(rdd,['model_name','res_path', 'WSSSE']) # df.toJSON().saveAsTextFile(dumpFilePath) if(model_name == "Regression"): # Load training data in LIBSVM format data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath) # Split data into training (60%) and test (40%) traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L) traindata.cache() # Load testing data in LIBSVM format #testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath) # Run training algorithm to build the model model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3) # Compute raw scores on the test set predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))
import sys

from pyspark.context import SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.tree import RandomForest, RandomForestModel

# Random-forest regression over weather data stored on HDFS; the input file
# name is taken from the first CLI argument.
sc = SparkContext('yarn', 'weather_predictor')
data = MLUtils.loadLibSVMFile(sc, 'hdfs:///users/wfvining/' + sys.argv[1])
(train, test) = data.randomSplit([0.7, 0.3])
# BUG FIX: the original passed the undefined name `trainData`; the split
# above binds `train`.
# NOTE(review): range(654, 615) is empty, so no categorical features are
# actually declared — the bounds look transposed; confirm intended indices.
model = RandomForest.trainRegressor(train,
                                    categoricalFeaturesInfo={x: 2 for x in range(654, 615)},
                                    numTrees=10, featureSubsetStrategy='auto',
                                    maxDepth=5)
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: Python-2-only tuple-parameter lambda replaced with indexing.
testErr = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(test.count())
# BUG FIX: a stray trailing 'e' made this print a syntax error.
print('Mean Squared Error: ' + str(testErr))
def summarize(dataset):
    # Print schema, mean label, and per-feature means for a dataset of
    # (label, features) rows. (Python-2 print statements.)
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()

if __name__ == "__main__":
    # CLI: optional single argument = path to a LIBSVM file.
    if len(sys.argv) > 2:
        print >> sys.stderr, "Usage: dataset_example.py <libsvm file>"
        exit(-1)
    sc = SparkContext(appName="DatasetExample")
    sqlContext = SQLContext(sc)
    if len(sys.argv) == 2:
        input = sys.argv[1]  # NOTE(review): shadows the builtin `input`
    else:
        input = "data/mllib/sample_libsvm_data.txt"
    points = MLUtils.loadLibSVMFile(sc, input)
    # inferSchema / saveAsParquetFile / parquetFile are legacy Spark 1.x APIs.
    dataset0 = sqlContext.inferSchema(points).setName("dataset0").cache()
    summarize(dataset0)
    # Round-trip the dataset through a temporary Parquet file; the temp file
    # is unlinked first because Spark wants to create the path itself.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print "Save dataset as a Parquet file to %s." % tempdir
    dataset0.saveAsParquetFile(tempdir)
    print "Load it back and summarize it again."
    dataset1 = sqlContext.parquetFile(tempdir).setName("dataset1").cache()
    summarize(dataset1)
    shutil.rmtree(tempdir)
# $example off$ from pyspark.mllib.util import MLUtils if __name__ == "__main__": sc = SparkContext(appName="PythonLinearRegressionWithSGDExample") # $example on$ # Load and parse the data def parsePoint(line): values = [float(x) for x in line.replace(',', ' ').split(' ')] return LabeledPoint(values[0], values[1:]) #data = sc.textFile("/home/yl408/yuhao_datasets/kdda_part") #parsedData = data.map(parsePoint) parsedData = MLUtils.loadLibSVMFile( sc, "file:///home/yl408/yuhao_datasets/kdda") # Build the model model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001) # Evaluate the model on training data # valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) # MSE = valuesAndPreds \ # .map(lambda (v, p): (v - p)**2) \ # .reduce(lambda x, y: x + y) / valuesAndPreds.count() # print("Mean Squared Error = " + str(MSE)) # Save and load model # model.save(sc, "target/tmp/pythonLinearRegressionWithSGDModel")
sys.path.append("/path/to/spark/python")
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.tree import RandomForest, RandomForestModel
    from pyspark.mllib.util import MLUtils
    print ("Successfully imported Spark Modules")
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

if __name__ == "__main__":
    # Random-forest classification of the iris dataset (LIBSVM format).
    conf = SparkConf().setAppName("RandomForest_Iris")
    sc = SparkContext(conf = conf)
    # FIX: was a Python-2 print statement, inconsistent with the print()
    # calls used everywhere else in this script.
    print("Loading data...")
    data = MLUtils.loadLibSVMFile(sc, '../../data/iris/iris.scale')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    model = RandomForest.trainClassifier(trainingData, numClasses=4,
                                         categoricalFeaturesInfo={},
                                         numTrees=5, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # FIX: Python-2-only tuple-parameter lambda replaced with indexing.
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save model
    model.save(sc, "model")
# $example on$ import math from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="PythonIsotonicRegressionExample") # $example on$ # Load and parse the data def parsePoint(labeledData): return (labeledData.label, labeledData.features[0], 1.0) data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt") # Create label, feature, weight tuples from input data with weight set to default value 1.0. parsedData = data.map(parsePoint) # Split data into training (60%) and test (40%) sets. training, test = parsedData.randomSplit([0.6, 0.4], 11) # Create isotonic regression model from training data. # Isotonic parameter defaults to true so it is only shown for demonstration model = IsotonicRegression.train(training) # Create tuples of predicted and real labels. predictionAndLabel = test.map(lambda p: (model.predict(p[1]), p[0])) # Calculate mean squared error between predicted and real labels.
""" from __future__ import print_function import sys from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="PythonRandomForestClassificationExample") # $example on$ # Load and parse the data file into an RDD of LabeledPoint. data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a RandomForest model. # Empty categoricalFeaturesInfo indicates all features are continuous. # Note: Use larger numTrees in practice. # Setting featureSubsetStrategy="auto" lets the algorithm choose. model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
# LOAD APPROPRIATE PACKAGE import numpy as np from pyspark.context import SparkContext from pyspark.mllib.util import MLUtils from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.evaluation import BinaryClassificationMetrics sc = SparkContext.getOrCreate() data = MLUtils.loadLibSVMFile(sc, 'data/dataLibSVM.txt') print(data) # NEXT LET'S CREATE THE APPROPRIATE TRAINING AND TEST SETS # WE'LL BE SETTING THEM AS 70-30, ALONG WITH SETTING A # RANDOM SEED GENERATOR TO MAKE MY RESULTS REPRODUCIBLE (trainingSet, testSet) = data.randomSplit([0.7, 0.3], seed = 7) ################## # DECISION TREES # ################## fitDT = DecisionTree.trainClassifier(trainingSet, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=3, maxBins=32) print(fitDT.toDebugString()) predictionsDT = fitDT.predict(testSet.map(lambda x: x.features))
# $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.util import MLUtils from pyspark.mllib.evaluation import MulticlassMetrics # $example off$ from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext(appName="MultiClassMetricsExample") # Several of the methods available in scala are currently missing from pyspark # $example on$ # Load training data in LIBSVM format data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") # Split data into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=11) training.cache() # Run training algorithm to build the model model = LogisticRegressionWithLBFGS.train(training, numClasses=3) # Compute raw scores on the test set predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabels) # Overall statistics
sc = SparkContext() if (len(sys.argv) != 2): print "usage: /sparkPath/bin/spark-submit name.py 'movieDirectory'" def parseRating(line): parts = line.strip().split("::") return (int(parts[0]) - 1, int(parts[1]) - 1, float(parts[2])) #load in input file path = sys.argv[1] #path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/" data = MLUtils.loadLibSVMFile(sc, path) labels = data.map(lambda x: x.label) features = data.map(lambda x: x.features) #normalize: #scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included) scaler = StandardScaler(withMean=False, withStd=True).fit( features) #becomes dense if using withMean. may run out of memory locally #convert data to dense vector to be normalized #data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray())))) data2 = labels.zip( scaler.transform(features)) #use this line if having memory issues #hide 10% of the data for final test
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Gradient-boosted-trees regression over SSIM feature files; reports MSE of
# model predictions per evaluation set.
sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')

model = GradientBoostedTrees.trainRegressor(traindata,
                                            categoricalFeaturesInfo={},
                                            numIterations=5)

# BUG FIX: `lambda (v, p): ...` tuple-parameter unpacking is Python-2-only
# syntax (a SyntaxError on Python 3); index the (label, prediction) pair.
def _squared_error(vp):
    return (vp[0] - vp[1]) * (vp[0] - vp[1])

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(_squared_error).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_rbt")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(_squared_error).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_rbt")

predictions_540 = model.predict(data_540.map(lambda x: x.features))
if "SPARK_HOME" not in os.environ: os.environ["SPARK_HOME"] = '/usr/hdp/2.3.4.0-3485/spark' SPARK_HOME = os.environ["SPARK_HOME"] sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib")) sys.path.insert(0, os.path.join(SPARK_HOME, "python")) try: sc.stop() except: pass conf = SparkConf().setAppName("ReadUsersData") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) rdd = MLUtils.loadLibSVMFile(sc, "/sourcedata/zhaih/s1") n = 10000 # number of records to be processed d = 7799 M = np.zeros((n, d + 1)).astype(np.int8) res = rdd.take(n) begin_time = time.time() for i in range(n): if i % 100 == 0: print i id = res[i].label values = res[i].features M[i][d] = int(id) for j in range(d): M[i][j] = int(values[j])
#http://spark.apache.org/docs/1.6.0/mllib-decision-tree.html
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
# NOTE(review): `sc` is not created in this chunk — presumably defined earlier.
data = MLUtils.loadLibSVMFile(
    sc, 'file:///usr/local/spark/data/mllib/sample_libsvm_data.txt'
)  # The code on web is wrong, this is correct.
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: `lambda (v, p)` tuple parameters are Python-2-only syntax (a
# SyntaxError on Python 3); index the pair instead.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Decision-tree regression over per-resolution feature files.
sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = DecisionTree.trainRegressor(traindata, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: `lambda (v, p): ...` tuple-parameter unpacking is Python-2-only
# syntax (a SyntaxError on Python 3); index the (label, prediction) pair.
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
#labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_dt")
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
def __init__(self, sc, path):
    # Load and parse the data file into an RDD of LabeledPoint.
    # sc: active SparkContext; path: LIBSVM-formatted input file.
    self.data = MLUtils.loadLibSVMFile(sc, path)
# Tail of a regression-evaluation helper whose `def` line (and its
# `train`/`test` parameters) precedes this chunk.
rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
model = rf.fit(train)
# (prediction, indexedLabel) pairs for the held-out rows.
predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
    .map(lambda x: (x.prediction, x.indexedLabel))
metrics = RegressionMetrics(predictionAndLabels)
print("rmse %.3f" % metrics.rootMeanSquaredError)
print("r2 %.3f" % metrics.r2)
print("mae %.3f" % metrics.meanAbsoluteError)

if __name__ == "__main__":
    # This example takes no CLI arguments.
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)

    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext

# Decision-tree classification of tumour patient records (LIBSVM format).
sc = SparkContext("local", appName="tree1")

#input - tumour patient records
data = MLUtils.loadLibSVMFile(sc, 'input.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=10,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=2, maxBins=100)

predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: `lambda (v, p)` tuple parameters are Python-2-only syntax (a
# SyntaxError on Python 3); index the pair instead.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
def Random_Forest(trainFile, testFile, taskid, sc):
    """Train a RandomForest classifier on trainFile, evaluate it on testFile,
    and emit the results as JSON via Json.generateJson.

    trainFile/testFile: LIBSVM-formatted paths; taskid: identifier passed
    through to the JSON report; sc: active SparkContext.
    """
    # filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
    # Load and parse the data files into RDDs of LabeledPoint.
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)

    # Number of distinct class labels in the training set.
    labelNum = trainData.map(lambda lp: lp.label).distinct().count()

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # BUG FIX: numClasses was hard-coded to 3 even though labelNum (the actual
    # class count) was computed above.
    # NOTE(review): numTrees=labelNum ties forest size to the class count,
    # which looks accidental — confirm intended.
    model = RandomForest.trainClassifier(trainData, numClasses=labelNum,
                                         categoricalFeaturesInfo={},
                                         numTrees=labelNum,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # NOTE(review): the "LogisticRegression" tag looks copy-pasted from another
    # model's code; left unchanged in case downstream consumers key on it.
    Json.generateJson("LogisticRegression", taskid, trainData, labelsAndPredictions);
    # (A large block of commented-out MulticlassMetrics reporting code followed
    # here in the original and continues in the next chunk.)
beta=1.0))) # # Weighted stats # print("Weighted recall = %s" % metrics.weightedRecall) # print("Weighted precision = %s" % metrics.weightedPrecision) # print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) # print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) # print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) # testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) # print('Test Error = ' + str(testErr)) # print('Learned classification forest model:') # print(model.toDebugString()) # Save and load model #model.save(sc, "target/tmp/myRandomForestClassificationModel") #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
""" from __future__ import print_function from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="BinaryClassificationMetricsExample") # $example on$ # Several of the methods available in scala are currently missing from pyspark # Load training data in LIBSVM format data = MLUtils.loadLibSVMFile( sc, "data/mllib/sample_binary_classification_data.txt") # Split data into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=11) training.cache() # Run training algorithm to build the model model = LogisticRegressionWithLBFGS.train(training) # Compute raw scores on the test set predictionAndLabels = test.map( lambda lp: (float(model.predict(lp.features)), lp.label)) # Instantiate metrics object metrics = BinaryClassificationMetrics(predictionAndLabels)
import shutil
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    # Parse the LIBSVM-formatted sample file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    # Roughly 60/40 train/test split.
    training, test = data.randomSplit([0.6, 0.4])

    # Fit a naive Bayes model with smoothing parameter 1.0.
    model = NaiveBayes.train(training, 1.0)

    # Pair each held-out point's prediction with its true label, then measure
    # the fraction of matches.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    correct = predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count()
    accuracy = 1.0 * correct / test.count()
    print('model accuracy {}'.format(accuracy))

    # Save and load model
exit(1)  # tail of usage() whose `def` precedes this chunk

if __name__ == "__main__":
    # CLI: optional single argument = path to a LIBSVM data file.
    if len(sys.argv) > 2:
        usage()
    sc = SparkContext(appName="PythonDT")
    # Load data.
    dataPath = 'data/mllib/sample_libsvm_data.txt'
    if len(sys.argv) == 2:
        dataPath = sys.argv[1]
    if not os.path.isfile(dataPath):
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Re-index class labels if needed.
    (reindexedData, origToNewLabels) = reindexClassLabels(points)
    numClasses = len(origToNewLabels)

    # Train a classifier.
    categoricalFeaturesInfo = {}  # no categorical features
    model = DecisionTree.trainClassifier(
        reindexedData, numClasses=numClasses,
        categoricalFeaturesInfo=categoricalFeaturesInfo)

    # Print learned tree and stats.
    print("Trained DecisionTree for classification:")
    print("  Model numNodes: %d" % model.numNodes())
    print("  Model depth: %d" % model.depth())
import numpy as np
from pyspark.mllib.linalg import Vectors

# Configure the environment
# NOTE(review): '$SPARK_HOME' is a literal string, not an expanded variable —
# confirm this placeholder is intentional.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = '$SPARK_HOME'
# $example off$

if __name__ == "__main__":
    # NOTE(review): SparkContext, MLUtils, NaiveBayes and os are not imported
    # in this chunk — presumably imported earlier in the file; confirm.
    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(
        sc, "/home/ajit/Desktop/spark_lib/sample_libsvm_data.txt")
    print(type(data))
    print(data)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    # BUG FIX: `lambda (x, v)` tuple parameters are Python-2-only syntax (a
    # SyntaxError on Python 3, which this script's print() calls suggest);
    # index the pair instead.
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda xv: xv[0] == xv[1]).count() / test.count()
    print(type(predictionAndLabel))
def loadData(sc, basepath, filepath):
    # Load the LIBSVM file at basepath/filepath and return a random 70/30
    # (trainingData, testData) split. (Python-2 print statement.)
    data = MLUtils.loadLibSVMFile(sc, os.path.join(basepath, filepath))
    trainingData, testData = data.randomSplit([0.7,0.3])
    print '\nLoad data finished'
    return trainingData, testData
from __future__ import print_function
# $example on$
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
# FIX: MLUtils was imported twice; the duplicate line is removed.
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    # Train and persist a random-forest regressor on the phishing dataset.
    sc = SparkContext(appName="PythonRandomForestRegxample")
    data = MLUtils.loadLibSVMFile(sc, "file:///home/yl408/yuhao_datasets/phishing")
    #data = spark.read.format("libsvm").load("file:///home/yl408/yuhao_datasets/rcv1_train.binary")
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)
    model.save(sc, "file:///home/yl408/spark-ml/myrandomForestModel")
from math import log, exp

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext()
sqlContext = SQLContext(sc)

data = MLUtils.loadLibSVMFile(sc, "hdfs:///hndata/docvecs")
# exp(label) - 1 — presumably undoing a log1p transform applied when the
# labels were written; confirm against the producer job.
data = data.map(lambda lp: LabeledPoint(exp(lp.label)-1.0, lp.features))

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                 numTrees=5, featureSubsetStrategy="auto",
                                 impurity='variance', maxDepth=4, maxBins=32)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: `lambda (v, p)` tuple parameters are Python-2-only syntax (a
# SyntaxError on Python 3); index the pair instead.
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest rr:')