def train_svm(points):
    model = SVMWithSGD.train(points, iterations=200)
    # Save and load model
    # model.save(sc, "target3/tmp/pythonSVMWithSGDModel")
    return model
def trainEvaluateModel(trainData, validationData, numIterations, stepSize, regParam):
    '''
    Training a model involves several input parameters. For a DecisionTree, for example,
    the values of impurity, maxDepth, maxBins, etc. all affect both accuracy and the time
    training takes, and we chart those parameter values against accuracy and run time.
    We evaluate one parameter at a time over several values (e.g. maxDepth over
    [3, 5, 10, 15, 20, 25]), as follows:
    (1) call SVMWithSGD.train on trainData with each value of the single parameter;
    (2) after building each model, evaluate its AUC on validationData;
    (3) repeat training and evaluation, collecting the AUC and run time of every
        parameter value into metricsRDD;
    (4) when all runs finish, convert metricsRDD into a Pandas DataFrame;
    (5) use the DataFrame to chart AUC and run time, showing how accuracy and
        execution time vary with each parameter value.
    :param trainData:
    :param validationData:
    :param numIterations:
    :param stepSize:
    :param regParam:
    :return:
    '''
    print('======================= train and evaluate model =======================')
    startTime = time()
    model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print('========== [trainEvaluateModel] >>>> parameters: numIterations=' + str(numIterations) +
          ', stepSize=' + str(stepSize) + ', regParam=' + str(regParam) + '\n' +
          '\t\t==>> time taken=' + str(duration) + ', AUC=' + str(AUC))
    return (AUC, duration, numIterations, stepSize, regParam, model)
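# ---------------------------------------------------------------------------
# A hedged usage sketch of the sweep-and-chart workflow the docstring above
# describes: evaluate one parameter at a time, collect the metrics, convert
# them to a Pandas DataFrame, and chart AUC against run time. The parameter
# grid, the column names, and the in-scope trainData/validationData RDDs are
# assumptions, not part of the original code.
import pandas as pd

numIterationsList = [5, 10, 20, 50, 100]
metrics = [trainEvaluateModel(trainData, validationData, n, 1.0, 0.01)
           for n in numIterationsList]
df = pd.DataFrame(metrics, columns=['AUC', 'duration', 'numIterations',
                                    'stepSize', 'regParam', 'model'])
# AUC on the primary axis, run time on the secondary axis
ax = df.plot(x='numIterations', y='AUC', legend=False)
df.plot(x='numIterations', y='duration', secondary_y=True, ax=ax, legend=False)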
def modelWithSVM(trainingData, validationData):
    ## Train the model using Support Vector Machines with different values of the
    ## regularization parameter and return the SVM model with the best accuracy rate.
    # eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    regularizationParamater = [.0000001, 1., 5000., 10000., 200000.]
    bestSVMModel = None
    bestAccuracy = 0
    numOfIterations = 100
    visualizationData = []
    for regularizer in regularizationParamater:
        model = SVMWithSGD.train(trainingData, numOfIterations, 1.0, regParam=regularizer)
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        visualizationData += [(regularizer, accuracy)]
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestSVMModel = model
    return bestSVMModel, visualizationData
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def trainevaluatemodel_svm(traindata, validationdata, iterations, step, minibatchfraction, regparam):
    starttime = time()
    model = SVMWithSGD.train(traindata, iterations=iterations, step=step, regParam=regparam,
                             miniBatchFraction=minibatchfraction, initialWeights=None,
                             regType='l2', intercept=False, validateData=True,
                             convergenceTol=0.001)
    index = evaluation2(model, validationdata)
    duration = time() - starttime
    print('Param:' + '\n' +
          'iterations:' + str(iterations) + '\n' +
          'step:' + str(step) + '\n' +
          'minibatchfraction:' + str(minibatchfraction) + '\n' +
          'regparam:' + str(regparam) + '\n' +
          'time:' + str(duration) + '\n' +
          'index:' + str(index))
    return (iterations, step, minibatchfraction, regparam, duration, index)
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def main():
    stock_file = sys.argv[1]
    output_predict_file = sys.argv[2]
    conf = SparkConf().setAppName('Stock Prediction Machine Learning with Twitter')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    # extract the header of the CSV file
    file_data_all = sc.textFile(stock_file)
    file_header = file_data_all.first()
    file_data = file_data_all.filter(lambda line: line != file_header).cache()

    # get data for five different predictions
    parsedFileData_NextDayActualOpening = file_data.map(parseNextDayActualOpening)
    parsedFileData_NextDayActualHigh = file_data.map(parseNextDayActualHigh)
    parsedFileData_NextDayActualLow = file_data.map(parseNextDayActualLow)
    parsedFileData_NextDayActualClose = file_data.map(parseNextDayActualClose)
    parsedFileData_NextDayActualVolume = file_data.map(parseNextDayActualVolume)
    print(parsedFileData_NextDayActualOpening.collect())

    # call SVM with Stochastic Gradient Descent and train on our data set
    svm_model_nxtdayactopn = SVMWithSGD.train(parsedFileData_NextDayActualOpening, iterations=10)
    lpreds = parsedFileData_NextDayActualOpening.map(
        lambda line: (line.label, svm_model_nxtdayactopn.predict(line.features)))
    print(lpreds.collect())
def svmClassification(trainSetFile, testSetFile):
    data1 = sc.textFile(directory_supervised + trainSetFile)
    trainData = data1.map(parsePoint)
    data2 = sc.textFile(directory_supervised + testSetFile)
    testData = data2.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(trainData, iterations=10)

    # Evaluating the model on training data
    '''
    labelsAndPreds = trainData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainData.count())
    print("Training Error = " + str(trainErr))
    labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print("Test Error = " + str(testErr))
    return testErr
    '''
    # labelsAndPreds = testData.map(lambda p: (p.label, float(model.predict(p.features))))
    # truePos = labelsAndPreds.filter(lambda p: p[0] == p[1]).count()
    # print("True pos : " + str(truePos))
    # metrics1 = MulticlassMetrics(labelsAndPreds)
    # print("Recall : " + str(metrics1.recall()))
    # print("Precision : " + str(metrics1.precision()))
    # print(metrics1.confusionMatrix())

    model.clearThreshold()
    scoreAndLabels = testData.map(lambda p: (float(model.predict(p.features)), p.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
def train_model(training_data, iterations, model_file_path, calculate_error=True):
    """
    Trains an SVM model and saves it
    :param training_data:
    :param iterations:
    :param model_file_path:
    :return:
    """
    parsed_data = sc.textFile(training_data).map(parse_point)
    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=iterations)
    # Save the model
    model.save(sc, model_file_path)
    print("Model saved in: ", model_file_path)
    if calculate_error:
        # predictions
        labelsAndPreds = parsed_data.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsed_data.count())
        print("============Training Error = " + str(trainErr))
def sendRecord(tup):
    if (not tup.isEmpty()):
        import numpy as np  # needed for element-wise vector accumulation below
        rdd_arr = tup.collect()
        el = rdd_arr[0][1].split('\n')
        tmp_file = get_tmpfile("/home/cloudera/Desktop/test_word2vec.txt")
        model = KeyedVectors.load_word2vec_format(tmp_file)
        vectores = []
        normal_v = []
        for i in el:
            value = i[:1]
            text = i[2:]
            text_arr = text.split(' ')
            # a NumPy array is assumed here: the original used a plain list,
            # which cannot be summed element-wise with the word vectors
            vector = np.zeros(50)
            total = 1
            for j in text_arr:
                try:
                    vector += model.get_vector(j)
                    total += 1
                except KeyError:  # out-of-vocabulary word
                    pass
            vector = vector / total
            label = LabeledPoint(int(value), vector)
            vectores.append(label)
            normal_v.append([int(value), vector])
        vectores.append(LabeledPoint(0.0, [0] * 50))
        model = SVMWithSGD.train(sc.parallelize(vectores), iterations=100)
        # model.save(sc, "/home/cloudera/pythonSVMWithSGDModel")
        pred = []
        match_0 = 0
        nmatch_0 = 0
        match_1 = 0
        nmatch_1 = 0
        final = []
        for i in normal_v:
            pre = model.predict(i[1])
            if (i[0] == 0):
                if (pre == i[0]):
                    match_0 += 1
                else:
                    nmatch_0 += 1
            else:
                if (pre == i[0]):
                    match_1 += 1
                else:
                    nmatch_1 += 1
            final.append([pre, i[0], i[1]])
        print('\n' * 22)
        # confusion counts: [[match_0, nmatch_0], [nmatch_1, match_1]]
        print([[match_0, nmatch_0], [nmatch_1, match_1]])
        print('\n' * 22)
        print(len(final))
        print('\n' * 22)
        f = open('/home/cloudera/vectores' + str(uuid.uuid4()) + '.txt', 'w')
        for i in final:
            for j in i:
                f.write('%s' % j)
            f.write('\n')
        f.close()
def create_model(config, class1, class2):
    # Load training data
    if len(class1) > 0 and len(class2) > 0:
        train_feature_path = config['root_directory'] + config['feature_directory'] + \
            config['train_one_feature_filename']
    else:
        train_feature_path = config['root_directory'] + config['feature_directory'] + \
            config['train_all_feature_filename']
    data = sc.textFile(train_feature_path)
    parsed_data = data.map(make_labeled_point)

    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=100)

    # Evaluate the model on training data
    labels_and_preds = parsed_data.map(lambda p: (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count())
    print("Training Error = " + str(train_err * 100) + "%")

    if len(class1) > 0 and len(class2) > 0:
        model_path = config['root_directory'] + config['one_vs_one_model_directory']
    else:
        model_path = config['root_directory'] + config['one_vs_all_model_directory']
    if os.path.exists(model_path):
        shutil.rmtree(model_path)
    # Save the model
    model.save(sc, model_path)
def predict_SVMWithSGD(numIterations, step, regParam, regType):
    """
    SVMWithSGD.train(data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0,
                     initialWeights=None, regType='l2', intercept=False, validateData=True,
                     convergenceTol=0.001)
    data: the training data, an RDD of LabeledPoint
    iterations: the number of iterations, default 100
    step: the step parameter used in SGD, default 1.0
    regParam: the regularizer parameter, default 0.01
    miniBatchFraction: fraction of data to be used for each SGD iteration, default 1.0
    initialWeights: the initial weights, default None
    regType: the type of regularizer used for training our model, allowed values
             ('l1': for using L1 regularization; 'l2': for using L2 regularization, default;
             None: for no regularization)
    intercept: boolean parameter which indicates the use or not of the augmented representation
               for training data (i.e. whether bias features are activated or not), default False
    validateData: boolean parameter which indicates if the algorithm should validate data
                  before training, default True
    convergenceTol: a condition which decides iteration termination, default 0.001
    """
    svmModel = SVMWithSGD.train(scaledData, iterations=numIterations, step=step,
                                regParam=regParam, regType=regType)
    svmMetrics = scaledData.map(lambda p: (svmModel.predict(p.features), p.label))
    svmAccuracy = svmMetrics.filter(lambda ap: ap[0] == ap[1]).count() * 1.0 / data.count()
    metrics = BinaryClassificationMetrics(svmMetrics)
    # print("SVMWithSGD model accuracy is: %f in %d iterations; step:%f; regParam:%f; regType:%s"
    #       % (svmAccuracy, numIterations, step, regParam, regType))
    return svmAccuracy
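# ---------------------------------------------------------------------------
# A hedged sketch of driving predict_SVMWithSGD over a small parameter grid.
# It assumes the global scaledData/data RDDs that the function body relies on
# are already defined; the grid values are illustrative only.
import itertools

for regType, regParam in itertools.product(['l1', 'l2'], [0.01, 0.1, 1.0]):
    acc = predict_SVMWithSGD(numIterations=100, step=1.0,
                             regParam=regParam, regType=regType)
    print("regType=%s regParam=%.2f accuracy=%.4f" % (regType, regParam, acc))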
def train(sc, file_positive, files_negative, file_output):
    """
    Trains a binary classification model using positive samples in file_positive and
    negative samples in files_negative. It writes the resulting model to file_output.
    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with positive tweets (relevant ones)
    :type file_positive: str
    :param files_negative: The files with negative tweets (non-relevant ones)
    :type files_negative: list[str]
    :param file_output: The output where to store the trained model
    :type file_output: str
    """
    positive_tweets = sc.textFile(file_positive).map(parse_json).filter(is_valid)
    negative_tweets = [sc.textFile(file_negative).map(parse_json).filter(is_valid)
                       for file_negative in files_negative]
    positive = positive_tweets.map(parse_positive)
    negatives = [nt.map(parse_negative) for nt in negative_tweets]
    data = positive
    for negative in negatives:
        data = data.union(negative)
    try:
        print("Training classification model")
        model = SVMWithSGD.train(data, iterations=150, step=1000.0, regType='l1', regParam=1e-7)
        print("Saving classification model to file")
        pickle.dump(model, open(file_output, 'wb'))
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
def do_1vsall(class_all, size, num_iter, config):
    features_path = config['protocol'] + config['bucket'] + config['sep'] + config['features_key']
    print('do_1vsall ==============> Setting RDD_ALL')
    rdd_all = sc.textFile(features_path, minPartitions=4).map(lambda line: line.split(',')).persist()
    print('do_1vsall ==============> Setting RDD_TRAIN_SET')
    rdd_train_set = rdd_all.filter(lambda features: int(features[1]) <= size) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)
    print('do_1vsall ==============> Setting RDD_TEST_SET')
    rdd_test_set = rdd_all.filter(lambda features: size < int(features[1])) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    # Build the model, loading a cached copy from S3 if one exists
    model_dir = class_all + '_' + str(size) + '_' + str(num_iter)
    model_s3_file = config['model_key'] + config['sep'] + model_dir
    model = None
    if s3_object_exists(config['bucket'], model_s3_file):
        print('do_1vsall ==============> Loading SVM Model: {}...'.format(model_s3_file))
        model = SVMModel.load(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)
    else:
        print('do_1vsall ==============> Building SVM Model')
        model = SVMWithSGD.train(rdd_train_set, iterations=num_iter)
        print('do_1vsall ==============> Saving SVM Model: {}...'.format(model_s3_file))
        model.save(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)

    # Evaluate the model on the test data
    print('do_1vsall ==============> Evaluating test set')
    labels_and_preds = rdd_test_set.map(lambda p: (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(rdd_test_set.count())
    # print("Test Error = " + str(train_err))
    success = round(((1 - train_err) * 100), 2)
    print('{},{}'.format(str(size), str(success)))
    return size, success
def model_for_class(cl, dat):
    def adjust_label(lp):
        return LabeledPoint(1 if lp.label == cl else 0, lp.features)
    model = SVMWithSGD.train(dat.map(adjust_label), iterations=10)
    # model.clearThreshold()
    return model
def run_iterations(parsedData, iter, seed):
    fp_rates = []
    tp_rates = []
    # thld_arr = []
    for i in range(0, 10):
        trainingData, testingData = parsedData.randomSplit([70, 30], seed)
        print("For " + str(iter) + " iterations:")
        # Build the model
        model = SVMWithSGD.train(trainingData, iterations=100)

        # Evaluating the model on training data
        labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainingData.count())
        MSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / \
            labelsAndPreds.count()
        print("Training Error = " + str(trainErr))
        print("MSE = " + str(MSE))

        # Evaluating the model on testing data
        labelsAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features)))
        testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testingData.count())
        MSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / \
            labelsAndPreds.count()
        print("Testing Error = " + str(testErr))
        print("MSE = " + str(MSE))

        info = labelsAndPreds.collect()
        actual = [int(i[0]) for i in info]
        predictions = [i[1] for i in info]
        false_positive_rate = labelsAndPreds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count() / \
            float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count())
        true_positive_rate = labelsAndPreds.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count() / \
            float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count())
        fpr, tpr, thresholds = roc_curve(actual, predictions)
        # roc_auc = auc(false_positive_rate, true_positive_rate)
        print(false_positive_rate)
        print(true_positive_rate)
        fp_rates.append(false_positive_rate)
        tp_rates.append(true_positive_rate)
    print(fp_rates)
    print(tp_rates)
    roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fp_rates, tp_rates, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig('fig.png')  # save before show() so the figure is not cleared first
    plt.show()
def trainSVMModel(data):
    """
    Train an SVM model and return it
    :param data: RDD[LabeledPoint]
    :return: svm classification model
    """
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    model = SVMWithSGD.train(data, iterations=100)
    return model
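# ---------------------------------------------------------------------------
# A hedged usage sketch for trainSVMModel: build the RDD[LabeledPoint] it
# expects from a LibSVM-format file. The file path and the `sc` SparkContext
# are assumptions.
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
model = trainSVMModel(data)
print(model.predict(data.first().features))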
def train_level(docs_with_classes, classification, number_of_terms):
    # each element of docs_with_classes has the shape (doc_id, (term_list, classifications))
    training_vectors = docs_with_classes.map(
        lambda doc: get_training_vector(classification, doc[1][0], doc[1][1], number_of_terms))
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS,
                           convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG)
    return training_vectors, svm
def process(reviews):
    if (reviews.isEmpty()):
        pass
    else:
        start = time.time()
        # get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
        # set the label for each class: 0.0 is positive, 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)
        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words",
                               outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)
        # stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)
        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)
        # build the labeled training dataset
        training = review_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
        # train the classifier model
        model = SVMWithSGD.train(training, iterations=100)
        model_name = "svm" + str(counter_model)
        # save the classifier model to HDFS
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        counter_model.add(1)
        end = time.time()
        print("Model Name : ", model_name, ", Total Reviews : ", reviews.count(),
              "Processing Time : ", (end - start))
def trainEvaluateModel(trainData, validationData, numIterations, stepSize, regParam):
    startTime = time()
    model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("train and evaluate: numIterations->", numIterations, ", stepSize->", stepSize,
          ", regParam->", regParam)
    print("==> time taken:", duration, "s, AUC=", AUC)
    return (AUC, duration, numIterations, stepSize, regParam, model)
def model_per_class(i, labelled_training_data):
    one_against_rest_data = labelled_training_data.map(lambda x: change_label(i, x))
    ones = one_against_rest_data.filter(lambda x: x.label == 1)
    zeros = one_against_rest_data.filter(lambda x: x.label == 0)
    # downsample the negative class to the size of the positive class
    lis = random.sample(range(zeros.count()), ones.count())
    zeros = zeros.zipWithIndex().filter(lambda x: x[1] in lis).map(lambda x: x[0])
    one_against_rest_data = ones.union(zeros)
    model = SVMWithSGD.train(one_against_rest_data, iterations=10000)
    model.clearThreshold()
    return model
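# ---------------------------------------------------------------------------
# model_per_class returns one threshold-free scorer per label (clearThreshold()
# makes predict() return the raw margin). A hedged sketch of combining such
# one-vs-rest models into a multiclass predictor by taking the arg-max margin;
# the label set is illustrative and labelled_training_data is assumed in scope.
labels = [0, 1, 2]
models = {c: model_per_class(c, labelled_training_data) for c in labels}

def predict_multiclass(features):
    # margins are comparable across the models because the threshold is cleared
    return max(labels, key=lambda c: models[c].predict(features))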
def svm(trainingData, testData, trainingSize, testSize):
    '''
    linear svm classifier
    '''
    # candidate parameter values for the grid search
    numIterValList = [100, 200]
    regParamValList = [0.01, 0.1, 1, 10, 100]
    stepSizeValList = [0.1, 0.5, 1]
    regTypeValList = ['l2', 'l1']

    # variables for the best parameters
    bestNumIterVal = 200
    bestRegParamVal = 0.01
    bestStepSizeVal = 1
    bestRegTypeVal = 'l2'
    bestTrainErr = 100

    for numIterVal, regParamVal, stepSizeVal, regTypeVal in itertools.product(
            numIterValList, regParamValList, stepSizeValList, regTypeValList):
        break  # the grid search is short-circuited here, so the preset best values above are used
        model = SVMWithSGD.train(trainingData, iterations=numIterVal, regParam=regParamVal,
                                 step=stepSizeVal, regType=regTypeVal)
        labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainingSize)
        if trainErr < bestTrainErr:
            bestNumIterVal = numIterVal
            bestRegParamVal = regParamVal
            bestStepSizeVal = stepSizeVal
            bestRegTypeVal = regTypeVal
            bestTrainErr = trainErr
        print(numIterVal, regParamVal, stepSizeVal, regTypeVal, trainErr)
    print(bestNumIterVal, bestRegParamVal, bestStepSizeVal, bestRegTypeVal, bestTrainErr)

    model = SVMWithSGD.train(trainingData, iterations=bestNumIterVal, regParam=bestRegParamVal,
                             step=bestStepSizeVal, regType=bestRegTypeVal)

    # Evaluating the model on training data
    labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainingSize)
    print(trainErr)

    # Evaluating the model on test data
    labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testSize)
    print(testErr)
def main():
    # prepare training data
    # RDDTrainData = sc.textFile('2007_100.csv')
    RDDTrainData = sc.textFile(','.join([
        # '1987.csv',
        # '1988.csv',
        # '1989.csv',
        # '1990.csv',
        # '1991.csv',
        # '1992.csv',
        # '1993.csv',
        # '1994.csv',
        # '1995.csv',
        # '1996.csv',
        # '1997.csv',
        # '1998.csv',
        # '1999.csv',
        # '2000.csv',
        # '2001.csv',
        # '2002.csv',
        # '2003.csv',
        # '2004.csv',
        # '2005.csv',
        # '2006.csv',
        '2007.csv',
    ]))
    RDDTrainHeader = RDDTrainData.take(1)[0]
    trainData = RDDTrainData.filter(lambda line: line != RDDTrainHeader)\
        .map(split)\
        .map(parseTrain)

    # prepare testing data
    RDDTestData = sc.textFile('2008.csv')
    RDDTestHeader = RDDTestData.take(1)[0]
    testData = RDDTestData.filter(lambda line: line != RDDTestHeader)\
        .map(split)\
        .map(parseTest)

    # do prediction
    # SVM
    model = SVMWithSGD.train(trainData, iterations=100)
    # Logistic Regression
    # model = LogisticRegressionWithLBFGS.train(trainData)
    predictionData = testData.map(lambda d: (int(d.label), model.predict(d.features)))

    # evaluate error rate
    errorCount = predictionData.filter(lambda d: int(d[0]) != int(d[1])).count()
    totalCount = predictionData.count()
    print('error rate =', errorCount, '/', totalCount, '=',
          float(errorCount) / float(totalCount))
def train_evaluate_model(train_data, valid_data, iterations, step, regParam):
    start_time = time()
    # train
    model = SVMWithSGD.train(train_data, iterations=iterations, step=step, regParam=regParam)
    # evaluate: y_pred against y_true
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"train/evaluate: with parameters step={step}, iterations={iterations}, "
          f"regParam={regParam} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, iterations, step, regParam, model
def svm_train(sc, top_path, stopwords_dict=None):
    # hook for a custom stopword dictionary: to use a new dictionary, place it in this directory
    curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())

    # tokenize every text in the two class folders, remove stopwords, and count term
    # frequencies: {'pos': [counter, ...], 'neg': [counter, ...]}
    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need and only need two folders")
    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name), stopwords)

    # pick the terms that best discriminate the two classes as the feature set
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], topk)
    # compute the IDF over the two classes (note: the original passed sub_folder[1]
    # twice here, which looks like a typo; both classes should contribute)
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], features)
    # vector representation of every text of each class under this binary split
    vector1 = {'1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features), IDF)}
    vector0 = {'0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features), IDF)}
    # convert to the input format Spark needs: [LabeledPoint(0.0, []), ...]
    labpoint1 = [LabeledPoint(1.0, vec) for vec in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, vec) for vec in vector0['0.0']]
    train_data = labpoint1 + labpoint0
    classifier = SVMWithSGD.train(sc.parallelize(train_data))
    path = os.path.join(curpath, 'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path):
        os.remove(path)
    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
def SVM_train(data):
    data_train = split_data(data)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_SVM = SVMWithSGD.train(training, 10)
    predictionAndlabel = test.map(lambda x: (float(model_SVM.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(lambda xv: xv[0] == xv[1]).count() / test.count()
    print("accuracy of model_SVM:%f" % accuracy)
    return model_SVM, accuracy
def gen_predictors(training_data):
    classifiers = dict()
    for item in label_map.items():
        print("Gen predictor for label '{0}' ...".format(item[0]))
        global processed_label
        processed_label = item[1]
        svm = SVMWithSGD.train(training_data.map(transform_label))
        classifiers[item[1]] = svm
    return classifiers
def main_spark(sc, trainData, testData, outputFilename):
    # Load and process data
    dataProcessing = sc.textFile(trainData) \
        .map(parseTrainData)
    # Load test data
    testDataLoad = sc.textFile(testData) \
        .map(parseTestData)
    # Build svm model
    # model = SVMWithSGD.train(dataProcessing, iterations=100, step=1.0, regParam=0.01)
    # note: miniBatchFraction is the fraction of data sampled per SGD iteration and
    # must lie in (0, 1]; the original value of 20.0 looks unintended
    model = SVMWithSGD.train(dataProcessing, iterations=50, step=1.0, regParam=0.01,
                             miniBatchFraction=1.0)
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def mySVM(training, test):
    # SVM
    training_svc = training.map(lambda x: LabeledPoint(x[29], x[1:28]))
    sv = SVMWithSGD.train(training_svc, iterations=100, step=0.1, regParam=0.01)
    test_svc = test.map(lambda x: LabeledPoint(x[29], x[1:28]))
    # test_svc rows are LabeledPoints, so the target is exposed as .label
    predictions = test_svc.map(lambda x: (x.label, float(sv.predict(x.features))))
    return predictions
def SVM_module(training):
    """This function returns a SVM model from your training data.

    :param training: (REQUIRED) - the training data
    :return: SVM model

    Use it as (be sure to call split_data() to get the training data):
    >>> model = SVM_module(trainingData)
    """
    # Train a SVM model
    return SVMWithSGD.train(training, iterations=300)
def main(sc):
    train_data = '/usr/local/spark/data/mllib/sample_svm_data.txt'
    data = sc.textFile(train_data).map(parse)
    # load a cached model if one exists, otherwise train and save one
    if os.path.exists('model'):
        model = SVMModel.load(sc, 'model')
    else:
        model = SVMWithSGD.train(data, iterations=100)
        model.save(sc, 'model')
    labelsAndPreds = data.map(lambda p: (p.label, model.predict(p.features)))
    # trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(data.count())
    # print('Training Error = ' + str(trainErr))
    labelsAndPreds.map(lambda x: str(x[0]) + '\t' + str(x[1])).saveAsTextFile('labelsAndPreds')
def training(path):
    # import dataset into an RDD
    raw_data = sc.textFile(path)
    # parse raw data into (label, bag-of-words) pairs
    parsed_data = raw_data.map(lambda line: parse_line(line))
    # separate into training set and test set
    training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
    # get features for model training
    features = feature_extraction(training_set)
    labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
    labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
    # train a logistic regression model
    lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
    # train a naive bayes model
    nbModel = NaiveBayes.train(labeled_points_training)
    # train an SVM model
    svmModel = SVMWithSGD.train(labeled_points_training)
    return lrModel, nbModel, svmModel, labeled_points_test, features
def train_trend_model(self, model, data, i):
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)
    if self.trend_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={},
                                             numTrees=40, featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=20, maxBins=32)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        model = NaiveBayes.train(rdd_data)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                initialWeights=None if model is None else model.weights)
    elif self.trend_prediction_method == self.SVM:
        model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                 initialWeights=None if model is None else model.weights)
    return model
def main(sc):
    inputFile = sys.argv[1]
    modelPath = sys.argv[2]
    data = sc.textFile(inputFile)
    parsedData = data.map(parsePoint)
    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)
    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
    # Save and load model
    model.save(sc, modelPath)
    # sameModel = SVMModel.load(sc, "svm_model")
    sc.stop()
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
def svmTest(sqlContext, dataset_rdd, positive_negotive_rate):
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negotive = dataset_rdd.filter(lambda e: e[1] < 0.5)
    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negotive = dataset_negotive.sample(False, 0.8)
    test_negotive = dataset_negotive.subtract(train_negotive)
    trainset_rdd = train_positive.union(train_negotive)
    testset_rdd = test_positive.union(test_negotive)
    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()
    model = SVMWithSGD.train(trainset, iterations=100)
    predict = testset.map(lambda p: (p.label, model.predict(p.features)))
    hitALL = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()
    recallPositive = hitPositive / float(testset_positive)
    precision = hitPositive / float(positive)
    accuracy = hitALL / float(testset.count())
    F_Value = 2 / (1 / precision + 1 / recallPositive)
    return (trainset_nums, testset_nums, trainset_positive, testset_positive, positive,
            hitPositive, precision, recallPositive, accuracy, F_Value, model)

def processData(sqlContext):
    dataset_label_gender = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/labels_gender*')
    imeis_ads = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_ads*')
    imeis_aboutTimes = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_about*')
    imeis_apps = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_apps*')
    imeis_prvs = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_prvs*')
def train(self, num_iterations=10):
    # TODO support all the keyword training params
    model = SVMWithSGD.train(self._labeled_feature_vector_rdd(), num_iterations)
    return SVMModel(model, self.feature_cols)
NB_percent = []
SVM_percent = []  # used below; missing from the original snippet
LRSGD_percent = []
LRLBFGS_percent = []
for i in topFeatures:
    parsedData = sortedData.map(lambda line: (line, i)).map(labelData)
    splits = parsedData.randomSplit((0.9, 0.1))
    train_set = splits[0]
    train_set.cache()
    test_set = splits[1]
    test_set.cache()
    # NBmodel = NaiveBayes.train(train_set)
    # NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label))
    #                              .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
    # findCoveragePercent(NB_socredLabel, 0.4)
    SVMSGDmodel = SVMWithSGD.train(train_set)
    SVMSGDmodel.clearThreshold()
    SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label))
                                  .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
    SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
    SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
    SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))
    LRSGDmodel = LogisticRegressionWithSGD.train(train_set)
    LRSGDmodel.clearThreshold()
    LRSGD_scoedLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label))
                                   .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
    LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4))
    LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8))
    LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0))
    LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
    LRLBFGSmodel.clearThreshold()
    LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label))
                                      .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
import sys

def toLabeledPoints(sc, data):
    return sc.parallelize(data).map(lambda x: LabeledPoint(x[0], x[1]))

def loadData(path):
    data_file = open(path, "rb")  # pickle requires binary mode
    return pickle.load(data_file)

def computeError(m, d):
    labelsAndPreds = d.map(lambda p: (p.label, m.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(d.count())
    return trainErr

if __name__ == "__main__":
    conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    data = toLabeledPoints(sc, loadData(sys.argv[1]))
    testData = toLabeledPoints(sc, loadData(sys.argv[2]))

    # Train the model with different regularization parameters
    results = []
    for i in [0.00, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]:
        model = SVMWithSGD.train(data, step=0.05, regParam=i)
        results.append((i, computeError(model, testData)))

    outfile = open("results.txt", "w")
    outfile.write(str(results))
    outfile.close()
Err = 0.0
results = []
for train_index, test_index in ss:
    X_training, Y_training, X_test, Y_test = [], [], [], []
    for i in train_index:
        X_training.append(X[i])
        Y_training.append(Y[i])
    for i in test_index:
        X_test.append(X[i])
        Y_test.append(Y[i])
    parsedData = []
    for i in range(0, len(X_training)):
        parsedData.append(LabeledPoint(Y_training[i], X_training[i]))
    model = SVMWithSGD.train(sc.parallelize(parsedData))
    testErr = 0
    for i in range(0, len(X_test)):
        a = Y_test[i]
        b = model.predict(X_test[i])
        if a != b:
            testErr += 1
    Err += float(testErr) / float(len(X_test))
print("AVG test error: %.6f" % (Err / iter_number))
    values = [float(x) for x in clean_line_split]
    # invert the label: 0 becomes 1 and anything else becomes 0
    if values[7] == 0:
        values[7] = 1
    else:
        values[7] = 0
    return LabeledPoint(values[7], values[0:7])

# dep_delay, cancelled, diverted, carrierdelay, weatherdelay, NASdelay, securitydelay, lateaircraftdelay
# examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print('Training Time consumed =', datetime.now() - startTime)

startTestTime = datetime.now()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Testing Time consumed =', datetime.now() - startTestTime)
print('Time consumed =', datetime.now() - startTime)
print("Test Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMwide00-08train")
sameModel = SVMModel.load(sc, "SVMwide00-08train")
train_dict = [i.asDict() for i in feats_train]
feats_test = test.collect()
test_dict = [i.asDict() for i in feats_test]

def parsePoint(d):
    d_copy = deepcopy(d)  # I hate using deepcopy so much
    pred = d_copy['success_class']
    d.pop('success_class', None)
    values = [float(x) for x in d.values()]
    return LabeledPoint(pred, values)

trainParsed = sc.parallelize([parsePoint(d) for d in train_dict])
testParsed = sc.parallelize([parsePoint(d) for d in test_dict])

model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainParsed.count())
print(trainErr)

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testParsed.count())
print(testErr)

metrics = BinaryClassificationMetrics(testLabelsAndPreds)
print(metrics.areaUnderROC)
print(metrics.areaUnderPR)
# the number of features is the number of columns of the matrix;
# we need this information to convert the coordinate data to vectors and labeled points
cols = sc.broadcast(len(m.get_feature_names()))
print("number of features " + str(cols.value))

# convert to labeled points in parallel
tmpLB = tmp.map(partial(toLB, cols=cols, class_v=bY))

print("splitting the data")
train, test = tmpLB.randomSplit([0.6, 0.4], seed=0)

print("training the machine learning algorithm")
# Change ---------------------------------
# model = NaiveBayes.train(train, 1.0)
### Change DONE
# note: the second positional argument of SVMWithSGD.train is `iterations`, so the
# NaiveBayes smoothing value 1.0 left over from the swap above has been dropped here
model = SVMWithSGD.train(train)
### Change XCA
# TODO We are testing several MLs
# 1) LogisticRegression
# model = LogisticRegressionWithSGD.train(train)  # used for logistic regression classification
# 2) SVM Classification
# model = SVMWithSGD.train(train)  # used for SVM classification
# 3) RandomForest
# ************ The random forest model in pyspark is experimental, so it may not work perfectly
# model = RandomForest.trainClassifier(train, 2, {}, 300, seed=2)  # 300 trees is the best setting per the literature for this dataset

print("retrieving predictions and evaluating")
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda xv: xv[0] == xv[1]).count() / test.count()
# Build the model
logitmodel = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, logitmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
## 0.353992330848

############################ SVM ##############################
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Build the model
SVMmodel = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, SVMmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
## 0.555395278766

############################ Decision Tree ##############################
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

def parsePoint(line):
    values = [float(x) for x in line.split(',')]
rows_num = float(sorted_labelled.count())
precisions = []
recalls = []
recallNum = []
sum = 0.0
model_start = time.time()
for label in random_labels:
    parsedData = sorted_labelled.map(lambda line: (line, label)).map(labelData)
    splits = parsedData.randomSplit((0.9, 0.1))
    train_set = splits[0]
    test_set = splits[1]
    test_set.cache()
    model = SVMWithSGD.train(train_set)
    # model = LogisticRegressionWithSGD.train(train_set)
    # model = LogisticRegressionWithLBFGS.train(train_set)
    # model = DecisionTree.trainClassifier(train_set, numClasses=2, categoricalFeaturesInfo={},
    #                                      impurity='gini', maxDepth=5, maxBins=32)
    # model = RandomForest.trainClassifier(train_set, numClasses=2, categoricalFeaturesInfo={},
    #                                      numTrees=5, featureSubsetStrategy="auto",
    #                                      impurity='gini', maxDepth=3, maxBins=32)
    # labelsAndPreds = test_set.map(lambda p: (p.label, model.predict(p.features)))
    # testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test_set.count())
    predictions = model.predict(test_set.map(lambda x: x.features))
    labelsAndPredictions = test_set.map(lambda lp: lp.label).zip(predictions)
    labelsAndPredictions.cache()
    precision = labelsAndPredictions.filter(lambda vp: vp[0] == vp[1]).count() / float(test_set.count())
    if labelsAndPredictions.filter(lambda vp: vp[0] == 1).count() != 0:
        recall = labelsAndPredictions.filter(lambda vp: vp[0] == vp[1] and vp[0] == 1).count() / \
            float(labelsAndPredictions.filter(lambda vp: vp[0] == 1).count())
        recallNum.append(labelsAndPredictions.filter(lambda vp: vp[0] == 1).count())
    else:
        recall = 1.0
def get_labeled_point(line):  # signature reconstructed from the map() call below
    items = line.strip().split()
    y = items[0]
    x = items[1:]
    # return LabeledPoint(y, x)
    ## this explicitly maps each example to a higher dimensional space,
    ## namely the space of a degree 2 polynomial kernel
    poly = npp.Polynomial([float(_) for _ in x])
    return LabeledPoint(y, (poly * poly).coef)

## load data and prep for SVM
data = sc.textFile("all_hands.txt")
examples = sc.parallelize(data.map(get_labeled_point).collect())

results = {}
## train SVMs with different regularization parameters
for exponent in range(5, 11, 2):
    model = SVMWithSGD.train(examples, iterations=50, regParam=2 ** exponent,
                             miniBatchFraction=1, step=1)
    ## compute training error for that regParam
    incorrect_predictions = examples.map(lambda p: p.label != model.predict(p.features))
    training_error = incorrect_predictions.filter(lambda p: p).count() / float(examples.count())
    print("Training Error: %s" % training_error)
    results[2 ** exponent] = training_error
print(results)
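# ---------------------------------------------------------------------------
# A hedged sketch of what the explicit degree-2 feature map above computes:
# squaring the polynomial whose coefficients are the features is the discrete
# self-convolution of the feature vector, i.e. entry k holds the sum over
# i + j = k of x_i * x_j. numpy.convolve reproduces (poly * poly).coef
# directly; the sample vector is illustrative.
import numpy as np

x = np.array([1.0, 2.0, 3.0])   # features, viewed as polynomial coefficients
expanded = np.convolve(x, x)    # equals (poly * poly).coef
print(expanded)                 # [ 1.  4. 10. 12.  9.]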