# Train and evaluate a multiclass logistic regression model on text data.
#
# Every line of the train/test files is parsed by GetParts; the parsed
# records must expose `.features` and `.label`, because the evaluation step
# below reads both attributes.
rdd_train = sc.textFile(TRAINPATH).map(GetParts)
rdd_test = sc.textFile(TESTPATH).map(GetParts)
print("load hdfs data successful")

# NOTE(review): the original left `rdd_train.cache()` / `rdd_test.cache()`
# commented out, so caching stays disabled here. Training runs up to 100
# iterations over rdd_train, so enabling it may help if memory allows.

# Train the multiclass logistic regression classifier.
# `train` is a classmethod, so no instance is needed.
print("model train start at:", time.strftime('%Y-%m-%d %H:%M:%S'))
model = LogisticRegressionWithLBFGS.train(
    rdd_train, iterations=100, numClasses=15)
print("model train successful at:", time.strftime('%Y-%m-%d %H:%M:%S'))

# Save the model into a fresh temporary directory.
import os
import tempfile
path = tempfile.mkdtemp()
model.save(sc, path)
print("Model saved at: ", path)

# Compute accuracy: the fraction of test points whose predicted class
# equals the true label.
scoreAndLabels = rdd_test.map(
    lambda point: (model.predict(point.features), point.label))
num_correct = scoreAndLabels.filter(lambda pair: pair[0] == pair[1]).count()
num_test = rdd_test.count()
# float() forces true division even under Python 2, where int / int would
# truncate the accuracy to 0. An empty test set yields NaN instead of a
# ZeroDivisionError.
accuracy = float(num_correct) / num_test if num_test else float("nan")
print("accuracy: ", accuracy)
# NOTE(review): the `def` line was missing from this excerpt. The name and
# the single `df` parameter are reconstructed from the call sites below
# (`df2labeledrdd(df_train)`) and from the body's use of `df` -- confirm
# against the original source.
def df2labeledrdd(df):
    """Convert a DataFrame of images into an RDD of LabeledPoint.

    Dataset preprocessing: take the image out of each row, reshape it with
    numpy, compute its HOG descriptor, and finally convert the result to
    an RDD.

    Args:
        df: DataFrame whose rows carry the image in column 0 (its values
            are read from ``row[0].data``) and the label in the last
            column.

    Returns:
        An RDD of ``LabeledPoint(label, dense_hog_vector)``.
    """
    lrdd = df.rdd.map(
        lambda row: LabeledPoint(
            row[-1],  # the last column is passed as the label
            # Turn the image held in the row into a dense Vector.
            Vectors.dense(
                hog(
                    # row[0].data holds the image values; reshape to 64x64.
                    np.array(row[0].data).reshape(64, 64),
                    cells_per_block=(2, 2),  # HOG parameter
                )
            ),
        )
    )
    return lrdd


df_train, df_test = load_df()
print("load hdfs data successful", df_train.count(), df_test.count())

# Convert both datasets into RDDs of extracted features.
rdd_train = df2labeledrdd(df_train)
rdd_test = df2labeledrdd(df_test)
print("load hdfs data successful")

# Train the multiclass logistic regression classifier.
# `train` is a classmethod, so no instance is needed.
print("model train start at:", time.strftime('%Y-%m-%d %H:%M:%S'))
model = LogisticRegressionWithLBFGS.train(
    rdd_train, iterations=100, numClasses=15)
print("model train successful at:", time.strftime('%Y-%m-%d %H:%M:%S'))

# Save the model into a fresh temporary directory.
import os
import tempfile
path = tempfile.mkdtemp()
model.save(sc, path)
print("Model saved at: ", path)

# Compute accuracy: the fraction of test points whose predicted class
# equals the true label.
scoreAndLabels = rdd_test.map(
    lambda point: (model.predict(point.features), point.label))
num_correct = scoreAndLabels.filter(lambda pair: pair[0] == pair[1]).count()
num_test = rdd_test.count()
# float() forces true division even under Python 2, where int / int would
# truncate the accuracy to 0. An empty test set yields NaN instead of a
# ZeroDivisionError.
accuracy = float(num_correct) / num_test if num_test else float("nan")
print("accuracy: ", accuracy)
def _show(title, value):
    """Print *title* on one line and *value* on the next."""
    print(title)
    print(value)


# Fit the remaining classifiers, printing each model as soon as it is
# trained.
svm_model = SVMWithSGD().train(data, num_iterations)
_show("svm model :", svm_model)

nb_model = NaiveBayes().train(nb_data)
_show("naive bayes model :", nb_model)

dt_model = DecisionTree().trainClassifier(data, 2, {})
_show("decision tree model :", dt_model)

# Start predicting: score the first record with the logistic model and
# print the prediction next to the true label.
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
prediction_text = str(lr_prediction)
print("logistic model prediction :" + prediction_text)
label_text = str(data_point.label)
print("the true label :" + label_text)

# Analyze the data: per-column summary statistics of the feature vectors.
vectors = data.map(lambda labeled: labeled.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
# Each statistic is computed only after its title is printed, keeping the
# title-then-value order for every pair.
for stat_title, stat_getter in (
    ("the col mean of matrix :", matrix_summary.mean),
    ("the col min of matrix :", matrix_summary.min),
    ("the col max of matrix :", matrix_summary.max),
    ("the col variance of matrix :", matrix_summary.variance),
):
    print(stat_title)
    print(stat_getter())