# Build the train/test RDDs: every text line is parsed by GetParts into a
# record exposing `.features` and `.label` (both are read further down).
rdd_train = sc.textFile(TRAINPATH).map(GetParts)
rdd_test = sc.textFile(TESTPATH).map(GetParts)

# NOTE: textFile/map are lazy transformations, so this message only means the
# RDDs were defined, not that the files were actually read.
print("load hdfs data successful")
# Caching is currently disabled (left commented out):
# ~ rdd_train.cache()
# ~ rdd_test.cache()

# Train the multiclass logistic regression classifier.
print("model train start at:", time.strftime('%Y-%m-%d %H:%M:%S'))
# `train` is a classmethod, so no throwaway instance is needed.
model = LogisticRegressionWithLBFGS.train(rdd_train,
                                          iterations=100,
                                          numClasses=15)
print("model train successful at:", time.strftime('%Y-%m-%d %H:%M:%S'))

# Save the model into a freshly created temporary directory.
import os
import tempfile
path = tempfile.mkdtemp()
model.save(sc, path)
print("Model saved at: ", path)

# Compute accuracy: the fraction of test records whose predicted label equals
# the true label.
scoreAndLabels = rdd_test.map(
    lambda point: (model.predict(point.features), point.label))
correct = scoreAndLabels.filter(lambda pair: pair[0] == pair[1]).count()
test_total = rdd_test.count()
# float() keeps this a true division even under Python 2; an empty test set
# reports 0.0 instead of raising ZeroDivisionError.
accuracy = float(correct) / test_total if test_total else 0.0
print("accuracy: ", accuracy)
    """数据集处理操作,包括读图片,numpy.reshape,hog计算等等 并最终转换成rdd"""
    lrdd = df.rdd.map(lambda row: LabeledPoint(row[-1], \    # 读一行row
                    Vectors.dense( \ # 将row中的图片变成Vector
                    hog(np.array(row[0].data \ # numpy 转换图片为64*64,row[0].data就是图片的值
                    ).reshape(64,64),cells_per_block=(2, 2))))) # hog计算的参数
    return lrdd

df_train, df_test = load_df()
print("load hdfs data successful",df_train.count(), df_test.count())

# Convert both DataFrames into feature-processed RDDs; the resulting records
# expose `.features` and `.label` (both are read further down).
rdd_train = df2labeledrdd(df_train)
rdd_test = df2labeledrdd(df_test)

print("load hdfs data successful")

print("model train start at:", time.strftime('%Y-%m-%d %H:%M:%S'))
# `train` is a classmethod, so no throwaway instance is needed.
model = LogisticRegressionWithLBFGS.train(rdd_train, iterations=100, numClasses=15)
print("model train successful at:", time.strftime('%Y-%m-%d %H:%M:%S'))

# Save the model into a freshly created temporary directory.
import os
import tempfile
path = tempfile.mkdtemp()
model.save(sc, path)
print("Model saved at: ",path)

# Compute accuracy: the fraction of test records whose predicted label equals
# the true label.
scoreAndLabels = rdd_test.map(
    lambda point: (model.predict(point.features), point.label))
correct = scoreAndLabels.filter(lambda pair: pair[0] == pair[1]).count()
test_total = rdd_test.count()
# float() keeps this a true division even under Python 2; an empty test set
# reports 0.0 instead of raising ZeroDivisionError.
accuracy = float(correct) / test_total if test_total else 0.0
print("accuracy: ",accuracy)
# Example #3
0
# Fit the remaining classifiers one after another, printing each fitted model
# right after it has been trained.
svm_model = SVMWithSGD.train(data, num_iterations)
print("svm model :")
print(svm_model)

nb_model = NaiveBayes.train(nb_data)
print("naive bayes model :")
print(nb_model)

# Two classes, no categorical features declared (empty mapping).
dt_model = DecisionTree.trainClassifier(data, 2, {})
print("decision tree model :")
print(dt_model)

# Sanity-check the logistic model on the first record of the data set and
# show the prediction next to the record's true label.
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
print("the true label :" + str(data_point.label))

# Column-wise summary statistics over the feature vectors.
vectors = data.map(lambda lp: lp.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
summary_reports = (
    ("the col mean of matrix :", matrix_summary.mean),
    ("the col min of matrix :", matrix_summary.min),
    ("the col max of matrix :", matrix_summary.max),
    ("the col variance of matrix :", matrix_summary.variance),
)
for heading, statistic in summary_reports:
    print(heading)
    print(statistic())