from pyspark.mllib.classification import LogisticRegressionWithSGD

algo = LogisticRegressionWithSGD()
model = algo.train(training_data)
score(model)

spamExample = tf.transform("You have won $1,000,000. Please fly to Nigeria ASAP. This is urgent".split(" "))
hamExample = tf.transform("Spark is really good at big data processing".split(" "))
print(model.predict(spamExample))
print(model.predict(hamExample))

from pyspark.mllib.classification import LogisticRegressionWithLBFGS

algo = LogisticRegressionWithLBFGS()
model = algo.train(training_data)
score(model)

#### Support Vector Machines
#### What about SVMs, another popular algorithm?

from pyspark.mllib.classification import SVMWithSGD

algo = SVMWithSGD()
model = algo.train(training_data)
score(model)

##### Trees
##### Now let's try three variants of tree-based classification.
##### The API is slightly different from previous algos.
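## A minimal sketch of the three tree-based variants announced above, assuming
## the same `training_data` RDD of binary LabeledPoints. Unlike the algorithms
## above, these are trained via the classmethod trainClassifier, which takes
## the data plus algorithm parameters directly.

from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

dt_model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={})
rf_model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=16)
gbt_model = GradientBoostedTrees.trainClassifier(training_data, categoricalFeaturesInfo={})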
"""数据集处理操作,包括读图片,numpy.reshape,hog计算等等 并最终转换成rdd""" lrdd = df.rdd.map(lambda row: LabeledPoint(row[-1], \ # 读一行row Vectors.dense( \ # 将row中的图片变成Vector hog(np.array(row[0].data \ # numpy 转换图片为64*64,row[0].data就是图片的值 ).reshape(64,64),cells_per_block=(2, 2))))) # hog计算的参数 return lrdd df_train, df_test = load_df() print("load hdfs data successful",df_train.count(), df_test.count()) ## 将数据都转换成处理特征后的rdd rdd_train = df2labeledrdd(df_train) rdd_test = df2labeledrdd(df_test) print("load hdfs data successful") print("model train start at:", time.strftime('%Y-%m-%d %H:%M:%S')) model = LogisticRegressionWithLBFGS().train(rdd_train, iterations=100, numClasses=15) print("model train successful at:", time.strftime('%Y-%m-%d %H:%M:%S')) ## 保存模型 import os, tempfile path = tempfile.mkdtemp() model.save(sc, path) print("Model saved at: ",path) ## 计算准确率 scoreAndLabels = rdd_test.map(lambda point:(model.predict(point.features),point.label)) accuracy = scoreAndLabels.filter(lambda l: l[0]==l[1]).count() / rdd_test.count() print("accuracy: ",accuracy)
fake_samples = fake_features.map(lambda features: LabeledPoint(1, features))
real_samples = real_features.map(lambda features: LabeledPoint(0, features))
print(fake_samples.take(1))
print(real_samples.take(1))

samples = fake_samples.union(real_samples)
[training_data, test_data] = samples.randomSplit([0.8, 0.2])
training_data.cache()
test_data.cache()

algorithm = LogisticRegressionWithSGD()
model = algorithm.train(training_data)
print('logistic regression sgd:', score(model))

algorithm = LogisticRegressionWithLBFGS()
model = algorithm.train(training_data)
print('logistic regression with lbfgs:', score(model))

# algorithm = DecisionTree()
# model = algorithm.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={})
# print('decision tree: ', score(model))
#
# algorithm = RandomForest()
# model = algorithm.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=16)
# print('random forest: ', score(model))

algorithm = NaiveBayes()
model = algorithm.train(training_data)
print('naive bayes: ', score(model))
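## The tree blocks above are left commented out: in the Python API, tree models
## cannot call predict() inside an RDD closure, which a score() helper built on
## map(model.predict) would do. A minimal sketch of running them with a
## zip-based evaluation instead, assuming the same training_data/test_data:

from pyspark.mllib.tree import DecisionTree, RandomForest

dt_model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={})
rf_model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=16)

for name, tree_model in [('decision tree', dt_model), ('random forest', rf_model)]:
    # Predict on an RDD of feature vectors from the driver, then zip with the labels
    predictions = tree_model.predict(test_data.map(lambda p: p.features))
    labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
    acc = labels_and_preds.filter(lambda lp: lp[0] == lp[1]).count() / float(test_data.count())
    print(name + ':', acc)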
def labeled_point_nb(record):
    # Assumed record layout (not shown above): label in the last field,
    # raw feature values in the rest.
    label = float(record[-1])
    features = list(record[:-1])
    for i in range(0, len(features)):
        features[i] = float(features[i])
        if features[i] < 0.0:
            features[i] = 0.0    # Naive Bayes requires non-negative features
    return LabeledPoint(label, Vectors.dense(features))

nb_data = records.map(lambda r: labeled_point_nb(r))
print("the first record of nb_data:")
print(nb_data.first())

# start training the models
num_iterations = 10
max_tree_depth = 5

lr_model = LogisticRegressionWithLBFGS().train(data, num_iterations)
print("logistic regression model:")
print(lr_model)

svm_model = SVMWithSGD().train(data, num_iterations)
print("svm model:")
print(svm_model)

nb_model = NaiveBayes().train(nb_data)
print("naive bayes model:")
print(nb_model)

dt_model = DecisionTree().trainClassifier(data, 2, {})
print("decision tree model:")
print(dt_model)
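## max_tree_depth is defined above but never passed to the decision tree; a
## sketch of wiring it in via keyword arguments (names match the
## pyspark.mllib.tree.DecisionTree.trainClassifier signature):

dt_model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                        maxDepth=max_tree_depth)
print(dt_model.toDebugString())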