コード例 #1
0
# Train and evaluate three linear classifiers from pyspark.mllib on the same
# `training_data` (defined outside this excerpt). Each section rebinds the
# module-level names `algo` and `model`, so statement order matters.
from pyspark.mllib.classification import LogisticRegressionWithSGD

#### Logistic regression trained with SGD
algo = LogisticRegressionWithSGD()
model = algo.train(training_data)
# `score` is defined elsewhere; presumably it reports the model's quality on
# held-out data -- confirm against its definition. Its result is not stored.
score(model)

# Build feature vectors for two hand-written messages. `tf` is defined
# elsewhere (presumably the same term-frequency transformer used to build
# `training_data` -- TODO confirm); each sentence is split on single spaces
# into a word list before being transformed.
spamExample = tf.transform("You have won $1,000,000. Please fly to Nigeria ASAP. This is urgent".split(" "))
hamExample = tf.transform("Spark is really good at big data processing".split(" "))

# Print the SGD logistic-regression prediction for each example.
print(model.predict(spamExample))
print(model.predict(hamExample))


#### Logistic regression trained with L-BFGS
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

algo = LogisticRegressionWithLBFGS()
model = algo.train(training_data)
score(model)

#### Support Vector Machines
#### Same train-then-score steps with a linear SVM trained by SGD.
from pyspark.mllib.classification import SVMWithSGD
algo = SVMWithSGD()
model = algo.train(training_data)
score(model)


##### Trees
#####
##### Now let’s try three variants of tree-based classification. 
##### The API is slightly different from previous algos.
コード例 #2
0
    """数据集处理操作,包括读图片,numpy.reshape,hog计算等等 并最终转换成rdd"""
    lrdd = df.rdd.map(lambda row: LabeledPoint(row[-1], \    # 读一行row
                    Vectors.dense( \ # 将row中的图片变成Vector
                    hog(np.array(row[0].data \ # numpy 转换图片为64*64,row[0].data就是图片的值
                    ).reshape(64,64),cells_per_block=(2, 2))))) # hog计算的参数
    return lrdd

# Driver script: load the train/test DataFrames, convert them to RDDs of
# labeled feature vectors, train a 15-class logistic regression, save the
# model, and report accuracy on the test set.
df_train, df_test = load_df()
print("load hdfs data successful", df_train.count(), df_test.count())

# Convert both DataFrames into RDDs of feature-processed labeled points.
rdd_train = df2labeledrdd(df_train)
# The test RDD is traversed twice below (prediction pass and count), so cache
# it to avoid recomputing the per-row feature extraction.
rdd_test = df2labeledrdd(df_test).cache()

# NOTE: the RDD transformations above are lazy; at this point no feature
# extraction has necessarily run yet, despite the message.
print("load hdfs data successful")

print("model train start at:", time.strftime('%Y-%m-%d %H:%M:%S'))
model = LogisticRegressionWithLBFGS().train(rdd_train, iterations=100, numClasses=15)
print("model train successful at:", time.strftime('%Y-%m-%d %H:%M:%S'))

# Save the model into a fresh temporary directory.
import os
import tempfile
path = tempfile.mkdtemp()
model.save(sc, path)
print("Model saved at: ", path)

# Compute accuracy: fraction of test points whose prediction equals the label.
scoreAndLabels = rdd_test.map(lambda point: (model.predict(point.features), point.label))
num_correct = scoreAndLabels.filter(lambda l: l[0] == l[1]).count()
num_test = rdd_test.count()
# float() keeps this a true division under Python 2 as well (int / int would
# truncate to 0 or 1); an empty test set yields 0.0 instead of raising
# ZeroDivisionError.
accuracy = num_correct / float(num_test) if num_test else 0.0
print("accuracy: ", accuracy)
コード例 #3
0
ファイル: classifier.py プロジェクト: RaresBr/licenta
# Attach class labels: 1 for the fake feature vectors, 0 for the real ones.
fake_samples = fake_features.map(lambda vec: LabeledPoint(1, vec))
real_samples = real_features.map(lambda vec: LabeledPoint(0, vec))

# Show one labeled sample from each class.
print(fake_samples.take(1))
print(real_samples.take(1))

# Merge both classes, then split with weights 0.8 / 0.2 into training and
# test sets; both are cached.
samples = fake_samples.union(real_samples)
training_data, test_data = samples.randomSplit([0.8, 0.2])
training_data.cache()
test_data.cache()

# Train each classifier on the training set, in this order, and print the
# value returned by score() (defined elsewhere) next to its label.
# DecisionTree (trainClassifier, numClasses=2, categoricalFeaturesInfo={})
# and RandomForest (same arguments plus numTrees=16) were tried between the
# L-BFGS and naive Bayes runs and are currently disabled.
for _report_label, _trainer in (
    ('logistic regression sgd:', LogisticRegressionWithSGD),
    ('logistic regression with lbfgs:', LogisticRegressionWithLBFGS),
    ('naive bayes: ', NaiveBayes),
):
    algorithm = _trainer()
    model = algorithm.train(training_data)
    print(_report_label, score(model))
コード例 #4
0
    for i in range(0, len(features)):
        features[i] = float(features[i])
        if features[i] < 0.0:
            features[i] = 0.0
    return LabeledPoint(label, Vectors.dense(features))


# Build the naive Bayes input set by mapping every record through
# labeled_point_nb (defined above in this file).
nb_data = records.map(labeled_point_nb)
print("the first data of nb data and the count of nb data:")
print(nb_data.first())
# The banner above announces the count as well, so print it too.
print(nb_data.count())

# Start training the models.
num_iterations = 10
max_tree_depth = 5

# `data` is defined outside this excerpt; the linear models and the decision
# tree are trained on it, while naive Bayes uses `nb_data`.
lr_model = LogisticRegressionWithLBFGS().train(data, iterations=num_iterations)
print("logistic regression model :")
print(lr_model)

svm_model = SVMWithSGD().train(data, iterations=num_iterations)
print("svm model :")
print(svm_model)

nb_model = NaiveBayes().train(nb_data)
print("naive bayes model :")
print(nb_model)

# Two classes, no categorical features. max_tree_depth was previously defined
# but never passed; it equals the library default (5), so results are the same.
dt_model = DecisionTree().trainClassifier(data, 2, {}, maxDepth=max_tree_depth)
print("decision tree model :")
print(dt_model)