Example #1
def markDelay(v):
    return LabeledPoint(v[0], np.array(v[1:]))
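
# A hedged usage sketch (imports and the sample record are assumed, not from the
# source): v is a numeric row with the delay label in position 0 and the
# remaining fields as features.
import numpy as np
from pyspark.mllib.regression import LabeledPoint

lp = markDelay([15.0, 3.0, 1230.0, 4.5])  # -> LabeledPoint(15.0, [3.0, 1230.0, 4.5])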
Example #2
    mse = scoreAndLabels.map(lambda (a, b): (a - b)**2).mean()
    rmse = math.sqrt(mse)
    #metrics = RegressionMetrics(scoreAndLabels)
    #RMSE=metrics.rootMeanSquaredError
    return mae, rmse


sc = SparkContext()

selcol = [1, 3, 4, 6, 18, 23, 25]
train = prep_Data("HW4/200[3-7].csv", selcol)
test = prep_Data("HW4/2008.csv", selcol)

#transform the data into the format that can be fed into the model
trainLabeled = train.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))
testLabeled = test.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))

#reserve part of the data as validation data
train_dataset, val_dataset = trainLabeled.randomSplit([0.7, 0.3])

#train
linear_model_val = LinearRegressionWithSGD.train(train_dataset, 100000,
                                                 0.00000000001)
linear_model = LinearRegressionWithSGD.train(trainLabeled, 100000,
                                             0.00000000001)

#evaluateModel(linear_model_val, val_dataset)
#evaluateModel(linear_model, testLabeled)
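
# The evaluateModel calls above are left commented out; a minimal sketch of such
# a helper, assuming it takes a trained mllib regression model and an RDD of
# LabeledPoint, using the RegressionMetrics route mentioned in Example #2
# (names here are illustrative, not the original implementation).
from pyspark.mllib.evaluation import RegressionMetrics


def evaluate_model_sketch(model, data):
    # Pair each prediction with its true label and let RegressionMetrics
    # aggregate the error measures.
    scoreAndLabels = data.map(lambda p: (float(model.predict(p.features)), p.label))
    metrics = RegressionMetrics(scoreAndLabels)
    return metrics.meanAbsoluteError, metrics.rootMeanSquaredError

# e.g.: mae, rmse = evaluate_model_sketch(linear_model_val, val_dataset)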
Example #3
def main(sc):

    train_id = utils.load("data_id/train.p")
    test_id = utils.load("data_id/test.p")

    meta(train_id)

    train_id = [[idx] for idx in train_id]
    test_id = [[idx] for idx in test_id]

    sqlContext = SQLContext(sc)
    train_f = sqlContext.createDataFrame(train_id, ['biz_id'])
    test_f = sqlContext.createDataFrame(test_id, ['biz_id'])

    # Register user defined functions
    # city = udf(lambda b_id: get_city(b_id), StringType())
    state = udf(lambda b_id: MLVectors.dense(get_state(b_id)), VectorUDT())
    stars = udf(lambda b_id: get_stars(b_id), FloatType())
    popularity = udf(lambda b_id: get_popularity(b_id), IntegerType())
    name_size = udf(lambda b_id: get_name_size(b_id), IntegerType())
    name_polar = udf(lambda b_id: get_name_polar(b_id), FloatType())
    pos_neg_score = udf(lambda b_id: MLVectors.dense(get_PosNeg_score(b_id)),
                        VectorUDT())
    # clarity = udf(lambda b_id: get_clarity(b_id), ArrayType(FloatType()))
    elite_cnt = udf(lambda b_id: get_elite_cnt(b_id), IntegerType())
    label = udf(lambda b_id: get_y(b_id), IntegerType())

    # Generate feature columns
    # data_f = data_f.withColumn("city", city(data_f['biz_id']))
    train_f = train_f.withColumn("state", state(train_f['biz_id']))
    train_f = train_f.withColumn("stars", stars(train_f['biz_id']))
    train_f = train_f.withColumn("popularity", popularity(train_f['biz_id']))
    train_f = train_f.withColumn("name_size", name_size(train_f['biz_id']))
    train_f = train_f.withColumn("name_polar", name_polar(train_f['biz_id']))
    train_f = train_f.withColumn("pos_neg_score",
                                 pos_neg_score(train_f['biz_id']))
    # data_f = data_f.withColumn("clarity", clarity(data_f['biz_id']))
    train_f = train_f.withColumn("elite_cnt", elite_cnt(train_f['biz_id']))
    train_f = train_f.withColumn("y", label(train_f['biz_id']))
    train_f.show(5)

    # Generate feature columns
    test_f = test_f.withColumn("state", state(test_f['biz_id']))
    test_f = test_f.withColumn("stars", stars(test_f['biz_id']))
    test_f = test_f.withColumn("popularity", popularity(test_f['biz_id']))
    test_f = test_f.withColumn("name_size", name_size(test_f['biz_id']))
    test_f = test_f.withColumn("name_polar", name_polar(test_f['biz_id']))
    test_f = test_f.withColumn("pos_neg_score",
                               pos_neg_score(test_f['biz_id']))
    test_f = test_f.withColumn("elite_cnt", elite_cnt(test_f['biz_id']))
    test_f = test_f.withColumn("y", label(test_f['biz_id']))
    test_f.show(5)

    # One-hot encoding
    # encoder = OneHotEncoder(inputCol="state", outputCol="stateVec")
    # train_f = encoder.transform(train_f)
    train_f.show(5)
    # test_f = encoder.transform(test_f)
    test_f.show(5)

    # Assemble columns to features
    assembler = VectorAssembler(inputCols=[
        "state", "stars", "popularity", "name_size", "name_polar",
        "pos_neg_score", "elite_cnt"
    ],
                                outputCol="features")

    train_f = assembler.transform(train_f)
    train_f.show(5)
    test_f = assembler.transform(test_f)
    test_f.show(5)

    train_f = train_f.filter(train_f.y != -1)
    test_f = test_f.filter(test_f.y != -1)


    train_d = (train_f.select(col("y"), col("features")) \
                .rdd \
                .map(lambda row: LabeledPoint(float(row.y), MLLibVectors.fromML(row.features))))
    m = SVMWithSGD.train(train_d)
    predictionAndLabels = test_f.rdd.map(lambda row: (float(
        m.predict(MLLibVectors.fromML(row.features))), float(row.y)))
    # Grid search for best params and model
    # scores = {}
    # max_score = 0
    # for m in model_list:
    #     print ('run', m)
    #     evaluator = BinaryClassificationEvaluator()
    #     cv = CrossValidator(estimator=model_list[m],
    #                 estimatorParamMaps=params_list[m],
    #                 evaluator=evaluator,
    #                 numFolds=3)
    #     cv.fit(train)
    #     scores[m] = cv.get_best_score()
    #     if scores[m] > max_score:
    #         op_params = params_list[m][cv.get_best_index()]
    #         op_model = cv.get_best_model()
    #         op_m_name = m

    # predictionAndLabels = test.map(lambda lp: (float(op_model.predict(lp.features)), lp.y))

    # Instantiate metrics object
    bi_metrics = BinaryClassificationMetrics(predictionAndLabels)
    mul_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % bi_metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % bi_metrics.areaUnderROC)
    # Confusion Matrix
    print("Confusion Matrix")
    print(mul_metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = mul_metrics.precision()
    recall = mul_metrics.recall()
    f1Score = mul_metrics.fMeasure()
    accuracy = mul_metrics.accuracy
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Accuracy = %s" % accuracy)

    # Individual label stats
    labels = [0, 1]
    for label in labels:
        print("Class %s precision = %s" %
              (label, mul_metrics.precision(label)))
        print("Class %s recall = %s" % (label, mul_metrics.recall(label)))
Example #4
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])

# Create a sparse matrix in CSC format: numRows, numCols, colPtrs, rowIndices, values
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
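
# To sanity-check the layouts above, both matrices can be materialized as NumPy
# arrays (Matrix.toArray() is available on the mllib matrix types):
print(dm2.toArray())  # [[1. 2.] [3. 4.] [5. 6.]]
print(sm.toArray())   # [[9. 0.] [0. 8.] [0. 6.]]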
Example #5
labelIndexer = StringIndexer(inputCol="f3", outputCol="att_f3")
model = labelIndexer.fit(df4)
df5 = model.transform(df4)

from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

va = VectorAssembler(inputCols=["att_a", "att_f1", "att_f2", "att_f3"],
                     outputCol="features")
df6 = va.transform(df5)

df7 = df6.withColumnRenamed('lables', 'label')
trainDf = df7.select('label', 'features')
trainDf.printSchema()
print trainDf.show()

from pyspark.mllib.regression import LabeledPoint
trainRdd = trainDf.map(lambda row: LabeledPoint(row.label, row.features))
print trainRdd.take(20)

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.01)
model1 = lr.fit(trainDf)
print model1.coefficients
print model1.intercept

from pyspark.sql import Row
test0 = sc.parallelize([Row(features=Vectors.dense(2, 0, 0, 1))]).toDF()
result = model1.transform(test0).head()
print result.prediction
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(')')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))

    return LabeledPoint(label, vec)
Example #7
    else:
        return acum / count


sc = SparkContext(conf=SparkConf())
learn = sc.textFile('parsedTrainSmall.csv', 8)

learn = learn.map(lambda x: x.split('|')).map(lambda x: (x[0], x[
    2], dame_minhashes_shingles2(dame_shingles_words(x[1], 3, 15))))

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import DenseVector

data_for_decision_tree = learn.map(
    lambda x: LabeledPoint(label=x[1], features=DenseVector(x[2])))
(dataTrain, dataTest) = data_for_decision_tree.randomSplit([0.7, 0.3])
model = DecisionTree.trainRegressor(dataTrain,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=32)
predictions = model.predict(dataTest.map(lambda x: x.features))
labelsAndPredictions = dataTest.map(lambda x: x.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) *
                                   (v - p)).sum() / float(dataTest.count())
print "MSE = %f" % (testMSE)

learn = learn.map(lambda x: (x[0], x[1], dame_hash_bandas(x[2])))
learn = learn.flatMap(lambda x: flatmapeo(x[0], x[1], x[2])
                      )  #(u'a9wx8dk93sn5', u'1.0', 813759583895638922)
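
# After the flatMap each record is (doc_id, label, band_hash), as the inline
# comment above shows. A hedged sketch (not in the original snippet) of the
# usual next LSH step: group records sharing a band hash into candidate groups.
candidates = (learn.map(lambda x: (x[2], (x[0], x[1])))
                   .groupByKey()
                   .filter(lambda kv: len(kv[1]) > 1))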
Example #8
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector, DenseVector

sparse_data = [
    LabeledPoint(0.0, DenseVector([0,  1.0, 2])),
    LabeledPoint(1.0, DenseVector([0, 1, 1.0])),
    LabeledPoint(0.0, DenseVector([1, 0, 1.0])),
    LabeledPoint(1.0, DenseVector([1, 1.3,  2.0])),
    LabeledPoint(1.0, DenseVector([1, 2.1,  1.6])),
]


# Parallelize the labeled points so the trees can be trained on an RDD.
data = sc.parallelize(sparse_data)

model = GradientBoostedTrees.trainRegressor(data, categoricalFeaturesInfo={0:2}, numIterations=10)

print(model.numTrees())

print(model.totalNumNodes())

model.predict(DenseVector([1, 1, 1.0]))
model.predict(DenseVector([0, 0, 1.0]))

#rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
#model.predict(rdd).collect()
#tokenizing the paragraphs for words
tokenizer = Tokenizer(inputCol="review", outputCol="words")
#transformation
wordsData = tokenizer.transform(schemeReview)
#Hashing the words input
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=300)
#transforming the data to hash
featurizedData = hashingTF.transform(wordsData)
#instantiating the IDF model
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features", "id")
#Creating RDD of LabeledPoints
lpSelectData = selectData.map(lambda x:
                              (x.id, LabeledPoint(x.label, x.features)))
#Splitting the data for training and test
(trainingData, testData) = lpSelectData.randomSplit([0.9, 0.1])
# training the Logistic regression with LBFGS model
lrm = LogisticRegressionWithLBFGS.train(trainingData.map(lambda x: x[1]),
                                        iterations=10)
#fetching the labels and predictions for test data
labelsAndPreds = testData.map(lambda p:
                              (p[0], p[1].label, lrm.predict(p[1].features)))
#calculating the accuracy and printing it.
accuracy = labelsAndPreds.filter(lambda (i, v, p): v == p).count() / float(
    testData.count())
print("Accuracy = " + str(accuracy))
Example #10
                        header="false")
val = spark.read.load("hdfs://10.190.2.112/data/val_set.txt",
                      format="csv",
                      sep="\t",
                      inferSchema="true",
                      header="false")
test = spark.read.load("hdfs://10.190.2.112/data/test_set.txt",
                       format="csv",
                       sep="\t",
                       inferSchema="true",
                       header="false")

# create features and labels
HDF = HashingTF(50)
train = train.rdd.map(
    lambda x: LabeledPoint(x[6] == 'E', HDF.transform([x[2], x[3]])))
test = test.rdd.map(
    lambda x: LabeledPoint(x[6] == 'E', HDF.transform([x[2], x[3]])))
val = val.rdd.map(
    lambda x: LabeledPoint(x[6] == 'E', HDF.transform([x[2], x[3]])))

with open('H2_15300180012_output.txt', 'w') as f:
    f.write('H2_15300180012_output\n')


def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)
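
    # The example is truncated here; a hedged sketch (not the author's code) of
    # how the freshly trained model could be scored on the validation split
    # built above.
    predictionAndLabel = val.map(lambda p: (model.predict(p.features), p.label))
    accuracy = (predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count()
                / float(val.count()))
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {}\n'.format(accuracy))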
Example #11
def parseInput(line):
    return LabeledPoint(float(line[1]),line[0])
Example #12
def get_RDDs(data, corpus, weights, questions, labels):
    w2v = word2vec.Word2Vec(corpus,
                            size=100,
                            window=20,
                            min_count=1,
                            workers=40)
    '''
    one : tfidf scores only
    two : word2vec vectors only
    three : jaccard index only
    four : word2vec * tfidf
    five : word2vec * tfidf, jaccard index

    sum or mean : way the word vectors for the entire sentence
                  were combined into one vector or number

    cosine or squeclidean : similarity measurement on the two sum/mean vectors
    '''
    one = questions.map(lambda x: (weight_vector_tfidf(weights, x[0]),
                                   weight_vector_tfidf(weights, x[1])))
    one_sum = one.map(lambda x: (get_sum(x[0]), get_sum(x[1])))
    one_mean = one.map(lambda x: (get_mean(x[0]), get_mean(x[1])))
    two = questions.map(
        lambda x: (weight_vector_w2v(w2v, x[0]), weight_vector_w2v(w2v, x[1])))
    two_sum = two.map(lambda x: (sum_w2v(x[0]), sum_w2v(x[1])))
    two_mean = two.map(lambda x: (mean_w2v(x[0]), mean_w2v(x[1])))
    three = questions.map(lambda x: jaccard_index(x[0], x[1]))
    four = questions.map(lambda x: (weight_vector_both(weights, w2v, x[0]),
                                    weight_vector_both(weights, w2v, x[1])))
    four_sum = four.map(lambda x: (sum_w2v(x[0]), sum_w2v(x[1])))
    four_mean = four.map(lambda x: (mean_w2v(x[0]), mean_w2v(x[1])))
    five_sum = four_sum.zip(three)
    five_mean = four_mean.zip(three)
    labels = labels.coalesce(1)
    one_sum_difference = labels.zip(
        one_sum.map(lambda x: abs(x[0] - x[1])).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    one_mean_difference = labels.zip(
        one_mean.map(lambda x: abs(x[0] - x[1])).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    two_sum_cosine = labels.zip(
        two_sum.map(lambda x: get_cosine(x)).coalesce(1)).repartition(100).map(
            lambda x: LabeledPoint(x[0], [x[1]]))
    two_sum_sqeuclidean = labels.zip(
        two_sum.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    two_mean_cosine = labels.zip(
        two_mean.map(lambda x: get_cosine(x)).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    two_mean_sqeuclidean = labels.zip(
        two_mean.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    three = labels.zip(
        three.coalesce(1)).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_sum_cosine = labels.zip(
        four_sum.map(lambda x: get_cosine(x)).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_sum_sqeuclidean = labels.zip(
        four_sum.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_mean_cosine = labels.zip(
        four_mean.map(lambda x: get_cosine(x)).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_mean_sqeuclidean = labels.zip(
        four_mean.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    five_sum_cosine = labels.zip(
        five_sum.map(lambda x: (get_cosine(x[0]), x[1])).coalesce(1)).map(
            lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    five_sum_sqeuclidean = labels.zip(
        five_sum.map(lambda x: (sqeuclidean(x[0][0], x[0][1]), x[1])).coalesce(
            1)).map(lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    five_mean_cosine = labels.zip(
        five_mean.map(lambda x: (get_cosine(x[0]), x[1])).coalesce(1)).map(
            lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    five_mean_sqeuclidean = labels.zip(
        five_mean.map(lambda x: (sqeuclidean(x[0][0], x[0][1]), x[1])).
        coalesce(1)).map(lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    RDDs = [
        one_sum_difference, one_mean_difference, two_sum_cosine,
        two_sum_sqeuclidean, two_mean_cosine, two_mean_sqeuclidean, three,
        four_sum_cosine, four_sum_sqeuclidean, four_mean_cosine,
        four_mean_sqeuclidean, five_sum_cosine, five_sum_sqeuclidean,
        five_mean_cosine, five_mean_sqeuclidean
    ]

    return RDDs
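
# A hedged usage sketch (not from the source): every RDD returned above is an
# RDD of LabeledPoint, so a simple mllib classifier can be trained on each one
# to compare the feature variants.
from pyspark.mllib.classification import LogisticRegressionWithLBFGS


def compare_feature_sets(RDDs):
    for i, rdd in enumerate(RDDs):
        train, test = rdd.randomSplit([0.8, 0.2], seed=42)
        model = LogisticRegressionWithLBFGS.train(train)
        correct = (test.map(lambda p: (model.predict(p.features), p.label))
                       .filter(lambda pl: pl[0] == pl[1]).count())
        print("feature set %d accuracy: %.3f" % (i, correct / float(test.count())))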
Example #13
def parsePoint(line):
    line = line.replace("[", '')
    line = line.replace("]", '')
    line = line.replace(" ", '')
    values = [int(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Liner Regression").setMaster("yarn")
sc = SparkContext(conf=conf)

data = [
             LabeledPoint(0, Vectors.dense([1, 0, 0])),
             LabeledPoint(0, Vectors.dense([2, 0, 0])),
             LabeledPoint(1, Vectors.dense([0, 1, 0])),
             LabeledPoint(1, Vectors.dense([0, 2, 0])),
             LabeledPoint(2, Vectors.dense([0, 0, 1])),
             LabeledPoint(2, Vectors.dense([0, 0, 2]))
]

data = sc.parallelize(data)

# Split the data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=0)
training.cache()

model = NaiveBayes.train(training, 1.0)

predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

print("Test Data:")
print(test.collect())
Example #15
sc = SparkContext("local", "titanic_test")
sqlContext = SQLContext(sc)
df = pd.read_csv('Titanic_train.csv')

df['Sex'] = df['Sex'].replace('female', 1)
df['Sex'] = df['Sex'].replace('male', 0)
df['Age'] = df['Age'].fillna(-1)

traindf = pd.DataFrame(df,
                       columns=['Survived', 'Pclass', 'Age', 'Sex', 'Fare'])

sdf = sqlContext.createDataFrame(traindf)

import pyspark.mllib.classification as sparkclass
temp = sdf.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))

#lrm = sparkclass.SVMWithSGD.train(temp,iterations=10)
#lrm=sparkclass.LogisticRegressionWithSGD.train(temp,iterations=10)

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

lrm = DecisionTree.trainClassifier(temp,
                                   numClasses=2,
                                   categoricalFeaturesInfo={},
                                   impurity='gini',
                                   maxDepth=5,
                                   maxBins=32)

df = pd.read_csv('Titanic_test.csv')
Example #16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project      : tql-Python.
# @File         : libsvm2df
# @Time         : 2020-01-22 12:16
# @Author       : yuanjie
# @Email        : [email protected]
# @Software     : PyCharm
# @Description  : https://blog.csdn.net/weixin_42286026/article/details/84496896

from pyspark.mllib.util import MLUtils

from pyspark import SparkContext

sc = SparkContext()
(MLUtils.loadLibSVMFile(sc, 'libsvm__').map(
    lambda r: (r.label, r.features.toArray())).saveAsTextFile('ffm'))

from pyspark.mllib.regression import LabeledPoint
labelpointRDD = sparkdf.rdd.map(lambda row: LabeledPoint(row[-1], row[:-1]))
def create_labeled_point(line_split):
    clean_line_split = line_split[0:41]
    # convert protocol to numeric categorical variable
    try:
        clean_line_split[1] = protocols.index(clean_line_split[1])
    except:
        clean_line_split[1] = len(protocols)
# convert service to numeric categorical variable
    try:
        clean_line_split[2] = services.index(clean_line_split[2])
    except:
        clean_line_split[2] = len(services)
# convert flag to numeric categorical variable
    try:
        clean_line_split[3] = flags.index(clean_line_split[3])
    except:
        clean_line_split[3] = len(flags)
# convert label to binary label
#	attack = 1.0
    attack = 4.0
    if line_split[41] == 'normal.':
        attack = 0.0
#	elif line_split[41]=='back.':
#	if line_split[41]=='back.':
#		attack = 1.0
#	elif line_split[41]=='land.':
#		attack = 2.0
#	elif line_split[41]=='neptune.':
#		attack = 3.0
#	elif line_split[41]=='pod.':
#		attack = 4.0
#	elif line_split[41]=='smurf.':
#		attack = 5.0
#	elif line_split[41]=='teardrop.':
#		attack = 6.0
    elif line_split[41] == 'ipsweep.':
        #	if line_split[41]=='ipsweep.':
        attack = 1.0
    elif line_split[41] == 'nmap.':
        attack = 2.0
#	elif line_split[41]=='portsweep.':
#	if line_split[41]=='portsweep.':
#		attack = 3.0
#	elif line_split[41]=='normal.':
#		attack = 0.0
    else:
        attack = 4.0


#	elif line_split[41]=='imap.':
#		attack = 10.0
#	elif line_split[41]=='ftp_write.':
#		attack = 11.0
#	elif line_split[41]=='guess_passwd.':
#		attack = 12.0
#	elif line_split[41]=='spy.':
#		attack = 13.0
#	elif line_split[41]=='warezclient.':
#		attack = 14.0
#	elif line_split[41]=='warezmaster.':
#		attack = 15.0
#	elif line_split[41]=='multihop.':
#		attack = 16.0
#	elif line_split[41]=='phf.':
#		attack = 17.0

#	elif line_split[41]=='buffer_overflow.':
#		attack = 18.0
#	elif line_split[41]=='rootkit.':
#		attack = 19.0
#	elif line_split[41]=='perl.':
#		attack = 20.0
#	elif line_split[41]=='loadmodule.':
#		attack = 21.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
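
# create_labeled_point assumes that protocols, services and flags lookup lists
# already exist; a hypothetical construction from the raw KDD-style CSV
# (raw_data and the column positions are assumptions, not from the source):
csv_data = raw_data.map(lambda line: line.split(","))
protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()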
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
data = [
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
]
lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
lrm.predict([1.0, 0.0])
lrm.predict([0.0, 1.0])
lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
lrm.clearThreshold()
lrm.predict([0.0, 1.0])
Example #19
def parsePoint(line):
    data = line[1:][:-1]
    values = [float(x) for x in data.split(', ')]
    return LabeledPoint(1 if values[34] > 0.5 else 0, values[:-1])
Example #20
def labeledPointConverter(row):
    try:
        return LabeledPoint(1.0, row[1:])
    except ValueError:
        return LabeledPoint(50.0, [1.0])
Example #21
def labelData(data):
    return data.map(lambda row: LabeledPoint(row[9], [
        row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8],
        row[10], row[11], row[12], row[13], row[14], row[15]
    ]))
Example #22
    from pyspark.mllib.util import MLUtils


    if __name__ == "__main__":
        if len(sys.argv) not in [1, 2]:
            print("Usage: correlations (<file>)", file=sys.stderr)
            exit(-1)
        sc = SparkContext(appName="PythonCorrelations")
        if len(sys.argv) == 2:
            filepath = sys.argv[1]
        else:
            filepath = 'sample_linear_regression_data.txt'
        corrType = 'pearson'

        points = MLUtils.loadLibSVMFile(sc, filepath)\
            .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

        print()
        print('Summary of data file: ' + filepath)
        print('%d data points' % points.count())

        # Statistics (correlations)
        print()
        print('Correlation (%s) between label and each feature' % corrType)
        print('Feature\tCorrelation')
        numFeatures = points.take(1)[0].features.size
        labelRDD = points.map(lambda lp: lp.label)
        for i in range(numFeatures):
            featureRDD = points.map(lambda lp: lp.features[i])
            corr = Statistics.corr(labelRDD, featureRDD, corrType)
            print('%d\t%g' % (i, corr))
        i += 1
        step += len(m)
        num_vec = np.array([float(field) for field in record[5:6]])
        return np.concatenate((cat_vec, num_vec))


def extract_hp_label(record):
    return record[6]


def extract_acc_label(record):
    return record[5]


accData = records.map(
    lambda r: LabeledPoint(extract_acc_label(r), extract_features(r)))
hpData = records.map(
    lambda r: LabeledPoint(extract_hp_label(r), extract_features(r)))

acc_first_point = accData.first()
hp_first_point = hpData.first()


def extract_features_dt(record):
    return np.array(map(float, record[5:6]))


# Decision Tree Method

# Feature vector creation for acceleration
data_dt_acc = records.map(
Example #24
def buildTfIdfRddAllTopics(business, sports, politics, entertainment):
	business_df = buildTextRDD(business, BUSINESS_LABEL)
	politics_df = buildTextRDD(politics, POLITICS_LABEL)
	sports_df = buildTextRDD(sports, SPORTS_LABEL)
	entertainment_df = buildTextRDD(entertainment, ENTERTAINMENT_LABEL)

	# Union together all dataframes
	main_df = business_df.union(politics_df)
	main_df = main_df.union(sports_df)
	main_df = main_df.union(entertainment_df)
	main_df = main_df.withColumnRenamed('_1', 'label')
	main_df = main_df.withColumnRenamed('_2', 'content')
	tokenizer = Tokenizer(inputCol="content", outputCol="words")
	wordsData = tokenizer.transform(main_df)
	hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8)
	featurizedData = hashingTF.transform(wordsData)
	idf = IDF(inputCol="rawFeatures", outputCol="features")
	idfModel = idf.fit(featurizedData)
	rescaledData = idfModel.transform(featurizedData)
	return rescaledData.select([c for c in rescaledData.columns if c in ['label', 'features']]).rdd.map(lambda x: LabeledPoint(x.label, MLLibVectors.fromML(x.features)))
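
# A hedged usage sketch (inputs and the classifier choice are illustrative, not
# from the source): the function returns an RDD of LabeledPoint, so an mllib
# classifier can be trained on it directly.
from pyspark.mllib.classification import NaiveBayes

labeled = buildTfIdfRddAllTopics(business, sports, politics, entertainment)
trainRDD, testRDD = labeled.randomSplit([0.8, 0.2])
nbModel = NaiveBayes.train(trainRDD)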
Example #25
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])
Example #26
    """
    :param RDD: RDD, created from step_level_features
    :return: RDD with features, aggregated over the trip
    """ 
    trip_lv = RDD.map(lambda x: (x[0], (min(x[1][4]), max(x[1][4]), min(x[1][5]), max(x[1][5]), 
                                        len(x[1][0]), sum(x[1][4]), np.mean(x[1][4]), np.std(x[1][4]), 
                                        np.mean(x[1][5]), np.std(x[1][5]),
                                        sum([elem < 0.5 for elem in x[1][4]])), x[2]))
    return trip_lv

def create_logistic_model(RDD):
    """
    :param RDD: RDD, created from trip_level_features
    :return: mllib logistic regression model
    """
    label_pt = RDD.map(lambda x: LabeledPoint(x[2], x[1]))
    model = LogisticRegressionWithLBFGS.train(label_pt)
    return model


def train_err(model, label_pt):
    """
    :param model: mllib logistic regression model
    :param label_pt: RDD of LabeledPoint the model was trained on
    :return: training error for model
    """
    labelsAndPreds = label_pt.map(lambda x: (x.label, model.predict(x.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(label_pt.count())
    return "Training Error = " + str(trainErr)


"""
Example #27
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[1], [values[0]])
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")

lp = selectData.map(lambda x : LabeledPoint(x.label,x.features))

(trainingData, testData) = lp.randomSplit([0.6, 0.4])

 
model = NaiveBayes.train(trainingData,1.0)

predictionAndLabel = testData.map(lambda p : (model.predict(p.features), p.label))
accuracy = 100 * predictionAndLabel.filter(lambda (x, v): x == v ).count() / testData.count()
print accuracy

fp = predictionAndLabel.filter(lambda (x, v): x == 1 ).filter(lambda(x,v): v==0).count()
tp = predictionAndLabel.filter(lambda (x, v): x == v ).filter(lambda(x,v): v==1).count()
totalpositive = predictionAndLabel.filter(lambda(x,v): v==1).count()
recall = 100*tp/totalpositive
precision = 100*tp/(tp+fp)
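
# Not in the original: F1 can be derived from the two quantities just computed.
f1 = 2.0 * precision * recall / (precision + recall)
print f1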
Example #29
import numpy as np

from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setMaster("local").setAppName("Test")

sc = SparkContext(conf=conf)
sparse_data = [
    LabeledPoint(0.0, Vectors.dense([1.0, 0.0])),
    LabeledPoint(1.0, Vectors.dense([0.0, 1.0])),
    LabeledPoint(0.0, Vectors.dense([10.0, 9.0])),
    LabeledPoint(1.0, Vectors.dense([9.0, 10.0]))
]
rdd = sc.parallelize(sparse_data)
model = LogisticRegressionWithSGD.train(rdd, iterations=10)
rdd = rdd.map(lambda x:x.features)
model.predict(rdd).saveAsTextFile("result/hdfs")
sc.stop()
Example #30
def parsePoint(line):
    values = line.split()
    return LabeledPoint(
        int(values[0]),
        DenseVector([int(x.split(':')[1]) for x in values[1:]]))
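
# A hedged usage sketch (file path and SparkContext are assumed): each input
# line is expected to look like "<label> <idx>:<value> <idx>:<value> ...";
# parsePoint keeps only the values and drops the indices.
from pyspark.mllib.linalg import DenseVector  # needed by parsePoint above
from pyspark.mllib.regression import LabeledPoint

points = sc.textFile("data/sample_svm_data.txt").map(parsePoint)
print(points.first())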