Code Example #1
from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint


def PCAdata(df, num):
    """Project the features of a LabeledPoint RDD onto the first `num`
    principal components and re-attach the original labels by row index."""
    Label = df.map(lambda p: p.label).zipWithIndex().map(lambda (label, index):
                                                         (index, label))
    Features = df.map(lambda p: p.features)
    pcaModel = PCA(num).fit(Features)
    projected = pcaModel.transform(Features)
    second = projected.zipWithIndex().map(lambda (features, index):
                                          (index, features))
    result = Label.join(second).map(
        lambda (idx, (label, features)): LabeledPoint(label, features))
    return result
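A minimal usage sketch for PCAdata, assuming a SparkContext sc and a hypothetical comma-separated file data.txt whose first field is the label and whose remaining fields are at least three numeric features:

from pyspark.mllib.regression import LabeledPoint

# Hypothetical input: "label,f1,f2,f3,..." per line
points = sc.textFile("data.txt").map(lambda line: line.split(',')).map(
    lambda parts: LabeledPoint(float(parts[0]), [float(x) for x in parts[1:]]))

reduced = PCAdata(points, 3)  # keep the first 3 principal components
print reduced.first()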
Code Example #2
from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


def pca_fit(parsed_Data):
    # Project the features onto the first 5 principal components
    x = parsed_Data.map(lambda p: p.features)
    pc = PCA(5).fit(x)
    transformed = pc.transform(x)

    # Re-attach the labels to the projected features
    y = parsed_Data.map(lambda p: p.label)
    a = transformed.zip(y)
    paired = a.map(lambda line: LabeledPoint(line[1], line[0]))

    # 80/20 split, then a linear regression on the reduced features
    rdd2 = paired.randomSplit([0.8, 0.2])
    model2 = LinearRegressionWithSGD.train(rdd2[0], iterations=100,
                                           step=0.00000001, regType=None)

    # Evaluate the model on the held-out 20% split
    valuesAndPreds = rdd2[1].map(lambda p: (p.label, model2.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2)\
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
Code Example #3
def run_pca(sc):
    """For each cluster file, project the rows onto the first two principal
    components and write the result back out as a single CSV part file."""
    cpu_count = multiprocessing.cpu_count()
    cluster_loss = dict()

    for n in range(0, CLUSTERS):
        filename = "cluster_" + str(n) + ".csv"
        cl_file = CLUSTER_PATH + filename
        # Each input line is a ';'-separated row of floats
        dataset = sc.textFile(cl_file, cpu_count)
        dataset = dataset.map(
            lambda line: Vectors.dense([float(x) for x in line.split(';')]))

        model = PCA(2).fit(dataset)
        transformed = model.transform(dataset)
        transformed_csv = transformed.map(
            lambda x: ';'.join(list(map(str, x))))
        transformed_csv.coalesce(1).saveAsTextFile(PCA_PATH +
                                                   "onehot_%s" % filename)
Code Example #4
File: samplewithISAS.py  Project: wenfanwu/gmaze
print rdd_b.count()
print rdd_b.take(1)

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print type(new_scalar)
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print type(model)
transformed = model.transform(scaler3)
print type(transformed)
print transformed.count()
print transformed.first()

#
# Train a Profiles classification model with KMean
#
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters = mllibKMeans.train(transformed,
                             NBCLUSTERS,
                             maxIterations=100,
                             initializationMode=INITMODE)
# Note: the option "runs=5" has been deprecated in 1.6.0
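A possible follow-up, assuming the trained model is used to label the PCA-compressed profiles:

# Assign each compressed profile to a cluster and inspect the populations
labels = clusters.predict(transformed)
print labels.countByValue()

# Total clustering cost (sum of squared distances to the closest centers)
print clusters.computeCost(transformed)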
Code Example #5
data = sc.textFile(
    "hdfs://master:9000/root/pyspark_test/iris_data.txt")  #for hadoop yarn
parsedData = data.map(lambda line: array([x for x in line.split(',')]))

first_data = parsedData.take(1)[0]
data_row = len(first_data)  #include many input and one output attributes

params_only = parsedData.map(
    lambda x: Vectors.dense(np.float_(x[0:(data_row - 1)])))
#params_only.take(5)
#the type of params_only is pyspark.rdd.PipelinedRDD
#params_only=parsedData.map(lambda x: array(np.float_(x[0:(data_row-1)])))

model_test = PCAmllib(2).fit(params_only)
transformed = model_test.transform(params_only)
#transformed.collect()

pca_2d = transformed.collect()


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


k = 3
clusters = KMeans.train(params_only,
                        k,
                        maxIterations=10)
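The error helper above is never called in the excerpt; a sketch of the usual follow-up, computing the Within Set Sum of Squared Errors (this assumes from math import sqrt, which error already requires):

WSSSE = params_only.map(lambda point: error(point)) \
                   .reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))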
Code Example #6
File: pca.py  Project: kendazheng/sparkml
# -*- coding:utf-8 -*-
""""
Program: PCA
Description: 调用spark内置的PCA算法
Author: zhenglei - [email protected]
Date: 2016-01-14 13:45:02
# Last modified: 2016-01-28 19:23:14
Python release: 2.7
"""
# 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('pcaTestSet.txt')
    datas = tmpdatas.map(lambda line: Vectors.dense(
        array([float(line.split('\t')[0]), float(line.split('\t')[1])])))
    print datas.collect()[0]

    # Reduce the input to 1 dimension and check the accuracy of the reduced model
    model = PCA(1).fit(datas)
    transforms = model.transform(datas)
    print transforms.collect()[0], array(transforms.collect()).shape

    # Check the projected value for the input [10.235186, 11.321997]
    print model.transform(array([10.235186, 11.321997]))
    sc.stop()
Code Example #7
File: pca.py  Project: 0xqq/sparkml-1
""""
Program: PCA
Description: 调用spark内置的PCA算法
Author: zhenglei - [email protected]
Date: 2016-01-14 13:45:02
# Last modified: 2016-01-28 19:23:14
Python release: 2.7
"""
# 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('pcaTestSet.txt')
    datas = tmpdatas.map(lambda line: Vectors.dense(
        array([float(line.split('\t')[0]),
               float(line.split('\t')[1])])))
    print datas.collect()[0]

    # Reduce the input to 1 dimension and check the accuracy of the reduced model
    model = PCA(1).fit(datas)
    transforms = model.transform(datas)
    print transforms.collect()[0], array(transforms.collect()).shape

    # Check the projected value for the input [10.235186, 11.321997]
    print model.transform(array([10.235186, 11.321997]))
    sc.stop()
Code Example #8
    #
    #
    # '''
    # The A.cartesian(B) will be an RDD of the form:
    # [(A ID1, A String1), (A ID2, A String2), ...]  and  [(B ID1, B String1), (B ID2, B String2), ...]
    # to:
    # [ ((A ID1, A String1), (B ID1, B String1)), ((A ID1, A String1), (B ID2, B String2)), ((A URL2, A String2), (B ID1, B String1)), ... ]
    # '''
    # cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
    # # commonTokens:  [[id1, id2], [tokens]]
    # commonTokens = cross_RDD.map(get_common)
    # similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
    #
    # end = time.time()
    # print 'total prepare: '+ str(end - start)
    # print similarities_RDD.count()
    # c_time = time.time()
    # print 'count time: ' + str(c_time - end)
    # similarities_RDD.collect()
    # c2_time = time.time()
    # print 'count time: ' + str(c2_time - c_time)
    # print 'Successfully Calculated the similarities between all the posts'


if __name__ == '__main__':
    sc = SparkContext('local')
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    reduc = PCA(3).fit(tfidf_dVector_matrix)
    after_pca = reduc.transform(tfidf_dVector_matrix)
Code Example #9
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.
model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)

#mat = RowMatrix(tfidf)
# Calculate PCA
#pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
print(pc)
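documentNames is stored "for later" but never used in the excerpt; a sketch that pairs each document name with its 2-D projection (this assumes both RDDs keep identical partitioning, which the map-only lineage here preserves, as in Example #2's zip):

named_pc = documentNames.zip(pc)
for name, coords in named_pc.take(5):
    print name, coords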
Code Example #10
File: pre-process.py  Project: dichen001/CSC522-Spark
    # '''
    # cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
    # # commonTokens:  [[id1, id2], [tokens]]
    # commonTokens = cross_RDD.map(get_common)
    # similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
    #
    # end = time.time()
    # print 'total prepare: '+ str(end - start)
    # print similarities_RDD.count()
    # c_time = time.time()
    # print 'count time: ' + str(c_time - end)
    # similarities_RDD.collect()
    # c2_time = time.time()
    # print 'count time: ' + str(c2_time - c_time)
    # print 'Successfully Calculated the similarities between all the posts'


if __name__ == '__main__':
    conf = SparkConf()
    conf.set("spark.executor.memory", "16g")
    conf.set("spark.driver.memory","16g")
    conf.set("spark.driver.maxResultSize","16g")
    sc = SparkContext(conf=conf)
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    start2 = time.time()
    model = PCA(20).fit(tfidf_dVector_matrix)
    end2 = time.time()
    print (end2 - start2)
    after_pca = model.transform(tfidf_dVector_matrix).collect()
Code Example #11
    # LOADING AND COMPUTING TF's TRAINING MODEL
    print('Loading TRAINING_TF_MODEL...')
    tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' +
                                str(feature_dim))
    print('done!')

    print('Computing TF-IDF MODEL...')
    idf_training = IDF(minDocFreq=5).fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    print('done!')

    # APPLYING PCA ON TRAINING DATA
    if pca_mode.value == 1:
        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # CREATING DStream FROM TRAINING'S RDD
    trainingQueue = [tfidf_training]
    trainingStream = ssc.queueStream(trainingQueue)

    # CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS SPECIFYING THE NUMBER OF CLUSTERS TO FIND
    model = StreamingKMeans(k=2, decayFactor=1.0,
                            timeUnit='batches').setRandomCenters(k, 1.0, 0)
Code Example #12
from pyspark.mllib.feature import PCA as PCAmllib


def reduceDimensions(features_rdd):
    # Project each feature vector onto the first two principal components
    model = PCAmllib(2).fit(features_rdd)
    transformed_rdd = model.transform(features_rdd)
    return transformed_rdd
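A minimal usage sketch for reduceDimensions, assuming a SparkContext sc and a small in-memory dataset:

from pyspark.mllib.linalg import Vectors

features_rdd = sc.parallelize([
    Vectors.dense([1.0, 2.0, 3.0]),
    Vectors.dense([2.0, 4.1, 6.2]),
    Vectors.dense([3.0, 6.3, 9.1]),
])
print reduceDimensions(features_rdd).collect()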
Code Example #13
File: SparkPCA.py  Project: mas-dse/dse_capstone
    m_source_list = [key[0], key[1], key[2]] + \
           [v[1] for v in vals] + \
           [v[2] for v in vals] + \
           [v[3] for v in vals] + \
           [v[4] for v in vals] + \
           [v[5] for v in vals]
    return Vectors.dense(m_source_list)


# COMMAND ----------

m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t = newrows.first()
print type(t), t

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t = transformed.first()
print type(t), t
Code Example #14
File: SparkPCA.py  Project: haydenholligan/practice
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.
model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)

# mat = RowMatrix(tfidf)
# Calculate PCA
# pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
print(pc)
Code Example #15
# Get average overall rating per review length (char count)
review_length_cc, = averages_per_key(reviewer_vectors, lambda x:
                                     (x[1][4], [x[1][2]]))
review_length_wc, = averages_per_key(reviewer_vectors, lambda x:
                                     (x[1][6], [x[1][2]]))

result_collection["review_length_char_count"] = review_length_cc
result_collection["review_length_word_count"] = review_length_wc

# Conduct PCA
reviewer_vectors_real = reviewer_vectors.map(
    lambda x: Vectors.dense([val for val in x[1]]))

pca_model = PCA(8).fit(reviewer_vectors_real)
transformed = pca_model.transform(reviewer_vectors_real)

current_best = None
current_best_cost = float("inf")

# Run K-Means
for k in range(2, 70, 7):
    kmeans_model = KMeans.train(transformed, k, maxIterations=100, runs=10)

    cost = kmeans_model.computeCost(transformed)

    if cost < current_best_cost:
        current_best_cost = cost
        current_best = kmeans_model

#current_best.save(sc, "reviews/kmeans_model")
Code Example #16
File: PCMspark-ISAS.py  Project: gmaze/pcmspark
sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)

# Effectively reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print type(rdd_reduced)

# In[Classification with k-mean]:

### Run KMeans to build the classification model
from pyspark.mllib.clustering import KMeans as KMeansmllib
import time
start_time = time.time()

NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters_kmean = KMeansmllib.train(rdd_reduced,
                                   NBCLUSTERS,
                                   maxIterations=200,
                                   runs=20,
                                   initializationMode=INITMODE)
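start_time is captured but never reported in the excerpt; a sketch of the usual follow-up, timing the run and labelling the reduced profiles:

print 'KMeans training took %.1f seconds' % (time.time() - start_time)

# Assign each reduced profile to one of the NBCLUSTERS classes
labels_kmean = clusters_kmean.predict(rdd_reduced)
print labels_kmean.take(10)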
Code Example #17
File: SparkPCA.py  Project: kkdyer/dse_capstone
    m_source_list = [key[0], key[1], key[2]] + \
           [v[1] for v in vals] + \
           [v[2] for v in vals] + \
           [v[3] for v in vals] + \
           [v[4] for v in vals] + \
           [v[5] for v in vals]
    return Vectors.dense(m_source_list)

# COMMAND ----------

m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t = newrows.first()
print type(t), t

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t = transformed.first()
print type(t), t
Code Example #18
def reduceDimensions(features_rdd):
	model = PCAmllib(2).fit(features_rdd)
	transformed_rdd = model.transform(features_rdd)
	return transformed_rdd
Code Example #19
    spark = SparkSession\
        .builder\
        .appName("linearSVC Example")\
        .getOrCreate()

    # $example on$
    # Load training data
    inputData = spark.read.format("libsvm") \
        .load("combined_data_svm.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # Assumed imports: from pyspark.ml.feature import PCA as PCAml
    #                  from pyspark.ml.classification import LinearSVC
    # Reduce the features with the DataFrame-based PCA; the RDD-based PCAmllib
    # cannot fit a DataFrame and its model has no inverse_transform method.
    pca = PCAml(k=2, inputCol="features", outputCol="pca")
    pcaModel = pca.fit(train)
    train_pca = pcaModel.transform(train)
    test_pca = pcaModel.transform(test)

    # NOTE: the LinearSVC step is an assumption; the excerpt references
    # lsvcModel but never defines it.
    lsvc = LinearSVC(featuresCol="pca", labelCol="label", maxIter=100)
    lsvcModel = lsvc.fit(train_pca)

    # score the model on test data.
    predictions = lsvcModel.transform(test_pca)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # $example off$

    spark.stop()