Example 1
from pyspark.mllib.feature import StandardScaler, StandardScalerModel

scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)

# Mean vector learned by the scaler (newer PySpark versions also expose it as scaler.mean):
sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print(type(reducer))

# Effectively reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print(type(rdd_reduced))

# In[Classification with k-mean]:

### Run KMeans to build the classification model
from pyspark.mllib.clustering import KMeans as KMeansmllib
import time
start_time = time.time()

NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
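# A minimal sketch of the training call this snippet is setting up (it mirrors
# the pattern used in Example 3 below; rdd_reduced is assumed to be the input):
model_kmeans = KMeansmllib.train(rdd_reduced,
                                 NBCLUSTERS,
                                 maxIterations=100,
                                 initializationMode=INITMODE)
print("KMeans training took %.1f s" % (time.time() - start_time))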
Example 2
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.feature import PCA as PCAmllib

rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.
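# A quick peek at one of these sparse vectors (each has length 100000, one
# slot per hash bucket, with non-zero entries only for terms that occur):
print(tf.first())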

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TF*IDF
# of each unique hash value for each document.
# Note: mllib's PCA works on vectors of at most 65,535 columns, so with 100K
# hash buckets this step may require a smaller feature dimension.
model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)

#mat = RowMatrix(tfidf)
# Calculate PCA
#pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
print(pc)
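# A minimal sketch of pairing the document names stored earlier with their
# 2-D projections; this assumes both RDDs stay aligned, which holds here
# because both are map-derived from the same `fields` RDD:
for name, vec in documentNames.zip(pc).take(5):
    print(name, vec)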
Example 3
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler, PCA as PCAmllib
from pyspark.mllib.clustering import KMeans as mllibKMeans

rdd_loaded.count()
rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x))
print(rdd_b.count())
print(rdd_b.take(1))

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print(type(new_scalar))
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print(type(model))
transformed = model.transform(scaler3)
print(type(transformed))
print(transformed.count())
print(transformed.first())

#
# Train a Profiles classification model with KMean
#
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters = mllibKMeans.train(transformed,
                             NBCLUSTERS,
                             maxIterations=100,
                             initializationMode=INITMODE)
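# A minimal sketch of using the trained model: assign each reduced profile to
# a cluster and look at the resulting cluster sizes.
labels = clusters.predict(transformed)
print(labels.countByValue())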
Example 4
from numpy import array
from math import sqrt
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.feature import PCA as PCAmllib

#data = sc.textFile("iris_data.txt")  # for master local or standalone mode
data = sc.textFile(
    "hdfs://master:9000/root/pyspark_test/iris_data.txt")  # for Hadoop YARN
parsedData = data.map(lambda line: array([x for x in line.split(',')]))

first_data = parsedData.take(1)[0]
data_row = len(first_data)  # number of columns: input attributes plus one output (label) attribute

params_only = parsedData.map(
    lambda x: Vectors.dense(np.float_(x[0:(data_row - 1)])))
#params_only.take(5)
#the type of params_only is pyspark.rdd.PipelinedRDD
#params_only=parsedData.map(lambda x: array(np.float_(x[0:(data_row-1)])))

model_test = PCAmllib(2).fit(params_only)
transformed = model_test.transform(params_only)
#transformed.collect()

pca_2d = transformed.collect()
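# A minimal sketch (assuming matplotlib is available) of what the collected
# 2-D projection is typically used for: a local scatter plot of the two
# principal components.
import matplotlib.pyplot as plt
plt.scatter([v[0] for v in pca_2d], [v[1] for v in pca_2d])
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()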


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


k = 3
clusters = KMeans.train(params_only,
                        k,
                        maxIterations=100,
                        initializationMode='random')
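# A minimal sketch of the evaluation announced above: sum the per-point
# distances to the assigned cluster centers (WSSSE).
WSSSE = params_only.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))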
Example 5
from pyspark.mllib.feature import PCA as PCAmllib


def reduceDimensions(features_rdd):
    # Project an RDD of mllib Vectors onto its first two principal components
    model = PCAmllib(2).fit(features_rdd)
    transformed_rdd = model.transform(features_rdd)
    return transformed_rdd
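# A minimal usage sketch for the helper above, run on a tiny in-memory RDD of
# mllib Vectors (the 3-D points are made-up illustration data):
from pyspark.mllib.linalg import Vectors
demo_rdd = sc.parallelize([Vectors.dense([1.0, 2.0, 3.0]),
                           Vectors.dense([2.0, 0.0, 1.0]),
                           Vectors.dense([0.5, 1.5, 2.5])])
print(reduceDimensions(demo_rdd).collect())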
Example 6
from pyspark.mllib.linalg import Vectors


def buildRow(kv):
    # kv is a (key, values) pair as produced by groupByKey() below
    key, vals = kv[0], list(kv[1])
    m_source_list = [key[0], key[1], key[2]] + \
                    [v[1] for v in vals] + \
                    [v[2] for v in vals] + \
                    [v[3] for v in vals] + \
                    [v[4] for v in vals] + \
                    [v[5] for v in vals]
    return Vectors.dense(m_source_list)


# COMMAND ----------

m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
# parseInfo (defined elsewhere in the notebook) must yield (key, value) pairs
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t = newrows.first()
print(type(t), t)

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t = transformed.first()
print(type(t), t)
Example 7
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("linearSVC Example")\
        .getOrCreate()

    # $example on$
    # Load training data
    inputData = spark.read.format("libsvm") \
        .load("combined_data_svm.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # Reduce the features to 2 principal components (DataFrame-based ML API).
    pca = PCAml(k=2, inputCol="features", outputCol="pca")
    pcaModel = pca.fit(train)
    trainReduced = pcaModel.transform(train)
    testReduced = pcaModel.transform(test)

    # Fit a linear SVM on the reduced features and score it on test data.
    lsvc = LinearSVC(featuresCol="pca", maxIter=10, regParam=0.1)
    lsvcModel = lsvc.fit(trainReduced)
    predictions = lsvcModel.transform(testReduced)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # $example off$
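    # Assuming nothing else needs the session afterwards, release its resources.
    spark.stop()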