Example 1
    def __train_model(self):
        """Train the ALS model with the current dataset
        """
        logger.info("Training model 1...")
        kmeans_1 = KMeans().setK(9).setSeed(1)
        model_1 = kmeans_1.fit(self.df_cbg1)
        logger.info("Model 1 built!")
        logger.info("Evaluating the model 1...")
        self.predictions_1 = model_1.transform(self.df_cbg1)
        logger.info("Model 1 Done !")

        logger.info("Training model 2...")
        kmeans_2 = KMeans().setK(9).setSeed(1)
        model_2 = kmeans_2.fit(self.df_cbg2)
        logger.info("Model 2 built!")
        logger.info("Evaluating the model 2...")
        self.predictions_2 = model_2.transform(self.df_cbg2)
        logger.info("Model 2 Done !")

        logger.info("Training model 3...")
        kmeans_3 = KMeans().setK(9).setSeed(1)
        model_3 = kmeans_3.fit(self.df_cbg3)
        logger.info("Model 3 built!")
        logger.info("Evaluating the model 3...")
        self.predictions_3 = model_3.transform(self.df_cbg3)
        logger.info("Model 3 Done !")
Example 2
    def __train_model(self):

        logger.info("Training model 1...")
        kmeans_1 = KMeans().setK(9).setSeed(1)
        model_1 = kmeans_1.fit(self.df_crime1)
        logger.info("Model 1 built!")
        logger.info("Evaluating the model 1...")
        self.predictions_1 = model_1.transform(self.df_crime1)
        logger.info("Model 1 Done !")

        logger.info("Training model 2...")
        kmeans_2 = KMeans().setK(9).setSeed(1)
        model_2 = kmeans_2.fit(self.df_crime2)
        logger.info("Model 2 built!")
        logger.info("Evaluating the model 2...")
        self.predictions_2 = model_2.transform(self.df_crime2)
        logger.info("Model 2 Done !")

        logger.info("Training model 3...")
        kmeans_3 = KMeans().setK(9).setSeed(1)
        model_3 = kmeans_3.fit(self.df_crime3)
        logger.info("Model 3 built!")
        logger.info("Evaluating the model 3...")
        self.predictions_3 = model_3.transform(self.df_crime3)
        logger.info("Model 3 Done !")
Example 3
    def __train_model(self):
        """Train the model with the current dataset
        """
        logger.info("Splitting dataset into 3...")
        # Model 0: the first 1/3 of the data.
        # Model 1: the first 1/3 plus the second 1/3 of the data (first 2/3 overall).
        # Model 2: all of the data.
        self.df0 = self.dforiginal.limit(int(self.dataset_count / 3))
        self.df1 = self.dforiginal.limit(int(self.dataset_count * 2 / 3))
        self.df2 = self.dforiginal
        print('df 0 count = ' + str(self.df0.count()))
        print('df 1 count = ' + str(self.df1.count()))
        print('df 2 count = ' + str(self.df2.count()))
        logger.info("Dataset Splitted !")

        logger.info("Training model 0...")
        kmeans_0 = KMeans().setK(5).setSeed(1)
        model_0 = kmeans_0.fit(self.df0)
        self.predictions_0 = model_0.transform(self.df0)
        logger.info("Model 0 built!")
        logger.info("Evaluating the model 0...")
        evaluator_0 = ClusteringEvaluator()
        silhouette_0 = evaluator_0.evaluate(self.predictions_0)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_0))
        self.centers_0 = model_0.clusterCenters()
        logger.info("Model 0 Done !")

        logger.info("Training model 1...")
        kmeans_1 = KMeans().setK(5).setSeed(1)
        model_1 = kmeans_1.fit(self.df1)
        self.predictions_1 = model_1.transform(self.df1)
        logger.info("Model 1 built!")
        logger.info("Evaluating the model 1...")
        evaluator_1 = ClusteringEvaluator()
        silhouette_1 = evaluator_1.evaluate(self.predictions_1)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_1))
        self.centers_1 = model_1.clusterCenters()
        logger.info("Model 1 Done !")

        logger.info("Training model 2...")
        kmeans_2 = KMeans().setK(5).setSeed(1)
        model_2 = kmeans_2.fit(self.df2)
        self.predictions_2 = model_2.transform(self.df2)
        logger.info("Model 2 built!")
        logger.info("Evaluating the model 2...")
        evaluator_2 = ClusteringEvaluator()
        silhouette_2 = evaluator_2.evaluate(self.predictions_2)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_2))
        self.centers_2 = model_2.clusterCenters()
        logger.info("Model 2 Done !")
Example 4
def train(bucket_name, feature_path, feature_name, output_path, plot_path):
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)

    # read from s3 csv and store to local
    path = feature_path + feature_name  # used both locally and remotely: features/pca.csv
    s3 = boto3.resource('s3')
    s3.Object(bucket_name, path).download_file(path)
    df_spark = sqlCtx.read.csv(path, header=True, inferSchema=True)

    # Dataframe to rdd
    vecAssembler = VectorAssembler(inputCols=df_spark.columns,
                                   outputCol="features")
    df_spark = vecAssembler.transform(df_spark)
    rdd = df_spark.rdd.map(lambda x: array(x["features"]))
    print(rdd.take(10))
    # From here: K-means specific
    # Pick k
    cost = np.zeros(20)
    for k in range(2, 20):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        model = kmeans.fit(df_spark.sample(False, 0.5, seed=42))
        cost[k] = model.computeCost(df_spark)
    plt.figure(1)
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.savefig(plot_path + "k-means vary-k.png")

    # Train and upload model to s3
    k = 8
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_spark)

    model.write().overwrite().save(output_path +
                                   "k-means.model")  # save the model to s3

    data = model.transform(df_spark).toPandas()
    print(data.info())
    data.to_csv(output_path + "/transformed.csv")
    # #Plotting
    fig = plt.figure(2, figsize=(5, 5))
    plt.scatter(data["pca1"],
                data["pca2"],
                c=data["prediction"],
                s=30,
                cmap='viridis')
    plt.title("K Means (K=%d)" % k, fontsize=14)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.savefig(plot_path + "k-means-cluster.png")
Example 5
def hacker_test(spark, resources_folder):
    data = spark.read.csv(resources_folder + 'hack_data.csv',
                          header=True,
                          inferSchema=True)
    data.printSchema()
    data.show()
    print(data.columns)
    assembler = VectorAssembler(inputCols=[
        'Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
        'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
    ],
                                outputCol='features')
    data_assembled = assembler.transform(data)
    data_assembled.show()

    scaler = StandardScaler(inputCol='features', outputCol='scaledfeatures')
    scaler_model = scaler.fit(data_assembled)
    data_assembled_scaled = scaler_model.transform(data_assembled)
    data_assembled_scaled.show()

    data_assembled = data_assembled_scaled.select('scaledfeatures').withColumn(
        'features', data_assembled_scaled['scaledfeatures'])
    data_assembled.show()

    print(
        "************************************* with three clusters *************************************"
    )
    kmeans3 = KMeans(featuresCol='features', k=3, seed=10)
    model3 = kmeans3.fit(data_assembled)
    wssse3 = model3.summary.trainingCost
    print(wssse3)
    print(model3.clusterCenters())
    model3.summary.predictions.show()

    predictions3 = model3.summary.predictions
    predictions3.groupBy('prediction').count().show()
    # predictions3.agg({'prediction': 'count'}).show()

    print(
        "************************************* with two clusters *************************************"
    )
    kmeans2 = KMeans(featuresCol='features', k=2, seed=10)
    model2 = kmeans2.fit(data_assembled)
    wssse2 = model2.summary.trainingCost
    print(wssse2)
    print(model2.clusterCenters())
    model2.summary.predictions.show()

    predictions2 = model2.summary.predictions
    predictions2.groupBy('prediction').count().show()
Example 6
    def cluster_kmeans(self, k=22):
        # cost = np.zeros(20)
        # for k in range(2,20):
        # 	kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        # 	model = kmeans.fit(self.dataframe.sample(False,0.1, seed=42))
        # 	cost[k] = model.computeCost(self.dataframe)

        # fig, ax = plt.subplots(1,1, figsize =(8,6))
        # ax.plot(range(2,20),cost[2:20])
        # ax.set_xlabel('k')
        # ax.set_ylabel('cost')
        # fig.show()
        # time.sleep(20)
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(
            "features").setPredictionCol("kmeans_prediction")
        model = kmeans.fit(self.dataframe)
        centers = model.clusterCenters()
        try:
            model.save("kmeans-model" + str(k))
        except:
            model.write().overwrite().save("kmeans-model" + str(k))
        # print("Cluster Centers: ")
        # plt.plot(centers, '-o')
        # plt.show()
        self.dataframe = model.transform(self.dataframe)
Example 7
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    #LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
Example 8
def kmeans(df):
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    centers = model.clusterCenters()
    print(len(centers))
    kmFeatures = model.transform(df).select("features", "prediction")
    dfwrite(kmFeatures, 'kmFeatures')
Example 9
    def cluster(self):
        from pyspark.ml.clustering import KMeans
        from pyspark.ml.evaluation import ClusteringEvaluator

        # Loads data.
        dataset = self.read.format("libsvm").load(self.dataDir + "data/mllib/sample_kmeans_data.txt")

        # Trains a k-means model.
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(dataset)

        # Make predictions
        predictions = model.transform(dataset)

        # Evaluate clustering by computing Silhouette score
        evaluator = ClusteringEvaluator()

        silhouette = evaluator.evaluate(predictions)
        print("Silhouette with squared euclidean distance = " + str(silhouette))

        # Shows the result.
        centers = model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)
Example 10
def kmeans_scan(_data, _k_min=2, _k_max=6, _tmp_dir='tmp_models'):
    """Scan different kmeans model within the specified k range.
       The function assume that the input data are ready to be used and already contain the features column.
    """
    # Define the evaluator to find the optimal k. The evaluator compute the Siluhette score.
    evaluator = ClusteringEvaluator()

    # Dictionaries used to save the results obtained for the different values of k considered.
    silhuette_scores = {}
    centers = {}

    # If the temporary directory already exists it will be removed to create a fresh one.
    # Other ways of handling this case are possible, but they won't be considered here; the
    # extension to those cases is straightforward.
    if os.path.exists(_tmp_dir):
        shutil.rmtree(_tmp_dir)

    os.mkdir(_tmp_dir)

    # Fit and save the model for each k in the specified range
    for k in range(_k_min, _k_max + 1):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol('features')
        model = kmeans.fit(_data)
        transformed = model.transform(_data)
        silhuette_scores[k] = evaluator.evaluate(transformed)
        centers[k] = model.clusterCenters()
        model.save(os.path.join(_tmp_dir, "model_w_k_{}".format(k)))

    return centers, silhuette_scores
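A hedged usage sketch for kmeans_scan, assuming a DataFrame named assembled that already carries a 'features' column (e.g. produced by a VectorAssembler); picking the k with the highest silhouette is one reasonable way to consume the returned dictionaries:

centers, scores = kmeans_scan(assembled, _k_min=2, _k_max=8, _tmp_dir='tmp_models')

best_k = max(scores, key=scores.get)  # k with the highest silhouette score
print("best k = {}, silhouette = {}".format(best_k, scores[best_k]))
print("cluster centers for best k:", centers[best_k])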
Example 11
def main():
    spark = SparkSession.Builder().getOrCreate()
    # load dataset
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # dataset = spark.read.format('libsvm').json(datapath+'/data/business.json')

    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/business_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    dataset = spark.read.json(filename)
    print(dataset)

    # get longitude and latitude
    ll = dataset.select(dataset.categories[0], dataset.longitude,
                        dataset.latitude)
    ll = ll.withColumnRenamed('categories[0]', 'categories')

    ll.show()

    print(ll.schema.names)
    # for item in ll.schema.names:
    #   print(item)
    #   for item2 in item:
    #     print(item2)
    # sys.exit()  # leftover debug exit; commented out so the clustering below is reached
    # convert ll to dense vectors
    # data =ll.rdd.map(lambda x:(Vectors.dense(float(x[0]), float(x[1])),)).collect()
    assembler = VectorAssembler(inputCols=['longitude', 'latitude'],
                                outputCol='features')

    df = assembler.transform(ll)

    # set KMeans k and seed
    kmeans = KMeans(k=4, seed=1)

    # generate model
    model = kmeans.fit(df)

    # Make predictions
    predictions = model.transform(df)
    predictions.show(20)
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # number of location in each cluster
    print('Number of business in each cluster: ')
    predictions.groupBy('prediction').count().sort(desc('count')).show()

    # show in which cluster do we have more restaurants
    print('Number of restaurant per clusters')
    predictions.where(predictions.categories == 'Restaurants').groupBy(
        'prediction').count().sort(desc('count')).show()

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example 12
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='final_features_scaled', \
                                    metricName='silhouette', distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")

    kmeans_model = kmeans.fit(df)

    output = kmeans_model.transform(df)

    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))
    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]

    #     @pandas_udf(FloatType(), functionType=PandasUDFType.GROUPED_AGG)
    #     def _func_median(v):
    #         return v.median()
    #     expr_median = [_func_median(output[col]).alias(col+'_median') for col in numeric_features]
    #     df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    #     result = pd.merge(df_mean, df_median, on='cluster')
    return output, df_mean
Example 13
def ClusterWords(w2v, seqs):

	#to force each word to be a cluster center we use a trick
	#we train a kmeans model such that the number of clusters is equal to the number of words
	words = w2v.getVectors()

	


	words = words.join(broadcast(seqs), words.word == seqs.word).select(words.word.alias('word'), 'vector')
	words.cache()

	nwords = words.count()
	km = KMeans(featuresCol='vector', predictionCol='cluster', k=nwords)
	centers = km.fit(words)
	
	#create a dictionary of the words
	d = MakeDict(words, 'word', 'vector')

	old_words = words

	

	words = centers.transform(words) \
		.dropDuplicates(subset=['cluster']) \
		.withColumnRenamed('vector', 'centerVector')
		
	words.cache()
	words.show(10, False)
	


	return (words, d, centers)
Example 14
def kmeans(features, num_clusters):
    """Does clustering on the features dataset using KMeans clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters in a 'cluster' column
    """
    kmeans = KMeans(k=num_clusters,
                    featuresCol='features',
                    predictionCol='cluster')
    kmeans_model = kmeans.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()
    cluster_centers = kmeans_model.clusterCenters()
    clustered = clustered.rdd.map(
        lambda row: Row(distance=Vectors.squared_distance(
            cluster_centers[row['cluster']], row['features']),
                        **row.asDict())).toDF()
    clustered.show()
    print("=====Clustering Results=====")
    print("Clustering cost = ", kmeans_model.computeCost(features))
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
Example 15
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='final_features_scaled', \
                                    metricName='silhouette', distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")

    kmeans_model = kmeans.fit(df)

    output = kmeans_model.transform(df)

    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))
    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]

    expr_median = [
        F.expr('percentile({}, array(0.5))'.format(col))[0].alias(col + '_median')
        for col in final_features
    ]

    df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    return output, df_median, df_mean
Example 16
def q6(df):
    import pandas as pd
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.evaluation import ClusteringEvaluator

    vectors = VectorAssembler(inputCols=['start_lat', 'start_long'],
                              outputCol='features',
                              handleInvalid='skip')
    df_ = vectors.transform(df)

    kmeans = KMeans(k=308, seed=1)
    model = kmeans.fit(df_.select('features'))
    predictions = model.transform(df_)
    centers = model.clusterCenters()

    predictions.centers = pd.Series(centers)

    #     evaluator = ClusteringEvaluator()
    #     silhouette = evaluator.evaluate(predictions)
    #     print(f'Silhouette with squared euclidean distance = {str(silhouette)}')

    print('Cluster Centers: ')
    for center in centers:
        print(center)

    return predictions, centers
Example 17
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    #  DF as kmeans
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # Broadcast operation after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # Cluster vector is in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))


    cluster_frequency_feature_Rdd = clusterVectorRdd.map(partial(cluster_frequency_vector, cluster_count=cluster_count))

    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda pair: Row(pair[0], pair[1])).toDF()
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"), cluster_freqDF._2.alias("label"))

    return cluster_freq_featureDF
Example 18
def calculate_WSS(kmax, training):
    sse = []

    for k in range(2, kmax):

        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(training)

        centroids = model.clusterCenters()

        transformed = model.transform(training).select("features",
                                                       "prediction")
        curr_sse = 0
        train_col = training.collect()
        trans_col = transformed.collect()
        for i in range(len(train_col)):
            curr_center = centroids[trans_col[i].prediction]
            val = 0.0
            for cont_fet in range(len(train_col[i].features)):
                val += (train_col[i].features[cont_fet] -
                        curr_center[cont_fet])**2
            curr_sse += val

        sse.append(curr_sse)
    return sse
Example 19
    def Kmeans(self, dataframe):
        # We are going to use the elbow method to determine the best number of clusters
        # KMeans clustering model

        # In PySpark ML, KMeans needs a 'features' vector column
        assembler = VectorAssembler(inputCols=['latitude', 'longitude'],
                                    outputCol="features")
        dataframe = assembler.transform(dataframe)

        cost = np.zeros(20)
        for k in range(2, 20):
            kmeans = KMeans().setK(k).setSeed(1)
            model = kmeans.fit(dataframe)
            cost[k] = model.computeCost(dataframe)

        fig, ax = plot.subplots(1, 1, figsize=(8, 6))
        ax.plot(range(2, 20), cost[2:20])
        ax.set_xlabel('Number of Clusters')
        ax.set_ylabel('Score')
        ax.set_title("Elbow curve")

        centers = model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)
        return centers
Example 20
def kmeans(dictionary_path, filename_corpus, filename_gl, filename_label, num_of_species):
    dictionary = load_dictionary(dictionary_path)
    corpus = read_corpus(filename_corpus)
    GL = read_group(filename_gl)

    corpus_m = gensim.matutils.corpus2dense(corpus, len(dictionary.keys())).T

    SL = []
    kmer_group_dist = compute_dist(corpus_m, GL, SL, only_seed=False)

    df = pd.DataFrame(kmer_group_dist)

    spark = SparkSession.builder.appName("kmeans").getOrCreate()
    group_dist_df = spark.createDataFrame(df)

    df_columns = group_dist_df.schema.names

    vecAssembler = VectorAssembler(inputCols=df_columns, outputCol="features")
    new_df = vecAssembler.transform(group_dist_df)

    kmeans = KMeans(k=num_of_species, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))

    transformed = model.transform(new_df)
    transformed.select(["features", "prediction"]).show()  

    y_pred = transformed.select("prediction").rdd.flatMap(lambda x: x).collect()

    y_kmer_grp_cl = assign_cluster_2_reads(GL, y_pred)

    labels = read_labels(filename_label)

    prec, rcal = evalQuality(labels, y_kmer_grp_cl, n_clusters=num_of_species)

    return prec, rcal, spark, y_kmer_grp_cl
Example 21
def kmeans_usecase():
    spark = getSparkSession()
    schema = ''
    for i in range(65):
        schema = schema + '_c' + str(i) + ' DOUBLE' + ','
    schema = schema[:len(schema) - 1]
    df_train = spark.read.csv('../data/optdigits.tra', schema=schema)
    df_test = spark.read.csv('../data/optdigits.tes', schema=schema)
    cols = []
    for i in range(65):
        cols.append("_c" + str(i))
    df_train.head = cols
    df_test.head = cols
    assembler = VectorAssembler(inputCols=cols[:-1], outputCol="features")
    train_output = assembler.transform(df_train)
    test_output = assembler.transform(df_test)
    train_features = train_output.select("features").toDF('features')
    test_features = test_output.select("features").toDF('features')
    train_features.show(truncate=False)
    test_features.show(truncate=False)
    kmeans = KMeans().setK(10).setSeed(1)
    model = kmeans.fit(train_features)
    predictions = model.transform(test_features)

    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
Example 22
def kmeans(coordinates_list, spark):
    coordinates_list = [
        [float(coordinates[0]), float(coordinates[1])]
        for coordinates in coordinates_list
    ]
    df = spark.createDataFrame(coordinates_list, ["Longitude", "Latitude"])

    vecAssembler = VectorAssembler(
        inputCols=["Longitude", "Latitude"], outputCol="features"
    )
    new_df = vecAssembler.transform(df)

    silhouettes = []
    for k in range(2, 10):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(new_df.select("features"))
        predictions = model.transform(new_df)

        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        silhouettes.append([silhouette, predictions, k])

    _, predictions, k = max(silhouettes, key=lambda x: x[0])

    predictions.show()
    print(k)

    return predictions
Example 23
def get_clusters(df, num_clusters, max_iterations, initialization_mode, seed):
    # TODO:
    vecAssembler = VectorAssembler(inputCols=[
        "count1", "count2", "count3", "count4", "count5", "count6", "count7",
        "count8", "count9", "count10", "count11"
    ],
                                   outputCol="features")
    new_df = vecAssembler.transform(df)
    #new_df.show()

    kmeans = KMeans(k=num_clusters,
                    seed=seed,
                    maxIter=max_iterations,
                    initMode=initialization_mode)
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    #transformed.show()
    grouped = transformed.groupby('prediction').agg(F.collect_list('id'))
    mvv = grouped.select("collect_list(id)").rdd.flatMap(lambda x: x).collect()

    # Use the given data and the cluster pparameters to train a K-Means model
    # Find the cluster id corresponding to data point (a car)
    # Return a list of lists of the titles which belong to the same cluster
    # For example, if the output is [["Mercedes", "Audi"], ["Honda", "Hyundai"]]
    # Then "Mercedes" and "Audi" should have the same cluster id, and "Honda" and
    # "Hyundai" should have the same cluster id
    return mvv
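The comments above describe returning lists of titles rather than ids. A hedged sketch of that variant, assuming the input DataFrame also carries a 'title' column (not shown in this snippet):

    # Hypothetical variant: collect titles per cluster instead of ids.
    grouped_titles = transformed.groupby('prediction').agg(
        F.collect_list('title').alias('titles'))
    clusters = [row['titles'] for row in grouped_titles.collect()]
    # e.g. [["Mercedes", "Audi"], ["Honda", "Hyundai"]]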
Example 24
def get_uber_data():
    spark = SparkSession\
           .builder\
           .appName("Uber Dataset")\
           .getOrCreate()
    cluster_count = int(request.args.get('cluster_count'))
    dataset = spark.read.csv('uberdata.csv', inferSchema=True, header="True")
    assembler = VectorAssembler(inputCols=["Lat", "Lon"], outputCol="features")
    dataset = assembler.transform(dataset)
    (training, testdata) = dataset.randomSplit([0.7, 0.3], seed=5043)
    kmeans = KMeans().setK(cluster_count)
    model = kmeans.fit(dataset)
    transformed = model.transform(testdata).withColumnRenamed(
        "prediction", "cluster_id")
    transformed.createOrReplaceTempView("data_table")
    transformed.cache()
    centerList = list()
    cluster_centers = model.clusterCenters()
    count = int()
    for center in cluster_centers:
        centersIndList = list()
        centersIndList.append(format(center[0], '.8f'))
        centersIndList.append(format(center[1], '.8f'))
        centersIndList.append(count)
        centerList.append(centersIndList)
        count = count + 1
    centers = spark.createDataFrame(centerList)
    centers.createOrReplaceTempView("centers")
    resultsDFF = spark.sql(
        "SELECT centers._1 as Latitude, centers._2 as Longitude FROM data_table, centers WHERE data_table.cluster_id=centers._3"
    )
    data = resultsDFF.groupBy("Longitude", "Latitude").count()
    return jsonify(data.toJSON().collect())
Example 25
def k_means_transform(book_at, k=100, load_model=True):
    '''
    input: attribute feature matrix of all books
    output: transformed matrix including cluster assignment
    This function is used to cluster all books for faster calculation for knn later
    '''

    if load_model == False:

        ###k-means clustering###
        #Since the data is too big to do knn, first cluster them
        from pyspark.ml.clustering import KMeans
        kmeans = KMeans(
            k=k, seed=42
        )  # divide all books into k clusters (fewer pairwise comparisons for the knn step later)
        model = kmeans.fit(book_at.select('features'))
        #model.save('k-means_model_001_10')
    else:
        from pyspark.ml.clustering import KMeansModel
        model = KMeansModel.load('hdfs:/user/yw2115/k-means_model_001')

    #add the cluster col to original attribute matrix
    transformed = model.transform(book_at)
    transformed = transformed.withColumnRenamed("prediction", "cluster")
    #transformed.show(3)
    return transformed
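A hedged usage sketch, assuming book_at is a DataFrame with a 'features' vector column (as the function expects):

clustered_books = k_means_transform(book_at, k=100, load_model=False)

# Later lookups can be restricted to a single cluster to shrink the knn search space.
same_cluster = clustered_books.filter(clustered_books.cluster == 3)
same_cluster.show(5)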
Example 26
def basic_example(spark, resources_folder):
    data = spark.read.format('libsvm').load(resources_folder +
                                            'sample_kmeans_data.txt')
    data.printSchema()
    data.show()
    final_data = data.select(data['features'])
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(final_data)
    print(type(model))

    # Within Set Sum of Squared Errors (WSSSE)
    # ClusteringEvaluator
    # computeCost is deprecated; the values are now available on the summary
    wssse = model.summary
    print(type(wssse))
    wssse.predictions.show()
    print("Training Costs!!!!!")
    print(wssse.trainingCost)  # this replaces the deprecated model.computeCost(final_data)
    print(model.clusterCenters())

    data = spark.read.format('libsvm').load(resources_folder +
                                            'sample_kmeans_data.txt')
    data.printSchema()
    data.show()
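The comment above mentions ClusteringEvaluator without using it. A minimal hedged sketch of that evaluation, using the predictions DataFrame exposed by model.summary:

    from pyspark.ml.evaluation import ClusteringEvaluator

    # Silhouette with squared Euclidean distance on the training predictions.
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(model.summary.predictions)
    print("Silhouette = " + str(silhouette))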
Example 27
def train(k, path):
    # fileSave = "/home/hadoop/data_school/sparkMlib/KMeans"
    # # Male: 1, Female: 2
    # df = spark.read.format('csv').option('header', 'true').load(fileSave).fillna('0')
    df = createDataframeKMeans(path).fillna('0')
    df = df.where(df.TotalFee != '0').where(df.DiseaseCode == '13104')
    df = df.withColumn("Age", df.Age.cast(IntegerType())) \
        .withColumn("TotalFee", df.TotalFee.cast(FloatType()))

    # vecAss = VectorAssembler(inputCols=df.columns[2:], outputCol='feature')
    # data = vecAss.transform(df).select("feature")
    # data.show()
    data = df.drop("DiseaseCode")
    data.show()

    # Transform the data into a feature vector
    featureCreator = VectorAssembler(inputCols=data.columns[1:], outputCol='feature')
    data = featureCreator.transform(data)

    # Estimator
    kmeans = KMeans(k=k, featuresCol='feature')

    # Fit the model
    model = kmeans.fit(data)
    # Assign clusters
    test = model.transform(data)
    test.show()
    points = []
    for i in test.select("Age", "TotalFee", "prediction", "HosRegisterCode").collect():
        temp = [float(i['Age']), float(i['TotalFee']), int(i['prediction']), i['HosRegisterCode']]
        points.append(temp)

    centers = model.clusterCenters()
    model.save("/home/hadoop/PycharmProjects/SparkMlib/model/kmeans")
Example 28
class KMeansTrainer:
    def __init__(self, k=100):
        self.kmeans = KMeans().setK(k).setSeed(42).setFeaturesCol('features')

    @staticmethod
    def make_features(user_master: SparkDataFrame):
        """
    This method receives the user_master table with features 1 to 6 and returns a copy of the
    dataframe with an extra column called `features`, which is a column of sparse vectors with features 1 to 6 one-hot encoded.
    """
        df = user_master.select([f'feature{i}'
                                 for i in range(1, 7)] + ["user_id"])
        cols = df.columns

        categoricalColumns = [f'feature{i}' for i in range(1, 7)]

        stages = []
        for categoricalCol in categoricalColumns:
            stringIndexer = StringIndexer(inputCol=categoricalCol,
                                          outputCol=categoricalCol + 'Index')
            encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                                    outputCols=[categoricalCol + "classVec"])
            stages += [stringIndexer, encoder]

        #label_stringIdx = StringIndexer(inputCol = 'item_id', outputCol = 'label')
        #stages += [label_stringIdx]

        assemblerInputs = [c + "classVec" for c in categoricalColumns]
        assembler = VectorAssembler(inputCols=assemblerInputs,
                                    outputCol="features")
        stages += [assembler]

        pipeline = Pipeline(stages=stages)
        pipelineModel = pipeline.fit(df)
        df = pipelineModel.transform(df)
        selectedCols = ['features'] + cols
        df = df.select(selectedCols)
        #df.printSchema()

        return df

    def silhouete_score(self, data):
        """
    returns the silhouette score of data.
    """
        predictions = self.model.transform(data)
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        print(f"silhouette score: {silhouette:.4f}")
        return silhouette

    def fit(self, train, silhouette_score=False):
        """
    returns the silhouette score of the fitted data if silhouette_score is True. Default False
    """

        self.model = self.kmeans.fit(train)
        if silhouette_score:
            print("Done Fitting", end="\r")
            return self.silhouete_score(train)
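A hedged usage sketch for KMeansTrainer, assuming a user_master Spark DataFrame with columns feature1..feature6 and user_id (the names used in the docstring above):

features_df = KMeansTrainer.make_features(user_master)

trainer = KMeansTrainer(k=100)
trainer.fit(features_df, silhouette_score=True)  # prints and returns the silhouette score

# Assign a cluster to every user.
clustered = trainer.model.transform(features_df).select("user_id", "prediction")
clustered.show(5)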
Example 29
 def __find_cluster_split_kmeans_sparkdf(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
     from pyspark.ml.clustering import KMeans, BisectingKMeans
     start_time = time.time()
     #convert to spark df
     sqlContext = SQLContext(sc)
     spark_df = sqlContext.createDataFrame(df_norm)
     #assemble vector
     vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
     spark_df_clustering = vecAssembler.transform(spark_df).select('features')
     n_components_list = []
     n_range = np.arange(2, 20)
     for iteration in np.arange(n_iterations):
         cost = []
         for k in n_range:
             if kmeans_method == 'kmeans':
                 print("Kmeans Elbow Method K = ", k)
                 kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = kmeans.fit(spark_df_clustering)
             elif kmeans_method == 'bisecting_kmeans':
                 print("Bisecting Kmeans Elbow Method K = ", k)
                 bkm = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = bkm.fit(spark_df_clustering)
             cost.append(model.computeCost(spark_df_clustering)) # requires Spark 2.0 or later
         print('Cluster List: ', n_range)
         print('Within Set Sum of Squared Errors: ', cost)
         n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing', 'sum_of_square_error')
         print("Recommended no. of components by knee locator: " + str(n_split_knee))
         n_components_list.append(n_split_knee)
     n_components = int(np.median(n_components_list).round(0))
     print('Recommended median number of splits: ', n_components)
     print("elbow method time: ", time.time()-start_time, "(sec)")
     return n_components
Example 30
    def trainALS(self, ranks, iterations):
        for rank in ranks:
            als = ALS(rank=rank, maxIter=iterations, regParam=0.1, userCol="UserID", itemCol="MovieID",ratingCol="label")
            paramGrid = ParamGridBuilder().addGrid(als.rank,[rank]).build()
            crossval = CrossValidator(estimator=als,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=Remove_nan(metricName="rmse", labelCol="label",
                                      predictionCol="prediction"),
                                      numFolds=5)
            self.trainDf.show()
            cvModel = crossval.fit(self.trainDf)
            predictions = cvModel.transform(self.testDf)
            rmse = Remove_nan(metricName="rmse", labelCol="label",
                                        predictionCol="prediction").evaluate(predictions)
            print "****RMSE VALUE IS :*****", rmse
            movieFactors = cvModel.bestModel.itemFactors.orderBy('id').cache()
            movieFactors.show(truncate=False)
            convertToVectors = udf(lambda features: Vectors.dense(features), VectorUDT())
            movieFactors = movieFactors.withColumn("features", convertToVectors(movieFactors.features))
            kmeans = KMeans(k=50, seed=1)
            kModel = kmeans.fit(movieFactors)
            kmeansDF = kModel.transform(movieFactors)
            clusters = [1, 2]
            kmeansDF = kmeansDF.join(self.movieDf, kmeansDF.id == self.movieDf.MovieID).drop('MovieID')
            for cluster in clusters:
                movieNamesDf = kmeansDF.where(col("prediction") == cluster).select("MovieName")
                movieNamesDf.rdd.map(lambda row: row[0]).saveAsTextFile(outputDir + \
                                                                        "Rank" + str(rank) + "Cluster" + str(cluster))

        if __name__ == "__main__":
            mr = movieRecALS(inputDir + "/MovieLens100K_train.txt", inputDir + "/MovieLens100K_test.txt",
                             inputDir + "/u.item")
            ranks = [2, 4, 8, 16, 32, 64, 128, 256]
            iterations = 20
            mr.trainALS(ranks, iterations)
Example 31
 def test_kmeans_cosine_distance(self):
     data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),),
             (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),),
             (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine")
     model = kmeans.fit(df)
     result = model.transform(df).collect()
     self.assertTrue(result[0].prediction == result[1].prediction)
     self.assertTrue(result[2].prediction == result[3].prediction)
     self.assertTrue(result[4].prediction == result[5].prediction)
Example 32
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state','categories','stars',input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(predicted).transform(predicted).cache()
    return df
Example 33
def elbow(elbowset, clusters):
	wsseList = []	
	for k in clusters:
		print("Training for cluster size {} ".format(k))
		kmeans = KM(k = k, seed = 1)
		model = kmeans.fit(elbowset)
		transformed = model.transform(elbowset)
		featuresAndPrediction = transformed.select("features", "prediction")

		W = computeCost(featuresAndPrediction, model)
		print("......................WSSE = {} ".format(W))

		wsseList.append(W)
	return wsseList
Example 34
 def test_kmeans_summary(self):
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 1)
Example 35
 def test_kmean_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     km_path = path + "/km-pmml"
     model.write().format("pmml").save(km_path)
     pmml_text_list = self.sc.textFile(km_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example 36
def kmeans(inputdir, df, alg, k):
    from pyspark.ml.clustering import KMeans
    from numpy import array
    from math import sqrt
    kmeans = KMeans(k=int(k), seed=1, initSteps=5, tol=1e-4, maxIter=20,
                    initMode="k-means||", featuresCol="features")
    model = kmeans.fit(df)
    kmFeatures = model.transform(df).select("labels", "prediction")
    erFeatures = model.transform(df).select("features", "prediction")
    ### Evaluation: sum, over all points, of the Euclidean distance to the assigned cluster centre
    rows = erFeatures.collect()
    WSSSE = 0
    for i in rows:
        WSSSE += sqrt(sum([x**2 for x in (model.clusterCenters()[i[1]] - i[0])]))
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    output_data = writeOutClu(inputdir, kmFeatures, alg, k, WSSSE)
    return output_data
Example 37
def cluster():
    ld = load(open(DATAP+'\\temp\olangdict.json','r',encoding='UTF-8'))

    spark = SparkSession.builder\
                        .master("local")\
                        .appName("Word Count")\
                        .config("spark.some.config.option", "some-value")\
                        .getOrCreate()

    df = spark.createDataFrame([["0"],
                                ["1"],
                                ["2"],
                                ["3"],
                                ["4"]],
                               ["id"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
Example 38
t0 = time.time()
word2Vec = Word2Vec(vectorSize=100, minCount=5, stepSize=0.025, inputCol="text", outputCol="result")
modelW2V = word2Vec.fit(twDF)
wordVectorsDF = modelW2V.getVectors()
timeW2V = time.time() - t0

## Train K-means on top of the Word2Vec matrix:
t0 = time.time()
vocabSize = wordVectorsDF.count()
K = int(math.floor(math.sqrt(float(vocabSize)/2)))
         # K ~ sqrt(n/2) this is a rule of thumb for choosing K,
         # where n is the number of words in the model
         # feel free to choose K with a fancier algorithm         
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector','features')
kmeans = KMeans(k=K, seed=1)
modelK = kmeans.fit(dfW2V)
labelsDF = modelK.transform(dfW2V).select('prediction').withColumnRenamed('prediction','labels')
vocabSize = wordVectorsDF.count()
timeKmeans = time.time() - t0

sc.stop()


## Print Some Results
printResults = 1 # set t 
if (printResults):
    ## Read Tweets

    print "="*80
    print "Read Tweets..."
    print "Elapsed time (seconds) to read tweets as a data frame: ", timeReadTweets
Example 39

from pyspark.mllib.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark import SparkContext
from pyspark.sql import SQLContext

# sc = SparkContext(appName="test")
# sqlContext = SQLContext(sc)

data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),(Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = sqlContext.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)

centers = model.clusterCenters()
model.transform(df).select("features", "prediction").collect()

  .option("header", "true")
  .option("inferSchema", "true")
  .load("/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()


# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------
Example 41
def assign_cluster(data):
    """Train kmeans on rescaled data and then label the rescaled data."""
    kmeans = KMeans(k=2, seed=1, featuresCol="features_scaled", predictionCol="label")
    model = kmeans.fit(data)
    label_df = model.transform(data)
    return label_df
Example 42
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out","GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"])

dfhot.show(5)

# Training set
assembler = VectorAssembler(inputCols = list(set(dfhot.columns) | set(['stars','review_count'])), outputCol="features")
train = assembler.transform(dfhot)

# Kmeans set for 5 clusters
knum = 5
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0)
model = kmeans.fit(train)
print "Model Created!"

# See cluster centers:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
    
# Apply the clustering model to our data:
prediction = model.transform(train)
prediction.groupBy("cluster").count().orderBy("cluster").show()

# Look at the features of each cluster
customerCluster = {}
for i in range(0,knum):
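The loop above is truncated here. A hedged sketch of what its body might do, following the comment about looking at the features of each cluster (variable and column names are assumptions):

for i in range(0, knum):
    # Keep the rows of cluster i so its feature profile can be inspected later.
    customerCluster[i] = prediction.filter(prediction.cluster == i)
    print("Cluster " + str(i) + ": " + str(customerCluster[i].count()) + " rows")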
Example 43
df0 = tfs.analyze(df).cache()


mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0


ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
        "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
ta_1 = time.time()

tb_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=False)
tb_1 = time.time()

tc_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=True)
tc_1 = time.time()

mllib_dt = ta_1 - ta_0
tf_dt = tb_1 - tb_0
tf2_dt = tc_1 - tc_0

print("mllib:", mllib_dt, "tf+spark:",tf_dt, "tf:",tf2_dt)
Example 44
# COMMAND ----------

display(transformed)

# COMMAND ----------

# MAGIC %md
# MAGIC #### K-Means Visualized

# COMMAND ----------

modelCenters = []
iterations = [0, 2, 4, 7, 10, 20]
for i in iterations:
    kmeans = KMeans(k=3, seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    modelCenters.append(model.clusterCenters())

# COMMAND ----------

print('modelCenters:')
for centroids in modelCenters:
    print(centroids)

# COMMAND ----------

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

def prepareSubplot(xticks, yticks, figsize=(10.5, 6), hideLabels=False, gridColor='#999999',
Example 45
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example 46
#Place the means and std.dev values in a broadcast variable
bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(colStdDev)
csAuto = autoVector.map(centerAndScale)
#csAuto.collect()
#csAuto.foreach(println)
print(csAuto)

#Create Spark Data Frame
autoRows = csAuto.map(lambda f:Row(features=f))
sqlContext = SQLContext(sc)
autoDf = sqlContext.createDataFrame(autoRows)
autoDf.select("features").show(10)

kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(autoDf)
predictions = model.transform(autoDf)
predictions.collect()
# predictions.foreach(println)  # println is not defined in Python; commented out

#Plot the results in a scatter plot
unstripped = predictions.map(unstripData)
predList=unstripped.collect()
predPd = pd.DataFrame(predList)

# preparing to save the clustered data
list_current_gni_final_maped = current_gni_final_maped.collect()
list_current_gni_rdd = current_gni_rdd.collect()
list_predictions_pandas=predictions.toPandas()
list_predictions_temp=list_predictions_pandas.as_matrix()
# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)


# COMMAND ----------

from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
  .setK(20)\
  .setSeed(1)


# COMMAND ----------

kmModel = kmeans.fit(transformedTraining)


# COMMAND ----------

transformedTest = fittedPipeline.transform(testDataFrame)


# COMMAND ----------

from pyspark.sql import Row

spark.sparkContext.parallelize([Row(1), Row(2), Row(3)]).toDF()


# COMMAND ----------
    initMode="k-means||", maxIter=20)
type(firstMlKMeans)


# Model classes in the `pyspark.ml` package have an `explainParams` method, which prints explanations of the model parameters.

# In[63]:

print(firstMlKMeans.explainParams())


# Let's train the model.

# In[64]:

firstMlModel = firstMlKMeans.fit(ca1mlFeaturizedDF)
type(firstMlModel)


# In[65]:

firstMlModel.clusterCenters()


# Build a `Pipeline` sequence of stages from the `vecAssembler` and `kmeans` components.

# In[66]:

from pyspark.ml.pipeline import Pipeline

firstPipeline = Pipeline(stages=[vecAssembler, firstMlKMeans])