def k_means_transform(book_at, k=100, load_model=True):
    '''
    Input: attribute feature matrix of all books
    Output: transformed matrix including a cluster assignment column

    Clusters all books so that the later kNN step only has to search
    within a cluster instead of the full catalogue.
    '''
    if not load_model:
        ### k-means clustering ###
        # The data is too large for a direct kNN search, so cluster it first.
        from pyspark.ml.clustering import KMeans

        # Partition the books into k clusters to reduce the kNN computation.
        kmeans = KMeans(k=k, seed=42)
        model = kmeans.fit(book_at.select('features'))
        # model.save('k-means_model_001_10')
    else:
        from pyspark.ml.clustering import KMeansModel
        model = KMeansModel.load('hdfs:/user/yw2115/k-means_model_001')

    # Add the cluster column to the original attribute matrix.
    transformed = model.transform(book_at)
    transformed = transformed.withColumnRenamed("prediction", "cluster")
    # transformed.show(3)
    return transformed
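# Usage sketch (hypothetical, not part of the original snippet): `book_at` is
# assumed to be a DataFrame with a 'features' vector column. The kNN step then
# only searches among books in the same cluster.
book_clusters = k_means_transform(book_at, k=100, load_model=False)
book_clusters.select('features', 'cluster').show(5)

# e.g. candidate neighbours for a book are drawn from its own cluster only
candidates = book_clusters.filter(book_clusters.cluster == 7)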
def useModel(path):
    model_path = '/home/hadoop/PycharmProjects/SparkMlib/model/kmeans'
    df = createDataframeKMeans(path)
    df = df.where(df.TotalFee != '0').where(df.DiseaseCode == '24495')
    df = df.withColumn("Age", df.Age.cast(IntegerType())) \
        .withColumn("TotalFee", df.TotalFee.cast(FloatType()))
    data = df.drop("DiseaseCode")
    data.show()

    # Assemble the feature vector.
    featureCreator = VectorAssembler(inputCols=data.columns[1:], outputCol='feature')
    data = featureCreator.transform(data)

    model = KMeansModel.load(model_path)

    # Assign each record to a cluster.
    test = model.transform(data)
    test.show()

    points = []
    for i in test.select("Age", "TotalFee", "prediction", "HosRegisterCode").collect():
        temp = [float(i['Age']), float(i['TotalFee']), int(i['prediction']), i['HosRegisterCode']]
        points.append(temp)

    centers = model.clusterCenters()
    centerPoints = []
    for i in centers:
        temp = [float(i[0]), float(i[1])]
        centerPoints.append(temp)

    getDistance(points, centerPoints)
def predict(bucket_name, feature_path, feature_name, output_path, plot_path):
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)

    # Load the existing model.
    model_path = output_path + "k-means.model"
    model = KMeansModel.load(model_path)

    # Read the CSV from S3 and store it locally.
    path = feature_path + feature_name  # used both locally and remotely: features/pca.csv
    s3 = boto3.resource('s3')
    s3.Object(bucket_name, path).download_file(path)
    df_spark = sqlCtx.read.csv(path, header=True, inferSchema=True)

    # DataFrame to RDD.
    vecAssembler = VectorAssembler(inputCols=df_spark.columns, outputCol="features")
    df_spark = vecAssembler.transform(df_spark)
    rdd = df_spark.rdd.map(lambda x: array(x["features"]))
    print(rdd.take(10))

    # From here: use the K-means model for prediction.
    data = model.transform(df_spark).toPandas()
    print(output_path + "pred-" + feature_name)
    data.to_csv(path_or_buf=(output_path + "pred-" + feature_name))
def loadModel(path):
    '''
    Load a model from path.
    Input: path
    Output: loaded model
    '''
    model = KMeansModel.load(path)
    return model
def add_high_low_flag(_data, original_features=True):
    # Add features.
    _data = create_KMeans_features(_data, original=original_features)

    # Add high-low predictions.
    high_low_classifier = KMeansModel.load('KMeans_model')
    _data = high_low_classifier.transform(_data)
    return _data
def cluster(player_profile):
    df = player_profile

    # Columns used for clustering -> features.
    FEATURES_COL = [
        'fouls', 'goals', 'own_goals', 'pass1', 'pass2', 'pass3', 'st1', 'st2', 'st3'
    ]
    for col in df.columns:
        if col in FEATURES_COL:
            # Cast every feature column to float.
            df = df.withColumn(col, df[col].cast('float'))
    df = df.na.drop()

    # Combine the feature columns into a single feature vector.
    vecAssembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
    df_kmeans = vecAssembler.transform(df).select('Id', 'features')

    k = 5  # number of clusters
    kmeans = KMeans().setK(k).setMaxIter(20).setSeed(1).setFeaturesCol("features")

    # If a saved model exists, use it; otherwise fit a new model and save it.
    model_path = "/home/revanth/Desktop/SEM5/BD/Big_Data_SEM5/PROJECT_FPL_ANALYTICS/" + "kmeans_model"
    try:
        model = KMeansModel.load(model_path)
        # print("loading saved model")
    except Exception:
        model = kmeans.fit(df_kmeans)  # this is the time-consuming part
        model.save(model_path)

    centers = model.clusterCenters()  # centroid of each cluster

    # Apply the model to the data, giving the cluster number for every Id.
    transformed = model.transform(df_kmeans).select('Id', 'prediction')
    rows = transformed.collect()
    df_pred = sqlContext.createDataFrame(rows)
    df_pred = df_pred.join(df, 'Id')

    # Mean rating for each cluster of players.
    ratings = df_pred.groupby('prediction').agg({
        'player_rating': 'mean'
    }).withColumnRenamed('avg(player_rating)', 'avg_player_rating').select(
        "prediction", "avg_player_rating")

    # Convert the ratings DataFrame to a dictionary.
    # ratings_di, udf_func and sqlContext are assumed to be defined at module level.
    for i in ratings.collect():
        ratings_di[i['prediction']] = i['avg_player_rating']

    # Players with fewer than 5 matches get their cluster's average rating.
    df_pred = df_pred.withColumn(
        "player_rating",
        when(df_pred.no_of_matches < 5,
             udf_func(df_pred.prediction)).otherwise(df_pred.player_rating))

    # Return the predictions with ratings.
    # print(df_pred.show())
    return df_pred
def load_kmeans_model(_model_dir):
    """Load the specified model."""
    if os.path.exists(_model_dir):
        print("Loading model from {} directory...".format(_model_dir))
        model = KMeansModel.load(_model_dir)
    else:
        print('Model {} not found.'.format(_model_dir))
        sys.exit(1)
    return model
def find_anomalies(points):
    global cur_model
    if cur_model is None:
        model_path = os.getcwd() + "/kmean_model"
        cur_model = KMeansModel.load(model_path)

    labels = cur_model.transform(points).select('prediction')
    points_array = np.asarray(points.collect())
    labels_array = np.asarray(labels.collect())

    results = list()
    for item, label in zip(points_array, labels_array):
        temp = list()
        temp.append(item[0][0])
        temp.append(item[0][1])
        temp.append(item[0][2])
        temp.append(label[0])
        results.append(temp)
    return results
def k_means():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    from pyspark.ml.linalg import Vectors
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = spark.createDataFrame(data, ["features"])

    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    centers = model.clusterCenters()
    len(centers)  # 2
    model.computeCost(df)  # 2.000...

    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction  # True
    rows[2].prediction == rows[3].prediction  # True

    model.hasSummary  # True
    summary = model.summary
    summary.k  # 2
    summary.clusterSizes  # [2, 2]

    temp_path = "./"
    kmeans_path = temp_path + "/kmeans"
    kmeans.save(kmeans_path)
    kmeans2 = KMeans.load(kmeans_path)
    kmeans2.getK()  # 2

    model_path = temp_path + "/kmeans_model"
    model.save(model_path)
    model2 = KMeansModel.load(model_path)
    model2.hasSummary  # False
    model.clusterCenters()[0] == model2.clusterCenters()[0]
    # array([ True,  True], dtype=bool)
    model.clusterCenters()[1] == model2.clusterCenters()[1]
# Load test data.
test_data = sc.textFile(golden_file)
parsed_test_data = test_data.map(
    kup.parse_as_binaryTuple).filter(lambda x: x[0] != -1.0)
parsed_test_data_df = spark.createDataFrame(parsed_test_data,
                                            ["label", "features"])

# Load the scaler model and perform feature scaling on the test data.
scalerModel = StandardScalerModel.load(scaler_model_path)
test_df_tmp = scalerModel.transform(parsed_test_data_df)
test_df = test_df_tmp.drop("features").withColumnRenamed(
    "scaledFeatures", "features")

# Load the k-means model.
best_model = KMeansModel.load(model_path)

start = time()

# Assign clusters to the test data.
predict_df = best_model.transform(test_df).select(
    col("label").alias("actualLabel"), "prediction")

# Assign a label to the test data according to the assigned clusters.
labelPredictedLabel = predict_df.join(
    cluster_label,
    cluster_label.prediction == predict_df.prediction).select(
        predict_df.actualLabel, cluster_label.label)
labelPredictedLabel.show(3)

testTime = time() - start
print("Test time: {} ".format(round(testTime, 3)))
pathVD = args["visualDictionaryPath"]
descriptorName = args["descriptor"]
output = args["output"]

# Estimate VLAD descriptors for the whole dataset.
print("estimating VLAD descriptors using " + descriptorName +
      " for dataset: /" + path + " and visual dictionary: /" + pathVD)

# with open(pathVD, 'rb') as f:
#     visualDictionary = pickle.load(f)

# xianjie: parallel version with Spark
spark = SparkSession\
    .builder\
    .appName("PythonKMeans")\
    .getOrCreate()
visualDictionary = KMeansModel.load(pathVD)

# Compute the VLAD descriptors.
descriptors = {"SURF": describeSURF, "SIFT": describeSIFT, "ORB": describeORB}
V, idImages = getVLADDescriptors(path, descriptors[descriptorName], visualDictionary)

# Output.
file = output + ".pickle"
with open(file, 'wb') as f:
    pickle.dump([idImages, V, path], f)
print("The VLAD descriptors are saved in " + file)
# plt.plot(centroid1[0], centroid1[1], 'ro')
# plt.annotate("Centroid 1", (centroid1[0], centroid1[1]))
# plt.plot(centroid2[0], centroid2[1], 'ro')
# plt.annotate("Centroid 2", (centroid2[0], centroid2[1]))
#
# plt.savefig("{}VS{}".format(xVar, yVar))

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession \
    .builder \
    .appName("ChessKMeans") \
    .config("spark.mongodb.input.uri",
            "mongodb://127.0.0.1/chess_data_testing.games?readPreference=primaryPreferred") \
    .getOrCreate()

model = KMeansModel.load("KMeansModel_final_both_norm")
df = spark.read.parquet("sample.parquet")

# Combine all normalized columns into one "features" column.
assembler = VectorAssembler(inputCols=[
    "w_attack_norm", "w_defend_norm", "b_attack_norm", "b_defend_norm",
    "evals_norm"
], outputCol="features")

testing = assembler.transform(df)
transformed = model.transform(testing).select('w_attack_norm', 'w_defend_norm',
                                              'b_attack_norm', 'b_defend_norm',
                                              'evals_norm', 'prediction')
dataDF = assembler.transform(dataDF)
# dataDF.show()

if mode == "training":
    # Split the dataset into training and testing sets.
    (training, testdata) = dataDF.randomSplit([0.7, 0.3], seed=5043)
    kmeans = KMeans().setK(k)
    model = kmeans.fit(training)

    # Predict the cluster each id belongs to.
    transformed = model.transform(testdata).withColumnRenamed(
        "prediction", "cluster_id")

    # Archive the old model.
    model_old = KMeansModel.load(modelpath)
    model_old.write().overwrite().save(modelpath_archives)
    logger.info(
        'Old Daily Clustering Bikes by location Model has been archived on the {}'
        .format(datetime.now()))

    ##### Save the new model.
    model.write().overwrite().save(modelpath)
    logger.info(
        'New Daily Clustering Bikes by location Model has been trained on the {}'
        .format(datetime.now()))

if mode == "predicting":
    model = KMeansModel.load(modelpath)
    logger.info(
        'Daily Clustering Bikes by location Started on the {} '.format(
            datetime.now()))
from pyspark.ml.linalg import DenseVector

# lis() is assumed to be a helper defined elsewhere that parses the stored
# feature string into a list of floats.
spark.read.load("test_fet.csv", format="csv", inferSchema="true", header="true").rdd \
    .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
    .toDF(["index", "file", "features"]) \
    .write.parquet("test_fet.parquet")

# Now get the Bag of Visual Words representation using the K-means model
# built on the training data.
from pyspark import StorageLevel
schema = spark.read.parquet("test_fet.parquet").persist(
    StorageLevel(True, True, False, False, 1))

import numpy as np
from pyspark.ml.clustering import KMeansModel
model = KMeansModel.load('KmeansModel')
P = np.load('P.npy')

predictions = model.transform(schema)
df = predictions.rdd \
    .map(lambda x: (x[2], x[0], DenseVector(np.matmul(np.array(x[0]), P.T)), x[1], x[3])) \
    .toDF(["Index", "Features", "Projections", "File", "VisualWords"])

# Then generate binary signatures for all test images.
tau = np.load('tau.npy')


def binsig(z, c, tau):
    return DenseVector((z > tau[c, :]))
def calc_error(rdd):
    now = datetime.now()
    data = rdd.toDF()
    output = transform_model.transform(data)
    predictions = model.transform(output)
    get_logger().info("Processing logfiles")
    # error() is assumed to compute the distance between a feature vector
    # and its assigned cluster centre; distances above 100.0 are logged.
    wssse = predictions.select(['endpoint', 'method', 'response_code', 'features', 'prediction'])\
        .rdd\
        .map(lambda line: (error(line.features, clusterCenters[line.prediction]),
                           line.response_code, line.endpoint, line.method))\
        .filter(lambda x: x[0] > 100.0)
    if wssse.count() > 0:
        for line in wssse.collect():
            get_logger().warning(line)
    return wssse


model = KMeansModel.load(MODEL_LOCATION)
transform_model = PipelineModel.load(TRANSFORM_MODEL_LOCATION)
clusterCenters = model.clusterCenters()

access_logs = ssc.socketTextStream(SOCKET_HOST, SOCKET_PORT)
struc_logs = access_logs.flatMap(lambda line: parse_apache_log_line(line))
struc_logs.pprint()

rc_dstream = struc_logs.map(lambda parsed_line: (parsed_line.response_code, 1))
rc_count = rc_dstream.reduceByKey(lambda x, y: x + y)
rc_count.pprint(num=30)

struc_logs.foreachRDD(calc_error)

ssc.start()
ssc.awaitTermination()
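# The error() helper used above is not included in the snippet. A minimal
# sketch of what it might look like, assuming it returns the squared Euclidean
# distance between a point and its assigned cluster centre (it would have to
# be defined before the streaming job starts):
import numpy as np


def error(features, center):
    # Hypothetical distance helper: squared Euclidean distance from the
    # point's feature vector to the given cluster centre.
    point = np.asarray(features)
    return float(np.sum((point - np.asarray(center)) ** 2))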
from pyspark.ml.clustering import KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .appName("Spark ML KMeans")\
        .getOrCreate()

    model = KMeansModel.load("KMEANSMODELDF")
    print("Model loaded")

    # Prepare test data.
    test = sparkSession.createDataFrame([
        (1, Vectors.dense([1.1, 3.2])),
        (2, Vectors.dense([5.1, 1.4])),
        (3, Vectors.dense([5.2, 2.0])),
        (4, Vectors.dense([1.0, 4.0]))], ["id", "features"])\
        .cache()

    for row in test.collect():
        print(row)

    prediction = model.transform(test)
    prediction.printSchema()
    prediction.show()
def load_model(self, path_to_model):
    """Load a K-Means model from path_to_model."""
    from pyspark.ml.clustering import KMeansModel

    model = KMeansModel.load(path_to_model)
    return model
# Skeleton for the API.
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([
    (1353, 1347),
], ['user', 'summa'])

va = VectorAssembler(inputCols=['user', 'summa'], outputCol="features")
modelka = KMeansModel.load('./models/clusters.model')
result = modelka.transform(
    va.transform(df1)).select('prediction').take(1)[0][0]

# Check whether the user is new.
# If it is an existing user, pull their data from Mongo.
# Otherwise spin up a mini-instance.
from pyspark.ml.clustering import KMeansModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .appName("Spark ML KMeans")\
        .getOrCreate()

    model = KMeansModel.load("KMEANSMODELML")
    print("Model loaded")

    # Prepare test data.
    test = sparkSession.createDataFrame([
        (1, Vectors.dense([1.1, 3.2])),
        (2, Vectors.dense([5.1, 1.4])),
        (3, Vectors.dense([5.2, 2.0])),
        (4, Vectors.dense([1.0, 4.0]))], ["id", "features"])\
        .cache()

    for row in test.collect():
        print(row)

    prediction = model.transform(test)
    prediction.printSchema()
    prediction.show()

    selected = prediction.select("id", "prediction")
tfIdf = PipelineModel.load(tfIdf_model_path)
dataset = tfIdf.transform(dataset)

# VectorAssembler
vector_assembler_output_path = "{}/data/vectorAssemblerModel.bin".format(base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

print('# Preprocessing models loaded')
print('> Loading KMeans')

# Classification
model_path = "{}/data/distanceKmeansRfModel.bin".format(base_path)
model = KMeansModel.load(model_path)
predictions = model.transform(dataset)
centers = model.clusterCenters()

# centroid(), distToCentroid() and anomalia() are helpers defined elsewhere.
vectorCent = F.udf(lambda k: centroid(k, centers), ArrayType(DoubleType()))
euclDistance = F.udf(lambda data, centroid: distToCentroid(data, centroid), FloatType())
detectAnom = F.udf(lambda prediction, distance: anomalia(prediction, distance, threshold, limit),
                   BooleanType())

predictions = predictions.withColumn('centroid', vectorCent(F.col('prediction')))
predictions = predictions.withColumn('distance', euclDistance(F.col('features'), F.col('centroid')))
predictions = predictions.withColumn('anomalia', detectAnom(F.col('prediction'), F.col('distance')))

print('# KMeans loaded')

only_predictions = predictions.select('version', 'timestamp', 'id', 'type', 'event', 'signal',
                                      'freq', 'mod', 'payload', 'time', 'anomalia', 'distance',
                                      'prediction')
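# The three helpers above are not shown in the snippet. A minimal sketch of
# what they might look like, assuming centroid() looks up the assigned cluster
# centre, distToCentroid() measures Euclidean distance, and anomalia() flags
# points farther from their centre than a threshold (names and signatures are
# taken from the calls above; the bodies, and the role of `limit`, are assumptions):
import math


def centroid(cluster_id, centers):
    # Return the centre of the assigned cluster as a plain list of floats.
    return [float(v) for v in centers[cluster_id]]


def distToCentroid(features, centroid):
    # Euclidean distance between a feature vector and its cluster centre.
    return float(math.sqrt(sum((f - c) ** 2 for f, c in zip(features, centroid))))


def anomalia(prediction, distance, threshold, limit):
    # Flag the point as anomalous when it lies farther from its centre than
    # the configured threshold (limit is ignored in this sketch).
    return distance > threshold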
def kmeans_inference(
    original_data,
    msg_col,
    id_col,
    w2v_model_path,
    tks_vec,
    ft_col,
    kmeans_mode,
    kmeans_model_path,
    pred_mode="static",
    new_cluster_thresh=None,
    k_list=[12, 16, 20],
    # update_model_path=None,
    distance="cosine",
    opt_initSteps=10,
    opt_tol=0.0001,
    opt_maxIter=30,
    log_path=None,
    n_cores=5,  # K_optim
    tr_initSteps=200,
    tr_tol=0.000001,
    tr_maxIter=100,  # train_kmeans
):
    """Perform inference on new error messages (note: only K-Means can be re-trained/updated).

    -- params:
    original_data (pyspark.sql.dataframe.DataFrame): data frame with at least error string and id columns
    msg_col (string): name of the error string column
    id_col (string): name of the message id column
    w2v_model_path (string): path from which to load the pre-trained word2vec model
    tks_vec (string): name of the word2vec representations column
    ft_col (string): name of the features column
    kmeans_mode ("load" or "train"): "load" uses a pre-trained model, while "train" performs online training
    kmeans_model_path (string): path to the pre-trained model (specify None for re-training)
    pred_mode ("static" or "update"): prediction mode: "static" does not allow creating new clusters
    new_cluster_thresh (float): distance threshold: if the closest centroid is farther than
        new_cluster_thresh, a new cluster is created for the new observation
    k_list (list): grid of K values to try
    distance ("euclidean" or "cosine"): distance measure for the k-means algorithm
    opt_initSteps (int): number of random initializations for k-means in the optimization phase
    opt_tol (float): tolerance for k-means convergence in the optimization phase
    opt_maxIter (int): maximum number of k-means iterations in the optimization phase
    n_cores (int): number of cores to use
    log_path (string): where to save optimization stats. Default None (no saving)
    tr_initSteps (int): number of random initializations for k-means in the training phase
    tr_tol (float): tolerance for k-means convergence in the training phase
    tr_maxIter (int): maximum number of k-means iterations in the training phase

    Returns:
    original_data (pyspark.sql.dataframe.DataFrame): the input data frame with an extra "prediction" column
    """
    from language_models import w2v_preproc
    from pyspark.ml.clustering import KMeansModel
    import time
    import datetime
    from pyspark.ml.evaluation import ClusteringEvaluator
    from pathlib import Path

    if kmeans_mode not in ["load", "train"]:
        print("""WARNING: invalid param "kmeans_mode".
              Specify either "load" to use a pre-trained model or "train" to train it online.""")
        return None

    original_data = w2v_preproc(original_data, msg_col, id_col, w2v_model_path)

    if kmeans_mode == "load":
        original_data = kmeans_preproc(original_data, tks_vec)
        kmeans_model = KMeansModel.load(kmeans_model_path)
    else:
        # K_optim(): train over a grid of K (number of clusters) values,
        # e.g. k_list = [12, 16, 20].
        res = K_optim(k_list,
                      dataset=original_data,
                      tks_vec=tks_vec,
                      ft_col=ft_col,
                      distance=distance,
                      initSteps=opt_initSteps,
                      tol=opt_tol,
                      maxIter=opt_maxIter,
                      n_cores=n_cores,
                      log_path=log_path)

        k_sil = get_k_best(res, "silhouette")

        if pred_mode == "update":
            save_mode = "overwrite"
            kmeans_model_path = "temp_ciccio"
        else:
            kmeans_model_path = None
            save_mode = "new"

        best_k_log_path = Path(log_path).parent / "best_K={}.txt".format(k_sil)

        original_data = kmeans_preproc(original_data, tks_vec)
        kmeans_model = train_kmeans(original_data,
                                    ft_col=ft_col,
                                    k=k_sil,
                                    distance=distance,
                                    initSteps=tr_initSteps,
                                    tol=tr_tol,
                                    maxIter=tr_maxIter,
                                    save_path=kmeans_model_path,
                                    mode=save_mode,
                                    log_path=best_k_log_path)
        # train_kmeans returns a dict; keep only the fitted model so both
        # branches hand the same type to kmeans_predict.
        kmeans_model = kmeans_model["model"]

    original_data = kmeans_predict(original_data,
                                   kmeans_model,
                                   pred_mode=pred_mode,
                                   new_cluster_thresh=None,
                                   update_model_path=kmeans_model_path)
    return original_data
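# Hypothetical invocation (not part of the original snippet): load a
# pre-trained k-means model and attach cluster predictions to new error
# messages. `errors_df`, the column names and the paths below are illustrative.
clustered = kmeans_inference(
    original_data=errors_df,        # DataFrame with the raw error strings
    msg_col="message",
    id_col="msg_id",
    w2v_model_path="models/w2v",    # illustrative paths
    tks_vec="message_vec",
    ft_col="features",
    kmeans_mode="load",
    kmeans_model_path="models/kmeans",
)
clustered.select("msg_id", "prediction").show(5)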
def _init_model(self):
    model_path = self._model_path
    if os.path.exists(model_path):
        self._kmeans_model = KMeansModel.load(model_path)
def __load_from_hdfs(self):
    sameModel = KMeansModel.load(self.hdfs_uri)
    print("k-means() - model loaded from uri {}".format(self.hdfs_uri))
    return sameModel