def run(self):
    """Fit a clustering model (GMM or KMeans) on a parquet feature table and
    write per-row cluster assignments to CSV.

    Reads all configuration from ``self.settings`` (tf_path, algorithm, seed,
    k, result_path, target), loads feature metadata from
    ``./feature_info.pickle``, and writes a single CSV file to ``result_path``.

    Raises:
        ValueError: if ``settings.algorithm`` is neither 'GMM' nor 'KMeans'.
    """
    tf_path = self.settings.tf_path
    algorithm = self.settings.algorithm
    seed = int(self.settings.seed)
    k = int(self.settings.k)
    result_path = self.settings.result_path
    target = self.settings.target

    spark = SparkSession.builder.getOrCreate()

    # Debug breadcrumb: record which SparkSession object this run used.
    # (Note: "===SeessionID===" typo is kept byte-identical — downstream
    # tooling may grep for it; fix in a coordinated change if desired.)
    with open("train_spark.txt", "w") as file:
        file.write("spark context" + str(spark.sparkContext))
        file.write("===SeessionID===")
        file.write(str(id(spark)))

    df = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .parquet(tf_path)
    # BUG FIX: DataFrames are immutable — repartition() returns a NEW
    # DataFrame. The original discarded the result, making the call a no-op.
    df = df.repartition(10)

    # MODELING
    if algorithm == 'GMM':
        gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(gmm.explainParams())
        print("=====" * 8)
        model = gmm.fit(df)
    elif algorithm == 'KMeans':
        kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(kmm.explainParams())
        print("=====" * 8)
        model = kmm.fit(df)
    else:
        raise ValueError("no alg")

    prediction = model.transform(df)

    # feature_info.pickle is produced by an earlier pipeline step —
    # assumed trusted local data (pickle is unsafe on untrusted input).
    with open("./feature_info.pickle", "rb") as handle:
        features_info = pickle.load(handle)

    # coalesce(1) forces a single output CSV part file.
    prediction.select(features_info["numeric_features"]
                      + features_info["category_features"]
                      + [target, 'prediction']).coalesce(1).write.mode(
        'overwrite').csv(result_path, header=True)
    print("Result file is successfully generated at: ", result_path)
# COMMAND ----------
# Inspect the (previously fitted) bisecting k-means / k-means models.
summary = bkmModel.summary
# FIX: converted Python-2 `print expr` statements to Python-3 print() calls;
# the originals are syntax errors under Python 3 (used elsewhere in this file).
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------
# Fit a 5-component Gaussian mixture model on `sales`.
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------
# Examine the fitted GMM: weights, per-component Gaussians, assignments.
summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------
# Text-feature prep: tokenize the Description column.
from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
# COMMAND ----------
# Inspect the (previously fitted) bisecting k-means / k-means models.
summary = bkmModel.summary
# FIX: converted Python-2 `print expr` statements to Python-3 print() calls;
# the originals are syntax errors under Python 3 (used elsewhere in this file).
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------
# Fit a 5-component Gaussian mixture model on `sales`.
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------
# Examine the fitted GMM: weights, per-component Gaussians, assignments.
summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------
selected = ["home_lat", "home_lon"] # Assemble the feature vector: from pyspark.ml.feature import VectorAssembler assembler = VectorAssembler(inputCols=selected, outputCol="features") assembled = assembler.transform(students) # ## Fit a Gaussian mixture model # Specify a Gaussian mixture model with two clusters: from pyspark.ml.clustering import GaussianMixture gm = GaussianMixture(featuresCol="features", k=2, seed=12345) # Examine the hyperparameters: print(gm.explainParams()) # Fit the Gaussian mixture model: gmm = gm.fit(assembled) type(gmm) # ## Examine the Gaussian mixture model # Examine the mixing weights: gmm.weights # Examine the (multivariate) Gaussian distributions: gmm.gaussiansDF.head(5) # Examine the model summary: