import pickle

from pyspark.sql import SparkSession
from pyspark.ml.clustering import GaussianMixture, KMeans


def run(self):
    tf_path = self.settings.tf_path
    algorithm = self.settings.algorithm
    seed = int(self.settings.seed)
    k = int(self.settings.k)
    result_path = self.settings.result_path
    target = self.settings.target

    spark = SparkSession.builder.getOrCreate()
    with open("train_spark.txt", "w") as file:
        file.write("spark context" + str(spark.sparkContext))
        file.write("===SessionID===")
        file.write(str(id(spark)))

    # header/inferSchema are CSV options and have no effect on a Parquet read.
    df = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .parquet(tf_path)
    # repartition returns a new DataFrame; the result must be reassigned.
    df = df.repartition(10)

    # MODELING
    if algorithm == 'GMM':
        gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(gmm.explainParams())
        print("=====" * 8)
        model = gmm.fit(df)
    elif algorithm == 'KMeans':
        kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(kmm.explainParams())
        print("=====" * 8)
        model = kmm.fit(df)
    else:
        raise ValueError("Unsupported algorithm: {}".format(algorithm))

    prediction = model.transform(df)

    with open("./feature_info.pickle", "rb") as handle:
        features_info = pickle.load(handle)

    prediction.select(features_info["numeric_features"] +
                      features_info["category_features"] +
                      [target, 'prediction']) \
        .coalesce(1).write.mode('overwrite').csv(result_path, header=True)
    print("Result file is successfully generated at: ", result_path)
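# run() writes cluster assignments but never scores them. As a quick quality
# check, Spark 2.3+ ships ClusteringEvaluator, which computes a silhouette
# score from the prediction column. A minimal sketch, assuming it runs right
# after model.transform(df) inside run():
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="features",
                                predictionCol="prediction",
                                metricName="silhouette")
# Silhouette ranges from -1 to 1; values near 1 indicate dense,
# well-separated clusters.
print("silhouette:", evaluator.evaluate(prediction))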
# `va` is a VectorAssembler that builds the "features" column
# (defined in an earlier cell; see the next snippet for its definition).
sales = va.transform(spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/data/retail-data/by-day/*.csv")
    .limit(50)
    .coalesce(1)
    .where("Description IS NOT NULL"))
sales.cache()

# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------
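# Hardcoding setK(5) is arbitrary. A common way to choose k is an elbow
# sweep: fit one model per candidate k and watch where the training cost
# stops improving. A minimal sketch on the same cached `sales` data
# (summary.trainingCost requires Spark 2.4+ and replaces the deprecated
# computeCost):
for k in range(2, 9):
    model = KMeans().setK(k).setSeed(1).fit(sales)
    print(k, model.summary.trainingCost)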
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
    .setInputCols(["Quantity", "UnitPrice"])\
    .setOutputCol("features")
sales = va.transform(
    spark.read.format("csv").option("header", "true").option("inferSchema", "true")
    .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/*.csv")
    .limit(50).coalesce(1).where("Description IS NOT NULL"))
sales.cache()

# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans
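# The trailing import suggests the snippet continues with bisecting k-means.
# A minimal sketch of fitting it on the same data, mirroring the KMeans cell
# above (the variable names are assumptions):
bkm = BisectingKMeans().setK(5).setMaxIter(5)
print(bkm.explainParams())
bkmModel = bkm.fit(sales)

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points per cluster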
feature_info = {
    "numeric_features": numeric_features,
    "category_features": category_features
}

# MODELING
if algorithm == 'GMM':
    gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(gmm.explainParams())
    print("=====" * 8)
    model = gmm.fit(processed)
elif algorithm == 'KMeans':
    kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(kmm.explainParams())
    print("=====" * 8)
    model = kmm.fit(processed)
else:
    raise ValueError("Unsupported algorithm: {}".format(algorithm))

prediction = model.transform(processed)
prediction.select(
    feature_info["numeric_features"] + feature_info["category_features"] +
    [target, 'prediction']).coalesce(1).write.mode('overwrite').csv(
        result_path, header=True)
print("Result file is successfully generated at: ", result_path)

end = time.time()
elapsed = end - start  # `start` is recorded earlier in the script
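# The block above assumes a DataFrame named `processed` that already carries
# a "features" vector column. A hedged sketch of how it might be built from a
# raw DataFrame `df` (hypothetical name) with the listed numeric and
# categorical columns; OneHotEncoder accepts multiple columns in Spark 3.x:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

indexers = [StringIndexer(inputCol=c, outputCol=c + "_idx",
                          handleInvalid="keep")
            for c in category_features]
encoder = OneHotEncoder(inputCols=[c + "_idx" for c in category_features],
                        outputCols=[c + "_vec" for c in category_features])
assembler = VectorAssembler(
    inputCols=numeric_features + [c + "_vec" for c in category_features],
    outputCol="features")
processed = Pipeline(stages=indexers + [encoder, assembler]) \
    .fit(df).transform(df)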
# standardization may be superfluous in this case.
rides_standardized.describe("origin_lat", "origin_lon", "dest_lat", "dest_lon").show()

# Spark MLlib does not provide a transformer to unscale the features. In order
# to create meaningful plots below, we will proceed with unscaled features.

# ## Specify and fit a k-means model

# Use the `KMeans` class constructor to specify a k-means model:
from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=3)
type(kmeans)

# Use the `explainParams` method to get a full list of the arguments:
print(kmeans.explainParams())

# Use the `fit` method to fit the k-means model:
kmeans_model = kmeans.fit(rides_standardized)
type(kmeans_model)

# **Note:** Euclidean distance may not be appropriate in this case.

# ## Evaluate the k-means model

# Compute the cluster cost:
kmeans_model.computeCost(rides_standardized)

# **Note:** The value returned by `computeCost` is generally not informative
# on its own. It is more useful when comparing multiple clustering models.
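# There is no built-in inverse transformer, but if the features were scaled
# with StandardScaler, the fitted StandardScalerModel exposes the statistics
# it used, so cluster centers can be mapped back to original units by hand.
# A hedged sketch; `scaler_model` is the hypothetical fitted scaler that
# produced `rides_standardized`:
import numpy as np

std = np.array(scaler_model.std)
mean = np.array(scaler_model.mean)  # meaningful when withMean=True

for center in kmeans_model.clusterCenters():
    print(np.array(center) * std + mean)  # center in original units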
# In[62]:

from pyspark.ml.clustering import KMeans as MlKMeans

firstMlKMeans = MlKMeans(
    featuresCol="features",
    predictionCol="prediction",
    k=2,
    initMode="k-means||",
    maxIter=20)
type(firstMlKMeans)

# Model classes in the `pyspark.ml` package provide an `explainParams` method,
# which prints explanations of the model's parameters.

# In[63]:

print(firstMlKMeans.explainParams())

# Let's train the model.

# In[64]:

firstMlModel = firstMlKMeans.fit(ca1mlFeaturizedDF)
type(firstMlModel)

# In[65]:

firstMlModel.clusterCenters()
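# To see which cluster each row falls into, apply the fitted model back to
# the featurized DataFrame. A minimal sketch continuing the notebook above
# (the result name is hypothetical):
firstMlPredictionsDF = firstMlModel.transform(ca1mlFeaturizedDF)
firstMlPredictionsDF.groupBy("prediction").count().show()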