def write_split_urls_and_word_frequency_orders(
        split_urls_and_word_frequency_orders: DataFrame) -> None:
    split_urls_and_word_frequency_orders \
        .write \
        .parquet("out/" + get_configs()["dataset"]["name"] + "/"
                 + get_configs()["parquet-filenames"][
                     "split-urls-and-word-frequency-orders"],
                 mode="overwrite")
def write_clustered_urls(clustered_urls: DataFrame,
                         clustering_algorithm: str) -> None:
    clustered_urls.write.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filename-prefixes"]["clustered-urls-by"]
        + clustering_algorithm + ".parquet",
        mode="overwrite")
def get_best_hyperparameters(self) -> dict:
    # Runs TPE-based search over the search space for the configured number
    # of evaluations and returns the best point found.
    return hyperopt.fmin(
        fn=self.objective,
        space=self.search_space,
        algo=hyperopt.tpe.suggest,
        max_evals=get_configs()["hyperparameter-optimization"][
            "number-of-evaluations"])
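# Hedged sketch of what self.search_space and self.objective might look like;
# both are defined elsewhere in this class, so the hyperparameter names,
# candidate values, and the placeholder loss below are illustrative
# assumptions, not the project's actual definitions (hyperopt is already
# imported in this module).
example_search_space: dict = {
    "k": hyperopt.hp.choice("k", list(range(2, 51))),
    "window_size": hyperopt.hp.choice("window_size", [3, 5, 7]),
}

def example_objective(params: dict) -> float:
    # hyperopt.fmin minimizes the return value, so a real objective would
    # cluster the URLs with params["k"] and params["window_size"] and return
    # a loss such as the negative silhouette score of the result.
    return 0.0  # placeholder loss for the sketch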
def run(self, spark: SparkSession) -> None:
    self.spark = spark
    self.split_urls_and_word_frequency_orders = \
        URL_IO.read_split_urls_and_word_frequency_orders(spark)
    logging.getLogger("run()").info(
        "Running " + self.app_name + " for "
        + get_configs()["dataset"]["name"])
    best_hyperparameters = self.get_best_hyperparameters()
    logging.getLogger("run()").info(
        "best hyperparameters: "
        + str(hyperopt.space_eval(self.search_space, best_hyperparameters)))
app_name: str = "KMeansProfilerJob"

def apply_additional_configs(
        self, builder: SparkSession.Builder) -> SparkSession.Builder:
    return builder

def run(self, spark: SparkSession) -> None:
    logging.getLogger("KMeansProfilerJob.run()").info("started.")
    self.__urls_and_vectors = URLFeatureExtractor \
        .get_urls_and_vectors(spark, self.__window_size, self.__s,
                              self.__additional_weight_function).cache()
    self.__number_of_samples = self.__urls_and_vectors.count()
    self.run_experiments()
    self.__urls_and_vectors.unpersist()
    self.log_profiling_results()

def log_profiling_results(self) -> None:
    logger = "KMeansProfilerJob.log_profiling_results()"
    logging.getLogger(logger).info(
        str(self._number_of_experiments)
        + " experiments were run for K-means on "
        + str(self.__number_of_samples) + " samples.")
    logging.getLogger(logger).info(
        "Mean execution time: " + str(self.get_mean_execution_time()))
    logging.getLogger(logger).info(
        "Standard deviation of execution times: "
        + str(self.get_standard_deviation_of_execution_times()))

def get_clusters(self) -> DataFrame:
    return KMeansJob \
        .get_model(self.__urls_and_vectors, self.__k, self.__distance_measure) \
        .transform(self.__urls_and_vectors)

if __name__ == "__main__":
    KMeansProfilerJob(get_configs()["profiler"]["number-of-experiments"],
                      10, "euclidean", 3, 5).start()
logging.getLogger("BisectingKMeansProfilerJob.run()").info("started.") self.__urls_and_vectors = URLFeatureExtractor \ .get_urls_and_vectors(spark, self.__window_size, self.__s, self.__additional_weight_function).cache() self.__number_of_samples = self.__urls_and_vectors.count() self.run_experiments() self.__urls_and_vectors.unpersist() self.log_profiling_results() def log_profiling_results(self) -> None: logger = "BisectingKMeansProfilerJob.log_profiling_results()" logging.getLogger(logger).info( str(self._number_of_experiments) + " experiment had run for K-means on " + str(self.__number_of_samples) + " samples.") logging.getLogger(logger).info("Mean execution time: " + str(self.get_mean_execution_time())) logging.getLogger( logger).info("Standard deviation of execution times: " + str(self.get_standard_deviation_of_execution_times())) def get_clusters(self) -> DataFrame: return BisectingKMeansJob \ .get_model(self.__urls_and_vectors, self.__k, self.__distance_measure) \ .transform(self.__urls_and_vectors) if __name__ == "__main__": BisectingKMeansProfilerJob( get_configs()["profiler"]["number-of-experiments"], 10, "euclidean", 3, 5).start()
def get_model(urls_and_vectors: DataFrame, k: int,
              distance_measure: str) -> KMeansModel:
    return KMeans() \
        .setMaxIter(get_configs()["clustering"]["max-iters"]) \
        .setK(k) \
        .setDistanceMeasure(distance_measure) \
        .setFeaturesCol("vector") \
        .setPredictionCol("cluster_id") \
        .fit(urls_and_vectors)
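# Hedged usage sketch for get_model: the toy vectors and app name below are
# invented for illustration; in the project the input DataFrame comes from
# URLFeatureExtractor.get_urls_and_vectors and already carries the "vector"
# feature column (elsewhere the method is invoked as KMeansJob.get_model).
if __name__ == "__main__":
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("kmeans-model-sketch").getOrCreate()
    toy_vectors = spark.createDataFrame(
        [(Vectors.dense([0.0, 0.0]),),
         (Vectors.dense([0.1, 0.1]),),
         (Vectors.dense([9.0, 9.0]),)],
        ["vector"])
    # fit a 2-cluster model and attach the "cluster_id" prediction column,
    # mirroring what the profiler jobs' get_clusters() methods do
    get_model(toy_vectors, 2, "euclidean").transform(toy_vectors).show()
    spark.stop()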
def read_urls(spark: SparkSession) -> DataFrame:
    return spark.read.csv(
        "data/" + get_configs()["dataset"]["unlabeled"]["filename"],
        header=True,
        schema=URL)
def read_labeled_urls() -> pd.DataFrame:
    return pd.read_csv(
        "data/" + get_configs()["dataset"]["labeled"]["filename"])
def read_clustered_urls(spark: SparkSession,
                        clustering_algorithm: str) -> DataFrame:
    return spark.read.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filename-prefixes"]["clustered-urls-by"]
        + clustering_algorithm + ".parquet")
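# Hedged usage sketch: "kmeans" is an assumed algorithm label; it has to match
# the string previously passed to write_clustered_urls so both helpers resolve
# the same Parquet path. "cluster_id" is the prediction column set by the
# clustering jobs in this project.
def example_cluster_size_report(spark: SparkSession) -> None:
    read_clustered_urls(spark, "kmeans") \
        .groupBy("cluster_id") \
        .count() \
        .show()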
def read_split_urls_and_word_frequency_orders(
        spark: SparkSession) -> DataFrame:
    return spark.read.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filenames"][
            "split-urls-and-word-frequency-orders"])
def read_word_vectors(spark: SparkSession, window_size: int) -> DataFrame:
    return spark.read.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filename-prefixes"]["word-vectors"]
        + str(window_size) + ".parquet")
def write_word_vectors(word_vectors: DataFrame, window_size: int) -> None:
    # Build the path from the same config prefix that read_word_vectors uses,
    # so the writer and reader always resolve the same Parquet location.
    word_vectors.write.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filename-prefixes"]["word-vectors"]
        + str(window_size) + ".parquet",
        mode="overwrite")
def read_split_urls(spark: SparkSession) -> DataFrame:
    return spark.read.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filenames"]["split-url"])
def write_split_urls(urls: DataFrame) -> None:
    urls.write.parquet(
        "out/" + get_configs()["dataset"]["name"] + "/"
        + get_configs()["parquet-filenames"]["split-url"],
        mode="overwrite")
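# Hedged illustration of the configuration shape the I/O helpers above
# dereference via get_configs(); only the keys are taken from those calls,
# while the dataset name and file names below are invented examples.
EXAMPLE_CONFIGS = {
    "dataset": {
        "name": "example-dataset",
        "unlabeled": {"filename": "unlabeled_urls.csv"},
        "labeled": {"filename": "labeled_urls.csv"},
    },
    "parquet-filenames": {
        "split-url": "split_urls.parquet",
        "split-urls-and-word-frequency-orders":
            "split_urls_and_word_frequency_orders.parquet",
    },
    "parquet-filename-prefixes": {
        "clustered-urls-by": "clustered_urls_by_",
        "word-vectors": "word_vectors_",
    },
}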