Example 1
 def write_split_urls_and_word_frequency_orders(
         split_urls_and_word_frequency_orders: DataFrame) -> None:
     split_urls_and_word_frequency_orders \
         .write \
         .parquet("out/" + get_configs()["dataset"]["name"] + "/" + get_configs()["parquet-filenames"][
         "split-urls-and-word-frequency-orders"],
                  mode="overwrite")
Example 2
 def write_clustered_urls(clustered_urls: DataFrame,
                          clustering_algorithm: str) -> None:
     clustered_urls.write.parquet(
         "out/" + get_configs()["dataset"]["name"] + "/" +
         get_configs()["parquet-filename-prefixes"]["clustered-urls-by"] +
         clustering_algorithm + ".parquet",
         mode="overwrite")
 def get_best_hyperparameters(self) -> dict:
     return hyperopt.fmin(
         fn=self.objective,
         space=self.search_space,
         algo=hyperopt.tpe.suggest,
         max_evals=get_configs()["hyperparameter-optimization"]
         ["number-of-evaluations"])
 def run(self, spark: SparkSession) -> None:
     self.spark = spark
     self.split_urls_and_word_frequency_orders = URL_IO.read_split_urls_and_word_frequency_orders(
         spark)
     logging.getLogger("run()").info("Running " + self.app_name + " for " +
                                     get_configs()["dataset"]["name"])
     best_hyperparameters = self.get_best_hyperparameters()
     logging.getLogger("run()").info(
         "best hyperparameters: " +
         str(hyperopt.space_eval(self.search_space, best_hyperparameters)))
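
get_best_hyperparameters() above passes self.search_space and self.objective to hyperopt.fmin, but neither is included in the listing. The sketch below shows one plausible shape for them; the tuned parameter names and the evaluate_clustering helper are assumptions for illustration, not the project's code.

import hyperopt
from hyperopt import hp

# Hypothetical sketch of a search space and objective for hyperopt.fmin;
# the parameters and evaluate_clustering are illustrative assumptions.
search_space = {
    "k": hp.choice("k", list(range(2, 31))),
    "distance_measure": hp.choice("distance_measure", ["euclidean", "cosine"]),
}

def objective(params: dict) -> dict:
    # fmin minimizes the returned loss, so a quality score such as the
    # silhouette is negated here.
    score = evaluate_clustering(params["k"], params["distance_measure"])
    return {"loss": -score, "status": hyperopt.STATUS_OK}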
Example 5
    app_name: str = "KMeansProfilerJob"

    def apply_additional_configs(self, builder: SparkSession.Builder) -> SparkSession.Builder:
        return builder

    def run(self, spark: SparkSession) -> None:
        logging.getLogger("KMeansProfilerJob.run()").info("started.")
        self.__urls_and_vectors = URLFeatureExtractor \
            .get_urls_and_vectors(spark, self.__window_size, self.__s, self.__additional_weight_function).cache()
        self.__number_of_samples = self.__urls_and_vectors.count()
        self.run_experiments()
        self.__urls_and_vectors.unpersist()
        self.log_profiling_results()

    def log_profiling_results(self) -> None:
        logger = "KMeansProfilerJob.log_profiling_results()"
        logging.getLogger(logger).info(str(self._number_of_experiments) + " experiments were run for K-means on " +
                                       str(self.__number_of_samples) + " samples.")
        logging.getLogger(logger).info("Mean execution time: " + str(self.get_mean_execution_time()))
        logging.getLogger(logger).info("Standard deviation of execution times: " +
                                       str(self.get_standard_deviation_of_execution_times()))

    def get_clusters(self) -> DataFrame:
        return KMeansJob \
            .get_model(self.__urls_and_vectors, self.__k, self.__distance_measure) \
            .transform(self.__urls_and_vectors)


if __name__ == "__main__":
    KMeansProfilerJob(get_configs()["profiler"]["number-of-experiments"], 10, "euclidean", 3, 5).start()
Example 6
        logging.getLogger("BisectingKMeansProfilerJob.run()").info("started.")
        self.__urls_and_vectors = URLFeatureExtractor \
            .get_urls_and_vectors(spark, self.__window_size, self.__s, self.__additional_weight_function).cache()
        self.__number_of_samples = self.__urls_and_vectors.count()
        self.run_experiments()
        self.__urls_and_vectors.unpersist()
        self.log_profiling_results()

    def log_profiling_results(self) -> None:
        logger = "BisectingKMeansProfilerJob.log_profiling_results()"
        logging.getLogger(logger).info(
            str(self._number_of_experiments) +
            " experiment had run for K-means on " +
            str(self.__number_of_samples) + " samples.")
        logging.getLogger(logger).info("Mean execution time: " +
                                       str(self.get_mean_execution_time()))
        logging.getLogger(
            logger).info("Standard deviation of execution times: " +
                         str(self.get_standard_deviation_of_execution_times()))

    def get_clusters(self) -> DataFrame:
        return BisectingKMeansJob \
            .get_model(self.__urls_and_vectors, self.__k, self.__distance_measure) \
            .transform(self.__urls_and_vectors)


if __name__ == "__main__":
    BisectingKMeansProfilerJob(
        get_configs()["profiler"]["number-of-experiments"], 10, "euclidean", 3,
        5).start()
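
Both profiler jobs rely on run_experiments(), get_mean_execution_time() and get_standard_deviation_of_execution_times(), which presumably live in a shared base class that is not reproduced here. The methods below are a minimal sketch of how such helpers could look, assuming one wall-clock duration is recorded per experiment; the ProfilerJob name and the implementation are assumptions, not the project's code.

import statistics
import time


class ProfilerJob:
    # Hypothetical base class; _number_of_experiments is assumed to be set
    # by the subclass constructors, and get_clusters() by the subclasses.

    def run_experiments(self) -> None:
        # Record one wall-clock duration per clustering experiment.
        self._execution_times = []
        for _ in range(self._number_of_experiments):
            start = time.perf_counter()
            self.get_clusters().count()  # force the clustering to be computed
            self._execution_times.append(time.perf_counter() - start)

    def get_mean_execution_time(self) -> float:
        return statistics.mean(self._execution_times)

    def get_standard_deviation_of_execution_times(self) -> float:
        return statistics.stdev(self._execution_times)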
Example 7
 def get_model(urls_and_vectors: DataFrame, k: int, distance_measure: str) -> KMeansModel:
     return KMeans() \
         .setMaxIter(get_configs()["clustering"]["max-iters"]).setK(k).setDistanceMeasure(distance_measure) \
         .setFeaturesCol("vector").setPredictionCol("cluster_id") \
         .fit(urls_and_vectors)
Example 8
 def read_urls(spark: SparkSession) -> DataFrame:
     return spark.read.csv(
         "data/" + get_configs()["dataset"]["unlabeled"]["filename"],
         header=True,
         schema=URL)
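
read_urls() passes a URL schema to spark.read.csv, but the schema definition is not included in the listing. The sketch below is only an assumption about its shape (a single string column named "url"), not the project's actual schema.

from pyspark.sql.types import StringType, StructField, StructType

# Hypothetical sketch of the URL schema used in Example 8; the single "url"
# column is an assumption.
URL = StructType([StructField("url", StringType(), nullable=False)])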
Example 9
 def read_labeled_urls() -> pd.DataFrame:
     return pd.read_csv("data/" +
                        get_configs()["dataset"]["labeled"]["filename"])
Example 10
 def read_clustered_urls(spark: SparkSession,
                         clustering_algorithm: str) -> DataFrame:
     return spark.read.parquet(
         "out/" + get_configs()["dataset"]["name"] + "/" +
         get_configs()["parquet-filename-prefixes"]["clustered-urls-by"] +
         clustering_algorithm + ".parquet")
Example 11
 def read_split_urls_and_word_frequency_orders(
         spark: SparkSession) -> DataFrame:
     return spark.read.parquet("out/" + get_configs()["dataset"]["name"] +
                               "/" + get_configs()["parquet-filenames"]
                               ["split-urls-and-word-frequency-orders"])
Example 12
 def read_word_vectors(spark: SparkSession, window_size: int) -> DataFrame:
     return spark.read.parquet(
         "out/" + get_configs()["dataset"]["name"] + "/" +
         get_configs()["parquet-filename-prefixes"]["word-vectors"] +
         str(window_size) + ".parquet")
Example 13
 def write_word_vectors(word_vectors: DataFrame, window_size: int) -> None:
     word_vectors.write.parquet("out/" + get_configs()["dataset"]["name"] +
                                "/word_vectors_" + str(window_size) +
                                ".parquet",
                                mode="overwrite")
Example 14
 def read_split_urls(spark: SparkSession) -> DataFrame:
     return spark.read.parquet(
         "out/" + get_configs()["dataset"]["name"] + "/" +
         get_configs()["parquet-filenames"]["split-url"])
Example 15
 def write_split_urls(urls: DataFrame) -> None:
     urls.write.parquet("out/" + get_configs()["dataset"]["name"] + "/" +
                        get_configs()["parquet-filenames"]["split-url"],
                        mode="overwrite")