Example #1
    # Assumes module-level imports: numpy as np and
    # pyspark.sql.functions.col; KMeansPipeline, MinMaxPipeline,
    # udf_calculate_SED and ClusterEventModel are project-local helpers.
    @classmethod
    def train(cls, spark, data_store, **args):
        categorical_colnames = args["cat_colnames"]
        numerical_colnames = args["num_colnames"]
        sdf = args["sdf"]
        event_sdf = sdf
        # Fall back to the data store when no DataFrame is passed in.
        if sdf is None:
            event_sdf = data_store.read_spark_df_from_data_store(**args)

        kmean_preprocess_model = None
        kmean_postprocess_model = None

        # Train only when there is at least one event row.
        if not event_sdf.rdd.isEmpty():
            kmean_preprocess_model = KMeansPipeline.train(
                spark=spark,
                sdf=event_sdf,
                cat_colnames=categorical_colnames,
                num_colnames=numerical_colnames)

            # Cluster centers from the fitted KMeans stage, which is the
            # last stage of the preprocessing pipeline.
            centroids = np.array(
                kmean_preprocess_model.pipeline_model.stages[-1].clusterCenters())

            # Attach cluster assignments (and the assembled "features"
            # column) to every row; persist because the result is reused.
            result_cluster_sdf = kmean_preprocess_model.pipeline_model.transform(
                event_sdf).persist()

            # Score each row by its squared Euclidean distance (SED) to
            # the centroids.
            result_score_sdf = result_cluster_sdf.withColumn(
                "sed",
                udf_calculate_SED(centroids)(col("features"))).persist()

            # Fit a min-max scaler so the raw SED scores can be
            # normalized to a fixed range.
            kmean_postprocess_model = MinMaxPipeline.train(
                spark=spark,
                result_score_sdf=result_score_sdf,
                colnames=["sed"])

        return ClusterEventModel(kmeans_model=kmean_preprocess_model,
                                 minmax_model=kmean_postprocess_model)
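
The snippet does not define udf_calculate_SED. A minimal sketch of what it could look like, assuming it returns each row's squared Euclidean distance to the nearest centroid as a double (the helper name and the "features"/centroids inputs come from the example above; the exact distance definition is an assumption):

import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def udf_calculate_SED(centroids):
    # `centroids` is captured in a closure so Spark serializes it to the
    # executors together with the UDF.
    def sed(features):
        # `features` is a pyspark.ml Vector; toArray() yields a NumPy array.
        point = features.toArray()
        # Squared Euclidean distance to the nearest cluster center.
        return float(((centroids - point) ** 2).sum(axis=1).min())
    return udf(sed, DoubleType())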
Example #2
    # Same flow as Example #1, but keyed on "categorical_colnames" /
    # "numerical_colnames" and without a data-store fallback.
    @classmethod
    def train(cls, spark, **args):
        categorical_colnames = args["categorical_colnames"]
        numerical_colnames = args["numerical_colnames"]
        sdf = args["sdf"]
        entity_profile_sdf = sdf
        # count() triggers a full Spark job just to test for emptiness;
        # rdd.isEmpty(), as in Example #1, is the cheaper check.
        if sdf.count() > 0:
            kmean_preprocess_model = KMeansPipeline.train(
                spark=spark,
                sdf=entity_profile_sdf,
                cat_colnames=categorical_colnames,
                num_colnames=numerical_colnames)

            centroids = np.array(
                kmean_preprocess_model.pipeline_model.stages[-1].clusterCenters())

            result_cluster_sdf = kmean_preprocess_model.pipeline_model.transform(
                entity_profile_sdf).persist()

            result_score_sdf = result_cluster_sdf.withColumn(
                "sed",
                udf_calculate_SED(centroids)(col("features"))).persist()

            kmean_postprocess_model = MinMaxPipeline.train(
                spark=spark,
                result_score_sdf=result_score_sdf,
                colnames=["sed"])
        else:
            # No training data: return an empty model.
            kmean_preprocess_model = None
            kmean_postprocess_model = None

        return ClusterProfileModel(kmeans_model=kmean_preprocess_model,
                                   minmax_model=kmean_postprocess_model)
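
For context, a hypothetical call site for this train method might look like the following; the DataFrame contents and column names are made up, and an active SparkSession named spark is assumed:

# Illustrative only: the data and column names are invented.
profile_sdf = spark.createDataFrame(
    [("us", 3, 120.5), ("de", 1, 42.0)],
    ["country", "login_count", "avg_session_minutes"])

model = ClusterProfileModel.train(
    spark=spark,
    sdf=profile_sdf,
    categorical_colnames=["country"],
    numerical_colnames=["login_count", "avg_session_minutes"])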
Example #3
    # Assumes: from pyspark.ml import PipelineModel
    @classmethod
    def load(cls, spark, path):
        try:
            # Load the two persisted sub-pipelines from their
            # sub-directories under `path`.
            clustering_pipeline_model = PipelineModel.load(
                path + "/clustering_pipeline_model")
            kmeans_model = KMeansPipeline(pipeline_model=clustering_pipeline_model)
            scoring_pipeline_model = PipelineModel.load(
                path + "/scoring_pipeline_model")
            minmax_model = MinMaxPipeline(pipeline_model=scoring_pipeline_model)
        except Exception:
            # Missing or unreadable artifacts yield an empty model; a bare
            # `except:` would also swallow KeyboardInterrupt, so catch
            # Exception instead.
            kmeans_model = None
            minmax_model = None
        return ClusterEventModel(kmeans_model=kmeans_model,
                                 minmax_model=minmax_model)
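
load() implies a matching save counterpart that writes the two sub-pipelines under the directory names it expects. Such a method is not part of the snippet; a sketch of what it could look like, using the standard Spark ML save() persistence call:

    @classmethod
    def save(cls, model, path):
        # Hypothetical counterpart to load(): persist each fitted
        # sub-pipeline under the sub-directory that load() reads from.
        if model.kmeans_model is not None:
            model.kmeans_model.pipeline_model.save(
                path + "/clustering_pipeline_model")
        if model.minmax_model is not None:
            model.minmax_model.pipeline_model.save(
                path + "/scoring_pipeline_model")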