Exemple #1
0
 def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
           seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None):
     """
     Train a k-means clustering model on ``rdd``.

     Every point is converted with ``_convert_to_vector`` and the training
     itself is delegated to the ``trainKMeansModel`` backend function.

     :param rdd:
       Training points; each element is converted with
       ``_convert_to_vector``.
     :param k:
       Number of clusters, forwarded to the backend.
     :param maxIterations:
       Iteration limit, forwarded to the backend.
       (default: 100)
     :param runs:
       Forwarded to the backend unchanged.
       (default: 1)
     :param initializationMode:
       Name of the initialization algorithm, forwarded to the backend.
       (default: "k-means||")
     :param seed:
       Seed value, forwarded to the backend.
       (default: None)
     :param initializationSteps:
       Forwarded to the backend unchanged.
       (default: 5)
     :param epsilon:
       Convergence threshold, forwarded to the backend.
       (default: 1e-4)
     :param initialModel:
       Optional ``KMeansModel`` whose ``clusterCenters`` are handed to the
       backend as starting centers.
       (default: None)
     :return:
       A ``KMeansModel`` built from the trained centers, each converted
       with ``toArray()``.
     :raises Exception:
       If ``initialModel`` is given but is not a ``KMeansModel``.
     """
     if initialModel is None:
         # No starting centers supplied: the backend receives an empty list.
         starting_centers = []
     elif isinstance(initialModel, KMeansModel):
         starting_centers = [
             _convert_to_vector(center) for center in initialModel.clusterCenters
         ]
     else:
         found_type = str(type(initialModel))
         raise Exception(
             "initialModel is of " + found_type + ". It needs " "to be of <type 'KMeansModel'>"
         )

     points = rdd.map(_convert_to_vector)
     java_model = callMLlibFunc("trainKMeansModel", points, k, maxIterations, runs,
                                initializationMode, seed, initializationSteps, epsilon,
                                starting_centers)
     java_centers = callJavaFunc(rdd.context, java_model.clusterCenters)
     return KMeansModel([center.toArray() for center in java_centers])
Exemple #2
0
 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
     seed=None,
     initializationSteps=5,
     epsilon=1e-4,
     initialModel=None,
 ):
     """
     Train a k-means clustering model on ``rdd``.

     A warning is emitted when ``runs`` differs from 1. The points are
     converted with ``_convert_to_vector`` and training is delegated to the
     ``trainKMeansModel`` backend function.

     :param rdd:
       Training points; each element is converted with
       ``_convert_to_vector``.
     :param k:
       Number of clusters, forwarded to the backend.
     :param maxIterations:
       Iteration limit, forwarded to the backend.
       (default: 100)
     :param runs:
       Deprecated; any value other than 1 triggers a warning. Still
       forwarded to the backend.
       (default: 1)
     :param initializationMode:
       Name of the initialization algorithm, forwarded to the backend.
       (default: "k-means||")
     :param seed:
       Seed value, forwarded to the backend.
       (default: None)
     :param initializationSteps:
       Forwarded to the backend unchanged.
       (default: 5)
     :param epsilon:
       Convergence threshold, forwarded to the backend.
       (default: 1e-4)
     :param initialModel:
       Optional ``KMeansModel`` whose ``clusterCenters`` are handed to the
       backend as starting centers.
       (default: None)
     :return:
       A ``KMeansModel`` built from the trained centers, each converted
       with ``toArray()``.
     :raises Exception:
       If ``initialModel`` is given but is not a ``KMeansModel``.
     """
     if runs != 1:
         deprecation_note = (
             "Support for runs is deprecated in 1.6.0. This param will have no effect in 1.7.0."
         )
         warnings.warn(deprecation_note)

     # Starting centers stay empty unless a valid initial model is supplied.
     seed_centers = []
     if initialModel is not None:
         if isinstance(initialModel, KMeansModel):
             for center in initialModel.clusterCenters:
                 seed_centers.append(_convert_to_vector(center))
         else:
             raise Exception("initialModel is of " +
                             str(type(initialModel)) + ". It needs "
                             "to be of <type 'KMeansModel'>")

     vectors = rdd.map(_convert_to_vector)
     trained = callMLlibFunc(
         "trainKMeansModel",
         vectors,
         k,
         maxIterations,
         runs,
         initializationMode,
         seed,
         initializationSteps,
         epsilon,
         seed_centers,
     )
     trained_centers = callJavaFunc(rdd.context, trained.clusterCenters)
     return KMeansModel([center.toArray() for center in trained_centers])
Exemple #3
0
 def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
           seed=None, initializationSteps=5, epsilon=1e-4):
     """
     Train a k-means clustering model on ``rdd``.

     The points are converted with ``_convert_to_vector`` and training is
     delegated to the ``trainKMeansModel`` backend function; all remaining
     arguments are forwarded to it positionally, in signature order.

     :param rdd:
       Training points; each element is converted with
       ``_convert_to_vector``.
     :param k:
       Number of clusters, forwarded to the backend.
     :param maxIterations:
       Iteration limit, forwarded to the backend.
       (default: 100)
     :param runs:
       Forwarded to the backend unchanged.
       (default: 1)
     :param initializationMode:
       Name of the initialization algorithm, forwarded to the backend.
       (default: "k-means||")
     :param seed:
       Seed value, forwarded to the backend.
       (default: None)
     :param initializationSteps:
       Forwarded to the backend unchanged.
       (default: 5)
     :param epsilon:
       Convergence threshold, forwarded to the backend.
       (default: 1e-4)
     :return:
       A ``KMeansModel`` built from the trained centers, each converted
       with ``toArray()``.
     """
     vectors = rdd.map(_convert_to_vector)
     tuning = (k, maxIterations, runs, initializationMode, seed, initializationSteps, epsilon)
     trained = callMLlibFunc("trainKMeansModel", vectors, *tuning)

     center_arrays = []
     for center in callJavaFunc(rdd.context, trained.clusterCenters):
         center_arrays.append(center.toArray())
     return KMeansModel(center_arrays)
Exemple #4
0
 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
     seed=None,
     initializationSteps=5,
     epsilon=1e-4,
 ):
     """
     Train a k-means clustering model on ``rdd``.

     :param rdd:
       Training points; each element is converted with
       ``_convert_to_vector`` before being sent to the backend.
     :param k:
       Number of clusters, forwarded to the backend.
     :param maxIterations:
       Iteration limit, forwarded to the backend.
       (default: 100)
     :param runs:
       Forwarded to the backend unchanged.
       (default: 1)
     :param initializationMode:
       Name of the initialization algorithm, forwarded to the backend.
       (default: "k-means||")
     :param seed:
       Seed value, forwarded to the backend.
       (default: None)
     :param initializationSteps:
       Forwarded to the backend unchanged.
       (default: 5)
     :param epsilon:
       Convergence threshold, forwarded to the backend.
       (default: 1e-4)
     :return:
       A ``KMeansModel`` built from the trained centers, each converted
       with ``toArray()``.
     """
     # Training is performed by the "trainKMeansModel" backend function.
     fitted = callMLlibFunc(
         "trainKMeansModel",
         rdd.map(_convert_to_vector),
         k,
         maxIterations,
         runs,
         initializationMode,
         seed,
         initializationSteps,
         epsilon,
     )
     fitted_centers = callJavaFunc(rdd.context, fitted.clusterCenters)
     as_arrays = [center.toArray() for center in fitted_centers]
     return KMeansModel(as_arrays)
Exemple #5
0
 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
 ):
     """
     Train a k-means clustering model on ``rdd``.

     :param rdd:
       Training points; each element is converted with
       ``_convert_to_vector`` and the result is turned into a cached Java
       object RDD.
     :param k:
       Number of clusters, forwarded to the backend.
     :param maxIterations:
       Iteration limit, forwarded to the backend.
       (default: 100)
     :param runs:
       Forwarded to the backend unchanged.
       (default: 1)
     :param initializationMode:
       Name of the initialization algorithm, forwarded to the backend.
       (default: "k-means||")
     :return:
       A ``KMeansModel`` built from the trained centers, each converted
       with ``toArray()``.
     """
     vectors = rdd.map(_convert_to_vector)
     # Keep the serialized data cached so the JVM avoids per-object overhead.
     cached_points = _to_java_object_rdd(vectors, cache=True)
     fitted = callMLlibFunc(
         "trainKMeansModel", cached_points, k, maxIterations, runs, initializationMode
     )
     fitted_centers = callJavaFunc(rdd.context, fitted.clusterCenters)
     return KMeansModel([center.toArray() for center in fitted_centers])
Exemple #6
0
    def train(cls, rdd, k, maxIterations=100, initializationMode="k-means||",
              seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):
        """
        Fit a k-means clustering model to the points in ``rdd``.

        :param rdd:
          `RDD` holding the training points, given either as `Vector`
          objects or as sequence types that can be converted to one.
        :param k:
          How many clusters to produce.
        :param maxIterations:
          Upper bound on the number of iterations.
          (default: 100)
        :param initializationMode:
          Which initialization algorithm to use: "random" or
          "k-means||".
          (default: "k-means||")
        :param seed:
          Random seed for initializing the clusters. With None, the seed
          is generated from the system time.
          (default: None)
        :param initializationSteps:
          Step count for the k-means|| initialization mode. An advanced
          setting; the default of 2 is almost always sufficient.
          (default: 2)
        :param epsilon:
          Convergence threshold: a center counts as converged once it
          moves less than this Euclidean distance, and iteration stops
          when that holds for every center.
          (default: 1e-4)
        :param initialModel:
          A KMeansModel supplying the initial cluster centers, used in
          place of the random or k-means|| initialization.
          (default: None)
        :return:
          A KMeansModel built from the trained cluster centers.
        :raises Exception:
          If ``initialModel`` is given but is not a KMeansModel.
        """
        if initialModel is None:
            # No starting centers supplied: the backend receives an empty list.
            starting_centers = []
        elif isinstance(initialModel, KMeansModel):
            starting_centers = [_convert_to_vector(center)
                                for center in initialModel.clusterCenters]
        else:
            found_type = str(type(initialModel))
            raise Exception("initialModel is of " + found_type + ". It needs "
                            "to be of <type 'KMeansModel'>")

        points = rdd.map(_convert_to_vector)
        java_model = callMLlibFunc(
            "trainKMeansModel",
            points,
            k,
            maxIterations,
            initializationMode,
            seed,
            initializationSteps,
            epsilon,
            starting_centers,
        )
        java_centers = callJavaFunc(rdd.context, java_model.clusterCenters)
        return KMeansModel([center.toArray() for center in java_centers])
Exemple #7
0
    def train(
        cls,
        rdd,
        k,
        maxIterations=100,
        runs=1,
        initializationMode="k-means||",
        seed=None,
        initializationSteps=5,
        epsilon=1e-4,
        initialModel=None,
    ):
        """
        Fit a k-means clustering model to the points in ``rdd``.

        :param rdd:
          `RDD` holding the training points, given either as `Vector`
          objects or as sequence types that can be converted to one.
        :param k:
          How many clusters to produce.
        :param maxIterations:
          Upper bound on the number of iterations.
          (default: 100)
        :param runs:
          How many runs to execute in parallel; the model that is best
          according to the cost function is returned. Deprecated in
          1.6.0: any value other than 1 triggers a warning.
          (default: 1)
        :param initializationMode:
          Which initialization algorithm to use: "random" or
          "k-means||".
          (default: "k-means||")
        :param seed:
          Random seed for initializing the clusters. With None, the seed
          is generated from the system time.
          (default: None)
        :param initializationSteps:
          Step count for the k-means|| initialization mode. An advanced
          setting; the default of 5 is almost always sufficient.
          (default: 5)
        :param epsilon:
          Convergence threshold: a center counts as converged once it
          moves less than this Euclidean distance, and iteration stops
          when that holds for every center.
          (default: 1e-4)
        :param initialModel:
          A KMeansModel supplying the initial cluster centers, used in
          place of the random or k-means|| initialization.
          (default: None)
        :return:
          A KMeansModel built from the trained cluster centers.
        :raises Exception:
          If ``initialModel`` is given but is not a KMeansModel.
        """
        if runs != 1:
            deprecation_note = (
                "Support for runs is deprecated in 1.6.0. This param will have no effect in 2.0.0.")
            warnings.warn(deprecation_note)

        # Starting centers stay empty unless a valid initial model is supplied.
        seed_centers = []
        if initialModel is not None:
            if isinstance(initialModel, KMeansModel):
                for center in initialModel.clusterCenters:
                    seed_centers.append(_convert_to_vector(center))
            else:
                raise Exception("initialModel is of "+str(type(initialModel))+". It needs "
                                "to be of <type 'KMeansModel'>")

        vectors = rdd.map(_convert_to_vector)
        trained = callMLlibFunc(
            "trainKMeansModel",
            vectors,
            k,
            maxIterations,
            runs,
            initializationMode,
            seed,
            initializationSteps,
            epsilon,
            seed_centers,
        )
        trained_centers = callJavaFunc(rdd.context, trained.clusterCenters)
        return KMeansModel([center.toArray() for center in trained_centers])
Exemple #8
0
def callBigDlFunc(bigdl_type, name, *args):
    """
    Call an API method of PythonBigDL.

    :param bigdl_type: Either "float" or "double"; selects
        ``PythonBigDL.ofFloat()`` or ``PythonBigDL.ofDouble()``.
    :param name: Name of the attribute to look up on the selected
        PythonBigDL instance.
    :param args: Positional arguments passed on to ``callJavaFunc``.
    :return: Whatever ``callJavaFunc`` returns for the looked-up API.
    :raises Exception: If ``bigdl_type`` is neither "float" nor "double".
    """
    sc = SparkContext.getOrCreate()

    # Reject unknown types before touching the JVM.
    if bigdl_type not in ("float", "double"):
        raise Exception("Not supported bigdl_type: %s" % bigdl_type)

    python_bigdl = sc._jvm.com.intel.analytics.bigdl.python.api.PythonBigDL
    if bigdl_type == "float":
        backend = python_bigdl.ofFloat()
    else:
        backend = python_bigdl.ofDouble()

    return callJavaFunc(sc, getattr(backend, name), *args)
Exemple #9
0
def callBigDlFunc(bigdl_type, name, *args):
    """
    Call an API method of PythonBigDL.

    The Spark context is obtained with ``SparkContext.getOrCreate()``; the
    attribute ``name`` is looked up on ``PythonBigDL.ofFloat()`` when
    ``bigdl_type`` is "float" and on ``PythonBigDL.ofDouble()`` when it is
    "double", then invoked through ``callJavaFunc`` with ``args``.

    Raises ``Exception`` for any other ``bigdl_type``.
    """
    sc = SparkContext.getOrCreate()
    is_float = bigdl_type == "float"
    if not is_float and bigdl_type != "double":
        raise Exception("Not supported bigdl_type: %s" % bigdl_type)

    bigdl_entry = sc._jvm.com.intel.analytics.bigdl.python.api.PythonBigDL
    instance = bigdl_entry.ofFloat() if is_float else bigdl_entry.ofDouble()
    java_api = getattr(instance, name)
    return callJavaFunc(sc, java_api, *args)
Exemple #10
0
def callBigDlFunc(bigdl_type, name, *args):
    """
    Call an API method of PythonBigDL.

    Each invoker provided by ``JavaCreator.instance(bigdl_type, gateway)``
    is tried in order. The first call that succeeds determines the return
    value. A failure whose message contains "does not exist" moves on to
    the next invoker; any other failure is re-raised immediately.

    :param bigdl_type: Passed to ``JavaCreator.instance``.
    :param name: Name of the method to look up on each invoker.
    :param args: Positional arguments passed on to ``callJavaFunc``.
    :return: Result of the first successful ``callJavaFunc`` call.
    :raises Exception: The last "does not exist" error when no invoker
        succeeds, or "Cannot find function: <name>" when there are no
        invokers to try.
    """
    gateway = _get_gateway()
    last_error = Exception("Cannot find function: %s" % name)
    invokers = JavaCreator.instance(bigdl_type, gateway).value
    for invoker in invokers:
        # hasattr(invoker, name) always returns True here, so the only way to
        # find out whether the method exists is to actually invoke it.
        try:
            outcome = callJavaFunc(getattr(invoker, name), *args)
        except Exception as exc:
            last_error = exc
            if "does not exist" not in str(exc):
                raise exc
            continue
        return outcome
    raise last_error
Exemple #11
0
def callBigDlFunc(bigdl_type, name, *args):
    """
    Call an API method of PythonBigDL.

    :param bigdl_type: Passed to ``JavaCreator.instance`` to select the
        Java-side instance.
    :param name: Name of the attribute to look up on that instance.
    :param args: Positional arguments passed on to ``callJavaFunc``.
    :return: Whatever ``callJavaFunc`` returns for the looked-up API.
    """
    creator = JavaCreator.instance(bigdl_type=bigdl_type)
    backend = creator.value
    spark_context = get_spark_context()
    return callJavaFunc(spark_context, getattr(backend, name), *args)
Exemple #12
0
    def train(cls, rdd, k, maxIterations=100, initializationMode="k-means||",
              seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None,
              distanceMeasure="euclidean"):
        """
        Fit a k-means clustering model to the points in ``rdd``.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            `RDD` holding the training points, given either as
            :py:class:`pyspark.mllib.linalg.Vector` objects or as sequence
            types that can be converted to one.
        k : int
            How many clusters to produce.
        maxIterations : int, optional
            Upper bound on the number of iterations.
            (default: 100)
        initializationMode : str, optional
            Which initialization algorithm to use: "random" or
            "k-means||".
            (default: "k-means||")
        seed : int, optional
            Random seed for initializing the clusters. With None, the seed
            is generated from the system time.
            (default: None)
        initializationSteps : int, optional
            Step count for the k-means|| initialization mode. An advanced
            setting; the default of 2 is almost always sufficient.
            (default: 2)
        epsilon : float, optional
            Convergence threshold: a center counts as converged once it
            moves less than this Euclidean distance, and iteration stops
            when that holds for every center.
            (default: 1e-4)
        initialModel : :py:class:`KMeansModel`, optional
            A KMeansModel supplying the initial cluster centers, used in
            place of the random or k-means|| initialization.
            (default: None)
        distanceMeasure : str, optional
            Distance measure the k-means algorithm uses.
            (default: "euclidean")

        Returns
        -------
        :py:class:`KMeansModel`
            Model built from the trained cluster centers.

        Raises
        ------
        TypeError
            If ``initialModel`` is given but is not a KMeansModel.
        """
        if initialModel is None:
            # No starting centers supplied: the backend receives an empty list.
            starting_centers = []
        elif isinstance(initialModel, KMeansModel):
            starting_centers = [_convert_to_vector(center)
                                for center in initialModel.clusterCenters]
        else:
            found_type = str(type(initialModel))
            raise TypeError("initialModel is of " + found_type + ". It needs "
                            "to be of <type 'KMeansModel'>")

        points = rdd.map(_convert_to_vector)
        java_model = callMLlibFunc("trainKMeansModel", points, k, maxIterations,
                                   initializationMode, seed, initializationSteps,
                                   epsilon, starting_centers, distanceMeasure)
        java_centers = callJavaFunc(rdd.context, java_model.clusterCenters)
        return KMeansModel([center.toArray() for center in java_centers])