Example #1
0
    def train(data, algo, numClasses, categoricalFeaturesInfo,
              impurity, maxDepth, maxBins=100):
        """
        Train a DecisionTreeModel for classification or regression.

        :param data: Training data: RDD of LabeledPoint.
                     For classification, labels are integers
                     {0,1,...,numClasses-1}.
                     For regression, labels are real numbers.
        :param algo: "classification" or "regression"
        :param numClasses: Number of classes for classification.
        :param categoricalFeaturesInfo: Map from categorical feature index
                                        to number of categories.
                                        Any feature not in this map
                                        is treated as continuous.
        :param impurity: For classification: "entropy" or "gini".
                         For regression: "variance".
        :param maxDepth: Max depth of tree.
                         E.g., depth 0 means 1 leaf node.
                         Depth 1 means 1 internal node + 2 leaf nodes.
        :param maxBins: Number of bins used for finding splits at each node.
        :return: DecisionTreeModel
        """
        sc = data.context
        dataBytes = _get_unmangled_labeled_point_rdd(data)
        try:
            # Py4J needs the Python dict converted to a java.util.Map
            # before it can be passed to the JVM-side trainer.
            categoricalFeaturesInfoJMap = \
                MapConverter().convert(categoricalFeaturesInfo,
                                       sc._gateway._gateway_client)
            model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
                dataBytes._jrdd, algo,
                numClasses, categoricalFeaturesInfoJMap,
                impurity, maxDepth, maxBins)
        finally:
            # Always release the intermediate RDD: without the finally,
            # a failed conversion or training call skipped unpersist().
            dataBytes.unpersist()
        return DecisionTreeModel(sc, model)
Example #2
0
    def trainRegressor(data,
                       categoricalFeaturesInfo,
                       impurity="variance",
                       maxDepth=5,
                       maxBins=32):
        """
        Train a DecisionTreeModel for regression.

        :param data: Training data: RDD of LabeledPoint.
                     Labels are real numbers.
        :param categoricalFeaturesInfo: Map from categorical feature index
                                        to number of categories.
                                        Any feature not in this map
                                        is treated as continuous.
        :param impurity: Supported values: "variance"
        :param maxDepth: Max depth of tree.
                         E.g., depth 0 means 1 leaf node.
                         Depth 1 means 1 internal node + 2 leaf nodes.
        :param maxBins: Number of bins used for finding splits at each node.
        :return: DecisionTreeModel
        """
        sc = data.context
        dataBytes = _get_unmangled_labeled_point_rdd(data)
        try:
            # Py4J needs the Python dict converted to a java.util.Map
            # before it can be passed to the JVM-side trainer.
            categoricalFeaturesInfoJMap = \
                MapConverter().convert(categoricalFeaturesInfo,
                                       sc._gateway._gateway_client)
            # numClasses is passed as 0 for the "regression" algo.
            model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
                dataBytes._jrdd, "regression", 0, categoricalFeaturesInfoJMap,
                impurity, maxDepth, maxBins)
        finally:
            # Always release the intermediate RDD: without the finally,
            # a failed conversion or training call skipped unpersist().
            dataBytes.unpersist()
        return DecisionTreeModel(sc, model)
Example #3
0
    def trainRegressor(data, categoricalFeaturesInfo,
                       impurity="variance", maxDepth=5, maxBins=32):
        """
        Train a DecisionTreeModel for regression.

        :param data: Training data: RDD of LabeledPoint.
                     Labels are real numbers.
        :param categoricalFeaturesInfo: Map from categorical feature index
                                        to number of categories.
                                        Any feature not in this map
                                        is treated as continuous.
        :param impurity: Supported values: "variance"
        :param maxDepth: Max depth of tree.
                         E.g., depth 0 means 1 leaf node.
                         Depth 1 means 1 internal node + 2 leaf nodes.
        :param maxBins: Number of bins used for finding splits at each node.
        :return: DecisionTreeModel
        """
        sc = data.context
        dataBytes = _get_unmangled_labeled_point_rdd(data)
        try:
            # The categorical-feature dict must become a Java map before
            # it can cross the Py4J gateway.
            categoricalFeaturesInfoJMap = \
                MapConverter().convert(categoricalFeaturesInfo,
                                       sc._gateway._gateway_client)
            # The third argument (numClasses) is fixed at 0 for regression.
            model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
                dataBytes._jrdd, "regression",
                0, categoricalFeaturesInfoJMap,
                impurity, maxDepth, maxBins)
        finally:
            # Run unpersist() on every exit path; previously an exception
            # during conversion or training left the RDD un-released.
            dataBytes.unpersist()
        return DecisionTreeModel(sc, model)
Example #4
0
File: tree.py Project: Filix/spark
    def trainClassifier(data, numClasses, categoricalFeaturesInfo,
                        impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                        minInfoGain=0.0):
        """
        Train a DecisionTreeModel for classification.

        :param data: Training data: RDD of LabeledPoint.
                     Labels are integers {0,1,...,numClasses-1}.
        :param numClasses: Number of classes for classification.
        :param categoricalFeaturesInfo: Map from categorical feature index
                                        to number of categories.
                                        Any feature not in this map
                                        is treated as continuous.
        :param impurity: Supported values: "entropy" or "gini"
        :param maxDepth: Max depth of tree.
                         E.g., depth 0 means 1 leaf node.
                         Depth 1 means 1 internal node + 2 leaf nodes.
        :param maxBins: Number of bins used for finding splits at each node.
        :param minInstancesPerNode: Min number of instances required at child nodes to create
                                    the parent split
        :param minInfoGain: Min info gain required to create a split
        :return: DecisionTreeModel
        """
        sc = data.context
        dataBytes = _get_unmangled_labeled_point_rdd(data)
        try:
            # Py4J needs the Python dict converted to a java.util.Map
            # before it can be passed to the JVM-side trainer.
            categoricalFeaturesInfoJMap = \
                MapConverter().convert(categoricalFeaturesInfo,
                                       sc._gateway._gateway_client)
            model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
                dataBytes._jrdd, "classification",
                numClasses, categoricalFeaturesInfoJMap,
                impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
        finally:
            # Always release the intermediate RDD: without the finally,
            # a failed conversion or training call skipped unpersist().
            dataBytes.unpersist()
        return DecisionTreeModel(sc, model)
Example #5
0
    def train(data, algo, numClasses, categoricalFeaturesInfo,
              impurity, maxDepth, maxBins=100):
        """
        Train a DecisionTreeModel for classification or regression.

        :param data: Training data: RDD of LabeledPoint.
                     For classification, labels are integers
                     {0,1,...,numClasses-1}.
                     For regression, labels are real numbers.
        :param algo: "classification" or "regression"
        :param numClasses: Number of classes for classification.
        :param categoricalFeaturesInfo: Map from categorical feature index
                                        to number of categories.
                                        Any feature not in this map
                                        is treated as continuous.
        :param impurity: For classification: "entropy" or "gini".
                         For regression: "variance".
        :param maxDepth: Max depth of tree.
                         E.g., depth 0 means 1 leaf node.
                         Depth 1 means 1 internal node + 2 leaf nodes.
        :param maxBins: Number of bins used for finding splits at each node.
        :return: DecisionTreeModel
        """
        sc = data.context
        dataBytes = _get_unmangled_labeled_point_rdd(data)
        try:
            # The categorical-feature dict must become a Java map before
            # it can cross the Py4J gateway.
            categoricalFeaturesInfoJMap = \
                MapConverter().convert(categoricalFeaturesInfo,
                                       sc._gateway._gateway_client)
            model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
                dataBytes._jrdd, algo,
                numClasses, categoricalFeaturesInfoJMap,
                impurity, maxDepth, maxBins)
        finally:
            # Run unpersist() on every exit path; previously an exception
            # during conversion or training left the RDD un-released.
            dataBytes.unpersist()
        return DecisionTreeModel(sc, model)
Example #6
0
    def train(cls, data, lambda_=1.0):
        """
        Fit a Naive Bayes model on an RDD of (label, features) vectors.

        The algorithm is Multinomial NB (U{http://tinyurl.com/lsdw6p}),
        which copes with any kind of discrete data.  Documents turned into
        TF-IDF vectors can be classified this way, and feeding it 0-1
        vectors makes it usable as Bernoulli NB
        (U{http://tinyurl.com/p7c96j6}).

        NOTE(review): the serialization helper used here is named for
        LabeledPoint RDDs, while the parameter description below speaks of
        NumPy vectors -- confirm which input form callers should supply.

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        @return: NaiveBayesModel built from the three arrays returned by
               the JVM trainer.
        """
        context = data.context
        serialized = _get_unmangled_labeled_point_rdd(data)
        api = context._jvm.PythonMLLibAPI()
        trained = api.trainNaiveBayes(serialized._jrdd, lambda_)
        # The JVM side answers with two serialized vectors followed by one
        # serialized matrix; decode them in that order.
        first_vector = _deserialize_double_vector(trained[0])
        second_vector = _deserialize_double_vector(trained[1])
        matrix = _deserialize_double_matrix(trained[2])
        return NaiveBayesModel(first_vector, second_vector, matrix)
Example #7
0
    def trainClassifier(data,
                        numClasses,
                        categoricalFeaturesInfo,
                        impurity="gini",
                        maxDepth=5,
                        maxBins=32,
                        minInstancesPerNode=1,
                        minInfoGain=0.0):
        """
        Train a DecisionTreeModel for classification.

        :param data: Training data: RDD of LabeledPoint.
                     Labels are integers {0,1,...,numClasses-1}.
        :param numClasses: Number of classes for classification.
        :param categoricalFeaturesInfo: Map from categorical feature index
                                        to number of categories.
                                        Any feature not in this map
                                        is treated as continuous.
        :param impurity: Supported values: "entropy" or "gini"
        :param maxDepth: Max depth of tree.
                         E.g., depth 0 means 1 leaf node.
                         Depth 1 means 1 internal node + 2 leaf nodes.
        :param maxBins: Number of bins used for finding splits at each node.
        :param minInstancesPerNode: Min number of instances required at child nodes to create
                                    the parent split
        :param minInfoGain: Min info gain required to create a split
        :return: DecisionTreeModel
        """
        sc = data.context
        dataBytes = _get_unmangled_labeled_point_rdd(data)
        try:
            # The categorical-feature dict must become a Java map before
            # it can cross the Py4J gateway.
            categoricalFeaturesInfoJMap = \
                MapConverter().convert(categoricalFeaturesInfo,
                                       sc._gateway._gateway_client)
            model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
                dataBytes._jrdd, "classification", numClasses,
                categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins,
                minInstancesPerNode, minInfoGain)
        finally:
            # Run unpersist() on every exit path; previously an exception
            # during conversion or training left the RDD un-released.
            dataBytes.unpersist()
        return DecisionTreeModel(sc, model)
    def train(cls, data, lambda_=1.0):
        """
        Fit a Naive Bayes model on an RDD of (label, features) vectors.

        The algorithm is Multinomial NB (U{http://tinyurl.com/lsdw6p}),
        which copes with any kind of discrete data.  Documents turned into
        TF-IDF vectors can be classified this way, and feeding it 0-1
        vectors makes it usable as Bernoulli NB
        (U{http://tinyurl.com/p7c96j6}).

        NOTE(review): the serialization helper used here is named for
        LabeledPoint RDDs, while the parameter description below speaks of
        NumPy vectors -- confirm which input form callers should supply.

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        @return: NaiveBayesModel built from the three arrays returned by
               the JVM trainer.
        """
        spark_ctx = data.context
        point_bytes = _get_unmangled_labeled_point_rdd(data)
        jvm_api = spark_ctx._jvm.PythonMLLibAPI()
        fitted = jvm_api.trainNaiveBayes(point_bytes._jrdd, lambda_)
        # Elements 0 and 1 are serialized vectors, element 2 a serialized
        # matrix; decode them in order before building the model.
        vector_a = _deserialize_double_vector(fitted[0])
        vector_b = _deserialize_double_vector(fitted[1])
        matrix_c = _deserialize_double_matrix(fitted[2])
        return NaiveBayesModel(vector_a, vector_b, matrix_c)