def train(data, algo, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins=100): """ Train a DecisionTreeModel for classification or regression. :param data: Training data: RDD of LabeledPoint. For classification, labels are integers {0,1,...,numClasses}. For regression, labels are real numbers. :param algo: "classification" or "regression" :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: For classification: "entropy" or "gini". For regression: "variance". :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :return: DecisionTreeModel """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) categoricalFeaturesInfoJMap = \ MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client) model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( dataBytes._jrdd, algo, numClasses, categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins) dataBytes.unpersist() return DecisionTreeModel(sc, model)
def trainRegressor(data, categoricalFeaturesInfo, impurity="variance", maxDepth=5, maxBins=32): """ Train a DecisionTreeModel for regression. :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: Supported values: "variance" :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :return: DecisionTreeModel """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) categoricalFeaturesInfoJMap = \ MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client) model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( dataBytes._jrdd, "regression", 0, categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins) dataBytes.unpersist() return DecisionTreeModel(sc, model)
def trainRegressor(data, categoricalFeaturesInfo, impurity="variance", maxDepth=5, maxBins=32): """ Train a DecisionTreeModel for regression. :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: Supported values: "variance" :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :return: DecisionTreeModel """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) categoricalFeaturesInfoJMap = \ MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client) model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( dataBytes._jrdd, "regression", 0, categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins) dataBytes.unpersist() return DecisionTreeModel(sc, model)
def trainClassifier(data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ Train a DecisionTreeModel for classification. :param data: Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: Supported values: "entropy" or "gini" :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :param minInstancesPerNode: Min number of instances required at child nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) categoricalFeaturesInfoJMap = \ MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client) model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( dataBytes._jrdd, "classification", numClasses, categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) dataBytes.unpersist() return DecisionTreeModel(sc, model)
def train(data, algo, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins=100): """ Train a DecisionTreeModel for classification or regression. :param data: Training data: RDD of LabeledPoint. For classification, labels are integers {0,1,...,numClasses}. For regression, labels are real numbers. :param algo: "classification" or "regression" :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: For classification: "entropy" or "gini". For regression: "variance". :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :return: DecisionTreeModel """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) categoricalFeaturesInfoJMap = \ MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client) model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( dataBytes._jrdd, algo, numClasses, categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins) dataBytes.unpersist() return DecisionTreeModel(sc, model)
def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) vectors. This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can handle all kinds of discrete data. For example, by converting documents into TF-IDF vectors, it can be used for document classification. By making every vector a 0-1 vector, it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). @param data: RDD of NumPy vectors, one per element, where the first coordinate is the label and the rest is the feature vector (e.g. a count vector). @param lambda_: The smoothing parameter """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes(dataBytes._jrdd, lambda_) return NaiveBayesModel( _deserialize_double_vector(ans[0]), _deserialize_double_vector(ans[1]), _deserialize_double_matrix(ans[2]) )
def trainClassifier(data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ Train a DecisionTreeModel for classification. :param data: Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: Supported values: "entropy" or "gini" :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :param minInstancesPerNode: Min number of instances required at child nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) categoricalFeaturesInfoJMap = \ MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client) model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( dataBytes._jrdd, "classification", numClasses, categoricalFeaturesInfoJMap, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) dataBytes.unpersist() return DecisionTreeModel(sc, model)
def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) vectors. This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can handle all kinds of discrete data. For example, by converting documents into TF-IDF vectors, it can be used for document classification. By making every vector a 0-1 vector, it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). @param data: RDD of NumPy vectors, one per element, where the first coordinate is the label and the rest is the feature vector (e.g. a count vector). @param lambda_: The smoothing parameter """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes(dataBytes._jrdd, lambda_) return NaiveBayesModel(_deserialize_double_vector(ans[0]), _deserialize_double_vector(ans[1]), _deserialize_double_matrix(ans[2]))