def main(readcsv=read_csv, method='defaultDense'):
    """Train an AdaBoost model on the sample data and verify its predictions.

    The first 20 columns of each CSV file are used as independent variables
    and column 20 as the dependent variable.

    Parameters
    ----------
    readcsv : callable
        Called as ``readcsv(path, columns)`` to load the selected columns
        of a CSV file.
    method : str
        Accepted for interface compatibility; not used in the body.

    Returns
    -------
    tuple
        ``(train_result, predict_result, ptdata)`` where ``ptdata`` holds the
        expected labels loaded from the test file.
    """
    train_path = "./data/batch/adaboost_train.csv"
    test_path = "./data/batch/adaboost_test.csv"

    # Set up the AdaBoost training algorithm
    trainer = d4p.adaboost_training()

    # Load the training set: 20 feature columns and 1 label column
    # per observation
    train_features = readcsv(train_path, range(20))
    train_labels = readcsv(train_path, range(20, 21))

    # Training yields the model that is used for prediction below
    train_result = trainer.compute(train_features, train_labels)

    # Set up the prediction algorithm
    predictor = d4p.adaboost_prediction()

    # Load the test features (same number of features as for training)
    test_features = readcsv(test_path, range(20))

    # Predict with the model obtained from training
    predict_result = predictor.compute(test_features, train_result.model)

    # One prediction row per test observation, one column per label column
    expected_shape = (test_features.shape[0], train_labels.shape[1])
    assert predict_result.prediction.shape == expected_shape

    # Compare the predictions with the labels stored in the test file
    ptdata = np.loadtxt(
        test_path,
        usecols=range(20, 21),
        delimiter=',',
        ndmin=2,
    )
    assert np.allclose(predict_result.prediction, ptdata)

    return (train_result, predict_result, ptdata)
def fit(self, X, y):
    """Validate the hyper-parameters and fit an AdaBoost classifier.

    Decision-tree weak learners are configured from the estimator's
    parameters and passed to the AdaBoost training algorithm.

    Parameters
    ----------
    X : array-like
        Training samples; validated with ``check_X_y`` using single or
        double precision.
    y : array-like
        Class labels; they are encoded with a ``LabelEncoder`` before
        training.

    Returns
    -------
    self
        The estimator. ``classes_``, ``n_classes_`` and ``n_features_in_``
        are always set; ``daal_model_`` is set only when more than one
        class is present in ``y``.

    Raises
    ------
    ValueError
        If any hyper-parameter is outside its allowed range.
    """
    if self.split_criterion not in ('gini', 'infoGain'):
        raise ValueError('Parameter "split_criterion" must be '
                         '"gini" or "infoGain".')
    if not isinstance(self.max_tree_depth, numbers.Integral) or \
            self.max_tree_depth < 0:
        raise ValueError('Parameter "max_tree_depth" must be '
                         'positive integer value or zero.')
    if not isinstance(self.min_observations_in_leaf_node,
                      numbers.Integral) or \
            self.min_observations_in_leaf_node <= 0:
        raise ValueError(
            'Parameter "min_observations_in_leaf_node" must be '
            'non-zero positive integer value.')
    if not isinstance(self.max_iterations, numbers.Integral) or \
            self.max_iterations <= 0:
        raise ValueError('Parameter "max_iterations" must be '
                         'non-zero positive integer value.')
    if self.learning_rate <= 0:
        raise ValueError('Parameter "learning_rate" must be '
                         'non-zero positive value.')
    # it is not clear why it is so but we will get error from
    # Intel(R) oneAPI Data Analytics
    # Library otherwise
    # The two bounds must be combined with "or": no value is both below 0
    # and at least 1, so an "and" here would never reject anything.
    if self.accuracy_threshold < 0 or self.accuracy_threshold >= 1:
        raise ValueError('Parameter "accuracy_threshold" must be '
                         'more or equal to 0 and less than 1.')

    # Check that X and y have correct shape
    X, y = check_X_y(X, y, y_numeric=False, dtype=[np.single, np.double])

    check_classification_targets(y)

    # Encode labels
    le = preprocessing.LabelEncoder()
    le.fit(y)
    self.classes_ = le.classes_
    y_ = le.transform(y)

    # Convert to 2d array
    y_ = y_.reshape((-1, 1))

    self.n_classes_ = len(self.classes_)

    self.n_features_in_ = X.shape[1]

    # Classifier can't train when only one class is present.
    # Trivial case
    if self.n_classes_ == 1:
        return self

    # Define type of data
    fptype = getFPType(X)

    # Fit the model
    tr = d4p.decision_tree_classification_training(
        fptype=fptype,
        nClasses=self.n_classes_,
        # this parameter is strict upper bound in DAAL
        maxTreeDepth=self.max_tree_depth + 1,
        minObservationsInLeafNodes=self.min_observations_in_leaf_node,
        splitCriterion=self.split_criterion,
        pruning='none')

    pr = d4p.decision_tree_classification_prediction(
        fptype=fptype,
        nClasses=self.n_classes_)

    train_algo = d4p.adaboost_training(
        fptype=fptype,
        nClasses=self.n_classes_,
        weakLearnerTraining=tr,
        weakLearnerPrediction=pr,
        maxIterations=self.max_iterations,
        learningRate=self.learning_rate,
        accuracyThreshold=self.accuracy_threshold)

    train_result = train_algo.compute(X, y_)

    # Store the model
    self.daal_model_ = train_result.model

    # Return the classifier
    return self