class DecisionTreeClassifierImpl():
    """Hyperparameter holder that delegates training and inference to ``SKLModel``.

    The constructor only records its arguments; a fresh ``SKLModel`` is built
    from them on every call to :meth:`fit`.
    """

    def __init__(self, criterion='gini', splitter='best', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features=None,
                 random_state=None, max_leaf_nodes=None,
                 min_impurity_decrease=0.0, min_impurity_split=None,
                 class_weight='balanced', presort=False):
        # Keyword order is kept so the resulting dict iterates in signature order.
        self._hyperparams = dict(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            random_state=random_state,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            class_weight=class_weight,
            presort=presort,
        )

    def fit(self, X, y=None):
        """Create the wrapped model and fit it on ``X`` (and ``y`` when given).

        Returns ``self`` so calls can be chained.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        fit_args = (X,) if y is None else (X, y)
        model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._sklearn_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        model = self._sklearn_model
        return model.predict_proba(X)
def fit(self, X, y):
    """
    Fit the (optional) base classifier and one meta decision tree per label.

    The base classifier's ``predict_proba`` output on ``X`` is passed through
    ``self._get_top_labels`` and ``self._extract_features`` to obtain, for each
    label, a table of training samples whose last column is the target. A gini
    ``DecisionTreeClassifier`` is trained on each table and stored in
    ``self.meta_classifiers`` under its label.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, handed unchanged to the base classifier.
    y : array, shape = [n_samples]
        Target values; also stored on ``self.y``.

    Returns
    -------
    self
        The fitted object, so calls can be chained.
    """
    self.y = y
    # The base classifier is only (re)trained when requested; otherwise it is
    # assumed to be fitted already -- TODO confirm with callers.
    if self.fit_base:
        self.base_classifier.fit(X, y)

    distances = self.base_classifier.predict_proba(X)
    topNIndices, topNDistances = self._get_top_labels(distances)
    training_data = self._extract_features(topNIndices, topNDistances, y,
                                           distances)

    # create a decision tree for each label
    self.meta_classifiers = {}
    for label, training_samples_of_label in training_data.items():
        # Plain 2-D ndarray instead of the deprecated np.matrix, which recent
        # scikit-learn releases refuse as estimator input.
        samples = np.atleast_2d(np.asarray(training_samples_of_label))
        decision_tree = DecisionTreeClassifier(criterion="gini")
        # Every column but the last is a feature; the last column is the
        # target and is passed 1-D as the estimator API expects.
        decision_tree.fit(samples[:, :-1], samples[:, -1])
        self.meta_classifiers[label] = decision_tree
    return self
def decision_tree_depths():
    """Benchmark decision trees over a fixed list of ``max_depth`` values.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    For every depth it fits a tree, records train/test accuracy plus the time
    spent fitting and scoring the test set, prints the row, and finally writes
    all rows to ``adult_dt.xls``.
    """
    max_depths = [2, 4, 6, 8, 10, 12, 16, 18, 20, 25, 30, 40]
    columns = [
        'Max Depths', 'Training Score', 'Test Score', 'Train Time', 'Test Time'
    ]
    df = pd.DataFrame(columns=columns)
    for depth in max_depths:
        dt = DecisionTreeClassifier(max_depth=depth)
        print(dt)

        # Time only the fit itself (console output used to be included) and
        # use the monotonic high-resolution clock meant for intervals.
        start_train = time.perf_counter()
        dt.fit(X_train, y_train)
        end_train = time.perf_counter() - start_train

        train_score = dt.score(X_train, y_train)

        start_test = time.perf_counter()
        test_score = dt.score(X_test, y_test)
        end_test = time.perf_counter() - start_test

        values = [depth, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values
        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))
    # NOTE(review): legacy .xls output needs an Excel writer engine that newer
    # pandas releases may no longer provide -- confirm before upgrading.
    df.to_excel('adult_dt.xls')
def decision_tree_training_sets():
    """Benchmark a depth-8 decision tree over several training-set fractions.

    Reads the module-level ``encoded_data`` frame, using its ``'Target'``
    column as the label and every other column as a feature. For each
    fraction the data is split, standardised (scaler fitted on the training
    part only), a tree is fitted and scored, and the row is printed. All rows
    are finally written to ``diabetes_dt_training_sets.xls``.
    """
    training_set_sizes = [.1, .25, .5, .75, .9]
    columns = ['Training Set Size', 'Training Score', 'Test Score',
               'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)

    # Keep the frame's own column order: a set difference iterates in an
    # order that is not stable between runs, which makes results irreproducible.
    feature_columns = [c for c in encoded_data.columns if c != 'Target']

    for training_set_size in training_set_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            encoded_data[feature_columns],
            encoded_data['Target'],
            train_size=training_set_size)

        scaler = preprocessing.StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train.astype('float32')),
            columns=feature_columns)
        # Give the test data the same column labels as the training data so
        # fit and score see consistently named features.
        X_test = pd.DataFrame(
            scaler.transform(X_test.astype('float32')),
            columns=feature_columns)

        dt = DecisionTreeClassifier(max_depth=8)
        print(dt)

        # Time only the fit itself, with the clock meant for intervals.
        start_train = time.perf_counter()
        dt.fit(X_train, y_train)
        end_train = time.perf_counter() - start_train

        train_score = dt.score(X_train, y_train)

        start_test = time.perf_counter()
        test_score = dt.score(X_test, y_test)
        end_test = time.perf_counter() - start_test

        values = [training_set_size, train_score, test_score,
                  end_train, end_test]
        df.loc[len(df)] = values
        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))
    # NOTE(review): legacy .xls output needs an Excel writer engine that newer
    # pandas releases may no longer provide -- confirm before upgrading.
    df.to_excel('diabetes_dt_training_sets.xls')
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    """Fit a default decision tree on ``(X, y)`` and predict one sample.

    ``current_state_to_predict`` is reshaped to a single row before being
    passed to ``predict``; the prediction array is returned as-is.
    """
    model = DecisionTreeClassifier().fit(X, y)
    single_row = np.array(current_state_to_predict).reshape(1, -1)
    return model.predict(single_row)
def fit(self, X, y):
    """
    Fit the (optional) base classifier and one meta decision tree per label.

    The base classifier's ``predict_proba`` output on ``X`` is passed through
    ``self._get_top_labels`` and ``self._extract_features`` to obtain, for each
    label, a table of training samples whose last column is the target. A gini
    ``DecisionTreeClassifier`` is trained on each table and stored in
    ``self.meta_classifiers`` under its label.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, handed unchanged to the base classifier.
    y : array, shape = [n_samples]
        Target values; also stored on ``self.y``.

    Returns
    -------
    self
        The fitted object, so calls can be chained.
    """
    self.y = y
    # The base classifier is only (re)trained when requested; otherwise it is
    # assumed to be fitted already -- TODO confirm with callers.
    if self.fit_base:
        self.base_classifier.fit(X, y)

    distances = self.base_classifier.predict_proba(X)
    topNIndices, topNDistances = self._get_top_labels(distances)
    training_data = self._extract_features(topNIndices, topNDistances, y,
                                           distances)

    # create a decision tree for each label
    self.meta_classifiers = {}
    for label, training_samples_of_label in training_data.items():
        # Plain 2-D ndarray instead of the deprecated np.matrix, which recent
        # scikit-learn releases refuse as estimator input.
        samples = np.atleast_2d(np.asarray(training_samples_of_label))
        decision_tree = DecisionTreeClassifier(criterion="gini")
        # Every column but the last is a feature; the last column is the
        # target and is passed 1-D as the estimator API expects.
        decision_tree.fit(samples[:, :-1], samples[:, -1])
        self.meta_classifiers[label] = decision_tree
    return self
def dtree(X, y, model_path):
    """Fit a default decision tree, report its fit on the same data, save it.

    Prints the classification report and confusion matrix of the tree's
    predictions on ``X`` against ``y`` (the data it was trained on), then
    dumps the fitted model to ``model_path`` with joblib.
    """
    fitted = DecisionTreeClassifier()
    fitted.fit(X, y)
    predictions = fitted.predict(X)
    report = metrics.classification_report(y, predictions)
    print(report)
    matrix = metrics.confusion_matrix(y, predictions)
    print(matrix)
    joblib.dump(fitted, model_path)
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    """Return the hold-out accuracy of a default decision tree.

    The data is split with ``test_size=relative_test_size`` and a fixed
    ``random_state=42``; the tree is fitted on the training part and scored on
    the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # Ground truth first, predictions second, as the metric's signature
    # documents (the arguments used to be swapped).
    return accuracy_score(y_test, pred)
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    """Return the hold-out accuracy of a default decision tree.

    The data is split with ``test_size=relative_test_size`` and a fixed
    ``random_state=42``; the tree is fitted on the training part and scored on
    the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # Ground truth first, predictions second, as the metric's signature
    # documents (the arguments used to be swapped).
    return accuracy_score(y_test, pred)
def create_decision_tree(self):
    '''
    Train the decision tree with the fixed parameter set below, label the
    test set with it, print the predictions and statistics, and export the
    labelled test frame to CSV.
    '''
    tree_params = {
        'max_depth': 65,
        'min_samples_split': 0.03,
        'min_samples_leaf': 3,
        'max_features': 8,
    }
    model = DecisionTreeClassifier(**tree_params)
    model.fit(self.X_train, self.Y_train)

    test_predictions = model.predict(self.X_test)
    print(test_predictions)
    self.print_stats(test_predictions, "")

    # Attach the predictions to the test frame and persist it.
    self.test_df['learning_label'] = test_predictions
    self.test_df.to_csv('output/feature_extraction.csv', encoding="latin-1")
def MarginBoostClf(features, labels, max_depth, n_steps, margin):
    """Train a margin-shifted boosted ensemble of decision trees.

    Parameters
    ----------
    features : array with ``shape[0]`` samples
        Training inputs, passed to every tree.
    labels : array-like
        Training targets, compared element-wise with the predictions.
    max_depth : int
        ``max_depth`` of every weak learner.
    n_steps : int
        Maximum number of boosting rounds.
    margin : float
        Margin parameter entering the step size via
        ``log(1 - margin) - log(1 + margin)``.

    Returns
    -------
    list of ``[classifier, step_size]`` pairs, in training order.

    Boosting stops early when a learner's weighted error reaches 0.5, or
    after a learner with zero weighted error has been added.
    """
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    clf_list = []
    # Loop-invariant part of the step size.
    margin_term = np.log(1 - margin) - np.log(1 + margin)

    for _ in range(n_steps):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        # Positional boolean mask, independent of how ``labels`` is indexed.
        incorrect = np.asarray(y_predict != labels)

        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        if estimator_error >= 0.5:
            break

        if estimator_error <= 0:
            # A perfect learner would give log(1/0) and a zero normaliser,
            # i.e. NaN weights. Keep it with the largest finite step and stop.
            eps = np.finfo(float).eps
            step_size = 0.5 * (np.log((1 - eps) / eps) + margin_term)
            clf_list.append([clf, step_size])
            break

        step_size = 0.5 * (
            np.log((1 - estimator_error) / estimator_error) + margin_term)
        norm_factor = 2 * pow(estimator_error * (1 - estimator_error), 0.5)

        # Down-weight correctly classified samples, up-weight mistakes.
        weights *= np.exp(np.where(incorrect, step_size, -step_size)) / norm_factor
        clf_list.append([clf, step_size])
    return clf_list
def BoostByMaj(features, labels, max_depth, gamma):
    """Train a boost-by-majority ensemble of decision trees.

    The number of rounds ``k`` comes from ``get_k_from_gamma(gamma,
    sample_size)``. In each round a tree is fitted with the current sample
    weights (retried while its weighted error is at least 0.5, with a bounded
    number of attempts), the per-sample count of correct votes is updated and
    the weights are recomputed from those counts.

    Parameters
    ----------
    features : array with ``shape[0]`` samples
    labels : array-like of training targets
    max_depth : int, ``max_depth`` of every tree
    gamma : float, edge parameter used in the weight formula

    Returns
    -------
    (clf_list, weights)
        ``clf_list`` holds ``[classifier, 1]`` pairs; ``weights`` are the
        final sample weights (all zeros if training stopped because every
        weight vanished).
    """
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k = get_k_from_gamma(gamma, sample_size)
    print('k ', k)
    clf_list = []

    half_floor = int(np.floor(k / 2))
    half_ceil = int(np.ceil(k / 2))

    for i in range(k):
        # Start above the threshold so the loop body runs at least once.
        estimator_error = 0.6
        countdown = 10
        while estimator_error >= 0.5 and countdown >= 0:
            clf = DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(features, labels, sample_weight=weights)
            y_predict = clf.predict(features)
            correct_ones = y_predict == labels
            incorrect_ones = y_predict != labels
            estimator_error = np.mean(
                np.average(incorrect_ones, weights=weights, axis=0))
            unweighted_estimator_error = np.mean(
                np.average(incorrect_ones, axis=0))
            countdown -= 1

        counts += correct_ones
        coeff_1 = half_floor - counts
        coeff_2 = half_ceil - i - 1 + counts
        weights = comb(k - i - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        weight_norm = np.linalg.norm(weights, ord=1)
        print('i', i, 'error', estimator_error, 'unweighted_error',
              unweighted_estimator_error, 'wnorm', weight_norm)
        clf_list.append([clf, 1])

        # If every weight is zero, normalising would divide by zero and feed
        # NaN sample weights into the next fit; stop instead.
        if weight_norm == 0:
            break
        weights = weights / weight_norm
    return clf_list, weights
def sklearn_titanic():
    """Train a default decision tree on the cleaned Titanic data and print
    its accuracy on the last 20% of the rows.

    Reads ``titanic_clean.csv``, drops the ``cabin``, ``boat``, ``body`` and
    ``index`` columns and all rows with missing values, label-encodes every
    object-typed column, then uses the first 80% of the rows for training and
    the remainder for scoring (no shuffling).
    """
    # Public import paths: the private ``sklearn.tree.tree`` and
    # ``sklearn.preprocessing.label`` modules no longer exist.
    from sklearn.preprocessing import LabelEncoder
    from sklearn.tree import DecisionTreeClassifier

    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    split_at = int(total_df.shape[0] * 0.8)
    train_df = total_df.iloc[:split_at]
    test_df = total_df.iloc[split_at:]

    clf = DecisionTreeClassifier()
    clf.fit(train_df.drop(['survived'], axis=1), train_df['survived'])
    print(clf.score(test_df.drop(['survived'], axis=1), test_df['survived']))
def DeepBBM2(features, labels, max_depth, gamma, max_depth_range):
    """Boost-by-majority over trees whose depth is chosen per round.

    In each of the ``k = get_k_from_gamma(gamma, sample_size)`` rounds one
    tree per depth in ``max_depth_range`` is fitted with the current sample
    weights; the tree minimising ``error + PARAM_lambda_2 * complexity`` is
    kept, where the complexity term comes from ``calc_rademacher``. Sample
    weights are then recomputed from the per-sample count of correct votes.

    Parameters
    ----------
    features : 2-D array (``shape[0]`` samples, ``shape[1]`` features)
    labels : array-like of training targets
    max_depth : unused; kept so existing callers keep working
    gamma : float, edge parameter used in the weight formula
    max_depth_range : iterable of candidate tree depths

    Returns
    -------
    (clf_list, weights)
        ``clf_list`` holds ``[classifier, 1]`` pairs.

    Notes
    -----
    ``PARAM_lambda_2`` is read from module scope.
    """
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k = get_k_from_gamma(gamma, sample_size)
    normalizer = np.exp(1) * sample_size
    print('k ', k)
    clf_list = []

    # One complexity penalty per candidate depth, paired by position so the
    # depths do not have to be exactly 1..n.
    rademacher_list = [
        calc_rademacher(depth, sample_size, num_features, normalizer)
        for depth in max_depth_range
    ]

    half_floor = int(np.floor(k / 2))
    half_ceil = int(np.ceil(k / 2))

    for t in range(k):
        best_loss = 10000
        best_error = 1
        # Unfitted placeholder, replaced by the first candidate below the
        # initial loss bound.
        best_clf = DecisionTreeClassifier(max_depth=0)
        # The per-depth DeepBoost call that used to sit here was removed: it
        # omitted DeepBoost's required PARAM_lambda/PARAM_beta arguments and
        # its result was never used.
        for depth, complexity in zip(max_depth_range, rademacher_list):
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_loss = new_error + PARAM_lambda_2 * complexity
            print('depth', depth, 'new_error', new_error, 'new_grad',
                  new_loss)
            if new_loss < best_loss:
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error

        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones

        coeff_1 = half_floor - counts
        coeff_2 = half_ceil - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        weight_norm = np.linalg.norm(weights, ord=1)
        print('i', t, 'error', best_error, 'wnorm', weight_norm)
        clf_list.append([best_clf, 1])

        # Same stopping rule as DeepBBM, plus a guard against dividing by a
        # zero norm (which would produce NaN sample weights).
        if np.max(coeff_1) < 0 or weight_norm == 0:
            break
        weights = weights / weight_norm
    return clf_list, weights
def DeepBBM(features, labels, gamma, max_depth_range, PARAM_lambda_2):
    """Boost-by-majority over trees whose depth is chosen per round.

    Each of the ``k = get_k_from_gamma(gamma, sample_size)`` rounds fits one
    tree per candidate depth with the current sample weights and keeps the
    one minimising ``error + PARAM_lambda_2 * complexity`` (complexity from
    ``calc_rademacher``). Weights are then rebuilt from the per-sample count
    of correct votes; training stops once every ``coeff_1`` entry is negative.

    Returns ``(clf_list, weights)`` where ``clf_list`` holds
    ``[classifier, 1, depth]`` triples.
    """
    sample_size = features.shape[0]
    num_features = features.shape[1]
    counts = np.zeros(sample_size)
    weights = np.ones(sample_size) / sample_size
    k = get_k_from_gamma(gamma, sample_size)
    normalizer = np.exp(1) * sample_size

    # Complexity penalty for every candidate depth, aligned by position.
    rademacher_list = [
        calc_rademacher(candidate_depth, sample_size, num_features, normalizer)
        for candidate_depth in max_depth_range
    ]

    clf_list = []
    for t in range(k):
        best_loss = 10000
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth, penalty in zip(max_depth_range, rademacher_list):
            candidate = DecisionTreeClassifier(max_depth=depth)
            candidate = candidate.fit(features, labels, sample_weight=weights)
            candidate_error = eval_clf(candidate, features, labels, weights)
            candidate_loss = candidate_error + PARAM_lambda_2 * penalty
            if candidate_loss < best_loss:
                best_clf, best_loss, best_depth = (candidate, candidate_loss,
                                                   depth)

        counts += best_clf.predict(features) == labels

        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        clf_list.append([best_clf, 1, best_depth])

        if np.max(coeff_1) < 0:
            break
        weights = weights / np.linalg.norm(weights, ord=1)
    return clf_list, weights
def BrownBoost(features, labels, max_depth, total_time):
    """Train a BrownBoost-style ensemble of decision trees.

    ``s`` is the remaining time, starting at ``total_time``. Each round
    weights the samples by ``exp(-(r + s)**2 / total_time)``, fits a tree,
    and asks ``SolveODE`` for the elapsed time ``t`` and the tree weight
    ``alpha``; ``r`` accumulates ``alpha * b`` where ``b`` is +1 for correctly
    and -1 for incorrectly classified samples.

    Parameters
    ----------
    features : array with ``shape[0]`` samples
    labels : array-like of training targets
    max_depth : int, ``max_depth`` of every tree
    total_time : number, initial time budget

    Returns
    -------
    list of ``[classifier, alpha]`` pairs.

    Training stops when the time budget is used up, after 200 rounds, or when
    a tree's weighted error reaches 0.5.
    """
    sample_size = features.shape[0]
    clf_list = []
    r = np.zeros(sample_size)
    s = total_time  # s works as the remaining time with the initial value total_time
    T = total_time
    i = 0
    while s > 0 and i < 200:
        weights = np.exp(-(r + s)**2 / total_time)
        weights = weights / (np.sum(weights))
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        # Positional boolean mask, independent of how ``labels`` is indexed.
        incorrect = np.asarray(y_predict != labels)

        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        print('estimator_error is', estimator_error)
        if estimator_error >= 0.5:
            break

        # +1 where the tree is right, -1 where it is wrong.
        b = np.where(incorrect, -1.0, 1.0)
        a = r + s
        (t, alpha) = SolveODE(a, b, s, sample_size, T)
        r += alpha * b
        s = s - t
        print(s)
        clf_list.append([clf, alpha])
        i += 1
    return clf_list
def ret_trained_DT_clf(X, Y):
    """Return a depth-3 decision tree fitted on ``(X, Y)``."""
    tree_model = DecisionTreeClassifier(max_depth=3)
    tree_model.fit(X, Y)
    return tree_model
# One-vs-rest targets: 0 marks the named species, 1 marks every other label.
target_b = [0 if species == "ELK" else 1 for species in target]
target_c = [0 if species == "CATTLE" else 1 for species in target]

# The same seeded 70/30 split is applied to each binary target.
X_train_deer, X_test_deer, y_train_deer, y_test_deer = train_test_split(
    train, target_a, test_size=0.3, random_state=0)
X_train_elk, X_test_elk, y_train_elk, y_test_elk = train_test_split(
    train, target_b, test_size=0.3, random_state=0)
X_train_cattle, X_test_cattle, y_train_cattle, y_test_cattle = train_test_split(
    train, target_c, test_size=0.3, random_state=0)

print("-----Question 1-----")
# Question 1: decision tree
print("-----DECISION TREE-----")

print("DEER confusion Matrix and accuracy score")
clf = DecisionTreeClassifier()
clf.fit(X_train_deer, y_train_deer)
# Held-out predictions first ...
y_pred = clf.predict(X_test_deer)
print(confusion_matrix(y_test_deer, y_pred))
print("Testing Score")
print(accuracy_score(y_test_deer, y_pred))
# ... then predictions on the training split itself.
y_pred = clf.predict(X_train_deer)
print("Training Score")
print(accuracy_score(y_train_deer, y_pred))

print("ELK confusion matrix and accuracy score")
clf = DecisionTreeClassifier()
clf.fit(X_train_elk, y_train_elk)
y_pred = clf.predict(X_test_elk)
print(confusion_matrix(y_test_elk, y_pred))
print("Testing Score")
print(accuracy_score(y_test_elk, y_pred))
Created on 2019年1月4日 决策树 ''' import numpy as np from sklearn.model_selection._split import train_test_split from sklearn.metrics.classification import classification_report from sklearn.tree.tree import DecisionTreeClassifier def iris_type(s): it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2} return it[str(s, encoding="utf8")] path = 'demo1_Iris.txt' # data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) x, y = np.split(data, (4, ), axis=1) x = x[:, :4] x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6) clf = DecisionTreeClassifier(criterion='entropy', random_state=0) clf.fit(x, y.ravel()) print('feature_importances_', clf.feature_importances_) y_pred = clf.predict(x_test) print(classification_report(y_test, y_pred))
class Model(object):
    """
    The machine learning component of the tester.

    This component stores four different models:
    1) A model to decide between different types of events (drags and touches).
    2) A model to decide on the starting position for drags.
    3) A model to decide on the ending position for drags.
    4) A model to decide on the position of the touch.

    The input data are all the different known UI elements on the screen from
    the training data and whether or not they are visible on the screen. To
    acquire this, we first get the stored XML model and record the resource-id
    and class. We concatenate them into an array and mark as (1) for visible
    and (0) for not visible.

    Screen positions are stored as a single row-major index
    (``y * displayWidth + x``) so that one regressor output can be turned back
    into a point.
    """

    def __init__(self):
        self.symbols = {}

        self.action_data = None
        self.action_labels = None
        self.action_classifier = None

        self.drag_data = None
        self.drag_end_labels = None
        self.drag_end_classifier = None
        self.drag_start_labels = None
        self.drag_start_classifier = None

        self.touch_data = None
        self.touch_labels = None
        self.touch_classifier = None

        self.device_info = device.info

    @staticmethod
    def _pad_rows(rows, width):
        """Stack variable-length rows into a zero-padded ``(len(rows), width)`` array."""
        padded = np.zeros((len(rows), width))
        for i, item in enumerate(rows):
            padded[i, :len(item)] = item[:]
        return padded

    def _encode_position(self, point):
        """Flatten a screen point into the row-major index ``y * width + x``."""
        return point.y * self.device_info["displayWidth"] + point.x

    def _decode_position(self, value):
        """Turn a row-major index back into a ``Point`` (inverse of ``_encode_position``)."""
        row, column = divmod(value, self.device_info["displayWidth"])
        return Point(column, row)

    def parse_events(self, queue):
        """Drain ``queue`` and build the training arrays for all four models.

        Every event contributes one feature row (slot 0 is overwritten with a
        random number). Drag events add start/end position labels, touch
        events add a position label, and drag/touch/back events add an action
        label.
        """
        symbols = {"randomizer": 0}
        events = []
        all_data = []
        all_results = []
        drag_data = []
        drag_start_results = []
        drag_end_results = []
        touch_data = []
        touch_results = []

        while not queue.empty():
            event = queue.get()
            events.append(event)
            lst = event.state.start.as_list(symbols)
            lst[0] = random()
            all_data.append(lst)
            # NOTE(review): an event that is neither drag, touch nor back adds
            # a data row without an action label -- confirm this cannot occur.
            if event.action.is_drag():
                drag_data.append(lst)
                all_results.append(DRAG)
                start = event.changes.start()
                end = event.changes.end()
                # Row-major index; ``x * y`` could not be decoded by predict().
                drag_start_results.append(self._encode_position(start))
                drag_end_results.append(self._encode_position(end))
            if event.action.is_touch():
                touch_data.append(lst)
                all_results.append(TOUCH)
                start = event.changes.start()
                touch_results.append(self._encode_position(start))
            if event.action.is_back():
                all_results.append(BACK)

        # Rows may have different lengths; pad them to the final symbol count.
        width = len(symbols)
        self.symbols = symbols
        self.action_data = self._pad_rows(all_data, width)
        self.action_labels = np.array(all_results)
        self.drag_data = self._pad_rows(drag_data, width)
        self.drag_start_labels = np.array(drag_start_results)
        self.drag_end_labels = np.array(drag_end_results)
        self.touch_data = self._pad_rows(touch_data, width)
        self.touch_labels = np.array(touch_results)

    def train(self):
        """Fit the action classifier and the three position regressors."""
        self.action_classifier = DecisionTreeClassifier()
        self.action_classifier.fit(self.action_data, self.action_labels)

        self.drag_start_classifier = DecisionTreeRegressor()
        self.drag_start_classifier.fit(self.drag_data, self.drag_start_labels)

        self.drag_end_classifier = DecisionTreeRegressor()
        self.drag_end_classifier.fit(self.drag_data, self.drag_end_labels)

        self.touch_classifier = DecisionTreeRegressor()
        self.touch_classifier.fit(self.touch_data, self.touch_labels)

    def predict(self, state):
        """Return the ``Action`` predicted for ``state``.

        The returned action is left uninitialised when the predicted type is
        none of DRAG, TOUCH or BACK.
        """
        row = state.as_list(self.symbols, False)
        row[0] = random()
        # Estimators expect a 2-D batch, so wrap the single sample.
        features = [row]

        action = Action()
        action_type = self.action_classifier.predict(features)[0]
        if action_type == DRAG:
            start = self._decode_position(
                self.drag_start_classifier.predict(features)[0])
            end = self._decode_position(
                self.drag_end_classifier.predict(features)[0])
            action.init(ACTION_DRAG, start, end, 0.5)
        elif action_type == TOUCH:
            point = self._decode_position(
                self.touch_classifier.predict(features)[0])
            action.init(ACTION_TOUCH, point.x, point.y)
        elif action_type == BACK:
            action.init(ACTION_BACK)
        return action

    def save(self):
        """Placeholder; nothing is persisted."""
        pass
def decision_tree_fit(X, y):
    """Fit and return a decision tree with ``min_samples_leaf=5`` and a fixed seed."""
    tree_params = {'min_samples_leaf': 5, 'random_state': 42}
    model = DecisionTreeClassifier(**tree_params)
    return model.fit(X, y)
# NOTE(review): the enclosing context of these calls is not visible here;
# they may belong to a preceding conditional block -- confirm the nesting.
# Each call hands one configured estimator and an output name to build_iris;
# with_proba = False is passed for some of the estimators.
build_iris(RidgeClassifierCV(), "RidgeIris", with_proba = False)
build_iris(BaggingClassifier(RidgeClassifier(random_state = 13), random_state = 13, n_estimators = 3, max_features = 0.5), "RidgeEnsembleIris")
build_iris(SGDClassifier(random_state = 13, max_iter = 100), "SGDIris", with_proba = False)
build_iris(SGDClassifier(random_state = 13, loss = "log", max_iter = 100), "SGDLogIris")
build_iris(SVC(), "SVCIris", with_proba = False)
build_iris(NuSVC(), "NuSVCIris", with_proba = False)
build_iris(VotingClassifier([("dt", DecisionTreeClassifier(random_state = 13)), ("nb", GaussianNB()), ("lr", LogisticRegression())]), "VotingEnsembleIris", with_proba = False)
build_iris(OptimalXGBClassifier(objective = "multi:softprob", ntree_limit = 7), "XGBIris", ntree_limit = 7)

if "Iris" in datasets:
    # Fit the mapper first so both estimators are trained on mapped features.
    mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain())
    ])
    iris_Xt = mapper.fit_transform(iris_X)
    dt_classifier = DecisionTreeClassifier(random_state = 13)
    dt_classifier.fit(iris_Xt, iris_y)
    lr_classifier = LogisticRegression(random_state = 13)
    lr_classifier.fit(iris_Xt, iris_y)
    # The two pre-fitted estimators are registered under the predicate
    # strings "X[2] <= 3" and str(True); presumably the first matching
    # predicate selects the estimator -- verify against SelectFirstEstimator.
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("estimator", SelectFirstEstimator([
            ("X[2] <= 3", dt_classifier),
            (str(True), lr_classifier)
        ]))
    ])
    # Field names are assigned by hand because the pipeline itself is never fitted.
    pipeline.active_fields = iris_X.columns.values
    pipeline.target_fields = ["Species"]
    store_pkl(pipeline, "SelectFirstIris")
    species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
    store_csv(species, "SelectFirstIris")
### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn.metrics import accuracy_score
# Public import path: the private ``sklearn.tree.tree`` module no longer exists.
from sklearn.tree import DecisionTreeClassifier

# Newer scikit-learn releases only offer ``get_feature_names_out``; fall back
# to the older accessor when it is all the vectorizer provides.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab_list = vectorizer.get_feature_names_out()
else:
    vocab_list = vectorizer.get_feature_names()

dtc = DecisionTreeClassifier()
dtc.fit(features_train, labels_train)
pred = dtc.predict(features_test)

accuracy = accuracy_score(labels_test, pred)
print(accuracy)

# Report every feature whose importance exceeds 0.2, with its index and word.
feature_importances = dtc.feature_importances_
for i, importance in enumerate(feature_importances):
    if importance > 0.2:
        print("Importance = ", importance, " number is ", i,
              " word is ", vocab_list[i])
def main():
    """Compare a decision tree and a 1-D CNN on sequence data.

    Loads samples from ``data``, cuts them into sequences of length 100,
    binarises the labels, then trains and reports (confusion matrix,
    classification report, accuracy, Cohen's kappa) first a decision tree on
    the flattened sequences and then a CNN on the unflattened ones. Both use
    the same seeded 75/25 split. Waits for Enter before returning.
    """
    print("Loading samples and labels")
    samples, labels, _ = load_files("data")
    print("Loaded {} samples".format(samples.shape[0]))

    sequence_dim = 100
    print("Converting to sequences of length {}".format(sequence_dim))
    samples, labels = make_sequences(samples, labels, sequence_dim)
    print("Number of samples from sequences: {}".format(samples.shape[0]))

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    # ---- Decision tree on flattened sequences ----
    flatSamples = samples.reshape(samples.shape[0], -1)
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        flatSamples, labels, test_size=0.25, random_state=42)

    print("=" * 20)
    print("Building DecisionTree model")
    model = DecisionTreeClassifier()
    model.fit(trainSamples, trainLabels)
    treeResults = model.predict(testSamples)

    # Collapse the binarised rows to class indices once and reuse them.
    expected = testLabels.argmax(axis=1)
    tree_predicted = treeResults.argmax(axis=1)
    print(confusion_matrix(expected, tree_predicted))
    print(classification_report(expected, tree_predicted))
    treeAcc = accuracy_score(expected, tree_predicted)
    print("Accuracy Tree: {:.2f}".format(treeAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, tree_predicted)))

    # ---- CNN on the unflattened sequences ----
    print("=" * 20)
    print("Building CNN model")
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        samples, labels, test_size=0.25, random_state=42)
    inputShape = (samples.shape[1], samples.shape[2])

    model = Sequential()
    model.add(Conv1D(32, 10, padding="same", input_shape=inputShape))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    model.add(Conv1D(64, 10, padding="same"))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    model.add(Conv1D(128, 10, padding="same"))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))

    model.add(Flatten(input_shape=inputShape))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(labels.shape[1], activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 10
    BATCH = 128
    model.fit(trainSamples,
              trainLabels,
              batch_size=BATCH,
              epochs=EPOCHS,
              validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)
    expected = testLabels.argmax(axis=1)
    cnn_predicted = cnnResults.argmax(axis=1)
    print(confusion_matrix(expected, cnn_predicted))
    print(classification_report(expected,
                                cnn_predicted,
                                target_names=lb.classes_))
    print("CNN Accuracy: {:.2f}".format(
        accuracy_score(expected, cnn_predicted)))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, cnn_predicted)))

    # Keep the console open until the user presses Enter.
    input("")
# with open('../feature_names.pickle', 'r') as pickled: # feature_names = pickle.load(pickled) print "Loaded data; testing classifier..." features_train, labels_train = ClassBalancingClassifierWrapper.rebalance( features_train, labels_train, ratio=2) results = [] for i in range(15): print 'Round', i classifier = DecisionTreeClassifier() classifier = SKLPipeline([('feature_selection', SelectPercentile(f_classif, 1)), ('classification', classifier)]) classifier.fit(features_train, labels_train) labels_test_predicted = classifier.predict(features_test) results.append(diff_binary_vectors(labels_test_predicted, labels_test_gold)) # support = classifier.steps[0][1].get_support(True) # print 'Selected', len(support), 'features:' # for index in support: # print ' ', feature_names[index] print 'Results:' print ClassificationMetrics.average(results, False) # Visualize last round '''
bestIndex = j bestP = data["pred_" + top16tags[j]][i] labelY.append(bestIndex) bestPossibleY.append(bestP) print("Label Y stat:") labelYStat = defaultdict(lambda: 0) for ly in labelY: labelYStat[ly] = labelYStat[ly] + 1 for i in range(0, 16): print("\tindex " + str(i) + ": " + str(labelYStat[i])) model_to_show = DecisionTreeClassifier(random_state=42, max_depth=5) model = DecisionTreeClassifier(random_state=42, max_depth=30) # model = RandomForestClassifier(n_estimators=25, random_state=42) model.fit(X, labelY) model_to_show.fit(X, labelY) tree.export_graphviz(model_to_show, out_file=OUTPUT_TREE_FILE, feature_names=columns, label='none') predY = model.predict(X) print("Pred Y stat:") predYStat = defaultdict(lambda: 0) for py in predY: predYStat[py] = predYStat[py] + 1 for i in range(0, 16): print("\tindex " + str(i) + ": " + str(predYStat[i]))
from sklearn.metrics import f1_score
from sklearn.datasets import make_classification
# Public import path: the private ``sklearn.tree.tree`` module no longer exists.
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic problem with many features, only a few of them informative.
X, Y = make_classification(
    n_samples=10000, n_features=1000, n_informative=10)
trai_x, test_x, trai_y, test_y = train_test_split(X, Y, train_size=0.8)

# Decision tree: fit, then report F1 on the training and test splits.
clf = DecisionTreeClassifier()
model = clf.fit(trai_x, trai_y)
trai_yp = model.predict(trai_x)
test_yp = model.predict(test_x)
print(
    model,
    "F1, Trai:%.6f, Test:%.6f" % (f1_score(trai_y, trai_yp),
                                  f1_score(test_y, test_yp))
)

# Logistic regression: same fit and predict steps.
clf = LogisticRegression()
model = clf.fit(trai_x, trai_y)
trai_yp = model.predict(trai_x)
test_yp = model.predict(test_x)
def DeepBoost(features,
              labels,
              n_steps,
              max_depth_range,
              PARAM_lambda,
              PARAM_beta,
              initial_weights=None):
    """Train a DeepBoost-style ensemble of decision trees.

    Each round looks for the direction with the largest absolute gradient
    (as computed by ``gradient``): either a tree already in the ensemble
    whose weight is at least ``kTolerance`` in magnitude, or a new tree of
    one of the candidate depths fitted with the current sample weights. The
    chosen tree's weight is changed by the step from ``compute_eta`` and the
    sample weights are re-scaled and re-normalised.

    Parameters
    ----------
    features : 2-D array (``shape[0]`` samples, ``shape[1]`` features)
    labels : array-like of training targets
    n_steps : int, maximum number of boosting rounds
    max_depth_range : iterable of candidate tree depths
    PARAM_lambda, PARAM_beta : passed through to ``gradient``/``compute_eta``
    initial_weights : array-like or None
        Starting sample weights; uniform when ``None``. The input is copied,
        never modified.

    Returns
    -------
    (clf_list, weights)
        ``clf_list`` holds ``[classifier, alpha, depth]`` triples.
    """
    num_features = features.shape[1]
    sample_size = features.shape[0]
    # ``is None`` rather than ``== None``: comparing an array with ``==`` is
    # element-wise and cannot be used as a condition. Supplied weights used
    # to be ignored altogether, leaving ``weights`` undefined.
    if initial_weights is None:
        weights = np.ones(sample_size) / sample_size
    else:
        weights = np.array(initial_weights, dtype=float)
    normalizer = np.exp(1) * sample_size
    clf_list = []

    for t in range(n_steps):
        best_error = 0
        best_grad = 0
        best_index = -1
        old_tree_is_best = False

        # Candidate 1: re-weight a tree that is already in the ensemble.
        for j, (old_clf, alpha, tree_depth) in enumerate(clf_list):
            if abs(alpha) >= kTolerance:
                error = eval_clf(old_clf, features, labels, weights)
                edge = error - 0.5
                sign_edge = np.sign(edge)
                grad = gradient(error, tree_depth, alpha, sign_edge,
                                sample_size, num_features, normalizer,
                                PARAM_lambda, PARAM_beta)
                if abs(grad) > abs(best_grad):
                    best_grad = grad
                    best_error = error
                    best_index = j
                    old_tree_is_best = True

        # Candidate 2: add a new tree of one of the allowed depths.
        best_new_clf = None
        best_depth = -1
        for depth in max_depth_range:
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            new_grad = gradient(new_error, depth, 0, new_sign_edge,
                                sample_size, num_features, normalizer,
                                PARAM_lambda, PARAM_beta)
            if abs(new_grad) > abs(best_grad):
                best_new_clf = new_clf
                best_grad = new_grad
                best_error = new_error
                best_depth = depth
                old_tree_is_best = False

        if old_tree_is_best:
            clf, alpha, depth = clf_list[best_index]
            eta = compute_eta(best_error, depth, alpha, sample_size,
                              num_features, normalizer, PARAM_lambda,
                              PARAM_beta)
            clf_list[best_index][1] += eta
        elif best_new_clf is not None:
            alpha = 0
            clf = best_new_clf
            depth = best_depth
            eta = compute_eta(best_error, depth, alpha, sample_size,
                              num_features, normalizer, PARAM_lambda,
                              PARAM_beta)
            clf_list.append([clf, eta, depth])
        else:
            # No candidate has a non-zero gradient, so there is nothing to
            # update (previously this hit an undefined or stale tree).
            break

        # Shrink the weight of correctly classified samples, grow the others,
        # then renormalise; the running normaliser accumulates the product.
        y_predict = clf.predict(features)
        correct = np.asarray(y_predict == labels)
        weights = weights * np.exp(np.where(correct, -eta, eta))
        old_normalizer = normalizer
        normalizer = weights.sum()
        weights = weights / normalizer
        normalizer = normalizer * old_normalizer
    return clf_list, weights
print("rmse: " + str(rmse))
# Element 1 of rmseEval's result is what gets reported as the RMSE here.
rmse = rmseEval(data["tw"]['target'], combinedPrediction2)[1]
print("rmse: " + str(rmse))

print("identification test:")
# Use every column except the target, prediction, timestamp and location
# columns as input features for the identification classifier.
identificationColumns = []
for c in columns["all"]:
    if c not in ['target', 'prediction', 'timestamp', 'location']:
        identificationColumns.append(c)

clf = DecisionTreeClassifier()
traintestX = generateTrainingData(data["all"], identificationColumns)
clf = clf.fit(traintestX, label)
# Predictions are made on the same data the tree was fitted on.
prediction = clf.predict(traintestX)
a = accuracy_score(label, prediction)
print(str(a))
# normalize=False yields the number of correct predictions, not a fraction.
a = accuracy_score(label, prediction, normalize=False)
print(str(a))
a = confusion_matrix(label, prediction)
print(str(a))

# with open(OUTPUT_DIRECTORY + "dt.dot", 'w') as f:
#     f = tree.export_graphviz(clf, out_file=f, feature_names=identificationColumns)#, max_depth=10)
def train(dataset):
    """Train each boosting variant on ``dataset`` and print its errors.

    The data is loaded with ``fetch_data_from_raw`` when ``dataset`` is
    ``'spambase'`` and with ``fetch_npy_data`` otherwise. DeepBoost, DeepBBM,
    a single decision tree, Boost-by-Majority, AdaBoost, MarginBoost and
    BrownBoost are then trained in that order. A short progress line is
    printed after each ensemble and the collected train/test figures are
    printed at the end. Nothing is returned.
    """
    splits = (fetch_data_from_raw('spambase') if dataset == 'spambase'
              else fetch_npy_data(dataset))
    features, labels, testing_features, true_labels = splits

    # Hyper-parameters shared by the runs below.
    PARAM_lambda = 0.001
    PARAM_lambda_2 = 0.01
    PARAM_beta = 0.001
    gamma = 0.06
    tree_depth = 15
    depth_range = list(range(1, tree_depth + 1))
    T = 200

    def errors_of(ensemble):
        # Training error first, then test error.
        return (testEnsemble(ensemble, features, labels),
                testEnsemble(ensemble, testing_features, true_labels))

    # DeepBoost
    clf_list_db, _ = DeepBoost(features, labels, T, depth_range,
                               PARAM_lambda, PARAM_beta)
    train_error_db, test_error_db = errors_of(clf_list_db)
    print('db done')

    # Deep BBM
    # A grid search over gamma in [0.15, 0.1, 0.08, 0.06], lambda_2 in
    # [1, 0.1, 0.01, 0.001, 0.0001] and max_depth in [1, 2, 3, 5, 10, 15]
    # used to live here (currently disabled). For each combination it ran
    # DeepBBM with depths 1..max_depth, printed the train/test errors and
    # saved the error arrays with np.save to 'TrErr_dbbm_ps_2' and
    # 'TeErr_dbbm_ps_2'.
    clf_list_dbbm, _ = DeepBBM(features, labels, gamma, depth_range,
                               PARAM_lambda_2)
    train_error_dbbm, test_error_dbbm = errors_of(clf_list_dbbm)
    print('dbbm done')

    # Single decision tree, scored by mean squared difference to the labels.
    dtc = DecisionTreeClassifier(max_depth=tree_depth).fit(features, labels)
    train_residual = dtc.predict(features) - labels
    train_mse_dtc = (train_residual ** 2).mean(axis=0)
    test_residual = dtc.predict(testing_features) - true_labels
    test_mse_dtc = (test_residual ** 2).mean(axis=0)

    # Boost by Majority
    clf_list_bbm, _ = BoostByMaj(features, labels, tree_depth, gamma)
    train_error_bbm, test_error_bbm = errors_of(clf_list_bbm)
    print('bbm done')

    # AdaBoost (same number of rounds as DeepBoost)
    clf_list_adb = AdaBoostClf(features, labels, tree_depth, T)
    train_error_adb, test_error_adb = errors_of(clf_list_adb)
    print('adb done')

    # MarginBoost (from our homework)
    margin = 2 ** -6
    clf_list_mb = MarginBoostClf(features, labels, tree_depth, T, margin)
    train_error_mb, test_error_mb = errors_of(clf_list_mb)
    print('mb done')

    # BrownBoost
    total_time = 100
    clf_list_brown = BrownBoost(features, labels, tree_depth, total_time)
    train_error_brown, test_error_brown = errors_of(clf_list_brown)
    print('bb done')

    # Final report, one line per figure.
    summary = [
        ('DeepBoost: train_error', train_error_db),
        ('DeepBoost: test_error', test_error_db),
        ('DeepBBM: train_error', train_error_dbbm),
        ('DeepBBM: test_error', test_error_dbbm),
        ('decision tree: train_mse', train_mse_dtc),
        ('decision tree: test_mse', test_mse_dtc),
        ('BBM: train_error', train_error_bbm),
        ('BBM: test_error', test_error_bbm),
        ('AdaBoost: train_error', train_error_adb),
        ('AdaBoost: test_error', test_error_adb),
        ('MarginBoost: train_error', train_error_mb),
        ('MarginBoost: test_error', test_error_mb),
        ('BrownBoost: train_error', train_error_brown),
        ('BrownBoost: test_error', test_error_brown),
    ]
    for caption, value in summary:
        print(caption, value)
def decision_tree_fit(X, y):
    """Fit a decision tree on ``(X, y)`` and return what ``fit`` returns.

    The tree is created with ``min_samples_leaf=5`` and ``random_state=42``.
    """
    settings = {'min_samples_leaf': 5, 'random_state': 42}
    return DecisionTreeClassifier(**settings).fit(X, y)
def _fit_and_score(model, tag):
    # Fit ``model`` on the relational training matrix, then append its F1
    # evaluation on the training and test data to the two result lists
    # under ``tag``. Returns the (train, test) predictions.
    model.fit(rel_train_X.relation_matrix, rel_train_Y.values)
    pred_train = model.predict(rel_train_X.relation_matrix)
    train_result.append((tag, evaluateByF1(pred_train, rel_train_Y.values)))
    pred_test = model.predict(test_X)
    test_result.append((tag, evaluateByF1(pred_test, test_Y)))
    return pred_train, pred_test


# Random forest
print('Run random forest....')
rr_model = RandomForestClassifier(n_estimators=100, max_depth=10,
                                  random_state=1)
rf_pred_train, rf_pred_test = _fit_and_score(rr_model, 'RF')

# Single decision tree, reported under the 'ID3' tag
print('Run decision tree....')
id3_model = DecisionTreeClassifier(max_depth=10, random_state=1)
id3_pred_train, id3_pred_test = _fit_and_score(id3_model, 'ID3')

print('Performance of CBA and CMAR with different measures:')
for results in (train_result, test_result):
    printList(results)
from numpy import loadtxt
from sklearn.model_selection import train_test_split
# Import from the public package: ``sklearn.tree.tree`` is a private module
# path that newer scikit-learn releases no longer provide.
from sklearn.tree import DecisionTreeClassifier


def func(x):
    """Convert the raw field of column 32 to a float.

    Returns 0.0 for the text ``False`` and 1.0 for anything else. Both
    ``bytes`` and ``str`` are accepted because ``loadtxt`` hands converters
    one or the other depending on the NumPy version and ``encoding``.
    """
    if isinstance(x, bytes):
        x = x.decode()
    return 0.0 if x == 'False' else 1.0


# Load the table (header row skipped); the last column is the target.
all_data = loadtxt('flare.csv', delimiter=',', skiprows=1,
                   converters={32: func})
target = all_data[:, -1]
data = all_data[:, 0:-1]

# Fixed seed so the 75/25 split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data, target, test_size=0.25, random_state=100)

# Section A (SEIF A) - train the tree without trimming
clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(train_x, train_y)
success_rate = clf.score(test_x, test_y)

# Section B (SEIF B) - trim: every leaf must hold at least 20 samples
clf = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=20)
clf.fit(train_x, train_y)
success_rate = clf.score(test_x, test_y)
# Age regression target: rows where Age is known.
age_known = data.Age.notnull()
y_train_age = data.loc[age_known, ['Age']]
regresor.fit(X_train_age, y_train_age)

# TODO: verify this age prediction; it seems to work OK
# data['AgePredicted'] = np.where(pd.isnull(data.Age), regresor.predict(data[['Title', 'SibSp', 'Parch']]), None)
# Fill only the missing ages with the regressor's estimate.
age_estimates = regresor.predict(data[['Title', 'SibSp', 'Parch']])
data['Age'] = np.where(data.Age.isnull(), age_estimates, data['Age'])

## floor (level) prediction
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2)
floor_known = data.Floor.notnull()
X_train_floor = data.loc[floor_known, ['Embarked', 'Pclass']]
y_train_floor = data.loc[floor_known, 'Floor'].values.astype('int')
classifier.fit(X_train_floor, y_train_floor)
# Fill only the missing floors with the classifier's prediction.
floor_estimates = classifier.predict(data[['Embarked', 'Pclass']])
data['Floor'] = np.where(data.Floor.isnull(), floor_estimates, data['Floor'])

## ticket fare adjustment: divide the fare by the number of rows sharing the ticket
data['TicketCounts'] = data.groupby('Ticket')['Ticket'].transform('count')
data['Fare'] = data['Fare'].div(data['TicketCounts'])

## drop unused columns
data = data.drop(
    columns=['Ticket', 'Cabin', 'Name', 'SibSp', 'Parch', 'TicketCounts'])

## set the index on a column
store_pkl(audit_mapper, "Audit.pkl")

# The first 48 columns are the features; column 48 is the integer target.
audit_X = audit[:, :48]
audit_y = audit[:, 48].astype(int)

print(audit_X.dtype, audit_y.dtype)


def predict_audit(classifier):
    """Return predictions and class probabilities for ``audit_X``.

    The result is one DataFrame with the columns ``Adjusted``,
    ``probability_0`` and ``probability_1``.
    """
    frames = [
        DataFrame(classifier.predict(audit_X), columns=["Adjusted"]),
        DataFrame(classifier.predict_proba(audit_X),
                  columns=["probability_0", "probability_1"]),
    ]
    return pandas.concat(frames, axis=1)


def _fit_and_store(classifier, pkl_name, csv_name):
    # Fit on the audit data, persist the model, then persist its predictions.
    classifier.fit(audit_X, audit_y)
    store_pkl(classifier, pkl_name)
    store_csv(predict_audit(classifier), csv_name)
    return classifier


audit_tree = _fit_and_store(
    DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
    "DecisionTreeAudit.pkl", "DecisionTreeAudit.csv")

audit_forest = _fit_and_store(
    RandomForestClassifier(random_state=13, min_samples_leaf=5),
    "RandomForestAudit.pkl", "RandomForestAudit.csv")

audit_regression = _fit_and_store(
    LogisticRegression(),
    "RegressionAudit.pkl", "RegressionAudit.csv")
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    """Fit a default decision tree on ``(X, y)`` and predict one sample.

    ``current_state_to_predict`` is converted to an array and reshaped to a
    single row before it is passed to ``predict``; the value returned by
    ``predict`` is handed back unchanged.
    """
    model = DecisionTreeClassifier()
    model.fit(X, y)
    single_row = np.array(current_state_to_predict).reshape(1, -1)
    return model.predict(single_row)
dot_filename = mkstemp(suffix='.dot', dir=tmp_dir)[1] with open(dot_filename, "w") as out_file: export_graphviz(clf, out_file=out_file, feature_names=feature_names, class_names=class_names, filled=True, rounded=True, special_characters=True) from IPython.display import Image image_filename = image_filename or ('%s.png' % dot_filename) subprocess.call(('dot -Tpng -o %s %s' % (image_filename, dot_filename)).split(' ')) image = Image(filename=image_filename) os.remove(dot_filename) return image from sklearn import datasets from sklearn.tree.tree import DecisionTreeClassifier %matplotlib inline iris = datasets.load_iris() X = iris.data Y = iris.target clf = DecisionTreeClassifier(max_depth=3) clf.fit(X, Y) convert_decision_tree_to_ipython_image(clf, image_filename='tree.png')