def init_classifiers(model_condig, observations, target):
    """Tune one classifier per supported model name in the configuration.

    For every key of ``model_condig`` that names a supported model
    (``'svm'``, ``'decision_tree'``, ``'random_forest'`` or ``'adaboost'``)
    a fresh estimator is handed to ``search_best_param_for_model`` together
    with that key's tuning parameters; whatever the helper returns is stored
    under the same key.  Unsupported keys are announced and then skipped.

    Args:
        model_condig: Mapping of model name to its tuning parameters.  The
            ``'adaboost'`` entry is first passed through
            ``adjust_adaboost_param``.
        observations: Training observations, forwarded to the search helper.
        target: Training targets, forwarded to the search helper.

    Returns:
        dict: supported model name -> result of
        ``search_best_param_for_model`` for that model.
    """
    # Factories are lazy so an estimator is only built for configured keys.
    estimator_factories = {
        'svm': lambda: SVC(),
        'decision_tree': lambda: DecisionTreeClassifier(),
        'random_forest': lambda: RandomForestClassifier(),
        'adaboost': lambda: AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier()),
    }

    classifiers = {}
    for key in model_condig.keys():
        # %-formatting prints the same text under Python 2 and Python 3
        # (the original used a Python-2-only print statement).
        print('Initializing classigier  %s' % (key,))
        make_estimator = estimator_factories.get(key)
        if make_estimator is None:
            continue
        estimator = make_estimator()
        tuning_param = model_condig[key]
        if key == 'adaboost':
            tuning_param = adjust_adaboost_param(tuning_param)
        classifiers[key] = search_best_param_for_model(
            key, estimator, tuning_param, observations, target)
    return classifiers
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    """Fit a default decision tree on ``(X, y)`` and predict one sample.

    Args:
        X: Training samples passed to ``DecisionTreeClassifier.fit``.
        y: Training targets passed to ``DecisionTreeClassifier.fit``.
        current_state_to_predict: A single sample; it is converted to an
            array and reshaped to one row before prediction.

    Returns:
        The output of ``predict`` for that single-row input.
    """
    tree = DecisionTreeClassifier()
    tree.fit(X, y)
    single_row = np.array(current_state_to_predict).reshape(1, -1)
    return tree.predict(single_row)
def BoostByMaj(features, labels, max_depth, gamma):
    """Train a list of depth-limited decision trees with re-weighted samples.

    The number of rounds ``k`` comes from ``get_k_from_gamma(gamma,
    sample_size)``.  Each round fits a tree on the current sample weights,
    adds the per-sample "predicted correctly" indicator to ``counts`` and
    recomputes the weights from ``comb`` and powers of ``0.5 +/- gamma``,
    then L1-normalises them.

    Args:
        features: Training samples; ``features.shape[0]`` is used as the
            sample count.
        labels: Training targets, compared element-wise with predictions.
        max_depth: ``max_depth`` of every ``DecisionTreeClassifier``.
        gamma: Edge parameter used for ``k`` and in the weight formula.

    Returns:
        tuple: ``(clf_list, weights)`` where ``clf_list`` holds
        ``[classifier, 1]`` pairs (one per round) and ``weights`` is the
        final normalised weight vector.
    """
    sample_size = features.shape[0]
    # Start from uniform sample weights.
    weights = np.ones(sample_size) / sample_size
    # Per-sample running count of rounds in which the sample was classified
    # correctly.
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    print('k ', k)
    clf_list = []
    for i in range(k):
        # 0.6 forces at least one pass through the retry loop below.
        estimator_error = 0.6
        countdown = 10
        # Refit (up to 11 attempts) while the weighted error is >= 0.5.
        # NOTE(review): every attempt uses the same data and weights, so a
        # retry only differs if the tree fit itself is randomised - confirm.
        while ((estimator_error >= 0.5) and (countdown >= 0)):
            clf = DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(features, labels, sample_weight=weights)
            y_predict = clf.predict(features)
            correct_ones = y_predict == labels
            incorrect_ones = y_predict != labels
            estimator_error = np.mean(
                np.average(incorrect_ones, weights=weights, axis=0))
            unweighted_estimator_error = np.mean(
                np.average(incorrect_ones, axis=0))
            countdown -= 1
        # The last attempt's classifier is kept even if its error is >= 0.5.
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - i - 1 + counts
        # NOTE(review): under Python 2 with an integer k, ``k / 2`` is floor
        # division, making the floor and ceil terms equal - confirm target
        # interpreter.
        weights = comb(k - i - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        print('i', i, 'error', estimator_error, 'unweighted_error',
              unweighted_estimator_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        # NOTE(review): no guard against an all-zero weight vector before the
        # division below.
        weights = weights / np.linalg.norm(weights, ord=1)
        clf_list.append([clf, 1])
    return clf_list, weights
def MarginBoostClf(features, labels, max_depth, n_steps, margin):
    """Boost depth-limited decision trees with a margin-adjusted step size.

    Each round fits a tree on the current sample weights and measures its
    weighted error.  Boosting stops early as soon as that error reaches 0.5.
    Otherwise the round's step size is
    ``0.5 * (log((1 - err) / err) + log(1 - margin) - log(1 + margin))`` and
    every sample weight is multiplied by ``exp(-step) / norm`` when the
    sample was classified correctly and by ``exp(step) / norm`` otherwise,
    with ``norm = 2 * sqrt(err * (1 - err))``.

    Args:
        features: Training samples; ``features.shape[0]`` is the sample count.
        labels: Training targets, compared element-wise with predictions.
        max_depth: ``max_depth`` of every ``DecisionTreeClassifier``.
        n_steps: Maximum number of boosting rounds.
        margin: Margin term entering the step size.

    Returns:
        list: ``[classifier, step_size]`` pairs, one per completed round.
    """
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    clf_list = []
    for _ in range(n_steps):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        # Positional boolean mask of the misclassified samples.
        incorrect = np.asarray(y_predict != labels)

        # Weighted error fraction.
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        if estimator_error >= 0.5:
            break

        # NOTE(review): a perfect learner (error == 0) makes both the log
        # term and norm_factor degenerate; that case is not handled here.
        step_size = 0.5 * (np.log((1 - estimator_error) / estimator_error) +
                           np.log(1 - margin) - np.log(1 + margin))
        norm_factor = 2 * pow(estimator_error * (1 - estimator_error), 0.5)

        # Vectorised re-weighting: reuses the mask instead of re-comparing
        # labels one element at a time in a Python loop.
        weights *= np.where(incorrect, np.exp(step_size),
                            np.exp(-step_size)) / norm_factor
        clf_list.append([clf, step_size])
    return clf_list
def train_ensemble_decision_tree_classifier():
    """Train three decision-tree variants, save their vote, report accuracy.

    Reads the module-level ``train_features``.  The voted classifier is
    written to ``voted_classifier_decision_tree.pickle`` via
    ``save_classifier``; the two reporting helpers are then called with the
    individual classifiers.
    """
    # Tuned knobs: min_samples_split, min_samples_leaf, max_leaf_nodes, splitter
    default_tree = SklearnClassifier(
        DecisionTreeClassifier(random_state=0), sparse=False)
    depth20_tree = SklearnClassifier(
        DecisionTreeClassifier(max_depth=20,
                               min_samples_split=3,
                               min_samples_leaf=4,
                               max_leaf_nodes=35,
                               splitter='best',
                               random_state=0),
        sparse=False)
    depth30_tree = SklearnClassifier(
        DecisionTreeClassifier(max_depth=30,
                               min_samples_split=2,
                               min_samples_leaf=2,
                               max_leaf_nodes=40,
                               splitter='best',
                               random_state=0),
        sparse=False)
    test_classifiers = [default_tree, depth20_tree, depth30_tree]

    # ``train`` is called on each candidate in order and its return value
    # is what gets voted on.
    trained_classifiers = [
        candidate.train(train_features) for candidate in test_classifiers
    ]

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier,
                    'voted_classifier_decision_tree.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
def fit(self, X, y):
    """Fit the base classifier (optionally) and one decision tree per label.

    The base classifier's ``predict_proba`` output for ``X`` is reduced by
    ``self._get_top_labels`` and turned into per-label training samples by
    ``self._extract_features``.  For every label a gini decision tree is
    trained on those samples, using all but the last column as features and
    the last column as target; the trees are stored in
    ``self.meta_classifiers`` keyed by label.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, handed to the base classifier.

    y : array, shape = [n_samples]
        Target values (integers)

    Returns
    -------
    self : object
        The fitted estimator, so that calls can be chained.
    """
    self.y = y
    # The base classifier may have been fitted by the caller already.
    if self.fit_base:
        self.base_classifier.fit(X, y)
    distances = self.base_classifier.predict_proba(X)
    topNIndices, topNDistances = self._get_top_labels(distances)
    training_data = self._extract_features(topNIndices, topNDistances, y,
                                           distances)

    # create a decision tree for each label
    self.meta_classifiers = {}
    for label, training_samples_of_label in training_data.items():
        # Plain 2-D ndarray instead of np.matrix: scikit-learn estimators
        # reject np.matrix input in current releases.
        samples = np.atleast_2d(np.asarray(training_samples_of_label))
        decision_tree = DecisionTreeClassifier(criterion="gini")
        decision_tree.fit(samples[:, 0:-1], samples[:, -1:])
        self.meta_classifiers[label] = decision_tree
    return self
def dtree(X, y, model_path):
    """Fit a default decision tree, print training-set metrics, and save it.

    The classification report and confusion matrix are computed on the same
    data the model was fitted on.

    Args:
        X: Training samples.
        y: Training targets.
        model_path: Destination passed to ``joblib.dump``.
    """
    model = DecisionTreeClassifier()
    model.fit(X, y)

    predicted = model.predict(X)
    report = metrics.classification_report(y, predicted)
    print(report)
    confusion = metrics.confusion_matrix(y, predicted)
    print(confusion)

    joblib.dump(model, model_path)
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    """Return the hold-out accuracy of a default decision tree.

    Args:
        X: Samples.
        y: Targets.
        relative_test_size: ``test_size`` passed to ``train_test_split``
            (the split uses ``random_state=42``).

    Returns:
        The value of ``accuracy_score`` on the held-out part.
    """
    split = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)
    X_train, X_test, y_train, y_test = split

    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    return accuracy_score(tree.predict(X_test), y_test)
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    """Train a default decision tree and score it on a held-out split.

    Args:
        X: Samples.
        y: Targets.
        relative_test_size: Forwarded as ``test_size`` to
            ``train_test_split``; the split is seeded with
            ``random_state=42``.

    Returns:
        Accuracy computed by ``accuracy_score`` for the test partition.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)
    model = DecisionTreeClassifier()
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    return accuracy_score(predictions, test_y)
def create_decision_tree(self):
    '''
    Fit the decision tree with the parameters that performed best in the
    experiments, print its test predictions and statistics, and export the
    test frame (with the predictions added as ``learning_label``) to
    ``output/feature_extraction.csv``.
    '''
    best_params = dict(max_depth=65,
                       min_samples_split=0.03,
                       min_samples_leaf=3,
                       max_features=8)
    tree = DecisionTreeClassifier(**best_params)
    tree.fit(self.X_train, self.Y_train)

    predicted_y = tree.predict(self.X_test)
    print(predicted_y)
    self.print_stats(predicted_y, "")

    # Attach the predictions to the test frame and write it out.
    self.test_df['learning_label'] = predicted_y
    self.test_df.to_csv('output/feature_extraction.csv',
                        encoding="latin-1")
def fit(self, X, y=None):
    """Build a fresh ``SKLModel`` from the stored hyperparameters and fit it.

    ``y`` is only forwarded to the wrapped model when it is not ``None``.

    Returns:
        ``self``.
    """
    model = SKLModel(**self._hyperparams)
    self._sklearn_model = model
    if y is None:
        model.fit(X)
    else:
        model.fit(X, y)
    return self
def decision_tree():
    """Train a text-classification decision tree and predict ``testSet``.

    Reads the module-level frames ``data`` (columns ``conversation`` and
    ``category``) and ``testSet`` (column ``conversation``).  ``data`` is
    split at random, roughly 70/30, into a training and an evaluation part;
    a count -> tf-idf -> decision-tree pipeline is fitted on the former and
    its classification report on the latter is printed.

    Returns:
        The pipeline's predictions for ``testSet["conversation"]``.
    """
    # Single-argument print(...) calls behave identically on Python 2 and 3
    # (the original used Python-2-only print statements).
    print("Run Decision Tree")
    pipeline = Pipeline([('count', CountVectorizer(ngram_range=(1, 2))),
                         ('tfidf', TfidfTransformer()),
                         ('classify', DecisionTreeClassifier())])

    print("Splitting into training and testing")
    # Unseeded random mask: the split differs from run to run.
    cutoff = np.random.rand(len(data)) < 0.7
    train = data[cutoff]
    test = data[~cutoff]

    conversationsX = train["conversation"].values
    conversationsY = train["category"].values
    testX = test["conversation"].values
    testY = test["category"].values
    predictX = testSet["conversation"].values

    pipeline.fit(conversationsX, conversationsY)
    testYResults = pipeline.predict(testX)
    report = classification_report(testY, testYResults)
    print(report)

    predictions = pipeline.predict(predictX)
    return predictions
def build_audit(classifier, name, with_proba=True):
    """Fit a PMML pipeline on the audit data and store model + predictions.

    The pipeline (column mapper followed by ``classifier``) is fitted on the
    module-level ``audit_X`` / ``audit_y``, stored as ``<name>.pkl`` and its
    predictions on ``audit_X`` are stored as ``<name>.csv``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Base name of the two output files.
        with_proba: When equal to ``True``, two probability columns are
            appended to the stored predictions.
    """
    employment_steps = [
        LabelBinarizer(),
        SelectFromModel(
            EstimatorProxy(DecisionTreeClassifier(random_state=13)),
            threshold="1.25 * mean"),
    ]
    education_steps = [
        LabelBinarizer(),
        SelectorProxy(
            SelectFromModel(
                EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                threshold="median")),
    ]
    column_spec = [
        ("Age", ContinuousDomain()),
        ("Employment", employment_steps),
        ("Education", education_steps),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain()),
    ]
    mapper = DataFrameMapper(column_spec)

    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, probabilities), axis=1)
    store_csv(adjusted, name + ".csv")
def train_ensemble_classifier():
    """Train SVC, naive Bayes, random forest and decision tree; save the vote.

    Reads the module-level ``train_features``.  The voted classifier is
    written to ``voted_classifier.pickle`` via ``save_classifier``; the two
    reporting helpers are then called with the individual classifiers.
    """
    naive_bayes = SklearnClassifier(GaussianNB(), sparse=False)
    svc = SklearnClassifier(SVC(degree=18, C=12), sparse=False)
    random_forest = SklearnClassifier(
        RandomForestClassifier(max_depth=100, n_estimators=10), sparse=False)
    decision_tree = SklearnClassifier(
        DecisionTreeClassifier(min_samples_split=2,
                               min_samples_leaf=2,
                               max_leaf_nodes=30,
                               splitter='best',
                               random_state=0),
        sparse=False)

    # Order matters for the reports: SVC, naive Bayes, forest, tree.
    test_classifiers = [svc, naive_bayes, random_forest, decision_tree]

    trained_classifiers = [
        candidate.train(train_features) for candidate in test_classifiers
    ]

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
def __init__(self,
             criterion='gini',
             splitter='best',
             max_depth=None,
             min_samples_split=2,
             min_samples_leaf=1,
             min_weight_fraction_leaf=0.0,
             max_features=None,
             random_state=None,
             max_leaf_nodes=None,
             min_impurity_decrease=0.0,
             min_impurity_split=None,
             class_weight='balanced',
             presort=False):
    """Record the hyperparameters and build the wrapped ``Op`` from them.

    Every argument is stored under its own name in ``self._hyperparams``,
    which is then expanded into the ``Op`` constructor.
    """
    self._hyperparams = dict(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort,
    )
    self._wrapped_model = Op(**self._hyperparams)
def sklearn_supervised(data=None,
                       label=None,
                       model_savepath='./models/classify.model',
                       model_name='SVM',
                       **sklearn_param):
    '''
    Train a scikit-learn classifier and optionally save it to disk.

    :param data: training text features
    :param label: labels of the training text
    :param model_savepath: path the model is saved to; ``None`` skips saving
    :param model_name: classifier to train, one of ``KNN``, ``SVM``,
        ``Logistic``, ``DecisionTree`` or ``Naivebayes``
    :param sklearn_param: keyword arguments for the classifier constructor
        (not used for ``Naivebayes``)
    :return: the trained model
    :raises ValueError: if ``model_name`` is not one of the supported names
    '''
    # Lazy builders: only the selected estimator class is looked up.
    builders = {
        'KNN': lambda: KNeighborsClassifier(**sklearn_param),
        'SVM': lambda: SVC(**sklearn_param),
        'Logistic': lambda: LogisticRegression(**sklearn_param),
        'DecisionTree': lambda: DecisionTreeClassifier(**sklearn_param),
        'Naivebayes': lambda: GaussianNB(),
    }
    if model_name not in builders:
        # Previously this case fell through and failed later with an
        # UnboundLocalError on ``model``.
        raise ValueError(
            'Unsupported model_name %r; expected one of %s' %
            (model_name, ', '.join(sorted(builders))))

    model = builders[model_name]()
    model.fit(data, label)

    if model_savepath is not None:
        joblib.dump(model, model_savepath)  # save the model
    return model
def __init__(self,
             data,
             protectedIndex,
             protectedValue,
             numRounds=20,
             weakLearner=DecisionTreeClassifier(),
             computeError=boosting.weightedLabelError):
    """Split the data, run detailed boosting and initialise the base class.

    The hypotheses and alphas returned by ``boosting.detailedBoost`` on
    ``self.trainingData`` are kept on the instance; the base class is set up
    with a default threshold of 0 and a margin range of (-1, 1).

    NOTE(review): the default ``weakLearner`` is one estimator instance
    created at definition time and shared by every call that relies on the
    default - confirm the boosting code does not mutate it.
    """
    self.splitData(data)

    boosted = boosting.detailedBoost(self.trainingData, numRounds,
                                     weakLearner, computeError)
    # First element (the combined hypothesis) is not kept.
    _, hypotheses, alphas = boosted
    self.hypotheses = hypotheses
    self.alphas = alphas

    super().__init__(defaultThreshold=0,
                     marginRange=(-1, 1),
                     protectedIndex=protectedIndex,
                     protectedValue=protectedValue)
def build_audit(classifier, name, with_proba = True, **pmml_options):
    """Fit, convert to PMML, verify and store an audit-data pipeline.

    The pipeline unions a continuous mapper with a categorical mapper
    (followed by polynomial features) and ends in ``classifier``.  It is
    fitted on the module-level ``audit_X`` / ``audit_y``, converted with
    ``make_pmml_pipeline``, configured with ``pmml_options``, verified on a
    5% sample and stored under ``name``; predictions on ``audit_X`` are
    stored under ``name`` as well.

    Args:
        classifier: Final estimator of the pipeline.
        name: Name handed to ``store_pkl`` and ``store_csv``.
        with_proba: When equal to ``True``, two probability columns are
            appended to the stored predictions.
        **pmml_options: Forwarded to ``pipeline.configure``.
    """
    continuous_columns = ["Age", "Income", "Hours"]
    continuous_mapper = DataFrameMapper([
        (continuous_columns,
         MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])

    employment_steps = [
        CategoricalDomain(),
        LabelBinarizer(),
        SelectFromModel(DecisionTreeClassifier(random_state = 13)),
    ]
    education_steps = [
        CategoricalDomain(),
        LabelBinarizer(),
        SelectFromModel(
            RandomForestClassifier(random_state = 13, n_estimators = 3),
            threshold = "1.25 * mean"),
    ]
    marital_steps = [
        CategoricalDomain(),
        LabelBinarizer(neg_label = -1, pos_label = 1),
        SelectKBest(k = 3),
    ]
    occupation_steps = [
        CategoricalDomain(),
        LabelBinarizer(),
        SelectKBest(k = 3),
    ]
    gender_steps = [
        CategoricalDomain(),
        LabelBinarizer(neg_label = -3, pos_label = 3),
    ]
    categorical_mapper = DataFrameMapper([
        (["Employment"], employment_steps),
        (["Education"], education_steps),
        (["Marital"], marital_steps),
        (["Occupation"], occupation_steps),
        (["Gender"], gender_steps),
        (["Deductions"], [CategoricalDomain()]),
    ])

    categorical_branch = Pipeline([
        ("mapper", categorical_mapper),
        ("polynomial", PolynomialFeatures()),
    ])
    features = FeatureUnion([
        ("continuous", continuous_mapper),
        ("categorical", categorical_branch),
    ])
    pipeline = Pipeline([("union", features), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)

    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    pipeline.configure(**pmml_options)

    # XGBoost models are verified with explicit tolerances.
    if isinstance(classifier, XGBClassifier):
        verify_options = dict(precision = 1e-5, zeroThreshold = 1e-5)
    else:
        verify_options = {}
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13),
                    **verify_options)
    store_pkl(pipeline, name)

    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(audit_X),
            columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, probabilities), axis = 1)
    store_csv(adjusted, name)
def compare_sklearn_dt(chess_data, chess_target, credit_data, credit_target,
                       iris_data, iris_target, lens_data, lens_target,
                       vote_data, vote_targets):
    """Print 10-fold cross-validation accuracy of one tree on five datasets.

    The same ``DecisionTreeClassifier(max_depth=5, min_samples_split=10)``
    is scored on the iris, lens, votes, credit and chess data, in that
    order.  For each dataset the mean score (as a percentage) and twice the
    standard deviation of the scores are printed.
    """
    sk_dt = DecisionTreeClassifier(max_depth=5, min_samples_split=10)

    benchmarks = (
        ('(SK-IRIS) Accuracy: {0:.2f}% (+/- {1:.2f})',
         iris_data, iris_target),
        ('(SK-LENS) Accuracy: {0:.2f}% (+/- {1:.2f})',
         lens_data, lens_target),
        ('(SK-VOTES) Accuracy: {0:.2f}% (+/- {1:.2f})',
         vote_data, vote_targets),
        ('(SK-CREDIT) Accuracy: {0:.2f}% (+/- {1:.2f})',
         credit_data, credit_target),
        ('(SK-CHESS) Accuracy: {0:.2f}% (+/- {1:.2f})',
         chess_data, chess_target),
    )
    for report_format, samples, targets in benchmarks:
        scores = cross_val_score(sk_dt, samples, targets, cv=10)
        print(report_format.format(scores.mean() * 100, scores.std() * 2))
def sklearn_titanic():
    """Train a default decision tree on ``titanic_clean.csv``; print its score.

    The columns ``cabin``, ``boat``, ``body`` and ``index`` are dropped, rows
    with missing values are removed and every object-typed column is label
    encoded.  The first 80% of the remaining rows are used for training and
    the rest for scoring; ``survived`` is the target.
    """
    # Public import paths: the private ``sklearn.tree.tree`` and
    # ``sklearn.preprocessing.label`` modules no longer exist in current
    # scikit-learn releases.
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import LabelEncoder

    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)

    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    # Ordered (not shuffled) 80/20 split.
    split_at = int(total_df.shape[0] * 0.8)
    train_df = total_df.iloc[:split_at]
    test_df = total_df.iloc[split_at:]

    clf = DecisionTreeClassifier()
    clf.fit(train_df.drop(['survived'], axis=1), train_df['survived'])
    print(clf.score(test_df.drop(['survived'], axis=1), test_df['survived']))
def use():
    """Exercise ``train`` / ``save`` / ``load`` with a tree on the iris data.

    A default decision tree is trained on the iris dataset, saved to
    ``model.pkl``, loaded back, and the loaded model's predictions on the
    iris samples are printed.
    """
    # test use
    # Public import path: the private ``sklearn.tree.tree`` module no
    # longer exists in current scikit-learn releases.
    from sklearn.tree import DecisionTreeClassifier
    import sklearn.datasets

    path = 'model.pkl'
    iris = sklearn.datasets.load_iris()
    model = DecisionTreeClassifier()

    train(model, iris.data, iris.target)
    save(model, path)
    model = load(path)
    print(model.predict(iris.data))
def forest_fit(self, X, y):
    """Fit ``self.n_estimators`` trees, plus an SVC on every fifth index.

    Trees are stored in ``self.trees`` under ``"tree<i>"``; for every ``i``
    divisible by 5 an ``SVC`` is additionally fitted and stored under
    ``"SVM<i>"``.  All models are fitted on the full ``(X, y)``.

    Args:
        X: Training samples.
        y: Training targets.
    """
    for i in range(self.n_estimators):
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no
        # longer accepted by current scikit-learn releases.
        tree = DecisionTreeClassifier(max_features='sqrt')
        self.trees["tree{}".format(i)] = tree
        tree.fit(X, y)

        if i % 5 == 0:
            svm = SVC()
            self.trees["SVM{}".format(i)] = svm
            svm.fit(X, y)
def train_individual_classifier():
    """Cross-validate, train and save a single decision-tree classifier.

    Reads the module-level ``train_features``.  The cross-validation report
    is produced before the classifier is trained on all of the features; the
    trained classifier is stored as ``my_classifier.pickle``.
    """
    # Other estimators (SVC, GaussianNB, RandomForestClassifier) can be
    # wrapped the same way.
    tree = DecisionTreeClassifier(random_state=0)
    classifier = SklearnClassifier(tree, sparse=False)

    print_cross_validation_experiment_result(classifier, train_features)

    classifier.train(train_features)
    save_classifier(classifier, 'my_classifier.pickle')
def adjust_adaboost_param(tuning_param):
    """Turn the AdaBoost helper keys into concrete base estimators.

    When ``base_estimator_name`` is ``'DecisionTreeClassifier'``, a
    ``base_estimator`` list is added holding one decision tree per entry of
    ``base_estimator_max_features``.  The two helper keys are then removed.

    Args:
        tuning_param: Parameter dict; it is modified in place.

    Returns:
        The same ``tuning_param`` object.
    """
    if tuning_param['base_estimator_name'] == 'DecisionTreeClassifier':
        estimators = tuning_param['base_estimator'] = []
        estimators.extend(
            DecisionTreeClassifier(max_features=max_feature)
            for max_feature in tuning_param['base_estimator_max_features'])

    # The helper keys are not estimator parameters; drop them.
    del tuning_param['base_estimator_name']
    del tuning_param['base_estimator_max_features']
    return tuning_param
def boost(trainingData,
          numRounds=20,
          weakLearner=DecisionTreeClassifier(),
          computeError=weightedLabelError):
    """Run ``adaboostGenerator`` to exhaustion and return its last hypothesis.

    Args:
        trainingData: Data handed to the generator.
        numRounds: Number of rounds handed to the generator.
        weakLearner: Weak learner handed to the generator.
        computeError: Error function handed to the generator.

    Returns:
        The first element of the final triple the generator yields.
    """
    rounds = adaboostGenerator(trainingData, weakLearner, numRounds,
                               computeError)
    # Only the last round's hypothesis is of interest.
    for roundResult in rounds:
        h, _, _ = roundResult
    return h
def train(self):
    """Fit the action classifier and the three position regressors.

    The action model is a ``DecisionTreeClassifier`` on the action data; the
    drag-start, drag-end and touch models are ``DecisionTreeRegressor``
    instances on the drag / touch data.
    """
    self.action_classifier = DecisionTreeClassifier()
    self.action_classifier.fit(self.action_data, self.action_labels)

    regression_tasks = (
        ('drag_start_classifier', self.drag_data, self.drag_start_labels),
        ('drag_end_classifier', self.drag_data, self.drag_end_labels),
        ('touch_classifier', self.touch_data, self.touch_labels),
    )
    for attribute, samples, targets in regression_tasks:
        regressor = DecisionTreeRegressor()
        # Assign before fitting, mirroring a plain "create then fit".
        setattr(self, attribute, regressor)
        regressor.fit(samples, targets)
def decision_tree_training_sets():
    """Measure a depth-8 tree for several training-set sizes; export results.

    For each training fraction the module-level ``encoded_data`` is split,
    standard-scaled (the scaler is fitted on the training part only) and a
    ``DecisionTreeClassifier(max_depth=8)`` is fitted.  Training / test
    score and timings are appended to a results frame, which is written to
    ``diabetes_dt_training_sets.xls`` at the end.
    """
    training_set_sizes = [.1, .25, .5, .75, .9]
    columns = ['Training Set Size', 'Training Score', 'Test Score',
               'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)

    # Every column except the target, in the frame's own column order.  A
    # set difference here made the feature order depend on string hashing
    # and was rebuilt on every iteration.
    feature_columns = [
        column for column in encoded_data.columns if column != 'Target'
    ]

    for training_set_size in training_set_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            encoded_data[feature_columns],
            encoded_data['Target'],
            train_size=training_set_size)

        scaler = preprocessing.StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train.astype('float32')),
            columns=X_train.columns)
        # Keep the column names on the test part as well, so that both
        # parts reach the model in the same form.
        X_test = pd.DataFrame(scaler.transform(X_test.astype('float32')),
                              columns=X_train.columns)

        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=8)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)

        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [training_set_size, train_score, test_score, end_train,
                  end_test]
        df.loc[len(df)] = values
        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('diabetes_dt_training_sets.xls')
class DecisionTreeClassifierImpl():
    """Thin wrapper that builds an ``SKLModel`` from stored hyperparameters."""

    def __init__(self,
                 criterion='gini',
                 splitter='best',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 class_weight='balanced',
                 presort=False):
        """Store every argument under its own name in ``_hyperparams``."""
        self._hyperparams = dict(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            random_state=random_state,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            class_weight=class_weight,
            presort=presort,
        )

    def fit(self, X, y=None):
        """Create a new ``SKLModel`` and fit it; ``y`` is passed if given.

        Returns ``self``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        fitted = self._sklearn_model
        return fitted.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        fitted = self._sklearn_model
        return fitted.predict_proba(X)
def decision_tree_depths():
    """Measure decision trees of increasing ``max_depth``; export results.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.  For every depth the training / test score and the time
    spent fitting and test-scoring are appended to a results frame, which is
    written to ``adult_dt.xls`` at the end.
    """
    columns = [
        'Max Depths', 'Training Score', 'Test Score', 'Train Time',
        'Test Time'
    ]
    df = pd.DataFrame(columns=columns)

    for depth in (2, 4, 6, 8, 10, 12, 16, 18, 20, 25, 30, 40):
        # Training time covers construction, the print and the fit.
        fit_started = time.time()
        dt = DecisionTreeClassifier(max_depth=depth)
        print(dt)
        dt.fit(X_train, y_train)
        train_seconds = time.time() - fit_started

        train_score = dt.score(X_train, y_train)

        score_started = time.time()
        test_score = dt.score(X_test, y_test)
        test_seconds = time.time() - score_started

        values = [depth, train_score, test_score, train_seconds,
                  test_seconds]
        df.loc[len(df)] = values
        print(' '.join(map(str, columns)))
        print(' '.join(map(str, values)))

    df.to_excel('adult_dt.xls')
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit, convert to PMML, customise and store an audit-data pipeline.

    The pipeline unions a continuous mapper with a categorical mapper
    (followed by polynomial features) and ends in ``classifier``.  It is
    fitted on the module-level ``audit_X`` / ``audit_y``, converted with
    ``make_pmml_pipeline`` and stored as ``<name>.pkl``; predictions on
    ``audit_X`` are stored as ``<name>.csv``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Base name of the two output files.
        with_proba: When equal to ``True``, two probability columns are
            appended to the stored predictions.
        **kwargs: Forwarded to ``customize`` together with ``classifier``.
    """
    continuous_mapper = DataFrameMapper([
        (column, ContinuousDomain()) for column in ("Age", "Income", "Hours")
    ])

    employment_steps = [
        CategoricalDomain(),
        LabelBinarizer(),
        SelectFromModel(DecisionTreeClassifier(random_state=13)),
    ]
    education_steps = [
        CategoricalDomain(),
        LabelBinarizer(),
        SelectFromModel(
            RandomForestClassifier(random_state=13, n_estimators=3),
            threshold="1.25 * mean"),
    ]
    marital_steps = [
        CategoricalDomain(),
        LabelBinarizer(neg_label=-1, pos_label=1),
        SelectKBest(k=3),
    ]
    occupation_steps = [
        CategoricalDomain(),
        LabelBinarizer(),
        SelectKBest(k=3),
    ]
    gender_steps = [
        CategoricalDomain(),
        LabelBinarizer(neg_label=-3, pos_label=3),
    ]
    categorical_mapper = DataFrameMapper([
        ("Employment", employment_steps),
        ("Education", education_steps),
        ("Marital", marital_steps),
        ("Occupation", occupation_steps),
        ("Gender", gender_steps),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])

    categorical_branch = Pipeline([
        ("mapper", categorical_mapper),
        ("polynomial", PolynomialFeatures()),
    ])
    features = FeatureUnion([
        ("continuous", continuous_mapper),
        ("categorical", categorical_branch),
    ])
    pipeline = Pipeline([("union", features), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)

    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, probabilities), axis=1)
    store_csv(adjusted, name + ".csv")
def detailedBoost(trainingData,
                  numRounds=20,
                  weakLearner=DecisionTreeClassifier(),
                  computeError=weightedLabelError,
                  diagnostic=None):
    """Run ``adaboostGenerator`` to exhaustion, reporting every round.

    Args:
        trainingData: Data handed to the generator.
        numRounds: Number of rounds handed to the generator.
        weakLearner: Weak learner handed to the generator.
        computeError: Error function handed to the generator.
        diagnostic: Optional callable; when given it receives, after each
            round, a dict with the keys ``'h'``, ``'hypoheses'`` and
            ``'alphas'``.

    Returns:
        tuple: the final ``(h, hypotheses, alphas)`` triple yielded by the
        generator.
    """
    rounds = adaboostGenerator(trainingData, weakLearner, numRounds,
                               computeError)
    for h, hypotheses, alphas in rounds:
        if diagnostic is None:
            continue
        diagnostic({'h': h, 'hypoheses': hypotheses, 'alphas': alphas})
    return h, hypotheses, alphas
def DeepBBM2(features, labels, max_depth, gamma, max_depth_range):
    """Train a list of trees, choosing each round's depth by penalised error.

    The number of rounds ``k`` comes from ``get_k_from_gamma(gamma,
    sample_size)``.  In every round one tree per depth in
    ``max_depth_range`` is fitted on the current sample weights and the tree
    with the smallest ``error + PARAM_lambda_2 * rademacher`` is kept.  The
    per-sample "predicted correctly" indicator of the kept tree is added to
    ``counts`` and the weights are recomputed from ``comb`` and powers of
    ``0.5 +/- gamma``, then L1-normalised.

    Args:
        features: Training samples; ``shape[0]`` is the sample count and
            ``shape[1]`` the feature count.
        labels: Training targets, compared element-wise with predictions.
        max_depth: Not used by this function.
        gamma: Edge parameter used for ``k`` and in the weight formula.
        max_depth_range: Iterable of candidate tree depths.

    Returns:
        tuple: ``(clf_list, weights)`` where ``clf_list`` holds
        ``[classifier, 1]`` pairs (one per round) and ``weights`` is the
        final normalised weight vector.
    """
    num_features = features.shape[1]
    sample_size = features.shape[0]
    # Start from uniform sample weights.
    weights = np.ones(sample_size) / sample_size
    # NOTE(review): D_weights is never read afterwards.
    D_weights = np.ones(sample_size) / sample_size
    # Per-sample running count of rounds in which the sample was classified
    # correctly by the selected tree.
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    normalizer = np.exp(1) * sample_size
    print('k ', k)
    clf_list = []
    # One complexity term per candidate depth, in iteration order.
    rademacher_list = []
    for depth in max_depth_range:
        rademacher_list.append(
            calc_rademacher(depth, sample_size, num_features, normalizer))
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_depth = -1
        # Placeholder; replaced as soon as any candidate has loss < 10000.
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth in max_depth_range:
            # NOTE(review): the result of this DeepBoost call is never used
            # below - confirm whether the call is still needed.
            new_clf_list, new_weights = DeepBoost(features,
                                                  labels,
                                                  1,
                                                  max_depth_range,
                                                  initial_weights=weights)
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            # NOTE(review): new_edge / new_sign_edge are computed but unused.
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            # The complexity term is looked up by ``depth - 1``, i.e. by the
            # depth value rather than by the candidate's position.
            # NOTE(review): only matches rademacher_list if max_depth_range
            # is 1, 2, 3, ... - confirm.
            new_loss = new_error + PARAM_lambda_2 * rademacher_list[depth - 1]
            # print ('new_error', new_error, 'new_grad', new_grad)
            print('depth', depth, 'new_error', new_error, 'new_grad',
                  new_loss)
            if (new_loss < best_loss):
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error
                best_depth = depth
        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones
        # if (best_error >= 0.5):
        # break;
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        print('i', t, 'error', best_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        # NOTE(review): no guard against an all-zero weight vector before the
        # division below.
        weights = weights / np.linalg.norm(weights, ord=1)
        clf_list.append([best_clf, 1])
    return clf_list, weights
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    """Train a default decision tree and predict a single state.

    Args:
        X: Training samples.
        y: Training targets.
        current_state_to_predict: One sample; converted to an array and
            reshaped to a single row before it is passed to ``predict``.

    Returns:
        The prediction output for that one-row input.
    """
    model = DecisionTreeClassifier()
    model.fit(X, y)
    state_row = np.array(current_state_to_predict).reshape(1, -1)
    predicted_state = model.predict(state_row)
    return predicted_state
class Model(object):
    """
    The machine learning component of the tester. This component stores
    four different models:

    1) A model to decide between different types of events (drags and
       touches).
    2) A model to decide on the starting position for drags.
    3) A model to decide on the ending position for drags.
    4) A model to decide on the position of the touch.

    The input data are all the different known UI elements on the screen
    from the training data and whether or not they are visible on the
    screen. To acquire this, we first get the stored XML model and record
    the resource-id and class. We concatenate them into an array and mark
    as (1) for visible and (0) for not visible.
    """

    def __init__(self):
        """Start with no training data and no fitted models."""
        # Symbol table shared between training and prediction; filled in by
        # parse_events.
        self.symbols = {}
        self.action_data = None
        self.action_labels = None
        self.action_classifier = None
        self.drag_data = None
        self.drag_end_labels = None
        self.drag_end_classifier = None
        self.drag_start_labels = None
        self.drag_start_classifier = None
        self.touch_data = None
        self.touch_labels = None
        self.touch_classifier = None
        # Read from the module-level ``device``; predict() uses its
        # "displayWidth" entry.
        self.device_info = device.info

    def parse_events(self, queue):
        """Drain ``queue`` and build the training arrays from its events.

        Every event contributes one feature row (its start state as a list,
        with element 0 replaced by a random number).  Drag, touch and back
        events add an action label; drags and touches also add position
        targets.  The results are stored on the instance as numpy arrays.
        """
        symbols = {"randomizer": 0}
        # NOTE(review): ``events`` is filled but never read afterwards.
        events = []
        all_data = []
        all_results = []
        drag_data = []
        drag_start_results = []
        drag_end_results = []
        touch_data = []
        touch_results = []
        while not queue.empty():
            event = queue.get()
            events.append(event)
            # presumably as_list registers newly seen symbols in ``symbols``
            # - verify against its implementation.
            lst = event.state.start.as_list(symbols)
            lst[0] = random()
            all_data.append(lst)
            # NOTE(review): a row is added to all_data for every event, but a
            # label only for drag / touch / back events; any other event type
            # would leave data and labels with different lengths.
            if event.action.is_drag():
                drag_data.append(lst)
                all_results.append(DRAG)
                start = event.changes.start()
                end = event.changes.end()
                # NOTE(review): positions are encoded as x * y here, while
                # predict() decodes with ``% width`` and ``/ width`` -
                # confirm the intended encoding.
                drag_start_results.append(start.x * start.y)
                drag_end_results.append(end.x * end.y)
            if event.action.is_touch():
                touch_data.append(lst)
                all_results.append(TOUCH)
                start = event.changes.start()
                touch_results.append(start.x * start.y)
            if event.action.is_back():
                all_results.append(BACK)
        # Rows can have different lengths; copy each into a zero-filled
        # matrix that is len(symbols) wide.
        data = np.zeros((len(all_data), len(symbols)))
        for i, item in enumerate(all_data):
            data[i, : len(item)] = item[:]
        drags = np.zeros((len(drag_data), len(symbols)))
        for i, item in enumerate(drag_data):
            drags[i, : len(item)] = item[:]
        touches = np.zeros((len(touch_data), len(symbols)))
        for i, item in enumerate(touch_data):
            touches[i, : len(item)] = item[:]
        self.symbols = symbols
        self.action_data = data
        self.action_labels = np.array(all_results)
        self.drag_data = drags
        self.drag_start_labels = np.array(drag_start_results)
        self.drag_end_labels = np.array(drag_end_results)
        self.touch_data = touches
        self.touch_labels = np.array(touch_results)

    def train(self):
        """Fit the action classifier and the three position regressors."""
        self.action_classifier = DecisionTreeClassifier()
        self.action_classifier.fit(self.action_data, self.action_labels)
        self.drag_start_classifier = DecisionTreeRegressor()
        self.drag_start_classifier.fit(self.drag_data, self.drag_start_labels)
        self.drag_end_classifier = DecisionTreeRegressor()
        self.drag_end_classifier.fit(self.drag_data, self.drag_end_labels)
        self.touch_classifier = DecisionTreeRegressor()
        self.touch_classifier.fit(self.touch_data, self.touch_labels)

    def predict(self, state):
        """Predict the next action for ``state`` and return it as an Action.

        The action type comes from the action classifier; for drags and
        touches the position regressors supply values that are split into
        coordinates using the display width.
        """
        # NOTE(review): ``input`` and ``type`` shadow builtins.
        input = state.as_list(self.symbols, False)
        input[0] = random()
        action = Action()
        # NOTE(review): ``input`` is passed to predict as a flat list and the
        # result is compared to the constants without indexing - confirm
        # this works with the scikit-learn version in use.
        type = self.action_classifier.predict(input)
        width = self.device_info["displayWidth"]
        if type == DRAG:
            start = self.drag_start_classifier.predict(input)[0]
            end = self.drag_end_classifier.predict(input)[0]
            start = Point(start % width, start / width)
            end = Point(end % width, end / width)
            action.init(ACTION_DRAG, start, end, 0.5)
        elif type == TOUCH:
            point = self.touch_classifier.predict(input)[0]
            point = Point(point % width, point / width)
            action.init(ACTION_TOUCH, point.x, point.y)
        elif type == BACK:
            action.init(ACTION_BACK)
        return action

    def save(self):
        """Placeholder; does nothing."""
        pass
### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
# Public import path: the private ``sklearn.tree.tree`` module no longer
# exists in current scikit-learn releases.
from sklearn.tree import DecisionTreeClassifier

# Newer vectorizers only offer get_feature_names_out(); older ones only
# get_feature_names().  Use whichever is available.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab_list = vectorizer.get_feature_names_out()
else:
    vocab_list = vectorizer.get_feature_names()

dtc = DecisionTreeClassifier()
dtc.fit(features_train, labels_train)
pred = dtc.predict(features_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels_test, pred)
print(accuracy)

# Report every feature whose importance exceeds 0.2, with the word it
# corresponds to.
feature_importances = dtc.feature_importances_
for i, importance in enumerate(feature_importances):
    if importance > 0.2:
        print("Importance = ", importance, " number is ", i, " word is ",
              vocab_list[i])
store_pkl(audit_mapper, "Audit.pkl")

# Columns 0..47 are the features, column 48 is the target (cast to int).
audit_X = audit[:, 0:48]
audit_y = audit[:, 48].astype(int)

print(audit_X.dtype, audit_y.dtype)


def predict_audit(classifier):
    """Return predictions and class probabilities on ``audit_X`` as a frame.

    The frame has the columns ``Adjusted``, ``probability_0`` and
    ``probability_1``.
    """
    labels = DataFrame(classifier.predict(audit_X), columns = ["Adjusted"])
    probabilities = DataFrame(
        classifier.predict_proba(audit_X),
        columns = ["probability_0", "probability_1"])
    return pandas.concat((labels, probabilities), axis = 1)


# Decision tree: model and predictions are stored.
audit_tree = DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5)
audit_tree.fit(audit_X, audit_y)
store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

# Random forest: model and predictions are stored.
audit_forest = RandomForestClassifier(random_state = 13, min_samples_leaf = 5)
audit_forest.fit(audit_X, audit_y)
store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

# Logistic regression: only the model is stored here.
audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)
store_pkl(audit_regression, "RegressionAudit.pkl")
def decision_tree_fit(X,y):
    """Fit and return a decision tree with ``min_samples_leaf=5``.

    The tree is seeded with ``random_state=42``.

    Args:
        X: Training samples.
        y: Training targets.

    Returns:
        Whatever ``DecisionTreeClassifier.fit`` returns.
    """
    tree = DecisionTreeClassifier(min_samples_leaf=5, random_state=42)
    fitted = tree.fit(X, y)
    return fitted