def compare_clf():
    """Fit four classifiers, draw their confusion matrices and compare scores.

    A linear SVM, two decision trees (gini / entropy criterion) and an LDA
    model are fitted on the module-level ``X_train`` / ``Y_train`` and used to
    predict ``X_test``.  One confusion matrix is drawn per model, followed by
    a 1x3 figure holding one bar chart per metric with one bar per classifier.

    Hyper-parameters are taken from ``svm_configure()`` and ``dt_configure()``.
    """
    c_for_svm = svm_configure()
    k_for_gini, k_for_ig = dt_configure()
    classifier = [
        SVC(kernel='linear', C=c_for_svm),
        DT(criterion='gini', max_leaf_nodes=k_for_gini),
        DT(criterion='entropy', max_leaf_nodes=k_for_ig),
        LDA(),
    ]
    clf_names = ['SVM', 'DT-gini', 'DT-IG', 'LDA']
    metrics = ['average_precision_score', 'recall_score', 'f1_score']
    p, r, f = [], [], []
    for i, clf in enumerate(classifier):
        y_pred = clf.fit(X_train, Y_train).predict(X_test)
        plot_confusion_matrix(Y_test, y_pred, classes=class_name, idx=i + 1)
        p.append(average_precision_score(Y_test, y_pred))
        # `average` has to be given by keyword: the third positional
        # parameter of these scorers is `labels`, not `average`.
        r.append(recall_score(Y_test, y_pred, average='weighted'))
        f.append(f1_score(Y_test, y_pred, average='weighted'))
    plt.show()

    # One subplot per metric; each one plots its own score list (previously
    # the precision list was drawn three times).
    scores = [p, r, f]
    positions = list(range(len(classifier)))
    for j, metric_name in enumerate(metrics):
        plt.subplot(1, 3, j + 1)
        plt.bar(positions, scores[j])
        # Classifier names belong on the ticks, not in the axis label.
        plt.xticks(positions, clf_names)
        plt.title(metric_name)
    plt.show()
def _find_best_model(x, y, z, params_grid, test_size, log_features=False):
    """
    Performs GridSearch on `params_grid`.

    PARAMETERS
    ----------
    - x (numpy array) : the input set of random variables, of shape (N, D1)
    - y (numpy array) : the target set of random variables, of shape (N, D2)
    - z (numpy array) : the conditioning set of random variables, of shape (N, D3)
    - params_grid (dict) : the hyperparameters to try out while performing
      grid search ; for more details, look up
      `sklearn.model_selection.GridSearchCV`
    - test_size (float) : the proportion of samples to be used as test data
    - log_features (bool, default=False) : if True, 'log2' will be used as
      `max_features` during the search, provided the merged input has more
      than 10 feature columns

    RETURNS
    -------
    - an unfitted `DT` estimator built from the best parameters found by the
      grid search (only the parameters present in `params_grid` are carried
      over).
    """
    model_input = _mix_merge_columns(x, z)

    # `shape` is a tuple, so it cannot be compared with an int directly;
    # the feature count is its second entry (assumes a 2-D merged input).
    n_features = model_input.shape[1]
    if log_features and n_features > 10:
        max_features = 'log2'
    else:
        max_features = 'auto'

    cv_splits = ShuffleSplit(n_splits=3, test_size=test_size)
    search = GridSearchCV(DT(max_features=max_features), params_grid,
                          cv=cv_splits, n_jobs=-1)
    best_params = search.fit(model_input, y).best_params_
    return DT(**best_params)
def trace_distribution(self, features_after, labels, u):
    """Plot cross-validated scores on shuffled versus untouched feature subsets.

    ``u`` rows are sampled from ``features_after``.  For every combination of
    five entries of ``self.lis`` two 5-fold cross-validation means are
    computed with a ``DT`` estimator: one on the columns after they were
    passed through ``Ratio().shuffle`` (the "surrogates") and one on the
    original columns.  Both score distributions are drawn as normalised
    histograms and their mean / standard deviation are printed.

    :param features_after: DataFrame holding the feature columns.
    :param labels: label container indexed by the row labels of
        ``features_after``.
    :param u: number of rows to sample.
    """
    ScoreDT = []
    DToriginal = []
    sampledfeatures = features_after.sample(u)
    index = sampledfeatures.index.tolist()
    # The labels of the sampled rows do not depend on the feature subset,
    # so they are collected once instead of on every iteration.
    label = [labels[n] for n in index]

    pl.figure(facecolor='white')
    for x in combinations(self.lis, 5):
        features = sampledfeatures.loc[:, x]
        print(features)
        features_origin = sampledfeatures.loc[:, x]
        print(features_origin)
        features = features.reset_index(drop=True)
        features.loc[:, x] = Ratio().shuffle(features.loc[:, x])

        clfTestDT = cross_val_score(DT(min_samples_split=5, random_state=2),
                                    features.values, label, cv=5).mean()
        ScoreDT.append(clfTestDT)
        clfTestDTorigin = cross_val_score(
            DT(min_samples_split=5, random_state=2),
            features_origin.values, label, cv=5).mean()
        DToriginal.append(clfTestDTorigin)

    h = sorted(ScoreDT)
    v = sorted(DToriginal)
    # `normed` was removed from matplotlib's hist(); `density` is its
    # replacement and produces the same normalised histogram.
    pl.hist(h, density=True, label='Surrogates: mean=%0.2f' % np.mean(h))
    pl.hist(v, density=True, label='Real data: mean=%0.2f' % np.mean(v))
    pl.legend(bbox_to_anchor=(0., -0.12, 1., .102), loc=3, ncol=2,
              mode="expand", borderaxespad=0.)
    pl.title(
        'Surrogate data testing for %s random uniform samples and 5 features'
        % u)
    pl.show()
    print(np.mean(h), np.std(h))
    print(np.mean(v), np.std(v))
def __init__(self, model_name='KNN', params=None):
    """Create the classifier named by ``model_name`` and load the iris data.

    :param model_name: ``'KNN'`` or ``'DT'``; any other value selects ``SVC``.
    :param params: optional sequence whose first element is used as
        ``n_neighbors`` (KNN) or ``max_depth`` (DT).  ``None`` or an empty
        sequence keeps the estimator defaults.  It is ignored for ``SVC``.
    """
    # A `None` default avoids sharing one mutable list between calls; the
    # length check also works for tuples and arrays, unlike `params != []`.
    has_params = params is not None and len(params) > 0

    if model_name == 'KNN':
        self.clf = KNN(n_neighbors=params[0]) if has_params else KNN()
    elif model_name == 'DT':
        self.clf = DT(max_depth=params[0]) if has_params else DT()
    else:
        self.clf = SVC()

    iris = datasets.load_iris()
    self.X = iris.data
    self.y = iris.target
def train(X, y, args):
    """Fit a ``DT`` estimator (``random_state=10``) on ``X`` / ``y``.

    :param X: training features.
    :param y: training targets.
    :param args: accepted for interface compatibility; not used here.
    :return: the fitted estimator.
    """
    print('start...')
    # The start timestamp that used to be taken here was never read.
    clf = DT(random_state=10)
    clf.fit(X, y)
    return clf
def DTpredictor(X_train, y_train, X_test):
    '''Decision tree classifier evaluated with stratified shuffle-split CV.

    Five stratified 80/20 splits of the training data are used to measure
    accuracy.  The model fitted on the last split then predicts ``X_test``.

    Side effects: stores the mean CV accuracy in the module-level
    ``accModels`` and the test predictions in ``predictions``, both keyed by
    the model's class name.

    Input: training data (indexed with ``.iloc``), training target, test data.
    Output: ``(y_pred, accuracy)`` where ``y_pred`` holds the predicted class
    labels for ``X_test`` and ``accuracy`` is the SUM of the per-split
    accuracies (divide by 5 for the mean).
    '''
    from sklearn.tree import DecisionTreeClassifier as DT
    from sklearn.model_selection import StratifiedShuffleSplit as SSS

    # cross validation using StratifiedShuffleSplit
    sss = SSS(n_splits=5, test_size=0.2, random_state=0)
    accuracy, count = 0, 0
    for train_ind, test_ind in sss.split(X_train, y_train):
        Xtrain, Xtest = X_train.iloc[train_ind], X_train.iloc[test_ind]
        ytrain, ytest = y_train[train_ind], y_train[test_ind]
        model = DT(random_state=1)
        model.fit(Xtrain, ytrain)
        y_pred = model.predict(Xtest)
        # Only accuracy is tracked.  The former log-loss accumulation was fed
        # hard labels instead of probabilities and its result was never used.
        accuracy += metrics.accuracy_score(ytest, y_pred)
        count += 1

    # NOTE(review): predictions come from the last split's model, which saw
    # only 80% of the training data.
    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy / count
    predictions[modelName] = y_pred
    return y_pred, accuracy
def train(xFile, yFile):
    """Train a decision tree on pickled features and print its test accuracy.

    :param xFile: path of a pickle file holding the feature array; it is
        flattened to one row per sample.
    :param yFile: path of a text file with one label per line.

    The data is split 70/30 (``random_state=42``), a ``DT`` estimator with
    ``criterion="entropy"`` and ``splitter="random"`` is fitted on the
    training part and the accuracy on the held-out part is printed.
    """
    with open(xFile, "rb") as file_r:
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        X = pickle.load(file_r)

    # Read the label data, one label per line.
    with open(yFile, "r") as yFile_r:
        labelLines = [_.strip("\n") for _ in yFile_r.readlines()]

    # The sample count comes from the label file instead of a hard-coded
    # 212841, so data sets of any size can be used.
    n_samples = len(labelLines)
    X = reshape(X, (n_samples, -1))  # e.g. (n_samples, 30 * 128)

    # Encode the string labels as integers; the result is already 1-D.
    Y = LabelEncoder().fit_transform(array(labelLines))

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=42)

    # Decision tree classifier.
    # criterion: "gini" (Gini impurity, i.e. CART) or "entropy" (information
    #   gain, as in ID3 / C4.5).
    # splitter: "best" searches all split points; "random" looks for a local
    #   optimum among a random subset, which suits very large sample counts.
    clf = DT(criterion="entropy", splitter="random")
    clf.fit(X_train, Y_train)

    # Evaluate on the held-out data.
    predict = clf.predict(X_test)
    count = sum(1 for p, t in zip(predict, Y_test) if p == t)
    print("Decison tree Accuracy is:", count / len(Y_test))
def iterate(self):
    """Evaluate ``rusBoost`` over every (n_estimators, rus_ratio) combination.

    For each pair taken from ``self.n_estimators_conf`` and
    ``self.rus_ratio_conf`` a model with a depth-2 ``DT`` base learner is
    fitted on the training data.  Train / test accuracy, train / test
    multi-class log loss and the converted classification report are stored
    as one row of ``self.results`` (a DataFrame).
    """
    # Single-argument print() calls behave the same on Python 2 and 3.
    print('-' * 80)
    print('Running RUSBoost Iterations...')

    # performance by number of estimators and under-sampling ratio
    results = []
    for ne in self.n_estimators_conf:
        for rr in self.rus_ratio_conf:
            print('Iteration: nestimators=%s, rus_ratio=%s' % (str(ne), str(rr)))
            m = rusBoost(base_learner=DT(max_depth=2),
                         n_estimators=ne,
                         rus_ratio=rr,
                         class_numsamples_dict=self.class_numsamples_dict)
            m.fit(self.xtrain, self.ytrain)

            predtrain = m.predict(self.xtrain)
            predtest = m.predict(self.xtest)
            predprobatrain = m.predict_proba(self.xtrain)
            predprobatest = m.predict_proba(self.xtest)

            # accuracy_score expects (y_true, y_pred); the value is the same
            # either way, the order is fixed for readability.
            accuracytrain = metrics.accuracy_score(self.ytrain, predtrain)
            accuracytest = metrics.accuracy_score(self.ytest, predtest)
            # The "ks_*" columns hold multi-class log loss values.
            kstrain = multiclass_log_loss(self.ytrain, predprobatrain)
            kstest = multiclass_log_loss(self.ytest, predprobatest)
            cr = self.convert_cr(
                metrics.classification_report(self.ytest, predtest))
            results.append([ne, rr, accuracytrain, accuracytest,
                            kstrain, kstest, cr])

    # Passing `columns` to the constructor also works when no combination
    # was evaluated (assigning 7 names to an empty frame would raise).
    self.results = pd.DataFrame(
        results,
        columns=['ne', 'rr', 'accuracy_train', 'accuracy_test',
                 'ks_train', 'ks_test', 'cr'])
def stat_on_train(model, train_set, val_set, is_using_val_set=True):
    """
    train a model with the train set and test on the validation set, return
    the test results and model.

    :param str model: the classification model (DT, NB or KNN)
    :param list train_set: the training set instances; the last element of
        every instance is its label, the others are numeric features
    :param list val_set: the validation set instances (same layout)
    :param boolean is_using_val_set: if is_using_val_set is True, the method
        will train the returned model using all the instances in the training
        and validation set; otherwise it will just use the instances in the
        training set.
    :return: ``(get_stat(ytest, ypred), clf)`` - the statistics always come
        from the model trained on the training set only
    :raises ValueError: if ``model`` is not one of "DT", "KNN" or "NB"
    """
    if model == "DT":
        estimator = DT()
    elif model == "KNN":
        estimator = KNN()
    elif model == "NB":
        estimator = NB()
    else:
        # Previously this called exit(), which silently terminated the whole
        # interpreter with status 0 on a mistyped model name.
        raise ValueError(
            "unknown model %r; expected 'DT', 'KNN' or 'NB'" % (model,))

    xtrain = np.array([[float(i) for i in v[:-1]] for v in train_set])
    ytrain = np.array([v[-1] for v in train_set])
    xtest = np.array([[float(i) for i in v[:-1]] for v in val_set])
    ytest = np.array([v[-1] for v in val_set])

    clf = estimator.fit(xtrain, ytrain)
    # Predict before any refit so the statistics stay unbiased.
    ypred = clf.predict(xtest)

    if is_using_val_set:
        clf = estimator.fit(np.concatenate((xtrain, xtest), axis=0),
                            np.concatenate((ytrain, ytest), axis=0))
    return get_stat(ytest, ypred), clf
def fit_model(features, prices):
    """Grid-search ``max_depth`` (1 to 10) for a ``DT`` estimator.

    The search scores candidates with ``performance_metric`` (higher is
    better) on 10 shuffle-split folds that each hold out 20% of the data
    (``random_state=1``).

    :param features: input data [X].
    :param prices: target values [y].
    :return: the best estimator found by the search.
    """
    search = GridSearchCV(
        DT(),
        param_grid={'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)},
        # Wrap the plain metric function so the search can use it as a scorer.
        scoring=make_scorer(performance_metric, greater_is_better=True),
        cv=ShuffleSplit(n_splits=10, test_size=0.2, random_state=1),
    )
    fitted_search = search.fit(features, prices)
    return fitted_search.best_estimator_
def main():
    """Tune a tree and a random forest on data.csv and write a submission.

    Both models are tuned over ``max_depth`` in 2..10 with 5-fold ROC-AUC
    grid search on the complete training data; the best parameters are
    printed.  The best forest predicts the positive-class probability for
    ``test.csv`` and the result is written to ``submit.csv``.
    """
    df = pd.read_csv('data.csv', index_col='id')
    df = my_preprocessing(df)
    data_X, data_y = df.drop('y', axis=1), df['y']
    # The grid searches cross-validate on the full data, so the unused
    # train/test split that used to be computed here was dropped.

    parameters = {'max_depth': np.arange(2, 11)}

    tree = DT(random_state=0)
    gcv = GridSearchCV(tree, parameters, cv=5, scoring='roc_auc',
                       return_train_score=True, n_jobs=-1)
    gcv.fit(data_X, data_y)
    print(gcv.best_params_)

    forest = RandomForestClassifier(random_state=0)
    gcv = GridSearchCV(forest, parameters, cv=5, scoring='roc_auc',
                       return_train_score=True, n_jobs=-1)
    gcv.fit(data_X, data_y)
    print(gcv.best_params_)
    best_forest = gcv.best_estimator_

    test_data = pd.read_csv('test.csv', index_col='id')
    # Keep the preprocessed frame, exactly as is done for the training data;
    # the return value used to be thrown away.
    test_data = my_preprocessing(test_data)
    pred = best_forest.predict_proba(test_data)[:, 1]

    submit = pd.read_csv('../input/sample_submission.csv', header=None)
    submit[1] = pred
    submit.to_csv('submit.csv', index=False, header=False)
def tune_params(feature_count):
    """Randomised hyper-parameter search for a ``DT`` estimator.

    Data is obtained from ``get_data(feature_count, 2)``.  Sixty random
    parameter combinations are evaluated, the refitted best model is scored
    on the test split, and the score and best parameters are printed.

    :param feature_count: forwarded to ``get_data``.
    """
    X_train, X_test, y_train, y_test = get_data(feature_count, 2)

    # model params
    params = {
        "criterion": ["mse", "mae"],
        "splitter": ["best", "random"],
        "max_depth": range(2, 21),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
        "min_impurity_decrease": [0.0, 0.05, 0.1, 0.15, 0.2, 0.25]
    }

    # run randomized search
    n_iter_search = 60
    clf = RandomizedSearchCV(DT(),
                             param_distributions=params,
                             n_iter=n_iter_search,
                             n_jobs=-1)
    clf.fit(X_train, y_train)
    r2 = clf.score(X_test, y_test)

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("\tBest result from Tunning: %d features, score of %.5f" % (
        X_train.shape[-1], r2))
    print(clf.best_params_)
def fit(self, X, y):
    """Fit the boosted trees for a binary target, with early stopping.

    25% of the data is held out for validation (``random_state=0``).  The
    first tree is fitted to the constant log-odds of the training target;
    every later tree is fitted to ``y_train - sigmoid(train_predict)``.
    Each tree's prediction is added to the running totals scaled by
    ``self.lr``.  Training stops as soon as the value returned by
    ``self.record_score`` drops below the value of the previous round;
    ``self.best_round`` is set only in that case.

    :param X: feature matrix.
    :param y: binary target; ``y_train.mean()`` must lie strictly between
        0 and 1 for the initial log-odds to be finite.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.25, random_state=0)
    train_predict, val_predict = 0, 0

    # Initial value following the binary-classification formula: the
    # log-odds of the positive class.
    fit_val = np.log(y_train.mean() / (1 - y_train.mean()))
    next_fit_val = np.full(X_train.shape[0], fit_val)

    # `np.infty` was removed in NumPy 2.0; `np.inf` exists in every version.
    last_val_score = -np.inf
    for i in range(self.n_estimator):
        cur_booster = DT(max_depth=self.max_depth)
        cur_booster.fit(X_train, next_fit_val)
        train_predict += cur_booster.predict(X_train) * self.lr
        val_predict += cur_booster.predict(X_val) * self.lr

        # Next target: the label minus the current predicted probability.
        next_fit_val = y_train - np.exp(train_predict) / (
            1 + np.exp(train_predict))
        self.booster.append(cur_booster)

        cur_val_score = self.record_score(y_train, y_val, train_predict,
                                          val_predict, i)
        if cur_val_score < last_val_score:
            self.best_round = i
            print("\n训练结束!最佳轮数为%d" % (i + 1))
            break
        last_val_score = cur_val_score
def fit(self, X, y):
    """Fit the boosted trees on residuals, with early stopping.

    25% of the data is held out for validation (``random_state=0``).  The
    first tree is fitted to the constant mean of the training target; every
    later tree is fitted to the residual ``y_train - train_predict``.  Each
    tree's prediction is added to the running totals scaled by ``self.lr``.
    Training stops as soon as the value returned by ``self.record_score``
    rises above the value of the previous round; ``self.best_round`` is set
    only in that case.

    :param X: feature matrix.
    :param y: numeric target.
    """
    # Split the data set into a training and a validation part.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.25, random_state=0)
    train_predict, val_predict = 0, 0
    next_fit_val = np.full(X_train.shape[0], np.mean(y_train))

    # Bookkeeping for early stopping.  `np.infty` was removed in NumPy 2.0;
    # `np.inf` exists in every version.
    last_val_score = np.inf
    for i in range(self.n_estimator):
        cur_booster = DT(max_depth=self.max_depth)
        cur_booster.fit(X_train, next_fit_val)
        train_predict += cur_booster.predict(X_train) * self.lr
        val_predict += cur_booster.predict(X_val) * self.lr

        # Squared loss is ((y - (F_{m-1} + w))^2) / 2.  With residual r this
        # is ((r - w)^2) / 2, whose negative gradient with respect to w at
        # w = 0 is exactly r, so the next target is y_train - train_predict.
        next_fit_val = y_train - train_predict
        self.booster.append(cur_booster)

        cur_val_score = self.record_score(y_train, y_val, train_predict,
                                          val_predict, i)
        if cur_val_score > last_val_score:
            self.best_round = i
            print("\n训练结束!最佳轮数为%d" % (i + 1))
            break
        last_val_score = cur_val_score
def fit(self, X, y):
    """Fit a bag of ``DT`` models on SMOTE-resampled bootstrap samples.

    The rows are split by label (0 / 1) into a majority and a minority
    frame.  For bagging round ``i`` a fraction ``0.1 * ((i % 10) + 1)`` of
    each frame is drawn with replacement, the two draws are combined,
    balanced with ``SMOTE_IMB`` and used to fit one model, which is appended
    to ``self.model_list``.

    :param X: feature matrix (anything ``pd.DataFrame`` accepts).
    :param y: binary labels (0 / 1), aligned with the rows of ``X``.
    :return: ``self``.
    """
    self.model_list = []
    df = pd.DataFrame(X)
    df['label'] = y

    df_zero = df[df['label'] == 0]
    df_one = df[df['label'] == 1]
    # Ties count label 1 as the majority, as before.
    if len(df_zero) > len(df_one):
        df_maj, df_min = df_zero, df_one
    else:
        df_maj, df_min = df_one, df_zero

    cols = df.columns.tolist()
    cols.remove('label')

    for ibagging in range(self.n_estimators):
        # Sampling fraction cycles through 0.1, 0.2, ..., 1.0.
        b = min(0.1 * ((ibagging % 10) + 1), 1)
        train_maj = df_maj.sample(frac=b, replace=True)
        train_min = df_min.sample(frac=b, replace=True)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        df_k = pd.concat([train_maj, train_min])
        # SMOTE needs fewer neighbours than there are minority samples.
        X_train, y_train = SMOTE_IMB(
            k_neighbors=min(5, len(train_min) - 1)
        ).fit_resample(df_k[cols], df_k['label'])
        model = DT().fit(X_train, y_train)
        self.model_list.append(model)
    return self
def fit(self, user_i):
    """Fit ``self.n_trees`` randomised trees and pickle each one to disk.

    Every tree is trained on a bootstrap sample of the rows of ``self.X``
    restricted to a random selection of columns (drawn with replacement).
    Each ``(tree, column_indices)`` pair is pickled to
    ``/dev/core/files/<user_i>_<n>.pkl``.

    :param user_i: identifier used as file-name prefix.
    :return: list of the written file names (without directory).
    """
    DTs = []
    s = str(user_i) + '_'
    with open("out", 'a') as standardout:
        print("[Fitting]\n", file=standardout)

    n_rows, n_cols = self.X.shape[0], self.X.shape[1]
    for _ in range(self.n_trees):
        data_indeces = np.random.randint(0, n_rows, n_rows)
        n_selected = np.random.randint(1, n_cols, 1)[0]
        y_indeces = np.random.randint(0, n_cols, n_selected)
        temp_d = DT(criterion='entropy')
        # np.ix_ selects the full rows x columns sub-matrix.  Indexing with
        # the two arrays directly tries to broadcast them against each other
        # and fails unless exactly one column was drawn.
        temp_d.fit(self.X[np.ix_(data_indeces, y_indeces)],
                   self.y[data_indeces])
        DTs.append((temp_d, y_indeces))

    dts = []
    for i, fitted in enumerate(DTs):
        t_file_name = s + str(i) + '.pkl'
        # The context manager closes (and flushes) every file; the handles
        # used to be left open.
        with open('/dev/core/files/' + t_file_name, 'wb') as d_temp_file:
            pickle.dump(fitted, d_temp_file)
        dts.append(t_file_name)
    return dts
def __init__(self, base_estimator=DT(), n_estimators=10, random_seed=None):
    """Store the ensemble configuration.

    :param base_estimator: estimator used as base model.
        NOTE(review): the default instance is created once, when the
        function is defined, and is therefore shared by every object built
        without an explicit ``base_estimator`` - confirm it is never fitted
        in place.
    :param n_estimators: number of models in the ensemble.
    :param random_seed: seed value; stored as given.
    """
    self.random_seed = random_seed
    self.n_estimators = n_estimators
    self.base_estimator = base_estimator
    # Will be set in the fit function
    self.feature_cols = None
    self.model_list = []
def train_valid_dt(source1, source2):
    """Train a decision tree on ``source1`` and validate it on ``source2``.

    The data comes from ``getData(const.DATAPATH, source1)``.  The fitted
    tree is printed as text (up to depth 20) and saved to
    ``dt_structure_<source1>.txt`` inside ``const.DATAPATH``; feature
    importances, elapsed minutes and the test score are printed.  Finally
    ``valid`` is run with the model on ``source2``.

    :param source1: data source used for training / testing.
    :param source2: data source passed on to ``valid``.
    :return: the fitted estimator.
    """
    X_train, X_test, y_train, y_test = getData(const.DATAPATH, source1)
    print('starting...')
    stime = time.time()

    clf = DT(random_state=10)
    clf.fit(X_train, y_train)

    tree_text = export_text(clf,
                            feature_names=X_train.columns.values.tolist(),
                            max_depth=20)
    print('Tree Structure : ')
    print(tree_text)

    # The file name used an undefined name `source`; the tree was trained on
    # `source1`, so that is what identifies the dump.
    structure_path = os.path.join(const.DATAPATH,
                                  'dt_structure_{}.txt'.format(source1))
    with open(structure_path, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(tree_text)

    print('Feature importance : ')
    print(clf.feature_importances_)
    print('Time cost {:.2f} ||| Score={:.4f}'.format(
        (time.time() - stime) / 60, clf.score(X_test, y_test)))

    valid(clf, const.DATAPATH, source2)
    return clf
def setUp(self):
    """Prepare the fixture: iris data and a fitted ``ADA`` ensemble.

    The ensemble uses 100 depth-4 ``DT`` base estimators, all seeded with 0.
    """
    self.tmp_fn = 'Tmp'

    iris = load_iris()
    self.iris = iris
    # Number of features, taken from the first sample.
    self.n_features = len(iris.data[0])

    self.clf = ADA(
        base_estimator=DT(max_depth=4, random_state=0),
        n_estimators=100,
        random_state=0,
    )
    self.clf.fit(iris.data, iris.target)
def iterate(self):
    """Evaluate ``SB`` over every (n_estimators, smote_ratio) combination.

    For each pair taken from ``self.n_estimators_conf`` and
    ``self.smote_ratio_conf`` a model with a depth-2 ``DT`` base learner is
    fitted on the training data.  Train / test accuracy and train / test
    multi-class log loss are stored as one row of ``self.results``
    (a DataFrame).
    """
    # Single-argument print() calls behave the same on Python 2 and 3.
    print('-' * 80)
    print('Running SMOTEBoost Iterations...')

    # performance by number of estimators and SMOTE ratio
    results = []
    for ne in self.n_estimators_conf:
        for sr in self.smote_ratio_conf:
            print('Iteration: nestimators=%s, smote_ratio=%s' % (str(ne), str(sr)))
            # NOTE(review): `class_numsamples_dict` and `df_smote` are read
            # as module-level names here, not as attributes of `self` -
            # confirm that is intended.
            m = SB(base_learner=DT(max_depth=2),
                   n_estimators=ne,
                   smote_ratio=sr,
                   class_numsamples_dict=class_numsamples_dict,
                   df_smote=df_smote)
            m.fit(self.xtrain, self.ytrain)

            predtrain = m.predict(self.xtrain)
            predtest = m.predict(self.xtest)
            predprobatrain = m.predict_proba(self.xtrain)
            predprobatest = m.predict_proba(self.xtest)

            # accuracy_score expects (y_true, y_pred); the value is the same
            # either way, the order is fixed for readability.
            accuracytrain = metrics.accuracy_score(self.ytrain, predtrain)
            accuracytest = metrics.accuracy_score(self.ytest, predtest)
            # The "ks_*" columns hold multi-class log loss values.
            kstrain = multiclass_log_loss(self.ytrain, predprobatrain)
            kstest = multiclass_log_loss(self.ytest, predprobatest)
            results.append([ne, sr, accuracytrain, accuracytest,
                            kstrain, kstest])

    # Passing `columns` to the constructor also works when no combination
    # was evaluated (assigning 6 names to an empty frame would raise).
    self.results = pd.DataFrame(
        results,
        columns=['ne', 'sr', 'accuracy_train', 'accuracy_test',
                 'ks_train', 'ks_test'])
def __init__(self, base_learner=DT(max_depth=2), n_estimators=3,
             rus_ratio=1.0, class_numsamples_dict=False):
    """Store the booster configuration.

    :param base_learner: estimator used as weak learner; kept as ``self.m``.
        NOTE(review): the default instance is created once at definition
        time and shared between objects - confirm it is not fitted in place.
    :param n_estimators: number of boosting rounds; kept as ``self.T``.
    :param rus_ratio: under-sampling ratio; stored as given.
    :param class_numsamples_dict: stored as given (``False`` by default).
    """
    self.class_numsamples_dict = class_numsamples_dict
    self.rus_ratio = rus_ratio
    self.T = n_estimators
    self.m = base_learner
def fit(x, y, max_depth=5):
    """Build a ``DTClassifier`` wrapper around a ``DT`` fitted on ``x`` / ``y``.

    :param x: training features.
    :param y: training targets.
    :param max_depth: depth limit stored on the wrapper and passed to ``DT``.
    :return: the wrapper, with the fitted estimator in its ``clf`` attribute.
    """
    wrapper = DTClassifier()
    wrapper.max_depth = max_depth
    # The depth is read back from the wrapper, as in the original code.
    wrapper.clf = DT(max_depth=wrapper.max_depth)
    wrapper.clf.fit(x, y)
    return wrapper
def NLMmodelexp1():
    """Run ``modelExperiment`` with seven default classifiers on the NLM data.

    The estimator list and the label list are passed in matching order,
    together with the 'NLMdata/' directory, ``fullFV`` and the CSV / PNG
    output file names.
    """
    named_models = [
        ('LogisticRegression', LR()),
        ('DTree', DT()),
        ('KNN', KNC()),
        ('RandomForest', RF()),
        ('AdaBoosted', ABC()),
        ('GaussianNB', GNB()),
        ('QuadraticDiscriminantAnalysis', QDA()),
    ]
    estimators = [estimator for _, estimator in named_models]
    labels = [label for label, _ in named_models]
    modelExperiment(nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
                    estimators, labels,
                    'NLMmodelExperiment1.csv', 'NLMclassifier_plot1.png',
                    True)
def __init__(self, base_learner=DT(max_depth=2), n_estimators=3,
             synthetic_data=None, synthetic_ratio=1.0):
    """Store the booster configuration.

    :param base_learner: estimator used as weak learner; kept as ``self.m``.
        NOTE(review): the default instance is created once at definition
        time and shared between objects - confirm it is not fitted in place.
    :param n_estimators: number of boosting rounds; kept as ``self.T``.
    :param synthetic_data: kept as ``self.df_synthetic``.
    :param synthetic_ratio: stored as given.
    """
    self.synthetic_ratio = synthetic_ratio
    self.df_synthetic = synthetic_data
    self.T = n_estimators
    self.m = base_learner
def tree(labels, X, df, i):
    """Fit a depth-4 ``DT``, export it as Graphviz and return importances.

    :param labels: training targets.
    :param X: training features.
    :param df: frame whose ``columns`` are used as feature names in the
        export.
    :param i: identifier used in the output name ``<i>_tree.dot`` inside
        ``OUTPUT_DIRECTORY``.
    :return: the fitted model's ``feature_importances_``.
    """
    # A distinct local name avoids shadowing this function; the unused
    # get_params() call was dropped.
    model = DT(max_depth=4)
    model.fit(X, labels)
    impt = model.feature_importances_
    export_graphviz(model,
                    out_file=OUTPUT_DIRECTORY + str(i) + "_tree.dot",
                    feature_names=df.columns)
    return impt
def __init__(self, task='spam'):
    """Create the model for ``task`` and keep it in ``self.env``.

    Recognised tasks: 'vehicle' and 'page' (RBF ``SVC``), 'credit'
    (depth-4 ``DT``) and 'spam' (``LogisticRegression``).  For any other
    value ``self.env`` is left unset.

    :param task: name of the task; also stored as ``self.task``.
    """
    super(TaskTrainer, self).__init__()
    from sklearn.svm import SVC as SVM

    self.task = task

    # Factories are used so that only the requested model is constructed.
    builders = {
        # For vehicle task
        'vehicle': lambda: SVM(C=1e2, kernel='rbf', random_state=0),
        # For page blocks
        'page': lambda: SVM(C=1e2, kernel='rbf', random_state=0, gamma=1e-2),
        # For credit card task
        'credit': lambda: DT(max_depth=4),
        # For spam detection task
        'spam': lambda: LogisticRegression(C=1e2, random_state=0),
    }
    build = builders.get(task)
    if build is not None:
        self.env = build()
def DT_classif():
    """Return an unfitted ``DT`` with ``max_depth=5`` and balanced class weights.

    Reference for the available hyper-parameters:
    http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    """
    return DT(max_depth=5, class_weight='balanced')
def __init__(self, base_learner=DT(max_depth=2), n_estimators=3,
             smote_ratio=10, class_numsamples_dict=False, df_smote=False,
             smote_decay='linear'):
    """Store the booster configuration.

    :param base_learner: estimator used as weak learner; kept as ``self.m``.
        NOTE(review): the default instance is created once at definition
        time and shared between objects - confirm it is not fitted in place.
    :param n_estimators: number of boosting rounds; kept as ``self.T``.
    :param smote_ratio: stored as given.
    :param class_numsamples_dict: stored as given (``False`` by default).
    :param df_smote: stored as given (``False`` by default).
    :param smote_decay: stored as given ('linear' by default).
    """
    self.smote_decay = smote_decay
    self.df_smote = df_smote
    self.class_numsamples_dict = class_numsamples_dict
    self.smote_ratio = smote_ratio
    self.T = n_estimators
    self.m = base_learner
def fit(self, user_i):
    """Fit ``self.n_trees`` randomised trees, pickle them and read them back.

    Every tree is trained on a bootstrap sample of the rows of ``self.X``
    restricted to a random selection of columns (drawn with replacement).
    Each ``(tree, column_indices)`` pair is pickled to ``<user_i>_<n>.pkl``
    in the current working directory.

    :param user_i: identifier used as file-name prefix.
    :return: ``(dts, pickled)`` - the list of written file names and the
        list of their raw pickle contents as ``bytes``.
    """
    DTs = []
    s = str(user_i) + '_'
    with open("out", 'a') as standardout:
        print("[Fitting]", file=standardout)

    n_rows, n_cols = self.X.shape[0], self.X.shape[1]
    for _ in range(self.n_trees):
        data_indeces = np.random.randint(0, n_rows, n_rows)
        n_selected = np.random.randint(1, n_cols, 1)[0]
        y_indeces = np.random.randint(0, n_cols, n_selected)
        temp_d = DT(criterion='entropy')
        # np.ix_ selects the full rows x columns sub-matrix.  Indexing with
        # the two arrays directly tries to broadcast them against each other
        # and fails unless exactly one column was drawn.
        temp_d.fit(self.X[np.ix_(data_indeces, y_indeces)],
                   self.y[data_indeces])
        DTs.append((temp_d, y_indeces))

    dts = []
    for i, fitted in enumerate(DTs):
        t_file_name = s + str(i) + '.pkl'
        # Closing the file before it is read back guarantees the pickle is
        # completely flushed; the handle used to be left open.
        with open(t_file_name, 'wb') as d_temp_file:
            pickle.dump(fitted, d_temp_file)
        dts.append(t_file_name)

    pickled = []
    for file_name in dts:
        # Pickles are binary data: text mode would try to decode them and
        # typically fails with UnicodeDecodeError.
        with open(file_name, 'rb') as pklfile:
            pickled.append(pklfile.read())
    return dts, pickled
def SOmodelexp1():
    """Run ``modelExperiment`` with seven classifiers on the Stack Overflow data.

    The estimator list (random forest with 200 trees, the others with
    default settings) and the label list are passed in matching order,
    together with the 'stackoverflowdata/' directory, ``fullFV`` and the
    CSV / PNG output file names.
    """
    named_models = [
        ('LogisticRegression', LR()),
        ('DTree', DT()),
        ('KNN', KNC()),
        ('RandomForest', RF(n_estimators=200)),
        ('AdaBoosted', ABC()),
        ('GaussianNB', GNB()),
        ('QuadraticDiscriminantAnalysis', QDA()),
    ]
    estimators = [estimator for _, estimator in named_models]
    labels = [label for label, _ in named_models]
    modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                    fullFV, estimators, labels,
                    'SOmodelExperiment1.csv', 'SOclassifier_plot1.png',
                    True)