def classify_fold(classifier_args, X_train, y_train, X_test, fold_index, args, runs_dir):
    classifier_name = args[0]
    input_dim = X_train.shape[1]
    num_class = len(np.unique(y_train))
    # runs_dir is a dir to put the training log of the classifier
    clf = get_classifier(classifier_args, args, runs_dir)
    # use_class_weight_to_balance is assumed to be a module-level flag
    if not use_class_weight_to_balance:
        X_train, y_train = balance_data(X_train, y_train)
    tick = time.time()
    clf.fit(X_train, y_train)
    tock = time.time()
    print("training time = {0:.0f}".format(tock - tick))
    tick = time.time()
    if classifier_name in ['cnn', 'softmax']:
        pred = clf.predict(X_test, eval_mode=True)
    else:
        pred = clf.predict(X_test)
    tock = time.time()
    duration = tock - tick
    print("Predicted data of size ", X_test.shape, " in {0:.2f} sec.".format(duration))
    print('Done with fold-{}'.format(fold_index))
    return pred, duration
def process_fold(classifier_args, X_train, y_train, fold_index, args, runs_dir):
    classifier_name = args[0]
    input_dim = X_train.shape[1]
    num_class = len(np.unique(y_train))
    # runs_dir is a dir to put the training log of the classifier
    clf = get_classifier(classifier_args, args, runs_dir)
    # unique, counts = np.unique(y_train, return_counts=True)
    if not use_class_weight_to_balance:
        X_train, y_train = balance_data(X_train, y_train)
    # unique, counts = np.unique(y_train, return_counts=True)
    tick = time.time()
    clf.fit(X_train, y_train)
    with open(runs_dir, 'wb') as f:
        pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size ", X_train.shape, " in {0:.2f} min.".format(duration / 60))
    print('Done with fold-{}'.format(fold_index))
    # feat_names is assumed to be defined at module level
    importances = [
        importance
        for feature_name, importance in zip(feat_names, clf.feat_importances)
    ]
    return importances
def test_equal_binary(self):
    """Tests if binary data have balanced class size."""
    x_seq = [np.array([1, 2, 3]), np.array([1]), np.array([2]),
             np.array([1, 3]), np.array([3]), np.array([1, 3]),
             np.array([1, 2, 3]), np.array([2])]
    y_label = np.array([1, 0, 1, 1, 0, 1, 0, 1])
    x_seq, y_label = utils.balance_data(x_seq, y_label)
    self.assertEqual(sum(y_label) * 2, len(y_label))
def test_equal_with_size(self):
    """Tests if binary data have balanced class size with specified size."""
    x_seq = [np.array([1, 2, 3]), np.array([1]), np.array([2]),
             np.array([1, 3]), np.array([3]), np.array([1, 3]),
             np.array([1, 2, 3]), np.array([2])]
    y_label = np.array([1, 0, 1, 1, 0, 1, 0, 1])
    x_seq, y_label = utils.balance_data(x_seq, y_label, class_size=2)
    self.assertEqual(np.sum(y_label == 0), 2)
    self.assertEqual(np.sum(y_label == 1), 2)
def test_equal_multi(self):
    """Tests if multi-class data have balanced class size."""
    x_seq = [np.array([1, 2, 3]), np.array([1]), np.array([2]),
             np.array([1, 3]), np.array([3]), np.array([1, 3]),
             np.array([1, 2, 3]), np.array([2])]
    y_label = np.array([1, 0, 2, 1, 0, 2, 0, 1])
    x_seq, y_label = utils.balance_data(x_seq, y_label)
    self.assertEqual(np.sum(y_label == 0), 2)
    self.assertEqual(np.sum(y_label == 1), 2)
    self.assertEqual(np.sum(y_label == 2), 2)
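# A minimal sketch of what the tests above pin down about utils.balance_data: every class is
# undersampled to the smallest class count, or to an explicit class_size when one is given.
# The project's actual implementation may differ; balance_data_sketch is an illustrative name.
import numpy as np

def balance_data_sketch(x_seq, y_label, class_size=None):
    """Undersample each class to class_size (default: the smallest class count)."""
    y_label = np.asarray(y_label)
    classes, counts = np.unique(y_label, return_counts=True)
    size = counts.min() if class_size is None else class_size
    keep = []
    for cls in classes:
        # Randomly keep `size` examples of this class.
        idx = np.flatnonzero(y_label == cls)
        keep.extend(np.random.choice(idx, size=size, replace=False))
    keep = np.sort(np.array(keep))
    return [x_seq[i] for i in keep], y_label[keep]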
def train_fold(X_train, y_train, fold_index, args, runs_dir):
    classifier_name = args['classifier_name']
    balance = args['balance']
    # runs_dir is a dir to put the training log of the classifier
    clf = get_classifier(args, runs_dir)
    if balance == 'explicit':
        print("explicit balancing data")
        X_train, y_train = balance_data(X_train, y_train)
    tick = time.time()
    clf.fit(X_train, y_train)
    if classifier_name in ['tree', 'forest']:
        with open(runs_dir, 'wb') as f:
            pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size {} for Fold-{} in {:.0f} min, {:.0f} sec".format(
        X_train.shape, fold_index, duration // 60, duration % 60))
    return clf, duration
def perform_single_knn(input_data):
    """Perform a single trial of knn with selected features."""
    # Extract inputs from the input tuple.
    features = input_data[0]
    labels = input_data[1]
    knn_num_neighbors = input_data[2]
    knn_weights = input_data[3]
    knn_algorithm = input_data[4]
    knn_metric = input_data[5]
    knn_imbalanced_data = input_data[6]
    test_size = input_data[7]
    # VERY IMPORTANT: provide a random state, since otherwise multiple workers appear to
    # split the data in the same way.
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features,
                                         labels,
                                         test_size=test_size,
                                         random_state=random.randint(1, 99999999)))
    if not knn_imbalanced_data:
        # Manually balance the data. Don't do this on the whole data set; do it only on the
        # train set, so that it is balanced and the per-class precisions are comparable.
        # Do not balance the test set, because the test set should reflect the true
        # distribution of new points to predict.
        train_features, train_labels = utils.balance_data(train_features, train_labels)
    model = KNeighborsClassifier(n_neighbors=knn_num_neighbors,
                                 weights=knn_weights,
                                 algorithm=knn_algorithm,
                                 metric=knn_metric)
    model = model.fit(train_features, train_labels)
    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels, label_values)
    return trial_metrics
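# Hedged usage sketch: the seeding note in perform_single_knn suggests it is meant to be
# mapped across worker processes, one trial per input tuple. The driver below, the toy data,
# and the parameter values are illustrative assumptions, not part of the original code.
import multiprocessing

import numpy as np

def run_knn_trials(features, labels, num_trials=8):
    # One tuple per trial, matching the positional layout perform_single_knn unpacks:
    # (features, labels, n_neighbors, weights, algorithm, metric, imbalanced_data, test_size)
    trial_inputs = [(features, labels, 5, 'distance', 'auto', 'euclidean', False, 0.2)
                    for _ in range(num_trials)]
    with multiprocessing.Pool() as pool:
        return pool.map(perform_single_knn, trial_inputs)

# Example with toy data:
# features = np.random.rand(200, 10)
# labels = np.random.randint(0, 2, size=200)
# metrics_per_trial = run_knn_trials(features, labels)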
def train_and_save_classifier(X_train, y_train, args):
    classifier_name = args['classifier_name']
    balance = args['balance']
    # args['runs_dir'] is a dir to put the training log of the classifier
    clf = get_classifier(args)
    if balance == 'explicit':
        tick = time.time()
        X_train, y_train = balance_data(X_train, y_train)
        tock = time.time()
    tick = time.time()
    print("Shuffling data")
    X_train, y_train = shuffle(X_train, y_train)
    print('fitting model')
    clf.fit(X_train, y_train)
    if classifier_name in ['tree', 'forest']:
        with open(join(args['runs_dir'], 'model.pkl'), 'wb') as f:
            pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size {} in {:.0f} min, {:.0f} sec".format(
        X_train.shape, duration // 60, duration % 60))
    return
        if dp['train_test'] == 'train':
            train.append(dh)
        elif dp['train_test'] == 'test':
            test.append(dh)

##### Bundling data #####
print('bundling data')
trainfile = expdir + 'traindata.csv'
if len(train) > 0:
    dh_train = utils.bundle_data(train, trainfile)
else:
    dh_train = datahandler.Datahandler()
    dh_train.set(trainfile)
if 'balance' in cp.sections():
    print('balancing data')
    dh_train = utils.balance_data(dh_train, cp['balance']['outfile'])
train_dataset = dh_train.dataset

testfile = expdir + 'testdata.csv'
if len(test) > 0:
    dh_test = utils.bundle_data(test, testfile)
    test_dataset = dh_test.dataset
else:
    try:
        dh_test = datahandler.Datahandler()
        dh_test.set(testfile)
        test_dataset = dh_test.dataset
    except:
        test_dataset = False

##### Sampling data #####
        return self

    def predict(self, test_x):
        pre_y = self.model.predict(test_x, batch_size=256)
        return pre_y


if __name__ == '__main__':
    # Three inputs: the loan, know and attribute matrices; they are combined and then used for training.
    Y = np.load('embedding_y.npy')
    X_attr = np.load('embedding_x_attr.npy')
    X_loan = load_file('embeding_matrix', tail='loan')
    X_chaxun = load_file('embeding_matrix', tail='chaxun')
    Y = np.where(Y == 'good', 1, 0)
    X_train1, X_test1, X_train2, X_test2, X_train3, X_test3, y_train_or, y_test = train_test_split(
        X_attr, X_loan, X_chaxun, Y, test_size=0.2, random_state=4, shuffle=False)
    X_train1, y_train = balance_data(X_train1, y_train_or)
    X_train2, _ = balance_data(X_train2, y_train_or)
    X_train3, _ = balance_data(X_train3, y_train_or)
    X_train1, X_test1 = normal(X_train1, X_test1)
    X_train2, X_test2 = normal(X_train2, X_test2)
    X_train3, X_test3 = normal(X_train3, X_test3)
    # print(X_train1.shape)
    # X_train1, y_train = up_sample(X_train1, y_train_or, 2)
    # X_train2, _ = up_sample(X_train2, y_train_or, 2)
    # X_train3, _ = up_sample(X_train3, y_train_or, 2)
    # print(X_train1.shape, X_train2.shape, X_train3.shape)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    # Feed the three inputs into the model.
    model = Model_Graph([X_train1, X_train2, X_train3], y_train)
    model.mutilin_model()
    if paths[i][-1] == '2':
        continue
    utils.fixPath(paths[i] + '/IMG/', paths[i] + '/', append=app)
    app = True

data = pd.read_csv('../../dataset/data/driving_log_clean.csv')

# Shuffling the data
data = data.sample(frac=1).reset_index(drop=True)

# Reading the data from the pandas dataframe
X = data[['center', 'left', 'right']].values
y = data['steering'].values

# Balancing the dataset, and dropping some of the common examples
X, y = utils.balance_data(X, y)

# Some information about the data after balancing
print("Full Data Size: ", data.shape)
print("Data After Balancing: ", len(X))

# Splitting the data: (see SPLIT under SOME CONSTANTS)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=SPLIT, random_state=0)

# Some information about the data after splitting
print("Splitting with split rate: ", SPLIT)
print("Training Data Size: ", X_train.shape, y_train.shape)
print("Validation Data Size: ", X_valid.shape, y_valid.shape)

# Freeing the memory held by the raw dataframe
data = None
def train():
    clf = request.form['train']
    if allowed_classifier(clf):
        string = 'train'
        hist_n = string + "hist.jpeg"
        cnmt_n = string + "cnmt.jpeg"
        pkl_hnd = store(app.config['static_path'], app.root_path)

        # Feature extraction
        data = utils.file_parser(os.path.join(app.config['upload_path'], "data.txt"))
        features = utils.feature_extractor(data['text'], 5000).todense()
        sh = data.shape

        # Preprocessing features and labels
        data_x = utils.preprocess_features(features, 2500)
        data_y, enc = utils.label_encoder(data['label'], False, None)
        pkl_hnd.dump(enc, 'enc')  # storing the encoder

        # Splitting data into training set and validation set
        train_x, train_y, valid_x, valid_y = utils.train_valid(data_x, data_y, 0.2)

        # Balancing data with SMOTE
        text, label = utils.balance_data(train_x, train_y)

        # Selecting model and tuning hyperparameters
        tr = model(clf, text[:sh[0], :], label[:sh[0]], valid_x, valid_y)
        comb_mod = tr.model_selection()

        # Fitting model and predicting
        mod = tr.build_model(comb_mod)
        pkl_hnd.dump(mod, 'model')  # storing the model
        pr = predict_model(valid_x)
        pred = pr.predict_model(mod)

        # Training statistics
        st = stats(pred, valid_y)
        acc, f1 = st.train_stats()

        # Plotting histogram and confusion matrix
        pkl_hnd.plot_hist(data['label'], hist_n)
        n_labels = np.unique(np.asarray(data['label']))
        pkl_hnd.dump(n_labels, 'n_labels')  # storing labels
        cnf_matrix = st.cnf_mtx()
        pkl_hnd.plot_confusion_matrix(
            cnf_matrix,
            n_labels,
            cnmt_n,
            normalize=True,
            title='Confusion matrix',
            cmap=plt.cm.Blues,
        )
        return render_template("train_result.html",
                               accuracy=acc,
                               img_hist=url_for(app.config['static_path'], filename=hist_n),
                               img_cfmt=url_for(app.config['static_path'], filename=cnmt_n),
                               f1=f1)
    else:
        flash('Please enter a valid classifier')
        return redirect(url_for('index'))
def main():
    # Repeat and get more samples than needed until we can create fully balanced test and
    # train sets. We might not be able to otherwise, since labels falling on the quantiles
    # get assigned to the left class, making the sample unbalanced.
    oversample_rate = 1.1
    while True:
        add_log_vars = True
        (train_features, train_labels, test_features, test_labels, class_values,
         feature_names, label_name) = utils.prepare_data(
             args.input_filename, args.label_column,
             int(args.train_size * oversample_rate),
             int(args.test_size * oversample_rate), add_log_vars)
        all_labels = np.concatenate([train_labels, test_labels])
        # Change labels to equally sized classes, using all labels in case the minimum
        # falls in test.
        quantiles = scipy.stats.mstats.mquantiles(
            all_labels, np.arange(0, 1, 1.0 / args.num_classes))
        train_labels = np.digitize(train_labels, quantiles)
        test_labels = np.digitize(test_labels, quantiles)
        class_values = range(1, args.num_classes + 1)
        train_features, train_labels = utils.balance_data(
            train_features, train_labels, class_values, args.train_size)
        test_features, test_labels = utils.balance_data(
            test_features, test_labels, class_values, args.test_size)
        if train_features is None or test_features is None:
            oversample_rate += 0.1
            continue
        break

    all_labels = np.concatenate([train_labels, test_labels])
    print("Label is {}".format(label_name))
    print("Max of all labels: %d" % np.max(all_labels))
    print("Min of all labels: %d" % np.min(all_labels))

    if not args.skip_feature_selection:
        (train_features, test_features,
         feature_names) = feature_selection.L1SVMFeatureSelection(
             train_features, train_labels, test_features, feature_names,
             args.feature_selection_cost, args.threshold, args.num_jobs)

    model = models.train_logistic(train_features, train_labels, args.skip_cross_validation,
                                  args.multi_class, args.penalty, args.evaluation,
                                  args.num_jobs, args.cost)

    # Predict test and report full stats
    y_true = test_labels
    y_pred_prob = model.predict_proba(test_features)
    df = pd.DataFrame(data=y_pred_prob, columns=model.classes_)
    df['max_prob'] = df.max(axis=1)
    df['max_prob_class'] = df.idxmax(axis=1)
    df['true'] = y_true
    y_pred = df['max_prob_class']
    # TODO: if requested, choose the predicted values such that the class frequencies match
    # the expected class frequencies.

    print("\n*****************************\n")
    print('MAE on test: {}'.format(
        mean_absolute_error(y_true, y_pred, multioutput='uniform_average')))
    print('Test Accuracy: {}'.format(accuracy_score(y_true, y_pred) * 100.))
    print('Classification report:')
    print(classification_report(y_true, y_pred, labels=class_values))
    print('Weighted Precision Recall:')
    print(precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                          pos_label=None, average='weighted'))
    print('Unweighted Precision Recall:')
    print(precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                          pos_label=None, average='macro'))

    # Print and plot confusion matrix
    print('Confusion Matrix Without Normalization')
    np.set_printoptions(precision=2)
    cm = confusion_matrix(y_true, y_pred, labels=class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_values, 'Unnormalized confusion matrix')
    # Normalize the confusion matrix by row (i.e. by the number of samples in each class)
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_values, 'Normalized confusion matrix')
    pdf = PdfPages(args.output_filename + '.pdf')
    plt.savefig(pdf, format='pdf')
    pdf.close()

    # Now print stats on subsets based on confidence of max_prob_class. Sort predictions
    # by confidence in descending order and take subsets from the top of the sorted df.
    df = df.sort_values(by='max_prob', ascending=False)
    print(','.join([
        'Probability Threshold', 'Percentage Predicted', 'Accuracy', 'AverageRecall',
        'AveragePrecision', 'AverageFscore'
    ]))
    for percent_to_predict in range(1, 100):
        lowest_idx = int(percent_to_predict * len(df.index) / 100.0)
        df_subset = df.iloc[0:lowest_idx]
        prob_threshold = df_subset['max_prob'].min()
        accuracy = accuracy_score(df_subset['true'], df_subset['max_prob_class'])
        # Compute macro precision/recall/fscore on the subset, not on the full test set.
        (precision, recall, fscore, support) = precision_recall_fscore_support(
            df_subset['true'], df_subset['max_prob_class'], labels=class_values,
            pos_label=None, average='macro')
        print(','.join(
            map(str, [
                prob_threshold, percent_to_predict, accuracy, recall, precision, fscore
            ])))
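# Hedged sketch of the plotting helper assumed above: utils.plot_confusion_matrix is not
# shown in this file. A minimal version consistent with how it is called (matrix, class
# labels, title), based on the common matplotlib recipe, could look like this; the real
# utility may differ.
import itertools

import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix_sketch(cm, classes, title):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Annotate each cell with its value, switching text color on dark cells.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2g'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()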