def train_svm(experimental_c, experimental_gamma):
    """Train an SVM on cached feature arrays and return the validation error rate.

    Loads features/labels from ``*.npy`` files in the working directory,
    fits ``svm`` (an SVC-style estimator aliased elsewhere in this file)
    and scores it on the validation split.

    Parameters
    ----------
    experimental_c : float
        SVM regularisation strength ``C``.
    experimental_gamma : float
        RBF kernel coefficient ``gamma``.

    Returns
    -------
    float
        ``1 - accuracy`` on the validation set (the quantity a tuner minimises).
    """
    train = np.load('train_vars.npy')
    val = np.load('val_vars.npy')
    train_labels = np.load('train_labels.npy').ravel()
    val_labels = np.load('val_labels.npy').ravel()
    val_size = len(val_labels)

    svm_model = svm(C=experimental_c, gamma=experimental_gamma)
    # Parenthesized print works identically on Python 2 and 3 for a single
    # argument; the original bare `print` statements were Python-2-only.
    print('Train y shape = %s' % (train_labels.shape, ))
    print('Train X shape = %s' % (train.shape, ))
    svm_model.fit(train, train_labels)

    predictions = svm_model.predict(val)
    print('Val y shape = %s' % (val_labels.shape, ))
    print('Predictions shape = %s' % (predictions.shape, ))

    correct = np.sum(np.equal(predictions, val_labels))
    # float() guards against integer division under Python 2.
    accuracy = correct / float(val_size)
    result = 1 - accuracy
    print('Number of correct predictions: %f' % correct)
    print('Fraction of correct predictions: %f' % accuracy)
    print('Error rate: %f' % result)
    print('Number of labels %f' % len(val_labels))
    # Coerce the numpy scalar to a plain Python float before returning.
    result = float(result)
    print('Result = %f' % result)
    return result
def main(job_id, params):
    """Hyper-parameter search objective (receives a job id and a parameter
    dict): fit an SVM with the proposed settings and return the negated
    validation accuracy, so that lower is better for a minimiser.

    ``params['C']`` and ``params['gamma']`` are base-10 exponents.
    """
    model = svm(C=10. ** params['C'], gamma=10. ** params['gamma'])
    features = np.load("train.npy")
    labels = np.load("trainLabel.npy")
    model.fit(features, labels)
    holdout = np.load("val.npy")
    holdout_labels = np.load("valLabel.npy")
    matches = np.equal(model.predict(holdout), holdout_labels)
    return -np.sum(matches) * 1. / len(holdout_labels)
def test(relativePath, params):
    """Score the chosen hyper-parameters on the held-out test split stored
    under ``relativePath`` and return the negated test accuracy.

    ``params['C']`` and ``params['gamma']`` are base-10 exponents.
    """
    model = svm(C=10. ** params['C'], gamma=10. ** params['gamma'])
    features = np.load(relativePath + "train.npy")
    labels = np.load(relativePath + "trainLabel.npy")
    model.fit(features, labels)
    held_out = np.load(relativePath + "test.npy")
    held_out_labels = np.load(relativePath + "testLabel.npy")
    matches = np.equal(model.predict(held_out), held_out_labels)
    return -np.sum(matches) * 1. / len(held_out_labels)
def main(job_id, params):
    """Tuning objective: train an SVM using the log10-scaled ``C`` and
    ``gamma`` from ``params`` and return minus the validation accuracy
    (minimised by the search driver)."""
    exponent_c = params['C']
    exponent_gamma = params['gamma']
    model = svm(C=10. ** exponent_c, gamma=10. ** exponent_gamma)
    model.fit(np.load("train.npy"), np.load("trainLabel.npy"))
    val_labels = np.load("valLabel.npy")
    hits = np.equal(model.predict(np.load("val.npy")), val_labels)
    return -np.sum(hits) * 1. / len(val_labels)
def test(relativePath, params):
    """Evaluate the selected hyper-parameters on the test split found under
    ``relativePath``; returns minus the fraction of correct predictions."""
    exponent_c = params['C']
    exponent_gamma = params['gamma']
    model = svm(C=10. ** exponent_c, gamma=10. ** exponent_gamma)
    model.fit(np.load(relativePath + "train.npy"),
              np.load(relativePath + "trainLabel.npy"))
    test_labels = np.load(relativePath + "testLabel.npy")
    hits = np.equal(model.predict(np.load(relativePath + "test.npy")), test_labels)
    return -np.sum(hits) * 1. / len(test_labels)
def create_model(self, model_type, parameters):
    """Instantiate a model by short name and configure it.

    Parameters
    ----------
    model_type : str
        One of ``'lr'``, ``'svm'``, ``'mlp'``, ``'rf'``, ``'xgb'``.
    parameters : dict
        Keyword arguments forwarded to the estimator's ``set_params``.

    Returns
    -------
    The configured (untrained) estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.  The original
        code fell through with ``model`` unbound and raised a confusing
        NameError on the return statement.
    """
    if model_type == 'lr':
        model = lr()
    elif model_type == 'svm':
        model = svm()
    elif model_type == 'mlp':
        model = mlp()
    elif model_type == 'rf':
        model = rf()
    elif model_type == 'xgb':
        model = xgb()
    else:
        raise ValueError('unknown model_type: %r' % (model_type,))
    return model.set_params(**parameters)
def add_model(self, model_type):
    """Append an untrained model of the requested kind to ``self.models``
    and, for grid-searched kinds ('svm', 'mlp', 'xgb', 'rf'), record its
    hyper-parameter search space in ``self.param_grid``.

    Unknown ``model_type`` values are silently ignored (no append, no grid).

    NOTE(review): the 'epsilon' entry in the SVM grid suggests ``svm`` is a
    regressor (SVR) in this file -- confirm against the imports.
    """
    if model_type == 'lr':
        # Plain linear model; no parameter grid registered.
        self.models.append((model_type, lr(normalize=True)))
    elif model_type == 'ridge':
        self.models.append((model_type, rc(normalize=True, cv=None)))
    elif model_type == 'lasso':
        self.models.append((model_type, la(normalize=True)))
    elif model_type == 'svm':
        self.models.append((model_type, svm()))
        # Grid keyed by the literal 'svm' (others use model_type -- same value here).
        self.param_grid['svm'] = {
            'kernel': ['rbf'],
            'C': range(10, 100, 10),
            'epsilon': [0.01]
        }
    elif model_type == 'mlp':
        self.models.append((model_type, mlp()))
        self.param_grid['mlp'] = {
            'hidden_layer_sizes': [(16, 16, 16, 16, 16), (16, 16, 16, 16)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'adam'],
            'alpha': [0.001, 0.01],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'learning_rate_init': [0.001, 0.01, 0.1],
            # Previously-explored options, kept for reference:
            #'early_stopping':[True,False],
            #'validation_fraction':[0.1,0.05,0.2],
            #'max_iter':[200,1000,2000]
        }
    elif model_type == 'xgb':
        self.models.append((model_type, xgb()))
        self.param_grid[model_type] = {
            'max_depth': range(5, 15, 2),
            'min_child_weight': range(1, 6, 2),
            'n_estimators': range(10, 50, 10),
            'learning_rate': [0.01, 0.05, 0.1],
            'n_jobs': [4],
            'reg_alpha': [0, 0.005, 0.01],
            'subsample': [0.8, 1],
            'colsample_bytree': [0.8, 1]
        }
    elif model_type == 'rf':
        self.models.append((model_type, rf()))
        self.param_grid[model_type] = {
            'n_estimators': [10, 100, 500],
            # Previously-explored options, kept for reference:
            #'max_depth':range(3,10,2),
            #'min_child_weight':range(1,6,2),
            #'learning_rate':[0.01,0.05,0.1]
        }
def init_model(modeltype):
    """Build an untrained regression model.

    Parameters
    ----------
    modeltype : str
        ``'mlp'`` for a feed-forward neural-network regressor or ``'svm'``
        for a support-vector regressor.

    Returns
    -------
    The configured (untrained) regressor.

    Raises
    ------
    ValueError
        For any other ``modeltype``.  Previously an unmatched value fell
        through and raised UnboundLocalError on the return statement.
    """
    if modeltype == 'mlp':
        ### Feedforward Neural Network Regression Model
        regression_model = mlp(hidden_layer_sizes=(100, 50),
                               activation='relu',
                               solver='adam',
                               alpha=0.5,
                               batch_size='auto',
                               learning_rate='adaptive',
                               learning_rate_init=0.001,
                               power_t=0.5,
                               max_iter=1000,
                               shuffle=True,
                               random_state=None,
                               tol=0.0001,
                               verbose=False,
                               warm_start=False,
                               momentum=0.9,
                               nesterovs_momentum=True,
                               early_stopping=False,
                               validation_fraction=0.1,
                               beta_1=0.9,
                               beta_2=0.999,
                               epsilon=1e-08,
                               n_iter_no_change=10)
    elif modeltype == 'svm':
        ### Support Vector Machine Regression Model
        regression_model = svm(kernel='rbf',
                               C=1e6,
                               epsilon=0.1,
                               gamma='auto',
                               tol=0.001,
                               cache_size=2000,
                               shrinking=True,
                               verbose=False,
                               max_iter=-1)
    else:
        raise ValueError('unsupported modeltype: %r' % (modeltype,))
    return regression_model
def get_esti():
    """Fit a linear-kernel SVM estimator on vectors derived from the first
    100 graphs and return the fitted estimator.

    NOTE(review): ``fit`` is called without labels here -- this only works
    if ``svm`` is bound to an unsupervised estimator (e.g. OneClassSVM);
    SVC.fit would require a ``y`` argument.  Confirm the alias.
    """
    sample_graphs = getgraphs()[:100]
    graph_vectors = vectorize(sample_graphs)
    estimator = svm(kernel='linear')
    return estimator.fit(graph_vectors)
from sklearn.svm import SVC as svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Load the scikit-learn digits dataset.
# NOTE(review): `ds`, `KNeighborsClassifier`, `naive_bayes` and `tree` are
# not imported in this chunk -- presumably aliased earlier in the file; verify.
data_set = ds.load_digits()
x = data_set.data
y = data_set.target
# Fixed 70/30 train/test split; random_state pinned for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# One untrained model per algorithm family.
neighbors_model = KNeighborsClassifier(n_neighbors=3)
bayes_model = naive_bayes()
tree_model = tree()
svm_model = svm()
forest_model = RandomForestClassifier()

# Fit every model on the same training split.
neighbors_model.fit(X_train, y_train)
bayes_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)

# Predict the held-out split with each fitted model.
y_actual_neighbors = neighbors_model.predict(X_test)
y_actual_bayes = bayes_model.predict(X_test)
y_actual_tree = tree_model.predict(X_test)
y_actual_svm = svm_model.predict(X_test)
y_actual_forest = forest_model.predict(X_test)

# Per-class precision/recall/F1 text report for the k-NN model.
neighbors_metrics = metrics.classification_report(y_test, y_actual_neighbors)
# The original line was `import sklearn.svm.SVR as svm`, which fails at
# runtime: `import a.b.C as name` requires C to be a *module*, and SVR is
# a class.  The `from ... import ... as ...` form is required here.
from sklearn.svm import SVR as svm

# Fixed typo: the constructor keyword is `kernel`, not `kernal` (the
# misspelling raised a TypeError from SVR.__init__).
svr = svm(kernel="linear", C=1.0)
def __init__(self, feat, classes):
    """Fit an RBF-kernel SVM on ``feat``/``classes`` and keep the fitted
    estimator as ``self.model``."""
    classifier = svm(C=1e+5, gamma=1e-4, kernel='rbf')
    classifier.fit(feat, classes)
    self.model = classifier
# 30-fold cross-validation pass over the same dataset: refit the five model
# families on each fold and collect the fitted classifiers per family.
x = data_set.data
y = data_set.target
cv_kfold = KFold(n_splits=30)
neighbors_classifiers = []
bayes_classifiers = []
tree_classifiers = []
svm_classifiers = []
forest_classifiers = []
# Splitting on `y` alone: KFold only needs the sample count, so indices are
# valid for indexing both x and y.
for train_index, test_index in cv_kfold.split(y):
    neighbors_model = KNeighborsClassifier(n_neighbors=3)
    bayes_model = naive_bayes()
    tree_model = tree()
    svm_model = svm()
    forest_model = RandomForestClassifier()
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    neighbors_model.fit(X_train, y_train)
    bayes_model.fit(X_train, y_train)
    tree_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    forest_model.fit(X_train, y_train)
    neighbors_classifiers.append(neighbors_model)
    bayes_classifiers.append(bayes_model)
    tree_classifiers.append(tree_model)
    svm_classifiers.append(svm_model)
    # NOTE(review): forest_model is fitted each fold but never appended to
    # forest_classifiers -- confirm whether that is intentional.
# NOTE(review): the next statement is truncated in this chunk; its
# continuation (remaining cross_val_score arguments) is not visible here.
cross_neighbors = cross_val_score(KNeighborsClassifier(n_neighbors=3),
def wordGraph_train():
    # Train word-graph edge classifiers for ICDAR-2013 text segmentation.
    #
    # Pipeline (Python 2; all side effects, nothing returned):
    #   1. Parse word/char ground truth and pair them into bounding-box pairs.
    #   2. Derive 13 geometric edge features per box pair.
    #   3. Train GradientBoosting, linear-SVM and RandomForest classifiers
    #      three times -- on raw (unbalanced) data, SMOTE-oversampled data,
    #      and randomly down-sampled data -- pickling each model and printing
    #      precision / recall / F1 for train and test splits.
    #
    # NOTE(review): AdaBoostClassifier is imported and the printouts say
    # "AdaBoost", but GradientBoostingClassifier is what is actually fitted.
    # RandomForestClassifier is imported twice.
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC as svm
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.cross_validation import train_test_split
    import pickle

    # Ground-truth locations for the ICDAR 2013 task 2.1/2.2 training set.
    imdir = '../../icdar2013/task21_22/train/image/'
    worddir = '../../icdar2013/task21_22/train/word_label/'
    chardir = '../../icdar2013/task21_22/train/char_label/'
    mywordparser = parseWord2013()
    mycharparser = parseChar2013()
    wordDataList = mywordparser.parseData(imdir, worddir)
    charDataList = mycharparser.parseData(imdir, chardir)
    DataList = cwCombine(wordDataList, charDataList)
    train_feature, train_label = plotbb_train(DataList)
    train_feature = numpy.asarray(train_feature)
    train_label = numpy.asarray(train_label)
    # Cache the raw features/labels next to the script.
    numpy.save('train_feature_seg', train_feature, )
    numpy.save('train_label_seg', train_label, )

    # Per-box geometry.  Indexing assumes train_feature is
    # (n_pairs, 2, 4) with the last axis (x, y, width, height) -- TODO
    # confirm against plotbb_train.
    train_feature_width = train_feature[:, :, 2]
    train_feature_height = train_feature[:, :, 3]
    train_feature_area = train_feature[:, :, 2] * train_feature[:, :, 3]
    train_feature_aspectRatio = numpy.float32(train_feature_width) / numpy.float32(train_feature_height)
    train_feature_x = train_feature[:, :, 0]
    train_feature_y = train_feature[:, :, 1]
    train_feature_cx = train_feature_x + train_feature_width / 2
    train_feature_cy = train_feature_y + train_feature_height / 2

    # Pairwise (edge) statistics between the two boxes of each pair.
    edge_width_dif = abs(train_feature_width[:, 0] - train_feature_width[:, 1])
    edge_height_dif = abs(train_feature_height[:, 0] - train_feature_height[:, 1])
    edge_area_dif = abs(train_feature_area[:, 0] - train_feature_area[:, 1])
    edge_aspectRatio_dif = abs(train_feature_aspectRatio[:, 0] - train_feature_aspectRatio[:, 1])
    edge_cx_dis = abs(train_feature_cx[:, 0] - train_feature_cx[:, 1])
    edge_cy_dis = abs(train_feature_cy[:, 0] - train_feature_cy[:, 1])
    edge_width_mean = abs(train_feature_width[:, 0] + train_feature_width[:, 1]) / 2
    edge_height_mean = abs(train_feature_height[:, 0] + train_feature_height[:, 1]) / 2
    edge_area_mean = abs(train_feature_area[:, 0] + train_feature_area[:, 1]) / 2
    edge_aspectRatio_mean = abs(train_feature_aspectRatio[:, 0] + train_feature_aspectRatio[:, 1]) / 2
    edge_cu_dis = numpy.power(numpy.power(edge_cx_dis, 2) + numpy.power(edge_cy_dis, 2), 0.5)  # center Euclidean distance
    edge_bd_dis = numpy.maximum(edge_cx_dis - edge_width_mean, edge_cy_dis - edge_height_mean)  # closest boundary distance
    edge_cu_dis_norm = numpy.float32(edge_cu_dis) / numpy.float32(edge_area_mean)
    edge_bd_dis_norm = numpy.float32(edge_bd_dis) / numpy.float32(edge_area_mean)
    edge_width_dif_norm = numpy.float32(edge_width_dif) / numpy.float32(edge_width_mean)
    edge_height_dif_norm = numpy.float32(edge_height_dif) / numpy.float32(edge_height_mean)
    edge_cx_dis_norm = numpy.float32(edge_cx_dis) / numpy.float32(edge_width_mean)
    edge_cy_dis_norm = numpy.float32(edge_cy_dis) / numpy.float32(edge_height_mean)
    edge_angle_dis = numpy.arctan(numpy.float32(edge_cx_dis) / numpy.float32(edge_cy_dis))
    # NOTE(review): edge_area_dif, edge_aspectRatio_dif and
    # edge_aspectRatio_mean are computed but never used below.
    feature = numpy.asarray([edge_cu_dis, edge_cu_dis_norm, edge_bd_dis, edge_bd_dis_norm, edge_cx_dis, edge_cx_dis_norm, edge_cy_dis, edge_cy_dis_norm, edge_width_dif, edge_width_dif_norm, edge_height_dif, edge_height_dif_norm, edge_angle_dis]).T

    # replace NaN or Inf with average (column-mean) values ###########
    feature_mean = numpy.mean(feature, axis=0)
    for i in range(feature.shape[0]):
        for j in range(feature.shape[1]):
            if numpy.isnan(feature[i, j]) or numpy.isinf(feature[i, j]):
                print 'NaN(Inf) found!'
                feature[i, j] = feature_mean[j]

    # ----- Round 1: no class balancing -----
    print 'w/o sample equalization'
    f_train, f_test, l_train, l_test = train_test_split(feature, train_label, test_size=0.2)
    classifier = GradientBoostingClassifier(max_depth=1)
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('adaboost_unequal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'AdaBoost classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = svm(kernel='linear')
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('svm_unequal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'SVM classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = RandomForestClassifier()
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('randomForest_unequal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'Random Forest classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'

    # ----- Round 2: oversample class 0 with SMOTE -----
    # (the held-out f_test / l_test from round 1 are reused unchanged)
    print 'Equalized Samples: SMOTE algorithm'
    feature0 = feature[train_label == 0, ...]
    label0 = train_label[train_label == 0]
    feature1 = feature[train_label == 1, ...]
    label1 = train_label[train_label == 1]
    feature0 = SMOTE(feature0, len(feature1) / len(feature0) * 100, 3)
    label0 = numpy.zeros(len(feature0))
    f_train = numpy.concatenate([feature0, feature1])
    l_train = numpy.concatenate([label0, label1])
    classifier = GradientBoostingClassifier(max_depth=1)
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('adaboost_smote_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'AdaBoost classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = svm(kernel='linear')
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('svm_smote_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'SVM classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = RandomForestClassifier()
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('randomForest_smote_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'Random Forest classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'

    # ----- Round 3: balance by randomly down-sampling class 1 -----
    print 'Sample equalization with random selection'
    feature0 = feature[train_label == 0, ...]
    label0 = train_label[train_label == 0]
    feature1 = feature[train_label == 1, ...]
    label1 = train_label[train_label == 1]
    idx = numpy.random.choice(feature1.shape[0], feature0.shape[0], replace=False)
    feature1 = feature1[idx, ...]
    label1 = label1[idx, ...]
    f_train = numpy.concatenate([feature0, feature1])
    l_train = numpy.concatenate([label0, label1])
    classifier = GradientBoostingClassifier(max_depth=1)
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('adaboost_equal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'AdaBoost classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = svm(kernel='linear')
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('svm_equal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'SVM classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = RandomForestClassifier()
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('randomForest_equal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'Random Forest classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    return
def main():
    """Entry point: train SVM, two decision trees (gini / information gain),
    LDA and -- extra credit -- a random forest on the cancer CSV data, then
    print and plot average precision, recall and F-measure for each."""
    train, test = load("cancer-data-train.csv"), load("cancer-data-test.csv")
    X_train, y_train = train
    X_test, y_test = test
    # arguments() (defined elsewhere in the file) may transform the feature
    # matrices from sys.argv and selects which predictions to print.
    X_train, X_test, print_pred = arguments(sys.argv, X_train, X_test)
    fig = plot.figure()
    # Passing training data and classes to find best C and number of leaf
    # nodes to use. Also creating graphs to display this info.
    classifier_plotter(X_train, y_train)
    # Setting up graphs for each plot (subplots 234-236; the 'Precsion'
    # typo is in the displayed titles themselves).
    ax1 = fig.add_subplot(234)
    ax1.set_title('Average Precsion Scores')
    ax1.set_ylabel('Precsion Score')
    ax1.set_xlabel('Classifier')
    ax2 = fig.add_subplot(235)
    ax2.set_title('Average Recall Scores')
    ax2.set_ylabel('Recall Score')
    ax2.set_xlabel('Classifier')
    ax3 = fig.add_subplot(236)
    ax3.set_title('Average F-measures')
    ax3.set_ylabel('F-measure')
    ax3.set_xlabel('Classifier')
    # Create and train the classifiers.
    classifier_svm, classifier_gini, classifier_ig, classifier_lda = svm(
        kernel='linear', C=0.1), dt(criterion='gini', max_leaf_nodes=10), dt(
            criterion='entropy', max_leaf_nodes=5), lda()
    classifier_svm.fit(X_train, y_train), classifier_gini.fit(
        X_train, y_train), classifier_ig.fit(X_train, y_train), classifier_lda.fit(
            X_train, y_train)
    # Make the predictions.
    pred_svm, pred_gini, pred_ig, pred_lda = classifier_svm.predict(
        X_test), classifier_gini.predict(X_test), classifier_ig.predict(
            X_test), classifier_lda.predict(X_test)
    # Calculate the precision, recall, f-measure for each classifier.
    avg_precision_svm, avg_precision_gini, avg_precision_ig, avg_precision_lda = average_precision_score(
        y_test, pred_svm), average_precision_score(
            y_test, pred_gini), average_precision_score(
                y_test, pred_ig), average_precision_score(y_test, pred_lda)
    recall_svm, recall_gini, recall_ig, recall_lda = recall_score(
        y_test, pred_svm, average='weighted'), recall_score(
            y_test, pred_gini, average='weighted'), recall_score(
                y_test, pred_ig, average='weighted'), recall_score(y_test, pred_lda,
                                                                   average='weighted')
    f_svm, f_gini, f_ig, f_lda = f1_score(
        y_test, pred_svm, average='weighted'), f1_score(
            y_test, pred_gini, average='weighted'), f1_score(
                y_test, pred_ig, average='weighted'), f1_score(y_test, pred_lda,
                                                               average='weighted')
    ################## Extra Credit #########################
    # Train classifier and make predictions on test set.
    classifier_rfc = rfc(n_estimators=100, max_depth=2)
    classifier_rfc.fit(X_train, y_train)
    pred_rfc = classifier_rfc.predict(X_test)
    # Calculate precision, recall and f-measure for Random Forest Classifier.
    avg_precision_rfc = average_precision_score(y_test, pred_rfc)
    recall_rfc = recall_score(y_test, pred_rfc, average='weighted')
    f_rfc = f1_score(y_test, pred_rfc, average='weighted')
    #########################################################
    # Printing scores and predictions.
    print_scores([[
        avg_precision_svm, avg_precision_gini, avg_precision_ig,
        avg_precision_lda, avg_precision_rfc
    ], [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc],
        [f_svm, f_gini, f_ig, f_lda, f_rfc]])
    print_predictions([pred_svm, pred_gini, pred_ig, pred_lda, pred_rfc],
                      print_pred)
    # Create the graphs for the scores.
    score_plotter(ax1, [
        avg_precision_svm, avg_precision_gini, avg_precision_ig,
        avg_precision_lda, avg_precision_rfc
    ])
    score_plotter(ax2,
                  [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc])
    score_plotter(ax3, [f_svm, f_gini, f_ig, f_lda, f_rfc])
    plot.tight_layout(w_pad=1.5, h_pad=2.0)
    plot.show()
def classifier_plotter(X_train, y_train):
    '''
    Takes the training data and runs through SVM, DT-Gini and DT-IG with
    multiple C values and max_leaf_nodes to try. The method then creates a
    graph by taking the average of cross validation scores for that C value
    or max_leaf_node.

    Params:
        X_train: List/s of features already standardized from the initial dataset
        y_train: List of classifiers for X_train taken from the original dataset

    Return:
        Outputs a graph of the average cross validation scores.
    '''
    # i/d track subplot setup state; d == 0 means the axis for the current
    # branch has not been configured yet.
    i, d = 1, 0
    # Values to test
    c_values = [0.01, 0.1, 1, 10, 100]
    k_values = [2, 5, 10, 20]
    classifiers = ["SVM", "DT-Gini & DT-IG"]
    for clf in classifiers:
        count = 1
        if clf == "SVM":
            if d == 0:
                ax = plot.subplot(231)
                ax.set_title(clf)
                plot.ylabel('F-measure')
                plot.xlabel('C values')
                d += 1
            print('SVM')
            # 10-fold CV mean score for each candidate C.
            for c in c_values:
                classi = svm(kernel='linear', C=c).fit(X_train, y_train)
                scores = cross_val_score(classi, X_train, y_train, cv=10)
                ax.plot(str(c), scores.mean(), 'bs')
                print('%d.) %.4f%%' % (count, scores.mean() * 100))
                count += 1
            plot.axis([None, None, 0.90, 1])
            print('\n')
            i += 1
            d = 0
        elif clf == "DT-Gini & DT-IG":
            count = 1
            if d == 0:
                ax = plot.subplot(232)
                plot.ylabel('F-measure')
                plot.xlabel('Max Leaf Nodes')
            print(' Gini\tIG')
            # 10-fold CV mean score for each leaf-node budget, for both
            # split criteria side by side.
            for k in k_values:
                gini_class, ig_class = dt(criterion='gini', max_leaf_nodes=k), dt(
                    criterion='entropy', max_leaf_nodes=k)
                score_gini, score_ig = cross_val_score(
                    gini_class, X_train, y_train, cv=10), cross_val_score(
                        ig_class, X_train, y_train, cv=10)
                ax.plot(str(k), score_gini.mean(), 'r.', str(k),
                        score_ig.mean(), 'g.')
                print('%d.) %.4f%%\t%.4f%%' %
                      (count, score_gini.mean() * 100, score_ig.mean() * 100))
                count += 1
            plot.axis([None, None, 0.889, 0.96])
            ax.legend(('Gini', 'IG'), loc=2)
            print('\n')
            i += 1
            d = 0
        else:
            # Unreachable with the fixed `classifiers` list above.
            return "Should not get here."
import glob
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.svm import LinearSVC as svm
import pickle

# Collect every file exactly one directory deep, grayscale it, resize to
# 350x350 and flatten into a feature row.
# NOTE(review): `cv2` is used but not imported in this chunk -- verify it
# is imported earlier in the file.
li = []
for infile in glob.glob('*/*'):
    a = cv2.imread(infile)
    gray = cv2.cvtColor(a, cv2.COLOR_BGR2GRAY)
    li.append(cv2.resize(gray, (350, 350)).flatten())
li = np.asarray(li)
# Show the first sample as a sanity check.
x = li[0]
plt.imshow(x.reshape(350, 350))
# Class label = first path component (the containing directory name).
y = []
for infile in glob.glob('*/*'):
    path = infile.split('/')
    y.append(path[0])
y = np.asarray(y)
clf = knn()  # NOTE(review): created but never fitted or used below -- confirm intent.
clf2 = svm()
clf2.fit(li, y)
# Persist the fitted LinearSVC for later use.
pickle.dump(clf2, open("model", 'wb'))
def wordGraph_train():
    # Train word-graph edge classifiers for ICDAR-2013 text segmentation.
    # (Duplicate of the earlier wordGraph_train in this file, reformatted.)
    #
    # Pipeline (Python 2; all side effects, nothing returned):
    #   1. Parse word/char ground truth and pair them into bounding-box pairs.
    #   2. Derive 13 geometric edge features per box pair.
    #   3. Train GradientBoosting, linear-SVM and RandomForest classifiers
    #      three times -- on raw (unbalanced) data, SMOTE-oversampled data,
    #      and randomly down-sampled data -- pickling each model and printing
    #      precision / recall / F1 for train and test splits.
    #
    # NOTE(review): AdaBoostClassifier is imported and the printouts say
    # "AdaBoost", but GradientBoostingClassifier is what is actually fitted.
    # RandomForestClassifier is imported twice.
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC as svm
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.cross_validation import train_test_split
    import pickle

    # Ground-truth locations for the ICDAR 2013 task 2.1/2.2 training set.
    imdir = '../../icdar2013/task21_22/train/image/'
    worddir = '../../icdar2013/task21_22/train/word_label/'
    chardir = '../../icdar2013/task21_22/train/char_label/'
    mywordparser = parseWord2013()
    mycharparser = parseChar2013()
    wordDataList = mywordparser.parseData(imdir, worddir)
    charDataList = mycharparser.parseData(imdir, chardir)
    DataList = cwCombine(wordDataList, charDataList)
    train_feature, train_label = plotbb_train(DataList)
    train_feature = numpy.asarray(train_feature)
    train_label = numpy.asarray(train_label)
    # Cache the raw features/labels next to the script.
    numpy.save('train_feature_seg', train_feature, )
    numpy.save('train_label_seg', train_label, )

    # Per-box geometry.  Indexing assumes train_feature is (n_pairs, 2, 4)
    # with the last axis (x, y, width, height) -- TODO confirm upstream.
    train_feature_width = train_feature[:, :, 2]
    train_feature_height = train_feature[:, :, 3]
    train_feature_area = train_feature[:, :, 2] * train_feature[:, :, 3]
    train_feature_aspectRatio = numpy.float32(train_feature_width) / numpy.float32(train_feature_height)
    train_feature_x = train_feature[:, :, 0]
    train_feature_y = train_feature[:, :, 1]
    train_feature_cx = train_feature_x + train_feature_width / 2
    train_feature_cy = train_feature_y + train_feature_height / 2

    # Pairwise (edge) statistics between the two boxes of each pair.
    edge_width_dif = abs(train_feature_width[:, 0] - train_feature_width[:, 1])
    edge_height_dif = abs(train_feature_height[:, 0] - train_feature_height[:, 1])
    edge_area_dif = abs(train_feature_area[:, 0] - train_feature_area[:, 1])
    edge_aspectRatio_dif = abs(train_feature_aspectRatio[:, 0] - train_feature_aspectRatio[:, 1])
    edge_cx_dis = abs(train_feature_cx[:, 0] - train_feature_cx[:, 1])
    edge_cy_dis = abs(train_feature_cy[:, 0] - train_feature_cy[:, 1])
    edge_width_mean = abs(train_feature_width[:, 0] + train_feature_width[:, 1]) / 2
    edge_height_mean = abs(train_feature_height[:, 0] + train_feature_height[:, 1]) / 2
    edge_area_mean = abs(train_feature_area[:, 0] + train_feature_area[:, 1]) / 2
    edge_aspectRatio_mean = abs(train_feature_aspectRatio[:, 0] + train_feature_aspectRatio[:, 1]) / 2
    edge_cu_dis = numpy.power(numpy.power(edge_cx_dis, 2) + numpy.power(edge_cy_dis, 2), 0.5)  # center Euclidean distance
    edge_bd_dis = numpy.maximum(edge_cx_dis - edge_width_mean, edge_cy_dis - edge_height_mean)  # closest boundary distance
    edge_cu_dis_norm = numpy.float32(edge_cu_dis) / numpy.float32(edge_area_mean)
    edge_bd_dis_norm = numpy.float32(edge_bd_dis) / numpy.float32(edge_area_mean)
    edge_width_dif_norm = numpy.float32(edge_width_dif) / numpy.float32(edge_width_mean)
    edge_height_dif_norm = numpy.float32(edge_height_dif) / numpy.float32(edge_height_mean)
    edge_cx_dis_norm = numpy.float32(edge_cx_dis) / numpy.float32(edge_width_mean)
    edge_cy_dis_norm = numpy.float32(edge_cy_dis) / numpy.float32(edge_height_mean)
    edge_angle_dis = numpy.arctan(numpy.float32(edge_cx_dis) / numpy.float32(edge_cy_dis))
    # NOTE(review): edge_area_dif, edge_aspectRatio_dif and
    # edge_aspectRatio_mean are computed but never used below.
    feature = numpy.asarray([edge_cu_dis, edge_cu_dis_norm, edge_bd_dis, edge_bd_dis_norm, edge_cx_dis, edge_cx_dis_norm, edge_cy_dis, edge_cy_dis_norm, edge_width_dif, edge_width_dif_norm, edge_height_dif, edge_height_dif_norm, edge_angle_dis]).T

    # replace NaN or Inf with average (column-mean) values ###########
    feature_mean = numpy.mean(feature, axis=0)
    for i in range(feature.shape[0]):
        for j in range(feature.shape[1]):
            if numpy.isnan(feature[i, j]) or numpy.isinf(feature[i, j]):
                print 'NaN(Inf) found!'
                feature[i, j] = feature_mean[j]

    # ----- Round 1: no class balancing -----
    print 'w/o sample equalization'
    f_train, f_test, l_train, l_test = train_test_split(feature, train_label, test_size=0.2)
    classifier = GradientBoostingClassifier(max_depth=1)
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('adaboost_unequal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'AdaBoost classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = svm(kernel='linear')
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('svm_unequal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'SVM classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = RandomForestClassifier()
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('randomForest_unequal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'Random Forest classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'

    # ----- Round 2: oversample class 0 with SMOTE -----
    # (the held-out f_test / l_test from round 1 are reused unchanged)
    print 'Equalized Samples: SMOTE algorithm'
    feature0 = feature[train_label == 0, ...]
    label0 = train_label[train_label == 0]
    feature1 = feature[train_label == 1, ...]
    label1 = train_label[train_label == 1]
    feature0 = SMOTE(feature0, len(feature1) / len(feature0) * 100, 3)
    label0 = numpy.zeros(len(feature0))
    f_train = numpy.concatenate([feature0, feature1])
    l_train = numpy.concatenate([label0, label1])
    classifier = GradientBoostingClassifier(max_depth=1)
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('adaboost_smote_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'AdaBoost classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = svm(kernel='linear')
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('svm_smote_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'SVM classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = RandomForestClassifier()
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('randomForest_smote_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'Random Forest classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'

    # ----- Round 3: balance by randomly down-sampling class 1 -----
    print 'Sample equalization with random selection'
    feature0 = feature[train_label == 0, ...]
    label0 = train_label[train_label == 0]
    feature1 = feature[train_label == 1, ...]
    label1 = train_label[train_label == 1]
    idx = numpy.random.choice(feature1.shape[0], feature0.shape[0], replace=False)
    feature1 = feature1[idx, ...]
    label1 = label1[idx, ...]
    f_train = numpy.concatenate([feature0, feature1])
    l_train = numpy.concatenate([label0, label1])
    classifier = GradientBoostingClassifier(max_depth=1)
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('adaboost_equal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'AdaBoost classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = svm(kernel='linear')
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('svm_equal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'SVM classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    classifier = RandomForestClassifier()
    classifier = classifier.fit(f_train, l_train)
    pickle.dump(classifier, open('randomForest_equal_seg.pkl', 'w'))
    pred_train = classifier.predict(f_train)
    pred_test = classifier.predict(f_test)
    f1_train = f1_score(l_train, pred_train)
    precision_train = precision_score(l_train, pred_train)
    recall_train = recall_score(l_train, pred_train)
    f1_test = f1_score(l_test, pred_test)
    precision_test = precision_score(l_test, pred_test)
    recall_test = recall_score(l_test, pred_test)
    print 'Random Forest classifier training(testing):'
    print 'precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')'
    print 'recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')'
    print 'f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')'
    print '\r\n'
    return