def train_classifier(texts, y):
    '''
    Here is a perfect example of the "feel it ... func it" philosophy: the
    pype call uses the function arguments and function body to specify three
    variables: texts, a list of strings; y, a list of floats; and vectorizer,
    a scikit-learn object that vectorizes text.  This reiterates the advice
    that you should use the function body and function arguments to declare
    your scope whenever you can.

    Line-by-line, here we go:

    {'vectorizer': vectorizer.fit,
     'X': vectorizer.transform},

    We build a dict, the first element of which is the fit vectorizer.
    Luckily, the fit function returns an instance of the trained vectorizer,
    so we do not need to use _do; that instance is assigned to the
    'vectorizer' key.  Because iterating through dictionaries in Python 3.6
    preserves the order in which the keys were declared, the vectorizer is
    fit on the texts before the next entry runs.  We need this instance of
    the vectorizer to run the classifier on unknown texts.  After this, we
    apply transform to convert the texts into a training matrix keyed by
    'X', whose rows are texts and whose columns are words.

    _a('classifier', (Classifier().fit, _['X'], y)),

    Finally, we can build a classifier.  _a, or _assoc, means we are adding
    a key-value pair to the previous dictionary.  The value is a new
    instance of our Classifier, trained through the fit function on the
    text-word matrix 'X' and the labels vector y.

    _d('X'),

    Since we don't need the X matrix anymore, we delete it from the returned
    JSON, which now contains only 'vectorizer' and 'classifier', the two
    things we will need to classify unknown texts.
    '''
    vectorizer = Vectorizer()

    return p(texts,
             {'vectorizer': vectorizer.fit,
              'X': vectorizer.transform},
             _a('classifier', (Classifier().fit, _['X'], y)),
             _d('X'),
             )
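# A minimal usage sketch (hypothetical data names): the dict returned by
# train_classifier carries the fitted vectorizer and classifier, which is all
# we need to score unseen documents.
model = train_classifier(train_texts, train_labels)
X_new = model['vectorizer'].transform(new_texts)
predictions = model['classifier'].predict(X_new)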
# from sklearn.datasets import fetch_mldata
# from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier as Classifier
from sklearn.metrics import confusion_matrix, accuracy_score
import time

# Load a pre-split train/test set so the comparison is easier to reproduce
from sklearn.externals import joblib
data_train, data_test, label_train, label_test = joblib.load("mnist")
# mnist = fetch_mldata("MNIST original", data_home=".")
# data = np.asarray(mnist.data, np.float32)
# data_train, data_test, label_train, label_test = train_test_split(data, mnist.target, test_size=0.2)

classifier = Classifier()

start = time.time()  # start timing the training step
classifier.fit(data_train, label_train)
training_time = time.time() - start

start = time.time()  # start timing the prediction step
result = classifier.predict(data_test)
predict_time = time.time() - start

print(training_time, predict_time)

# Compute the confusion matrix and accuracy
cmat = confusion_matrix(label_test, result)
acc = accuracy_score(label_test, result)
print(cmat)
print(acc)
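# If the pre-split "mnist" dump is not available, a roughly equivalent split
# can be built with fetch_openml, which replaced the deprecated fetch_mldata
# in newer scikit-learn releases (a sketch, assuming scikit-learn >= 0.20):
#
# import numpy as np
# from sklearn.datasets import fetch_openml
# from sklearn.model_selection import train_test_split
# mnist = fetch_openml("mnist_784", version=1)
# data = np.asarray(mnist.data, np.float32)
# data_train, data_test, label_train, label_test = train_test_split(
#     data, mnist.target, test_size=0.2)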
# Convert train_y to a 1d array to silence sklearn conversion warnings
train_y = train_y.as_matrix().reshape((train_y.shape[0], ))

test_x = pd.read_csv('test_x.csv')
test_y = pd.read_csv('test_y.csv')

# Using CV, discovered n=526 to be best (rng is seeded, results will not change)
n_estimators = [526]

best_error = float('Inf')
best_model = None
best_n = None

for n in n_estimators:
    clf = Classifier(n_estimators=n, criterion='entropy', n_jobs=-1,
                     random_state=123)
    clf.fit(train_x, train_y)
    predictions = clf.predict(valid_x)
    error = loss(valid_y, predictions)
    print "Validation error: " + str(error)

    if error < best_error:
        best_error = error
        best_model = clf
        best_n = n

print "Best n: " + str(best_n)

predictions = best_model.predict(test_x)
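# The cross-validation that picked n=526 is not shown in this fragment.  A
# rough, hypothetical sketch of that kind of search (grid values are made up;
# assumes Classifier is sklearn's RandomForestClassifier and a scikit-learn
# version that provides sklearn.model_selection):
#
# from sklearn.model_selection import GridSearchCV
# search = GridSearchCV(
#     Classifier(criterion='entropy', n_jobs=-1, random_state=123),
#     param_grid={'n_estimators': [250, 400, 526, 750]},
#     cv=5)
# search.fit(train_x, train_y)
# print "Best parameters: " + str(search.best_params_)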
def train_model():
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n * _DECOY_PROPORTION
    FP_n = feat.count_TIL_false_pos()
    wiki_n = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same order of magnitude
    skip_FP = FP_n // TIL_n
    print "Skipping every {} value in FP".format(skip_FP)

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print "Loading features"
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    ITR_decoy = query_skip_decoys()

    print "Building training set"
    ITR_train = list(feat.TIL_full_corpus_iter())

    print "Building the false positive set"
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print "Building corpus iter"
    ITR = feat.chainer(ITR_train, ITR_FP, ITR_decoy)
    ITR = list(ITR)

    Y = np.zeros(len(ITR))
    Y[:TIL_n] = 1.0

    TTS = train_test_split
    x_train, x_test, y_train, y_test = TTS(ITR, Y, test_size=0.2)

    print "Proportion of answers {}/{}".format(y_train.sum(), y_test.sum())

    print "Calculating the wordVecs for train"
    vec_train = np.concatenate([
        getWordVecs(text, weight, features, dimension)
        for text, weight in x_train
    ])

    print "Building the scaler"
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print "Saving the scaler"
    joblib.dump(scaler, f_norm_scale)

    print "Scaling train vectors"
    vec_train = scaler.transform(vec_train)

    print "Calculating the wordVecs for test"
    vec_test = np.concatenate([
        getWordVecs(text, weight, features, dimension)
        for text, weight in x_test
    ])

    print "Scaling test vectors"
    vec_test = scaler.transform(vec_test)

    print "Train size/TP in sample", vec_train.shape, (y_train == 1).sum()
    print "Test size/TP in sample", vec_test.shape, (y_test == 1).sum()

    print "Training classifier"

    #from sklearn.linear_model import SGDClassifier as Classifier
    #from sklearn.linear_model import LogisticRegression as Classifier
    #from sklearn.linear_model import BayesianRidge as Classifier
    #from sklearn.naive_bayes import BernoulliNB as Classifier
    #from sklearn.naive_bayes import GaussianNB as Classifier
    #from sklearn.ensemble import RandomForestClassifier as Classifier

    # This seems to be the best... but high FP rate
    from sklearn.ensemble import ExtraTreesClassifier as Classifier
    #from sklearn.naive_bayes import BernoulliNB as Classifier

    #clf = Classifier(loss='log', penalty='l1', verbose=2)  # SGD
    #clf = Classifier(C=2500, verbose=2)  # LogisticRegression
    #clf = Classifier()  # Naive Bayes
    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees

    clf.fit(vec_train, y_train)
    print 'Test Accuracy: %.3f' % clf.score(vec_test, y_test)

    idx_TP = np.array(y_test) > 0
    vec_TP = np.array(vec_test)[idx_TP]
    y_TP = np.array(y_test)[idx_TP]
    print 'Test Accuracy on TP: %.3f' % clf.score(vec_TP, y_TP)

    vec_FP = np.array(vec_test)[~idx_TP]
    y_FP = np.array(y_test)[~idx_TP]
    print 'Test Accuracy on FP: %.3f' % clf.score(vec_FP, y_FP)

    print "Saving the classifier"
    joblib.dump(clf, f_clf)

    # Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    pred_probas = clf.predict_proba(vec_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
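# The helper getWordVecs is not defined in this fragment.  A rough,
# hypothetical sketch of what it might do, judging only from how it is called
# above (assuming an older gensim API where `word in features` and
# `features[word]` work directly on the loaded model): average the word
# vectors of the tokens in `text`, scaled by `weight`, and return a
# (1, dimension) row so np.concatenate can stack one row per document.
def getWordVecs(text, weight, features, dimension):
    vec = np.zeros((1, dimension))
    count = 0
    for word in text:
        if word in features:
            vec += features[word] * weight
            count += 1
    if count:
        vec /= count
    return vec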
X, Y = zip(*list(TRAINING_ITR()))
X = np.concatenate(X)

TTS = train_test_split
x_train, x_test, y_train, y_test = TTS(X, Y, test_size=0.17)

print "Scaling train vectors"
x_train = scalar.transform(x_train)

print "Scaling test vectors"
x_test = scalar.transform(x_test)

print "Training classifier"
from sklearn.ensemble import ExtraTreesClassifier as Classifier
clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
clf.fit(x_train, y_train)

print 'Test Accuracy: %.3f' % clf.score(x_test, y_test)

# Per-class accuracy; if a class is missing from the test split, report -1
y_test = np.array(y_test)
for n in _INV_STATUS_MAP.keys():
    idx = y_test == n
    try:
        score = clf.score(x_test[idx], y_test[idx])
    except:
        score = -1
    print 'Test Accuracy on {}: {:0.3f}'.format(_INV_STATUS_MAP[n], score)
print

print "Suggesting some new entries"
    # (inside a loop over the training images; the loop header is not part of
    # this fragment)
    ret, img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    x.append(np.asarray(img, dtype=np.float64))
    y.append(name.split("_")[0])

x, y = np.array(x), np.array(y)

# Flatten each 150x150 binary image into a feature vector
x_data = x.reshape((len(x), 150, 150, 1))
x_data = list(x_data)
for i in range(len(x_data)):
    x_data[i] = x_data[i].flatten()
x_data = np.array(x_data)

# Standardize the features and fit the forest
scaler = StandardScaler()
scaler.fit(x_data)
x_train = scaler.transform(x_data)

model = Classifier(n_estimators=100, max_depth=30, random_state=0)
model.fit(x_train, y)


def predict_gesture(img):
    ret, img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    x_test = np.array([np.asarray(img, dtype=np.float64)]).reshape(
        (1, 150, 150, 1))
    # NOTE: unlike the training data, the input here is divided by 255 rather
    # than passed through the fitted StandardScaler
    x_test = x_test / 255
    x_test = list(x_test)
    for i in range(len(x_test)):
        x_test[i] = x_test[i].flatten()
    x_test = np.array(x_test)
    r = model.predict(x_test)
    return r[0]
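# Hypothetical usage: load a grayscale frame, resize it to the 150x150 input
# size the model was trained on, and classify it (the file name is assumed).
sample = cv2.imread("gesture_sample.png", cv2.IMREAD_GRAYSCALE)
sample = cv2.resize(sample, (150, 150))
print(predict_gesture(sample))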