def val():
    """Fit NBSVM on a held-out split and print test accuracy.

    Uses a single held-out validation set rather than cross-validation.
    """
    X_train, y_train, X_test, y_test = val_data()
    print("Fitting Model")
    model = NBSVM()
    model.fit(X_train, y_train)
    print('Test Accuracy: %s' % model.score(X_test, y_test))
def main():
    """Train NBSVM on the IMDB dataset and report test accuracy."""
    X_train, y_train, X_test, y_test = load_imdb()
    print("Fitting Model")
    clf = NBSVM()
    clf.fit(X_train, y_train)
    print('Test Accuracy: %s' % clf.score(X_test, y_test))
def main():
    """One-vs-one NBSVM over five rating classes with majority voting.

    Fits a binary classifier for every 2-combination of the five class
    names, tallies per-sample votes, breaks ties uniformly at random,
    maps the winning class to a 1-5 star rating, then prints accuracy,
    the confusion matrix and a classification report.

    Fixes: consistent print() calls (the original mixed Python 2 print
    statements with call syntax), unused random_count removed, and the
    if/elif class-to-rating chain replaced by a lookup table.
    """
    np.set_printoptions(threshold=np.inf)
    vectorizer = prep_Vectorizer()
    X_test, y_test = load_testSet(vectorizer)
    mnbsvm = NBSVM()
    # scores[i] maps class name -> number of pairwise votes for sample i.
    scores = [defaultdict(int) for _ in range(len(y_test))]
    result = np.array([0] * len(y_test))
    print('length of result : ' + str(len(result)))
    print("Fitting Models now")
    classes = ['awesome', 'good', 'average', 'fair', 'poor']
    predictions = {}
    # One binary classifier per unordered class pair (one-vs-one scheme).
    for combo in itertools.combinations(classes, 2):
        X_train, y_train = load_trainSet(vectorizer, combo[0], combo[1])
        print('Fitting classifier : ' + " ".join(combo))
        mnbsvm.fit(X_train, y_train)
        predictions[" ".join(combo)] = mnbsvm.predict(X_test)
    # Class name -> star rating (replaces the original if/elif chain).
    rating = {'awesome': 5, 'good': 4, 'average': 3, 'fair': 2, 'poor': 1}
    for i in range(len(y_test)):
        for combo in predictions:
            class1, class2 = combo.split()
            # A prediction of 1 means the first class of the pair won.
            if predictions[combo][i] == 1:
                scores[i][class1] += 1
            else:
                scores[i][class2] += 1
        max_value = max(scores[i].values())
        # Break ties among top-voted classes uniformly at random.
        result_classes = [k for k in scores[i] if scores[i][k] == max_value]
        result[i] = rating[random.choice(result_classes)]
    print('Test Accuracy: %s' % accuracy_score(y_test, result))
    print('Confusion Matrix : ')
    print(confusion_matrix(y_test, result))
    print('Classification Report :')
    print(classification_report(y_test, result))
def test():
    """Fit NBSVM on the training split and write test predictions to result.csv.

    Output CSV has a header row ('Id', 'Category') followed by one row
    per prediction: the 0-based sample index and the integer label.

    Fix: the original kept a manual counter and indexed the prediction
    array while also iterating it (the loop variable was unused);
    enumerate() replaces both.
    """
    X_train, y_train, X_test = test_data()
    clf = NBSVM()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    with open("result.csv", 'w+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        for i, label in enumerate(y_pred):
            writer.writerow([i, int(label)])
def get_nbsvm_model(self):
    """Return a CountVectorizer+NBSVM pipeline and its search grid."""
    # Tokens are word runs or individual punctuation characters.
    token_pattern = r'\w+|[%s]' % string.punctuation
    vect = CountVectorizer(binary=True, token_pattern=token_pattern)
    pipeline = Pipeline([('tfidf', vect), ('clf', NBSVM())])
    parameters = {
        # 'tfidf__max_df': (0.25, 0.5, 0.75,1.0),
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        # 'tfidf__max_features': (100,500,1000,10000),
        # 'clf__alpha':(1e-3,1e-2,1e-1,1),
        # 'clf__loss':('squared_hinge','hinge'),
        # 'clf__C':(1,5,10),
    }
    return pipeline, parameters
def main(train_file, test_file, ngram=(1, 3)):
    """Train an NBSVM pipeline on TSV text/label files and report acc/F1."""
    print('loading...')
    # to shuffle: train.iloc[np.random.permutation(len(df))]
    train = pd.read_csv(train_file, delimiter='\t', encoding='utf-8',
                        header=0, names=['text', 'label'])
    test = pd.read_csv(test_file, delimiter='\t', encoding='utf-8',
                       header=0, names=['text', 'label'])

    print('vectorizing...')
    # Vectorizer + classifier wired together as one pipeline.
    clf = Pipeline([('vect', CountVectorizer()), ('nbsvm', NBSVM())])
    clf.set_params(**{
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram,
        'vect__binary': True
    })
    # X_train = vect.fit_transform(train['text'])
    # X_test = vect.transform(test['text'])

    print('fitting...')
    clf.fit(train['text'], train['label'])

    print('classifying...')
    pred = clf.predict(test['text'])

    print('testing...')
    acc = accuracy_score(test['label'], pred)
    f1 = semeval_senti_f1(pred, test['label'])
    print('NBSVM: acc=%f, f1=%f' % (acc, f1))
def param_search():
    """Grid-search NBSVM's C over TF-IDF features of the combined dataset."""
    param_grid = {
        'C': np.linspace(0.25, 0.75, num=21),
        # 'beta' : [0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
    }
    print('Params : ', param_grid)

    model = NBSVM()
    # use 1 less core than available, prevents locking up of laptop
    n_cores = multiprocessing.cpu_count() - 1
    g = GridSearchCV(model, param_grid=param_grid, scoring=scoring,
                     n_jobs=n_cores, cv=100, verbose=1)

    np.random.seed(1000)
    print('Loading data')
    texts, labels, label_map = load_both()
    print('Tokenizing texts')
    x_counts = tokenize(texts)
    print('Finished tokenizing texts')
    data = tfidf(x_counts)
    print('Finished computing TF-IDF')

    g.fit(data, labels)

    print("Best parameters set found on development set:")
    print()
    print(g.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = g.cv_results_['mean_test_score']
    stds = g.cv_results_['std_test_score']
    # 'candidate' avoids shadowing the search grid name from above.
    for mean, std, candidate in zip(means, stds, g.cv_results_['params']):
        print("%0.6f (+/-%0.06f) for %r" % (mean, std * 2, candidate))
def model_gen():
    """Build an NBSVM classifier with the tuned C and beta values."""
    return NBSVM(C=0.3, beta=0.5)
# NOTE(review): this chunk is truncated at both edges — the pipeline closed
# on the first line starts before this view, and the final pipeline below is
# cut off mid-expression. Only the middle assignments are complete here.
('clf', LogisticRegression(random_state=RANDOM_SEED, solver='liblinear'))])
# Bag-of-words pipelines; all share the same whitespace tokenizer (r"\S+").
# Unigram linear SVM baseline.
clf_bow_count_svm = Pipeline([('countvectorizer', CountVectorizer(token_pattern=r"\S+", lowercase=True, ngram_range=(1, 1))), ('clf', LinearSVC(random_state=RANDOM_SEED, max_iter=1000000))])
# Unigram NBSVM.
clf_bow_count_nbsvm = Pipeline([('countvectorizer', CountVectorizer(token_pattern=r"\S+", lowercase=True, ngram_range=(1, 1))), ('clf', NBSVM(random_state=RANDOM_SEED, max_iter=1000000))])
# Uni+bigram multinomial and complement naive Bayes variants.
clf_bow_count_mnb_bigram = Pipeline([('countvectorizer', CountVectorizer(token_pattern=r"\S+", lowercase=True, ngram_range=(1, 2))), ('clf', MultinomialNB())])
clf_bow_count_cnb_bigram = Pipeline([('countvectorizer', CountVectorizer(token_pattern=r"\S+", lowercase=True, ngram_range=(1, 2))), ('clf', ComplementNB())])
# NOTE(review): truncated — this pipeline continues past the visible chunk.
clf_bow_count_lr_bigram = Pipeline([('countvectorizer', CountVectorizer(token_pattern=r"\S+", lowercase=True,
# Fit five classifiers on the same training features and Rating labels.
model1 = MultinomialNB(alpha=0.001)
model1.fit(train_features, train["Rating"])
model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit(train_features, train["Rating"])
model3 = RandomForestClassifier()
model3.fit(train_features, train["Rating"])
model4 = GradientBoostingClassifier()
model4.fit(train_features, train["Rating"])
model5 = NBSVM(C=0.01)
model5.fit(train_features, train["Rating"])
# The first four predict on densified features; NBSVM keeps the sparse matrix.
pred_1 = model1.predict(test_features.toarray())
pred_2 = model2.predict(test_features.toarray())
pred_3 = model3.predict(test_features.toarray())
pred_4 = model4.predict(test_features.toarray())
pred_5 = model5.predict(test_features)
# Only the MultinomialNB results are reported in this visible chunk.
print("MultinomialNB accuracy_score: ", accuracy_score(test["Rating"], pred_1))
print(classification_report(test['Rating'], pred_1, target_names=['1', '2', '3', '4', '5']))
cnf_matrix = confusion_matrix(test['Rating'], pred_1)
# NOTE(review): chunk truncated here — the plot_confusion_matrix call
# continues past this view.
plot_confusion_matrix(cnf_matrix,
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from numpy import float32
from nbsvm import NBSVM
import string


def load_data():
    """Vectorize train/test text into binary float32 matrices plus labels.

    NOTE(review): reads module-level `train` and `test` frames (and `np`)
    defined outside this chunk — confirm they exist before this runs.
    """
    vectorizer = CountVectorizer(binary=True)
    X_train = vectorizer.fit_transform(train.lowercase_parsed_content)
    X_train = X_train.astype(float32)
    y_train = np.array(train.response)
    # Test text is transformed with the vocabulary fitted on train.
    X_test = vectorizer.transform(test.lowercase_parsed_content)
    X_test = X_test.astype(float32)
    y_test = np.array(test.response)
    return X_train, y_train, X_test, y_test


print("Loading data...")
X_train, y_train, X_test, y_test = load_data()
mnbsvm = NBSVM()
print("Training model...")
mnbsvm.fit(X_train, y_train)
predicted_NBSVM = mnbsvm.predict(X_test)
# Fix: call form replaces the Python 2 print statement, making this line
# consistent with the chunk's other print() calls and valid on Python 3.
print(roc_auc_score(test.response, predicted_NBSVM))
########################################################### # NBSVM # ########################################################### #token_pattern = r'[a-zA-Z]+' #token_pattern = r'\w+' # token_pattern = r'[a-zA-Z]+|[\d+\/\d+]+' #token_pattern = r'\w+|[%s]' % string.punctuation # token_pattern = r'[a-zA-Z]+|[\d+\/\d+]+|[%s]' % string.punctuation token_pattern = r'[\d+\/\d+]+|\w+|[%s]' % string.punctuation pclf_NBSVM = Pipeline([ ('vect', CountVectorizer(ngram_range=(1,3), token_pattern=token_pattern, binary=True)), ('clf', NBSVM(beta=0.3, C=1, alpha=1.0, fit_intercept=False)) ]) pclf_NBSVM.fit(X_train, y_train) print('Test Accuracy: %s' % pclf_NBSVM.score(X_test, y_test)) params = {"vect__ngram_range": [(1,2)], "vect__binary": [True], "clf__alpha": [1], "clf__C": [1], "clf__beta": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "clf__fit_intercept": [False] } # Perform randomized search CV to find best hyperparameters
def main():
    """Three-class (pos/avg/neg) voting over three pairwise NBSVM models.

    Fits one binary NBSVM per class pair, tallies votes per test sample,
    breaks exact three-way ties at random, and prints accuracy, the
    confusion matrix and a classification report.

    Fixes: consistent print() calls (the original mixed Python 2 print
    statements with call syntax) and the triplicated fit/predict code
    factored into a private helper.
    """
    np.set_printoptions(threshold=np.inf)
    vectorizer = prep_Vectorizer()
    X_test, y_test = load_testSet(vectorizer)
    mnbsvm = NBSVM()
    # scores[i] holds vote counts for [pos, avg, neg] on sample i.
    scores = [[0, 0, 0] for _ in range(len(y_test))]
    result = np.array(["none"] * len(y_test))
    print("Fitting Models now")

    def _fit_predict(first, second, msg):
        # Train one pairwise classifier and return its test predictions.
        X_train, y_train = load_trainSet(vectorizer, first, second)
        print(msg)
        mnbsvm.fit(X_train, y_train)
        return mnbsvm.predict(X_test)

    pos_avg_res = _fit_predict('pos', 'avg', 'Fitting pos-avg classifier')
    avg_neg_res = _fit_predict('avg', 'neg', 'Fitting avg-neg classifier')
    pos_neg_res = _fit_predict('pos', 'neg', 'Fitting pos-neg classifier')

    random_count = 0  # samples resolved by random tie-break
    labels = ["pos", "avg", "neg"]
    for i in range(len(y_test)):
        # A prediction of 1 votes for the first class of each pair.
        if pos_avg_res[i] == 1:
            scores[i][0] += 1
        else:
            scores[i][1] += 1
        if avg_neg_res[i] == 1:
            scores[i][1] += 1
        else:
            scores[i][2] += 1
        if pos_neg_res[i] == 1:
            scores[i][0] += 1
        else:
            scores[i][2] += 1
        if scores[i][0] == scores[i][1] == scores[i][2]:
            # Perfect three-way tie: pick a class uniformly at random.
            result[i] = random.choice(["pos", "avg", "neg"])
            random_count += 1
        else:
            result[i] = labels[scores[i].index(max(scores[i]))]

    print('Test Accuracy: %s' % accuracy_score(y_test, result))
    print('Confusion Matrix : ')
    print(confusion_matrix(y_test, result, labels=["pos", "avg", "neg"]))
    print('Classification Report :')
    print(classification_report(y_test, result, labels=["pos", "avg", "neg"]))
# For topic classification (which is what we're doing here), # keywords usually work great, at least as a baseline. # We'll use scikit's tf-idf. vectorizer = TfidfVectorizer() X = vectorizer.fit_transform(train.complaint) y = train.issue # ## Train a classifier # We'll try a simple multinomial naive bayes classifier. # Technically this should use integer counts (because that's what a multinomial # distribution represents), but it works with td-idf in practice too. model = NBSVM() model.fit(X, y) # ## Persist model # Create target directory if necessary. if not os.path.exists(MODEL_DIRECTORY): os.mkdir(MODEL_DIRECTORY) # And persist vectorizer and classifier objects. joblib.dump(vectorizer, MODEL_DIRECTORY + 'vectorizer.pkl') joblib.dump(model, MODEL_DIRECTORY + 'model.pkl')
def test_NBSVM_initializes_with_params():
    """Constructor must store alpha, beta and C verbatim."""
    clf = NBSVM(alpha=0.1, beta=0.2, C=0.3)
    assert (clf.alpha, clf.beta, clf.C) == (0.1, 0.2, 0.3)
def test_NBSVM_scores_well_with_tfidf_features(tfidf_newsgroups):
    """NBSVM should top 90% accuracy on the TF-IDF newsgroups fixture."""
    X_tr, y_tr, X_te, y_te = tfidf_newsgroups
    model = NBSVM()
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    assert accuracy_score(predictions, y_te) > 0.9
def test_NBSVM_extracts_classes(count_newsgroups):
    """Fitting must expose classes_ holding all three labels."""
    features, labels, _, _ = count_newsgroups
    model = NBSVM()
    model.fit(features, labels)
    assert hasattr(model, 'classes_')
    assert len(model.classes_) == 3
def test_NBSVM_raises_on_sparse_negative():
    """Sparse X containing a negative entry must raise ValueError."""
    features = scipy.sparse.csr_matrix([[1., -1.]])
    target = scipy.sparse.csr_matrix([1])
    with pytest.raises(ValueError):
        NBSVM().fit(features, target)
def test_NBSVM_raises_on_dense_negative_X():
    """Dense X containing a negative entry must raise ValueError."""
    with pytest.raises(ValueError):
        NBSVM().fit([[1., -1.]], [1])