Example #1
def val():
	# Using a held-out validation split here rather than cross-validation
	X_train, y_train, X_test, y_test = val_data()
	print("Fitting Model")
	mnbsvm = NBSVM()
	mnbsvm.fit(X_train, y_train)
	print('Test Accuracy: %s' % mnbsvm.score(X_test, y_test))
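A held-out split is quicker, but a cross-validated score is easy to add. A minimal sketch (not in the original), assuming `val_data()` returns the same split as above; `cross_val_score` is standard scikit-learn:

from sklearn.model_selection import cross_val_score

X_train, y_train, _, _ = val_data()
cv_scores = cross_val_score(NBSVM(), X_train, y_train, cv=5)  # 5-fold CV on the training portion
print('CV accuracy: %0.3f (+/- %0.3f)' % (cv_scores.mean(), cv_scores.std() * 2))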
Example #2
def main():

    X_train, y_train, X_test, y_test = load_imdb()

    print("Fitting Model")

    mnbsvm = NBSVM()
    mnbsvm.fit(X_train, y_train)
    print('Test Accuracy: %s' % mnbsvm.score(X_test, y_test))
Example #4
def main():
    np.set_printoptions(threshold=np.inf)

    vectorizer = prep_Vectorizer()
    X_test, y_test = load_testSet(vectorizer)
    mnbsvm = NBSVM()

    scores = [defaultdict(int) for _ in range(len(y_test))]
    result = np.array([0] * len(y_test))
    print('length of result : ' + str(len(result)))
    print("Fitting Models now")

    classes = ['awesome','good','average','fair','poor']
    predictions = {}

    for combo in itertools.combinations(classes, 2):
        X_train, y_train = load_trainSet(vectorizer, combo[0], combo[1])
        print('Fitting classifier : ' + " ".join(combo))
        mnbsvm.fit(X_train, y_train)
        predictions[" ".join(combo)] = mnbsvm.predict(X_test)

    random_count = 0
    for i in range(len(y_test)):
        for combo in predictions:
            [class1,class2] = combo.split()
            if predictions[combo][i] == 1:
                scores[i][class1] += 1
            else:
                scores[i][class2] += 1

        max_value = max(scores[i].values())
        result_classes = []
        for klass in scores[i]:
            if scores[i][klass] == max_value:
                result_classes.append(klass)

        result_class = random.choice(result_classes)
        if result_class == 'awesome':
            result[i] = 5
        elif result_class == 'good':
            result[i] = 4
        elif result_class == 'average':
            result[i] = 3
        elif result_class == 'fair':
            result[i] = 2
        else:
            result[i] = 1


    print('Test Accuracy: %s' % accuracy_score(y_test, result))
    print('Confusion Matrix : ')
    print(confusion_matrix(y_test, result))
    print('Classification Report :')
    print(classification_report(y_test, result))
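The pairwise vote above is essentially one-vs-one classification, which scikit-learn also packages as `OneVsOneClassifier`. A minimal sketch (not in the original), assuming a hypothetical loader that returns all five classes at once:

from sklearn.multiclass import OneVsOneClassifier

X_all, y_all = load_fullTrainSet(vectorizer)  # hypothetical loader for all five classes
ovo = OneVsOneClassifier(NBSVM())             # fits one NBSVM per class pair
ovo.fit(X_all, y_all)
result = ovo.predict(X_test)

Note that `OneVsOneClassifier` breaks ties by aggregate decision values rather than the random choice used above.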
Example #5
def test():
	X_train, y_train, X_test = test_data()
	mnbsvm = NBSVM()
	mnbsvm.fit(X_train, y_train)
	y_test = mnbsvm.predict(X_test)
	with open("result.csv", 'w+', newline='') as csvfile:
		writer = csv.writer(csvfile)
		writer.writerow(['Id', 'Category'])
		for i, label in enumerate(y_test):
			writer.writerow([i, int(label)])
Example #6
    def get_nbsvm_model(self):
        token_pattern = r'\w+|[%s]' % string.punctuation
        pipeline = Pipeline([('tfidf',
                              CountVectorizer(binary=True,
                                              token_pattern=token_pattern)),
                             ('clf', NBSVM())])

        parameters = {
            # 'tfidf__max_df': (0.25, 0.5, 0.75,1.0),
            'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
            # 'tfidf__max_features': (100,500,1000,10000),
            # 'clf__alpha':(1e-3,1e-2,1e-1,1),
            # 'clf__loss':('squared_hinge','hinge'),
            # 'clf__C':(1,5,10),
        }
        return pipeline, parameters
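A brief usage sketch (assumed caller, not part of the original class): the returned pair plugs straight into `GridSearchCV`:

from sklearn.model_selection import GridSearchCV

pipeline, parameters = clf_factory.get_nbsvm_model()  # clf_factory: an instance of the class above
grid = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid.fit(texts, labels)  # texts, labels: hypothetical raw documents and targets
print(grid.best_params_)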
Example #7
def main(train_file, test_file, ngram=(1, 3)):
    print('loading...')
    train = pd.read_csv(train_file,
                        delimiter='\t',
                        encoding='utf-8',
                        header=0,
                        names=['text', 'label'])

    # to shuffle:
    # train = train.iloc[np.random.permutation(len(train))]

    test = pd.read_csv(test_file,
                       delimiter='\t',
                       encoding='utf-8',
                       header=0,
                       names=['text', 'label'])

    print('vectorizing...')
    vect = CountVectorizer()
    classifier = NBSVM()

    # create pipeline
    clf = Pipeline([('vect', vect), ('nbsvm', classifier)])
    params = {
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram,
        'vect__binary': True
    }
    clf.set_params(**params)

    # X_train = vect.fit_transform(train['text'])
    # X_test = vect.transform(test['text'])

    print('fitting...')
    clf.fit(train['text'], train['label'])

    print('classifying...')
    pred = clf.predict(test['text'])

    print('testing...')
    acc = accuracy_score(test['label'], pred)
    f1 = semeval_senti_f1(pred, test['label'])
    print('NBSVM: acc=%f, f1=%f' % (acc, f1))
Example #8
def param_search():
    params = {
        'C': np.linspace(0.25, 0.75, num=21),
        #'beta' : [0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
    }
    print('Params : ', params)

    model = NBSVM()

    # use one core fewer than available; prevents locking up the machine
    n_cores = multiprocessing.cpu_count() - 1

    g = GridSearchCV(model,
                     param_grid=params,
                     scoring=scoring,
                     n_jobs=n_cores,
                     cv=100,
                     verbose=1)

    np.random.seed(1000)
    print('Loading data')
    texts, labels, label_map = load_both()
    print('Tokenizing texts')
    x_counts = tokenize(texts)
    print('Finished tokenizing texts')
    data = tfidf(x_counts)
    print('Finished computing TF-IDF')

    g.fit(data, labels)
    print("Best parameters set found on development set:")
    print()
    print(g.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = g.cv_results_['mean_test_score']
    stds = g.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, g.cv_results_['params']):
        print("%0.6f (+/-%0.06f) for %r" % (mean, std * 2, params))
Example #9
def model_gen():
    model = NBSVM(C=0.3, beta=0.5)
    return model
Example #10
                                 ('clf',
                                  LogisticRegression(random_state=RANDOM_SEED,
                                                     solver='liblinear'))])
    clf_bow_count_svm = Pipeline([('countvectorizer',
                                   CountVectorizer(token_pattern=r"\S+",
                                                   lowercase=True,
                                                   ngram_range=(1, 1))),
                                  ('clf',
                                   LinearSVC(random_state=RANDOM_SEED,
                                             max_iter=1000000))])
    clf_bow_count_nbsvm = Pipeline([('countvectorizer',
                                     CountVectorizer(token_pattern=r"\S+",
                                                     lowercase=True,
                                                     ngram_range=(1, 1))),
                                    ('clf',
                                     NBSVM(random_state=RANDOM_SEED,
                                           max_iter=1000000))])

    clf_bow_count_mnb_bigram = Pipeline([('countvectorizer',
                                          CountVectorizer(token_pattern=r"\S+",
                                                          lowercase=True,
                                                          ngram_range=(1, 2))),
                                         ('clf', MultinomialNB())])
    clf_bow_count_cnb_bigram = Pipeline([('countvectorizer',
                                          CountVectorizer(token_pattern=r"\S+",
                                                          lowercase=True,
                                                          ngram_range=(1, 2))),
                                         ('clf', ComplementNB())])
    clf_bow_count_lr_bigram = Pipeline([
        ('countvectorizer',
         CountVectorizer(token_pattern=r"\S+",
                         lowercase=True,
                         ngram_range=(1, 2))),
        ('clf',
         LogisticRegression(random_state=RANDOM_SEED,
                            solver='liblinear'))])
Example #11
model1 = MultinomialNB(alpha=0.001)
model1.fit(train_features, train["Rating"])

model2 = SGDClassifier(loss='modified_huber',
                       max_iter=5,  # formerly n_iter; renamed in scikit-learn 0.21
                       random_state=0,
                       shuffle=True)
model2.fit(train_features, train["Rating"])

model3 = RandomForestClassifier()
model3.fit(train_features, train["Rating"])

model4 = GradientBoostingClassifier()
model4.fit(train_features, train["Rating"])

model5 = NBSVM(C=0.01)
model5.fit(train_features, train["Rating"])

pred_1 = model1.predict(test_features.toarray())
pred_2 = model2.predict(test_features.toarray())
pred_3 = model3.predict(test_features.toarray())
pred_4 = model4.predict(test_features.toarray())
pred_5 = model5.predict(test_features)

print("MultinomialNB accuracy_score: ", accuracy_score(test["Rating"], pred_1))
print(
    classification_report(test['Rating'],
                          pred_1,
                          target_names=['1', '2', '3', '4', '5']))
cnf_matrix = confusion_matrix(test['Rating'], pred_1)
plot_confusion_matrix(cnf_matrix,
Example #12
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from numpy import float32
import numpy as np

from nbsvm import NBSVM

import string

def load_data():
    vectorizer = CountVectorizer(binary=True)
    X_train = vectorizer.fit_transform(train.lowercase_parsed_content)
    X_train = X_train.astype(float32)
    y_train = np.array(train.response)
    
    X_test = vectorizer.transform(test.lowercase_parsed_content)
    X_test = X_test.astype(float32)
    y_test = np.array(test.response)
    return X_train, y_train, X_test, y_test
    
print("Loading data...")
X_train, y_train, X_test, y_test = load_data()
mnbsvm = NBSVM()
print("Training model...")
mnbsvm.fit(X_train, y_train)
predicted_NBSVM = mnbsvm.predict(X_test)

print(roc_auc_score(test.response, predicted_NBSVM))
###########################################################
#                           NBSVM                         #
###########################################################
#token_pattern = r'[a-zA-Z]+'
#token_pattern = r'\w+'
# token_pattern = r'[a-zA-Z]+|[\d+\/\d+]+'
#token_pattern = r'\w+|[%s]' % string.punctuation
# token_pattern = r'[a-zA-Z]+|[\d+\/\d+]+|[%s]' % string.punctuation
token_pattern = r'[\d+\/\d+]+|\w+|[%s]' % string.punctuation


pclf_NBSVM = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3),
                             token_pattern=token_pattern,
                             binary=True)),
    ('clf', NBSVM(beta=0.3, C=1, alpha=1.0, fit_intercept=False))
])

pclf_NBSVM.fit(X_train, y_train)
print('Test Accuracy: %s' % pclf_NBSVM.score(X_test, y_test))


params = {"vect__ngram_range": [(1,2)],
          "vect__binary": [True],
          "clf__alpha": [1],
          "clf__C": [1],
          "clf__beta": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
          "clf__fit_intercept": [False]
          }

# Perform randomized search CV to find best hyperparameters
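The snippet ends before the search itself. A minimal sketch of that step (assumed, not from the original), using scikit-learn's `RandomizedSearchCV` over the `params` grid and `pclf_NBSVM` pipeline defined above:

from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(pclf_NBSVM,
                            param_distributions=params,
                            n_iter=10,  # the grid above has exactly 10 candidate settings
                            cv=3,
                            random_state=0)
search.fit(X_train, y_train)
print('Best params: %s' % str(search.best_params_))
print('Best CV score: %s' % search.best_score_)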
Example #14
def main():
    np.set_printoptions(threshold=np.inf)

    vectorizer = prep_Vectorizer()
    X_test, y_test = load_testSet(vectorizer)
    mnbsvm = NBSVM()

    scores = [[0, 0, 0] for i in range(len(y_test))]
    result = np.array(["none"] * len(y_test))
    print("Fitting Models now")

    X_train, y_train = load_trainSet(vectorizer, 'pos', 'avg')
    print('Fitting pos-avg classifier')
    mnbsvm.fit(X_train, y_train)
    pos_avg_res = mnbsvm.predict(X_test)
    # print pos_avg_res

    X_train, y_train = load_trainSet(vectorizer, 'avg', 'neg')
    print('Fitting avg-neg classifier')
    mnbsvm.fit(X_train, y_train)
    avg_neg_res = mnbsvm.predict(X_test)
    # print avg_neg_res

    X_train, y_train = load_trainSet(vectorizer, 'pos', 'neg')
    print('Fitting pos-neg classifier')
    mnbsvm.fit(X_train, y_train)
    pos_neg_res = mnbsvm.predict(X_test)
    # print pos_neg_res

    random_count = 0
    for i in range(len(y_test)):
        if pos_avg_res[i] == 1:
            scores[i][0] += 1
        else:
            scores[i][1] += 1

        if avg_neg_res[i] == 1:
            scores[i][1] += 1
        else:
            scores[i][2] += 1

        if pos_neg_res[i] == 1:
            scores[i][0] += 1
        else:
            scores[i][2] += 1

        if scores[i][0] == scores[i][1] == scores[i][2]:
            result[i] = random.choice(["pos", "avg", "neg"])
            random_count += 1
        else:
            max_value = max(scores[i])
            max_index = scores[i].index(max_value)
            if max_index == 0:
                result[i] = "pos"
            elif max_index == 1:
                result[i] = "avg"
            else:
                result[i] = "neg"

    print('Test Accuracy: %s' % accuracy_score(y_test, result))
    print('Confusion Matrix : ')
    print(confusion_matrix(y_test, result, labels=["pos", "avg", "neg"]))
    print('Classification Report :')
    print(classification_report(y_test, result, labels=["pos", "avg", "neg"]))
Example #15
# For topic classification (which is what we're doing here),
# keyword features usually work well, at least as a baseline.
# We'll use scikit-learn's tf-idf vectorizer.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train.complaint)
y = train.issue


# ## Train a classifier

# We'll use NBSVM, an SVM trained over naive Bayes log-count ratio features.
# Technically the naive Bayes side assumes integer counts (because that's what
# a multinomial distribution represents), but it works with tf-idf in practice too.

model = NBSVM()
model.fit(X, y)


# ## Persist model

# Create target directory if necessary.

if not os.path.exists(MODEL_DIRECTORY):
    os.mkdir(MODEL_DIRECTORY)

# And persist vectorizer and classifier objects.

joblib.dump(vectorizer, os.path.join(MODEL_DIRECTORY, 'vectorizer.pkl'))
joblib.dump(model, os.path.join(MODEL_DIRECTORY, 'model.pkl'))
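# Restoring them in another process mirrors the dump calls
# (a brief sketch; joblib.load is the standard counterpart):

loaded_vectorizer = joblib.load(os.path.join(MODEL_DIRECTORY, 'vectorizer.pkl'))
loaded_model = joblib.load(os.path.join(MODEL_DIRECTORY, 'model.pkl'))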
Example #16
def test_NBSVM_initializes_with_params():
    clf = NBSVM(alpha=0.1, beta=0.2, C=0.3)
    assert clf.alpha == 0.1
    assert clf.beta == 0.2
    assert clf.C == 0.3
Example #17
def test_NBSVM_scores_well_with_tfidf_features(tfidf_newsgroups):
    train_X, train_y, test_X, test_y = tfidf_newsgroups
    clf = NBSVM()
    clf.fit(train_X, train_y)
    p = clf.predict(test_X)
    assert accuracy_score(p, test_y) > 0.9
Example #18
def test_NBSVM_extracts_classes(count_newsgroups):
    X, y, _, _ = count_newsgroups
    clf = NBSVM()
    clf.fit(X, y)
    assert hasattr(clf, 'classes_')
    assert len(clf.classes_) == 3
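These tests rely on pytest fixtures (`count_newsgroups`, `tfidf_newsgroups`) defined elsewhere, presumably in the project's conftest.py. A hedged sketch of what `count_newsgroups` might look like, consistent with the three-class assertion above:

import pytest
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

@pytest.fixture
def count_newsgroups():
    # three categories to match the `len(clf.classes_) == 3` assertion
    cats = ['alt.atheism', 'rec.autos', 'sci.space']
    train = fetch_20newsgroups(subset='train', categories=cats)
    test = fetch_20newsgroups(subset='test', categories=cats)
    vec = CountVectorizer(binary=True)
    return vec.fit_transform(train.data), train.target, \
           vec.transform(test.data), test.target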
Example #19
def test_NBSVM_raises_on_sparse_negative():
    X = scipy.sparse.csr_matrix([[1., -1.]])
    y = scipy.sparse.csr_matrix([1])
    clf = NBSVM()
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #20
def test_NBSVM_raises_on_dense_negative_X():
    X = [[1., -1.]]
    y = [1]
    clf = NBSVM()
    with pytest.raises(ValueError):
        clf.fit(X, y)