def evaluate(self, test_data_file):
		ID_test, X_test, Y_test = csv_to_np(test_data_file[0])
		X_test = average_word_embeddings(X_test, self.word_to_vec_mapping)
		Y_test = integer_to_one_hot(Y_test)

		predictions = self.model.predict(X_test)

		# map each test tweet ID to its predicted sentiment label
		pred_dict = dict()
		for i in range(len(X_test)):
			pred_dict[str(ID_test[i])] = label_to_sentiment(np.argmax(predictions[i]))

		loss, accuracy = self.model.evaluate(X_test, Y_test)
		print()
		print("Loss =", loss)
		print("Test accuracy = " + str(accuracy * 100) + "%")

		evaluation.evaluate(pred_dict, test_data_file[1], type(self).__name__)
		evaluation.confusion(pred_dict, test_data_file[1], type(self).__name__)

		return predictions, Y_test
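The helper functions used above (csv_to_np, average_word_embeddings, integer_to_one_hot, label_to_sentiment) are defined elsewhere in this project. A minimal sketch of what the two label helpers might look like, assuming three sentiment classes; the class ordering here is an assumption, chosen to match the label map in the next example:

import numpy as np

def integer_to_one_hot(labels, num_classes=3):
    # turn an array of integer class IDs into one-hot rows
    labels = np.asarray(labels, dtype=int)
    one_hot = np.zeros((len(labels), num_classes))
    one_hot[np.arange(len(labels)), labels] = 1
    return one_hot

def label_to_sentiment(label):
    # map a class index back to its sentiment string (ordering assumed)
    return {0: "positive", 1: "neutral", 2: "negative"}[label]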
Example n. 2

        # build the test feature matrix from the chosen feature blocks;
        # other combinations of tF1..tF7 were tried and left commented out
        Xt = np.concatenate((tF2, tF3, tF4), axis=1)
        ans_num = model.predict(Xt)
        # map numeric class IDs back to sentiment strings
        array_to_labels = {0: "positive", 2: "negative", 1: "neutral"}
        labels = [array_to_labels[i] for i in ans_num]
        # dictionary mapping each test tweet ID to its predicted sentiment
        predictions = dict(zip(testdic.keys(), labels))
        evaluation.evaluate(predictions, testset, classifier)
        evaluation.confusion(predictions, testset, classifier)
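Every example on this page hands evaluation.evaluate and evaluation.confusion a dictionary mapping tweet IDs to predicted sentiment strings, plus a test set identifier and a classifier name. The evaluation module itself is not shown anywhere in these snippets; below is a minimal sketch of the interface it appears to expose, under the simplifying assumption that the gold labels have already been loaded into a dict (the real module presumably reads them from the test set file):

from collections import Counter

def evaluate(predictions, gold, classifier_name):
    # predictions and gold both map tweet ID -> sentiment string
    correct = sum(1 for tid, label in predictions.items() if gold.get(tid) == label)
    print(classifier_name, "accuracy:", correct / len(gold))

def confusion(predictions, gold, classifier_name):
    # tally (gold label, predicted label) pairs
    pairs = Counter((gold[tid], predictions.get(tid, "none")) for tid in gold)
    for (g, p), n in sorted(pairs.items()):
        print(f"{classifier_name}: gold={g} predicted={p} count={n}")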
Example n. 3
print('Training the model... ', end='')
#classifier = MultinomialNB()
classifier = LogisticRegression()

model = classifier.fit(features, train['rating'])
print('Done.')

print('Loading test data... ', end='')
df2 = pd.read_csv('data_test.csv')
print('Done.')

print('Processing test data... ', end='')
test = pd.DataFrame()
test['rating'] = df2['overall']
test['review'] = df2['reviewText']
test = test.dropna(subset=['review', 'rating'])  # drop rows missing a review or a rating

test['review'] = test['review'].apply(clean_review)
test_features = vectorizer.transform(test['review'].tolist())
print('Done.')

print('Testing model... ', end='')
predict = model.predict(test_features)
test['predict'] = predict
#test.to_csv('reviews_home_binary_test_predict_LogisticRegression.csv', index=False)
print('Done.')

confusion(predict, test['rating'])
plt.hist(predict, bins=range(0, 3))
plt.show()
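clean_review is defined elsewhere in this project. A plausible minimal version, on the assumption that it just lowercases the text and strips non-letter characters before vectorization:

import re

def clean_review(text):
    # lowercase, keep only letters and spaces, squeeze repeated whitespace
    text = re.sub(r'[^a-z\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', text).strip()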
Example n. 4

            wv_mean = test.tweet.apply(tweet_vectors,
                                       args=(300, 'mean')).apply(pd.Series)
            test = pd.concat([test, wv_sum, wv_mean], axis=1)
            X_test = scaler.transform(
                test.drop(['id', 'tweet', 'target', 'label'], axis=1))

        elif classifier == 'combined_model':
            for func in process_funcs:
                data.tweet = data.tweet.apply(func)
            X_test = tfidf_vect.transform(test.tweet)
            test['count_pos'], test['count_neg'], test['count_neu'] = zip(
                *test.tweet.apply(count_opinon_lexicons2, args=(wp, wn, stopWords)))
            wv_sum_test = test.tweet.apply(tweet_vectors,
                                           args=(300, 'sum')).apply(pd.Series)
            wv_mean_test = test.tweet.apply(tweet_vectors,
                                            args=(300,
                                                  'mean')).apply(pd.Series)
            counts_test = scaler.transform(
                test[['count_pos', 'count_neg', 'count_neu']])
            X_test = hstack(
                [X_test, wv_sum_test.values, wv_mean_test.values, counts_test])

        # creating predictions dictionary
        test_pred = sentiment_clf.predict(X=X_test)
        target_names = list(target_dict.keys())
        predictions = dict(
            zip(test['id'].values, map(lambda x: target_names[x], test_pred)))

        evaluation.evaluate(predictions, test_path, classifier)
        evaluation.confusion(predictions, test_path, classifier)
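tweet_vectors is not shown here; judging by the args=(300, 'sum') and args=(300, 'mean') calls, it aggregates 300-dimensional word vectors over a tweet. A hypothetical sketch, assuming a preloaded gensim KeyedVectors model named wv:

import numpy as np

def tweet_vectors(tweet, dim, mode):
    # collect vectors for the in-vocabulary tokens, then aggregate them
    vecs = [wv[w] for w in tweet.split() if w in wv]
    if not vecs:
        return np.zeros(dim)
    return np.sum(vecs, axis=0) if mode == 'sum' else np.mean(vecs, axis=0)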
Esempio n. 5
0
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt

df = pd.read_csv('creditcard.csv')
X = df.iloc[:, 1:29]  # the 28 anonymised PCA features V1..V28
y = df['Class']

# split data into training and testing sets
spl = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
for idx_train, idx_test in spl.split(X, y):
    X_train = X.iloc[idx_train]
    y_train = y.iloc[idx_train]
    X_test = X.iloc[idx_test]
    y_test = y.iloc[idx_test]

# train the model using train data
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# test the model on test data
y_predict = classifier.predict(X_test)
y_score = classifier.predict_proba(X_test)

import evaluation as ev
ev.confusion(y_predict, y_test)

xy = ev.KS_chart(y_score[:, 0], y_test)
plt.plot(xy[:, 0], xy[:, 1])
plt.plot(xy[:, 0], xy[:, 2])
plt.show()
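KS_chart belongs to the same local evaluation module; the plot above draws its first column against the other two. A sketch of a Kolmogorov-Smirnov chart computed that way, assuming column 0 holds score thresholds and columns 1 and 2 hold the cumulative rates of the two classes:

import numpy as np

def KS_chart(scores, y_true, n_points=100):
    # cumulative share of each class at or below each score threshold
    scores, y_true = np.asarray(scores), np.asarray(y_true)
    thresholds = np.linspace(scores.min(), scores.max(), n_points)
    pos, neg = scores[y_true == 1], scores[y_true == 0]
    return np.array([(t, np.mean(pos <= t), np.mean(neg <= t))
                     for t in thresholds])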
Example n. 6

        )  # get the user IDs in the test file to build the predictions dictionary
        sens_test = pre_text(test)  # preprocess the test data
        if classifier == 'Lexicon_SVM':
            test_feature = get_lexicon_feature(sens_test)  # lexicon features for the test set

        elif classifier == 'Ngram_tfidf_MultinomialNB':
            ngram_test = get_ngram_sentence(sens_test)
            test_feature = get_ngram_tfidf(ngram_test)  # n-gram tf-idf features for the test set
#        elif classifier == 'word2vec':
#            test_feature = getAvgFeatureVecs(sens_test, model, num_features)
        else:
            test_feature = get_lexicon_feature(sens_test)  # fall back to lexicon features

        test_pred = clf.predict(test_feature)  # predicted sentiments from the trained model
        print(np.mean(test_pred == test["sentiment"]))  # accuracy of the predictions
        predictions = dict(zip(test_id, test_pred))  # dictionary of predictions
        evaluation.evaluate(predictions, testset, classifier)  # calculate F1
        evaluation.confusion(predictions, testset, classifier)  # confusion matrix
#        print(test_pred)
end = time.time()  # record the end time
print("running time is", end - start, "seconds")  # print the running time
Example n. 7
        # fit the vectorizer on the training tweets to build the training features
        train_vect = vect.fit_transform([t[1] for t in train_set])


    for testset in testsets.testsets:
        # vectorize the tweets in the test set
        test = prep(testset)
        test_features = vect.transform([t[1] for t in test])

        # Depending on the chosen vectorizer, a corresponding classifier is used to obtain the best results.
        train_labels = [t[0] for t in train_set]
        if classifier == "HashingVectorizer":
            classif = Perceptron(max_iter=300)
        elif classifier == "TfidfVectorizer":
            classif = BernoulliNB()
        else:
            classif = MultinomialNB()
        classif.fit(train_vect, train_labels)

        # Predict the sentiment of each test tweet and pair it with its tweet ID
        predictions = classif.predict(test_features)
        id_list = ids(testset)                  # the ids function returns the list of tweet IDs
        diction = dict(zip(id_list, predictions))  # tweet ID -> predicted sentiment


        evaluation.evaluate(diction, testset, classifier)
        evaluation.confusion(diction, testset, classifier)
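prep and ids are helpers from the same script: prep evidently returns (sentiment, tweet text) pairs and ids returns the matching tweet IDs in the same order. A hypothetical sketch, assuming each test set is a tab-separated file of ID, sentiment, and tweet text:

def prep(path):
    # (sentiment, tweet text) pairs, in file order
    with open(path, encoding='utf-8') as f:
        return [(line.split('\t')[1], line.split('\t')[2].strip()) for line in f]

def ids(path):
    # tweet IDs, in the same order as prep()
    with open(path, encoding='utf-8') as f:
        return [line.split('\t')[0] for line in f]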