def _read_rows(data_file):
    """Load the tab-separated file ``data_file`` and return its rows as a 2-D array."""
    # DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` returns the
    # same array on both old and new pandas releases.
    return pd.read_csv(data_file, sep='\t').values


def get_tweets(data_file):
    """Build one ``Tweet`` per row of the tab-separated file ``data_file``.

    The first four columns of every row are passed positionally to ``Tweet``.
    Returns the tweets as a list, in file order.
    """
    return [Tweet(row[0], row[1], row[2], row[3]) for row in _read_rows(data_file)]


trainTweets = get_tweets('data/train-CE.tsv')
testTweets = get_tweets('data/test-CE.tsv')

# Column 1 of the test file is the value later passed as ``topic`` to the
# Tweet helpers; evaluate once per distinct value.
topics = np.unique(_read_rows('data/test-CE.tsv')[:, 1])

vect = TfidfVectorizer()
tfidf = vect.fit_transform(get_negAndpos(trainTweets))

# The fitted matrix never changes, so densify it once instead of twice per topic.
tfidf_dense = tfidf.toarray()

eval_res = []
eval_acc = []
for topic in topics:
    messages = Tweet.get_all_messages(testTweets, topic)
    expected = Tweet.get_all_sentiment(testTweets, topic)

    # Score of every training document against the whole topic (all of its
    # messages joined into a single text): one column, one row per document.
    weights_all = vect.transform([' '.join(messages)])
    pred_all = tfidf_dense.dot(weights_all.T.toarray())

    # Score of every training document against each individual message,
    # scaled by the topic-level score (broadcast across the message columns).
    weights = vect.transform(messages)
    scores = tfidf_dense.dot(weights.T.toarray()) * pred_all

    # For each message take the best-scoring training-document row and map
    # that row index to a label through Tweet.sentiments.
    res = [Tweet.sentiments[idx] for idx in np.argmax(scores, axis=0)]

    eval_res.append(Evaluate.evaluateC(res, expected))
    # Fraction of predictions that equal the expected sentiment.
    eval_acc.append(
        np.sum([1 if x == y else 0 for x, y in zip(res, expected)]) / len(res))

print("Evaluation " + str(np.average(eval_res)))
print("Evaluation Acc" + str(np.average(eval_acc)))