def get_correctly_classified_tweets(self, tweets_and_sentiment):
    """
    Classify the given tweets and return only the correctly classified ones.

    Parameters:
        tweets_and_sentiment: iterable of (tweet, sentimentvalue) pairs.
            Each tweet must expose a ``.text`` attribute.

    Returns:
        (correct_tweets, correct_sentimentvalues): the tweets whose
        predicted subjectivity label matched the target produced by
        utils.make_subjectivity_targets, with their sentiment values
        kept index-aligned.
    """
    tweets, sentimentvalues = zip(*tweets_and_sentiment)
    # Fixed: identity comparison with None ('is not None'), not '!=.'
    # zip() always yields a tuple here, so the guard is effectively
    # always taken, but it is kept for safety.
    if sentimentvalues is not None:
        self.test_words_and_values = sentimentvalues
    count_vector = self.vect.transform([t.text for t in tweets])
    tfidf_count = self.tfidf_transformer.transform(count_vector)
    if self.only_text_features:
        combined_vector = tfidf_count
    else:
        # Augment the text features with hand-crafted feature dicts,
        # then stack both sparse matrices side by side.
        dict_vector = self.dict_vectorizer.transform(
            [features.get_feature_set(t, self.featureset, v)
             for t, v in zip(tweets, self.test_words_and_values)])
        tfidf_dict = self.dict_transformer.transform(dict_vector)
        combined_vector = sp.hstack([tfidf_count, tfidf_dict])
    predictions = self.best_estimator.predict(combined_vector)
    tweets, targets = utils.make_subjectivity_targets(tweets)
    # Keep only the tweets whose prediction matches the target.
    # Fixed: iterate the parallel sequences with zip instead of the
    # Python-2-only xrange(len(...)) index loop.
    correct_tweets = []
    correct_sentimentvalues = []
    for tweet, sentiment, prediction, target in zip(
            tweets, sentimentvalues, predictions, targets):
        if prediction == target:
            correct_tweets.append(tweet)
            correct_sentimentvalues.append(sentiment)
    return correct_tweets, correct_sentimentvalues
def get_optimal_subjectivity_classifier():
    """
    Build, train, and return the best-performing subjectivity classifier.

    Loads pickled tweet set 3, derives subjectivity targets for it, and
    fits an SVM with the 'SA' feature set plus precomputed sentiment values.
    """
    raw_tweets = utils.get_pickles(3)
    tweets, targets = utils.make_subjectivity_targets(raw_tweets)
    # Vectorizer / tf-idf settings used for the optimal model.
    vect_options = {'ngram_range': (1, 1), 'max_df': 0.5}
    tfidf_options = {'sublinear_tf': False, 'use_idf': True, 'smooth_idf': True}
    classifier = SVM(tweets, targets, vect_options, tfidf_options)
    classifier.set_feature_set('SA', utils.get_sentimentvalues(3))
    classifier.train_on_feature_set()
    return classifier