class MachineLearningNLP:

    def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
        # "Thumbs up? Sentiment Classification using Machine Learning Techniques"
        classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
        if classifier_type in classifier_list:
            self.classifier_type = classifier_type
        else:
            # fail loudly instead of leaving self.classifier_type unset
            raise ValueError("Classifier type is not implemented: " + classifier_type)

        if self.classifier_type == 'MaximumEntropy':
            self.classifier = MaxentClassifier
        elif self.classifier_type == 'SVM':
            self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
        elif self.classifier_type == 'NaiveBayes':
            self.classifier = NaiveBayesClassifier
        self.feats = feats

    def convert_txt(self, file_neg, file_pos):
        negfeats = list(map(self.feats, word_preprocess(file_neg)))
        posfeats = list(map(self.feats, word_preprocess(file_pos)))
        negfeats = list(zip(negfeats, ['neg'] * len(negfeats)))
        posfeats = list(zip(posfeats, ['pos'] * len(posfeats)))
        return (negfeats, posfeats)

    def train(self, train_data, **kwargs):
        self.classifier = self.classifier.train(train_data, **kwargs)

    def predict(self, test_data):
        return [self.classifier.classify(feats) for feats, label in test_data]

    def annotate(self, text):
        assert isinstance(text, str)
        text_encoded = self.feats(text.split())
        return self.classifier.classify(text_encoded)

    def performance(self, test_data):
        prediction = self.predict(test_data)
        pos_loc = set(i for i in range(len(prediction)) if prediction[i] == 'pos')
        neg_loc = set(range(len(prediction))) - pos_loc
        pos_ref = set(i for i in range(len(prediction)) if test_data[i][1] == 'pos')
        neg_ref = set(range(len(prediction))) - pos_ref
        print('===============================\n')
        print('Model Summary:\n')
        print(self.classifier_type + ' with features ' + self.feats.__name__ + '\n')
        print('Overall Accuracy: %.3f\n' % nltk.classify.util.accuracy(self.classifier, test_data))
        print('Positive Precision: %.3f\n' % nltk.precision(pos_ref, pos_loc))
        print('Positive Recall: %.3f\n' % nltk.recall(pos_ref, pos_loc))
        print('Negative Precision: %.3f\n' % nltk.precision(neg_ref, neg_loc))
        print('Negative Recall: %.3f\n' % nltk.recall(neg_ref, neg_loc))
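# Usage sketch (assumptions): `word_feats` and `word_preprocess` are helpers the
# class expects but does not define here; minimal stand-ins are shown so the
# example is self-contained. File names and the 3/4 split are illustrative only.
import nltk
from nltk.classify import NaiveBayesClassifier, MaxentClassifier, SklearnClassifier
from sklearn.svm import LinearSVC

def word_feats(words):
    # bag-of-words presence features: {token: True}
    return dict((word, True) for word in words)

def word_preprocess(path):
    # hypothetical layout: one whitespace-tokenized document per line
    with open(path) as f:
        return [line.split() for line in f]

model = MachineLearningNLP('SVM', feats=word_feats)
negfeats, posfeats = model.convert_txt('neg_reviews.txt', 'pos_reviews.txt')
cut = int(len(negfeats) * 3 / 4)
model.train(negfeats[:cut] + posfeats[:cut])
model.performance(negfeats[cut:] + posfeats[cut:])
print(model.annotate("a surprisingly good movie"))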
def funcn():
    f = open("amazon_data.txt")
    pos_tweets = list()
    neg_tweets = list()
    for line in f:
        words = line.split("\t")
        if words[1] == '0\n' or words[1] == '0':
            neg_tweets.append(words)
        else:
            pos_tweets.append(words)
    f.close()

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = nltk.classify.apply_features(extract_features, tweets)
    nb_classifier = nltk.NaiveBayesClassifier.train(training_set)
    bernoulli_classifier = SklearnClassifier(BernoulliNB()).train(training_set)

    tweet = 'it is not bad'
    print(nb_classifier.classify(extract_features(tweet.split())))
    print(bernoulli_classifier.classify(extract_features(tweet.split())))

    svc_classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)
    print(svc_classifier.classify(extract_features(tweet.split())))
def searchSGDClassifier_classifier(title, train_departments):
    """Classify `title` with an SGD (log-loss) classifier and report timings.

    :param title: text to classify
    :param train_departments: labelled (features, department) training pairs
    :return: [department, probability, accuracy, classify time, training time]
    """
    timeTraining = time.time()
    # note: newer scikit-learn versions spell this loss 'log_loss'
    classifier = SklearnClassifier(SGDClassifier(loss='log'))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))
    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]
def searchLinearSVC(title, train_departments):
    """Linear SVC.

    :param title: text to classify
    :param train_departments: labelled (features, department) training pairs
    :return: [department, probability, accuracy, classify time, training time]
    """
    timeTraining = time.time()
    # LinearSVC has no `probability` option, so SVC with a linear kernel is
    # used instead when class probabilities are needed:
    # classifier = SklearnClassifier(LinearSVC(probability=True))
    classifier = SklearnClassifier(SVC(kernel='linear', probability=True))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))
    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]
def searchNuSVC_classifier(title, train_departments):
    """Nu-Support Vector Classification.

    :param title: text to classify
    :param train_departments: labelled (features, department) training pairs
    :return: the predicted department
    """
    classifier = SklearnClassifier(NuSVC())
    classifier.train(train_departments)
    test_sent_features = word_feats(title)
    return classifier.classify(test_sent_features)
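# Shared dependencies for the three search* helpers above: `word_feats`,
# `accuracy`, and the classifier imports are expected from the surrounding
# module. A minimal sketch of those assumed dependencies:
import time
from nltk.classify import SklearnClassifier
from nltk.classify.util import accuracy
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, NuSVC

def word_feats(text):
    # bag-of-words presence features over whitespace tokens (assumed encoding)
    return {word: True for word in text.split()}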
def predict_nltk(in_text='', n=2):
    '''Text language classification.

    Uses scikit-learn classifiers from within NLTK to classify new text
    based on the training set.
    '''
    trainingset = []
    for label in text:
        features = text_features(text[label])
        trainingset.append((features, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    print('Language:', lang)
class LinearSVC2Model(SKLearnModel):
    """This model classifies tweets into any one of twenty classes using SVM
    classification.
    """

    def __init__(self, balanced=False, C=1.0, dual=True, tol=1e-4,
                 max_iter=1000, loss="squared_hinge") -> None:
        # Set up the tweet tokenizer; note this is the same as in our baseline.
        # For a full description check out the model_naive_bayes_baselines
        # source file.
        self.tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                                        strip_handles=True).tokenize

        # Set class_weight to None unless 'balanced' has been set to true in the config.
        class_weight = None  # type: Optional[str]
        if balanced:
            class_weight = "balanced"

        # Here we create the pipeline for the classifier.
        # The TfidfTransformer is the same as in our baseline; for a full
        # description check out the model_naive_bayes_baselines source file.
        # LinearSVC sets up a linear Support Vector Machine classifier. This is
        # different from using SVC with a linear kernel because it uses
        # liblinear as a backend instead of libsvm, which makes it run a lot
        # faster.
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('linearsvc', LinearSVC(class_weight=class_weight,
                                                     C=C, dual=dual, tol=tol,
                                                     max_iter=max_iter,
                                                     loss=loss))])
        self.classif = SklearnClassifier(pipeline)

    @staticmethod
    def get_extra_configs():
        # Add configs for 'balanced' and the SVM hyperparameters.
        configs = [{"name": "balanced", "default": False},
                   {"name": "C", "default": 1.0},
                   {"name": "dual", "default": True},
                   {"name": "tol", "default": 1e-4},
                   {"name": "max_iter", "default": 1000},
                   {"name": "loss", "default": "squared_hinge"}]
        return super(LinearSVC2Model, LinearSVC2Model).get_extra_configs() + configs

    def train(self, tweets: List[Tweet]) -> None:
        def tweet_to_tuple(x):
            return (FreqDist(self.tokenizer(x.text)), x.emoji)

        # Generate tuples for all the tweets to form the corpus.
        corpus = map(tweet_to_tuple, tweets)

        # Train this model!
        self.classif.train(corpus)

    def predict(self, text):
        return self.classif.classify(FreqDist(self.tokenizer(text)))
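# Usage sketch (assumptions): SKLearnModel, Tweet, and the config plumbing come
# from the surrounding project; a bare-bones Tweet stand-in illustrates the
# interface the model expects (a .text string and an .emoji label).
from collections import namedtuple

Tweet = namedtuple("Tweet", ["text", "emoji"])

model = LinearSVC2Model(balanced=True, C=0.5)
model.train([Tweet("so happy today!", "joy"),
             Tweet("this is terrible", "anger")])
print(model.predict("what a happy surprise"))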
def leaveKOutValidation(k=1):
    accuracy = 0.0
    print("Performing leave-" + str(k) + "-out cross-validation")
    gamesClusters = [feats[int(i * k):int((i + 1) * k)]
                     for i in range(int(len(feats) / k))]
    for games in gamesClusters:
        training = [x for x in feats if x not in games]
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             # ('chi2', SelectKBest(chi2, k=250)),
                             ('nb', MultinomialNB())])
        classifier = SklearnClassifier(pipeline).train(training)
        for game in games:
            classification = classifier.classify(game[0])
            accuracy += int((game[1] > 0) == (classification > 0)) / float(len(feats))
    print("With leave-" + str(k) + "-out cross-validation, the algorithm is "
          + str(round(accuracy * 100, 4)) + "% accurate")
def evaluate_classifier(featx, collocationFunc):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'neg')
                for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'pos')
                for f in posids]

    lenNegFeats = len(negfeats)
    lenPosFeats = len(posfeats)
    negcutoff = int(lenNegFeats * 3 / 4)
    poscutoff = int(lenPosFeats * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:lenNegFeats] + posfeats[poscutoff:lenPosFeats]

    classifier = SklearnClassifier(BernoulliNB()).train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    evaluationMetrics = {}
    print(classifier)
    evaluationMetrics['accuracy'] = nltk.classify.util.accuracy(classifier, testfeats)
    evaluationMetrics['posPrec'] = nltk.precision(refsets['pos'], testsets['pos'])
    evaluationMetrics['posRecall'] = nltk.recall(refsets['pos'], testsets['pos'])
    evaluationMetrics['posF_Score'] = nltk.f_measure(refsets['pos'], testsets['pos'])
    evaluationMetrics['negPrec'] = nltk.precision(refsets['neg'], testsets['neg'])
    evaluationMetrics['negRecall'] = nltk.recall(refsets['neg'], testsets['neg'])
    evaluationMetrics['negF_Score'] = nltk.f_measure(refsets['neg'], testsets['neg'])
    return evaluationMetrics
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # integer cutoffs so the slices below are valid in Python 3
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
def ml_sentiment(self, text):
    '''Machine learning for sentiment detection.'''
    trainingset = []
    for tweet in self.data:
        trainingset.append(self.sentiment_featrues(tweet))
    # classifier = nltk.NaiveBayesClassifier.train(trainingset)
    # classifier = nltk.DecisionTreeClassifier.train(trainingset)
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)

    tokenz = self.ml_tag(text, print_tags=False)
    tweet = {
        'tokens': tokenz,
        'sentiment': ''
    }
    tokenz_features = self.sentiment_featrues(tweet)
    sentiment = classifier.classify(tokenz_features[0])
    tweet['sentiment'] = sentiment
    print('\nTweet:', text)
    self.show_tweet(tweet)
    return sentiment
class SVCModel(SKLearnModel):
    """This model classifies tweets into any one of twenty classes using SVM
    classification.
    """

    # The config normally supplies the kernel; "linear" is assumed as a safe
    # default (an empty string is not a valid SVC kernel).
    def __init__(self, kernel: str = "linear") -> None:
        # Set up the tweet tokenizer; note this is the same as in our baseline.
        # For a full description check out the model_naive_bayes_baselines
        # source file.
        self.tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                                        strip_handles=True).tokenize

        # Here we create the pipeline for the classifier.
        # The TfidfTransformer is the same as in our baseline; for a full
        # description check out the model_naive_bayes_baselines source file.
        # The SVC sets up a Support Vector Machine classifier with the
        # configured kernel, in this case either a linear or a radial basis
        # function kernel. The details for the above items are discussed in
        # the model's readme.
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('{}svc'.format(kernel), SVC(kernel=kernel))])
        self.classif = SklearnClassifier(pipeline)

    def train(self, tweets: List[Tweet]) -> None:
        def tweet_to_tuple(x):
            return (FreqDist(self.tokenizer(x.text)), x.emoji)

        # Generate tuples for all the tweets to form the corpus.
        corpus = map(tweet_to_tuple, tweets)

        # Train this model!
        self.classif.train(corpus)

    def predict(self, text):
        return self.classif.classify(FreqDist(self.tokenizer(text)))

    def tokenize(self, text):
        return self.tokenizer(text)
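# Usage sketch (assumptions): the kernel is normally supplied by the project
# config, and `tweets` is a List[Tweet] loaded elsewhere; both are stand-ins.
linear_model = SVCModel(kernel="linear")
rbf_model = SVCModel(kernel="rbf")
linear_model.train(tweets)
rbf_model.train(tweets)
print(linear_model.predict("cannot wait for the weekend"),
      rbf_model.predict("cannot wait for the weekend"))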
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

########################################################################################
########################################################################################

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

# avoid shadowing the built-in `str`
result_title = 'SINGLE FOLD RESULT ' + '(' + 'linear-svc' + ')'

# training with LinearSVC
classifier = SklearnClassifier(LinearSVC())
classifier.train(trainfeats)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

accuracy = nltk.classify.util.accuracy(classifier, testfeats) * 100
pos_precision = precision(refsets['pos'], testsets['pos'])
pos_recall = recall(refsets['pos'], testsets['pos'])
pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
neg_precision = precision(refsets['neg'], testsets['neg'])
neg_recall = recall(refsets['neg'], testsets['neg'])
neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])

print('')
print('---------------------------------------')
print(result_title)
print('---------------------------------------')
print('accuracy: ', accuracy, '%')
print('precision', (pos_precision + neg_precision) / 2)
nonlinear_svm = SklearnClassifier(SVC(gamma='scale',
                                      kernel='poly',
                                      coef0=5.0,
                                      degree=5,
                                      C=5.0,
                                      shrinking=True,
                                      probability=False,
                                      tol=1e-3),
                                  sparse=False).train(train_set)
print("Accuracy - Nonlinear SVM: ")
print(nltk.classify.accuracy(nonlinear_svm, test_set))

random_forest = SklearnClassifier(RandomForestClassifier(n_estimators=100,
                                                         criterion='gini',
                                                         max_depth=5,
                                                         min_samples_split=2,
                                                         min_samples_leaf=1,
                                                         min_weight_fraction_leaf=0.0,
                                                         max_features=25,
                                                         max_leaf_nodes=20,
                                                         min_impurity_decrease=0.0,
                                                         bootstrap=True,
                                                         oob_score=False,
                                                         random_state=None),
                                  sparse=False)
random_forest.train(train_set)
print("Accuracy - Random Forest Classifier: ")
print(nltk.classify.accuracy(random_forest, test_set))

test_tweet = ("75% of illegal Aliens commit Felons such as ID, SSN and Welfare Theft "
              "Illegal #Immigration is not a Victimless Crime !")
# print(naive_bayes.classify(extract_features_of_tweet(test_tweet, raw=True)))
# print(maxent.classify(extract_features_of_tweet(test_tweet, raw=True)))
print(linear_svm_classifier.classify(extract_features_of_tweet(test_tweet, raw=False)))
print(nonlinear_svm.classify(extract_features_of_tweet(test_tweet, raw=True)))
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    print('Reading Tweets\n')
    tweets_data_path = '20161019_202620.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except ValueError:
            continue

    tweets = pd.DataFrame()
    tweets['text'] = [tweet.get('text', '') for tweet in tweets_data]
    tdata = tweets['text']

    # the scraped tweets are unlabelled; load them as the test set with a
    # placeholder 'neg' label
    negfeats = [(featx(f), 'neg') for f in word_split(tdata)]
    testfeats = negfeats
    print(np.shape(testfeats))

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifierName = 'SVM'
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        # feats: feature dict, label: 'neg'/'pos', observed: predicted label
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
    pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
    neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
    neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

    print('')
    print('---------------------------------------')
    print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
    print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # Shuffle the train set: in cross-validation a test chunk might otherwise
    # contain only negative or only positive data.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    subset_size = int(len(trainfeats) / n)
    accuracy = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    pos_fmeasure = []
    neg_fmeasure = []
    cv_count = 1
    for i in range(n):
        testing_this_round = trainfeats[i * subset_size:][:subset_size]
        training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

        classifierName = 'SVM'
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(training_this_round)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
        cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        accuracy.append(cv_accuracy)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        neg_fmeasure.append(cv_neg_fmeasure)
        cv_count += 1

    print('---------------------------------------')
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', sum(accuracy) / n)
    print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
    print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
    print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
    print('')
class RandomForestCascadeClassifier():

    def __init__(self, dataset, k, user_followers=True, users_reachable=True,
                 average_time=True, time_to_k=True):
        self.k = k
        self._twtokenize = TweetTokenizer(strip_handles=True)
        self._dataset = dataset
        self._user_followers = user_followers
        self._users_reachable = users_reachable
        self._average_time = average_time
        self._time_to_k = time_to_k
        self._stopwords = stopwords.words('english')
        self._stemmer = PorterStemmer()
        self._f_count = []
        self._r_count = []
        self._rt_count = []
        self._avg = []
        self._time = []
        self._train()

    def _tokenize(self, tweet_text):
        return [self._stemmer.stem(token)
                for token in self._twtokenize.tokenize(tweet_text)
                if token not in self._stopwords]

    def _sorted_cascade_nodes(self, cascade):
        nodes = cascade['cascade']
        cascade_nodes = [(int(key), nodes[key]) for key in nodes.keys()]
        return sorted(cascade_nodes, key=lambda x: x[0])

    def _tweet_length_feature(self, cascade):
        return len(cascade['root_tweet']['text'])

    def _user_followers_feature(self, cascade):
        followers = cascade['root_tweet']['user']['followers_count']
        self._f_count.append(followers)
        return followers

    def _users_reachable_feature(self, nodes):
        # sum followee counts over the first k+1 nodes of the cascade
        reachable = 0
        for kth, node in zip(range(self.k + 1), nodes):
            reachable += node[1]['user_followees_count']
        self._r_count.append(reachable)
        return reachable

    def _average_time_feature(self, nodes):
        timestamps = [int(node[1]['created_at'])
                      for kth, node in zip(range(self.k + 1), nodes)]
        average = (sum(numpy.diff(timestamps)) / float(len(timestamps))) / 1000
        self._avg.append(average)
        return average

    def _users_retweet_feature(self, cascade):
        retweets = cascade['root_tweet']['retweet_count']
        self._rt_count.append(retweets)
        return retweets

    def _time_to_k_feature(self, nodes):
        first = int(nodes[0][1]['created_at'])
        kth = int(list(zip(range(self.k + 1), nodes))[-1][1][1]['created_at'])
        diff = (kth - first) / 1000
        self._time.append(diff)
        return diff

    def _extract_features(self, cascade):
        if cascade['root_tweet']['lang'] == 'en':
            tweet_tokens = self._tokenize(cascade['root_tweet']['text'])
            features = {"contains({0})".format(token): True
                        for token in tweet_tokens}
        else:
            features = {}
        features['tweet_length'] = self._tweet_length_feature(cascade)
        # features['rtweet'] = self._users_retweet_feature(cascade)
        if self._user_followers:
            features["user_followers"] = self._user_followers_feature(cascade)
        cascade_nodes = self._sorted_cascade_nodes(cascade)
        if self._users_reachable:
            features['reachable'] = self._users_reachable_feature(cascade_nodes)
        if self._average_time:
            features['average'] = self._average_time_feature(cascade_nodes)
        if self._time_to_k:
            features['timetok'] = self._time_to_k_feature(cascade_nodes)
        return features

    def _train(self):
        # load a cached classifier when one exists, otherwise train and cache it
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
        else:
            train_set = [(self._extract_features(cascade), cascade['label'])
                         for cascade in self._dataset]
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('rf', RandomForestClassifier(n_estimators=1000))])
            self._classifier = SklearnClassifier(pipeline, sparse=False).train(train_set)
            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)

    def classify(self, cascade):
        features = self._extract_features(cascade)
        return self._classifier.classify(features)

    def classify_prob(self, cascade):
        features = self._extract_features(cascade)
        result = self._classifier.prob_classify(features)
        return {"positive": result.prob(True), "negative": result.prob(False)}

    def _metrics(self, results):
        print(metrics.classification_report(results['actual'], results['prediction']))

    def classify_cascades(self, test_dataset):
        results = {"prediction": [], "actual": []}
        for cascade in test_dataset:
            result = self.classify(cascade)
            actual = cascade['label']
            results["prediction"].append(result)
            results["actual"].append(actual)
        self._metrics(results)
        # summary stats for followers, reachable users, average time, time-to-k
        for values in (self._f_count, self._r_count, self._avg, self._time):
            print("Average: {0}, Median: {1}, Std: {2}".format(
                numpy.average(values), numpy.median(values), numpy.std(values)))

    def classify_cascades_prob_export(self, test_dataset):
        export = "dataset/" + self.__class__.__name__ + "_results.json"
        results = {}
        for cascade in test_dataset:
            results[cascade['url']] = self.classify_prob(cascade)
        with open(export, 'w') as export_file:
            export_file.write(json.dumps(results))
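# Schema sketch (assumption, inferred from _extract_features): each element of
# `dataset` is a dict shaped like the one below, with a 'label' for training
# and a 'cascade' mapping retweet order to node metadata. All values are
# hypothetical.
example_cascade = {
    "url": "https://twitter.com/example/status/1",
    "label": True,
    "root_tweet": {
        "text": "breaking: something happened",
        "lang": "en",
        "retweet_count": 12,
        "user": {"followers_count": 3400},
    },
    "cascade": {
        "0": {"created_at": "1500000000000", "user_followees_count": 150},
        "1": {"created_at": "1500000030000", "user_followees_count": 80},
    },
}
# RandomForestCascadeClassifier(dataset, k=1) trains on a list of such dicts
# (or loads a previously pickled classifier) and then classifies new ones.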
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)

# nltk's recall(reference, test) expects the gold-standard sets first
print("UnigramsLogit Recall")
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))
print("")

# In[14]:

# Support Vector Machine for Unigrams
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)

print("UnigramSVM Recall")
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))

# In[15]:

# Same thing with Bigrams
from nltk import bigrams, trigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# In[16]:

combined = zip(Tweet, Labels)
def evaluate_classifier(data):
    trainfeats, testfeats = train_test_split(data, test_size=0.3, random_state=0)

    # using 2 classifiers
    classifier_list = ['nb', 'svm']
    classifier_dict = {'nb': 'Naive Bayes', 'svm': 'SVM'}

    for cl in classifier_list:
        classifierPkl = os.path.join('pkl', cl + ".pkl")
        if not os.path.exists('./%s' % classifierPkl):
            if cl == 'svm':
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(trainfeats)
            else:
                classifier = NaiveBayesClassifier.train(trainfeats)
            pickle.dump(classifier, open(classifierPkl, 'wb'))
        else:
            classifier = pickle.load(open(classifierPkl, 'rb'))

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = precision(refsets['positive'], testsets['positive'])
        pos_recall = recall(refsets['positive'], testsets['positive'])
        pos_fmeasure = f_measure(refsets['positive'], testsets['positive'])
        neg_precision = precision(refsets['negative'], testsets['negative'])
        neg_recall = recall(refsets['negative'], testsets['negative'])
        neg_fmeasure = f_measure(refsets['negative'], testsets['negative'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifier_dict[cl] + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
        print('')

    n = 5  # 5-fold cross-validation
    for cl in classifier_list:
        subset_size = int(len(trainfeats) / n)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

            # cache one model per classifier and fold
            classifierPkl = os.path.join('pkl', "%s_cv_%d.pkl" % (cl, i))
            if not os.path.exists('./%s' % classifierPkl):
                if cl == 'svm':
                    classifier = SklearnClassifier(LinearSVC(), sparse=False)
                    classifier.train(training_this_round)
                else:
                    classifier = NaiveBayesClassifier.train(training_this_round)
                pickle.dump(classifier, open(classifierPkl, 'wb'))
            else:
                classifier = pickle.load(open(classifierPkl, 'rb'))

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for j, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(j)
                observed = classifier.classify(feats)
                testsets[observed].add(j)

            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = precision(refsets['positive'], testsets['positive'])
            cv_pos_recall = recall(refsets['positive'], testsets['positive'])
            cv_pos_fmeasure = f_measure(refsets['positive'], testsets['positive'])
            cv_neg_precision = precision(refsets['negative'], testsets['negative'])
            cv_neg_recall = recall(refsets['negative'], testsets['negative'])
            cv_neg_fmeasure = f_measure(refsets['negative'], testsets['negative'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifier_dict[cl] + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        print('')
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'negative') for f in splitter(negative)]
    posfeats = [(featx(f), 'positive') for f in splitter(positive)]
    neautralfeats = [(featx(f), 'neautral') for f in splitter(neautral)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    neautcutoff = int(len(neautralfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neautralfeats[:neautcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neautralfeats[neautcutoff:]

    # Max Entropy and SVM classifiers
    classifier_list = ['maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        # precision/recall/f-measure return None when a label is never
        # predicted; fall back to 0.0 in that case
        pos_precision = precision(refsets['positive'], testsets['positive']) or 0.0
        pos_recall = recall(refsets['positive'], testsets['positive']) or 0.0
        pos_fmeasure = f_measure(refsets['positive'], testsets['positive']) or 0.0
        neut_precision = precision(refsets['neautral'], testsets['neautral']) or 0.0
        neut_recall = recall(refsets['neautral'], testsets['neautral']) or 0.0
        neut_fmeasure = f_measure(refsets['neautral'], testsets['neautral']) or 0.0
        neg_precision = precision(refsets['negative'], testsets['negative']) or 0.0
        neg_recall = recall(refsets['negative'], testsets['negative']) or 0.0
        neg_fmeasure = f_measure(refsets['negative'], testsets['negative']) or 0.0

        print('\n')
        print(classifierName)
        print('accuracy:', accuracy)
        acrcy.append(accuracy)
        print('precision', (pos_precision + neg_precision + neut_precision) / 3)
        prcsn.append((pos_precision + neg_precision + neut_precision) / 3)
        print('recall', (pos_recall + neg_recall + neut_recall) / 3)
        rcall.append((pos_recall + neg_recall + neut_recall) / 3)
        print('f-measure', (pos_fmeasure + neg_fmeasure + neut_fmeasure) / 3)
        fmsr.append((pos_fmeasure + neg_fmeasure + neut_fmeasure) / 3)
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = precision(refsets['pos'], testsets['pos'])
        pos_recall = recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
        neg_precision = precision(refsets['neg'], testsets['neg'])
        neg_recall = recall(refsets['neg'], testsets['neg'])
        neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
        print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # Shuffle the train set: in cross-validation a test chunk might otherwise
    # contain only negative or only positive data.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    for cl in classifier_list:
        subset_size = int(len(trainfeats) / n)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(training_this_round, 'GIS',
                                                    trace=0, encoding=None,
                                                    labels=None,
                                                    gaussian_prior_sigma=0,
                                                    max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for j, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(j)
                observed = classifier.classify(feats)
                testsets[observed].add(j)

            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        print('')
tweets = []
stop_words = set(stopwords.words('english'))
for (words, sentiment) in train:
    words_filtered = [e.lower() for e in words.split() if e not in stop_words]
    tweets.append((words_filtered, sentiment))

# word_features = get_word_features(get_words_in_tweets(tweets))
# training_set = nltk.classify.apply_features(extract_features, tweets)
training_set = traindict(tweets)
print(training_set)

# classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)

tweetd = 'I have cows :('
print(classifier.classify(dict(Counter(clean(tweetd.lower())))))
# tweetd = 'Obama is boring :('
# print(classifier.classify(extract_features(tweetd.lower().split())))
    if r['tag'] == 2:
        train.append((tx, "obj"))
    # elif r['tag'] == 0:
    #     train.append((tx, "neg"))
    else:
        train.append((tx, "subj"))

tweets = []
stop_words = set(stopwords.words('english'))
for (words, sentiment) in train:
    words_filtered = [e.lower() for e in words.split() if e not in stop_words]
    tweets.append((words_filtered, sentiment))

# word_features = get_word_features(get_words_in_tweets(tweets))
# training_set = nltk.classify.apply_features(extract_features, tweets)
training_set = traindict(tweets)
print(training_set)

# classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)

tweetd = 'I have cows :('
print(classifier.classify(dict(Counter(clean(tweetd.lower())))))
# tweetd = 'Obama is boring :('
# print(classifier.classify(extract_features(tweetd.lower().split())))
def create_classifier(featx):
    pos_data = pickle.load(open(os.path.join(config.test_path, 'pos_review.pkl'), 'rb'))
    neg_data = pickle.load(open(os.path.join(config.test_path, 'neg_review.pkl'), 'rb'))

    pos_words = pos_data[:]
    neg_words = neg_data[:]
    print(len(pos_words), '------', len(neg_words))

    pos_features = [(featx(w_lst), 'pos') for w_lst in pos_words]
    neg_features = [(featx(w_lst), 'neg') for w_lst in neg_words]

    negoff = int(len(neg_features) * 0.9)
    posoff = int(len(pos_features) * 0.9)

    r_pos_cut = pos_features[:posoff]
    r_neg_cut = neg_features[:negoff]
    print(r_pos_cut is None, '---r_pos_cut----', len(r_pos_cut))
    print(r_neg_cut is None, '---r_neg_cut----', len(r_neg_cut))

    t_pos_cut = pos_features[posoff:]
    t_neg_cut = neg_features[negoff:]
    print(t_pos_cut is None, '---t_pos_cut----', len(t_pos_cut))
    print(t_neg_cut is None, '---t_neg_cut----', len(t_neg_cut))

    r_pos_cut.extend(r_neg_cut)
    train_set = r_pos_cut
    t_pos_cut.extend(t_neg_cut)
    test_set = t_pos_cut
    print(train_set is None, '---train_set----', len(train_set))
    print(test_set is None, '-----test_set--', len(test_set))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    nba = nltk.classify.accuracy(nb_classifier, test_set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = nb_classifier.classify(feats)
        testsets[observed].add(i)
    print("NBayes accuracy is %.7f" % nba)  # 0.5325077

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    svm_classifier = SklearnClassifier(LinearSVC()).train(train_set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = svm_classifier.classify(feats)
        testsets[observed].add(i)
    svmm = nltk.classify.accuracy(svm_classifier, test_set)
    print("SVM accuracy is %.7f" % svmm)  # 0.6604747

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    maxent_classifier = nltk.classify.MaxentClassifier.train(train_set, max_iter=7)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = maxent_classifier.classify(feats)
        testsets[observed].add(i)
    maxent = nltk.classify.accuracy(maxent_classifier, test_set)
    print("MaxentClassifier accuracy is %.7f" % maxent)  # 0.6449948

    classifier_pkl = os.path.join(config.test_path, 'my_classifier_svm.pkl')  # negative corpus
    with open(classifier_pkl, 'wb') as f:
        pickle.dump(svm_classifier, f)

    classifier_pkl = os.path.join(config.test_path, 'my_classifier_maxent.pkl')  # negative corpus
    with open(classifier_pkl, 'wb') as f:
        pickle.dump(maxent_classifier, f)

    classifier_pkl = os.path.join(config.test_path, 'my_classifier_nb.pkl')  # negative corpus
    with open(classifier_pkl, 'wb') as f:
        pickle.dump(nb_classifier, f)

    print('done!')
def hello():
    f = open("Training_amazon_data.txt")
    pos_tweets = list()
    neg_tweets = list()
    for line in f:
        words = line.split("\t")
        if words[1] == '0\n' or words[1] == '0':
            neg_tweets.append(words)
        else:
            pos_tweets.append(words)
    f.close()

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = nltk.classify.apply_features(extract_features, tweets)
    # classifier = nltk.NaiveBayesClassifier.train(training_set)
    classifier = SklearnClassifier(BernoulliNB()).train(training_set)

    form = ReusableForm(request.form)
    print(form.errors)
    if request.method == 'POST':
        name = request.form['name']

        file = open("test.txt")
        resfile = open("result.txt", "w")
        for line in file:
            review = classifier.classify(extract_features(line.split()))
            resfile.write(line)
            resfile.write(review)
        file.close()
        resfile.close()

        # if classifier.classify(extract_features(name.split())) == '1':
        #     review = 'Positive'
        # else:
        #     review = 'Negative'
        name = classifier.classify(extract_features(name.split()))
        print(name)

        if form.validate():
            # Save the comment here.
            flash(name)
        else:
            flash('Error: All the form fields are required. ')

    return render_template('analysis.html', form=form)
from sklearn.svm import SVC

# TRAINING AND TEST DATA
def SVM_Classifier():
    train = [('I love this sandwich.', 'pos'),
             ('This is an amazing place!', 'pos'),
             ('I feel very good about these beers.', 'pos'),
             ('This is my best work.', 'pos'),
             ("What an awesome view", 'pos'),
             ('I do not like this restaurant', 'neg'),
             ('I am tired of this stuff.', 'neg'),
             ("I can't deal with this", 'neg'),
             ('He is my sworn enemy!', 'neg'),
             ('My boss is horrible.', 'neg')]

    test = [('The beer was good.', 'pos'),
            ('I do not enjoy my job', 'neg'),
            ("I ain't feeling dandy today.", 'neg'),
            ("I feel amazing!", 'pos'),
            ('Gary is a friend of mine.', 'pos'),
            ("I can't believe I'm doing this.", 'neg')]

    test_sentence = "This is the best band I've ever heard!"

    # FEATURESETS
    all_words = set(word.lower() for passage in train
                    for word in word_tokenize(passage[0]))
    t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1])
         for x in train]
    testf = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1])
             for x in test]
    test_sent_features = {word.lower(): (word in word_tokenize(test_sentence.lower()))
                          for word in all_words}

    # CLASSIFICATION
    # SUPPORT VECTOR MACHINE
    classif1 = SklearnClassifier(SVC(), sparse=False).train(t)
    return classif1.classify(test_sent_features)
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))

# Model #4: **UNIGRAMS** & SVM Model

# In[28]:

# Create an SVM to compare which is the better performing model
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
for i, (uni_featureset, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = SVM_classifier.classify(uni_featureset)
    testsets[observed].add(i)

print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))

# Model #5: In order to get more context, we should start modeling **BIGRAMS**
# & Naive Bayes with the same dataset and compare

# In[29]:

rl = zip(reviews, ratings)
def evaluate_bow():
    lines = codecs.open(BC3_LABELLED_FILE, "r").readlines()
    data = []
    gold = []
    for i, line in enumerate(lines):
        tokens = line.strip().split()
        if len(tokens) > 2:
            label = tokens.pop(0)
            tag = tokens.pop(0)
            if tag == "none":
                continue
            # peek at the next line's label; guard the index so the last
            # line does not raise an IndexError
            if i < len(lines) - 1 and len(lines[i + 1].strip().split()) > 2:
                next_label = lines[i + 1].strip().split().pop(0)
            else:
                next_label = "T"
            gold.append(tag)
            data.append((FreqDist(tokens), tag, next_label))

    limit = int(float(len(data)) * 0.8)

    # training set: bag-of-words and tag tuples
    train = [(bow, tag) for bow, tag, next_label in data[:limit]]

    # training the classifier
    classifier = SklearnClassifier(MultinomialNB()).train(train)

    results = {"segmented": [], "unsegmented": []}
    all_choices = []  # all choices made
    choices = []      # choices for the current segment
    nb = 1            # number of lines in the segment
    for i, (bow, tag, next_label) in enumerate(data[limit:]):
        # bow classification
        choice = classifier.classify(bow)
        choices.append(choice)
        all_choices.append(choice)

        # line-by-line classification for unsegmented results
        results["unsegmented"].append(choice)

        # more complex classification for segmented results
        if next_label == "T":
            most_common = Counter(choices).most_common()
            if len(most_common) > 1:
                # break ties by global frequency of each candidate
                tf = FreqDist(all_choices)
                vote = most_common[0][0]
                best = 1
                for candidate, occ in most_common:
                    if tf[candidate] > best:
                        vote = candidate
                        best = tf[candidate]
            else:
                vote, occ = most_common[0]
            results["segmented"] += [vote for choice in choices]
            choices = []
            nb = 1
        else:
            nb += 1  # incrementing the current number of lines in the bag

    for i, label in enumerate(gold[limit:]):
        bow, tag, next_label = data[i + limit]
        print("# {0}\t{1}\t{2}".format(label, results["unsegmented"][i],
                                       results["segmented"][i]))
        if next_label == "T":
            print("# ------------------")

    # segmented metrics
    sp = metrics.precision_score(gold[limit:], results["segmented"])
    sr = metrics.recall_score(gold[limit:], results["segmented"])
    sf = (2.0 * (sr * sp)) / (sr + sp)

    # unsegmented metrics
    up = metrics.precision_score(gold[limit:], results["unsegmented"])
    ur = metrics.recall_score(gold[limit:], results["unsegmented"])
    uf = (2.0 * (ur * up)) / (ur + up)

    print("#")
    print("# Pre.:\t\tRec:\t\tF1:")
    print("# segmented: {0}%\t\t{1}%\t\t{2}%".format(dec(sp * 100), dec(sr * 100), dec(sf * 100)))
    print("# non-segmented: {0}%\t\t{1}%\t\t{2}%".format(dec(up * 100), dec(ur * 100), dec(uf * 100)))
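# Input sketch (assumption, inferred from the parsing above): each line of
# BC3_LABELLED_FILE holds "<label> <tag> <token> <token> ...", and lines
# tagged "none" are skipped. A hypothetical fragment:
#
#   T greet hello everyone
#   F body the meeting has moved to friday afternoon
#   T none this line would be ignored
#   T sign thanks and see you there bob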
print("UnigramsLogit Recall") print('Bullying recall:', recall(testset['Bullying'], refset['Bullying'])) print("") # In[34]: #Run Support Vector Machine for Unigrams from nltk.classify import SklearnClassifier from sklearn.svm import SVC SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set) for i, (feats, label) in enumerate(test_set): refset[label].add(i) observed = SVM_classifier.classify(feats) testset[observed].add(i) print("UniigramSVM Recall") print('Bullying recall:', recall(testset['Bullying'], refset['Bullying'])) # In[35]: #Do the same thing with bigrams from nltk import bigrams, trigrams from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures # In[36]:
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    classifierName = 'SVM'
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    newsdata = {}
    '''
    # batch mode: walk a directory of news files and tally pos/neg per date
    news_path = "./xa/"
    out_ = open('result.txt', 'w')
    for root, dirs, files in os.walk(news_path):
        for name in files:
            if name == ".DS_Store":
                continue
            fp = open(root + '/' + name, 'r')
            date = ''
            text = []
            gotDate = False
            for line in fp:
                if not gotDate:
                    date = line.replace('\n', '')
                    gotDate = True
                    if date not in newsdata:
                        newsdata[date] = [0, 0]
                else:
                    if len(line.strip()) == 0:
                        gotDate = False
                        continue
                    text.append(line)
                    newsfeat = [(featx(f), date) for f in word_split(text)]
                    del text[:]
                    observed = classifier.classify(newsfeat[0][0])
                    if observed == 'neg':
                        newsdata[date][1] += 1
                    else:
                        newsdata[date][0] += 1
                    gotDate = False
            fp.close()
    for date in newsdata:
        out_.write(date + '\n' + str(newsdata[date][0]) + ', ' + str(newsdata[date][1]) + '\n')
    out_.close()
    '''

    out_ = open('TEST_result.txt', 'w')
    fp = open('test_half_half.txt', 'r')
    date = ''
    text = []
    gotDate = False
    for line in fp:
        if not gotDate:
            date = line.replace('\n', '')
            gotDate = True
            if date not in newsdata:
                newsdata[date] = [0, 0]
        else:
            if len(line.strip()) == 0:
                gotDate = False
                continue
            text.append(line)
            print(text)
            newsfeat = [(featx(f), date) for f in word_split(text)]
            del text[:]
            observed = classifier.classify(newsfeat[0][0])
            if observed == 'neg':
                newsdata[date][1] += 1
                print('------------------------------ ' + 'neg')
            else:
                newsdata[date][0] += 1
                print('------------------------------ ' + 'pos')
            gotDate = False
    fp.close()

    for date in newsdata:
        out_.write(date + '\n' + str(newsdata[date][0]) + ', ' + str(newsdata[date][1]) + '\n')
    out_.close()
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    print("No of training reviews:", len(trainfeats))
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print("No of testing reviews:", len(testfeats))

    # using 3 classifiers
    classifier_list = ['nb', 'svm', 'maxent']

    NB_pred = []
    new_label = []
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        original_label = []
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            original_label.append(label)
            observed = classifier.classify(feats)
            NB_pred.append(observed)  # predictions of all three classifiers, in order
            testsets[observed].add(i)

        new_label = original_label

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
        print('')

    # NB_pred holds 3 x 491 predictions: NB first, then SVM, then MaxEnt
    # (491 is the size of this test set)
    ME_pred = NB_pred[982:]
    SVM_pred = NB_pred[491:982]
    NB_pred = NB_pred[0:491]

    # majority vote of the three classifiers
    final_pred = []
    for i in range(0, 491):
        c1 = 0
        if NB_pred[i] == 'pos':
            c1 = c1 + 1
        if ME_pred[i] == 'pos':
            c1 = c1 + 1
        if SVM_pred[i] == 'pos':
            c1 = c1 + 1
        if c1 == 3 or c1 == 2:
            final_pred.append('pos')
        else:
            final_pred.append('neg')

    print("-----------------------")
    print("Results of ensemble: NB + SVM + ME::")
    print("----------Confusion Matrix--------------")
    cm = confusion_matrix(final_pred, new_label)
    print(cm)
    print("")
    print("The accuracy score of ensemble is {:.2%}".format(accuracy_score(final_pred, new_label)))
    print("##############################################")

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # Shuffle the train set: in cross-validation a test chunk might otherwise
    # contain only negative or only positive data.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    for cl in classifier_list:
        subset_size = int(len(trainfeats) / n)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(training_this_round, 'GIS',
                                                    trace=0, encoding=None,
                                                    labels=None,
                                                    gaussian_prior_sigma=0,
                                                    max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for j, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(j)
                observed = classifier.classify(feats)
                testsets[observed].add(j)

            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)

        if cl == 'maxent':
            maxent_accuracy.append(sum(accuracy) / n)
        elif cl == 'svm':
            svm_accuracy.append(sum(accuracy) / n)
        else:
            nb_accuracy.append(sum(accuracy) / n)
def evaluate_mult_classifiers(feature_x, n_folds=5):
    # 5-fold cross-validation by default.
    # train_feats = 75% of pos_data + 75% of neg_data
    # test_feats  = 25% of pos_data + 25% of neg_data
    # (Note: the split below is computed but then superseded; the
    # cross-validation re-pools all of the data and shuffles it.)
    neg_feats = [(feature_x(i), 'neg') for i in word_split(neg_data)]
    pos_feats = [(feature_x(i), 'pos') for i in word_split(pos_data)]

    neg_cutoff = int(len(neg_feats) * 0.75)
    pos_cutoff = int(len(pos_feats) * 0.75)

    train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
    test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

    classifier_list = ['NB', 'SVM']

    ## CROSS VALIDATION
    train_feats = neg_feats + pos_feats

    # Shuffle training set
    random.shuffle(train_feats)

    for cl in classifier_list:
        subset_size = int(len(train_feats) / n_folds)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1

        print('--------------------------')
        print('Beginning Cross-validation')
        print('--------------------------')

        for i in range(n_folds):
            testing_this_round = train_feats[i * subset_size:][:subset_size]
            training_this_round = (train_feats[:i * subset_size] +
                                   train_feats[(i + 1) * subset_size:])

            if cl == 'NB':
                classifierName = 'Naive Bayes'
                # Using NLTK NaiveBayesClassifier
                classifier = NaiveBayesClassifier.train(training_this_round)
            else:
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)

            ref_sets = collections.defaultdict(set)
            test_sets = collections.defaultdict(set)

            # j, not i: the fold index must survive this inner loop
            for j, (feats, label) in enumerate(testing_this_round):
                ref_sets[label].add(j)
                observed = classifier.classify(feats)
                test_sets[observed].add(j)

            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = nltk.precision(ref_sets['pos'], test_sets['pos'])
            cv_pos_recall = nltk.recall(ref_sets['pos'], test_sets['pos'])
            cv_pos_fmeasure = nltk.f_measure(ref_sets['pos'], test_sets['pos'])
            cv_neg_precision = nltk.precision(ref_sets['neg'], test_sets['neg'])
            cv_neg_recall = nltk.recall(ref_sets['neg'], test_sets['neg'])
            cv_neg_fmeasure = nltk.f_measure(ref_sets['neg'], test_sets['neg'])

            print('Fold: {} Acc       : {:.4F}'.format(cv_count, cv_accuracy))
            print('Fold: {} pos_prec  : {:.4F} neg_prec  : {:.4F}'.format(
                cv_count, cv_pos_precision, cv_neg_precision))
            print('Fold: {} pos_recall: {:.4F} neg_recall: {:.4F}'.format(
                cv_count, cv_pos_recall, cv_neg_recall))
            print('Fold: {} pos_fmeas : {:.4F} neg_fmeas : {:.4F}'.format(
                cv_count, cv_pos_fmeasure, cv_neg_fmeasure))
            print('--')

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            cv_count += 1

        print('----------------------------------------------------------')
        print('{}-Fold Cross Validation results for {} Classifier'.format(
            n_folds, classifierName))
        print('----------------------------------------------------------')
        print('accuracy : {:.4F}'.format(sum(accuracy) / n_folds))
        print('precision: {:.4F}'.format(
            (sum(pos_precision) / n_folds + sum(neg_precision) / n_folds) / 2))
        print('recall   : {:.4F}'.format(
            (sum(pos_recall) / n_folds + sum(neg_recall) / n_folds) / 2))
        print('f-measure: {:.4F}'.format(
            (sum(pos_fmeasure) / n_folds + sum(neg_fmeasure) / n_folds) / 2))
        print('\n')
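# Example driver for evaluate_mult_classifiers(). word_feats here is an
# assumed bag-of-words featurizer in the usual NLTK dict-of-booleans form;
# substitute the feature extractor the project actually uses:

def word_feats(words):
    # Map every token to True: the standard NLTK bag-of-words encoding.
    return dict((word, True) for word in words)

# evaluate_mult_classifiers(word_feats, n_folds=5)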
"to": "2018-07-18 00:00:00" } } }] } } } res = es.search(index='twint', doc_type='items', body=doc, scroll='1m') count = 0 print len(res) while len(res) > 0: #if True: scrollId = res['_scroll_id'] print len(res), res['hits']['hits'] #res = es.scroll(scroll_id = scrollId, scroll = '1m') print len(res) #quit() for doc in res['hits']['hits']: print doc['_id'] cf = [(word_feats(f), '') for f in word_split([doc['_source']['tweet']])] observed = classifier.classify(cf[0][0]) count = count + 1 print(doc['_id'], observed, count) es.update(index='twint', doc_type='items', id=doc['_id'], body={"doc": { "polarity": observed }})
def runClassifiers(positives, negatives, featuresToUse, outFile, verbose, classifiersToUse):
    table = []
    pos = []
    neg = []

    # Pad the classifier flags out to NUM_CLASSIFIERS entries so the
    # positional checks below are always safe.
    short = NUM_CLASSIFIERS - len(classifiersToUse)
    for x in range(short):
        classifiersToUse.append(False)

    # print which features we are using
    print("Using these features: ", FeatureExtractor.featuresToString(featuresToUse))

    for data in positives:
        pos.append((FeatureExtractor.langFeatures(data, featuresToUse), True))
    for data in negatives:
        neg.append((FeatureExtractor.langFeatures(data, featuresToUse), False))

    random.shuffle(pos)
    random.shuffle(neg)

    # Testing is 1/4 of the (smaller) data set, so we cut it off there.
    # Note the negative training slice starts at negCut = 2 * posCut, as in
    # the source, so some negative examples go unused.
    minLen = min(len(pos), len(neg))
    posCut = minLen // 4
    negCut = posCut * 2

    # splits training and test sets
    train_data = pos[posCut:] + neg[negCut:]
    test_data = pos[:posCut] + neg[:posCut]

    maxEntSupport = featuresToUse["max_ent"]

    if classifiersToUse[0]:
        print("Running Naive Bayes classifier")
        timeStart = time.time()
        # NLTK's built-in implementation of the Naive Bayes classifier
        classifier = nltk.NaiveBayesClassifier.train(train_data)
        # attempt to use sklearn naive bayes, not as good unfortunately
        # clf = MultinomialNB()
        # if featuresToUse["words"] or featuresToUse["ngrams"]:
        #     pipeline = Pipeline([('tfidf', TfidfTransformer()),
        #                          ('chi2', SelectKBest(chi2, k='all')),
        #                          ('NB', clf)])
        #     classifier = SklearnClassifier(pipeline)
        # else:
        #     classifier = SklearnClassifier(clf)
        # classifier.train(train_data)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Naive Bayes", maxEntSupport))
        if verbose:
            # report the most impactful features the NB classifier found
            print("\n\n")
            print(classifier.show_most_informative_features(20))

    if classifiersToUse[1]:
        print("Running Decision Tree classifier")
        timeStart = time.time()
        # NLTK's built-in implementation of the Decision Tree classifier
        classifier = nltk.DecisionTreeClassifier.train(train_data)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        table.append(assess_classifier(classifier, test_data, "Decision Tree"))
        if verbose:
            print("Printing tree")
            # print(classifier.pretty_format())
            for (feats, cor) in test_data[:20]:
                classification = classifier.classify(feats)
                print("Correct: ", cor, " Result: ", classification)

    if classifiersToUse[2]:
        print("Running Maximum Entropy classifier")
        timeStart = time.time()
        # NLTK's built-in implementation of the Max Entropy classifier
        classifier = nltk.MaxentClassifier.train(train_data, max_iter=25)
        if featuresToUse["laugh_count"]:
            DataCreator.pickleData("pickled_data/MaxEnt_Full", classifier)
        else:
            DataCreator.pickleData("pickled_data/MaxEnt_Part", classifier)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        table.append(assess_classifier(classifier, test_data, "Maximum Entropy"))
        if verbose:
            # report the most impactful features the classifier found
            print(classifier.show_most_informative_features(20))
            # explain the effect of each feature on a classification
            # print(classifier.explain())

    if classifiersToUse[3]:
        print("Running SVM classifier")
        timeStart = time.time()
        # Scikit-learn's LinearSVC classifier, wrapped in NLTK's wrapper class
        clf = LinearSVC()
        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('PCA', PCA()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        table.append(assess_classifier(classifier, test_data, "Linear SVC"))

    if classifiersToUse[4]:
        numEstimators = 50
        print("Running AdaBoost classifier")
        timeStart = time.time()
        # Scikit-learn's AdaBoost classifier wrapped in NLTK's wrapper class.
        # The main parameters to tune for good results are n_estimators and
        # the complexity of the base estimators.
        clf = AdaBoostClassifier(n_estimators=numEstimators)
        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('TruncatedSVD', TruncatedSVD()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        table.append(assess_classifier(classifier, test_data,
                                       "AdaBoost(" + str(numEstimators) + ")",
                                       maxEntSupport))

    if classifiersToUse[5]:
        print("Running Random Forest classifier")
        timeStart = time.time()
        # Scikit-learn's Random Forest classifier wrapped in NLTK's wrapper
        # class. The main parameter to tune is n_estimators.
        clf = RandomForestClassifier()
        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('TruncatedSVD', TruncatedSVD()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        table.append(assess_classifier(classifier, test_data, "Random Forest", maxEntSupport))

    if classifiersToUse[6]:
        numEstimators = 50
        print("Running Combo classifier")
        timeStart = time.time()
        # AdaBoost and Naive Bayes voting together: predict positive only
        # when both classifiers agree.
        adaclf = SklearnClassifier(AdaBoostClassifier(n_estimators=numEstimators))
        adaclf.train(train_data)
        naive = nltk.NaiveBayesClassifier.train(train_data)
        print("\nTime to train in seconds: ", time.time() - timeStart)
        TP = TN = FP = FN = 0
        for feats, label in test_data:
            observed = naive.classify(feats) and adaclf.classify(feats)
            if label == observed:
                if observed:
                    TP += 1
                else:
                    TN += 1
            else:
                if observed:
                    FP += 1
                else:
                    FN += 1
        accuracy = (TP + TN) / (TP + FP + TN + FN)
        p_prec = TP / (TP + FP)
        p_rec = TP / (TP + FN)
        f1Pos = 2 * ((p_prec * p_rec) / (p_prec + p_rec))
        n_prec = TN / (TN + FN)
        n_rec = TN / (TN + FP)
        f1Neg = 2 * ((n_prec * n_rec) / (n_prec + n_rec))
        table.append(["COMBO", accuracy, p_prec, p_rec, f1Pos,
                      n_prec, n_rec, f1Neg])

    headers = ["Classifier", "accuracy", "pos precision", "pos recall",
               "pos f1", "neg precision", "neg recall", "neg f1"]
    if outFile == "":
        print("\n", FeatureExtractor.featuresToString(featuresToUse))
        print(tabulate(table, headers=headers))
    else:
        with open(outFile, 'a') as out:
            out.write("\n")
            out.write(FeatureExtractor.featuresToString(featuresToUse))
            out.write(tabulate(table, headers=headers))
            out.write("\n")
    return table
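# runClassifiers() relies on an assess_classifier() helper that is not shown
# in this snippet. Judging from the table headers and the hand-built COMBO
# row above, it returns one row of [name, accuracy, pos precision,
# pos recall, pos f1, neg precision, neg recall, neg f1]. A minimal sketch
# under that assumption (maxEntSupport is accepted only for call
# compatibility; labels are the booleans True/False used above):

def assess_classifier(classifier, test_data, name, maxEntSupport=False):
    TP = TN = FP = FN = 0
    for feats, label in test_data:
        observed = classifier.classify(feats)
        if observed and label:
            TP += 1
        elif observed and not label:
            FP += 1
        elif not observed and label:
            FN += 1
        else:
            TN += 1
    accuracy = (TP + TN) / (TP + FP + TN + FN)
    p_prec = TP / (TP + FP) if TP + FP else 0.0
    p_rec = TP / (TP + FN) if TP + FN else 0.0
    f1_pos = 2 * p_prec * p_rec / (p_prec + p_rec) if p_prec + p_rec else 0.0
    n_prec = TN / (TN + FN) if TN + FN else 0.0
    n_rec = TN / (TN + FP) if TN + FP else 0.0
    f1_neg = 2 * n_prec * n_rec / (n_prec + n_rec) if n_prec + n_rec else 0.0
    return [name, accuracy, p_prec, p_rec, f1_pos, n_prec, n_rec, f1_neg]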
def evaluate_classifier(featx, balance=False):
    global negdata
    global neudata
    global posdata

    if balance:
        # Downsample the positive and neutral sets to the size of the
        # negative set.
        neudata = resample(neudata, n_samples=len(negdata))
        posdata = resample(posdata, n_samples=len(negdata))

    # using 3 classifiers
    classifier_list = ['svm', 'nb', 'maxent']

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    neufeats = [(featx(f), 'neu') for f in word_split(neudata)]
    alldata = negdata + posdata + neudata
    allfeats = negfeats + posfeats + neufeats

    # 10-fold cross-validation
    correct = []
    incorrect = []
    for n in [10]:  # range(2, 6):
        negfeatssplit = chunkIt(negfeats, n)
        posfeatssplit = chunkIt(posfeats, n)
        neufeatssplit = chunkIt(neufeats, n)
        for cl in classifier_list:
            accuracy = []
            pos_precision = []
            pos_recall = []
            neg_precision = []
            neg_recall = []
            neu_precision = []
            neu_recall = []
            pos_fmeasure = []
            neg_fmeasure = []
            neu_fmeasure = []
            cv_count = 1
            res = {'neg': 0, 'pos': 0, 'neu': 0}
            for i in range(n):
                # Chunk i - 1 is held out for testing (i = 0 tests on the
                # last chunk); the rest is used for training.
                testing_this_round = (negfeatssplit[i - 1] +
                                      posfeatssplit[i - 1] +
                                      neufeatssplit[i - 1])
                training_this_round = (gettrainfeat(negfeatssplit, i) +
                                       gettrainfeat(posfeatssplit, i) +
                                       gettrainfeat(neufeatssplit, i))

                if cl == 'maxent':
                    classifierName = 'Maximum Entropy'
                    classifier = MaxentClassifier.train(training_this_round,
                                                        'GIS', trace=0,
                                                        encoding=None,
                                                        labels=None,
                                                        gaussian_prior_sigma=0,
                                                        max_iter=1)
                elif cl == 'svm':
                    classifierName = 'SVM'
                    classifier = SklearnClassifier(LinearSVC(), sparse=False)
                    classifier.train(training_this_round)
                else:
                    classifierName = 'Naive Bayes'
                    classifier = NaiveBayesClassifier.train(training_this_round)

                refsets = collections.defaultdict(set)
                testsets = collections.defaultdict(set)
                # aux_test counts correct predictions per true label;
                # auxFP_test counts all predictions per predicted label.
                aux_test = {'pos': 0, 'neu': 0, 'neg': 0}
                auxFP_test = {'pos': 0, 'neu': 0, 'neg': 0}
                for ii, (feats, label) in enumerate(testing_this_round):
                    refsets[label].add(ii)
                    observed = classifier.classify(feats)
                    testsets[observed].add(ii)
                    res[observed] += 1
                    auxFP_test[observed] += 1
                    if observed == label:
                        correct.append((feats, label))
                        aux_test[label] += 1
                    else:
                        incorrect.append((feats, label))

                cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                          testing_this_round)

                # precision = correct / total predicted as the class;
                # recall = correct / total actually in the class
                cv_neg_precision = float(aux_test['neg']) / float(auxFP_test['neg'])
                cv_neg_recall = float(aux_test['neg']) / float(len(negfeatssplit[i - 1]))
                cv_neg_fmeasure = 2 * ((cv_neg_precision * cv_neg_recall) /
                                       (cv_neg_precision + cv_neg_recall))
                cv_pos_precision = float(aux_test['pos']) / float(auxFP_test['pos'])
                cv_pos_recall = float(aux_test['pos']) / float(len(posfeatssplit[i - 1]))
                cv_pos_fmeasure = 2 * ((cv_pos_precision * cv_pos_recall) /
                                       (cv_pos_precision + cv_pos_recall))
                cv_neu_precision = float(aux_test['neu']) / float(auxFP_test['neu'])
                cv_neu_recall = float(aux_test['neu']) / float(len(neufeatssplit[i - 1]))
                cv_neu_fmeasure = 2 * ((cv_neu_precision * cv_neu_recall) /
                                       (cv_neu_precision + cv_neu_recall))

                accuracy.append(cv_accuracy)
                pos_precision.append(cv_pos_precision)
                neg_precision.append(cv_neg_precision)
                neu_precision.append(cv_neu_precision)
                pos_recall.append(cv_pos_recall)
                neg_recall.append(cv_neg_recall)
                neu_recall.append(cv_neu_recall)
                pos_fmeasure.append(cv_pos_fmeasure)
                neg_fmeasure.append(cv_neg_fmeasure)
                neu_fmeasure.append(cv_neu_fmeasure)
                cv_count += 1

            print("Balance = ", balance)
            print('---------------------------------------')
            print(str(n) + '-FOLD CROSS VALIDATION RESULT (' + classifierName + ')')
            print("Nbr = ", res)
            print('accuracy:', sum(accuracy) / n)
            print('precision', (sum(pos_precision) / n + sum(neg_precision) / n +
                                sum(neu_precision) / n) / 3.0)
            print(sum(pos_precision) / n, sum(neg_precision) / n,
                  sum(neu_precision) / n)
            print('recall', (sum(pos_recall) / n + sum(neg_recall) / n +
                             sum(neu_recall) / n) / 3.0)
            print(sum(pos_recall) / n, sum(neg_recall) / n, sum(neu_recall) / n)
            print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n +
                                sum(neu_fmeasure) / n) / 3.0)
            print(sum(pos_fmeasure) / n, sum(neg_fmeasure) / n,
                  sum(neu_fmeasure) / n)

    # Dump the correctly and incorrectly classified examples, mapped back to
    # the raw text via their position in allfeats.
    print("*********CORRECT****")
    print(len(correct), len(incorrect))
    for tt in correct:
        print(tt[1], alldata[allfeats.index(tt)])
    print("***INCORRECT**********")
    for tt in incorrect:
        print(tt[1], alldata[allfeats.index(tt)])
    print("...")
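# evaluate_classifier() above depends on two helpers, chunkIt() and
# gettrainfeat(), that are not defined in this snippet. Below are minimal
# sketches consistent with how they are called (the fold loop tests on
# chunk i - 1, so gettrainfeat() must hold out that same chunk); they are
# assumptions, not the original implementations:

def chunkIt(seq, num):
    # Split seq into num chunks of near-equal size.
    avg = len(seq) / float(num)
    out = []
    last = 0.0
    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg
    return out

def gettrainfeat(splits, i):
    # Concatenate every chunk except the one held out for testing.
    held_out = (i - 1) % len(splits)
    train = []
    for j, chunk in enumerate(splits):
        if j != held_out:
            train.extend(chunk)
    return train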