def evaluate(self, evaluation_set=None):
    """Evaluate the classifier on evaluation_set and return an EvaluationResult.

    'uncertain' is treated as the positive class when counting true/false
    positives and negatives.

    :param evaluation_set: iterable of sentence objects exposing
        ``certainty``, ``id`` and ``string`` attributes.
    :returns: ``EvaluationResult(tp, fp, tn, fn)``.
    :raises ValueError: if no evaluation_set is given (the old fallback to
        an internal evaluation corpus was disabled and has been removed).
    :raises Exception: if the classifier emits a label other than
        'certain' or 'uncertain'.
    """
    if evaluation_set is None:
        # Fail fast with a clear message instead of an opaque TypeError
        # from iterating None below.
        raise ValueError('evaluate() requires an explicit evaluation_set')

    eval_featuresets = [self.sentenceFeatures(sent) for sent in evaluation_set]
    results = self._classifier.batch_classify(eval_featuresets)

    tp, fp, tn, fn = 0, 0, 0, 0
    false_pos_sents = []
    false_neg_sents = []
    for sent, predicted in zip(evaluation_set, results):
        actual = sent.certainty
        if predicted == 'uncertain':
            if predicted == actual:
                tp += 1
            else:
                fp += 1
                false_pos_sents.append(sent)
                debug('FP:' + sent.id.encode('UTF-8') + ': '
                      + sent.string.encode('UTF-8'))
        elif predicted == 'certain':
            if predicted == actual:
                tn += 1
            else:
                fn += 1
                false_neg_sents.append(sent)
        else:
            # Any other label means the classifier was trained on
            # unexpected data.
            raise Exception("Evaluation error!")
    debug('Size of evaluation set: ' + str(len(evaluation_set)))
    return EvaluationResult(tp, fp, tn, fn)
def evaluate(self, evaluation_set=None):
    """Run the trained classifier over evaluation_set and return an
    EvaluationResult holding true/false positive/negative counts.

    'uncertain' is the positive class. If no evaluation set is provided,
    the evaluation corpus is used.
    """
    #if (evaluation_set == None):
    #    evaluation_set = [(sent, sent.certainty) for sent in self._evalcorpus.sents()]
    #evaluation_set = evaluation_set[0:50]
    featuresets = [self.sentenceFeatures(s) for s in evaluation_set]
    predictions = self._classifier.batch_classify(featuresets)
    gold_labels = [s.certainty for s in evaluation_set]

    true_pos = false_pos = true_neg = false_neg = 0
    fp_sentences = []
    fn_sentences = []
    for sent, predicted, expected in zip(evaluation_set, predictions, gold_labels):
        correct = (predicted == expected)
        if predicted == 'uncertain' and correct:
            true_pos += 1
        elif predicted == 'uncertain':
            false_pos += 1
            fp_sentences.append(sent)
            debug('FP:' + sent.id.encode('UTF-8') + ': ' + sent.string.encode('UTF-8'))
        elif predicted == 'certain' and correct:
            true_neg += 1
        elif predicted == 'certain':
            false_neg += 1
            fn_sentences.append(sent)
            #debug('FN:'+sent.id.encode('UTF-8')+': '+sent.string.encode('UTF-8'))
        else:
            raise Exception("Evaluation error!")
    debug('Size of evaluation set: ' + str(len(evaluation_set)))
    return EvaluationResult(true_pos, false_pos, true_neg, false_neg)
def _build_bow_features(self, training_set):
    """Build the Bag-of-Words feature dictionary used by the classifier.

    Features are the ``self._mostCommonSize`` most frequent tokens in
    training_set minus the ``self._stopWordsSize`` most frequent ones
    (treated as stop words). When ``self._useBOL`` is true, GENIA lemmas
    are used as tokens instead of surface words; both are lowercased.
    Note that the resulting set depends on the given training_set.

    Side effects — sets three attributes:
      ``self._feature_words``     list of feature vocabulary words
      ``self._empty_feature_set`` mapping word -> feature index
      ``self._empty_int_set``     mapping feature index -> 0 (template)
    """
    # Choose the GENIA-tuple field once instead of duplicating the loop:
    # tuples are (word, lemma, pos, chunk, ne).
    if self._useBOL:
        debug('Using word lemmas as features for the Bag of Words.')
        field = 1  # lemma
    else:
        field = 0  # surface word
    tokens = [genia_tuple[field].lower()
              for sent in training_set
              for genia_tuple in sent.genia_words]

    counted_words = Counter(tokens)
    most_common_words = [w for w, _ in counted_words.most_common(self._mostCommonSize)]
    too_common_words = [w for w, _ in counted_words.most_common(self._stopWordsSize)]
    debug("Words excluded from dictionary:")
    debug(too_common_words)

    # NOTE: the set difference loses frequency ordering and yields fewer
    # than _mostCommonSize features; kept as-is to match existing behavior.
    self._feature_words = list(set(most_common_words) - set(too_common_words))
    self._empty_feature_set = {}
    self._empty_int_set = {}
    for index, word in enumerate(self._feature_words):
        self._empty_feature_set[word] = index
        self._empty_int_set[index] = 0
    debug('Size of dictionary: ' + str(len(self._empty_feature_set)))
def _build_bow_features(self, training_set):
    """Derive the Bag-of-Words feature dictionary from training_set.

    The vocabulary consists of the 'size' most common tokens with the
    first 'stop_words_size' most common ones excluded. Tokens are
    lowercased GENIA lemmas when ``self._useBOL`` is set, surface words
    otherwise. The resulting set is dependent on the given training_set.
    """
    tokens = []
    if self._useBOL:
        debug('Using word lemmas as features for the Bag of Words.')
        for sentence in training_set:
            tokens.extend(lemma.lower()
                          for (_, lemma, _, _, _) in sentence.genia_words)
    else:
        for sentence in training_set:
            tokens.extend(word.lower()
                          for (word, _, _, _, _) in sentence.genia_words)

    frequencies = Counter(tokens)
    candidates = [entry[0] for entry in frequencies.most_common(self._mostCommonSize)]
    stop_words = [entry[0] for entry in frequencies.most_common(self._stopWordsSize)]
    debug("Words excluded from dictionary:")
    debug(stop_words)

    self._feature_words = list(set(candidates) - set(stop_words))
    self._empty_feature_set = dict()
    self._empty_int_set = dict()
    for position, feature in enumerate(self._feature_words):
        self._empty_feature_set[feature] = position
        self._empty_int_set[position] = 0
    debug('Size of dictionary: ' + str(len(self._empty_feature_set)))
def train(self, training_set=None):
    """Train the BOW NaiveBayes classifier.

    :param training_set: iterable of sentence objects exposing a
        ``certainty`` attribute and ``genia_words``. Defaults to the
        sentences of ``self._corpus``.
    """
    if training_set is None:
        # BUG FIX: the old default built (sent, certainty) tuples here,
        # but the loop below and _build_bow_features access
        # sent.certainty / sent.genia_words on the elements, which would
        # raise AttributeError on a tuple. Pass bare sentence objects.
        training_set = list(self._corpus.sents())

    # Build the BoW feature vocabulary from this training set.
    self._build_bow_features(training_set)

    # Build a labeled featureset for each sentence.
    labeled_featuresets = []
    for sent in training_set:
        labeled_featuresets.append((self.sentenceFeatures(sent), sent.certainty))
    debug('Size of training set: ' + str(len(labeled_featuresets)))

    self._classifier = NaiveBayesClassifier.train(labeled_featuresets)
def train(self, training_set=None):
    """Train the BOW NaiveBayes classifier.

    :param training_set: iterable of sentence objects exposing a
        ``certainty`` attribute and ``genia_words``. Defaults to the
        sentences of ``self._corpus``.
    """
    if training_set is None:
        # BUG FIX: previously this built (sent, certainty) tuples, but
        # everything downstream (sent.certainty, sent.genia_words in
        # _build_bow_features) expects bare sentence objects, so the
        # default path would raise AttributeError on a tuple.
        training_set = list(self._corpus.sents())

    # Build the BoW feature vocabulary from this training set.
    self._build_bow_features(training_set)

    # Pair each sentence's featureset with its gold certainty label.
    labeled_featuresets = [(self.sentenceFeatures(sent), sent.certainty)
                           for sent in training_set]
    debug('Size of training set: ' + str(len(labeled_featuresets)))

    self._classifier = NaiveBayesClassifier.train(labeled_featuresets)