def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method, to allow
    inspection of what the method is actually doing and how long it is taking.

    @param labeled_featuresets: A list of classified featuresets, i.e., a list
        of tuples C{(featureset, label)}.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print 'There are ' + str(len(labeled_featuresets)) + ' labeled featuresets'

    # Count up how many times each feature value occurred, given
    # the label and feature name.
    print 'Counting feature value occurrence'
    i = 0
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
        print 'At featureset...' + str(i)
        i += 1

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None'. This loop
    # counts up the number of 'missing' feature values for each
    # (label, fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution.
    print 'Making the P(label) distribution...'
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution.
    print 'Making the P(fval|label, fname) distribution from ' \
        + str(len(feature_freqdist)) + ' feature freqs...'
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
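# Hedged usage sketch, not from the original source: NBCtrain above mirrors
# nltk.NaiveBayesClassifier.train, so it can be called with the same kind of
# labeled featuresets. The toy featuresets below are invented for illustration.
def _nbctrain_example():
    labeled_featuresets = [
        ({'contains(great)': True, 'contains(boring)': False}, 'pos'),
        ({'contains(great)': False, 'contains(boring)': True}, 'neg'),
    ]
    classifier = NBCtrain(labeled_featuresets)
    print classifier.classify({'contains(great)': True, 'contains(boring)': False})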
def __init__(self, rebuild=False):
    # declare variables for sentiment searcher
    self.relevant_documents = {}
    # create sentiment model for objectivity
    self.word_features = []
    self.classifier = None
    if os.path.exists('models/sentiment/label_probdist.p') and \
            os.path.exists('models/sentiment/feature_probdist.p') and \
            os.path.exists('models/sentiment/word_feature_list.p') and not rebuild:
        print 'loading sentiment model'
        # load in model files
        with open('models/sentiment/label_probdist.p', 'rb') as label_probdist_file:
            label_probdist = pickle.load(label_probdist_file)
        with open('models/sentiment/feature_probdist.p', 'rb') as feature_probdist_file:
            feature_probdist = pickle.load(feature_probdist_file)
        with open('models/sentiment/word_feature_list.p', 'rb') as word_feature_list_file:
            self.word_features = pickle.load(word_feature_list_file)
        # instantiate classifier
        self.classifier = nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
    else:
        print 'generating sentiment model'
        # get training data
        subjective_sents = nltk.corpus.subjectivity.sents(categories='subj')
        objective_sents = nltk.corpus.subjectivity.sents(categories='obj')
        subjective_docs = [(sent, 'subj') for sent in subjective_sents]
        objective_docs = [(sent, 'obj') for sent in objective_sents]
        # train model
        sentiment_training_data = subjective_docs + objective_docs
        self.create_word_features(self.extract_words(sentiment_training_data))
        self.classifier = self.train_sentiment_classifier(sentiment_training_data)
        # save out model so it will not need to be regenerated
        with open('models/sentiment/label_probdist.p', 'wb') as label_probdist_file:
            pickle.dump(self.classifier._label_probdist, label_probdist_file)
        with open('models/sentiment/feature_probdist.p', 'wb') as feature_probdist_file:
            pickle.dump(self.classifier._feature_probdist, feature_probdist_file)
        with open('models/sentiment/word_feature_list.p', 'wb') as word_feature_list_file:
            pickle.dump(self.word_features, word_feature_list_file)
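# Hedged sketch, not from the original source: the constructor above calls helper
# methods that are not shown here. A typical bag-of-words reading of the two
# word-feature helpers might look like the methods below (on the same class);
# the real implementations may differ.
def extract_words(self, labeled_sents):
    # collect every token from the (tokenized sentence, label) pairs
    return [word for (sent, label) in labeled_sents for word in sent]

def create_word_features(self, words):
    # keep each distinct token as a feature name
    self.word_features = list(set(words))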
def get_classifier(self):
    # wrap the accumulated frequency counts in probability distributions:
    # P(label) from label_freqdist and P(fval|label, fname) per feature
    label_probdist = self.estimator(self.label_freqdist)
    feature_probdist = {}
    for ((label, fname), freqdist) in self.feature_freqdist.iteritems():
        probdist = self.estimator(freqdist, bins=len(self.feature_values[fname]))
        feature_probdist[label, fname] = probdist
    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
def testSetAccuracy(self):
    '''Average classifier accuracy over five random 20-item held-out splits.'''
    accuracy = 0
    for i in range(5):
        print 'iteration %d' % i
        random.shuffle(self.training_set)
        train_set = self.training_set[20:]
        test_set = self.training_set[:20]
        # train on the held-out split; the bare NaiveBayesClassifier constructor
        # takes probability distributions, not a featureset list
        tempClassifier = nltk.NaiveBayesClassifier.train(train_set)
        accuracy += nltk.classify.accuracy(tempClassifier, test_set)
    return accuracy / 5
def read_probdist(self):
    # load the pickled label and feature distributions plus the vocabulary,
    # then rebuild the classifier from them
    with open("label_probdist.dat", "rb") as f:
        label_probdist = pickle.load(f)
    with open("feature_probdist.dat", "rb") as f:
        feature_probdist = pickle.load(f)
    with open("all_words.dat", "rb") as f:
        self.all_words = pickle.load(f)
    self.classifier = nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
def getNaiveBayesTrainedClassifier(dataset):
    #train_set, test_set = getOpinionTrainingData()
    #change this to the currently used corpus approach
    #est = lambda fdist : LaplaceProbDist
    train_set, test_set, prob_dist = dataset()
    from nltk.probability import DictionaryProbDist, LaplaceProbDist
    dict_probs = {'positive': .1, 'negative': .1}
    label_probdist = DictionaryProbDist(dict_probs)
    # note: NaiveBayesClassifier.train() builds a brand-new classifier, so the
    # hand-built instance below only serves as a placeholder before being replaced
    classifier = nltk.NaiveBayesClassifier(label_probdist=label_probdist,
                                           feature_probdist=prob_dist)
    classifier = classifier.train(train_set, estimator=LaplaceProbDist)
    print("Classifier accuracy percent: ", (nltk.classify.accuracy(classifier, test_set)) * 100)
    #print(classifier.show_most_informative_features(10))
    return classifier
def CreatNaiveBayes(self, data):
    # P(label): sum the per-ethnicity counts over all names
    label_freqdist = FreqDist()
    for (name, total, ethList) in data:
        for i in range(5):
            label_freqdist[self._ethicity[i]] += ethList[i]
    label_probdist = ELEProbDist(label_freqdist)

    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    #for (name, total, ethList) in data:
    # x-lets: count each 3-letter substring of a name as a True feature per label
    for (name, total, ethList) in data:
        x_lets = self.get3_let(name)
        for i in range(5):
            for x_let in x_lets:
                feature_freqdist[(self._ethicity[i], x_let)][True] += ethList[i]
                feature_values[x_let].add(True)

    # for names that do not contain a given x-let, count the implicit value None
    for ((label, x_let), freqdist) in feature_freqdist.items():
        num = 0
        for i in range(5):
            if label == self._ethicity[i]:
                num = i
                break
        tot = 0
        for (name, total, ethList) in data:
            if x_let not in name:
                tot += ethList[num]
        feature_values[x_let].add(None)
        if tot > 0:
            feature_freqdist[(label, x_let)][None] += tot

    # P(fval|label, fname)
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    self.classifier = nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
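# Hedged usage sketch, not from the original source: CreatNaiveBayes expects
# (name, total, ethList) tuples, where ethList holds one count per label in
# self._ethicity and self.get3_let(name) yields the 3-letter substrings used as
# feature names. 'model' and the counts below are invented for illustration.
example_data = [
    ('garcia', 120, [5, 90, 10, 10, 5]),
    ('nguyen', 80, [60, 5, 5, 5, 5]),
]
model.CreatNaiveBayes(example_data)
print model.classifier.classify({x_let: True for x_let in model.get3_let('garcia')})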
def load_bayes_from_file(filename):
    # the pickle holds a dict with the classifier's internal distributions
    with open(filename, 'rb') as f:
        d = pickle.load(f)
    return nltk.NaiveBayesClassifier(d["_label_probdist"], d["_feature_probdist"])
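# Hedged counterpart sketch (assumption, not from the original source): a pickle
# that load_bayes_from_file can read could be produced by saving the classifier's
# internal distributions under the same dict keys.
def save_bayes_to_file(classifier, filename):
    with open(filename, 'wb') as f:
        pickle.dump({
            "_label_probdist": classifier._label_probdist,
            "_feature_probdist": classifier._feature_probdist,
        }, f)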
def train(labeled_featuresets, estimator=nltk.ELEProbDist):
    # build the P(label) distribution from the training labels; the per-feature
    # distributions are left empty in this stub
    label_freqdist = nltk.FreqDist(label for (featureset, label) in labeled_featuresets)
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
def train():
    # nltk.NaiveBayesClassifier expects a label probability distribution and a
    # feature probability distribution; constructing it with no arguments will fail
    classifier = nltk.NaiveBayesClassifier()
    print classifier