def prob_classify(self, featureset):
    # Discard any feature names that we've never seen before.
    # Otherwise, we'll just assign a probability of 0 to
    # everything.
    featureset = featureset.copy()
    for fname in list(featureset.keys()):
        for label in self._labels:
            if (label, fname) in self._feature_probdist:
                break
        else:
            # print('Ignoring unseen feature %s' % fname)
            del featureset[fname]

    # Find the log probability of each label, given the features.
    # Start with the log probability of the label itself.
    logprob = {}
    for label in self._labels:
        logprob[label] = self._label_probdist.logprob(label)

    # Then add in the log probability of features given labels.
    for label in self._labels:
        for (fname, fval) in featureset.items():
            if (label, fname) in self._feature_probdist:
                feature_probs = self._feature_probdist[label, fname]
                logprob[label] += feature_probs.logprob(fval)
            else:
                # nb: This case will never come up if the
                # classifier was created by
                # NaiveBayesClassifier.train().
                logprob[label] += sum_logs([])  # = -INF.

    return DictionaryProbDist(logprob, normalize=True, log=True)
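# A minimal usage sketch for the method above, assuming the standard
# nltk.classify.NaiveBayesClassifier API; the toy featuresets are
# hypothetical.
from nltk.classify import NaiveBayesClassifier

train_set = [
    ({'contains(good)': True}, 'pos'),
    ({'contains(bad)': True}, 'neg'),
]
classifier = NaiveBayesClassifier.train(train_set)

# prob_classify() returns a DictionaryProbDist over the labels; unseen
# feature names such as 'contains(unknown)' are discarded up front.
dist = classifier.prob_classify({'contains(good)': True, 'contains(unknown)': True})
print(dist.max(), dist.prob('pos'), dist.prob('neg'))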
def prob_classify(_classifier, featureset):
    # Discard any feature names that we've never seen before.
    # Otherwise, we'll just assign a probability of 0 to
    # everything.
    featureset = featureset.copy()
    for fname in list(featureset.keys()):
        for label in _classifier._labels:
            if (label, fname) in _classifier._feature_probdist:
                break
        else:
            # print('Ignoring unseen feature %s' % fname)
            del featureset[fname]

    # Find the log probability of each label, given the features.
    # Start with the log probability of the label itself.
    logprob = {}
    prob_features = []  # list of {'word': ..., <label>: logprob} dicts
    for label in _classifier._labels:
        logprob[label] = _classifier._label_probdist.logprob(label)

    # Then add in the log probability of features given labels.
    for label in _classifier._labels:
        for (fname, fval) in featureset.items():
            if (label, fname) in _classifier._feature_probdist:
                feature_probs = _classifier._feature_probdist[label, fname]
                # print(f"{fname} Label: {label}, Prob: {feature_probs.logprob(fval)}")
                logprob[label] += feature_probs.logprob(fval)
                if fval:
                    data = {
                        'word': fname[9:-1],
                        label: feature_probs.logprob(fval),
                    }
                    prob_features.append(data)
            else:
                # nb: This case will never come up if the
                # classifier was created by
                # NaiveBayesClassifier.train().
                # print(f"{fname} Label[else]: {label}, Prob: {sum_logs([])}")
                logprob[label] += sum_logs([])  # = -INF.
        # print(f"Label: {label}, Prob: {logprob[label]}")

    # Merge the per-label entries for each word into a single dict.
    words_prob = {}
    for item in prob_features:
        if item['word'] in words_prob:
            words_prob[item['word']].update(item)
        else:
            words_prob[item['word']] = item
    words_prob = [val for (_, val) in words_prob.items()]
    # print(f"prob_features: {words_prob}")

    return logprob, words_prob
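# This variant returns the raw logprob dict plus per-word contributions
# rather than a DictionaryProbDist. The fname[9:-1] slice presumably
# strips a 'contains(...)' wrapper from each feature name (the nine
# characters of 'contains(' plus the trailing ')'); a quick check of
# that assumption:
fname = 'contains(turkey)'
assert fname[9:-1] == 'turkey'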
def log_renormalise(self, r, nr):
    '''
    Calculates the renormalisation factor for observed sample types.
    '''
    log_prob_cov = sum_logs([log(nr_, 2) + self.log_prob_measure(r_)
                             for r_, nr_ in zip(r, nr)])
    if self._prob_measure(0) < 1:
        self.log_renormal = log(1 - self._prob_measure(0), 2) + log_prob_cov
        # Logs are base 2 throughout, so invert with 2 ** x, not exp().
        self._renormal = 2 ** self.log_renormal
    else:
        # If this happens, Good-Turing smoothing is probably a bad idea...
        self.log_renormal = float('-inf')
        self._renormal = 0.0
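# log_renormalise computes log2((1 - P(0)) * sum_r(N_r * P(r))) in log
# space via sum_logs. A minimal sketch of a base-2 sum_logs consistent
# with its use here (nltk.probability.sum_logs behaves equivalently,
# returning an effectively -inf value for an empty list, which is what
# the "# = -INF." comments elsewhere refer to):
from math import log2

def sum_logs(logs):
    # Stable log2(sum(2**x for x in logs)), factored around the max term.
    if not logs:
        return float('-inf')
    m = max(logs)
    return m + log2(sum(2 ** (x - m) for x in logs))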
def _prob_classify(self, input):
    # Make a featureset of the input after tokenizing it
    input_tokenized_featureset = self._tokenizeInputToFeatures(input).copy()

    # Ensure that all the feature names are valid and can be used
    # (wrap keys() in list() so we can delete while iterating)
    for input_feature_name in list(input_tokenized_featureset.keys()):
        for label in self._labels:
            if (label, input_feature_name) in self._featureProbabilityDistribution:
                break
        else:
            # print('Ignoring unseen feature %s' % input_feature_name)
            del input_tokenized_featureset[input_feature_name]

    # Start each label at a log probability of 0 (no label prior) to
    # avoid skewing towards labels with larger data sets
    logprob = {}
    for label in self._labels:
        logprob[label] = 0

    # Add in the log probability of features given labels.
    # Iterate through the assigned labels, e.g. location, time, noise
    for label in self._labels:
        # Iterate through the input feature set one by one, e.g. {turkey: True, bacon: True}
        for (input_feature_name, input_feature_val) in input_tokenized_featureset.items():
            # If the combination, e.g. (location, turkey), appears in the
            # training set, add its log probability
            if (label, input_feature_name) in self._featureProbabilityDistribution:
                feature_probs = self._featureProbabilityDistribution[label, input_feature_name]
                logprob[label] += feature_probs.logprob(input_feature_val)
            else:
                # nb: This case will never come up if the classifier was
                # created by NaiveBayesClassifier.train().
                logprob[label] += sum_logs([])  # = -INF.

    # Print out the log prob for each label before normalizing:
    # for key, value in self._featureProbabilityDistribution.items():
    #     print("key value of featureProbabilityDistribution " + str(key) + "," + str(value.freqdist()))
    # print("log prob with features is " + str(logprob))
    dictprobDist = DictionaryProbDist(logprob, normalize=True, log=True)

    # Print out the probability for each label:
    # for label in dictprobDist.samples():
    #     print(label + " is probability " + str(dictprobDist.prob(label)))
    return dictprobDist
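# _tokenizeInputToFeatures() is not shown in this snippet. Judging from
# the comments above, it must return a dict of feature name -> value; a
# hypothetical bag-of-words sketch:
def _tokenizeInputToFeatures(self, input):
    # e.g. "turkey bacon" -> {'turkey': True, 'bacon': True}
    return {token: True for token in input.lower().split()}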
def prob_classify(self, featureset):
    featureset = featureset.copy()
    for fname in list(featureset.keys()):
        for label in self._labels:
            if (label, fname) in self._feature_probdist:
                break
        else:
            del featureset[fname]

    logprob = {}
    for label in self._labels:
        logprob[label] = self._label_probdist.logprob(label)

    for label in self._labels:
        for (fname, fval) in featureset.items():
            if (label, fname) in self._feature_probdist:
                feature_probs = self._feature_probdist[label, fname]
                logprob[label] += feature_probs.logprob(fval)
            else:
                logprob[label] += sum_logs([])  # = -INF.

    return DictionaryProbDist(logprob, normalize=True, log=True)
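# DictionaryProbDist(logprob, normalize=True, log=True) renormalises the
# base-2 log scores so the label probabilities sum to 1. A small worked
# example with hypothetical scores:
from nltk.probability import DictionaryProbDist

dist = DictionaryProbDist({'pos': -1.0, 'neg': -2.0}, normalize=True, log=True)
print(dist.prob('pos'))  # 2**-1 / (2**-1 + 2**-2) = 0.666...
print(dist.prob('neg'))  # 2**-2 / (2**-1 + 2**-2) = 0.333...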