def getFullVocab(self, data):
    if 'uni' in self.grams:
        unigrams = Vocabulary.getUniGrams(data.x_data)
    else:
        unigrams = set()
    if 'bi' in self.grams:
        bigrams = Vocabulary.getBiGrams(data.x_data)
    else:
        bigrams = set()
    allgrams = unigrams | bigrams
    # Unigrams and bigrams should never collide, so the union keeps them all
    assert len(unigrams) + len(bigrams) == len(allgrams)
    # Now reduce the vocabulary size by thresholding on feature counts
    counts = Vocabulary.getFullDict(data.x_data, allgrams, self.grams)
    counts = {k: val for k, val in counts.items()
              if NBModel.thr_condition(k, val, unigrams, bigrams,
                                       self.threshold)}
    # Store a concrete set rather than a live dict view
    self.vocabulary = set(counts.keys())
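# `NBModel.thr_condition` is referenced above but not defined in this section.
# A minimal sketch of what it could look like, assuming a single count
# threshold applied uniformly to unigrams and bigrams (an assumption, not
# necessarily the original implementation):
@staticmethod
def thr_condition(key, val, unigrams, bigrams, threshold):
    # Keep the feature only if it is a known n-gram and occurs at least
    # `threshold` times in the corpus.
    return (key in unigrams or key in bigrams) and val >= threshold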
def train(self, x_train, y_train):
    assert len(self.grams) > 0, "You must provide what n-grams to use"
    assert (self.grams == 'uni' or self.grams == 'bi' or
            self.grams == ['uni', 'bi'] or self.grams == ['bi', 'uni']), \
        "Only uni- or bi-grams are implemented!"
    # First extract the vocabulary
    if 'uni' in self.grams:
        unigrams = Vocabulary.getUniGrams(x_train)
    else:
        unigrams = set()
    if 'bi' in self.grams:
        bigrams = Vocabulary.getBiGrams(x_train)
    else:
        bigrams = set()
    allgrams = unigrams | bigrams
    assert len(unigrams) + len(bigrams) == len(allgrams)
    # Get the total number of documents
    # Pc is the prior probability of the positive class (class 1)
    N = len(x_train)
    Pc = sum(y_train) / N
    probs = {}
    # Remember that y_train == 1 -> positive
    for cl in np.unique(y_train):
        # Collect the documents belonging to this class
        cl_docs = [x_train[i]
                   for i in np.where(np.array(y_train) == cl)[0].tolist()]
        counts = Vocabulary.getFullDict(cl_docs, allgrams, self.grams)
        # Apply the threshold: drop features that appear fewer than
        # `threshold` times
        counts = {k: val for k, val in counts.items()
                  if NBModel.thr_condition(k, val, unigrams, bigrams,
                                           self.threshold)}
        probs['{}_occr'.format(cl)] = counts
        probs['{}_tot'.format(cl)] = sum(counts.values())
        probs['{}_Pc'.format(cl)] = Pc if cl == 1 else 1 - Pc
    self.model = probs
    self.y_train = y_train
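# A minimal prediction sketch (not part of the original code) showing how the
# stored `self.model` dictionary could be used at inference time. It assumes
# Laplace (add-one) smoothing and that `getFullVocab` has already been called
# so `self.vocabulary` is populated. `features` is a hypothetical list of a
# document's n-grams, extracted the same way as during training.
def predict_doc(self, features):
    import math
    V = len(self.vocabulary)
    scores = {}
    for cl in (0, 1):
        occr = self.model['{}_occr'.format(cl)]
        tot = self.model['{}_tot'.format(cl)]
        # Start from the log prior for the class
        score = math.log(self.model['{}_Pc'.format(cl)])
        for f in features:
            # Add-one smoothing keeps unseen features from zeroing the score
            score += math.log((occr.get(f, 0) + 1) / (tot + V))
        scores[cl] = score
    # Return the class with the highest log posterior
    return max(scores, key=scores.get)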