def text2vec(self, train_data):
    """Convert each training document into a bag-of-features count vector.

    Parameters
    ----------
    train_data : object
        Must expose ``x_data``, an iterable of raw documents
        (presumably strings — confirm against the caller).

    Returns
    -------
    list[numpy.ndarray]
        One vector per document; entry ``i`` holds the occurrence count of
        the ``i``-th feature of ``self.vocabulary``. Features not present
        in the vocabulary are silently dropped.
    """
    # Map each vocabulary feature to its column index once, up front.
    idx_vocab = {word: idx for idx, word in enumerate(self.vocabulary)}
    documents = []
    for document in tqdm(train_data.x_data):
        doc_vocab = Vocabulary.getVocabularyByDocument(document, self.grams)
        occurrences = np.zeros(len(idx_vocab))
        for feature, count in doc_vocab.items():
            # Explicit membership check replaces the original bare
            # `except: continue`, which masked any unrelated error.
            idx = idx_vocab.get(feature)
            if idx is not None:
                occurrences[idx] += count
        documents.append(occurrences)
    return documents
def predict(self, x_test):
    """Predict a class label for each document using multinomial Naive Bayes.

    For every document the (log) posterior of each class is computed as
    log P(c) + sum_f count(f) * log(freq_c(f) + s)
            - len(doc) * log(total_c + |V_c| * s)
    where ``s`` is the smoothing constant, then the arg-max class is taken.

    Parameters
    ----------
    x_test : iterable
        Raw documents to classify.

    Returns
    -------
    list
        Predicted class label for each document, in input order.
    """
    smoothing = self.smoothing if self.smoothing else 0
    classes = np.unique(self.y_train)  # invariant across documents: hoisted
    predictions = []
    for document in x_test:
        # Bag-of-features representation of the document.
        vocab = Vocabulary.getVocabularyByDocument(document, self.grams)
        doc_len = sum(vocab.values())  # invariant across classes: hoisted
        probs = {}
        for cl in classes:
            occr = self.model['{}_occr'.format(cl)]
            total = self.model['{}_tot'.format(cl)]
            size_v = len(occr)
            # Start from the log prior P(c).
            prob = np.log(self.model['{}_Pc'.format(cl)])
            for feature, count in vocab.items():
                # dict.get replaces the original bare `except`, which
                # would have swallowed any error, not just KeyError.
                freq = occr.get(feature, 0)
                # Guard against log(0) when smoothing is 0; falling back
                # to a zero contribution mirrors the original behaviour.
                if freq + smoothing > 0:
                    prob += count * np.log(freq + smoothing)
            # Length normalisation term, applied once per class.
            probs[cl] = prob - doc_len * np.log(total + size_v * smoothing)
        predictions.append(max(probs, key=probs.get))
    return predictions