コード例 #1
0
    def text2vec(self, train_data):
        """Convert every document of ``train_data`` into a count vector.

        Column ``i`` of each vector accumulates the value that
        ``Vocabulary.getVocabularyByDocument`` reports for the ``i``-th entry
        of ``self.vocabulary`` (presumably its occurrence count in the
        document -- confirm against ``Vocabulary``). Features that are not
        part of the vocabulary are ignored.

        Args:
            train_data: Object whose ``x_data`` attribute is an iterable of
                documents accepted by ``Vocabulary.getVocabularyByDocument``.

        Returns:
            list: One 1-D ``numpy`` float array per document, in input order,
            each with one column per entry of ``self.vocabulary``.
        """
        # Materialise once so the vector length always matches the largest
        # index handed out by enumerate(), even if the vocabulary contains
        # duplicate entries (the dict below keeps the last index of each).
        vocabulary = list(self.vocabulary)
        idx_vocab = {word: idx for idx, word in enumerate(vocabulary)}
        n_features = len(vocabulary)

        documents = []
        for document in tqdm(train_data.x_data):
            doc_vocab = Vocabulary.getVocabularyByDocument(
                document, self.grams)
            occurrences = np.zeros(n_features)

            for feature in doc_vocab.keys():
                idx = idx_vocab.get(feature)
                if idx is None:
                    # Out-of-vocabulary feature: deliberately skipped.
                    continue
                occurrences[idx] += doc_vocab[feature]

            documents.append(occurrences)

        return documents
コード例 #2
0
    def predict(self, x_test):
        """Predict the highest-scoring class for every document in ``x_test``.

        For each class ``c`` found in ``self.y_train`` the score of a document
        is computed in log space as::

            log(Pc) + sum_f n_f * log(occr_c[f] + smoothing)
                    - (sum_f n_f) * log(tot_c + len(occr_c) * smoothing)

        where ``n_f`` is the value ``Vocabulary.getVocabularyByDocument``
        reports for feature ``f`` and ``Pc``, ``occr_c`` and ``tot_c`` are
        read from ``self.model`` under the keys ``'<c>_Pc'``, ``'<c>_occr'``
        and ``'<c>_tot'``.

        Args:
            x_test: Iterable of documents accepted by
                ``Vocabulary.getVocabularyByDocument``.

        Returns:
            list: The predicted class label for each document, in input order.
        """
        smoothing = self.smoothing or 0

        # Per-class quantities do not depend on the document, so read them
        # from the model once instead of once per feature.
        class_stats = []
        for cl in np.unique(self.y_train):
            occurrences = self.model['{}_occr'.format(cl)]
            total = self.model['{}_tot'.format(cl)]
            class_stats.append((
                cl,
                np.log(self.model['{}_Pc'.format(cl)]),
                occurrences,
                total + len(occurrences) * smoothing,
            ))

        predictions = []
        for document in x_test:
            # Get the vocabulary of the document
            vocab = Vocabulary.getVocabularyByDocument(document, self.grams)
            n_tokens = sum(vocab.values())
            probs = {}

            for cl, log_prior, occurrences, denominator in class_stats:
                prob = log_prior

                for feature in vocab.keys():
                    try:
                        freq = occurrences[feature]
                    except KeyError:
                        # Feature never seen for this class during training.
                        freq = 0

                    weight = freq + smoothing
                    # NOTE(review): with no smoothing an unseen feature adds
                    # 0 to the log score (i.e. is ignored) rather than
                    # driving it to -inf; kept as in the original.
                    if weight > 0:
                        prob = prob + vocab[feature] * np.log(weight)

                # Skip the normalisation for a document without features:
                # it contributes nothing and would otherwise evaluate
                # 0 * log(0) = nan for an empty, unsmoothed class.
                if n_tokens:
                    prob = prob - n_tokens * np.log(denominator)

                probs[cl] = prob

            predictions.append(max(probs, key=probs.get))

        return predictions