Code example #1
    def getFullVocab(self, data):

        if 'uni' in self.grams:
            unigrams = Vocabulary.getUniGrams(data.x_data)
        else:
            unigrams = set()

        if 'bi' in self.grams:
            bigrams = Vocabulary.getBiGrams(data.x_data)
        else:
            bigrams = set()

        allgrams = unigrams | bigrams

        assert len(unigrams) + len(bigrams) == len(allgrams)

        # Count occurrences, then prune the vocabulary with the frequency threshold
        counts = Vocabulary.getFullDict(data.x_data, allgrams, self.grams)
        counts = {
            k: v for k, v in counts.items()
            if NBModel.thr_condition(k, v, unigrams, bigrams, self.threshold)
        }

        # Materialise the surviving keys as a concrete set rather than keeping a dict view
        self.vocabulary = set(counts.keys())
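
NBModel.thr_condition itself is not listed here. As a rough sketch only, based on the comment in code example #2 about dropping features that appear fewer than threshold times (the real method may well treat unigrams and bigrams differently, which is presumably why both sets are passed in), it could look like:

    # Hypothetical sketch of NBModel.thr_condition -- not the actual implementation.
    # Assumes the filter simply keeps any n-gram whose count reaches the threshold;
    # the unigrams/bigrams sets could be used to apply separate cut-offs per type.
    @staticmethod
    def thr_condition(gram, count, unigrams, bigrams, threshold):
        return count >= threshold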
Code example #2
    def train(self, x_train, y_train):
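        # NOTE: assumes `import numpy as np` at module level (np.unique and np.where are used below)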

        assert len(self.grams) > 0, "You must specify which n-grams to use"
        assert self.grams in ('uni', 'bi', ['uni', 'bi'], ['bi', 'uni']), \
            "Only uni- or bi-grams are implemented!"

        # First extract vocabulary
        if 'uni' in self.grams:
            unigrams = Vocabulary.getUniGrams(x_train)
        else:
            unigrams = set()

        if 'bi' in self.grams:
            bigrams = Vocabulary.getBiGrams(x_train)
        else:
            bigrams = set()

        allgrams = unigrams | bigrams

        assert len(unigrams) + len(bigrams) == len(allgrams)

        # Get number of total documents
        # Pc will be probability of class positive (class 1)
        N = len(x_train)
        Pc = sum(y_train) / N
        probs = {}

        # Remember that y_train = 1 -> positive
        for cl in np.unique(y_train):
            # Get documents for that class
            cl_docs = [
                x_train[i]
                for i in np.where(np.array(y_train) == cl)[0].tolist()
            ]
            counts = Vocabulary.getFullDict(cl_docs, allgrams, self.grams)

            # Implement the threshold - get rid of features
            # which appear less than threshold number of times
            counts = {
                k: v for k, v in counts.items()
                if NBModel.thr_condition(k, v, unigrams, bigrams, self.threshold)
            }

            probs['{}_occr'.format(cl)] = counts
            probs['{}_tot'.format(cl)] = sum(counts.values())
            probs['{}_Pc'.format(cl)] = Pc if cl == 1 else 1 - Pc

        self.model = probs
        self.y_train = y_train
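
For context, a minimal usage sketch follows. The NBModel constructor and the exact document format are not shown above, so the keyword arguments and the tokenised documents below are assumptions; only the grams and threshold attributes and the train signature come from the code itself.

    # Hypothetical usage -- the NBModel constructor is not shown above, so the
    # keyword arguments and the tokenised-document format are assumptions.
    model = NBModel(grams=['uni', 'bi'], threshold=2)
    x_train = [['great', 'fun', 'movie'], ['dull', 'boring', 'plot']]  # tokenised docs
    y_train = [1, 0]                                                   # 1 = positive class
    model.train(x_train, y_train)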