Code Example #1
File: TokenizingClass.py  Project: vedantdesai13/NLP
    def bag_of_words(self, words, vocab):
        # Preprocess the raw text: strip special characters, lowercase,
        # drop stopwords, lemmatize, then split into tokens.
        words = b.rem_special_chars(words)
        words = b.convert_to_lowcase(words)
        words = b.rem_stopwords(words)
        words = b.lemma(words)
        words = self.text_to_word(words)

        # Count how often each vocabulary entry appears among the tokens.
        bag = np.zeros(len(vocab))
        for w in words:
            for i, x in enumerate(vocab):
                if x == w:
                    bag[i] += 1
        return bag  # bag is already an ndarray; wrapping it in np.array again was redundant
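The nested loop in bag_of_words scans the entire vocabulary once per token, which is O(len(words) × len(vocab)). Below is a minimal sketch of an equivalent counting step with O(1) lookups via a precomputed index; bag_of_words_indexed is a hypothetical name, and the sketch assumes the tokens are already preprocessed and the vocabulary holds no duplicates (the script below actually passes the raw token list s as vocab, so deduplicate first, e.g. with sorted(set(s))):

import numpy as np

def bag_of_words_indexed(words, vocab):
    # Map each vocabulary entry to its position once, so each token
    # lookup is O(1) instead of a linear scan over vocab.
    index = {x: i for i, x in enumerate(vocab)}
    bag = np.zeros(len(vocab))
    for w in words:
        if w in index:          # tokens outside the vocabulary are ignored
            bag[index[w]] += 1
    return bag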
Code Example #2
File: TokenizingClass.py  Project: vedantdesai13/NLP
    def bag_of_words(self, words, vocab):
        words = b.rem_special_chars(words)
        words = b.convert_to_lowcase(words)
        words = b.rem_stopwords(words)
        words = b.lemma(words)
        words = self.text_to_word(words)

        bag = np.zeros(len(vocab))
        for w in words:
            for i, x in enumerate(vocab):
                if x == w:
                    bag[i] += 1
        return bag  # already an ndarray, as in Example #1


# Read data from a file
with open('text2.txt', 'r') as f:
    corpus = f.read()

# Run the whole corpus through the same preprocessing pipeline
corpus = b.rem_special_chars(corpus)
corpus = b.convert_to_lowcase(corpus)
corpus = b.rem_stopwords(corpus)
corpus = b.lemma(corpus)

# Tokenize the cleaned corpus and report basic statistics
t = Tokenizer()
s = t.text_to_word(corpus)
print("word count =", t.word_count(s))
print("number of unique words =", t.number_of_unique_words(s))

# Vectorize a query against the corpus tokens. Note that s is the full
# token list, not a deduplicated vocabulary, so repeated corpus words
# yield repeated slots in the resulting bag vector.
inp = "autonomous individuals mutual aid self governance"
print("input =", inp)
print("bag of words =", t.bag_of_words(inp, s))