Code example #1
    def __preprocess(self, data):
        # Build the word/stem/character/tag indices and encode every
        # sentence into fixed-length integer sequences for training.
        print("Number of sentences: ", len(data.groupby(['Sentence #'])))

        words = list(set(data["Word"].values))
        self.nWords = len(words)
        print("Number of words in the dataset: ", self.nWords)
        # Add 10% headroom to the vocabulary size.
        self.nWords += int(self.nWords * 0.1)

        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stems = list(set(stemmer.stem(word) for word in tqdm(words)))

        chars = {w_i for w in tqdm(words) for w_i in w}
        print(chars)
        self.nChars = len(chars)

        tags = list(set(data["Tag"].values))
        print("Tags:", tags)
        self.nTags = len(tags)
        print("Number of Tags: ", self.nTags)

        if self.model is None:
            self.wordIndex = WordIndex("UNK", self.nWords)
            self.stemIndex = WordIndex("UNK", self.nWords)
            self.tagIndex = WordIndex("O", self.nTags + 2)
            self.charIndex = WordIndex("UNK", self.nChars + 1)

        self.wordIndex.add(words)
        self.stemIndex.add(stems)
        self.tagIndex.add(tags)
        self.charIndex.add(chars)

        getter = SentenceGetter(data)

        sentences = getter.sentences

        encodedSentences = encodeSentences(sentences, self.wordIndex)
        encodedSentences = pad(encodedSentences, self.maxLengthSentence,
                               self.wordIndex.getPadIdx())

        encodedStems = encodeStems(sentences, self.stemIndex, stemmer)
        encodedStems = pad(encodedStems, self.maxLengthSentence,
                           self.stemIndex.getPadIdx())

        encodedTags = encodeTags(sentences, self.tagIndex)
        encodedTags = pad(encodedTags, self.maxLengthSentence,
                          self.tagIndex.getPadIdx())
        encodedTags = onehotEncodeTags(encodedTags, self.nTags)

        encodedChars = encodeChars(sentences, self.charIndex,
                                   self.maxLengthSentence, self.maxLengthWord)

        return (encodedSentences, encodedStems, encodedChars, encodedTags)
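The snippet above leans on project-specific helpers (WordIndex, SentenceGetter, encodeSentences, encodeStems, encodeTags, encodeChars, pad, onehotEncodeTags) that are not shown here. As a minimal, hypothetical sketch, a WordIndex consistent with the calls used above (a constructor taking an unknown-token label and a capacity, plus add(), getPadIdx() and getWord()) could look like the following; the actual class in the source project may well differ.

# Minimal sketch of the WordIndex helper assumed by the snippet above.
# Only the calls visible in the code are mirrored; details such as how the
# padding index is chosen are assumptions.
class WordIndex:
    def __init__(self, unknownToken, capacity):
        self.capacity = capacity
        self.word2idx = {unknownToken: 0}   # index 0 is reserved for unknown tokens
        self.idx2word = {0: unknownToken}

    def add(self, words):
        # Register every unseen token under the next free index.
        for word in words:
            if word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

    def getIdx(self, word):
        # Unknown tokens fall back to the reserved index 0.
        return self.word2idx.get(word, 0)

    def getWord(self, idx):
        return self.idx2word.get(idx, self.idx2word[0])

    def getPadIdx(self):
        # Assume the last slot of the reserved capacity is used for padding.
        return self.capacity - 1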
Code example #2
    def __preprocess(self, data):
        # This variant only builds the tag index; the sentences themselves are
        # returned as padded token sequences with one-hot encoded tags.
        print("Number of sentences: ", len(data.groupby(['Sentence #'])))

        tags = list(set(data["Tag"].values))
        print("Tags:", tags)
        self.nTags = len(tags)
        print("Number of Tags: ", self.nTags)

        if self.model is None:
            self.tagIndex = WordIndex("O", self.nTags + 2)

        self.tagIndex.add(tags)

        getter = SentenceGetter(data)

        sentences = getter.sentences

        paddedSentences = padSentences(sentences, self.maxLengthSentence)

        encodedTags = encodeTags(sentences, self.tagIndex)
        encodedTags = pad(encodedTags, self.maxLengthSentence,
                          self.tagIndex.getPadIdx())
        encodedTags = onehotEncodeTags(encodedTags, self.nTags)

        return (paddedSentences, encodedTags)

    def predict(self, sentence):
        # Tokenise and POS-tag the raw sentence before encoding it.
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        encodedInput = encodeSentences([words], self.wordIndex)
        encodedInput = pad(encodedInput, self.maxLengthSentence, self.wordIndex.getPadIdx())

        encodedLemmas = encodeLemmas([[[w[0], w[1]] for w in words]], self.lemmaIndex, self.lemmatizer)
        encodedLemmas = pad(encodedLemmas, self.maxLengthSentence, self.lemmaIndex.getPadIdx())

        encodedChars = encodeChars([words], self.charIndex, self.maxLengthSentence, self.maxLengthWord)

        # The model expects three parallel inputs: word ids, lemma ids and
        # per-word character ids, each with a leading batch dimension.
        prediction = self.model.predict([
                            np.array([encodedInput[0]]),
                            np.array([encodedLemmas[0]]),
                            np.array(encodedChars[0]).reshape(1, self.maxLengthSentence, self.maxLengthWord)
                        ])
        # Pick the highest-scoring tag index per token and map it back to its label.
        prediction = np.argmax(prediction, axis=-1)
        return zip(words, [self.tagIndex.getWord(p) for p in prediction[0]])
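For completeness, a hedged usage sketch of predict(): `ner` stands for an already trained instance of the class above (the variable name is an assumption), and nltk's tokenizer and POS-tagger resources must be available.

# Hypothetical usage of predict(); `ner` is an assumed name for a trained instance.
import nltk
nltk.download("punkt")                          # tokenizer model used by word_tokenize
nltk.download("averaged_perceptron_tagger")     # POS tagger used by pos_tag

for (token, posTag), entityTag in ner.predict("Steve Jobs founded Apple in California."):
    print(f"{token:15s} {posTag:6s} {entityTag}")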