import nltk
import numpy as np
from nltk.stem import SnowballStemmer
from tqdm import tqdm


def __preprocess(self, data):
    print("Number of sentences: ", len(data.groupby(['Sentence #'])))
    words = list(set(data["Word"].values))
    self.nWords = len(words)
    print("Number of words in the dataset: ", self.nWords)
    # Reserve 10% extra vocabulary slots for words unseen during training.
    self.nWords += int(self.nWords * 0.1)
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    stems = list(set(stemmer.stem(word) for word in tqdm(words)))
    chars = set([w_i for w in tqdm(words) for w_i in w])
    print(chars)
    self.nChars = len(chars)
    tags = list(set(data["Tag"].values))
    print("Tags:", tags)
    self.nTags = len(tags)
    print("Number of Tags: ", self.nTags)
    # Build the lookup indexes only on the first run; a loaded model
    # already carries its own vocabulary.
    if self.model is None:
        self.wordIndex = WordIndex("UNK", self.nWords)
        self.stemIndex = WordIndex("UNK", self.nWords)
        self.tagIndex = WordIndex("O", self.nTags + 2)
        self.charIndex = WordIndex("UNK", self.nChars + 1)
        self.wordIndex.add(words)
        self.stemIndex.add(stems)
        self.tagIndex.add(tags)
        self.charIndex.add(chars)
    getter = SentenceGetter(data)
    sentences = getter.sentences
    # Encode every input view (words, stems, characters) and the labels,
    # padding each to the fixed sentence length.
    encodedSentences = encodeSentences(sentences, self.wordIndex)
    encodedSentences = pad(encodedSentences, self.maxLengthSentence,
                           self.wordIndex.getPadIdx())
    encodedStems = encodeStems(sentences, self.stemIndex, stemmer)
    encodedStems = pad(encodedStems, self.maxLengthSentence,
                       self.stemIndex.getPadIdx())
    encodedTags = encodeTags(sentences, self.tagIndex)
    encodedTags = pad(encodedTags, self.maxLengthSentence,
                      self.tagIndex.getPadIdx())
    encodedTags = onehotEncodeTags(encodedTags, self.nTags)
    encodedChars = encodeChars(sentences, self.charIndex,
                               self.maxLengthSentence, self.maxLengthWord)
    return (encodedSentences, encodedStems, encodedChars, encodedTags)
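# The pad() helper used above is not shown in this excerpt. A minimal sketch
# of what it could look like, assuming it wraps Keras' pad_sequences (the
# real project code may differ):
from tensorflow.keras.preprocessing.sequence import pad_sequences

def pad(sequences, maxLength, padIdx):
    # Right-pad (and truncate) every encoded sentence to a fixed length,
    # filling with the index reserved for padding.
    return pad_sequences(sequences, maxlen=maxLength, padding='post',
                         truncating='post', value=padIdx)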
def __preprocess(self, data):
    print("Number of sentences: ", len(data.groupby(['Sentence #'])))
    tags = list(set(data["Tag"].values))
    print("Tags:", tags)
    self.nTags = len(tags)
    print("Number of Tags: ", self.nTags)
    # Build the tag index only on the first run; a loaded model already
    # carries its own tag vocabulary.
    if self.model is None:
        self.tagIndex = WordIndex("O", self.nTags + 2)
        self.tagIndex.add(tags)
    getter = SentenceGetter(data)
    sentences = getter.sentences
    # This variant feeds raw (padded) sentences to the model and only
    # integer-encodes and one-hot encodes the labels.
    paddedSentences = padSentences(sentences, self.maxLengthSentence)
    encodedTags = encodeTags(sentences, self.tagIndex)
    encodedTags = pad(encodedTags, self.maxLengthSentence,
                      self.tagIndex.getPadIdx())
    encodedTags = onehotEncodeTags(encodedTags, self.nTags)
    return (paddedSentences, encodedTags)
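# onehotEncodeTags is also not shown in this excerpt. A minimal sketch,
# assuming it relies on Keras' to_categorical, and that the two extra
# classes correspond to the slots reserved by WordIndex("O", nTags + 2)
# above (an assumption inferred from that constructor call):
import numpy as np
from tensorflow.keras.utils import to_categorical

def onehotEncodeTags(encodedTags, nTags):
    # Turn each integer tag id into a one-hot vector so the labels can be
    # trained against with categorical cross-entropy.
    return np.array([to_categorical(sentence, num_classes=nTags + 2)
                     for sentence in encodedTags])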
def predict(self, sentence):
    # Tokenize and POS-tag the raw sentence, then encode it exactly as the
    # training data was encoded: word ids, lemma ids, and character ids.
    words = nltk.pos_tag(nltk.word_tokenize(sentence))
    encodedInput = encodeSentences([words], self.wordIndex)
    encodedInput = pad(encodedInput, self.maxLengthSentence,
                       self.wordIndex.getPadIdx())
    encodedLemmas = encodeLemmas([[[w[0], w[1]] for w in words]],
                                 self.lemmaIndex, self.lemmatizer)
    encodedLemmas = pad(encodedLemmas, self.maxLengthSentence,
                        self.lemmaIndex.getPadIdx())
    encodedChars = encodeChars([words], self.charIndex,
                               self.maxLengthSentence, self.maxLengthWord)
    prediction = self.model.predict([
        np.array([encodedInput[0]]),
        np.array([encodedLemmas[0]]),
        np.array(encodedChars[0]).reshape(1, self.maxLengthSentence,
                                          self.maxLengthWord)
    ])
    # Pick the highest-scoring tag per token and map ids back to tag names.
    prediction = np.argmax(prediction, axis=-1)
    return zip(words, [self.tagIndex.getWord(p) for p in prediction[0]])
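# Usage sketch (names hypothetical; the class that owns predict() is not
# named in this excerpt). predict() pairs each (word, POS) tuple produced by
# nltk.pos_tag with its predicted tag, so the caller unpacks two levels.
# nltk needs its data packages downloaded once, e.g. nltk.download('punkt')
# and nltk.download('averaged_perceptron_tagger').
#
#   tagger = NerModel.load("model.h5")   # hypothetical loader
#   for (word, pos), tag in tagger.predict("Angela Merkel visited Paris."):
#       print(word, tag)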