def transform(self, sentence):
    """Return the LDA model's output for a single sentence.

    The sentence is passed through ``exp_replace.replace_reg``, tokenized
    with ``nltk.word_tokenize``, lower-cased, filtered against ``self.stop``
    and stemmed with ``self.porter`` before being converted to a
    bag-of-words by ``self.dictionary`` and looked up in ``self.lda``.

    NOTE(review): ``self.dictionary`` and ``self.lda`` are not set here;
    presumably ``fit`` must have been called first -- confirm.
    """
    cleaned = exp_replace.replace_reg(sentence)

    stems = []
    for word in nltk.word_tokenize(cleaned):
        lowered = word.lower()
        # Stop-word filtering happens on the lower-cased, un-stemmed token.
        if lowered not in self.stop:
            stems.append(self.porter.stem(lowered))

    bow = self.dictionary.doc2bow(stems)
    return self.lda[bow]
def grams_feature(features, sentence):
    """Add unigram and bigram presence features for ``sentence``.

    The sentence is passed through ``exp_replace.replace_reg``, tokenized
    with ``nltk.word_tokenize``, and every token is lower-cased and stemmed
    with the module-level ``porter`` object. For each stemmed token and for
    each pair of adjacent stemmed tokens (joined by a single space), the
    key ``'contains(<gram>)'`` is set to ``1.0`` in ``features``.

    ``features`` is modified in place; nothing is returned.
    """
    sentence_reg = exp_replace.replace_reg(sentence)
    tokens = [porter.stem(t.lower()) for t in nltk.word_tokenize(sentence_reg)]

    # Unpack each pair instead of binding it to a variable called ``tuple``,
    # which would shadow the builtin.
    bigrams = [first + ' ' + second for first, second in nltk.bigrams(tokens)]

    for gram in tokens + bigrams:
        features['contains(%s)' % gram] = 1.0
def fit(self, documents):
    """Build the dictionary and LDA model from ``documents`` and save both.

    Each document is passed through ``exp_replace.replace_reg``, tokenized
    with ``nltk.word_tokenize``, lower-cased, filtered against ``self.stop``
    and stemmed with ``self.porter``. The resulting token lists are used to
    build ``self.dictionary`` and a bag-of-words corpus, on which
    ``self.lda`` is created with ``self.nbtopic`` topics and ``self.alpha``.

    Afterwards ``save`` is called on the model with ``'topics.tp'`` and on
    the dictionary with ``'topics_dict.tp'``.

    NOTE(review): ``corpora`` / ``models`` are presumably gensim modules --
    confirm against the file's imports.
    """
    # Stage 1 and 2: clean every document, then tokenize every document.
    cleaned = list(map(exp_replace.replace_reg, documents))
    tokenized = list(map(nltk.word_tokenize, cleaned))

    # Stage 3: drop stop words (checked on the lower-cased token) and stem.
    stemmed = []
    for words in tokenized:
        kept = []
        for word in words:
            lowered = word.lower()
            if lowered not in self.stop:
                kept.append(self.porter.stem(lowered))
        stemmed.append(kept)

    self.dictionary = corpora.Dictionary(stemmed)
    bow_corpus = [self.dictionary.doc2bow(doc) for doc in stemmed]

    self.lda = models.ldamodel.LdaModel(
        bow_corpus,
        id2word=self.dictionary,
        num_topics=self.nbtopic,
        alpha=self.alpha,
    )

    self.lda.save('topics.tp')
    self.dictionary.save('topics_dict.tp')
def grams_feature(features, sentence):
    """Record which unigrams and bigrams occur in ``sentence``.

    After ``exp_replace.replace_reg`` and ``nltk.word_tokenize``, every
    token is lower-cased and stemmed with the module-level ``porter``
    object. Each stem, and each space-joined pair of adjacent stems, is
    written into ``features`` under the key ``'contains(<gram>)'`` with the
    value ``1.0``. The dictionary is updated in place; nothing is returned.
    """
    cleaned = exp_replace.replace_reg(sentence)
    # Spell check (disabled)
    # cleaned = TextBlob(cleaned)
    # cleaned = str(cleaned.correct())
    words = nltk.word_tokenize(cleaned)
    stems = [porter.stem(word.lower()) for word in words]
    pairs = [' '.join(pair) for pair in nltk.bigrams(stems)]
    for gram in stems + pairs:
        features['contains(%s)' % gram] = 1.0
def grams_feature(features, sentence):
    """Set a ``"contains(...)"`` feature for every unigram and bigram.

    ``sentence`` goes through ``exp_replace.replace_reg`` and
    ``nltk.word_tokenize``; each token is then lower-cased and stemmed with
    the module-level ``porter`` object. Unigrams are the stems themselves,
    bigrams are adjacent stems joined by one space. ``features`` is mutated
    in place (each matching key is set to ``1.0``) and nothing is returned.
    """
    normalized = exp_replace.replace_reg(sentence)
    # Spell check (disabled)
    # normalized = TextBlob(normalized)
    # normalized = str(normalized.correct())

    stems = []
    for word in nltk.word_tokenize(normalized):
        stems.append(porter.stem(word.lower()))

    # Build all bigram strings up front, then write unigrams followed by
    # bigrams so keys are inserted in the same order as a single combined list.
    pairs = [left + " " + right for left, right in nltk.bigrams(stems)]

    for gram in stems + pairs:
        features["contains(%s)" % gram] = 1.0