def transform(self, sentence):
    """Map a raw sentence to its LDA topic distribution.

    The sentence is normalized via ``exp_replace.replace_reg``, tokenized,
    lower-cased, stop-word filtered, and stemmed, then converted to a
    bag-of-words against the fitted dictionary. ``fit`` must have been
    called (or the model otherwise loaded) before using this method.

    Returns the list of ``(topic_id, probability)`` pairs produced by
    the trained LDA model.
    """
    normalized = exp_replace.replace_reg(sentence)
    stems = [
        self.porter.stem(word.lower())
        for word in nltk.word_tokenize(normalized)
        if word.lower() not in self.stop
    ]
    bow = self.dictionary.doc2bow(stems)
    return self.lda[bow]
 def fit(self,documents):
     
     documents_mod = [exp_replace.replace_reg(sentence) for sentence in documents]
     tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
     tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]        
         
     self.dictionary = corpora.Dictionary(tokens)
     corpus = [self.dictionary.doc2bow(text) for text in tokens]
     self.lda = models.ldamodel.LdaModel(corpus,id2word=self.dictionary, num_topics=self.nbtopic,alpha=self.alpha)
     
     self.lda.save('topics.tp')
     self.dictionary.save('topics_dict.tp')
def grams_feature(features, sentence):
    """Add unigram and bigram presence features for *sentence*.

    The sentence is normalized via ``exp_replace.replace_reg``,
    tokenized, lower-cased, and stemmed with the module-level ``porter``
    stemmer. Every stemmed token and every space-joined stemmed bigram
    sets ``features["contains(<gram>)"] = 1.0``. Mutates *features* in
    place; returns None.
    """
    cleaned = exp_replace.replace_reg(sentence)

    # Spell check (intentionally disabled — too slow / noisy):
    # cleaned = str(TextBlob(cleaned).correct())

    stems = [porter.stem(tok.lower()) for tok in nltk.word_tokenize(cleaned)]
    pairs = [" ".join(pair) for pair in nltk.bigrams(stems)]

    for gram in stems + pairs:
        features["contains(%s)" % gram] = 1.0