forked from jarpi/topic_modelling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lda.py
58 lines (51 loc) · 1.98 KB
/
lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
#!/usr/bin/env python
from utils import Utils
from corpus_iterator import MyCorpus
from gensim import corpora, models, similarities
class LDA:
    """Keyword-similarity index built on top of a gensim LDA model.

    On construction the documents are wrapped in a ``MyCorpus``, an
    ``LdaModel`` is trained on that corpus, and a
    ``similarities.Similarity`` index is built over the LDA-transformed
    corpus. ``runQuery`` then scores a keyword string against that index.
    """

    def __init__(self, documentList, stopList, num_topics=10):
        """Store the inputs and build the model and index immediately.

        Args:
            documentList: Documents handed to ``MyCorpus``.
            stopList: Stop words handed to ``MyCorpus``.
            num_topics: Number of LDA topics to train. Defaults to 10, the
                value that was previously hard-coded.
        """
        self.documentList = documentList
        self.stopList = stopList
        self.numTopics = num_topics
        self.texts = []
        self.corpus = []
        # Not used by this class; kept so existing attribute access on
        # instances keeps working.
        self.tfidfModel = []
        # Declared up front so the attribute always exists, even if
        # initMe() raises part-way through.
        self.ldaModel = None
        self.index = []
        self.utils = Utils()
        self.isInitialized = False
        self.initMe()

    def runQuery(self, keyword):
        """Score ``keyword`` against the similarity index.

        Args:
            keyword: Query text, converted with
                ``self.corpus.convertToBOW``.

        Returns:
            A list of ``(position, similarity)`` pairs produced by
            enumerating the index result, or an empty list when the
            instance has not been initialized.
        """
        if not self.isInitialized:
            return []
        vec = self.corpus.convertToBOW(keyword)
        sims = self.index[self.ldaModel[vec]]
        return list(enumerate(sims))

    def initMe(self):
        """Build the corpus, the LDA model and the similarity index.

        Does nothing when the instance is already initialized, so calling
        it repeatedly is safe.
        """
        if self.isInitialized:
            return
        self.corpus = MyCorpus(self.documentList, self.stopList)
        self.ldaModel = models.LdaModel(
            self.corpus,
            id2word=self.corpus.getCorpusDictionary(),
            num_topics=self.numTopics,
        )
        # Vectors in LDA space have one feature per topic, so the index
        # width must follow num_topics; a fixed width would break as soon
        # as more topics than that width were requested.
        self.index = similarities.Similarity(
            './',
            self.ldaModel[self.corpus],
            num_features=self.numTopics,
            chunksize=128,
            shardsize=16384,
        )
        self.isInitialized = True