Esempio n. 1
0
class LSI:
	"""Latent Semantic Indexing similarity search over a list of documents.

	Constructing an instance immediately builds a ``MyCorpus`` from the given
	documents, trains a gensim ``LsiModel`` on it and builds a ``Similarity``
	index over the projected corpus. ``runQuery`` then looks a keyword string
	up in that index.
	"""

	# Dimensionality of the LSI space. The similarity index is built with the
	# same value: its ``num_features`` has to cover every topic id the model
	# can emit, so the two numbers must not drift apart.
	NUM_TOPICS = 300

	def __init__(self, documentList, stopList):
		"""Store the inputs and build the corpus, model and index.

		Args:
			documentList: documents passed on to ``MyCorpus``.
			stopList: stop words passed on to ``MyCorpus``.
		"""
		self.documentList = documentList
		self.stopList = stopList
		self.texts = []
		self.corpus = []
		# Not used by this class; kept so existing attribute access keeps working.
		self.tfidfModel = []
		self.lsiModel = []
		self.index = []
		self.utils = Utils()
		self.isInitialized = False
		self.initMe()

	def runQuery(self, keyword):
		"""Look ``keyword`` up in the similarity index.

		Args:
			keyword: query text; converted with ``self.corpus.convertToBOW``.

		Returns:
			A list of ``(position, value)`` pairs obtained by enumerating the
			index result for the projected query, or ``[]`` when the instance
			has not been initialised.
		"""
		if not self.isInitialized:
			return []
		vec = self.corpus.convertToBOW(keyword)
		sims = self.index[self.lsiModel[vec]]
		return list(enumerate(sims))

	def initMe(self):
		"""Build the corpus, the LSI model and the similarity index.

		Does nothing when ``self.isInitialized`` is already true.
		"""
		if self.isInitialized:
			return
		self.corpus = MyCorpus(self.documentList, self.stopList)
		self.lsiModel = models.LsiModel(
			self.corpus,
			id2word=self.corpus.getCorpusDictionary(),
			num_topics=self.NUM_TOPICS,
		)
		# NOTE(review): index shards are written with the prefix './', i.e.
		# into the current working directory - confirm this is intended.
		self.index = similarities.Similarity(
			'./',
			self.lsiModel[self.corpus],
			num_features=self.NUM_TOPICS,
			chunksize=128,
			shardsize=16384,
		)
		self.isInitialized = True
Esempio n. 2
0
	def initMe(self):
		"""Build the corpus, the LDA model and the similarity index.

		Does nothing when ``self.isInitialized`` is already true. Otherwise it
		sets ``self.corpus``, ``self.ldaModel`` and ``self.index`` and then
		marks the instance as initialised.
		"""
		if self.isInitialized:
			return
		# One value feeds both the model and the index so the index width
		# always matches the dimensionality of the vectors stored in it.
		numTopics = 10
		self.corpus = MyCorpus(self.documentList, self.stopList)
		self.ldaModel = models.LdaModel(
			self.corpus,
			id2word=self.corpus.getCorpusDictionary(),
			num_topics=numTopics,
		)
		# NOTE(review): index shards are written with the prefix './', i.e.
		# into the current working directory - confirm this is intended.
		self.index = similarities.Similarity(
			'./',
			self.ldaModel[self.corpus],
			num_features=numTopics,
			chunksize=128,
			shardsize=16384,
		)
		self.isInitialized = True