import logging
import os

from gensim import utils
from simserver import SessionServer

logger = logging.getLogger(__name__)


def GensimClient(texts):
    similarities = None
    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)
    logger.debug(u"%s" % server.status())
    try:
        corpus = [{u"id": u"doc_%i" % num,
                   u"tokens": utils.simple_preprocess(text)}
                  for num, text in enumerate(texts)]
        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)
        server.train(corpus, method=u"lsi")
        # index the same documents that we trained on...
        server.index(corpus)
        similarities = findSimilar(texts, server, corpus)
    except Exception, msg:
        logger.debug(u"%s" % msg)
    return similarities
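# GensimClient above delegates its reporting step to findSimilar, which is
# not defined anywhere in this section. Below is a minimal sketch of what
# such a helper could look like, assuming simserver's find_similar returns
# (document id, similarity score, payload) tuples; the dict return shape and
# the skip-self filter are assumptions, not the original helper.
def findSimilar(texts, server, corpus):
    similarities = {}
    for n in range(len(corpus)):
        doc = u"doc_%d" % n
        # keep (id, score) pairs for every hit other than the query doc itself
        hits = [(sim[0], float(sim[1]))
                for sim in server.find_similar(doc) if sim[0] != doc]
        similarities[doc] = hits
        logger.debug(u"%s -> %s" % (doc, hits))
    return similarities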
def GensimClient(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    logger.info(u"%s" % server.status())

    # ids must match the u"doc_%d" lookups used below
    corpus = [{u"id": u"doc_%i" % n,
               u"tokens": utils.simple_preprocess(text)}
              for n, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Option one: query by the id of an already indexed document
    for n in range(len(corpus)):
        doc = u"doc_%d" % n
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N doc_%d to %s" % (n, corpus[n][u"tokens"]))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc):
            # recover the document index from its id (handles doc_10 and up)
            m = int(sim[0].split(u"_")[-1])
            if m != n:
                logger.info(u"\t%s \t %3.2f : M %s"
                            % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                common = set(e) & set(d)
                logger.info(u"\t\tCommon Topics : %s" % list(common))

    if False:
        # Option two: query with a raw, unindexed document
        doc = {u"tokens": utils.simple_preprocess(
            u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4,
                                                max_results=50))
class SimService(object):
    def __init__(self, path, preprocess, deaccent=True, lowercase=True,
                 stemmer=None, stopwords=None):
        self.service = SessionServer(path)
        self.deaccent = deaccent
        self.lowercase = lowercase
        self.preprocess = preprocess
        self.stemmer = stemmer
        self.stopwords = stopwords

    def find_similar(self, data, min_score, max_results):
        if isinstance(data, basestring):
            doc = data.strip()
            if ' ' in doc:
                # a string with spaces is raw text: preprocess it into tokens;
                # otherwise treat the string as an already-indexed document id
                doc = {'tokens': self.preprocess(data, deacc=self.deaccent,
                                                 lowercase=self.lowercase,
                                                 errors='ignore',
                                                 stemmer=self.stemmer,
                                                 stopwords=self.stopwords)}
            try:
                return {'status': 'OK',
                        'response': self.service.find_similar(
                            doc, min_score=min_score, max_results=max_results)}
            except ValueError:
                return {'status': 'NOTFOUND', 'response': []}
        else:
            # a sequence of document ids: collect results per id, skipping
            # ids that are not in the index
            result = {}
            for doc in data:
                try:
                    result[doc] = self.service.find_similar(
                        doc, min_score=min_score, max_results=max_results)
                except ValueError:
                    pass
            if result:
                return {'status': 'OK', 'response': result}
            else:
                return {'status': 'NOTFOUND', 'response': []}

    def _buffer(self, data):
        # feed documents to the server, preprocessing raw text on the fly;
        # returns the number of documents buffered
        i = 0
        for d in data:
            if 'tokens' in d:
                self.service.buffer([{'id': d['id'], 'tokens': d['tokens']}])
            else:
                self.service.buffer([{'id': d['id'],
                                      'tokens': list(self.preprocess(
                                          d['text'], deacc=self.deaccent,
                                          lowercase=self.lowercase,
                                          errors='ignore',
                                          stemmer=self.stemmer,
                                          stopwords=self.stopwords))}])
            i += 1
        return i

    def train(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        i = self._buffer(data)
        self.service.train(method='lsi')
        logger.info('training complete, committing changes')
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': i}

    def index(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        i = self._buffer(data)
        self.service.index()
        logger.info('indexing complete, committing changes')
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': i}

    def optimize(self):
        self.service.set_autosession(False)
        self.service.open_session()
        self.service.optimize()
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': 'index optimized'}

    def delete(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        self.service.delete(data)
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': 'documents deleted'}

    def status(self):
        return {'status': 'OK', 'response': self.service.status()}

    def indexed_documents(self):
        return {'status': 'OK', 'response': self.service.keys()}

    def is_indexed(self, doc):
        return {'status': 'OK', 'response': doc in self.service.keys()}
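# A short usage sketch for SimService. gensim's simple_preprocess does not
# accept the lowercase/stemmer/stopwords keywords that SimService passes
# through, so a small wrapper around gensim.utils.tokenize is assumed here;
# the index path, stopword list, and document ids are illustrative only.
def preprocess(text, deacc=True, lowercase=True, errors='ignore',
               stemmer=None, stopwords=None):
    # tokenize, optionally deaccenting and lowercasing, then filter and stem
    tokens = list(utils.tokenize(text, lowercase=lowercase, deacc=deacc,
                                 errors=errors))
    if stopwords is not None:
        tokens = [t for t in tokens if t not in stopwords]
    if stemmer is not None:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

svc = SimService(os.path.join(os.getcwd(), 'sim_index'), preprocess,
                 stopwords=set(['the', 'of', 'a']))
docs = [{'id': 'doc_0',
         'text': 'Human machine interface for lab abc computer applications'},
        {'id': 'doc_1', 'tokens': ['graph', 'minors', 'survey']}]
print svc.train(docs)
print svc.index(docs)
print svc.find_similar('doc_0', min_score=0.2, max_results=10)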
def test_Gensim(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    u"""
    texts = [u"Human machine interface for lab abc computer applications",
             u"A survey of user opinion of computer system response time",
             u"The EPS user interface management system",
             u"System and human system engineering testing of EPS",
             u"Relation of user perceived response time to error measurement",
             u"The generation of random binary unordered trees",
             u"The intersection graph of paths in trees",
             u"Graph minors IV Widths of trees and well quasi ordering",
             u"Graph minors A survey",
             u"Why use a computer"]
    """
    logger.info(u"%s" % server.status())

    corpus = [{u"id": u"doc_%i" % num,
               u"tokens": utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Option one: query by the id of an already indexed document
    for n in range(len(texts)):
        doc = u"doc_%d" % n
        logger.info(u"Find similar doc_%d to %s" % (n, corpus[n][u"tokens"]))
        for sim in server.find_similar(doc):
            # recover the document index from its id (handles doc_10 and up)
            m = int(sim[0].split(u"_")[-1])
            if m != n:
                logger.info(u"\t%s \t %3.2f : %s"
                            % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                common = set(e) & set(d)
                logger.info(u"\tCommon Topics : %s\n" % list(common))

    if False:
        # Option two: query with a raw, unindexed document
        doc = {u"tokens": utils.simple_preprocess(
            u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4,
                                                max_results=50))
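# A hedged driver for test_Gensim, reusing the sample sentences quoted in the
# docstring above; the logging configuration is an assumption.
if __name__ == "__main__":
    logging.basicConfig(format=u"%(asctime)s : %(levelname)s : %(message)s",
                        level=logging.DEBUG)
    texts = [u"Human machine interface for lab abc computer applications",
             u"A survey of user opinion of computer system response time",
             u"The EPS user interface management system",
             u"System and human system engineering testing of EPS",
             u"Relation of user perceived response time to error measurement",
             u"The generation of random binary unordered trees",
             u"The intersection graph of paths in trees",
             u"Graph minors IV Widths of trees and well quasi ordering",
             u"Graph minors A survey",
             u"Why use a computer"]
    test_Gensim(texts)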