def setUp(self):
    """Build the test fixture: mock documents plus a similarity server.

    First tries to attach to an already-running remote SessionServer
    registered with the Pyro4 name server as 'gensim.testserver' and pings
    it via ``status()``.  If the import, lookup, or ping fails for any
    reason, falls back to starting a local SessionServer in a fresh
    random directory.
    """
    self.docs = mock_documents('en', '')
    try:
        import Pyro4
        self.server = Pyro4.Proxy('PYRONAME:gensim.testserver')
        logger.info(self.server.status())
    except Exception:
        # Broad catch is deliberate: any failure (Pyro4 missing, no name
        # server, proxy unreachable) means "fall back to a local server".
        logger.info("could not locate running SessionServer; starting a local server")
        self.server = SessionServer(gensim.utils.randfname())
def commit_indexing_set_old(self):
    """Index every row of this training's indexing set, then reset the set.

    Reads rows in batches of 500 from the SQLite table
    ``gensimIndexingSet<training_id>``, converts each batch into a corpus
    of ``{'id', 'tokens'}`` documents and hands it to the similarity
    server's ``index()``.  Afterwards the indexing set is deleted and
    re-initialised via ``delete_set()`` / ``init_indexing_set()``.

    Returns:
        str: 'indexing done' on completion.
    """
    import sqlite3
    training_id = str(self.training_id)
    conn = sqlite3.connect(self.sqlserver)
    try:
        c = conn.cursor()
        # NOTE(review): table name built by string concatenation — safe only
        # while training_id is a trusted integer; confirm upstream callers.
        sql = "SELECT * FROM gensimIndexingSet" + training_id
        print(sql)
        c.execute(sql)
        indexing_data = c.fetchmany(500)

        # Talk to the remote test server through Pyro4.  (A previous
        # revision also started a local SessionServer here, but its handle
        # was immediately overwritten by this proxy, leaking the unused
        # server — the dead construction has been removed.)
        import Pyro4
        service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
        print(self.rootlocation + 'gensimTraining' + training_id)

        while len(indexing_data) > 0:
            # Rows arrive as (text, id); the server expects string ids.
            corpus = [{'id': str(doc_id), 'tokens': utils.simple_preprocess(text)}
                      for (text, doc_id) in indexing_data]
            service.index(corpus)
            indexing_data = c.fetchmany(500)

        service.autosession = True
        time.sleep(0.5)  # give the server a moment to commit the session
    finally:
        # The original leaked the connection; always close it.
        conn.close()

    self.delete_set()
    self.init_indexing_set()
    return 'indexing done'
def commit_WIKIPEDIA_training_set(self, factor=20000, batches=100):
    """Train the similarity model on the Wikipedia set in batches.

    Pulls text rows from SQL in windows of ``factor`` rows, converts each
    window into a corpus of ``{'id', 'tokens'}`` documents and feeds it to
    the session server's LSI training.

    Args:
        factor: number of text rows fetched per batch (default 20000,
            matching the previous hard-coded value).
        batches: number of consecutive batches to process (default 100,
            matching the previous hard-coded range).

    Returns:
        str: 'training done' on completion.
    """
    from packages.controller.gensim_sim import gensim_sim
    w = gensim_sim()
    # Start (or resume) a local session server for this training run.
    service = SessionServer(
        self.rootlocation + 'gensimTraining' + str(self.training_id),
        autosession=True)
    for d in range(batches):
        # Original message lacked spacing ("20000up to40000"); fixed format.
        print("currently working on text rows %d up to %d" % (d * factor, (d + 1) * factor))
        training_data = w.init_sql_connection(d * factor, factor)
        # Rows arrive as (id, text) here — note the opposite order from
        # the indexing set.
        corpus = [{'id': doc_id, 'tokens': utils.simple_preprocess(text)}
                  for (doc_id, text) in training_data]
        service.train(corpus, method='lsi')
    return 'training done'