def setUp(self):
    """Prepare the mock documents and a server for the tests to run against.

    First tries to attach to a server registered with the Pyro4 name
    server under ``gensim.testserver``. If anything in that attempt fails
    (Pyro4 not importable, lookup failure, ``status()`` call failing), a
    local ``SessionServer`` is created at the location returned by
    ``gensim.utils.randfname()`` instead.
    """
    self.docs = mock_documents('en', '')
    try:
        import Pyro4
        self.server = Pyro4.Proxy('PYRONAME:gensim.testserver')
        logger.info(self.server.status())
    except Exception:
        # Deliberately broad: any failure to reach the remote server means
        # "fall back to a local one", not "abort the test".
        logger.info(
            "could not locate running SessionServer; starting a local server"
        )
        self.server = SessionServer(gensim.utils.randfname())
Example #2
0
    def commit_indexing_set_old(self):
        """Index the rows collected in this training's indexing-set table.

        Reads the ``gensimIndexingSet<training_id>`` table from the SQLite
        database at ``self.sqlserver`` in batches of 500 rows, tokenizes each
        row's text with ``utils.simple_preprocess`` and passes every batch to
        ``index()`` on the server looked up as ``gensim.testserver`` through
        the Pyro4 name server. Afterwards the indexing set is reset by calling
        ``self.delete_set()`` and ``self.init_indexing_set()``.

        Returns:
            The string ``'indexing done'``.
        """
        import sqlite3

        batch_size = 500
        training_id = str(self.training_id)
        location = self.rootlocation + 'gensimTraining' + training_id

        conn = sqlite3.connect(self.sqlserver)
        try:
            c = conn.cursor()

            # Table names cannot be bound as SQL parameters, hence the
            # concatenation. NOTE(review): assumes training_id is an
            # internally generated value, not user input -- confirm.
            sql = "SELECT * FROM gensimIndexingSet" + training_id
            print(sql)
            c.execute(sql)

            indexing_data = c.fetchmany(batch_size)

            # NOTE(review): this local server is immediately superseded by
            # the remote proxy below. It is kept because constructing it may
            # have side effects at `location` -- confirm before removing.
            service = SessionServer(location, autosession=True)
            import Pyro4
            service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
            print(location)

            while indexing_data:
                # Rows are unpacked as (text, id); with SELECT * this relies
                # on the table's column order.
                corpus = [
                    {'id': str(doc_id),
                     'tokens': utils.simple_preprocess(text)}
                    for (text, doc_id) in indexing_data
                ]

                # TODO: we don't have one big corpus yet, but we need one.
                service.index(corpus)
                indexing_data = c.fetchmany(batch_size)
                service.autosession = True
                time.sleep(0.5)
        finally:
            # Release the database handle before the set is reset below and
            # even if indexing fails part-way through.
            conn.close()

        self.delete_set()
        self.init_indexing_set()

        return 'indexing done'
Example #3
0
    def commit_WIKIPEDIA_training_set(self, factor=20000, batches=100):
        """Train the LSI model batch by batch from the Wikipedia text rows.

        Creates a local ``SessionServer`` under
        ``<rootlocation>gensimTraining<training_id>`` and, for each batch,
        fetches rows through ``gensim_sim.init_sql_connection(offset, factor)``,
        tokenizes the text with ``utils.simple_preprocess`` and calls
        ``service.train(corpus, method='lsi')``.

        Args:
            factor: Number of rows requested per batch (default 20000, the
                value that was previously hard-coded).
            batches: Number of batches to process (default 100, the value
                that was previously hard-coded).

        Returns:
            The string ``'training done'``.
        """
        from packages.controller.gensim_sim import gensim_sim
        w = gensim_sim()

        # create a local server
        service = SessionServer(
            self.rootlocation + 'gensimTraining' + str(self.training_id),
            autosession=True)

        for d in range(batches):
            start = d * factor
            print("currently working on text row " + str(start)
                  + " up to " + str(start + factor))

            # presumably yields (id, text) pairs -- verify against gensim_sim
            training_data = w.init_sql_connection(start, factor)
            corpus = [
                {'id': doc_id, 'tokens': utils.simple_preprocess(text)}
                for (doc_id, text) in training_data
            ]

            # TODO: we don't have one big corpus yet, but we need one.
            service.train(corpus, method='lsi')

        return 'training done'