import os
import sys
import logging
from datetime import datetime

# _mCorpus, _mea, _msynf and _masynf are project-internal modules
# (corpus loading, eta I/O, and the synchronous / asynchronous frameworks).


def train():
    '''Train the model; the path to the config file is read from sys.argv[2].'''
    start = datetime.now()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # load configs
    config = get_config(sys.argv[2])
    k = config['k']
    nthread = config['nthread']
    asyn = config['asyn']
    mm_path = config['mm_path']
    var_path = config['var_path']
    minibatch = config['minibatch']

    corpus = _mCorpus.get_corpus(mm_path)
    V = corpus.num_terms

    # run either the asynchronous or the synchronous training framework
    if asyn:
        eta = _masynf.asyn_framework(corpus, k, V, nthread, minibatch, var_path)
    else:
        eta = _msynf.syn_framework(corpus, k, V, nthread, minibatch, var_path, True)

    # store the final eta estimate
    fn = 'eta.final.pickle'
    path = os.path.join(var_path, fn)
    _mea.write_eta(eta, path)

    end = datetime.now()
    print(end - start)
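# get_config is referenced above but not shown; a minimal sketch of what it
# could look like, assuming the config file is plain JSON with the keys used
# in train() ('k', 'nthread', 'asyn', 'mm_path', 'var_path', 'minibatch').
# The actual loader may use a different file format.
import json


def get_config(path):
    # read and parse the JSON config file into a plain dict
    with open(path) as f:
        return json.load(f)


# illustrative config the sketch above would accept (values are examples only):
# {"k": 100, "nthread": 8, "asyn": false,
#  "mm_path": "corpus.mm", "var_path": "var/", "minibatch": 256}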
import os
import sys
import logging
from datetime import datetime

from mpi4py import MPI


def main():
    # Initializations and preliminaries
    comm = MPI.COMM_WORLD    # get MPI communicator object
    size = comm.size         # total number of processes
    rank = comm.rank         # rank of this process
    status = MPI.Status()    # get MPI status object
    tags = enum('READY', 'DONE', 'EXIT', 'START')

    if rank == 0:
        # Master process; the path to the config file is read from sys.argv[2]
        start = datetime.now()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)

        # load configs
        config = get_config(sys.argv[2])
        k = config['k']
        nthread = config['nthread']
        asyn = config['asyn']  # here, the value should be 'mpi'
        mm_path = config['mm_path']
        var_path = config['var_path']
        minibatch = config['minibatch']

        corpus = _mCorpus.get_corpus(mm_path)
        V = corpus.num_terms

        eta = master_process(comm, status, tags, corpus, k, V, nthread,
                             minibatch, var_path)

        # store the final pickle
        fn = 'eta.final.pickle'
        path = os.path.join(var_path, fn)
        _mea.write_eta(eta, path)

        end = datetime.now()
        print(end - start)
    else:
        # Worker process
        name = MPI.Get_processor_name()
        worker_process(comm, status, tags, name)
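# enum() is used in main() to build integer message tags but is not defined
# here; a minimal sketch, assuming the common "names -> consecutive integers"
# recipe often used in mpi4py master/worker examples, so that tags.READY,
# tags.DONE, tags.EXIT and tags.START can be sent as plain int MPI tags.
def enum(*sequential, **named):
    # map each name to a consecutive integer and expose them as class attributes
    enums = dict(zip(sequential, range(len(sequential))), **named)
    return type('Enum', (), enums)


# usage: tags = enum('READY', 'DONE', 'EXIT', 'START'); tags.READY == 0, tags.START == 3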
def write_eta(self, path):
    # serialize eta under the lock so concurrent updates cannot interleave
    self.lock.acquire()
    _mea.write_eta(self.eta, path)
    self.lock.release()
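# _mea.write_eta is a project-internal helper; a minimal sketch of the
# behaviour the calls above appear to rely on, assuming eta is simply pickled
# to the given path (the file names used elsewhere end in '.pickle').
import pickle


def write_eta(eta, path):
    # dump the eta dictionary to disk so a later round or an evaluation
    # script can reload it with pickle.load
    with open(path, 'wb') as f:
        pickle.dump(eta, f)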
import os
import logging


def syn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    # configs: number of documents handed to each thread per batch
    thread_batch = minibatch // nthread

    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0

    # temp data
    doc_buffer = []
    batch_buffer = []  # [(docs, eta_temp, etaSum)]
    voc_temp = set()

    # global data
    eta = {}

    for doc in corpus:
        # record the vocabulary ids seen in the current thread batch
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            # the thread batch is full: snapshot the eta entries it needs
            eta_temp = _mea.get_eta(k, eta, voc_temp)
            etaSum = _mea.get_eta_sum(eta, k, V)
            batch_buffer.append((doc_buffer, eta_temp, etaSum))
            # clear doc buffer
            doc_buffer = []
            voc_temp = set()

            if batch_id % nthread == nthread - 1:
                # one batch per thread collected: run a synchronous round and update eta
                eta = syn_master(batch_buffer, k, nthread, eta, _mea.get_alpha(k))
                if record_eta:
                    fn = 'eta.{}.pickle'.format(round_id)
                    path = os.path.join(var_path, fn)
                    _mea.write_eta(eta, path)
                # clear batch_buffer
                batch_buffer = []
                round_id += 1
                logging.info('round:{}, batch:{}'.format(round_id, batch_id))
            batch_id += 1
        doc_id += 1

    # process the docs left in doc_buffer: form a final (smaller) batch
    if len(doc_buffer) > 0:
        eta_temp = _mea.get_eta(k, eta, voc_temp)
        etaSum = _mea.get_eta_sum(eta, k, V)
        batch_buffer.append((doc_buffer, eta_temp, etaSum))

    # run a final round over whatever batches remain (this also covers the
    # case where doc_buffer is empty but the last round was left incomplete)
    if len(batch_buffer) > 0:
        eta = syn_master(batch_buffer, k, len(batch_buffer), eta, _mea.get_alpha(k))
        if record_eta:
            fn = 'eta.{}.pickle'.format(round_id)
            path = os.path.join(var_path, fn)
            _mea.write_eta(eta, path)
        round_id += 1
        batch_id += 1

    return eta
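# a small self-contained illustration of the batching cadence implemented by
# syn_framework above, detached from the actual inference code: with
# minibatch=6 and nthread=3, each thread batch holds 6 // 3 = 2 documents and
# a synchronous round fires after every 3 completed batches, with any
# leftover documents/batches swept into one final round.
def batching_demo(num_docs, minibatch, nthread):
    thread_batch = minibatch // nthread
    batches = rounds = in_batch = in_round = 0
    for _doc in range(num_docs):
        in_batch += 1
        if in_batch == thread_batch:      # a thread batch is full
            in_batch = 0
            batches += 1
            in_round += 1
            if in_round == nthread:       # one batch per thread -> run a round
                in_round = 0
                rounds += 1
    if in_batch or in_round:              # leftovers form a final, smaller round
        if in_batch:
            batches += 1
        rounds += 1
    return batches, rounds


# batching_demo(20, 6, 3) -> (10, 4): 10 thread batches, 3 full rounds plus one final round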