Ejemplo n.º 1
0
def train():
    """Train the topic model described by the config file on the command line.

    Reads the configuration from the path in ``sys.argv[2]``, runs either the
    asynchronous or the synchronous training framework, and pickles the final
    eta to ``<var_path>/eta.final.pickle``.
    """
    t0 = datetime.now()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Pull the run parameters out of the config file.
    cfg = get_config(sys.argv[2])
    k = cfg['k']
    nthread = cfg['nthread']
    asyn = cfg['asyn']
    mm_path = cfg['mm_path']
    var_path = cfg['var_path']
    minibatch = cfg['minibatch']

    corpus = _mCorpus.get_corpus(mm_path)
    V = corpus.num_terms

    # Dispatch to the asynchronous or the synchronous framework.
    if asyn:
        eta = _masynf.asyn_framework(corpus, k, V, nthread, minibatch,
                                     var_path)
    else:
        eta = _msynf.syn_framework(corpus, k, V, nthread, minibatch, var_path,
                                   True)

    # Persist the final variational parameters.
    _mea.write_eta(eta, os.path.join(var_path, 'eta.final.pickle'))
    print(datetime.now() - t0)
Ejemplo n.º 2
0
def train():
    """Run one training job configured by the file named in ``sys.argv[2]``.

    Chooses the async or sync framework according to the ``asyn`` config key
    and writes the resulting eta pickle under the configured var_path.
    """
    started = datetime.now()
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO)

    # loading configs;
    config = get_config(sys.argv[2])
    k = config['k']
    nthread = config['nthread']
    use_async = config['asyn']
    var_path = config['var_path']
    minibatch = config['minibatch']

    corpus = _mCorpus.get_corpus(config['mm_path'])
    V = corpus.num_terms

    if use_async:
        eta = _masynf.asyn_framework(
            corpus, k, V, nthread, minibatch, var_path)
    else:
        eta = _msynf.syn_framework(
            corpus, k, V, nthread, minibatch, var_path, True)

    # write the final eta pickle next to the other variational output
    _mea.write_eta(eta, os.path.join(var_path, 'eta.final.pickle'))
    print(datetime.now() - started)
Ejemplo n.º 3
0
def main():
    """MPI entry point.

    Rank 0 acts as the master: it loads the config file whose path is given
    as ``sys.argv[2]``, drives the training rounds through master_process,
    and pickles the final eta.  Every other rank runs worker_process.
    """
    comm = MPI.COMM_WORLD   # get MPI communicator object
    size = comm.size        # total number of processes
    rank = comm.rank        # rank of this process
    status = MPI.Status()   # get MPI status object
    tags = enum('READY', 'DONE', 'EXIT', 'START')

    if rank != 0:
        # Worker process: loop on tasks sent by the master.
        name = MPI.Get_processor_name()
        worker_process(comm, status, tags, name)
        return

    # Master process.
    begin = datetime.now()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # loading configs;
    config = get_config(sys.argv[2])
    topic_count = config['k']
    nthread = config['nthread']
    mode = config['asyn']  # here, the value should be 'mpi'
    mm_path = config['mm_path']
    var_path = config['var_path']
    minibatch = config['minibatch']

    corpus = _mCorpus.get_corpus(mm_path)
    vocab_size = corpus.num_terms

    eta = master_process(comm, status, tags, corpus, topic_count, vocab_size,
                         nthread, minibatch, var_path)

    # store the final pickle
    _mea.write_eta(eta, os.path.join(var_path, 'eta.final.pickle'))

    print(datetime.now() - begin)
Ejemplo n.º 4
0
def main():
    """Distributed (MPI) training driver.

    Rank 0 reads the config named by ``sys.argv[2]``, runs the master loop
    and stores the final eta pickle; every other rank runs the worker loop.
    """
    comm = MPI.COMM_WORLD     # communicator shared by all ranks
    size = comm.size          # process count (kept for parity; unused)
    rank = comm.rank          # this process's rank
    status = MPI.Status()     # reusable MPI status object
    tags = enum('READY', 'DONE', 'EXIT', 'START')

    if rank == 0:
        # ---- master process ----
        started = datetime.now()
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.INFO)

        cfg = get_config(sys.argv[2])
        k = cfg['k']
        nthread = cfg['nthread']
        asyn = cfg['asyn']  # here, the value should be 'mpi'
        mm_path = cfg['mm_path']
        var_path = cfg['var_path']
        minibatch = cfg['minibatch']

        corpus = _mCorpus.get_corpus(mm_path)
        V = corpus.num_terms

        eta = master_process(comm, status, tags, corpus, k, V,
                             nthread, minibatch, var_path)

        # persist the final variational parameters
        _mea.write_eta(eta, os.path.join(var_path, 'eta.final.pickle'))

        print(datetime.now() - started)
    else:
        # ---- worker process ----
        worker_process(comm, status, tags, MPI.Get_processor_name())
Ejemplo n.º 5
0
 def write_eta(self, path):
     """Serialize ``self.eta`` to *path* while holding the instance lock.

     Uses the lock as a context manager so it is released even if
     ``_mea.write_eta`` raises; the original acquire/release pair would
     leave the lock held forever on an exception, deadlocking other
     threads that share it.
     """
     with self.lock:
         _mea.write_eta(self.eta, path)
Ejemplo n.º 6
0
 def write_eta(self,path):
     """Write ``self.eta`` to *path* under the instance lock.

     ``with self.lock`` guarantees release on exception; the bare
     acquire()/release() pair it replaces would keep the lock held if
     ``_mea.write_eta`` failed, blocking every other user of the lock.
     """
     with self.lock:
         _mea.write_eta(self.eta,path)
Ejemplo n.º 7
0
def syn_framework(corpus,
                  k,
                  V,
                  nthread,
                  minibatch,
                  var_path,
                  record_eta=False):
    """Synchronous minibatch training loop.

    Streams *corpus*, grouping documents into per-thread batches of
    ``minibatch / nthread`` docs; every *nthread* batches form a round
    that syn_master reduces into the global eta.  A trailing partial
    round handles the leftover documents.  When *record_eta* is true,
    eta is pickled into *var_path* after every round.  Returns the
    final eta dict.
    """
    per_thread = minibatch / nthread  # docs per thread batch
    n_docs = 0
    n_batches = 0
    n_rounds = 0
    docs = []           # documents of the batch being filled
    batches = []        # [(docs, eta slice, eta sum)] for the current round
    seen_vocab = set()  # vocab ids touched by the current batch
    eta = {}            # global variational parameters

    def _snapshot(eta_now, rid):
        # Optionally pickle eta after a finished round.
        if record_eta:
            _mea.write_eta(eta_now,
                           os.path.join(var_path,
                                        'eta.{}.pickle'.format(rid)))

    for doc in corpus:
        for vid, _count in doc:
            seen_vocab.add(vid)
        docs.append(doc)

        if n_docs % per_thread == per_thread - 1:
            # Batch complete: capture the eta slice this batch needs.
            batches.append((docs,
                            _mea.get_eta(k, eta, seen_vocab),
                            _mea.get_eta_sum(eta, k, V)))
            docs = []
            seen_vocab = set()

            if n_batches % nthread == nthread - 1:
                # Round complete: reduce all batches into the global eta.
                eta = syn_master(batches, k, nthread, eta,
                                 _mea.get_alpha(k))
                _snapshot(eta, n_rounds)
                batches = []
                n_rounds += 1
                logging.info('round:{}, batch:{}'.format(n_rounds, n_batches))

            n_batches += 1

        n_docs += 1

    # Leftover documents form a final, smaller batch and round.
    if docs:
        batches.append((docs,
                        _mea.get_eta(k, eta, seen_vocab),
                        _mea.get_eta_sum(eta, k, V)))
        eta = syn_master(batches, k, len(batches), eta,
                         _mea.get_alpha(k))
        _snapshot(eta, n_rounds)
        n_rounds += 1
        n_batches += 1

    return eta
Ejemplo n.º 8
0
def syn_framework(corpus, k, V, nthread, minibatch, var_path,
                  record_eta=False):
    """Synchronous minibatch variational loop.

    Documents streamed from *corpus* are grouped into per-thread batches
    of ``minibatch / nthread`` documents; every *nthread* batches make
    one round, which ``syn_master`` reduces into the global eta.  A
    trailing partial round absorbs any leftover docs.  If *record_eta*
    is true, eta is pickled into *var_path* after each round.  Returns
    the final eta mapping.
    """
    thread_batch = minibatch / nthread  # docs handed to each thread
    doc_id = 0
    batch_id = 0
    round_id = 0
    pending_docs = []   # docs of the batch currently being filled
    round_batches = []  # (docs, eta slice, eta sum) per finished batch
    batch_vocab = set()  # vocabulary ids seen in the pending batch
    eta = {}            # global variational parameters

    for document in corpus:
        for vocab_id, _cnt in document:
            batch_vocab.add(vocab_id)
        pending_docs.append(document)

        if doc_id % thread_batch == thread_batch - 1:
            # Batch complete: snapshot the eta slice this batch needs.
            round_batches.append((pending_docs,
                                  _mea.get_eta(k, eta, batch_vocab),
                                  _mea.get_eta_sum(eta, k, V)))
            pending_docs = []
            batch_vocab = set()

            if batch_id % nthread == nthread - 1:
                # Round complete: reduce all batches into the global eta.
                eta = syn_master(round_batches, k, nthread, eta,
                                 _mea.get_alpha(k))
                if record_eta:
                    snapshot = os.path.join(
                        var_path, 'eta.{}.pickle'.format(round_id))
                    _mea.write_eta(eta, snapshot)
                round_batches = []
                round_id += 1
                logging.info('round:{}, batch:{}'.format(round_id, batch_id))

            batch_id += 1

        doc_id += 1

    # Leftover docs become a last, smaller batch and round.
    if pending_docs:
        round_batches.append((pending_docs,
                              _mea.get_eta(k, eta, batch_vocab),
                              _mea.get_eta_sum(eta, k, V)))
        eta = syn_master(round_batches, k, len(round_batches), eta,
                         _mea.get_alpha(k))
        if record_eta:
            snapshot = os.path.join(var_path,
                                    'eta.{}.pickle'.format(round_id))
            _mea.write_eta(eta, snapshot)
        round_id += 1
        batch_id += 1

    return eta