def prepare_iterator(self):

    from base.iterator import P_iter

    self.train_iterator = P_iter(self.config, self.model, \
                    self.data[0], self.data[1], 'train', self.model.train)
    self.val_iterator = P_iter(self.config, self.model, \
                    self.data[2], self.data[3], 'val', self.model.val)
def prepare_iterator(self):

    from base.iterator import P_iter

    # iterator won't make another copy of the model
    # instead it will just call its compiled train function
    self.train_iterator = P_iter(self.config, self.model, \
                    self.data[0], self.data[1], 'train')
    self.val_iterator = P_iter(self.config, self.model, \
                    self.data[2], self.data[3], 'val')
def prepare_iterator(self):  # override Async_PTWorker member function

    from base.iterator import P_iter

    # iterator won't make another copy of the model
    # instead it will just call its compiled train function
    self.train_iterator = P_iter(self.config, self.model, \
                    self.data[0], self.data[1], 'train', self.model.train_vel_acc)
    self.val_iterator = P_iter(self.config, self.model, \
                    self.data[2], self.data[3], 'val', self.model.val)
def prepare_iterator(self):

    worker_type = self.config['worker_type']

    from base.iterator import P_iter

    if worker_type == 'cdd':

        def cdd_iter_fn(subb_ind):
            self.model.descent_vel()
            cost, error = self.model.get_vel(subb_ind)
            return cost, error

        self.train_iterator = P_iter(self.config, self.model, \
                        self.data[0], self.data[1], 'train', cdd_iter_fn)

    elif worker_type == 'avg':
        self.train_iterator = P_iter(self.config, self.model, \
                        self.data[0], self.data[1], 'train', self.model.train)

    self.val_iterator = P_iter(self.config, self.model, \
                    self.data[2], self.data[3], 'val', self.model.val)
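# ---------------------------------------------------------------------------
# Illustrative sketch only: base.iterator.P_iter is not shown in this file,
# so the class below is a guess at the minimal contract the snippets above
# rely on -- the iterator keeps a reference to the compiled function it was
# handed ('fn', e.g. model.train or cdd_iter_fn) and calls fn(subb_ind) once
# per sub-batch, instead of holding its own copy of the model. The name
# MinimalIter and the recorder hook used here are hypothetical.

class MinimalIter(object):

    def __init__(self, config, model, data_x, data_y, mode, fn):
        self.config = config
        self.model = model
        self.data_x = data_x      # minibatch sources
        self.data_y = data_y      # matching labels
        self.mode = mode          # 'train' or 'val'
        self.fn = fn              # compiled function to call per sub-batch
        self.subb_ind = 0

    def next(self, recorder, count):
        # (loading data_x/data_y onto the device is omitted in this sketch)
        cost, error = self.fn(self.subb_ind)
        recorder.train_error(count, cost, error)   # hypothetical recorder hook
        self.subb_ind = (self.subb_ind + 1) % self.config['n_subb']

    def reset(self):
        self.subb_ind = 0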
class BSP_PTWorker(PTWorker):

    '''
    Worker class based on a specific synchronization rule (BSP)
    '''

    def __init__(self, port, config, device):
        PTWorker.__init__(self, port = port, \
                                config = config, \
                                device = device)

        self.verbose = self.config['verbose']
        self.worker_id = self.config['worker_id']

        self.prepare_worker()
        self.prepare_recorder()
        self.prepare_iterator()

        self.mode = None
        self.epoch = 0
        self.count = 0

        if self.config['resume_train'] == True:
            self.epoch = self.config['load_epoch']
            self.load_model(self.epoch)

        self.train_len = len(self.data[0])  #self.config['avg_freq']
        self.val_len = len(self.data[2])

    def prepare_param_exchanger(self):

        from base.exchanger import BSP_Exchanger

        self.exchanger = BSP_Exchanger(self.config, \
                                       self.drv, \
                                       self.ctx,
                                       self.model)

    def prepare_recorder(self):

        from base.recorder import Recorder

        self.recorder = Recorder(self.config)

    def prepare_iterator(self):

        from base.iterator import P_iter

        # iterator won't make another copy of the model
        # instead it will just call its compiled train function
        self.train_iterator = P_iter(self.config, self.model, \
                        self.data[0], self.data[1], 'train')
        self.val_iterator = P_iter(self.config, self.model, \
                        self.data[2], self.data[3], 'val')

    def load_model(self, load_epoch):

        layers = self.model.layers
        path = self.config['load_path']
        s_lr = self.model.shared_lr
        vels = self.model.vels

        # TODO needs to verify the previous lr is when training with avg, scaled by size
        import os
        s_lr.set_value(np.load(os.path.join(path, 'lr_' + str(load_epoch) + '.npy')))

        from base.helper_funcs import load_weights, load_momentums
        #l_range = set(range(16))-set([1,3])
        load_weights(layers, path, load_epoch)
        #load_momentums(vels, path, load_epoch)

        if self.verbose:
            print '\nlearning rate loaded %f' % s_lr.get_value()
            print 'weights and momentums loaded from epoch %d' % load_epoch
            print 'in %s' % path

        record_file_path = self.config['record_dir'] + 'inforec.pkl'
        if os.path.exists(record_file_path):

            import glob
            history_folder = self.config['record_dir'] + 'history*'
            find = glob.glob(history_folder)
            #print find

            if find != []:
                history_folder = sorted(find)[-1]
                #print history_folder
                history_folder = history_folder.split('_')[0] + '_' + \
                                 "%d" % (int(history_folder.split('_')[-1])+1) + '/'
            else:
                history_folder = self.config['record_dir'] + 'history_0' + '/'

            print 'creating inforec history folder: ' + history_folder
            os.makedirs(history_folder)

            import shutil
            shutil.copy(record_file_path, history_folder + 'inforec.pkl')

            self.recorder.load(filepath = record_file_path)
            self.recorder.cut(load_epoch)

            # print type(self.recorder.info_dict['train_info'])
            # print len(self.recorder.info_dict['train_info'])
            #
            # print type(self.recorder.info_dict['val_info'])
            # print len(self.recorder.info_dict['val_info'])

        else:
            raise OSError('record file not found at %s ' % record_file_path)

    def save_model(self):

        layers = self.model.layers
        path = self.config['weights_dir']
        vels = self.model.vels

        from base.helper_funcs import save_weights, save_momentums
        save_weights(layers, path, self.epoch)
        np.save(path + 'lr_' + str(self.epoch) + \
                '.npy', self.model.shared_lr.get_value())
        #save_momentums(vels, self.config['weights_dir'], self.epoch)

        if self.verbose:
            print '\nweights and momentums saved at epoch %d' % self.epoch

        with open(path + "val_info.txt", "a") as f:
            f.write("\nepoch: {} val_info {}:".format(self.epoch, \
                                                      self.model.current_info))

    def train(self):

        # avoiding dots evaluation
        i_next = self.train_iterator.next
        r_start = self.recorder.start
        if self.size > 1: exch = self.exchanger.exchange
        r_end = self.recorder.end
        r_print = self.recorder.print_train_info

        for i in xrange(0, self.train_len, self.size):

            for subb_ind in range(self.config['n_subb']):
                i_next(self.recorder, self.count)

            self.comm.Barrier()
            r_start()
            #print self.model.params[0].get_value()[1][1][1][1]
            if self.size > 1: exch()
            r_end('comm')

            self.count += self.size
            r_print(self.count)

        self.train_iterator.reset()

    def val(self):

        self.model.set_dropout_off()

        for i in xrange(0, self.val_len, self.config['size']):

            for subb_ind in range(self.config['n_subb']):
                self.val_iterator.next(self.recorder, self.count)
                print '.',

        self.recorder.gather_val_info()
        self.recorder.print_val_info(self.count)
        self.model.current_info = self.recorder.get_latest_val_info()

        self.model.set_dropout_on()

        self.val_iterator.reset()

    def adjust_lr(self):

        self.model.adjust_lr(self.epoch, size = self.size)

    def run(self):
        # override PTWorker class method

        print 'worker started'

        if self.size > 1:
            self.prepare_param_exchanger()

        self.adjust_lr()

        if self.config['initial_val']:
            self.mode = 'val'
        else:
            self.mode = 'train'

        while True:

            if self.mode == 'train':

                self.comm.Barrier()
                self.recorder.start_epoch()
                self.epoch += 1  # epoch starts from 1, not 0. 0 means training has not started.

                if self.verbose: print '\nNow training'

                self.train()

                self.recorder.end_epoch(self.count, self.epoch)

                self.mode = 'val'

            elif self.mode == 'val':

                self.comm.Barrier()

                if self.verbose: print '\nNow validating'

                self.val()

                self.adjust_lr()

                self.recorder.save(self.count, self.model.shared_lr.get_value(), \
                                   filepath = self.config['record_dir'] + 'inforec.pkl')

                if self.epoch % self.config['snapshot_freq'] == 0:
                    if self.config['rank'] == 0:
                        self.save_model()

                if self.epoch >= self.config['n_epochs']:
                    self.mode = 'stop'
                else:
                    self.mode = 'train'

            elif self.mode == 'stop':

                if self.verbose: print '\nOptimization finished'
                break

            else:
                raise ValueError('wrong mode')

        self.para_load_close()
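# ---------------------------------------------------------------------------
# Illustrative sketch only: BSP_Exchanger (base.exchanger) is not shown in
# this file, and the real implementation moves GPU buffers via self.drv /
# self.ctx. Conceptually, the exch() call in train() above is an all-to-all
# synchronization after every mini-batch; the mpi4py + NumPy function below
# (the name bsp_exchange is made up) shows one such rule -- averaging the
# parameters across all ranks.

import numpy as np
from mpi4py import MPI

def bsp_exchange(comm, params):
    """Average each Theano shared parameter across all workers (BSP sync)."""
    size = comm.Get_size()
    for p in params:
        local = p.get_value()                       # shared variable -> ndarray
        total = np.zeros_like(local)
        comm.Allreduce(local, total, op=MPI.SUM)    # elementwise sum over workers
        p.set_value((total / size).astype(local.dtype))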
class BSP_PTWorker(PTWorker):

    '''
    Worker class based on a specific synchronization rule (BSP)
    '''

    def __init__(self, config, device):
        PTWorker.__init__(self, config = config, \
                                device = device)

        self.verbose = self.config['verbose']

        import time
        compile_time = time.time()
        # 1 (local to worker type) allocate supporting params and compile theano functions
        self.prepare_train_fn()
        self.prepare_val_fn()
        if self.verbose: print 'compile_time %.2f s' % \
                                (time.time() - compile_time)

        self.prepare_para_load()  # needs to be after compile_train and compile_val()

        self.prepare_recorder()
        self.prepare_iterator()

        self.mode = None
        self.epoch = 0
        self.count = 0

        self.train_len = len(self.data[0])  #self.config['avg_freq']
        self.val_len = len(self.data[2])

    def prepare_param_exchanger(self):

        from base.exchanger import BSP_Exchanger  # 3 (local to worker type)

        self.exchanger = BSP_Exchanger(self.config, \
                                       self.drv, \
                                       self.ctx,
                                       self.model)

    def prepare_train_fn(self):
        # make sure the model compiles the necessary functions
        # (get_vel() and descent_vel() for cdd, or train() for avg)
        # and allocates the necessary extra param memory
        # (vels, vels2 for cdd, or only vels for avg)

        # allocate supporting params for this worker type
        worker_type = self.config['worker_type']

        model = self.model

        if worker_type == 'cdd':

            import theano

            model.vels = [theano.shared(param_i.get_value() * 0.)
                          for param_i in model.params]
            model.vels2 = [theano.shared(param_i.get_value() * 0.)
                           for param_i in model.params]

            self.prepare_update_dict(worker_type='cdd')

            updates_v, updates_dv = model.update_dict

            get_vel_args = {"inputs": [model.subb_ind],
                            "outputs": [model.cost, model.error],
                            "updates": updates_v, \
                            "givens": [(model.x, model.shared_x_slice),
                                       (model.y, model.shared_y_slice),
                                       (model.lr, model.shared_lr)]}

            descent_vel_args = {"inputs": [], "outputs": [], "updates": updates_dv}

            model.compile_train_fn_list = [get_vel_args, descent_vel_args]

            model.compile_train()  # needs to compile model before para_load_init() # 2 (local to worker type)

            model.get_vel, model.descent_vel = model.compiled_train_fn_list

        elif worker_type == 'avg':

            import theano

            model.vels = [theano.shared(param_i.get_value() * 0.)
                          for param_i in model.params]

            self.prepare_update_dict(worker_type='avg')

            updates_w, = model.update_dict

            train_args = {"inputs": [model.subb_ind],
                          "outputs": [model.cost, model.error],
                          "updates": updates_w, \
                          "givens": [(model.x, model.shared_x_slice),
                                     (model.y, model.shared_y_slice),
                                     (model.lr, model.shared_lr)]}

            model.compile_train_fn_list = [train_args]

            model.compile_train()

            model.train, = model.compiled_train_fn_list

    def prepare_update_dict(self, worker_type):

        model = self.model
        config = self.config

        use_momentum = config['use_momentum']
        use_nesterov_momentum = config['use_nesterov_momentum']

        try:
            size = config['size']
            verbose = config['rank'] == 0
        except KeyError:
            size = 1
            verbose = True

        params, grads, weight_types = model.params, model.grads, model.weight_types

        vels, vels2 = model.vels, model.vels2  # vels2 is only referenced by the cdd updates below

        lr = model.lr    #shared_lr #T.scalar('lr')  # symbolic learning rate
        mu = model.mu    # def: 0.9  # momentum
        eta = model.eta  #0.0002  # weight decay

        updates_w = []   # for avg
        updates_v = []   # for cdd
        updates_dv = []  # for cdd

        if use_momentum:

            assert len(weight_types) == len(params)

            k = 0

            for param_i, grad_i, weight_type in \
                    zip(params, grads, weight_types):

                if weight_type == 'W':
                    real_grad = grad_i + eta * param_i
                    real_lr = lr
                elif weight_type == 'b':
                    real_grad = grad_i
                    real_lr = 2. * lr
                else:
                    raise TypeError("Weight Type Error")

                if use_nesterov_momentum:
                    vel_i_next = mu ** 2 * vels[k] - (1 + mu) * real_lr * real_grad
                else:
                    vel_i_next = mu * vels[k] - real_lr * real_grad

                if worker_type == 'cdd':
                    updates_v.append((vels[k], vel_i_next))
                    updates_dv.append((param_i, param_i + vels2[k]))
                elif worker_type == 'avg':
                    updates_w.append((vels[k], vel_i_next))
                    updates_w.append((param_i, param_i + vel_i_next))

                k = k + 1

        else:

            k = 0

            for param_i, grad_i, weight_type in \
                    zip(params, grads, weight_types):

                if weight_type == 'W':
                    if worker_type == 'cdd':
                        update = - lr * grad_i - eta * lr * param_i
                    elif worker_type == 'avg':
                        update = param_i - lr * grad_i - eta * lr * param_i
                elif weight_type == 'b':
                    if worker_type == 'cdd':
                        update = - 2 * lr * grad_i
                    elif worker_type == 'avg':
                        update = param_i - 2 * lr * grad_i

                if worker_type == 'cdd':
                    updates_v.append((vels[k], update))
                    updates_dv.append((param_i, param_i + vels2[k]))
                elif worker_type == 'avg':
                    # updates_w.append((vel_i, - 2 * lr * grad_i))
                    updates_w.append((param_i, update))

                k = k + 1

        if worker_type == 'cdd':
            self.model.update_dict = [updates_v, updates_dv]
        elif worker_type == 'avg':
            self.model.update_dict = [updates_w]

    def prepare_val_fn(self):

        self.model.compile_val()

    def prepare_recorder(self):

        from base.recorder import Recorder

        self.recorder = Recorder(self.config)

    def prepare_iterator(self):

        worker_type = self.config['worker_type']

        from base.iterator import P_iter

        if worker_type == 'cdd':

            def cdd_iter_fn(subb_ind):
                self.model.descent_vel()
                cost, error = self.model.get_vel(subb_ind)
                return cost, error

            self.train_iterator = P_iter(self.config, self.model, \
                            self.data[0], self.data[1], 'train', cdd_iter_fn)

        elif worker_type == 'avg':
            self.train_iterator = P_iter(self.config, self.model, \
                            self.data[0], self.data[1], 'train', self.model.train)

        self.val_iterator = P_iter(self.config, self.model, \
                        self.data[2], self.data[3], 'val', self.model.val)

    def load_model(self, load_epoch):

        layers = self.model.layers
        path = self.config['load_path']
        s_lr = self.model.shared_lr
        vels = self.model.vels

        # TODO needs to verify the previous lr is when training with avg, scaled by size
        import os
        s_lr.set_value(np.load(os.path.join(path, 'lr_' + str(load_epoch) + '.npy')))

        from base.helper_funcs import load_weights, load_momentums
        #l_range = set(range(16))-set([1,3])
        load_weights(layers, path, load_epoch)
        #load_momentums(vels, path, load_epoch)

        if self.verbose:
            print '\nlearning rate loaded %f' % s_lr.get_value()
            print 'weights and momentums loaded from epoch %d' % load_epoch
            print 'in %s' % path

        record_file_path = self.config['record_dir'] + 'inforec.pkl'
        if os.path.exists(record_file_path):

            import glob
            history_folder = self.config['record_dir'] + 'history*'
            find = glob.glob(history_folder)
            #print find

            if find != []:
                history_folder = sorted(find)[-1]
                #print history_folder
                history_folder = history_folder.split('_')[0] + '_' + \
                                 "%d" % (int(history_folder.split('_')[-1])+1) + '/'
            else:
                history_folder = self.config['record_dir'] + 'history_0' + '/'

            print 'creating inforec history folder: ' + history_folder
            os.makedirs(history_folder)

            import shutil
            shutil.copy(record_file_path, history_folder + 'inforec.pkl')

            self.recorder.load(filepath = record_file_path)

        else:
            raise OSError('record file not found at %s ' % record_file_path)

    def save_model(self):

        layers = self.model.layers
        path = self.config['weights_dir']
        vels = self.model.vels

        from base.helper_funcs import save_weights, save_momentums
        save_weights(layers, path, self.epoch)
        np.save(path + 'lr_' + str(self.epoch) + \
                '.npy', self.model.shared_lr.get_value())
        #save_momentums(vels, self.config['weights_dir'], self.epoch)

        if self.verbose:
            print '\nweights and momentums saved at epoch %d' % self.epoch

        with open(path + "val_info.txt", "a") as f:
            f.write("\nepoch: {} val_info {}:".format(self.epoch, \
                                                      self.model.current_info))

    def train(self):

        i_next = self.train_iterator.next
        r_start = self.recorder.start
        if self.size > 1: exch = self.exchanger.exchange
        r_end = self.recorder.end
        r_print = self.recorder.print_train_info

        for i in xrange(0, self.train_len, self.size):

            for subb_ind in range(self.config['n_subb']):
                i_next(self.recorder, self.count)

            self.comm.Barrier()
            r_start()
            if self.size > 1: exch()
            r_end('comm')

            self.count += self.size
            r_print(self.count)

        self.train_iterator.reset()

    def val(self):

        self.model.set_dropout_off()

        for i in xrange(0, self.val_len, self.config['size']):

            for subb_ind in range(self.config['n_subb']):
                self.val_iterator.next(self.recorder, self.count)
                print '.',

        self.recorder.gather_val_info()
        self.recorder.print_val_info(self.count)
        self.model.current_info = self.recorder.get_latest_val_info()

        self.model.set_dropout_on()

        self.val_iterator.reset()

    def adjust_lr(self):

        self.model.adjust_lr(self.epoch)

        new_lr = self.model.shared_lr.get_value()

        if self.config['worker_type'] == 'avg':
            self.model.shared_lr.set_value(np.float32(new_lr * self.size))
        else:
            pass

        if self.verbose:
            print 'Learning rate now: %.10f' % \
                  np.float32(self.model.shared_lr.get_value())

    def run(self):
        # override PTWorker class method

        print 'worker started'

        if self.config['resume_train'] == True:
            self.epoch = self.config['load_epoch']
            self.load_model(self.epoch)

        if self.size > 1:
            self.prepare_param_exchanger()

        self.adjust_lr()

        if self.config['initial_val']:
            self.mode = 'val'
        else:
            self.mode = 'train'

        while True:

            if self.mode == 'train':

                self.comm.Barrier()
                self.recorder.start_epoch()
                self.epoch += 1  # epoch starts from 1, not 0. 0 means training has not started.

                if self.verbose: print '\nNow training'

                self.train()

                self.recorder.end_epoch(self.count, self.epoch)

                self.mode = 'val'

            elif self.mode == 'val':

                self.comm.Barrier()

                if self.verbose: print '\nNow validating'

                self.val()

                self.adjust_lr()

                self.recorder.save(self.count, self.model.shared_lr.get_value(), \
                                   filepath = self.config['record_dir'] + 'inforec.pkl')

                if self.epoch % self.config['snapshot_freq'] == 0:
                    if self.config['rank'] == 0:
                        self.save_model()

                if self.epoch >= self.config['n_epochs']:
                    self.mode = 'stop'
                else:
                    self.mode = 'train'

            elif self.mode == 'stop':

                if self.verbose: print '\nOptimization finished'
                break

            else:
                raise ValueError('wrong mode')

        self.para_load_close()
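# ---------------------------------------------------------------------------
# Worked example (plain NumPy, not Theano) of the classical-momentum update
# rule that prepare_update_dict() above assembles for a 'W'-type parameter.
# The numbers are made up; the point is the split between the two worker
# types: 'avg' applies the new velocity to the parameter in the same compiled
# step, while 'cdd' only stores it (get_vel) and lets descent_vel() add the
# exchanged velocities vels2 to the parameter later.

import numpy as np

lr, mu, eta = 0.01, 0.9, 0.0002            # learning rate, momentum, weight decay
param = np.array([1.0, -2.0])
grad = np.array([0.5, 0.25])
vel = np.zeros_like(param)

real_grad = grad + eta * param             # weight decay folded into the gradient
vel_next = mu * vel - lr * real_grad       # same formula as vel_i_next above

# 'avg' worker: parameter moves immediately
param_avg = param + vel_next

# 'cdd' worker: the parameter only moves in descent_vel(), which adds the
# velocities received from the exchanger (here a stand-in equal to vel_next)
vels2 = vel_next
param_cdd = param + vels2

print param_avg, param_cdd                 # identical here since vels2 == vel_next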
class EASGD_PTWorker(PTWorker):

    '''
    Worker class based on a specific synchronization rule (EASGD)
    Executes the training routine and periodically reports results to the server
    '''

    def __init__(self, port, config, device):
        PTWorker.__init__(self, port = port, \
                                config = config, \
                                device = device)

        self.worker_id = self.config['worker_id']

        if self.config['sync_start']:
            # sync start register,
            # use the COMM_WORLD to communicate with server
            self._MPI_register()
            self.model.verbose = self.verbose
        else:
            # async start register,
            # build a separate intercomm to communicate with server
            self.MPI_register()
            self.model.verbose = self.verbose
            #if self.verbose: print 'worker registered'

        self.prepare_worker()
        self.prepare_recorder()
        self.prepare_iterator()

        self.uepoch = None
        if self.config['resume_train'] == True:
            self.uepoch = self.config['load_epoch']
            self.load_model(self.uepoch)

        self.train_len = self.config['avg_freq']
        self.val_len = len(self.data[2])
        self.mode = None
        self.lastmode = None
        self.count = 0

        if self.verbose:
            self.rec_name = 'inforec.pkl'
        else:
            self.rec_name = 'inforec_' + str(self.worker_id) + '.pkl'

    def prepare_param_exchanger(self):

        from base.exchanger import EASGD_Exchanger

        self.exchanger = EASGD_Exchanger(self.config, \
                                         self.drv, \
                                         self.model.params, \
                                         etype='worker')

    def prepare_recorder(self):

        from base.recorder import Recorder

        self.recorder = Recorder(self.config)

    def prepare_iterator(self):

        from base.iterator import P_iter

        # iterator won't make another copy of the model
        # instead it will just call its compiled train function
        self.train_iterator = P_iter(self.config, self.model, \
                        self.data[0], self.data[1], 'train')
        self.val_iterator = P_iter(self.config, self.model, \
                        self.data[2], self.data[3], 'val')

    def load_model(self, load_epoch):

        layers = self.model.layers
        path = self.config['load_path']
        s_lr = self.model.shared_lr
        vels = self.model.vels

        # TODO needs to verify the previous lr is when training with avg, scaled by size
        import os
        s_lr.set_value(np.load(os.path.join(path, 'lr_' + str(load_epoch) + '.npy')))

        from base.helper_funcs import load_weights, load_momentums
        load_weights(layers, path, load_epoch)
        #load_momentums(vels, path, load_epoch)

        if self.verbose:
            print '\nlearning rate loaded %f' % s_lr.get_value()
            print 'weights and momentums loaded from epoch %d in %s' % (load_epoch, path)

        # BUG: which worker's inforec should be used? use only the recording
        # worker's; if it exists, put it into history
        record_file_path = self.config['record_dir'] + 'inforec.pkl'
        if os.path.exists(record_file_path):

            import glob
            history_folder = self.config['record_dir'] + 'history*'
            find = glob.glob(history_folder)
            #print find

            if find != []:
                history_folder = sorted(find)[-1]
                #print history_folder
                history_folder = history_folder.split('_')[0] + '_' + \
                                 "%d" % (int(history_folder.split('_')[-1])+1) + '/'
            else:
                history_folder = self.config['record_dir'] + 'history_0' + '/'

            print 'creating inforec history folder: ' + history_folder
            os.makedirs(history_folder)

            import shutil
            shutil.copy(record_file_path, history_folder + 'inforec.pkl')

            self.recorder.load(filepath = record_file_path)

            # print type(self.recorder.info_dict['train_info'])
            # print len(self.recorder.info_dict['train_info'])
            #
            # print type(self.recorder.info_dict['val_info'])
            # print len(self.recorder.info_dict['val_info'])

        else:
            raise OSError('record file not found at %s ' % record_file_path)

    def save_model(self):

        assert self.uepoch != None
        layers = self.model.layers
        path = self.config['weights_dir']
        vels = self.model.vels

        from base.helper_funcs import save_weights, save_momentums
        save_weights(layers, path, self.uepoch)
        np.save(path + 'lr_' + str(self.uepoch) + \
                '.npy', self.model.shared_lr.get_value())
        #save_momentums(vels, self.config['weights_dir'], self.uepoch)

        if self.verbose:
            print '\nweights and momentums saved at epoch %d' % self.uepoch

    def train(self):

        for i in range(self.train_len):

            for subb_ind in range(self.config['n_subb']):
                #print self.count
                self.train_iterator.next(self.recorder, self.count)
                self.count += 1
                self.recorder.print_train_info(self.count)

        self.recorder.start()
        reply = self.request(dict(done=self.train_len))
        self.exchanger.comm = self.intercomm
        self.action(message = 'exchange', \
                    action=self.exchanger.exchange)
        self.recorder.end('comm')

        self.lastmode = 'train'

    def val(self):

        if self.lastmode == 'train':
            self.train_iterator.reset()

        self.model.set_dropout_off()

        for i in range(self.val_len):
            self.val_iterator.next(self.recorder, self.count)
            if self.verbose: print '.',

        self.recorder.print_val_info(self.count)

        self.model.set_dropout_on()

        self.val_iterator.reset()

    def copy_to_local(self):

        self.exchanger.comm = self.intercomm
        self.action(message = 'copy_to_local', \
                    action=self.exchanger.copy_to_local)

        if self.verbose: print '\nSynchronized param with server'

    def adjust_lr(self):

        self.uepoch, self.n_workers = self.request('uepoch')
        #if self.verbose: print 'global epoch %d, %d workers online' % (self.uepoch, self.n_workers)

        self.model.adjust_lr(self.uepoch, size = self.n_workers)

    def run(self):
        # override PTWorker class method

        if self.verbose: print 'worker %s started' % self.worker_id

        self.prepare_param_exchanger()

        # start training with the most recent server parameter
        self.copy_to_local()

        self.adjust_lr()

        epoch_start = False

        while True:

            self.mode = self.request('next')
            #print self.mode

            if self.mode == 'train':

                if epoch_start == False:
                    self.recorder.start_epoch()
                    epoch_start = True
                    if self.verbose: print '\nNow training'

                self.train()

            if self.mode == 'adjust_lr':
                self.adjust_lr()
                #self.copy_to_local()

            if self.mode == 'val':

                if self.verbose: print '\nNow validating'

                self.copy_to_local()

                self.val()

                self.recorder.save(self.count, self.model.shared_lr.get_value(), \
                                   filepath = self.config['record_dir'] + self.rec_name)

                self.uepoch, self.n_workers = self.request('uepoch')
                if self.uepoch % self.config['snapshot_freq'] == 0:
                    # TODO BUG: if too few images in training set, uepoch may skip more than 1 per check
                    self.save_model()

                self.copy_to_local()

                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False

            if self.mode == 'stop':

                self.copy_to_local()
                self.val()

                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False

                if self.verbose: print '\nOptimization finished'
                break

        self.para_load_close()  # TODO some workers blocked here can't disconnect
        self.ctx.pop()

        self.MPI_deregister()

        if self.verbose: print '\nWorker %s deregistered' % self.worker_id
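# ---------------------------------------------------------------------------
# Illustrative sketch only: EASGD_Exchanger (base.exchanger) is not shown in
# this file. The exchange() call in train() above is assumed to implement the
# elastic-averaging rule of EASGD (Zhang et al., 2015) between the worker's
# parameters and the server's center parameters. The function name
# easgd_exchange and the moving-rate value 'alpha' below are made up for the
# sketch; the real exchanger operates on GPU buffers over the intercomm.

import numpy as np

def easgd_exchange(worker_param, center_param, alpha=0.5):
    """One worker<->server elastic update on NumPy copies of a parameter."""
    diff = alpha * (worker_param - center_param)
    worker_param = worker_param - diff     # worker is pulled toward the center
    center_param = center_param + diff     # center is pulled toward the worker
    return worker_param, center_param

w, c = np.array([1.0, 0.0]), np.array([0.0, 0.0])
w, c = easgd_exchange(w, c)
print w, c                                 # both end up at [0.5, 0.]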