def _init_training(self, model):
    model = self._parallel_to_device(model)

    # define the params to optimize.
    params = [
        {
            "params": [value],
            "name": key,
            "weight_decay": self.conf.weight_decay,
            "param_size": value.size(),
            "nelement": value.nelement(),
            "lr": self.conf.lr,
        }
        for key, value in model.named_parameters()
        if value.requires_grad
    ]

    # create the optimizer.
    if self.conf.optimizer == "adam":
        opt = optim.Adam(
            params,
            lr=self.conf.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=self.conf.weight_decay,
        )
    else:
        raise NotImplementedError("this optimizer is not supported yet.")

    opt.zero_grad()
    model.zero_grad()
    self.log_fn(f"Initialize the optimizer: {self.conf.optimizer}")
    return opt, model
def _init_training(self, model):
    model = self._parallel_to_device(model)

    # define the params to optimize; mask parameters may get their own lr.
    params = [
        {
            "params": [value],
            "name": key,
            "weight_decay": self.conf.weight_decay,
            "param_size": value.size(),
            "nelement": value.nelement(),
            "lr": self.conf.lr_for_mask
            if self.conf.lr_for_mask is not None and "mask" in key
            else self.conf.lr,
        }
        for key, value in model.named_parameters()
        if value.requires_grad
    ]

    # create the optimizer.
    if self.conf.optimizer == "adam":
        opt = optim.Adam(
            params,
            lr=self.conf.lr,
            betas=(self.conf.adam_beta_1, self.conf.adam_beta_2),
            eps=self.conf.adam_eps,
            weight_decay=self.conf.weight_decay,
        )
    elif self.conf.optimizer == "sgd":
        opt = torch.optim.SGD(
            params,
            lr=self.conf.lr,
            momentum=self.conf.momentum_factor,
            weight_decay=self.conf.weight_decay,
            nesterov=self.conf.use_nesterov,
        )
    elif self.conf.optimizer == "signsgd":
        opt = optim.SignSGD(
            params,
            lr=self.conf.lr,
            momentum=self.conf.momentum_factor,
            weight_decay=self.conf.weight_decay,
            nesterov=self.conf.use_nesterov,
        )
    else:
        raise NotImplementedError("this optimizer is not supported yet.")

    opt.zero_grad()
    model.zero_grad()
    self.log_fn(f"Initialize the optimizer: {self.conf.optimizer}")
    return opt, model
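# `optim.SignSGD` in the branch above is a project-specific optimizer, so its
# exact behaviour is not shown here. The sketch below only illustrates the
# sign-SGD idea (step with the sign of a momentum-smoothed gradient); the
# helper name, signature, and defaults are invented for this example and are
# not taken from the project's optim module.
import torch

@torch.no_grad()
def signsgd_step(params, lr, momentum=0.9, buffers=None):
    # One hypothetical sign-SGD update: each parameter moves by
    # -lr * sign(momentum buffer), ignoring the gradient's magnitude.
    if buffers is None:
        buffers = [torch.zeros_like(p) for p in params]
    for p, buf in zip(params, buffers):
        if p.grad is None:
            continue
        buf.mul_(momentum).add_(p.grad)   # accumulate momentum
        p.add_(buf.sign(), alpha=-lr)     # step with the sign only
    return buffers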
def test_sample(self, filename):
    # read a reference sample: lr, shapes, initial W and b, a sequence of
    # flattened gradients, and the expected parameters / Adam moments.
    with open(filename, 'r') as o:
        lr = float(o.readline().strip())
        p, q, r = list(map(int, o.readline().split()))

        W = np.zeros([p, q])
        b = np.zeros([r])
        for i in range(p):
            line = list(map(float, o.readline().split()))
            W[i, :] = line
        line = list(map(float, o.readline().split()))
        b[:] = line

        n = int(o.readline().strip())
        grads_flat = np.zeros([n, p * q + r])
        W_ans = np.zeros([p, q])
        b_ans = np.zeros([r])
        m_ans = np.zeros([p * q + r])
        v_ans = np.zeros([p * q + r])
        for i in range(n):
            line = list(map(float, o.readline().split()))
            grads_flat[i, :] = line
        for i in range(p):
            line = list(map(float, o.readline().split()))
            W_ans[i, :] = line
        line = list(map(float, o.readline().split()))
        b_ans[:] = line
        line = list(map(float, o.readline().split()))
        m_ans[:] = line
        line = list(map(float, o.readline().split()))
        v_ans[:] = line

    # run the optimizer on the recorded gradients and compare the resulting
    # parameters and moment estimates against the reference values.
    model = TestModel(W, b)
    optimizer = optim.Adam(model, lr)
    for i in range(n):
        optimizer.update(grads_flat[i])

    self.assertTrue(np.isclose(model.params()[0], W_ans).all(), filename)
    self.assertTrue(np.isclose(model.params()[1], b_ans).all(), filename)
    self.assertTrue(np.isclose(optimizer.m, m_ans).all(), filename)
    self.assertTrue(np.isclose(optimizer.v, v_ans).all(), filename)
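# The reference vectors `m_ans` and `v_ans` above are Adam's first and second
# moment estimates kept as flat arrays over (W, b). The sketch below shows the
# standard bias-corrected Adam update such a test would typically check; the
# function name and the beta/eps defaults are assumptions for illustration,
# not the project's actual `optim.Adam.update` implementation.
import numpy as np

def adam_update(w, grad, m, v, t, lr, beta1=0.9, beta2=0.999, eps=1e-8):
    # One Adam step on a flat parameter vector w at timestep t (1-based).
    m = beta1 * m + (1.0 - beta1) * grad           # first moment estimate
    v = beta2 * v + (1.0 - beta2) * grad ** 2      # second moment estimate
    m_hat = m / (1.0 - beta1 ** t)                 # bias correction
    v_hat = v / (1.0 - beta2 ** t)
    w = w - lr * m_hat / (np.sqrt(v_hat) + eps)    # parameter update
    return w, m, v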
def __init__(self, config_file, config):
    super().__init__(cache=False)

    self.train_language_list = config.train_language_list
    self.n_train_lang = len(config.train_language_list)
    self.main_dev_list = config.main_dev_language_list
    self.n_main_dev_lang = len(config.main_dev_language_list)
    self.add_dev_list = config.add_dev_language_list
    self.n_add_dev_lang = len(config.add_dev_language_list)

    self.subwords_to_vectors = data.get_subwords_to_vectors(config.feature_fn)

    self.data = data.MultilangDataset(
        feats_fns=config.train_feats,
        align_fns=config.train_align,
        vocab_fns=config.train_vocab,
        subwords=config.subwords,
        subwords_to_vectors=self.subwords_to_vectors,
        min_occ_count=config.train_min_occ_count,
        min_seg_dur=config.train_min_seg_dur,
        stack_frames=config.stack_frames,
        batch_size=config.train_batch_size,
        shuffle=config.shuffle,
        cache=self.cache)

    # statistics
    train_subwords = set(
        data.combine_subwords_to_ids(config.train_vocab,
                                     config.subwords).keys())
    log.info(f"Using {len(train_subwords)} subwords in training")

    # dev sets for all training languages
    self.dev_datasets = []
    for i in range(self.n_main_dev_lang + self.n_add_dev_lang):
        data_dev = data.DevDataset(
            feats=config.dev_feats[i],
            align=config.dev_align[i],
            vocab=config.dev_vocab[i],
            subwords=config.subwords,
            min_occ_count=config.dev_min_occ_count,
            min_seg_dur=config.dev_min_seg_dur,
            stack_frames=config.stack_frames,
            batch_size=config.dev_batch_size,
            cache=self.cache,
            subwords_to_vectors=self.subwords_to_vectors)
        self.dev_datasets.append(data_dev)

        # statistics
        if i < self.n_main_dev_lang:
            this_lang = self.main_dev_list[i]
        else:
            this_lang = self.add_dev_list[i - self.n_main_dev_lang]
        this_subwords = set(
            data.combine_subwords_to_ids([config.dev_vocab[i]],
                                         config.subwords))
        log.info(
            f"language {this_lang} has {len(this_subwords)} subwords, "
            f"intersect {len(train_subwords.intersection(this_subwords))} subwords"
        )

    loss_fun = loss.Obj02(margin=config.loss_margin, k=config.loss_k)

    self.net = net.MultiViewRNN_Phonetic(
        config=config,
        feat_dim=self.data.feat_dim,
        phone_feat_dim=self.data.phone_feat_dim,
        loss_fun=loss_fun,
        use_gpu=True)

    self.optim = optim.Adam(params=self.net.parameters(), lr=config.adam_lr)

    self.sched = sched.RevertOnPlateau(
        network=self.net,
        optimizer=self.optim,
        mode=config.mode,
        factor=config.factor,
        patience=config.patience,
        min_lr=config.min_lr)

    save_dir = os.path.join(expt_dir, "save")
    os.makedirs(save_dir, exist_ok=True)
    self.set_savepaths(save_dir=save_dir)
    self.save_dir = save_dir

    self.config_file = config_file
    self.config = config
# load embedding
model.embed.lut.weight = nn.Parameter(TEXT.vocab.vectors)
device = th.device('cuda:0')
model = model.to(device)

embed_params, other_params, wd_params = unpack_params(model.named_parameters())
optimizer = get_wrapper(config['opt_wrapper'])(
    optim.Adam([
        {'params': embed_params, 'lr': 0},
        {'params': other_params, 'lr': config.get('lr', 1e-3)},
        {'params': wd_params, 'lr': config.get('lr', 1e-3), 'weight_decay': 5e-5},
    ]),
    **config.get('opt_attrs', {}))

import time

best_val, test_acc = 1e9, 0
for epoch in range(config['n_epochs']):
    tic = time.time()
    print('epoch {}'.format(epoch))
    print('training...')
    n_layers, m_layers,
    dropouti=config['dropouti'], dropouth=config['dropouth'],
    dropouta=config['dropouta'], dropoutc=config['dropoutc'],
    rel_pos=config['rel_pos'])

# load embedding
model.embed.lut.weight = nn.Parameter(TEXT.vocab.vectors)
device = th.device('cuda:0')
model = model.to(device)

embed_params, other_params, wd_params = unpack_params(model.named_parameters())
optimizer = get_wrapper(config['opt_wrapper'])(
    optim.Adam([
        {'params': embed_params, 'lr': 0},
        {'params': other_params, 'lr': config.get('lr', 1e-3)},
        {'params': wd_params, 'lr': config.get('lr', 1e-3), 'weight_decay': 5e-5},
    ]))

best_val, test_acc = 1e9, 0
for epoch in range(config['n_epochs']):
    print('epoch {}'.format(epoch))
    print('training...')
    model.train()
    n_tokens = 0
    sum_loss = 0
    hit = 0
    for i, batch in enumerate(train_loader):
        batch.y = batch.y.to(device)
        batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
        batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
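# Both training scripts above rely on the same per-parameter-group pattern:
# the pretrained embedding weights sit in a group with 'lr': 0 (frozen), and
# only `wd_params` receive weight decay. A minimal, self-contained sketch of
# that pattern with plain torch.optim.Adam; `TinyModel` and the group choices
# are illustrative, not taken from the scripts above.
import torch.nn as nn
import torch.optim as optim

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(100, 16)   # pretrained table, frozen via lr=0
        self.proj = nn.Linear(16, 2)

    def forward(self, idx):
        return self.proj(self.embed(idx).mean(dim=1))

model = TinyModel()
optimizer = optim.Adam([
    {'params': model.embed.parameters(), 'lr': 0.0},                     # frozen
    {'params': [model.proj.bias], 'lr': 1e-3},                           # no decay
    {'params': [model.proj.weight], 'lr': 1e-3, 'weight_decay': 5e-5},   # decayed
])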
if __name__ == '__main__':
    trainloader = Cifar10('./data/cifar-10-batches-py/',
                          batch_size=32, phase='train', shuffle=True)
    testloader = Cifar10('./data/cifar-10-batches-py/',
                         batch_size=100, phase='test', shuffle=False)
    model = Model()
    model.initialize()
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epoch_list, train_loss_list, train_acc_list, test_loss_list, test_acc_list = [], [], [], [], []
    early_stop = 3
    for i in range(100):
        train_loss, train_acc = train(model, trainloader, criterion, optimizer)
        test_loss, test_acc = test(model, testloader, criterion)

        epoch_list.append(i)
        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)
        test_loss_list.append(test_loss)
        test_acc_list.append(test_acc)

        # spend one unit of the early-stopping budget when test accuracy drops
        if len(test_acc_list) > 1 and test_acc < test_acc_list[-2]:
            early_stop -= 1
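# The script above is cut off right after decrementing `early_stop`, so the
# stopping condition itself is not shown. The helper below is a hypothetical,
# self-contained illustration of how that counter-style rule behaves; the
# break condition and the function are assumptions, not the original code.
def run_early_stopping(test_accs, budget=3):
    # Lose one unit of budget whenever test accuracy drops relative to the
    # previous epoch; stop once the budget is spent.
    remaining = budget
    for epoch, acc in enumerate(test_accs):
        if epoch > 0 and acc < test_accs[epoch - 1]:
            remaining -= 1
        if remaining == 0:
            return epoch                 # stop training at this epoch
    return len(test_accs) - 1            # ran to completion

print(run_early_stopping([0.52, 0.61, 0.60, 0.63, 0.62, 0.61]))  # -> 5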