# --- VAE training loop (script fragment; truncated at the end of the
# visible region: the `with torch.no_grad():` body continues beyond view) ---
# NOTE(review): `step` is initialised but never incremented in the visible
# code, so the `step % 100` logging guard would always fire — confirm the
# increment exists in the truncated part.
step = 0

# Two side-by-side panels for live input-vs-reconstruction display.
# 28x28 suggests MNIST-style images — TODO confirm.
fig, axes = plt.subplots(1, 2)
mats = [None] * 2
mats[0] = axes[0].matshow(np.zeros([28, 28]), cmap='bone', vmin=0., vmax=1.)
mats[1] = axes[1].matshow(np.zeros([28, 28]), cmap='bone', vmin=0., vmax=1.)

for xx, yy in data_loader:
    vae.train()
    optimizer.zero_grad()
    xx = xx.to(device)
    # Forward pass: reconstructions plus the approximate posterior q(z|x).
    x_recons, pz_given_xs = vae(xx, return_pz=True)
    # Negative ELBO with a down-weighted (wz=0.1) latent term.
    nelbo = vae.calc_nelbo(xx, x_recons, pz_given_xs, wz=0.1)
    # Mutual-information regulariser on the posterior, weight 0.1.
    mi_loss = vae.calc_mi_loss(pz_given_xs)
    loss = nelbo + mi_loss * 0.1
    # TODO: adversarial net can be used to keep mi_loss recon on natural manifold of data points
    if step % 100 == 0:
        # NOTE(review): logs the NELBO only, not the total `loss`.
        print(f'loss step {step}: {nelbo.item():.2f}')
        with torch.no_grad():
            plt.ion()
            # NOTE(review): if `device` is a GPU, `.numpy()` on `xx[0]`
            # would fail without `.cpu()` first — confirm device handling.
            x = xx[0].numpy().reshape(28, 28)
def train(**kwargs):
    """Train a QA-matching model on the Small dataset.

    Keyword args are merged into the global ``opt`` config. Each epoch
    visits the data in a fixed random permutation, accumulates per-example
    losses into a batch loss, steps the optimizer per batch, and saves a
    checkpoint per epoch. The learning rate is decayed whenever the epoch
    loss fails to improve.

    NOTE(review): ``KLDivLoss`` expects log-probabilities as input —
    confirm ``model(...)`` returns log-probs (e.g. via log_softmax).
    """
    opt.parse(kwargs)
    if opt.vis:
        vis = Visualizer(opt.env)

    # step 1: configure model
    model = getattr(models, opt.model)(opt)
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step 2: data
    train_data = Small(opt.train_root,
                       wv_path=opt.word2vec_path,
                       stopwords_path=opt.stopwords_path,
                       idf_path=opt.idf_train_path,
                       train=True)
    # val_data = Small(opt.train_root,
    #                  wv_path=opt.word2vec_path,
    #                  stopwords_path=opt.stopwords_path,
    #                  train=False)
    data_size = len(train_data)
    # One fixed shuffle of example indices, reused every epoch.
    indices = t.randperm(data_size)

    # step 3: criterion and optimizer
    criterion = t.nn.KLDivLoss()
    lr = opt.lr
    optimizer = Adamax(model.parameters(),
                       lr=lr,
                       weight_decay=opt.weight_decay)

    # step 4: meters
    previous_loss = float('inf')

    # train
    for epoch in range(opt.max_epoch):
        for i in tqdm(range(0, data_size, opt.batch_size)):
            # The final batch may be smaller than opt.batch_size.
            batch_size = min(opt.batch_size, data_size - i)

            # train_model: accumulate per-example losses over the batch.
            loss = 0.
            for j in range(0, batch_size):
                idx = indices[i + j]
                q, a, label, shallow_features = train_data[idx]
                input_q, input_a, shallow_features = Variable(q), Variable(
                    a), Variable(shallow_features)
                target = Variable(label)
                if opt.use_gpu:
                    input_q = input_q.cuda()
                    input_a = input_a.cuda()
                    shallow_features = shallow_features.cuda()
                    target = target.cuda()
                score = model(input_q, input_a, shallow_features)
                example_loss = criterion(score, target)
                loss += example_loss

            # FIX: average over the *actual* batch size, not opt.batch_size,
            # so the last (possibly partial) batch is not under-weighted.
            loss /= batch_size
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.save(model.module_name + '_' + str(epoch) + '.pth')

        print('epoch:{epoch}, lr:{lr}, loss:{loss}'.format(epoch=epoch,
                                                           loss=loss.data,
                                                           lr=lr))
        # # validate and visualize
        # map, mrr = val(model, val_data)
        #
        # print('epoch:{epoch}, lr:{lr}, loss:{loss}, map:{map}, mrr:{mrr}'.format(
        #     epoch=epoch,
        #     loss=loss.data,
        #     map=map,
        #     mrr=mrr,
        #     lr=lr
        # ))

        # update learning rate: decay when the loss did not improve.
        if (loss.data > previous_loss).all():
            lr = lr * opt.lr_decay
            # FIX: propagate the decayed rate into the optimizer — previously
            # only the local `lr` variable changed, so the optimizer kept
            # training at the original learning rate forever.
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss.data
def train(**kwargs):
    """Train the AGAH cross-modal (image/text) hashing model.

    Per batch: a WGAN-GP style adversarial step for the text and image
    feature discriminators, then a generator/hashing step combining triplet,
    balance, classification, code-map, quantisation and adversarial losses.
    Per epoch: refreshes the label code/feature maps and optionally
    validates, checkpointing on best MAP.
    """
    opt.parse(kwargs)

    if opt.vis_env:
        vis = Visualizer(opt.vis_env, port=opt.vis_port)

    # FIX: compare string values with `==`, not identity (`is`) — `is` on a
    # str literal is implementation-dependent and a SyntaxWarning in
    # modern CPython.
    if opt.device is None or opt.device == 'cpu':
        opt.device = torch.device('cpu')
    else:
        opt.device = torch.device(opt.device)

    images, tags, labels = load_data(opt.data_path, type=opt.dataset)

    train_data = Dataset(opt, images, tags, labels)
    train_dataloader = DataLoader(train_data,
                                  batch_size=opt.batch_size,
                                  shuffle=True)

    # valid or test data
    x_query_data = Dataset(opt, images, tags, labels, test='image.query')
    x_db_data = Dataset(opt, images, tags, labels, test='image.db')
    y_query_data = Dataset(opt, images, tags, labels, test='text.query')
    y_db_data = Dataset(opt, images, tags, labels, test='text.db')

    x_query_dataloader = DataLoader(x_query_data, opt.batch_size, shuffle=False)
    x_db_dataloader = DataLoader(x_db_data, opt.batch_size, shuffle=False)
    y_query_dataloader = DataLoader(y_query_data, opt.batch_size, shuffle=False)
    y_db_dataloader = DataLoader(y_db_data, opt.batch_size, shuffle=False)

    query_labels, db_labels = x_query_data.get_labels()
    query_labels = query_labels.to(opt.device)
    db_labels = db_labels.to(opt.device)

    # FIX: `pretrain_model` was left undefined when neither a checkpoint nor
    # a pretrain path was configured, raising NameError at AGAH(...) below.
    pretrain_model = None
    if not opt.load_model_path and opt.pretrain_model_path:
        pretrain_model = load_pretrain_model(opt.pretrain_model_path)

    model = AGAH(opt.bit,
                 opt.tag_dim,
                 opt.num_label,
                 opt.emb_dim,
                 lambd=opt.lambd,
                 pretrain_model=pretrain_model).to(opt.device)
    load_model(model, opt.load_model_path)

    # Image backbone trains at the base lr; the remaining modules at 10x.
    optimizer = Adamax([{
        'params': model.img_module.parameters(),
        'lr': opt.lr
    }, {
        'params': model.txt_module.parameters()
    }, {
        'params': model.hash_module.parameters()
    }, {
        'params': model.classifier.parameters()
    }], lr=opt.lr * 10, weight_decay=0.0005)

    optimizer_dis = {
        'img': Adamax(model.img_discriminator.parameters(),
                      lr=opt.lr * 10, betas=(0.5, 0.9), weight_decay=0.0001),
        'txt': Adamax(model.txt_discriminator.parameters(),
                      lr=opt.lr * 10, betas=(0.5, 0.9), weight_decay=0.0001)
    }

    criterion_tri_cos = TripletAllLoss(dis_metric='cos', reduction='sum')
    criterion_bce = nn.BCELoss(reduction='sum')

    loss = []

    max_mapi2t = 0.
    max_mapt2i = 0.

    # Running per-example feature / hash-code buffers for the training set.
    FEATURE_I = torch.randn(opt.training_size, opt.emb_dim).to(opt.device)
    FEATURE_T = torch.randn(opt.training_size, opt.emb_dim).to(opt.device)
    U = torch.randn(opt.training_size, opt.bit).to(opt.device)
    V = torch.randn(opt.training_size, opt.bit).to(opt.device)

    # Per-label prototype features and binary codes.
    FEATURE_MAP = torch.randn(opt.num_label, opt.emb_dim).to(opt.device)
    CODE_MAP = torch.sign(torch.randn(opt.num_label, opt.bit)).to(opt.device)

    train_labels = train_data.get_labels().to(opt.device)

    mapt2i_list = []
    mapi2t_list = []
    train_times = []

    for epoch in range(opt.max_epoch):
        t1 = time.time()
        for i, (ind, x, y, l) in tqdm(enumerate(train_dataloader)):
            imgs = x.to(opt.device)
            tags = y.to(opt.device)
            labels = l.to(opt.device)

            batch_size = len(ind)

            h_x, h_y, f_x, f_y, x_class, y_class = model(
                imgs, tags, FEATURE_MAP)

            FEATURE_I[ind] = f_x.data
            FEATURE_T[ind] = f_y.data
            U[ind] = h_x.data
            V[ind] = h_y.data

            #####
            # train txt discriminator (WGAN-GP critic step)
            #####
            D_txt_real = model.dis_txt(f_y.detach())
            D_txt_real = -D_txt_real.mean()
            optimizer_dis['txt'].zero_grad()
            D_txt_real.backward()

            # train with fake
            D_txt_fake = model.dis_txt(f_x.detach())
            D_txt_fake = D_txt_fake.mean()
            D_txt_fake.backward()

            # train with gradient penalty
            alpha = torch.rand(batch_size, opt.emb_dim).to(opt.device)
            interpolates = alpha * f_y.detach() + (1 - alpha) * f_x.detach()
            interpolates.requires_grad_()
            disc_interpolates = model.dis_txt(interpolates)
            gradients = autograd.grad(
                outputs=disc_interpolates,
                inputs=interpolates,
                grad_outputs=torch.ones(
                    disc_interpolates.size()).to(opt.device),
                create_graph=True,
                retain_graph=True,
                only_inputs=True)[0]
            gradients = gradients.view(gradients.size(0), -1)
            # 10 is gradient penalty hyperparameter
            txt_gradient_penalty = (
                (gradients.norm(2, dim=1) - 1)**2).mean() * 10
            txt_gradient_penalty.backward()

            loss_D_txt = D_txt_real - D_txt_fake
            optimizer_dis['txt'].step()

            #####
            # train img discriminator (mirror of the txt critic step)
            #####
            D_img_real = model.dis_img(f_x.detach())
            D_img_real = -D_img_real.mean()
            optimizer_dis['img'].zero_grad()
            D_img_real.backward()

            # train with fake
            D_img_fake = model.dis_img(f_y.detach())
            D_img_fake = D_img_fake.mean()
            D_img_fake.backward()

            # train with gradient penalty
            alpha = torch.rand(batch_size, opt.emb_dim).to(opt.device)
            interpolates = alpha * f_x.detach() + (1 - alpha) * f_y.detach()
            interpolates.requires_grad_()
            disc_interpolates = model.dis_img(interpolates)
            gradients = autograd.grad(
                outputs=disc_interpolates,
                inputs=interpolates,
                grad_outputs=torch.ones(
                    disc_interpolates.size()).to(opt.device),
                create_graph=True,
                retain_graph=True,
                only_inputs=True)[0]
            gradients = gradients.view(gradients.size(0), -1)
            # 10 is gradient penalty hyperparameter
            img_gradient_penalty = (
                (gradients.norm(2, dim=1) - 1)**2).mean() * 10
            img_gradient_penalty.backward()

            loss_D_img = D_img_real - D_img_fake
            optimizer_dis['img'].step()

            #####
            # train generators
            #####
            # update img network (to generate txt features)
            domain_output = model.dis_txt(f_x)
            loss_G_txt = -domain_output.mean()

            # update txt network (to generate img features)
            domain_output = model.dis_img(f_y)
            loss_G_img = -domain_output.mean()

            loss_adver = loss_G_txt + loss_G_img

            # Cross-modal triplet losses (cosine metric) in both directions.
            loss1 = criterion_tri_cos(h_x, labels, target=h_y,
                                      margin=opt.margin)
            loss2 = criterion_tri_cos(h_y, labels, target=h_x,
                                      margin=opt.margin)

            # Bit-balance term: push |h| towards the all-ones direction.
            theta1 = F.cosine_similarity(torch.abs(h_x),
                                         torch.ones_like(h_x).to(opt.device))
            theta2 = F.cosine_similarity(torch.abs(h_y),
                                         torch.ones_like(h_y).to(opt.device))
            loss3 = torch.sum(1 / (1 + torch.exp(theta1))) + torch.sum(
                1 / (1 + torch.exp(theta2)))

            loss_class = criterion_bce(x_class, labels) + criterion_bce(
                y_class, labels)

            theta_code_x = h_x.mm(CODE_MAP.t())  # size: (batch, num_label)
            theta_code_y = h_y.mm(CODE_MAP.t())
            loss_code_map = torch.sum(torch.pow(theta_code_x - opt.bit * (labels * 2 - 1), 2)) + \
                            torch.sum(torch.pow(theta_code_y - opt.bit * (labels * 2 - 1), 2))

            # Quantisation loss: distance of continuous codes to {-1, +1}.
            loss_quant = torch.sum(torch.pow(h_x - torch.sign(h_x), 2)) + torch.sum(
                torch.pow(h_y - torch.sign(h_y), 2))

            # err = loss1 + loss2 + loss3 + 0.5 * loss_class + 0.5 * (loss_f1 + loss_f2)
            err = loss1 + loss2 + opt.alpha * loss3 + opt.beta * loss_class + opt.gamma * loss_code_map + \
                  opt.eta * loss_quant + opt.mu * loss_adver

            optimizer.zero_grad()
            err.backward()
            optimizer.step()

            loss.append(err.item())

        # Refresh per-label code/feature prototypes from this epoch's buffers.
        CODE_MAP = update_code_map(U, V, CODE_MAP, train_labels)
        FEATURE_MAP = update_feature_map(FEATURE_I, FEATURE_T, train_labels)

        print('...epoch: %3d, loss: %3.3f' % (epoch + 1, loss[-1]))
        delta_t = time.time() - t1

        if opt.vis_env:
            vis.plot('loss', loss[-1])

        # validate
        if opt.valid and (epoch + 1) % opt.valid_freq == 0:
            mapi2t, mapt2i = valid(model, x_query_dataloader,
                                   x_db_dataloader, y_query_dataloader,
                                   y_db_dataloader, query_labels, db_labels,
                                   FEATURE_MAP)
            print(
                '...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f'
                % (epoch + 1, mapi2t, mapt2i))

            mapi2t_list.append(mapi2t)
            mapt2i_list.append(mapt2i)
            train_times.append(delta_t)

            if opt.vis_env:
                d = {'mapi2t': mapi2t, 'mapt2i': mapt2i}
                vis.plot_many(d)

            # Checkpoint only when BOTH directions improve simultaneously.
            if mapt2i >= max_mapt2i and mapi2t >= max_mapi2t:
                max_mapi2t = mapi2t
                max_mapt2i = mapt2i
                save_model(model)
                path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit)
                with torch.cuda.device(opt.device):
                    torch.save(FEATURE_MAP,
                               os.path.join(path, 'feature_map.pth'))

        # Step-decay the learning rate every 100 epochs, floored at 1e-6.
        if epoch % 100 == 0:
            for params in optimizer.param_groups:
                params['lr'] = max(params['lr'] * 0.6, 1e-6)

    if not opt.valid:
        save_model(model)

    print('...training procedure finish')
    if opt.valid:
        print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' %
              (max_mapi2t, max_mapt2i))
    else:
        mapi2t, mapt2i = valid(model, x_query_dataloader, x_db_dataloader,
                               y_query_dataloader, y_db_dataloader,
                               query_labels, db_labels, FEATURE_MAP)
        print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' %
              (mapi2t, mapt2i))

    path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit)
    with open(os.path.join(path, 'result.pkl'), 'wb') as f:
        pickle.dump([train_times, mapi2t_list, mapt2i_list], f)
        # --- fragment truncated at BOTH edges: the `except` below closes a
        # `try` opened before this view, and the final `try` in the training
        # loop has its handler beyond this view. Indentation reconstructed —
        # TODO confirm against the original file. ---
        except:
            # NOTE(review): bare except stops evaluation on ANY error —
            # presumably StopIteration from an exhausted batch generator;
            # consider catching StopIteration explicitly.
            break
    acc = metric.accuracy(predictTags, devTagList)
    print('accuracy: ', acc)
else:
    # Training branch: build an id->tag lookup and fit a BiLSTM tagger.
    id2tag = dict((id, tag) for tag, id in tag2id.items())
    # print(id2tag)
    # BiLSTM(vocab_size, embed_dim=100, hidden_dim=128, num_tags) —
    # presumably; confirm the constructor signature.
    biLstm = BiLSTM(len(word2id) + 1, 100, 128, len(tag2id))
    optimer = Adamax(biLstm.parameters(), lr=0.001)
    # NOTE(review): bestAccuracy is initialised but never used in the
    # visible code.
    bestAccuracy = 0.0
    for epoch in range(30):
        print('epoch: ', epoch)
        trainDatas = utils.batch_data(trainWordLists, trainTagLists,
                                      word2id, tag2id)
        while 1:
            try:
                optimer.zero_grad()
                sentence, tag = trainDatas.__next__()
                predictScores = biLstm(torch.LongTensor(sentence))
                loss = 0
                # print(len(sentence), len(tag))
                for i in range(len(sentence)):
                    # print(len(sentence[i]), len(tag[i]))
                    for j in range(len(tag[i])):
                        # print('tag', tag[i][j], 'score:', predictScores[i][j][tag[i][j]])
                        # Log-likelihood of the gold tag at each position.
                        loss += torch.log(predictScores[i][j][tag[i][j]])
                # print('loss: ', loss)
                # Negative mean log-likelihood per sentence in the batch.
                loss = -loss / len(sentence)
                loss.backward()
                optimer.step()
                print('loss:', loss)
class GymLearner(Trainable):
    """Learns a NeuralGameEngine world model from gym-style trajectory batches.

    Combines an observation reconstruction loss (MSE), a reward
    classification loss (weighted cross-entropy) and a saturation penalty;
    optionally decays the learning rate on plateau and persists the trained
    model plus training history.

    NOTE(review): attributes such as ``self._device``, ``self._logger``,
    ``self._mr``, ``self._summary_writer`` and ``self.get_name()`` are
    assumed to be provided by ``Trainable`` — confirm.
    """

    def __init__(self,
                 hyperparameters,
                 data_generator,
                 initial_state_generator=None,
                 trace_handler=None,
                 summary_writer=None):
        super().__init__('NGE_Learner',
                         moderage_category=None,
                         moderage_data_id=None,
                         summary_writer=summary_writer)

        self._hyperparameters = hyperparameters
        self._data_generator = data_generator

        # Unpack the hyper-parameters used throughout training.
        self._state_channels = hyperparameters['state_channels']
        self._saturation_cost_weight = hyperparameters[
            'saturation_cost_weight']
        self._saturation_limit = hyperparameters['saturation_limit']
        self._gradient_clip = hyperparameters['gradient_clip']
        self._observation_noise_std = hyperparameters['observation_noise_std']
        self._reward_loss_coeff = hyperparameters['reward_loss_coeff']
        self._reward_state_channels = hyperparameters['reward_state_channels']
        self._reward_class_weight = hyperparameters['reward_class_weight']
        # NOTE(review): duplicate assignment — _state_channels was already
        # set a few lines above.
        self._state_channels = hyperparameters['state_channels']
        self._batch_size = hyperparameters['batch_size']
        self._learning_rate_patience = hyperparameters[
            'learning_rate_patience']
        self._learning_rate_decay_factor = hyperparameters[
            'learning_rate_decay_factor']
        # Number of engine iterations unrolled per forward pass.
        self._iterations = hyperparameters['ngpu_iterations']

        self._num_actions = data_generator.get_num_actions()

        self._initial_state_generator = initial_state_generator

        self._model = NeuralGameEngine(
            self._state_channels,
            self._reward_state_channels,
            self._num_actions,
            observation_noise_std=self._observation_noise_std,
            saturation_limit=self._saturation_limit,
            trace_handler=trace_handler,
            summary_writer=summary_writer,
        ).to(self._device)

        self._optimizer = Adamax(self._model.parameters(),
                                 lr=hyperparameters['learning_rate'])

        # Optional LR decay when the training loss plateaus.
        if self._learning_rate_patience is not None:
            self._scheduler = ReduceLROnPlateau(
                self._optimizer,
                mode='min',
                factor=self._learning_rate_decay_factor,
                verbose=True,
                patience=self._learning_rate_patience)

        self._mse_observation_loss_criterion = MSELoss().to(self._device)
        self._ce_reward_loss_criterion = CrossEntropyLoss(
            weight=torch.tensor(self._reward_class_weight)).to(self._device)

        self._logger.info('Created Automata Learner')
        self._logger.info(f'Data Generator: {data_generator.get_name()}')
        self._logger.info(f'State channels: {self._state_channels}')

    def is_training(self):
        """Return True while the underlying model is in train mode."""
        return self._model.training

    def _get_lr(self):
        """Return the learning rate of the first optimizer param group."""
        for param_group in self._optimizer.param_groups:
            return param_group['lr']

    def _loss(self, predictions, t_batch, saturation_cost=None):
        """Compute total training loss and detached per-component metrics.

        Returns (total_loss tensor, dict of numpy scalars/metrics).

        NOTE(review): ``saturation_cost`` defaults to None but is multiplied
        unconditionally below, which would raise TypeError — callers always
        pass it; consider removing the default or guarding.
        """
        observation_targets = t_batch['expected_observation_batch']
        observation_predictions = predictions['observation_predictions']

        reward_targets = t_batch['expected_reward_batch']
        reward_predictions = predictions['reward_predictions']

        batch_size = observation_targets.shape[0]

        loss_components = {}

        # Calculate mean square error loss component
        mse_observation_loss = self._mse_observation_loss_criterion(
            observation_predictions, observation_targets)
        loss_components['mse_observation_loss'] = mse_observation_loss

        # Calculate cross entropy loss for reward
        reward_target_class = reward_targets.type(torch.long)
        ce_reward_loss = self._ce_reward_loss_criterion(
            reward_predictions, reward_target_class)

        reward_predictions_np = np.argmax(
            reward_predictions.detach().cpu().numpy(), axis=1)
        reward_target_np = reward_target_class.detach().cpu().numpy()

        reward_precision, reward_recall, reward_f1, reward_bacc = calc_precision_recall_f1_bacc(
            reward_predictions_np, reward_target_np)

        # Calculate saturation cost loss
        loss_components[
            'saturation_loss'] = saturation_cost * self._saturation_cost_weight

        total_loss = torch.sum(
            torch.stack([loss for _, loss in loss_components.items()]))

        # The reward CE term is registered AFTER the stacked sum so it can
        # carry its own coefficient rather than unit weight.
        loss_components['ce_reward_loss'] = ce_reward_loss
        total_loss += ce_reward_loss * self._reward_loss_coeff

        detached_loss_components = {
            k: loss.detach().cpu().numpy()
            for k, loss in loss_components.items()
        }

        detached_loss_components['reward_precision'] = reward_precision
        detached_loss_components['reward_recall'] = reward_recall
        detached_loss_components['reward_bacc'] = reward_bacc
        detached_loss_components['reward_f1'] = reward_f1

        # Fraction of batch items that have any positive reward target.
        reward_rate = (reward_targets.detach().cpu().numpy().sum(axis=1) >
                       0).sum() / batch_size
        detached_loss_components['reward_rate'] = reward_rate

        return total_loss, detached_loss_components

    def forward(self, t_batch, steps=1, trace=False):
        """Run the engine forward for `steps` simulation steps."""
        inputs = t_batch['input_observation_batch']
        actions = t_batch['input_action_batch']
        return self._model.forward(inputs,
                                   actions=actions,
                                   steps=steps,
                                   trace=trace)

    def train_batches(self):
        """Train over one generated set of batches; return (mean loss, collector)."""
        training_batches = self._data_generator.generate_samples(
            batch_size=self._batch_size)

        train_batch_losses = []
        loss_component_collector = LossComponentCollector()
        for training_batch in training_batches:
            t_prepared_batch = self._model.prepare_batch(training_batch)
            batch_loss, loss_components_batch = self.train_batch(
                t_prepared_batch)
            train_batch_losses.append(batch_loss)
            loss_component_collector.append_loss_components_batch(
                loss_components_batch)

        return np.mean(train_batch_losses), loss_component_collector

    def eval(self, t_batch, trace=False):
        """Evaluate one batch; return ((loss ndarray, components), predictions).

        NOTE(review): calls loss.backward() with no optimizer step —
        presumably to populate gradients for inspection/tracing; confirm
        this is intentional (gradients will accumulate across calls).
        """
        # Get predictions
        predictions, saturation_costs = self.forward(t_batch,
                                                     steps=self._iterations,
                                                     trace=trace)

        # Calculate losses
        loss, loss_components = self._loss(predictions, t_batch,
                                           saturation_costs)

        # Get loss
        loss.backward()

        # Return the loss from the single batch step
        return (loss.data.detach().cpu().numpy(),
                loss_components), predictions

    def train_batch(self, t_batch):
        """One optimisation step on a prepared batch; return (loss, components)."""
        # Get predictions
        predictions, saturation_cost = self.forward(t_batch,
                                                    steps=self._iterations)

        # Calculate losses
        total_loss, loss_components = self._loss(predictions, t_batch,
                                                 saturation_cost)

        # Update the weights
        self._optimizer.zero_grad()
        total_loss.backward()

        # clip gradient
        torch.nn.utils.clip_grad_norm_(self._model.parameters(),
                                       self._gradient_clip)

        self._optimizer.step()

        return total_loss.data.detach().cpu().numpy(), loss_components

    def train(self,
              training_epochs,
              checkpoint_callback=None,
              callback_epoch=10,
              **kwargs):
        """Main training loop; return (saved experiment, trained model)."""
        # Rolling window (500) over per-epoch mean loss components.
        training_mean_loss_component_collector = LossComponentCollector(500)

        for e in range(training_epochs):
            self._epoch = e
            # Model is put in eval mode while the checkpoint callback runs.
            self._model.eval()

            # If we want to do something at specific points during training
            # then we can set a checkpoint callback
            if checkpoint_callback is not None and self._epoch % callback_epoch == 0:
                checkpoint_callback(e)

            self._model.train()
            training_loss, training_loss_components = self.train_batches()

            training_mean_loss_components = training_loss_components.get_means(
            )
            training_mean_loss_component_collector.append_loss_components_batch(
                training_mean_loss_components)

            debug_string = ', '.join([
                f'{k}: {v:.4f}'
                for k, v in training_mean_loss_component_collector.
                get_window_mean().items()
            ])

            self._logger.info(
                f'Epoch [{e + 1}/{training_epochs}], Lr: {self._get_lr():.4f}, {debug_string}'
            )

            if self._summary_writer is not None:
                for component_key, component_value in training_mean_loss_component_collector.get_window_mean(
                ).items():
                    self._summary_writer.add_scalars(
                        f'{self.get_name()}/training/{component_key}',
                        {component_key: component_value}, e)

            if self._learning_rate_patience is not None:
                self._scheduler.step(training_loss)

        experiment = self.save(
            training_epochs=training_epochs,
            training_loss_components=training_mean_loss_component_collector,
        )

        return experiment, self._model

    def _generate_initial_state_files(self):
        """Save one initial state per training level; return file records."""
        if self._initial_state_generator is None:
            return []

        params = self._initial_state_generator.get_generator_params()
        levels = params['train']
        initial_states = self._initial_state_generator.generate_samples(1)
        initial_state_files = self._get_initial_states(initial_states, levels)
        return initial_state_files

    def _get_initial_states(self, batch, envs):
        """Write each env's first observation to a .npy file; return records."""
        initial_state_files = []
        for i, env in enumerate(envs):
            # Swap axes 0<->2 (CHW -> HWC, presumably — TODO confirm) and
            # rescale [0,1] floats to uint8 [0,255].
            initial_state = np.array(
                np.swapaxes(batch[i]['input_observation_batch'][0], 2, 0) *
                255.0).astype(np.uint8)
            state_filename = f'{env}_initial.npy'
            np.save(state_filename, initial_state)
            initial_state_files.append({
                'filename': state_filename,
                'caption': f'Initial state for training level: {env}'
            })
        return initial_state_files

    def save(self, training_epochs, training_loss_components):
        """Persist model weights, history CSV and metadata via self._mr."""
        filename = 'model.tch'
        # NOTE(review): the file handle from open() is never closed here.
        torch.save(self._model.saveable(), open(filename, 'wb'))

        training_history_csv = self._create_training_history_csv(
            'training_history.csv', training_loss_components.get_history())

        train_final_values = {
            f'train_{k}_final': f'{v:.8f}'
            for k, v in training_loss_components.get_window_mean().items()
        }

        meta = {
            'epochs': training_epochs,
            **self._hyperparameters,
            'data_generator': self._data_generator.get_name(),
            'action_map': self._data_generator.get_action_mapping(),
            **self._data_generator.get_generator_params(),
            **train_final_values,
        }

        files = [{
            'filename': training_history_csv,
            'caption': 'Training history'
        }, {
            'filename': filename,
            'caption':
            f'{self.get_name()}-{self._data_generator.get_name()}-model'
        }]

        files.extend(self._generate_initial_state_files())

        return self._mr.save(f'{self.get_name()}', meta, files=files)

    def _create_training_history_csv(self, filename, history_data):
        """Write history rows to a CSV file; return the filename."""
        dataframe = pd.DataFrame(history_data)
        dataframe.to_csv(filename, header=True)
        return filename
# Generate a dataset data_d = next(iter(trainloader))[0] labels_d = torch.ones([data_to_load, 1]).to(device) # Create a fake dataset data_g = G.forward(torch.rand([data_to_load, coding_dim]).to(device)).to(device) labels_g = torch.zeros([data_to_load, 1]).to(device) # Train the discriminator using these datasets x_data_ = torch.cat((data_d, data_g), 0) y_data_ = torch.cat((labels_d, labels_g), 0) index = torch.randperm(2 * data_to_load).long().to(device) x_data = x_data_.index_select(0, index).detach() y_data = y_data_.index_select(0, index).float().detach() optimizer_D.zero_grad() y_hat = D.forward(x_data) l = loss(y_hat, y_data) l.backward() optimizer_D.step() # Mean number of correctly sorted : correct = (y_hat.round() == y_data).double().mean() if (correct > 0.5 + tol_d): break # Validation score after the training # Generate a dataset data_d = next(iter(trainloader))[0]