def generate_optimizer(opt, lr, params):
    # betas, gamma, momentum and eps are expected to be defined in the enclosing scope.
    # params = [param for name, param in params if param.requires_grad]
    params = [param for name, param in params]
    if opt == 'adamw':
        return AdamW(params, lr=lr, betas=betas, weight_decay=gamma)
    if opt == 'adam':
        return Adam(params, lr=lr, betas=betas, weight_decay=gamma, eps=eps)
    elif opt == 'sgd':
        return SGD(params, lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    elif opt == 'adamax':
        return Adamax(params, lr=lr, betas=betas, weight_decay=gamma, eps=eps)
    else:
        raise ValueError('Unknown optimization algorithm: %s' % opt)
def generate_optimizer(opt, lr, params):
    if opt == 'adam':
        return Adam(params, lr=lr, betas=betas, weight_decay=gamma, eps=eps)
    elif opt == 'sgd':
        return SGD(params, lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    elif opt == 'adamax':
        return Adamax(params, lr=lr, betas=betas, weight_decay=gamma, eps=eps)
    else:
        raise ValueError('Unknown optimization algorithm: %s' % opt)
def generate_optimizer(opt, lr, params):
    params = filter(lambda param: param.requires_grad, params)
    if opt == 'adam':
        return Adam(params, lr=lr, betas=betas, weight_decay=gamma)
    elif opt == 'sgd':
        return SGD(params, lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    elif opt == 'adamax':
        return Adamax(params, lr=lr, betas=betas, weight_decay=gamma)
    else:
        raise ValueError('Unknown optimization algorithm: %s' % opt)
def generate_optimizer(config, params):
    params = filter(lambda param: param.requires_grad, params)
    if config.opt == 'adam':
        return Adam(params, lr=config.lr, betas=config.betas,
                    weight_decay=config.gamma, eps=config.ada_eps)
    elif config.opt == 'sgd':
        return SGD(params, lr=config.lr, momentum=config.momentum,
                   weight_decay=config.start_decay, nesterov=True)
    elif config.opt == 'adamax':
        return Adamax(params, lr=config.lr, betas=config.betas,
                      weight_decay=config.start_decay, eps=config.ada_eps)
    else:
        raise ValueError('Unknown optimization algorithm: %s' % config.opt)
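# The factory functions above read hyperparameters (betas, gamma, momentum, eps) from an
# enclosing scope or a config object. A minimal self-contained sketch, assuming the caller
# passes every hyperparameter explicitly; the function and default values here are
# illustrative, not taken from the original snippets:
from torch.optim import Adam, Adamax, SGD

def build_optimizer(opt, params, lr, betas=(0.9, 0.999), momentum=0.9,
                    weight_decay=0.0, eps=1e-8):
    params = [p for p in params if p.requires_grad]  # skip frozen parameters
    if opt == 'adam':
        return Adam(params, lr=lr, betas=betas, weight_decay=weight_decay, eps=eps)
    elif opt == 'sgd':
        return SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
    elif opt == 'adamax':
        return Adamax(params, lr=lr, betas=betas, weight_decay=weight_decay, eps=eps)
    raise ValueError('Unknown optimization algorithm: %s' % opt)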
def init_optimizer(self, state_dict=None):
    if self.args.fix_embeddings:
        for p in self.network.embedding.parameters():
            p.requires_grad = False
    parameters = [p for p in self.network.parameters() if p.requires_grad]
    self.optimizer = SGD(parameters, self.args.learning_rate,
                         momentum=self.args.momentum,
                         weight_decay=self.args.weight_decay
                         ) if self.args.optimizer == 'sgd' else Adamax(
                             parameters, weight_decay=self.args.weight_decay)
def adamax(parameters):
    # pick defaults
    if "betas" not in parameters["optimizer"]:
        parameters["optimizer"]["betas"] = (0.9, 0.999)
    if "weight_decay" not in parameters["optimizer"]:
        parameters["optimizer"]["weight_decay"] = 0.00005
    if "eps" not in parameters["optimizer"]:
        parameters["optimizer"]["eps"] = 1e-8
    return Adamax(
        parameters["model_parameters"],
        lr=parameters["learning_rate"],
        betas=parameters["optimizer"]["betas"],
        weight_decay=parameters["optimizer"]["weight_decay"],
        eps=parameters["optimizer"]["eps"],
    )
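# The same defaulting can be written more compactly with dict.setdefault. A sketch under the
# assumption that `parameters` has the same structure as above; `adamax_compact` is an
# illustrative name, not part of the original code:
from torch.optim import Adamax

def adamax_compact(parameters):
    opt_cfg = parameters["optimizer"]
    opt_cfg.setdefault("betas", (0.9, 0.999))        # same fallbacks as the function above
    opt_cfg.setdefault("weight_decay", 0.00005)
    opt_cfg.setdefault("eps", 1e-8)
    return Adamax(parameters["model_parameters"],
                  lr=parameters["learning_rate"],
                  betas=opt_cfg["betas"],
                  weight_decay=opt_cfg["weight_decay"],
                  eps=opt_cfg["eps"])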
def _setup_optim(self):
    if self.train_config.optim_choice == 'lamb':
        self.optim = Lamb(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=self.train_config.lr,
            betas=self.train_config.betas,
            weight_decay=self.train_config.weight_decay,
            adam=False,
        )
    elif self.train_config.optim_choice == 'adam':
        self.optim = Adam(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=self.train_config.lr,
            betas=self.train_config.betas,
            weight_decay=self.train_config.weight_decay,
        )
    elif self.train_config.optim_choice == 'adamax':
        self.optim = Adamax(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=self.train_config.lr,
            betas=self.train_config.betas,
            weight_decay=self.train_config.weight_decay,
        )
        self.optim_schedule = CosineAnnealingLR(self.optim, T_max=10, eta_min=1e-7)
    elif self.train_config.optim_choice == 'adam_with_warmup':
        self.optim = Adam(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            betas=self.train_config.betas,
            weight_decay=self.train_config.weight_decay,
        )
        self.optim_schedule = ScheduledOptim(
            optimizer=self.optim,
            hidden_size=256,
            n_warmup_steps=self.train_config.warmup_steps,
        )
    else:
        raise ValueError("Invalid optimizer choice: {}".format(self.train_config.optim_choice))
def initialize_optimizer(params, cfg):
    """
    Create an optimizer for the given params based on the given cfg.

    :param params: The parameters of the model we optimize.
    :param cfg: The config from which we configure the optimizer.
    :returns: An optimizer for given `params` based on the `cfg`.
    """
    optimizer = cfg.optimizer.lower()
    assert optimizer in ["adam", "adadelta", "adamax", "rmsprop", "adagrad"]
    if optimizer == "adam":
        return Adam(params, lr=cfg.learning_rate)
    if optimizer == "adadelta":
        return Adadelta(params, lr=cfg.learning_rate)
    if optimizer == "adamax":
        return Adamax(params, lr=cfg.learning_rate)
    if optimizer == "rmsprop":
        return RMSprop(params, lr=cfg.learning_rate)
    if optimizer == "adagrad":
        return Adagrad(params, lr=cfg.learning_rate,
                       initial_accumulator_value=cfg.adagrad_init_acc)
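# A hedged usage sketch for initialize_optimizer above. cfg only needs the attributes the
# function reads; the stand-in model, field names and values here are illustrative:
import torch
from types import SimpleNamespace

model = torch.nn.Linear(4, 2)  # stand-in model for illustration
cfg = SimpleNamespace(optimizer="adamax", learning_rate=2e-3, adagrad_init_acc=0.1)
optimizer = initialize_optimizer(model.parameters(), cfg)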
def train_model(args): # Read and process data train, dev, test, batch_size, test_batch_size, train_ques_to_para,\ dev_ques_to_para, test_ques_to_para, train_tokenized_paras,\ dev_tokenized_paras, test_tokenized_paras, train_order, dev_order, test_order,\ train_data, dev_data, test_data, train_tokenized_paras_chars,\ dev_tokenized_paras_chars, test_tokenized_paras_chars = read_and_process_data(args) # Build model model, config = build_model(args, train_data.dictionary.size(), train_data.dictionary.index_to_word, train_data.dictionary.word_to_index, train_data.dictionary.char_to_index, train_data.dictionary.index_to_char) if not os.path.exists(args.model_dir): os.mkdir(args.model_dir) #------------------------------ Train System ----------------------------------# # Should we resume running from an existing checkpoint? last_done_epoch = config['ckpt'] if last_done_epoch > 0: model = model.load(args.model_dir, last_done_epoch) print "Loaded model." if not args.disable_glove: print "Embedding shape:", model.embedding.shape start_time = time.time() print "Starting training." # Decide which optimizer to use. if args.optimizer == "SGD": print "Using SGD optimizer." optimizer = SGD(model.parameters(), lr = args.learning_rate) elif args.optimizer == "Adamax": print "Using Adamax optimizer." optimizer = Adamax(model.parameters(), lr= args.learning_rate) elif args.optimizer == "Adadelta": print "Using Adadelta optimizer." optimizer = Adadelta(model.parameters(), lr=args.learning_rate, rho=0.95) else: assert False, "Unrecognized optimizer." if last_done_epoch > 0: if os.path.exists(args.model_dir + "/optim_%d.pt" % last_done_epoch): optimizer = torch.load(args.model_dir + "/optim_%d.pt" % last_done_epoch) else: print "Optimizer saved state not found. Not loading optimizer." # Model summary. print(model) for EPOCH in range(last_done_epoch+1, args.epochs): start_t = time.time() train_loss_sum = 0.0 model.train() for i, num in enumerate(train_order): print "\rTrain epoch %d, %.2f s - (Done %d of %d)" %\ (EPOCH, (time.time()-start_t)*(len(train_order)-i-1)/(i+1), i+1, len(train_order)), # Create next batch by getting lengths and padding train_batch = train[num:num+batch_size] passage_input_f, passage_input_b, question_input_f, question_input_b,\ passage_input_lens, question_input_lens, passage_input_chars_f,\ passage_input_chars_b, question_input_chars_f, question_input_chars_b,\ passage_input_chars_lens, question_input_chars_lens, answer_input =\ get_minibatch_input(train_batch, train_tokenized_paras, train_tokenized_paras_chars, train_ques_to_para) # Zero previous gradient. model.zero_grad() model((passage_input_chars_f, passage_input_chars_lens),\ (passage_input_chars_b, passage_input_chars_lens),\ (question_input_chars_f, question_input_chars_lens),\ (question_input_chars_b, question_input_chars_lens),\ (passage_input_f, passage_input_lens),\ (passage_input_b, passage_input_lens),\ (question_input_f, question_input_lens),\ (question_input_b, question_input_lens),\ answer_input) model.loss.backward() optimizer.step() train_loss_sum += model.loss.data[0] model.free_memory() print "Loss: %.5f (in time %.2fs)" % \ (train_loss_sum/(i+1), time.time() - start_t), sys.stdout.flush() print "\nLoss: %.5f (in time %.2fs)" % \ (train_loss_sum/len(train_order), time.time() - start_t) # End of epoch. 
random.shuffle(train_order) model.zero_grad() model.save(args.model_dir, EPOCH) # Updating LR for optimizer for param in optimizer.param_groups: param['lr'] *= config['decay'] torch.save(optimizer, args.model_dir + "/optim_%d.pt" % EPOCH) # Run pass over dev data. dev_start_t = time.time() dev_loss_sum = 0.0 all_predictions = {} print "\nRunning on Dev." model.eval() for i, num in enumerate(dev_order): print "\rDev: %.2f s (Done %d of %d)" %\ ((time.time()-dev_start_t)*(len(dev_order)-i-1)/(i+1), i+1, len(dev_order)), dev_batch = dev[num:num+test_batch_size] passage_input_f, passage_input_b, question_input_f, question_input_b,\ passage_input_lens, question_input_lens, passage_input_chars_f,\ passage_input_chars_b, question_input_chars_f, question_input_chars_b,\ passage_input_chars_lens, question_input_chars_lens, answer_input =\ get_minibatch_input(dev_batch, dev_tokenized_paras, dev_tokenized_paras_chars, dev_ques_to_para) # distributions[{0,1}].shape = (batch, max_passage_len) distributions = \ model((passage_input_chars_f, passage_input_chars_lens),\ (passage_input_chars_b, passage_input_chars_lens),\ (question_input_chars_f, question_input_chars_lens),\ (question_input_chars_b, question_input_chars_lens),\ (passage_input_f, passage_input_lens),\ (passage_input_b, passage_input_lens),\ (question_input_f, question_input_lens),\ (question_input_b, question_input_lens),\ answer_input) distributions[0] = distributions[0].data.cpu().numpy() distributions[1] = distributions[1].data.cpu().numpy() # Add all batch qids to predictions dict, if they don't already exist. qids = [ example[2] for example in dev_batch ] for qid in qids: if not qid in all_predictions: all_predictions[qid] = [] best_idxs = [] for idx in range(len(dev_batch)): best_prob = -1 best = [0, 0] max_end = passage_input_lens[idx] for j, start_prob in enumerate(distributions[0][idx][:max_end]): cur_end_idx = min(j + args.max_answer_span, max_end) end_idx = np.argmax(distributions[1][idx][j:cur_end_idx]) prob = distributions[1][idx][j+end_idx] * start_prob if prob > best_prob: best_prob = prob best = [j, j+end_idx] best_idxs.append(best) tokenized_paras = dev_data.tokenized_paras answers = [ tokenized_paras[dev_ques_to_para[qids[idx]]][start:end+1] \ for idx, (start, end) in enumerate(best_idxs) ] answers = [ " ".join([ dev_data.dictionary.get_word(idx) for idx in ans ]) \ for ans in answers ] for qid, answer in zip(qids, answers): all_predictions[qid] = answer dev_loss_sum += model.loss.data[0] model.free_memory() print "[Average loss : %.5f]" % (dev_loss_sum/(i+1)), sys.stdout.flush() # Print dev stats for epoch print "\nDev Loss: %.4f (in time: %.2f s)" %\ (dev_loss_sum/len(dev_order), (time.time() - dev_start_t)) # Dump the results json in the required format print "Dumping prediction results." json.dump( all_predictions, open(args.model_dir + "/dev_predictions_" + str(EPOCH) + ".json", "w")) print "Done."
if args.actnorm:
    transforms.append(ActNormBijection1d(2))

model = Flow(base_dist=StandardNormal((D, L)), transforms=transforms).to(args.device)

if not args.train:
    state_dict = torch.load('models/{}.pt'.format(run_name))
    model.load_state_dict(state_dict)

#######################
## Specify optimizer ##
#######################

if args.optimizer == 'adam':
    optimizer = Adam(model.parameters(), lr=args.lr)
elif args.optimizer == 'adamax':
    optimizer = Adamax(model.parameters(), lr=args.lr)

if args.warmup is not None:
    scheduler_iter = LinearWarmupScheduler(optimizer, total_epoch=args.warmup)
else:
    scheduler_iter = None

if args.gamma is not None:
    scheduler_epoch = ExponentialLR(optimizer, gamma=args.gamma)
else:
    scheduler_epoch = None

#####################
## Define training ##
#####################
def train(**kwargs):
    opt.parse(kwargs)

    if opt.vis_env:
        vis = Visualizer(opt.vis_env, port=opt.vis_port)

    if opt.device is None or opt.device == 'cpu':
        opt.device = torch.device('cpu')
    else:
        opt.device = torch.device(opt.device)

    images, tags, labels = load_data(opt.data_path, type=opt.dataset)

    train_data = Dataset(opt, images, tags, labels)
    train_dataloader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True)

    # valid or test data
    x_query_data = Dataset(opt, images, tags, labels, test='image.query')
    x_db_data = Dataset(opt, images, tags, labels, test='image.db')
    y_query_data = Dataset(opt, images, tags, labels, test='text.query')
    y_db_data = Dataset(opt, images, tags, labels, test='text.db')

    x_query_dataloader = DataLoader(x_query_data, opt.batch_size, shuffle=False)
    x_db_dataloader = DataLoader(x_db_data, opt.batch_size, shuffle=False)
    y_query_dataloader = DataLoader(y_query_data, opt.batch_size, shuffle=False)
    y_db_dataloader = DataLoader(y_db_data, opt.batch_size, shuffle=False)

    query_labels, db_labels = x_query_data.get_labels()
    query_labels = query_labels.to(opt.device)
    db_labels = db_labels.to(opt.device)

    if opt.load_model_path:
        pretrain_model = None
    elif opt.pretrain_model_path:
        pretrain_model = load_pretrain_model(opt.pretrain_model_path)

    model = AGAH(opt.bit, opt.tag_dim, opt.num_label, opt.emb_dim,
                 lambd=opt.lambd, pretrain_model=pretrain_model).to(opt.device)

    load_model(model, opt.load_model_path)

    optimizer = Adamax([
        {'params': model.img_module.parameters(), 'lr': opt.lr},
        {'params': model.txt_module.parameters()},
        {'params': model.hash_module.parameters()},
        {'params': model.classifier.parameters()}
    ], lr=opt.lr * 10, weight_decay=0.0005)

    optimizer_dis = {
        'img': Adamax(model.img_discriminator.parameters(), lr=opt.lr * 10,
                      betas=(0.5, 0.9), weight_decay=0.0001),
        'txt': Adamax(model.txt_discriminator.parameters(), lr=opt.lr * 10,
                      betas=(0.5, 0.9), weight_decay=0.0001)
    }

    criterion_tri_cos = TripletAllLoss(dis_metric='cos', reduction='sum')
    criterion_bce = nn.BCELoss(reduction='sum')

    loss = []

    max_mapi2t = 0.
    max_mapt2i = 0.
FEATURE_I = torch.randn(opt.training_size, opt.emb_dim).to(opt.device) FEATURE_T = torch.randn(opt.training_size, opt.emb_dim).to(opt.device) U = torch.randn(opt.training_size, opt.bit).to(opt.device) V = torch.randn(opt.training_size, opt.bit).to(opt.device) FEATURE_MAP = torch.randn(opt.num_label, opt.emb_dim).to(opt.device) CODE_MAP = torch.sign(torch.randn(opt.num_label, opt.bit)).to(opt.device) train_labels = train_data.get_labels().to(opt.device) mapt2i_list = [] mapi2t_list = [] train_times = [] for epoch in range(opt.max_epoch): t1 = time.time() for i, (ind, x, y, l) in tqdm(enumerate(train_dataloader)): imgs = x.to(opt.device) tags = y.to(opt.device) labels = l.to(opt.device) batch_size = len(ind) h_x, h_y, f_x, f_y, x_class, y_class = model( imgs, tags, FEATURE_MAP) FEATURE_I[ind] = f_x.data FEATURE_T[ind] = f_y.data U[ind] = h_x.data V[ind] = h_y.data ##### # train txt discriminator ##### D_txt_real = model.dis_txt(f_y.detach()) D_txt_real = -D_txt_real.mean() optimizer_dis['txt'].zero_grad() D_txt_real.backward() # train with fake D_txt_fake = model.dis_txt(f_x.detach()) D_txt_fake = D_txt_fake.mean() D_txt_fake.backward() # train with gradient penalty alpha = torch.rand(batch_size, opt.emb_dim).to(opt.device) interpolates = alpha * f_y.detach() + (1 - alpha) * f_x.detach() interpolates.requires_grad_() disc_interpolates = model.dis_txt(interpolates) gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates, grad_outputs=torch.ones( disc_interpolates.size()).to( opt.device), create_graph=True, retain_graph=True, only_inputs=True)[0] gradients = gradients.view(gradients.size(0), -1) # 10 is gradient penalty hyperparameter txt_gradient_penalty = ( (gradients.norm(2, dim=1) - 1)**2).mean() * 10 txt_gradient_penalty.backward() loss_D_txt = D_txt_real - D_txt_fake optimizer_dis['txt'].step() ##### # train img discriminator ##### D_img_real = model.dis_img(f_x.detach()) D_img_real = -D_img_real.mean() optimizer_dis['img'].zero_grad() D_img_real.backward() # train with fake D_img_fake = model.dis_img(f_y.detach()) D_img_fake = D_img_fake.mean() D_img_fake.backward() # train with gradient penalty alpha = torch.rand(batch_size, opt.emb_dim).to(opt.device) interpolates = alpha * f_x.detach() + (1 - alpha) * f_y.detach() interpolates.requires_grad_() disc_interpolates = model.dis_img(interpolates) gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates, grad_outputs=torch.ones( disc_interpolates.size()).to( opt.device), create_graph=True, retain_graph=True, only_inputs=True)[0] gradients = gradients.view(gradients.size(0), -1) # 10 is gradient penalty hyperparameter img_gradient_penalty = ( (gradients.norm(2, dim=1) - 1)**2).mean() * 10 img_gradient_penalty.backward() loss_D_img = D_img_real - D_img_fake optimizer_dis['img'].step() ##### # train generators ##### # update img network (to generate txt features) domain_output = model.dis_txt(f_x) loss_G_txt = -domain_output.mean() # update txt network (to generate img features) domain_output = model.dis_img(f_y) loss_G_img = -domain_output.mean() loss_adver = loss_G_txt + loss_G_img loss1 = criterion_tri_cos(h_x, labels, target=h_y, margin=opt.margin) loss2 = criterion_tri_cos(h_y, labels, target=h_x, margin=opt.margin) theta1 = F.cosine_similarity(torch.abs(h_x), torch.ones_like(h_x).to(opt.device)) theta2 = F.cosine_similarity(torch.abs(h_y), torch.ones_like(h_y).to(opt.device)) loss3 = torch.sum(1 / (1 + torch.exp(theta1))) + torch.sum( 1 / (1 + torch.exp(theta2))) loss_class = criterion_bce(x_class, 
labels) + criterion_bce( y_class, labels) theta_code_x = h_x.mm(CODE_MAP.t()) # size: (batch, num_label) theta_code_y = h_y.mm(CODE_MAP.t()) loss_code_map = torch.sum(torch.pow(theta_code_x - opt.bit * (labels * 2 - 1), 2)) + \ torch.sum(torch.pow(theta_code_y - opt.bit * (labels * 2 - 1), 2)) loss_quant = torch.sum(torch.pow( h_x - torch.sign(h_x), 2)) + torch.sum( torch.pow(h_y - torch.sign(h_y), 2)) # err = loss1 + loss2 + loss3 + 0.5 * loss_class + 0.5 * (loss_f1 + loss_f2) err = loss1 + loss2 + opt.alpha * loss3 + opt.beta * loss_class + opt.gamma * loss_code_map + \ opt.eta * loss_quant + opt.mu * loss_adver optimizer.zero_grad() err.backward() optimizer.step() loss.append(err.item()) CODE_MAP = update_code_map(U, V, CODE_MAP, train_labels) FEATURE_MAP = update_feature_map(FEATURE_I, FEATURE_T, train_labels) print('...epoch: %3d, loss: %3.3f' % (epoch + 1, loss[-1])) delta_t = time.time() - t1 if opt.vis_env: vis.plot('loss', loss[-1]) # validate if opt.valid and (epoch + 1) % opt.valid_freq == 0: mapi2t, mapt2i = valid(model, x_query_dataloader, x_db_dataloader, y_query_dataloader, y_db_dataloader, query_labels, db_labels, FEATURE_MAP) print( '...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (epoch + 1, mapi2t, mapt2i)) mapi2t_list.append(mapi2t) mapt2i_list.append(mapt2i) train_times.append(delta_t) if opt.vis_env: d = {'mapi2t': mapi2t, 'mapt2i': mapt2i} vis.plot_many(d) if mapt2i >= max_mapt2i and mapi2t >= max_mapi2t: max_mapi2t = mapi2t max_mapt2i = mapt2i save_model(model) path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit) with torch.cuda.device(opt.device): torch.save(FEATURE_MAP, os.path.join(path, 'feature_map.pth')) if epoch % 100 == 0: for params in optimizer.param_groups: params['lr'] = max(params['lr'] * 0.6, 1e-6) if not opt.valid: save_model(model) print('...training procedure finish') if opt.valid: print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (max_mapi2t, max_mapt2i)) else: mapi2t, mapt2i = valid(model, x_query_dataloader, x_db_dataloader, y_query_dataloader, y_db_dataloader, query_labels, db_labels, FEATURE_MAP) print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (mapi2t, mapt2i)) path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit) with open(os.path.join(path, 'result.pkl'), 'wb') as f: pickle.dump([train_times, mapi2t_list, mapt2i_list], f)
# In[21]:

epochs = [80, 60, 40, 20]
count = 0

# In[22]:

from torch.optim import Adam
from torch.optim import SGD
from torch.optim import Adamax

for epoch in epochs:
    if opt == "Adam":
        optimizer = Adam(net.parameters(), lr=lr, weight_decay=wd)
    elif opt == "Adamax":
        optimizer = Adamax(net.parameters(), lr=lr)
    elif opt == "SGD":
        optimizer = SGD(net.parameters(), lr=lr, momentum=.9, weight_decay=wd, nesterov=nest)
    for _ in range(epoch):
        train(count)
        test(count)
        count += 1
    lr /= 10

# In[23]:
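# The cell above rebuilds the optimizer at each stage and divides lr by 10 by hand. A sketch of
# the same staged decay with torch.optim.lr_scheduler.MultiStepLR, assuming the same net, lr, wd,
# train and test names as above; it keeps one optimizer (so momentum buffers are not reset), and
# the SGD choice and the milestones (cumulative epoch counts 80/140/180) are illustrative:
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

optimizer = SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=wd, nesterov=True)
scheduler = MultiStepLR(optimizer, milestones=[80, 140, 180], gamma=0.1)
for epoch in range(sum(epochs)):
    train(epoch)
    test(epoch)
    scheduler.step()  # multiplies lr by 0.1 at each milestone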
def __init__(self, Config, inference=False): self.Config = Config if not inference: torch.backends.cudnn.benchmark = True if self.Config.NR_CPUS > 0: torch.set_num_threads(self.Config.NR_CPUS) if self.Config.SEG_INPUT == "Peaks" and self.Config.TYPE == "single_direction": NR_OF_GRADIENTS = self.Config.NR_OF_GRADIENTS # NR_OF_GRADIENTS = 9 * 5 # 5 slices elif self.Config.SEG_INPUT == "Peaks" and self.Config.TYPE == "combined": self.Config.NR_OF_GRADIENTS = 3 * self.Config.NR_OF_CLASSES else: self.Config.NR_OF_GRADIENTS = 33 if self.Config.LOSS_FUNCTION == "soft_sample_dice": self.criterion = pytorch_utils.soft_sample_dice elif self.Config.LOSS_FUNCTION == "soft_batch_dice": self.criterion = pytorch_utils.soft_batch_dice elif self.Config.EXPERIMENT_TYPE == "peak_regression": if self.Config.LOSS_FUNCTION == "angle_length_loss": self.criterion = pytorch_utils.angle_length_loss elif self.Config.LOSS_FUNCTION == "angle_loss": self.criterion = pytorch_utils.angle_loss elif self.Config.LOSS_FUNCTION == "l2_loss": self.criterion = pytorch_utils.l2_loss elif self.Config.EXPERIMENT_TYPE == "dm_regression": # self.criterion = nn.MSELoss() # aggregate by mean self.criterion = nn.MSELoss(size_average=False, reduce=True) # aggregate by sum else: # weights = torch.ones((self.Config.BATCH_SIZE, self.Config.NR_OF_CLASSES, # self.Config.INPUT_DIM[0], self.Config.INPUT_DIM[1])).cuda() # weights[:, 5, :, :] *= 10 #CA # weights[:, 21, :, :] *= 10 #FX_left # weights[:, 22, :, :] *= 10 #FX_right # self.criterion = nn.BCEWithLogitsLoss(weight=weights) self.criterion = nn.BCEWithLogitsLoss() NetworkClass = getattr( importlib.import_module("tractseg.models." + self.Config.MODEL.lower()), self.Config.MODEL) self.net = NetworkClass(n_input_channels=NR_OF_GRADIENTS, n_classes=self.Config.NR_OF_CLASSES, n_filt=self.Config.UNET_NR_FILT, batchnorm=self.Config.BATCH_NORM, dropout=self.Config.USE_DROPOUT, upsample=self.Config.UPSAMPLE_TYPE) # Somehow not really faster (max 10% speedup): GPU utility low -> why? (CPU also low) # (with bigger batch_size even worse) # - GPU slow connection? (but maybe same problem as before pin_memory) # - Wrong setup with pin_memory, async, ...? 
-> should be correct # - load from npy instead of nii -> will not solve entire problem # nr_gpus = torch.cuda.device_count() # exp_utils.print_and_save(self.Config, "nr of gpus: {}".format(nr_gpus)) # self.net = nn.DataParallel(self.net) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") net = self.net.to(self.device) # if self.Config.TRAIN: # exp_utils.print_and_save(self.Config, str(net), only_log=True) # print network if self.Config.OPTIMIZER == "Adamax": self.optimizer = Adamax(net.parameters(), lr=self.Config.LEARNING_RATE) elif self.Config.OPTIMIZER == "Adam": self.optimizer = Adam(net.parameters(), lr=self.Config.LEARNING_RATE) # self.optimizer = Adam(net.parameters(), lr=self.Config.LEARNING_RATE, # weight_decay=self.Config.WEIGHT_DECAY) else: raise ValueError("Optimizer not defined") if APEX_AVAILABLE and self.Config.FP16: # Use O0 to disable fp16 (might be a little faster on TitanX) self.net, self.optimizer = amp.initialize(self.net, self.optimizer, verbosity=0, opt_level="O1") if not inference: print("INFO: Using fp16 training") else: if not inference: print("INFO: Did not find APEX, defaulting to fp32 training") if self.Config.LR_SCHEDULE: # Slightly better results could be archived if training for 500ep without reduction of LR # -> but takes too long -> using reudceOnPlateau gives benefits if only training for 200ep self.scheduler = lr_scheduler.ReduceLROnPlateau( self.optimizer, mode=self.Config.LR_SCHEDULE_MODE, patience=self.Config.LR_SCHEDULE_PATIENCE) if self.Config.LOAD_WEIGHTS: exp_utils.print_verbose( self.Config, "Loading weights ... ({})".format( join(self.Config.EXP_PATH, self.Config.WEIGHTS_PATH))) self.load_model( join(self.Config.EXP_PATH, self.Config.WEIGHTS_PATH)) if self.Config.RESET_LAST_LAYER: self.net.conv_5 = nn.Conv2d(self.Config.UNET_NR_FILT, self.Config.NR_OF_CLASSES, kernel_size=1, stride=1, padding=0, bias=True).to(self.device)
DIM_ZS = [31, 37, 17, 13]
num_z = len(DIM_ZS)
dim_z = sum(DIM_ZS)
Z_PRIORS = [D.Normal(torch.tensor(0.), torch.tensor(1.))] * num_z

encoder0 = MLP(DIM_X, DIM_ZS[0] * 2, [64])
encoder1 = MLP(DIM_X, DIM_ZS[1] * 2, [64])
encoder2 = MLP(DIM_X, DIM_ZS[2] * 2, [64])
encoder3 = MLP(DIM_X, DIM_ZS[3] * 2, [64])
decoder = MLP(dim_z, DIM_X, [512])

vae = MIVAE(encoders=[encoder0, encoder1, encoder2, encoder3],
            decoder=decoder, z_priors=Z_PRIORS)
optimizer = Adamax(params=vae.parameters(), lr=1e-3)

step = 0
fig, axes = plt.subplots(1, 2)
mats = [None] * 2
mats[0] = axes[0].matshow(np.zeros([28, 28]), cmap='bone', vmin=0., vmax=1.)
mats[1] = axes[1].matshow(np.zeros([28, 28]), cmap='bone', vmin=0., vmax=1.)

for xx, yy in data_loader:
###################
## Specify model ##
###################

pi = get_model(args, target=target, num_bits=args.num_bits).to(args.device)
p = StandardNormal((target.size,)).to(args.device)
model_id = get_model_id(args)

#######################
## Specify optimizer ##
#######################

if args.optimizer == 'adam':
    optimizer = Adam(pi.parameters(), lr=args.lr)
elif args.optimizer == 'adamax':
    optimizer = Adamax(pi.parameters(), lr=args.lr)

##############
## Training ##
##############

print('Training...')
time_before = time.time()
loss_sum = 0.0
for i in range(args.iter):
    z, log_p_z = p.sample_with_log_prob(args.batch_size)
    log_pi_z_tilde = pi.log_prob(z)
    KL_tilde = (log_p_z - log_pi_z_tilde).mean()

    optimizer.zero_grad()
    loss = KL_tilde
    loss.backward()
def __init__(self, Config): self.Config = Config # torch.backends.cudnn.benchmark = True #not faster if self.Config.NR_CPUS > 0: torch.set_num_threads(self.Config.NR_CPUS) if self.Config.SEG_INPUT == "Peaks" and self.Config.TYPE == "single_direction": NR_OF_GRADIENTS = self.Config.NR_OF_GRADIENTS # NR_OF_GRADIENTS = 9 * 5 # 5 slices elif self.Config.SEG_INPUT == "Peaks" and self.Config.TYPE == "combined": self.Config.NR_OF_GRADIENTS = 3 * self.Config.NR_OF_CLASSES else: self.Config.NR_OF_GRADIENTS = 33 if self.Config.LOSS_FUNCTION == "soft_sample_dice": self.criterion = pytorch_utils.soft_sample_dice elif self.Config.LOSS_FUNCTION == "soft_batch_dice": self.criterion = pytorch_utils.soft_batch_dice elif self.Config.EXPERIMENT_TYPE == "peak_regression": self.criterion = pytorch_utils.angle_length_loss else: # weights = torch.ones((self.Config.BATCH_SIZE, self.Config.NR_OF_CLASSES, # self.Config.INPUT_DIM[0], self.Config.INPUT_DIM[1])).cuda() # weights[:, 5, :, :] *= 10 #CA # weights[:, 21, :, :] *= 10 #FX_left # weights[:, 22, :, :] *= 10 #FX_right # self.criterion = nn.BCEWithLogitsLoss(weight=weights) self.criterion = nn.BCEWithLogitsLoss() NetworkClass = getattr(importlib.import_module("tractseg.models." + self.Config.MODEL.lower()), self.Config.MODEL) self.net = NetworkClass(n_input_channels=NR_OF_GRADIENTS, n_classes=self.Config.NR_OF_CLASSES, n_filt=self.Config.UNET_NR_FILT, batchnorm=self.Config.BATCH_NORM, dropout=self.Config.USE_DROPOUT, upsample=self.Config.UPSAMPLE_TYPE) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") net = self.net.to(self.device) # if self.Config.TRAIN: # exp_utils.print_and_save(self.Config, str(net), only_log=True) # print network if self.Config.OPTIMIZER == "Adamax": self.optimizer = Adamax(net.parameters(), lr=self.Config.LEARNING_RATE) elif self.Config.OPTIMIZER == "Adam": self.optimizer = Adam(net.parameters(), lr=self.Config.LEARNING_RATE) # self.optimizer = Adam(net.parameters(), lr=self.Config.LEARNING_RATE, # weight_decay=self.Config.WEIGHT_DECAY) else: raise ValueError("Optimizer not defined") if self.Config.LR_SCHEDULE: self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.1) # self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, mode="max") if self.Config.LOAD_WEIGHTS: exp_utils.print_verbose(self.Config, "Loading weights ... ({})".format(join(self.Config.EXP_PATH, self.Config.WEIGHTS_PATH))) self.load_model(join(self.Config.EXP_PATH, self.Config.WEIGHTS_PATH)) if self.Config.RESET_LAST_LAYER: self.net.conv_5 = nn.Conv2d(self.Config.UNET_NR_FILT, self.Config.NR_OF_CLASSES, kernel_size=1, stride=1, padding=0, bias=True).to(self.device)
predictScores = model(torch.LongTensor(x)) scores = torch.argmax(predictScores, dim=2, ) for i in range(len(scores)): predictTag = [] for j in range(len(y[i])): predictTag.append(id2tag[int(scores[i][j])]) predictTags.append(predictTag) except: break acc = metric.accuracy(predictTags, devTagList) print('accuracy: ', acc) else: id2tag = dict((id, tag) for tag, id in tag2id.items()) # print(id2tag) biLstm=BiLSTM(len(word2id)+1,100,128,len(tag2id)) optimer=Adamax(biLstm.parameters(),lr=0.001) bestAccuracy=0.0 for epoch in range(30): print('epoch: ',epoch) trainDatas=utils.batch_data(trainWordLists,trainTagLists,word2id,tag2id) while 1: try: optimer.zero_grad() sentence,tag=trainDatas.__next__() predictScores=biLstm(torch.LongTensor(sentence)) loss=0 # print(len(sentence),len(tag)) for i in range(len(sentence)): # print(len(sentence[i]),len(tag[i])) for j in range(len(tag[i])): # print('tag',tag[i][j],'score:',predictScores[i][j][tag[i][j]])
class GymLearner(Trainable): def __init__(self, hyperparameters, data_generator, initial_state_generator=None, trace_handler=None, summary_writer=None): super().__init__('NGE_Learner', moderage_category=None, moderage_data_id=None, summary_writer=summary_writer) self._hyperparameters = hyperparameters self._data_generator = data_generator self._state_channels = hyperparameters['state_channels'] self._saturation_cost_weight = hyperparameters[ 'saturation_cost_weight'] self._saturation_limit = hyperparameters['saturation_limit'] self._gradient_clip = hyperparameters['gradient_clip'] self._observation_noise_std = hyperparameters['observation_noise_std'] self._reward_loss_coeff = hyperparameters['reward_loss_coeff'] self._reward_state_channels = hyperparameters['reward_state_channels'] self._reward_class_weight = hyperparameters['reward_class_weight'] self._state_channels = hyperparameters['state_channels'] self._batch_size = hyperparameters['batch_size'] self._learning_rate_patience = hyperparameters[ 'learning_rate_patience'] self._learning_rate_decay_factor = hyperparameters[ 'learning_rate_decay_factor'] self._iterations = hyperparameters['ngpu_iterations'] self._num_actions = data_generator.get_num_actions() self._initial_state_generator = initial_state_generator self._model = NeuralGameEngine( self._state_channels, self._reward_state_channels, self._num_actions, observation_noise_std=self._observation_noise_std, saturation_limit=self._saturation_limit, trace_handler=trace_handler, summary_writer=summary_writer, ).to(self._device) self._optimizer = Adamax(self._model.parameters(), lr=hyperparameters['learning_rate']) if self._learning_rate_patience is not None: self._scheduler = ReduceLROnPlateau( self._optimizer, mode='min', factor=self._learning_rate_decay_factor, verbose=True, patience=self._learning_rate_patience) self._mse_observation_loss_criterion = MSELoss().to(self._device) self._ce_reward_loss_criterion = CrossEntropyLoss( weight=torch.tensor(self._reward_class_weight)).to(self._device) self._logger.info('Created Automata Learner') self._logger.info(f'Data Generator: {data_generator.get_name()}') self._logger.info(f'State channels: {self._state_channels}') def is_training(self): return self._model.training def _get_lr(self): for param_group in self._optimizer.param_groups: return param_group['lr'] def _loss(self, predictions, t_batch, saturation_cost=None): observation_targets = t_batch['expected_observation_batch'] observation_predictions = predictions['observation_predictions'] reward_targets = t_batch['expected_reward_batch'] reward_predictions = predictions['reward_predictions'] batch_size = observation_targets.shape[0] loss_components = {} # Calculate mean square error loss component mse_observation_loss = self._mse_observation_loss_criterion( observation_predictions, observation_targets) loss_components['mse_observation_loss'] = mse_observation_loss # Calculate cross entropy loss for reward reward_target_class = reward_targets.type(torch.long) ce_reward_loss = self._ce_reward_loss_criterion( reward_predictions, reward_target_class) reward_predictions_np = np.argmax( reward_predictions.detach().cpu().numpy(), axis=1) reward_target_np = reward_target_class.detach().cpu().numpy() reward_precision, reward_recall, reward_f1, reward_bacc = calc_precision_recall_f1_bacc( reward_predictions_np, reward_target_np) # Calculate saturation cost loss loss_components[ 'saturation_loss'] = saturation_cost * self._saturation_cost_weight total_loss = torch.sum( torch.stack([loss for _, loss 
in loss_components.items()])) loss_components['ce_reward_loss'] = ce_reward_loss total_loss += ce_reward_loss * self._reward_loss_coeff detached_loss_components = { k: loss.detach().cpu().numpy() for k, loss in loss_components.items() } detached_loss_components['reward_precision'] = reward_precision detached_loss_components['reward_recall'] = reward_recall detached_loss_components['reward_bacc'] = reward_bacc detached_loss_components['reward_f1'] = reward_f1 reward_rate = (reward_targets.detach().cpu().numpy().sum(axis=1) > 0).sum() / batch_size detached_loss_components['reward_rate'] = reward_rate return total_loss, detached_loss_components def forward(self, t_batch, steps=1, trace=False): inputs = t_batch['input_observation_batch'] actions = t_batch['input_action_batch'] return self._model.forward(inputs, actions=actions, steps=steps, trace=trace) def train_batches(self): training_batches = self._data_generator.generate_samples( batch_size=self._batch_size) train_batch_losses = [] loss_component_collector = LossComponentCollector() for training_batch in training_batches: t_prepared_batch = self._model.prepare_batch(training_batch) batch_loss, loss_components_batch = self.train_batch( t_prepared_batch) train_batch_losses.append(batch_loss) loss_component_collector.append_loss_components_batch( loss_components_batch) return np.mean(train_batch_losses), loss_component_collector def eval(self, t_batch, trace=False): # Get predictions predictions, saturation_costs = self.forward(t_batch, steps=self._iterations, trace=trace) # Calculate losses loss, loss_components = self._loss(predictions, t_batch, saturation_costs) # Get loss loss.backward() # Return the loss from the single batch step return (loss.data.detach().cpu().numpy(), loss_components), predictions def train_batch(self, t_batch): # Get predictions predictions, saturation_cost = self.forward(t_batch, steps=self._iterations) # Calculate losses total_loss, loss_components = self._loss(predictions, t_batch, saturation_cost) # Update the weights self._optimizer.zero_grad() total_loss.backward() # clip gradient torch.nn.utils.clip_grad_norm_(self._model.parameters(), self._gradient_clip) self._optimizer.step() return total_loss.data.detach().cpu().numpy(), loss_components def train(self, training_epochs, checkpoint_callback=None, callback_epoch=10, **kwargs): training_mean_loss_component_collector = LossComponentCollector(500) for e in range(training_epochs): self._epoch = e self._model.eval() # If we want to do something at specific points during training then we can set a checkpoint callback if checkpoint_callback is not None and self._epoch % callback_epoch == 0: checkpoint_callback(e) self._model.train() training_loss, training_loss_components = self.train_batches() training_mean_loss_components = training_loss_components.get_means( ) training_mean_loss_component_collector.append_loss_components_batch( training_mean_loss_components) debug_string = ', '.join([ f'{k}: {v:.4f}' for k, v in training_mean_loss_component_collector. 
get_window_mean().items() ]) self._logger.info( f'Epoch [{e + 1}/{training_epochs}], Lr: {self._get_lr():.4f}, {debug_string}' ) if self._summary_writer is not None: for component_key, component_value in training_mean_loss_component_collector.get_window_mean( ).items(): self._summary_writer.add_scalars( f'{self.get_name()}/training/{component_key}', {component_key: component_value}, e) if self._learning_rate_patience is not None: self._scheduler.step(training_loss) experiment = self.save( training_epochs=training_epochs, training_loss_components=training_mean_loss_component_collector, ) return experiment, self._model def _generate_initial_state_files(self): if self._initial_state_generator is None: return [] params = self._initial_state_generator.get_generator_params() levels = params['train'] initial_states = self._initial_state_generator.generate_samples(1) initial_state_files = self._get_initial_states(initial_states, levels) return initial_state_files def _get_initial_states(self, batch, envs): initial_state_files = [] for i, env in enumerate(envs): initial_state = np.array( np.swapaxes(batch[i]['input_observation_batch'][0], 2, 0) * 255.0).astype(np.uint8) state_filename = f'{env}_initial.npy' np.save(state_filename, initial_state) initial_state_files.append({ 'filename': state_filename, 'caption': f'Initial state for training level: {env}' }) return initial_state_files def save(self, training_epochs, training_loss_components): filename = 'model.tch' torch.save(self._model.saveable(), open(filename, 'wb')) training_history_csv = self._create_training_history_csv( 'training_history.csv', training_loss_components.get_history()) train_final_values = { f'train_{k}_final': f'{v:.8f}' for k, v in training_loss_components.get_window_mean().items() } meta = { 'epochs': training_epochs, **self._hyperparameters, 'data_generator': self._data_generator.get_name(), 'action_map': self._data_generator.get_action_mapping(), **self._data_generator.get_generator_params(), **train_final_values, } files = [{ 'filename': training_history_csv, 'caption': 'Training history' }, { 'filename': filename, 'caption': f'{self.get_name()}-{self._data_generator.get_name()}-model' }] files.extend(self._generate_initial_state_files()) return self._mr.save(f'{self.get_name()}', meta, files=files) def _create_training_history_csv(self, filename, history_data): dataframe = pd.DataFrame(history_data) dataframe.to_csv(filename, header=True) return filename
def create_network(self): # torch.backends.cudnn.benchmark = True #not faster def train(X, y, weight_factor=10): X = torch.from_numpy(X.astype(np.float32)) y = torch.from_numpy(y.astype(np.float32)) if torch.cuda.is_available(): X, y = Variable(X.cuda()), Variable(y.cuda( )) # X: (bs, features, x, y) y: (bs, classes, x, y) else: X, y = Variable(X), Variable(y) optimizer.zero_grad() net.train() outputs = net(X) # forward # outputs: (bs, classes, x, y) weights = torch.ones( (self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, self.HP.INPUT_DIM[0], self.HP.INPUT_DIM[1])).cuda() bundle_mask = y > 0 weights[bundle_mask.data] *= weight_factor #10 loss = criterion(outputs, y, Variable(weights)) # loss = criterion1(outputs, y, Variable(weights)) + criterion2(outputs, y, Variable(weights)) loss.backward() # backward optimizer.step() # optimise if self.HP.CALC_F1: # f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) # f1_a = MetricUtils.calc_peak_dice_pytorch(self.HP, outputs.data, y.data, max_angle_error=self.HP.PEAK_DICE_THR) f1 = MetricUtils.calc_peak_length_dice_pytorch( self.HP, outputs.data, y.data, max_angle_error=self.HP.PEAK_DICE_THR, max_length_error=self.HP.PEAK_DICE_LEN_THR) # f1 = (f1_a, f1_b) else: f1 = np.ones(outputs.shape[3]) if self.HP.USE_VISLOGGER: probs = outputs.data.cpu().numpy().transpose( 0, 2, 3, 1) # (bs, x, y, classes) else: # probs = outputs.data.cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) probs = None #faster return loss.data[0], probs, f1 def test(X, y, weight_factor=10): X = torch.from_numpy(X.astype(np.float32)) y = torch.from_numpy(y.astype(np.float32)) if torch.cuda.is_available(): X, y = Variable(X.cuda(), volatile=True), Variable(y.cuda(), volatile=True) else: X, y = Variable(X, volatile=True), Variable(y, volatile=True) net.train(False) outputs = net(X) # forward weights = torch.ones( (self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, self.HP.INPUT_DIM[0], self.HP.INPUT_DIM[1])).cuda() bundle_mask = y > 0 weights[bundle_mask.data] *= weight_factor #10 loss = criterion(outputs, y, Variable(weights)) # loss = criterion1(outputs, y, Variable(weights)) + criterion2(outputs, y, Variable(weights)) if self.HP.CALC_F1: # f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) # f1_a = MetricUtils.calc_peak_dice_pytorch(self.HP, outputs.data, y.data, max_angle_error=self.HP.PEAK_DICE_THR) f1 = MetricUtils.calc_peak_length_dice_pytorch( self.HP, outputs.data, y.data, max_angle_error=self.HP.PEAK_DICE_THR, max_length_error=self.HP.PEAK_DICE_LEN_THR) # f1 = (f1_a, f1_b) else: f1 = np.ones(outputs.shape[3]) # probs = outputs.data.cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) probs = None # faster return loss.data[0], probs, f1 def predict(X): X = torch.from_numpy(X.astype(np.float32)) if torch.cuda.is_available(): X = Variable(X.cuda(), volatile=True) else: X = Variable(X, volatile=True) net.train(False) outputs = net(X) # forward probs = outputs.data.cpu().numpy().transpose( 0, 2, 3, 1) # (bs, x, y, classes) return probs def save_model(metrics, epoch_nr): max_f1_idx = np.argmax(metrics["f1_macro_validate"]) max_f1 = np.max(metrics["f1_macro_validate"]) if epoch_nr == max_f1_idx and max_f1 > 0.01: # saving to network drives takes 5s (to local only 0.5s) -> do not save so often print(" Saving weights...") for fl in glob.glob(join(self.HP.EXP_PATH, "best_weights_ep*") ): # remove weights from previous epochs os.remove(fl) try: #Actually is a pkl not a npz PytorchUtils.save_checkpoint(join( self.HP.EXP_PATH, "best_weights_ep" + 
str(epoch_nr) + ".npz"), unet=net) except IOError: print( "\nERROR: Could not save weights because of IO Error\n" ) self.HP.BEST_EPOCH = epoch_nr #Saving Last Epoch: # print(" Saving weights last epoch...") # for fl in glob.glob(join(self.HP.EXP_PATH, "weights_ep*")): # remove weights from previous epochs # os.remove(fl) # try: # # Actually is a pkl not a npz # PytorchUtils.save_checkpoint(join(self.HP.EXP_PATH, "weights_ep" + str(epoch_nr) + ".npz"), unet=net) # except IOError: # print("\nERROR: Could not save weights because of IO Error\n") # self.HP.BEST_EPOCH = epoch_nr def load_model(path): PytorchUtils.load_checkpoint(path, unet=net) def print_current_lr(): for param_group in optimizer.param_groups: ExpUtils.print_and_save( self.HP, "current learning rate: {}".format(param_group['lr'])) if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": NR_OF_GRADIENTS = self.HP.NR_OF_GRADIENTS # NR_OF_GRADIENTS = 9 * 5 # NR_OF_GRADIENTS = 9 * 9 # NR_OF_GRADIENTS = 33 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": NR_OF_GRADIENTS = 3 * self.HP.NR_OF_CLASSES else: NR_OF_GRADIENTS = 33 if torch.cuda.is_available(): net = UNet(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, n_filt=self.HP.UNET_NR_FILT).cuda() else: net = UNet(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, n_filt=self.HP.UNET_NR_FILT) # if self.HP.TRAIN: # ExpUtils.print_and_save(self.HP, str(net), only_log=True) # criterion1 = PytorchUtils.MSE_weighted # criterion2 = PytorchUtils.angle_loss # criterion = PytorchUtils.MSE_weighted # criterion = PytorchUtils.angle_loss criterion = PytorchUtils.angle_length_loss optimizer = Adamax(net.parameters(), lr=self.HP.LEARNING_RATE) if self.HP.LOAD_WEIGHTS: ExpUtils.print_verbose( self.HP, "Loading weights ... ({})".format( join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) load_model(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) self.train = train self.predict = test self.get_probs = predict self.save_model = save_model self.load_model = load_model self.print_current_lr = print_current_lr
def configure_optimizers(self):
    self.post_constructor_setup()
    optimizer = Adamax(self.parameters(), lr=self.lrate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=self.lrate_decay)
    return [optimizer], [scheduler]
def train(**kwargs): opt.parse(kwargs) if opt.vis: vis = Visualizer(opt.env) # step 1: configure model model = getattr(models, opt.model)(opt) if opt.load_model_path: model.load(opt.load_model_path) if opt.use_gpu: model.cuda() # step 2: data train_data = Small(opt.train_root, wv_path=opt.word2vec_path, stopwords_path=opt.stopwords_path, idf_path=opt.idf_train_path, train=True) # val_data = Small(opt.train_root, # wv_path=opt.word2vec_path, # stopwords_path=opt.stopwords_path, # train=False) data_size = len(train_data) indices = t.randperm(data_size) # step 3: criterion and optimizer criterion = t.nn.KLDivLoss() lr = opt.lr optimizer = Adamax(model.parameters(), lr=lr, weight_decay=opt.weight_decay) # step 4: meters previous_loss = float('inf') # train for epoch in range(opt.max_epoch): for i in tqdm(range(0, data_size, opt.batch_size)): batch_size = min(opt.batch_size, data_size - i) # train_model loss = 0. for j in range(0, batch_size): idx = indices[i + j] q, a, label, shallow_features = train_data[idx] input_q, input_a, shallow_features = Variable(q), Variable( a), Variable(shallow_features) target = Variable(label) if opt.use_gpu: input_q = input_q.cuda() input_a = input_a.cuda() shallow_features = shallow_features.cuda() target = target.cuda() score = model(input_q, input_a, shallow_features) example_loss = criterion(score, target) loss += example_loss loss /= opt.batch_size optimizer.zero_grad() loss.backward() optimizer.step() model.save(model.module_name + '_' + str(epoch) + '.pth') print('epoch:{epoch}, lr:{lr}, loss:{loss}'.format(epoch=epoch, loss=loss.data, lr=lr)) # # validate and visualize # map, mrr = val(model, val_data) # # print('epoch:{epoch}, lr:{lr}, loss:{loss}, map:{map}, mrr:{mrr}'.format( # epoch=epoch, # loss=loss.data, # map=map, # mrr=mrr, # lr=lr # )) # update learning rate if (loss.data > previous_loss).all(): lr = lr * opt.lr_decay previous_loss = loss.data
def create_network(self): # torch.backends.cudnn.benchmark = True #not faster def train(X, y, weight_factor=10): X = torch.from_numpy(X.astype(np.float32)) y = torch.from_numpy(y.astype(np.float32)) if torch.cuda.is_available(): X, y = Variable(X.cuda()), Variable(y.cuda( )) # X: (bs, features, x, y) y: (bs, classes, x, y) else: X, y = Variable(X), Variable(y) optimizer.zero_grad() net.train() outputs = net(X) # forward # outputs: (bs, classes, x, y) loss = criterion(outputs, y) loss.backward() # backward optimizer.step() # optimise f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) if self.HP.USE_VISLOGGER: probs = outputs.data.cpu().numpy().transpose( 0, 2, 3, 1) # (bs, x, y, classes) else: probs = None #faster return loss.data[0], probs, f1 def test(X, y, weight_factor=10): X = torch.from_numpy(X.astype(np.float32)) y = torch.from_numpy(y.astype(np.float32)) if torch.cuda.is_available(): X, y = Variable(X.cuda(), volatile=True), Variable(y.cuda(), volatile=True) else: X, y = Variable(X, volatile=True), Variable(y, volatile=True) net.train(False) outputs = net(X) # forward loss = criterion(outputs, y) f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) # probs = outputs.data.cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) probs = None # faster return loss.data[0], probs, f1 def predict(X): X = torch.from_numpy(X.astype(np.float32)) if torch.cuda.is_available(): X = Variable(X.cuda(), volatile=True) else: X = Variable(X, volatile=True) net.train(False) outputs = net(X) # forward probs = outputs.data.cpu().numpy().transpose( 0, 2, 3, 1) # (bs, x, y, classes) return probs def save_model(metrics, epoch_nr): max_f1_idx = np.argmax(metrics["f1_macro_validate"]) max_f1 = np.max(metrics["f1_macro_validate"]) if epoch_nr == max_f1_idx and max_f1 > 0.01: # saving to network drives takes 5s (to local only 0.5s) -> do not save so often print(" Saving weights...") for fl in glob.glob(join(self.HP.EXP_PATH, "best_weights_ep*") ): # remove weights from previous epochs os.remove(fl) try: #Actually is a pkl not a npz PytorchUtils.save_checkpoint(join( self.HP.EXP_PATH, "best_weights_ep" + str(epoch_nr) + ".npz"), unet=net) except IOError: print( "\nERROR: Could not save weights because of IO Error\n" ) self.HP.BEST_EPOCH = epoch_nr def load_model(path): PytorchUtils.load_checkpoint(path, unet=net) def print_current_lr(): for param_group in optimizer.param_groups: ExpUtils.print_and_save( self.HP, "current learning rate: {}".format(param_group['lr'])) if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": NR_OF_GRADIENTS = self.HP.NR_OF_GRADIENTS # NR_OF_GRADIENTS = 9 # NR_OF_GRADIENTS = 9 * 5 # NR_OF_GRADIENTS = 9 * 9 # NR_OF_GRADIENTS = 33 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": self.HP.NR_OF_GRADIENTS = 3 * self.HP.NR_OF_CLASSES else: self.HP.NR_OF_GRADIENTS = 33 if self.HP.LOSS_FUNCTION == "soft_sample_dice": criterion = PytorchUtils.soft_sample_dice final_activation = "sigmoid" elif self.HP.LOSS_FUNCTION == "soft_batch_dice": criterion = PytorchUtils.soft_batch_dice final_activation = "sigmoid" else: # weights = torch.ones((self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, self.HP.INPUT_DIM[0], self.HP.INPUT_DIM[1])).cuda() # weights[:, 5, :, :] *= 10 #CA # weights[:, 21, :, :] *= 10 #FX_left # weights[:, 22, :, :] *= 10 #FX_right # criterion = nn.BCEWithLogitsLoss(weight=weights) criterion = nn.BCEWithLogitsLoss() final_activation = None net = UNet(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, 
n_filt=self.HP.UNET_NR_FILT, batchnorm=self.HP.BATCH_NORM, final_activation=final_activation) if torch.cuda.is_available(): net = net.cuda() # else: # net = UNet(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, n_filt=self.HP.UNET_NR_FILT, # batchnorm=self.HP.BATCH_NORM) # net = nn.DataParallel(net, device_ids=[0,1]) # if self.HP.TRAIN: # ExpUtils.print_and_save(self.HP, str(net), only_log=True) if self.HP.OPTIMIZER == "Adamax": optimizer = Adamax(net.parameters(), lr=self.HP.LEARNING_RATE) elif self.HP.OPTIMIZER == "Adam": #todo important: change # optimizer = Adam(net.parameters(), lr=self.HP.LEARNING_RATE) optimizer = Adam(net.parameters(), lr=self.HP.LEARNING_RATE, weight_decay=self.HP.WEIGHT_DECAY) else: raise ValueError("Optimizer not defined") # scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1) # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="max") if self.HP.LOAD_WEIGHTS: ExpUtils.print_verbose( self.HP, "Loading weights ... ({})".format( join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) load_model(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) self.train = train self.predict = test self.get_probs = predict self.save_model = save_model self.load_model = load_model self.print_current_lr = print_current_lr
def create_network(self): # torch.backends.cudnn.benchmark = True #not faster def train(X, y): X = torch.from_numpy(X.astype(np.float32)) y = torch.from_numpy(y.astype(np.float32)) if torch.cuda.is_available(): X, y = Variable(X.cuda()), Variable(y.cuda( )) # X: (bs, features, x, y) y: (bs, classes, x, y) else: X, y = Variable(X), Variable(y) optimizer.zero_grad() net.train() outputs = net(X) # forward # outputs: (bs, classes, x, y) loss = criterion(outputs, y) # loss = PytorchUtils.soft_dice(outputs, y) loss.backward() # backward optimizer.step() # optimise f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) if self.HP.USE_VISLOGGER: probs = outputs.data.cpu().numpy().transpose( 0, 2, 3, 1) # (bs, x, y, classes) else: probs = None #faster return loss.data[0], probs, f1 def test(X, y): X = torch.from_numpy(X.astype(np.float32)) y = torch.from_numpy(y.astype(np.float32)) if torch.cuda.is_available(): X, y = Variable(X.cuda(), volatile=True), Variable(y.cuda(), volatile=True) else: X, y = Variable(X, volatile=True), Variable(y, volatile=True) net.train(False) outputs = net(X) # forward loss = criterion(outputs, y) # loss = PytorchUtils.soft_dice(outputs, y) f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) # probs = outputs.data.cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) probs = None # faster return loss.data[0], probs, f1 def predict(X): X = torch.from_numpy(X.astype(np.float32)) if torch.cuda.is_available(): X = Variable(X.cuda(), volatile=True) else: X = Variable(X, volatile=True) net.train(False) outputs = net(X) # forward probs = outputs.data.cpu().numpy().transpose( 0, 2, 3, 1) # (bs, x, y, classes) return probs def save_model(metrics, epoch_nr): max_f1_idx = np.argmax(metrics["f1_macro_validate"]) max_f1 = np.max(metrics["f1_macro_validate"]) if epoch_nr == max_f1_idx and max_f1 > 0.01: # saving to network drives takes 5s (to local only 0.5s) -> do not save so often print(" Saving weights...") for fl in glob.glob(join(self.HP.EXP_PATH, "best_weights_ep*") ): # remove weights from previous epochs os.remove(fl) try: #Actually is a pkl not a npz PytorchUtils.save_checkpoint(join( self.HP.EXP_PATH, "best_weights_ep" + str(epoch_nr) + ".npz"), unet=net) except IOError: print( "\nERROR: Could not save weights because of IO Error\n" ) self.HP.BEST_EPOCH = epoch_nr def load_model(path): PytorchUtils.load_checkpoint(path, unet=net) def print_current_lr(): for param_group in optimizer.param_groups: ExpUtils.print_and_save( self.HP, "current learning rate: {}".format(param_group['lr'])) if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": NR_OF_GRADIENTS = 9 # NR_OF_GRADIENTS = 9 * 5 # NR_OF_GRADIENTS = 9 * 9 # NR_OF_GRADIENTS = 33 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": NR_OF_GRADIENTS = 3 * self.HP.NR_OF_CLASSES else: NR_OF_GRADIENTS = 33 if torch.cuda.is_available(): net = UNet(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, n_filt=self.HP.UNET_NR_FILT).cuda() else: net = UNet(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, n_filt=self.HP.UNET_NR_FILT) #Initialisation from U-Net Paper def weights_init(m): classname = m.__class__.__name__ # Do not use with batchnorm -> has to be adapted for batchnorm if classname.find('Conv') != -1: N = m.in_channels * m.kernel_size[0] * m.kernel_size[0] std = math.sqrt(2. 
/ N) m.weight.data.normal_(0.0, std) net.apply(weights_init) # net = nn.DataParallel(net, device_ids=[0,1]) if self.HP.TRAIN: ExpUtils.print_and_save(self.HP, str(net), only_log=True) criterion = nn.BCEWithLogitsLoss() optimizer = Adamax(net.parameters(), lr=self.HP.LEARNING_RATE) # optimizer = Adam(net.parameters(), lr=self.HP.LEARNING_RATE) #very slow (half speed of Adamax) -> strange # scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1) # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="max") if self.HP.LOAD_WEIGHTS: ExpUtils.print_verbose( self.HP, "Loading weights ... ({})".format( join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) load_model(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) self.train = train self.predict = test self.get_probs = predict self.save_model = save_model self.load_model = load_model self.print_current_lr = print_current_lr
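
# Hedged note (not from the repository above): the Variable/volatile idiom used there is
# pre-0.4 PyTorch. On current PyTorch the same evaluation step is normally written with
# torch.no_grad(), net.eval() and loss.item(). Minimal sketch assuming a generic `net`
# and `criterion`:
import numpy as np
import torch
import torch.nn as nn


def eval_step(net, criterion, X, y, device):
    """Forward pass without gradient tracking; returns the loss as a Python float."""
    X = torch.from_numpy(X.astype(np.float32)).to(device)
    y = torch.from_numpy(y.astype(np.float32)).to(device)
    net.eval()                    # replaces net.train(False)
    with torch.no_grad():         # replaces Variable(..., volatile=True)
        outputs = net(X)
        loss = criterion(outputs, y)
    return loss.item()            # replaces loss.data[0]


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    toy_net = nn.Linear(8, 3).to(device)
    X = np.random.rand(2, 8)
    y = np.random.rand(2, 3)
    print(eval_step(toy_net, nn.BCEWithLogitsLoss(), X, y, device))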
def train_model(args): # Read and process data train, dev, test, batch_size, test_batch_size, train_ques_to_para,\ dev_ques_to_para, test_ques_to_para, train_tokenized_paras,\ dev_tokenized_paras, test_tokenized_paras, train_order, dev_order, test_order,\ train_data, dev_data, test_data = read_and_process_data(args) # Build model num_pos_tags = len(train_data.dictionary.pos_tags) num_ner_tags = len(train_data.dictionary.ner_tags) model, config = build_model(args, train_data.dictionary.size(), train_data.dictionary.index_to_word, train_data.dictionary.word_to_index, num_pos_tags, num_ner_tags, train_data.dictionary.pos_tags, train_data.dictionary.ner_tags) if not os.path.exists(args.model_dir): os.mkdir(args.model_dir) #------------------------------ Train System ----------------------------------# # Should we resume running from an existing checkpoint? last_done_epoch = config['ckpt'] if last_done_epoch > 0: model = model.load(args.model_dir, last_done_epoch) print "Loaded model." if not args.disable_pretrained: print "Embedding shape:", model.embedding.shape if args.model_file is not None: model = model.load_from_file(args.model_file) print "Loaded model from %s." % args.model_file start_time = time.time() print "Starting training." if args.optimizer == "SGD": print "Using SGD optimizer." optimizer = SGD(model.parameters(), lr=args.learning_rate_start) elif args.optimizer == "Adamax": print "Using Adamax optimizer." optimizer = Adamax(model.parameters(), lr=args.learning_rate_start) if last_done_epoch > 0: if os.path.exists(args.model_dir + "/optim_%d.pt" % last_done_epoch): optimizer.load_state_dict( torch.load(args.model_dir + "/optim_%d.pt" % last_done_epoch)) else: print "Optimizer saved state not found. Not loading optimizer." else: assert False, "Unrecognized optimizer." print(model) print "Starting training loop." cur_learning_rate = args.learning_rate_start dev_loss_prev = float('inf') loss_increase_counter = 0 for EPOCH in range(last_done_epoch + 1, args.epochs): start_t = time.time() train_loss_sum = 0.0 model.set_train() for i, num in enumerate(train_order): print "\r[%.2f%%] Train epoch %d, %.2f s - (Done %d of %d)" %\ ((100.0 * (i+1))/len(train_order), EPOCH, (time.time()-start_t)*(len(train_order)-i-1)/(i+1), i+1, len(train_order)), # Create next batch by getting lengths and padding train_batch = train[num:num + batch_size] # Zero previous gradient. model.zero_grad() # Predict on the network_id assigned to this minibatch. model(*get_batch( train_batch, train_ques_to_para, train_tokenized_paras, train_data.paras_pos_tags, train_data.paras_ner_tags, train_data.question_pos_tags, train_data.question_ner_tags, num_pos_tags, num_ner_tags)) model.loss.backward() optimizer.step() train_loss_sum += model.loss.data[0] print "Loss Total: %.5f, Cur: %.5f (in time %.2fs) " % \ (train_loss_sum/(i+1), model.loss.data[0], time.time() - start_t), if args.show_losses and args.f1_loss_multiplier > 0: print "[MLE: %.5f, F1: %.5f]" % (model.mle_loss.data[0], model.f1_loss.data[0]), sys.stdout.flush() if args.debug_level >= 3: print "" model.free_memory() print "\nLoss: %.5f (in time %.2fs)" % \ (train_loss_sum/len(train_order), time.time() - start_t) # End of epoch. random.shuffle(train_order) model.zero_grad() model.save(args.model_dir, EPOCH) # Decrease learning rate, and save the current optimizer state. 
for param in optimizer.param_groups: param['lr'] *= config['decay_rate'] cur_learning_rate *= config['decay_rate'] if args.optimizer == "Adamax": torch.save(optimizer.state_dict(), args.model_dir + "/optim_%d.pt" % EPOCH) # Run pass over dev data. dev_start_t = time.time() dev_loss_sum = 0.0 all_predictions = {} print "\nRunning on Dev." model.set_eval() for i, num in enumerate(dev_order): print "\rDev: %.2f s (Done %d of %d)" %\ ((time.time()-dev_start_t)*(len(dev_order)-i-1)/(i+1), i+1, len(dev_order)), dev_batch = dev[num:num + test_batch_size] # distributions[{0,1}][{0,1}].shape = (batch, max_passage_len) # Predict using both networks. distributions = \ model(*get_batch(dev_batch, dev_ques_to_para, dev_tokenized_paras, dev_data.paras_pos_tags, dev_data.paras_ner_tags, dev_data.question_pos_tags, dev_data.question_ner_tags, num_pos_tags, num_ner_tags)) # Add predictions to all answers. get_batch_answers(args, dev_batch, all_predictions, distributions, dev_data) dev_loss_sum += model.loss.data[0] print "[Average loss : %.5f, Cur: %.5f]" % (dev_loss_sum / (i + 1), model.loss.data[0]), sys.stdout.flush() model.free_memory() # Print dev stats for epoch print "\nDev Loss: %.4f (in time: %.2f s)" %\ (dev_loss_sum/len(dev_order), (time.time() - dev_start_t)) # Dump the results json in the required format print "Dumping prediction results." json.dump( all_predictions, open(args.model_dir + "/dev_predictions_" + str(EPOCH) + ".json", "w")) print "Done." # Break if validation loss doesn't decrease for specified num of epochs. if dev_loss_sum / len(dev_order) >= dev_loss_prev: loss_increase_counter += 1 print "Dev loss hasn't decreased (prev = %.5f, cur = %.5f)." %\ (dev_loss_prev, dev_loss_sum/len(dev_order)) if loss_increase_counter >= args.loss_increase_epochs: break else: loss_increase_counter = 0 dev_loss_prev = dev_loss_sum / len(dev_order) print "Training complete!"
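
# Hedged aside (illustrative only, not the original training script): the manual decay
# above (param_group['lr'] *= decay_rate once per epoch) matches stepping an ExponentialLR
# scheduler once per epoch. The toy model and decay_rate below are assumptions.
import torch
from torch.optim import Adamax
from torch.optim.lr_scheduler import ExponentialLR

toy_model = torch.nn.Linear(10, 1)
decay_rate = 0.9
optimizer = Adamax(toy_model.parameters(), lr=1e-3)
scheduler = ExponentialLR(optimizer, gamma=decay_rate)

for epoch in range(3):
    # ... training pass over the data would go here ...
    optimizer.step()   # placeholder so the optimizer is stepped before the scheduler
    scheduler.step()   # multiplies every param group's lr by decay_rate
    print(epoch, optimizer.param_groups[0]["lr"])

# The optimizer state can be checkpointed and restored the same way as above:
torch.save(optimizer.state_dict(), "optim_checkpoint.pt")
optimizer.load_state_dict(torch.load("optim_checkpoint.pt"))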
tol_d = 0.35
tol_g = 0.2
history_loss_d = []
history_loss_g = []

loss = nn.BCELoss()

## Define the size of the problem
coding_dim = int(sys.argv[1])
working_dim = 2211

## Declare the generator and the discriminator
G = Generator(coding_dim, working_dim)
D = Discriminator(working_dim, 1)
optimizer_G = Adamax(G.parameters())
optimizer_D = Adamax(D.parameters())

# Prepare the dataset
train_data = Database(csv_target=database_folder + "data.csv",
                      csv_input=database_folder + "data.csv",
                      nb_data=nb_data).get_loader()
trainloader = DataLoader(train_data, batch_size=int(data_to_load), shuffle=True)

# Start the training over `epochs` epochs
for e in range(epochs):
    print(str(e) + "/" + str(epochs))
    score = 0

    # Train the discriminator
    G.eval()
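
# Hedged, standalone sketch (the training loop above is truncated; this is not its
# continuation): one alternating discriminator/generator update driven by two Adamax
# optimizers. The toy Generator/Discriminator stand-ins and the dimensions are assumptions.
import torch
import torch.nn as nn
from torch.optim import Adamax

coding_dim, working_dim, batch = 16, 32, 8
G = nn.Sequential(nn.Linear(coding_dim, working_dim))          # stand-in generator
D = nn.Sequential(nn.Linear(working_dim, 1), nn.Sigmoid())     # stand-in discriminator
optimizer_G = Adamax(G.parameters())
optimizer_D = Adamax(D.parameters())
bce = nn.BCELoss()

real = torch.rand(batch, working_dim)
z = torch.randn(batch, coding_dim)

# Discriminator step: real samples labelled 1, generated samples labelled 0.
optimizer_D.zero_grad()
loss_d = bce(D(real), torch.ones(batch, 1)) + bce(D(G(z).detach()), torch.zeros(batch, 1))
loss_d.backward()
optimizer_D.step()

# Generator step: try to make D label the fakes as real.
optimizer_G.zero_grad()
loss_g = bce(D(G(z)), torch.ones(batch, 1))
loss_g.backward()
optimizer_G.step()
print(loss_d.item(), loss_g.item())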
def create_network(self): # torch.backends.cudnn.benchmark = True #not faster def train(X, y, weight_factor=10): X = torch.tensor(X, dtype=torch.float32).to(device) # X: (bs, features, x, y) y: (bs, classes, x, y) y = torch.tensor(y, dtype=torch.float32).to(device) optimizer.zero_grad() net.train() outputs, outputs_sigmoid = net(X) # forward # outputs: (bs, classes, x, y) if weight_factor > 1: # weights = torch.ones((self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, self.HP.INPUT_DIM[0], self.HP.INPUT_DIM[1])).cuda() weights = torch.ones((self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, y.shape[2], y.shape[3])).cuda() bundle_mask = y > 0 weights[bundle_mask.data] *= weight_factor # 10 if self.HP.EXPERIMENT_TYPE == "peak_regression": loss = criterion(outputs, y, weights) else: loss = nn.BCEWithLogitsLoss(weight=weights)(outputs, y) else: if self.HP.LOSS_FUNCTION == "soft_sample_dice" or self.HP.LOSS_FUNCTION == "soft_batch_dice": loss = criterion(outputs_sigmoid, y) # loss = criterion(outputs_sigmoid, y) + nn.BCEWithLogitsLoss()(outputs, y) else: loss = criterion(outputs, y) loss.backward() # backward optimizer.step() # optimise if self.HP.EXPERIMENT_TYPE == "peak_regression": # f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) # f1_a = MetricUtils.calc_peak_dice_pytorch(self.HP, outputs.data, y.data, max_angle_error=self.HP.PEAK_DICE_THR) f1 = MetricUtils.calc_peak_length_dice_pytorch(self.HP, outputs.detach(), y.detach(), max_angle_error=self.HP.PEAK_DICE_THR, max_length_error=self.HP.PEAK_DICE_LEN_THR) # f1 = (f1_a, f1_b) elif self.HP.EXPERIMENT_TYPE == "dm_regression": #density map regression f1 = PytorchUtils.f1_score_macro(y.detach()>0.5, outputs.detach(), per_class=True) else: f1 = PytorchUtils.f1_score_macro(y.detach(), outputs_sigmoid.detach(), per_class=True, threshold=self.HP.THRESHOLD) if self.HP.USE_VISLOGGER: # probs = outputs_sigmoid.detach().cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) probs = outputs_sigmoid else: probs = None #faster return loss.item(), probs, f1 def test(X, y, weight_factor=10): with torch.no_grad(): X = torch.tensor(X, dtype=torch.float32).to(device) y = torch.tensor(y, dtype=torch.float32).to(device) if self.HP.DROPOUT_SAMPLING: net.train() else: net.train(False) outputs, outputs_sigmoid = net(X) # forward if weight_factor > 1: # weights = torch.ones((self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, self.HP.INPUT_DIM[0], self.HP.INPUT_DIM[1])).cuda() weights = torch.ones((self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, y.shape[2], y.shape[3])).cuda() bundle_mask = y > 0 weights[bundle_mask.data] *= weight_factor # 10 if self.HP.EXPERIMENT_TYPE == "peak_regression": loss = criterion(outputs, y, weights) else: loss = nn.BCEWithLogitsLoss(weight=weights)(outputs, y) else: if self.HP.LOSS_FUNCTION == "soft_sample_dice" or self.HP.LOSS_FUNCTION == "soft_batch_dice": loss = criterion(outputs_sigmoid, y) # loss = criterion(outputs_sigmoid, y) + nn.BCEWithLogitsLoss()(outputs, y) else: loss = criterion(outputs, y) if self.HP.EXPERIMENT_TYPE == "peak_regression": # f1 = PytorchUtils.f1_score_macro(y.data, outputs.data, per_class=True) # f1_a = MetricUtils.calc_peak_dice_pytorch(self.HP, outputs.data, y.data, max_angle_error=self.HP.PEAK_DICE_THR) f1 = MetricUtils.calc_peak_length_dice_pytorch(self.HP, outputs.detach(), y.detach(), max_angle_error=self.HP.PEAK_DICE_THR, max_length_error=self.HP.PEAK_DICE_LEN_THR) # f1 = (f1_a, f1_b) elif self.HP.EXPERIMENT_TYPE == "dm_regression": #density map regression f1 = PytorchUtils.f1_score_macro(y.detach()>0.5, 
outputs.detach(), per_class=True) else: f1 = PytorchUtils.f1_score_macro(y.detach(), outputs_sigmoid.detach(), per_class=True, threshold=self.HP.THRESHOLD) if self.HP.USE_VISLOGGER: # probs = outputs_sigmoid.detach().cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) probs = outputs_sigmoid else: probs = None # faster return loss.item(), probs, f1 def predict(X): with torch.no_grad(): X = torch.tensor(X, dtype=torch.float32).to(device) if self.HP.DROPOUT_SAMPLING: net.train() else: net.train(False) outputs, outputs_sigmoid = net(X) # forward if self.HP.EXPERIMENT_TYPE == "peak_regression" or self.HP.EXPERIMENT_TYPE == "dm_regression": probs = outputs.detach().cpu().numpy().transpose(0,2,3,1) # (bs, x, y, classes) else: probs = outputs_sigmoid.detach().cpu().numpy().transpose(0, 2, 3, 1) # (bs, x, y, classes) return probs def save_model(metrics, epoch_nr): max_f1_idx = np.argmax(metrics["f1_macro_validate"]) max_f1 = np.max(metrics["f1_macro_validate"]) if epoch_nr == max_f1_idx and max_f1 > 0.01: # saving to network drives takes 5s (to local only 0.5s) -> do not save so often print(" Saving weights...") for fl in glob.glob(join(self.HP.EXP_PATH, "best_weights_ep*")): # remove weights from previous epochs os.remove(fl) try: #Actually is a pkl not a npz PytorchUtils.save_checkpoint(join(self.HP.EXP_PATH, "best_weights_ep" + str(epoch_nr) + ".npz"), unet=net) except IOError: print("\nERROR: Could not save weights because of IO Error\n") self.HP.BEST_EPOCH = epoch_nr def load_model(path): PytorchUtils.load_checkpoint(path, unet=net) def print_current_lr(): for param_group in optimizer.param_groups: ExpUtils.print_and_save(self.HP, "current learning rate: {}".format(param_group['lr'])) if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": NR_OF_GRADIENTS = self.HP.NR_OF_GRADIENTS # NR_OF_GRADIENTS = 9 # NR_OF_GRADIENTS = 9 * 5 # NR_OF_GRADIENTS = 9 * 9 # NR_OF_GRADIENTS = 33 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": self.HP.NR_OF_GRADIENTS = 3*self.HP.NR_OF_CLASSES else: self.HP.NR_OF_GRADIENTS = 33 if self.HP.LOSS_FUNCTION == "soft_sample_dice": criterion = PytorchUtils.soft_sample_dice elif self.HP.LOSS_FUNCTION == "soft_batch_dice": criterion = PytorchUtils.soft_batch_dice elif self.HP.EXPERIMENT_TYPE == "peak_regression": criterion = PytorchUtils.angle_length_loss else: # weights = torch.ones((self.HP.BATCH_SIZE, self.HP.NR_OF_CLASSES, self.HP.INPUT_DIM[0], self.HP.INPUT_DIM[1])).cuda() # weights[:, 5, :, :] *= 10 #CA # weights[:, 21, :, :] *= 10 #FX_left # weights[:, 22, :, :] *= 10 #FX_right # criterion = nn.BCEWithLogitsLoss(weight=weights) criterion = nn.BCEWithLogitsLoss() NetworkClass = getattr(importlib.import_module("tractseg.models." 
+ self.HP.MODEL), self.HP.MODEL) net = NetworkClass(n_input_channels=NR_OF_GRADIENTS, n_classes=self.HP.NR_OF_CLASSES, n_filt=self.HP.UNET_NR_FILT, batchnorm=self.HP.BATCH_NORM, dropout=self.HP.USE_DROPOUT) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") net = net.to(device) # if self.HP.TRAIN: # ExpUtils.print_and_save(self.HP, str(net), only_log=True) if self.HP.OPTIMIZER == "Adamax": optimizer = Adamax(net.parameters(), lr=self.HP.LEARNING_RATE) elif self.HP.OPTIMIZER == "Adam": optimizer = Adam(net.parameters(), lr=self.HP.LEARNING_RATE) # optimizer = Adam(net.parameters(), lr=self.HP.LEARNING_RATE, weight_decay=self.HP.WEIGHT_DECAY) else: raise ValueError("Optimizer not defined") if self.HP.LR_SCHEDULE: scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1) # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="max") self.scheduler = scheduler if self.HP.LOAD_WEIGHTS: ExpUtils.print_verbose(self.HP, "Loading weights ... ({})".format(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) load_model(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) if self.HP.RESET_LAST_LAYER: # net.conv_5 = conv2d(self.HP.UNET_NR_FILT, self.HP.NR_OF_CLASSES, kernel_size=1, stride=1, padding=0, bias=True).to(device) net.conv_5 = nn.Conv2d(self.HP.UNET_NR_FILT, self.HP.NR_OF_CLASSES, kernel_size=1, stride=1, padding=0, bias=True).to(device) self.train = train self.predict = test self.get_probs = predict self.save_model = save_model self.load_model = load_model self.print_current_lr = print_current_lr
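
# Hedged note (illustrative, not part of the snippet above): when the last layer is
# replaced for transfer learning as in RESET_LAST_LAYER, an optimizer built earlier still
# references the old layer's parameters, so it is usually re-created afterwards. The toy
# model below is an assumption used only to show the pattern.
import torch.nn as nn
from torch.optim import Adamax

toy_net = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.Conv2d(16, 4, 1))
optimizer = Adamax(toy_net.parameters(), lr=1e-3)

# Swap the final 1x1 conv to predict a different number of classes ...
toy_net[2] = nn.Conv2d(16, 7, kernel_size=1, stride=1, padding=0, bias=True)
# ... and rebuild the optimizer so it tracks the new layer's parameters.
optimizer = Adamax(toy_net.parameters(), lr=1e-3)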
def configure_optimizers(self): optimizer = Adamax(self.parameters(), lr=self.lrate) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 9999999999, eta_min=1e-4) return [optimizer], [scheduler]
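
# Hedged aside (plain PyTorch, outside the Lightning module above): CosineAnnealingLR
# anneals the lr from its initial value down to eta_min over T_max steps, so the very
# large T_max used above keeps the lr essentially constant. Minimal sketch with assumed
# values:
import torch
from torch.optim import Adamax
from torch.optim.lr_scheduler import CosineAnnealingLR

param = torch.nn.Parameter(torch.zeros(1))
opt = Adamax([param], lr=2e-3)
sched = CosineAnnealingLR(opt, T_max=100, eta_min=1e-4)
for _ in range(5):
    opt.step()
    sched.step()
print(opt.param_groups[0]["lr"])   # slightly below 2e-3 after 5 of 100 steps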
def __init__(self, Config, inference=False): self.Config = Config # Do not use during inference because uses a lot more memory if not inference: torch.backends.cudnn.benchmark = True if self.Config.NR_CPUS > 0: torch.set_num_threads(self.Config.NR_CPUS) if self.Config.SEG_INPUT == "Peaks" and self.Config.TYPE == "single_direction": NR_OF_GRADIENTS = self.Config.NR_OF_GRADIENTS elif self.Config.SEG_INPUT == "Peaks" and self.Config.TYPE == "combined": self.Config.NR_OF_GRADIENTS = 3 * self.Config.NR_OF_CLASSES else: self.Config.NR_OF_GRADIENTS = 33 if self.Config.LOSS_FUNCTION == "soft_sample_dice": self.criterion = pytorch_utils.soft_sample_dice elif self.Config.LOSS_FUNCTION == "soft_batch_dice": self.criterion = pytorch_utils.soft_batch_dice elif self.Config.EXPERIMENT_TYPE == "peak_regression": if self.Config.LOSS_FUNCTION == "angle_length_loss": self.criterion = pytorch_utils.angle_length_loss elif self.Config.LOSS_FUNCTION == "angle_loss": self.criterion = pytorch_utils.angle_loss elif self.Config.LOSS_FUNCTION == "l2_loss": self.criterion = pytorch_utils.l2_loss elif self.Config.EXPERIMENT_TYPE == "dm_regression": # self.criterion = nn.MSELoss() # aggregate by mean self.criterion = nn.MSELoss(size_average=False, reduce=True) # aggregate by sum else: self.criterion = nn.BCEWithLogitsLoss() NetworkClass = getattr( importlib.import_module("tractseg.models." + self.Config.MODEL.lower()), self.Config.MODEL) self.net = NetworkClass(n_input_channels=NR_OF_GRADIENTS, n_classes=self.Config.NR_OF_CLASSES, n_filt=self.Config.UNET_NR_FILT, batchnorm=self.Config.BATCH_NORM, dropout=self.Config.USE_DROPOUT, upsample=self.Config.UPSAMPLE_TYPE) # MultiGPU setup # (Not really faster (max 10% speedup): GPU and CPU utility low) # nr_gpus = torch.cuda.device_count() # exp_utils.print_and_save(self.Config, "nr of gpus: {}".format(nr_gpus)) # self.net = nn.DataParallel(self.net) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") net = self.net.to(self.device) if self.Config.OPTIMIZER == "Adamax": self.optimizer = Adamax(net.parameters(), lr=self.Config.LEARNING_RATE, weight_decay=self.Config.WEIGHT_DECAY) elif self.Config.OPTIMIZER == "Adam": self.optimizer = Adam(net.parameters(), lr=self.Config.LEARNING_RATE, weight_decay=self.Config.WEIGHT_DECAY) else: raise ValueError("Optimizer not defined") if APEX_AVAILABLE and self.Config.FP16: # Use O0 to disable fp16 (might be a little faster on TitanX) self.net, self.optimizer = amp.initialize(self.net, self.optimizer, verbosity=0, opt_level="O1") if not inference: print("INFO: Using fp16 training") else: if not inference: print("INFO: Did not find APEX, defaulting to fp32 training") if self.Config.LR_SCHEDULE: self.scheduler = lr_scheduler.ReduceLROnPlateau( self.optimizer, mode=self.Config.LR_SCHEDULE_MODE, patience=self.Config.LR_SCHEDULE_PATIENCE) if self.Config.LOAD_WEIGHTS: exp_utils.print_verbose( self.Config, "Loading weights ... ({})".format( join(self.Config.EXP_PATH, self.Config.WEIGHTS_PATH))) self.load_model( join(self.Config.EXP_PATH, self.Config.WEIGHTS_PATH)) # Reset weights of last layer for transfer learning if self.Config.RESET_LAST_LAYER: self.net.conv_5 = nn.Conv2d(self.Config.UNET_NR_FILT, self.Config.NR_OF_CLASSES, kernel_size=1, stride=1, padding=0, bias=True).to(self.device)
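
# Hedged note (not from the class above): unlike StepLR or CosineAnnealingLR,
# ReduceLROnPlateau is stepped with the monitored metric itself, e.g. a validation score
# when mode="max". Minimal sketch with assumed values:
import torch
from torch.optim import Adamax
from torch.optim.lr_scheduler import ReduceLROnPlateau

toy_model = torch.nn.Linear(4, 2)
optimizer = Adamax(toy_model.parameters(), lr=1e-3, weight_decay=0.0)
scheduler = ReduceLROnPlateau(optimizer, mode="max", patience=2, factor=0.5)

for val_f1 in [0.50, 0.50, 0.50, 0.50, 0.50]:
    # ... one epoch of training and validation would happen here ...
    scheduler.step(val_f1)   # lr is halved once the metric stops improving for `patience` epochs
print(optimizer.param_groups[0]["lr"])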
if arg.embedding == 'elmo':
    embedding = ElmoEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-medium',
                              requires_grad=True)
elif arg.embedding == 'glove':
    embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-glove-840b-300d',
                                requires_grad=True, normalize=False)
else:
    raise RuntimeError(f'{arg.embedding} embedding is not supported yet!')

# define model
model = ESIM(embedding, num_labels=len(data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = Adamax(lr=arg.lr, params=model.parameters())
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)  # multiply the learning rate by 0.5 every 10 epochs
callbacks = [
    GradientClipCallback(clip_value=10),  # equivalent to torch.nn.utils.clip_grad_norm_(10)
    LRScheduler(scheduler),
]

if arg.task in ['snli']:
    # evaluate the test set in every epoch if the task is snli
    callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name]))

# define trainer
trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name],