def ex(net, learning_rate, split, epsilon, beta, dur, n_epochs, targets,
       batch_sz, shuffle, data_path, overlap_chunk=True, kernel_multiplier=1,
       train_id=None):
    """Run one training experiment end to end.

    Builds the param dict, trains on ``<data_path>/train.h5``, checkpoints the
    network under ``results/`` and evaluates it.

    Args:
        net: network spec passed through as ``params['network']``.
        learning_rate, epsilon, beta: optimizer hyper-parameters.
        split: path to a joblib-dumped dict of train/valid index lists.
        dur: chunk duration in frames.
        n_epochs, batch_sz, shuffle: training loop settings.
        targets: target labels to train on.
        data_path: directory containing ``train.h5`` (and ``test.h5``).
        overlap_chunk: whether chunks may overlap.
        kernel_multiplier: width multiplier for conv kernels.
        train_id: experiment id; a fresh UUID is generated when None.

    Returns:
        (train_id, f1, ll) — the experiment id and evaluation scores.
    """
    if train_id is None:
        train_id = uuid.uuid4()

    logger = tblog.Logger('runs/{}'.format(train_id))

    params = {
        'network': net,
        'data_fn': os.path.join(data_path, 'train.h5'),
        'scaler': SCALER_FN,
        'split_fn': split,
        'learning_rate': learning_rate,
        'epsilon': epsilon,
        'beta': beta,
        'verbose': True,
        'n_epochs': n_epochs,
        'batch_sz': batch_sz,
        'dur': dur,  # frames
        'overlap_chunk': bool(overlap_chunk),
        'kernel_multiplier': kernel_multiplier,
        'report_every': 100,
        'class_weight': False,
        'prepare_submission': False,
        'iter': 0
    }
    params['targets'] = targets

    # NOTE(review): file mode left implicit as in the original — presumably
    # read-only access is sufficient; confirm before pinning mode='r'.
    with h5py.File(params['data_fn']) as hf:
        # load split info
        split = joblib.load(params['split_fn'])
        # Fix: the original used Py2-only `iteritems()` and stored a lazy
        # `map` object on Py3 (single-use iterator). Materialize int lists
        # instead; this works identically on Py2 and Py3.
        params.update(
            {'split': {k: [int(i) for i in v] for k, v in split.items()}})

        # load class weight if needed
        if params['class_weight']:
            params.update(
                {'class_weight': get_class_weight(hf['y'][:])}
            )

        mdl, net = build(network(params), params)
        train(mdl, hf, params, shuffle, logger)
        save_check_point(net, params, train_id, path='results/')
        f1, ll = evaluate(mdl, hf, params)

        if params['prepare_submission']:
            # predict test dataset and prepare submission
            test(mdl, hf, train_id, os.path.join(data_path, 'test.h5'), params)

    return train_id, f1, ll
def main():
    """Drive train/test runs for every model listed in the JSON config.

    For each model entry, TensorFlow flags are reset and redefined from the
    config, then `train` and/or `test` are invoked depending on the entry's
    'training'/'testing' switches.
    """
    config_dict = import_config_settings()
    logs_folder = config_dict['logs_folder']
    data_sets_folder = config_dict['data_sets_folder']
    trained_models_folder = config_dict['trained_models_folder']
    for model in config_dict['models']:
        logger = logger_for_print(folder=logs_folder)
        fasttext_model_path = model['fasttext_model_path']
        #copy_data_files(data_folder=data_sets_folder + model['data_set'])
        data_set = model['data_set']
        # flags are global state; wipe them so each model starts clean
        del_all_flags(tf.flags.FLAGS)
        flags.DEFINE_string(
            'data_dir', data_sets_folder + '/' + data_set,
            'data directory. Should contain train.txt/valid.txt/test.txt with input data'
        )
        flags.DEFINE_string('fasttext_model_path', fasttext_model_path,
                            'fasttext trained model path')
        flags.DEFINE_string('embedding', model['embedding'],
                            'embedding method')
        define_flags()
        if model['training']:
            # timestamped run directory so repeated runs never collide
            trained_model_folder = trained_models_folder + '/' + data_set + '_' + str(
                datetime.datetime.now().strftime('%Y-%m-%d--%H-%M-%S'))
            flags.DEFINE_string(
                'train_dir', trained_model_folder,
                'training directory (models and summaries are saved there periodically)'
            )
            train(logger)
        if model['testing']:
            # NOTE(review): if 'training' was also true, 'train_dir' gets
            # DEFINE'd a second time here — presumably del_all_flags or the
            # flags lib tolerates this; confirm no DuplicateFlagError.
            trained_model_folder = model['checkpoint_file_for_test']
            flags.DEFINE_string(
                'train_dir',
                os.path.dirname(os.path.abspath(trained_model_folder)),
                'training directory (models and summaries are saved there periodically)'
            )
            checkpoint_file = checkpoint_file_from_number(
                model, trained_model_folder)
            logger("test on model file : " + str(checkpoint_file))
            # no checkpoint found: stop processing the remaining models
            if not checkpoint_file:
                break
            checkpoint_file = checkpoint_file.replace(".index", "")
            tf.flags.DEFINE_string(
                'load_model_for_test', checkpoint_file,
                '(optional) filename of the model to load. Useful for re-starting training from a checkpoint'
            )
            test(logger)
def evaluation(param, test_file='validation'):
    """Evaluate the model on ``test_file`` using the configured metric.

    Args:
        param: experiment config dict; reads 'evaluation_metric' and
            'iteration'.
        test_file: which split to evaluate ('validation' by default).

    Prints the metric value; returns None.
    """
    if param['evaluation_metric'] == 'auc':
        acc = e.accuracy(param, test_file)
        print('Iteration %s, Accuracy: %s' % (param['iteration'], acc))
    elif param['evaluation_metric'] == 'mean_rank':
        # Fix: the original hard-coded test_file='validation' here, silently
        # ignoring the caller's argument in the mean_rank branch.
        # (`is_filetered` is the keyword spelling of the project API.)
        hits, top_hits, mean_rank = e.test(param,
                                           test_file=test_file,
                                           is_filetered=False)
        print('Iteration %s, mean_rank: %s, top_hits: %s, hits %s' %
              (param['iteration'], mean_rank, top_hits, hits))
def testing(model, test_image, step, loss):
    """Render the model's prediction for one image and save it as a PNG.

    The file is written under ``<opt.save_path>/test_result/`` and named
    after the current step and loss; the model is returned to train mode.
    """
    model.eval()  # inference mode while rendering
    _, _, rendered = test(model, np.array([test_image]))
    out_name = 'test_result/result_' + str(step) + '_' + str(loss) + '.png'
    out_path = Path(opt.save_path) / out_name
    cv2.imwrite(str(out_path), rendered[0])
    model.train()  # restore training mode for the caller
def _retrain(self):
    """Retrain the pruned, weight-reinitialized network, testing each epoch.

    Stops early when `train` reports a stop signal, printing the overall
    iteration (within-epoch iteration plus full epochs already completed).
    """
    for epoch_idx in range(1, self.epochs + 1):
        stop, stopping_iteration = train(self.network, self.device,
                                         self.train_loader, self.val_loader,
                                         self.test_loader, self.optimizer,
                                         epoch_idx)
        self.scheduler.step()
        # evaluate on the test set after every epoch
        test(self.network, self.device, self.test_loader)
        if not stop:
            continue
        # translate the within-epoch iteration into an overall count
        iters_per_epoch = len(self.train_loader.dataset) / self.batch_size
        overall = stopping_iteration + iters_per_epoch * (epoch_idx - 1)
        print('Stopped at overall iteration {}\n'.format(overall))
        break
def main():
    """Drive train/test runs for every model listed in the JSON config.

    Unlike the sibling driver, all flags (static plus per-model dynamic
    values) are defined in a single `define_flags` call.
    """
    config_dict = import_config_settings()
    logs_folder = config_dict['logs_folder']
    data_sets_folder = config_dict['data_sets_folder']
    trained_models_folder = config_dict['trained_models_folder']
    for model in config_dict['models']:
        logger = logger_for_print(folder=logs_folder,
                                  file_name=config_dict['data_sets_folder'])
        fasttext_model_path = model['fasttext_model_path']
        #copy_data_files(data_folder=data_sets_folder + model['data_set'])
        data_set = model['data_set']
        # flags are global state; wipe them so each model starts clean
        del_all_flags(tf.flags.FLAGS)
        # timestamped run directory so repeated runs never collide
        trained_model_folder = trained_models_folder + '/' + data_set + '_' + str(
            datetime.datetime.now().strftime('%Y-%m-%d--%H-%M-%S'))
        # define train config with static and dynamic values from config.json
        define_flags(
            data_dir=data_sets_folder + '/' + data_set,
            train_dir=trained_model_folder,
            fasttext_model_path=fasttext_model_path,
            embedding=model['embedding'],
            max_epochs=model['max_epochs'],
            rnn_size=model['rnn_size'],
            rnn_layers=model['rnn_layers'],
            highway_layers=model['highway_layers']
        )
        if model['training']:
            train(logger)
        if model['testing']:
            checkpoint_file = checkpoint_file_from_number(model,
                                                          trained_model_folder)
            logger("test on model file : " + str(checkpoint_file))
            # no checkpoint found: stop processing the remaining models
            if not checkpoint_file:
                break
            checkpoint_file = checkpoint_file.replace(".index", "")
            tf.flags.DEFINE_string('load_model_for_test', checkpoint_file,
                                   '(optional) filename of the model to load. Useful for re-starting training from a checkpoint')
            test(logger)
def submit_my_code():
    """Record the student's submission, compile it, and return HTML-ready
    compile/run output as a dict."""
    source = request.form['code']
    # remember the latest submission for this student id
    submissions[session['nmec']] = source
    status, raw_output = evaluate.compile(source)
    compile_output = raw_output.replace('\n', '<br>')
    # only execute the program when compilation succeeded (status 0)
    if status == 0:
        program_output = evaluate.test(request.form['stdin']).replace(
            '\n', '<br>')
    else:
        program_output = ''
    return {
        'compile_code': status,
        'compile_output': compile_output,
        'program_output': program_output,
    }
def main(opt):
    """Train ResidualRNNConv on GPU, track test accuracy per epoch, and
    checkpoint whenever accuracy exceeds 0.8."""
    train_loader = load_train_dataset(opt)
    test_loader = load_test_dataset(opt)

    # model + loss live on the GPU
    print('shift model and criterion to GPU .. ')
    model = models.ResidualRNNConv().cuda()
    print(model)
    criterion = nn.BCELoss().cuda()

    # tensorboard log directory, tagged with the model class name
    logger = SummaryWriter(comment=f'_{model.__class__.__name__}')

    if opt.init_model:
        print(f'loading pretrained model from {opt.init_model}')
        state = torch.load(opt.init_model,
                           map_location=lambda storage, loc: storage.cuda())
        model.load_state_dict(state)

    # optimizer + LR scheduler driven by the training loss plateau
    optimizer = optim.Adam(model.parameters(),
                           opt.lr,
                           weight_decay=opt.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer,
                                  verbose=True,
                                  patience=opt.scheduler_patience)

    accuracy = []
    for epoch in range(opt.max_epochs):
        loss = train(train_loader, model, criterion, optimizer, epoch + 1,
                     opt, logger)
        scheduler.step(loss)

        right, simmat = evaluate.test(test_loader, model, opt)
        accuracy.append(right)
        # log test accuracy and the similarity matrix distribution
        logger.add_scalar('accuracy', right, epoch + 1)
        logger.add_histogram('simmat', simmat, epoch + 1, 'auto')

        # checkpoint only sufficiently accurate epochs
        if right > 0.8:
            ckpt_name = f'{model.__class__.__name__}_{epoch + 1}_{right:.2%}.pth'
            ckpt_path = os.path.join(opt.checkpoint_folder, ckpt_name)
            utils.save_checkpoint(model.state_dict(), ckpt_path)

    print(
        f'Max test accuracy: {np.max(accuracy):.2%} at epoch {(np.argmax(accuracy)+1)}'
    )
loss = criterion(raw_scores, pseudotargets) loss.backward() encoder_optimizer.step() train_loss += loss.item() t.set_description('Loss: %.3f ' % (train_loss / (batch_idx + 1))) for epoch in range(start_epoch, start_epoch + args.num_epochs): train(epoch) if (args.test_freq > 0) and (epoch % args.test_freq == (args.test_freq - 1)): X, y = encode_train_set(clftrainloader, device, net) clf = train_clf(X, y, net.representation_dim, num_classes, device, reg_weight=1e-5) acc = test(testloader, device, net, clf) if acc > best_acc: best_acc = acc save_checkpoint(net, clf, critic, epoch, args, os.path.basename(__file__)) elif args.test_freq == 0: save_checkpoint(net, clf, critic, epoch, args, os.path.basename(__file__)) if args.cosine_anneal: scheduler.step()
cuda=args.cuda, best=args.best, model_dir=args.modeldir, log_dir=args.logdir, verbose=args.verbose) ############################################### ## Predict ## ############################################### if not args.test: load_model(args.modeldir, cnn) else: logger.info('Testing on val set:') val_acc = test(cnn, val_iter, text_field, label_field, cuda=args.cuda, verbose=args.verbose) predict(cnn, val_iter, text_field, label_field, os.path.join(args.predout, 'predict_val.txt'), cuda=args.cuda, verbose=args.verbose) predict(cnn, test_iter, text_field, label_field, os.path.join(args.predout, 'predict_test.txt'), cuda=args.cuda,
def train(self, model, train_iter, dev_iter, num_epochs=60, patience=20, roll_in_k=12, roll_out_p=0.5, beam_width=4, clip=10.0, l2=0.0, cuda=False, best=True, model_dir='../model/', verbose=False, start_epoch=1, lambda_q=0.01): """ @TODO: time """ # Zero gradients of both optimizers optim = self.optim(model.transducer_parameters(), **self.optim_args) optim_q = torch.optim.Adam(model.discriminator_parameters(), lr=0.001) self._reset_histories() if cuda: model.cuda() self.logger.info('START TRAIN') self.logger.info('CUDA = ' + str(cuda)) save_model(model_dir, model) best_dev_acc = 0.0 best_dev_ed = float("inf") best_epoch = 0 epoch = start_epoch while epoch <= num_epochs: model.train() epoch_loss = 0.0 epoch_t_loss = 0.0 epoch_q_loss = 0.0 epoch_word_len = 0.0 epoch_pred_len = 0.0 model_roll_in_p = 1 - ( roll_in_k / (roll_in_k + np.exp(float(epoch) / roll_in_k))) self.logger.info('Epoch: %d/%d start ... model_roll_in_p: %f' % (epoch, num_epochs, model_roll_in_p)) #for ss, batch in enumerate(train_iter): with tqdm(total=len(train_iter)) as t: for ss, batch in enumerate(train_iter): lang, lemma, lemma_len, word, word_len, feat, pos, m_lemma, _ = batch # Reset optim.zero_grad() optim_q.zero_grad() loss = 0.0 loss_q = 0.0 if cuda: lang = lang.cuda() lemma = lemma.cuda() lemma_len = lemma_len.cuda() word = word.cuda() word_len = word_len.cuda() feat = feat.cuda() pos = pos.cuda() m_lemma = m_lemma.cuda() # Run batch through discriminator model.freeze_transducer() model.clamp_dis() ret = model(lang, lemma, lemma_len, feat, pos, m_lemma, word, word_len, dis_only=True) loss_q = ret['dis_loss'] epoch_q_loss += loss_q.item() loss_q.backward() optim_q.step() # Reset optim.zero_grad() loss = 0.0 # Run batch through transducer model.freeze_discriminator() model.clamp_dis() ret = model(lang, lemma, lemma_len, feat, pos, m_lemma, word, word_len, model_roll_in_p=model_roll_in_p) loss = ret['loss'] - lambda_q * ret['dis_loss'] prediction = ret['prediction'] predicted_acts = 
ret['predicted_acts'] # L2 Regularization l2_reg = None if l2 > 0: for W in model.parameters( ): #@BUG: do not include embeddings if l2_reg is None: l2_reg = W.norm(2) else: l2_reg = l2_reg + W.norm(2) else: l2_reg = 0.0 # update loss loss = loss + l2_reg * l2 epoch_t_loss += ret['loss'].item() epoch_loss += loss.item() # Backpropagation loss.backward() #torch.nn.utils.clip_grad_norm_(transducer.parameters(), clip) optim.step() epoch_word_len += word_len.sum().item() epoch_pred_len += sum([len(p) for p in prediction]) tqdm_log = { 'loss': epoch_loss / (ss + 1), 'loss_q': epoch_q_loss / (ss + 1), 'loss_t': epoch_t_loss / (ss + 1), 'word_len': epoch_word_len / (ss + 1), 'pred_len': epoch_pred_len / (ss + 1) } t.set_postfix(epoch="%d/%d" % (epoch, num_epochs), **tqdm_log) t.update() self.logger.info(' ave train loss: %f' % (epoch_loss / len(train_iter))) dev_scores = test(model, dev_iter, beam_width=beam_width, output_file=None, cuda=cuda, verbose=verbose) #dev_scores = {'acc':0.0, 'ed':0.0} if dev_scores['acc'] > best_dev_acc: best_dev_acc = dev_scores['acc'] best_dev_ed = dev_scores['ed'] best_epoch = epoch num_epochs = max(epoch + patience, num_epochs) self.logger.info( ' current dev acc: %f, ed: %f | highest dev acc: %f, ed: %f @ epoch %d' % (dev_scores['acc'], dev_scores['ed'], best_dev_acc, best_dev_ed, best_epoch)) if best: if epoch == best_epoch: save_model(model_dir, model) else: save_model(model_dir, model) epoch += 1
def train_function(config):
    """Train a multi-agent RL policy (QMIX or IQL) with a replay buffer.

    Seeds all RNGs, dumps the config to ../results/<dir_name>/, runs
    N_train episodes (random actions for the first `pretrain_episodes`),
    trains every `steps_per_train` env steps, evaluates every `period`
    episodes against stock AI, and logs CSV rows plus wall-clock timing.
    """
    config_env = config['env']
    config_main = config['main']
    config_alg = config['alg']

    # reproducibility: seed numpy, python and tensorflow alike
    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    summarize = config_main['summarize']
    save_period = config_main['save_period']

    os.makedirs('../results/%s' % dir_name, exist_ok=True)
    # persist the exact config used for this run
    with open('../results/%s/%s' % (dir_name, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)

    N_train = config_alg['N_train']
    N_eval = config_alg['N_eval']
    period = config_alg['period']
    buffer_size = config_alg['buffer_size']
    batch_size = config_alg['batch_size']
    pretrain_episodes = config_alg['pretrain_episodes']
    steps_per_train = config_alg['steps_per_train']

    # linear epsilon-greedy schedule from start to end over epsilon_div steps
    epsilon_start = config_alg['epsilon_start']
    epsilon_end = config_alg['epsilon_end']
    epsilon_div = config_alg['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end) / float(epsilon_div)
    epsilon = epsilon_start

    env = env_wrapper.Env(config_env, config_main)

    config_env_mod = config_env.copy()
    config_env_mod[
        'self_play'] = False  # test against stock AI during evaluation episodes
    config_env_mod['num_away_ai_players'] = config_env_mod[
        'num_away_players']  # set number of stock AI
    env_eval = env_wrapper.Env(config_env_mod, config_main)

    self_play = config_env['self_play']
    if self_play:
        # self-play means the away side is controlled by us, not stock AI
        assert (config_env['num_away_ai_players'] == 0)

    l_state = env.state_dim
    l_action = env.action_dim
    l_obs = env.obs_dim
    N_home = config_env['num_home_players']

    # NOTE(review): `alg` is unbound for any other alg_name — config is
    # presumably validated upstream.
    if config_main['alg_name'] == 'qmix':
        alg = alg_qmix.Alg(config_alg, N_home, l_state, l_obs, l_action,
                           config['nn_qmix'])
    elif config_main['alg_name'] == 'iql':
        alg = alg_iql.Alg(config_alg, N_home, l_state, l_obs, l_action,
                          config['nn_iql'])

    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    # sync target networks with the online networks before training
    sess.run(alg.list_initialize_target_ops)

    if summarize:
        writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver(max_to_keep=config_main['max_to_keep'])

    buf = replay_buffer.Replay_Buffer(size=buffer_size)

    # Logging
    header = "Episode,Step,Step_train,R_avg,R_eval,Steps_per_eps,Opp_win_rate,Win_rate,T_env,T_alg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)

    t_start = time.time()
    t_env = 0  # cumulative seconds spent inside env.step
    t_alg = 0  # cumulative seconds spent in actor/train computations
    reward_period = 0
    step = 0
    step_train = 0

    for idx_episode in range(1, N_train + 1):
        state_home, state_away, list_obs_home, list_obs_away, done = env.reset(
        )
        reward_episode = 0
        summarized = 0  # at most one summary write per logging episode
        while not done:
            # action selection: random during pretrain, epsilon-greedy after
            if idx_episode < pretrain_episodes:
                if self_play:
                    actions_int_h, actions_int_a = env.random_actions()
                    actions_int = (actions_int_h, actions_int_a)
                else:
                    actions_int = env.random_actions()
            else:
                t_alg_start = time.time()
                if self_play:
                    actions_int_h = alg.run_actor(list_obs_home, epsilon,
                                                  sess)
                    actions_int_a = alg.run_actor(list_obs_away, epsilon,
                                                  sess)
                    actions_int = (actions_int_h, actions_int_a)
                else:
                    actions_int = alg.run_actor(list_obs_home, epsilon, sess)
                t_alg += time.time() - t_alg_start

            t_env_start = time.time()
            state_home_next, state_away_next, list_obs_home_next, list_obs_away_next, reward, local_rewards, done, info = env.step(
                actions_int)
            t_env += time.time() - t_env_start
            step += 1

            # store transitions; under self-play both sides feed the buffer
            if self_play:
                buf.add(
                    np.array([
                        state_home,
                        np.array(list_obs_home), actions_int_h, reward[0],
                        state_home_next,
                        np.array(list_obs_home_next), done
                    ]))
                buf.add(
                    np.array([
                        state_away,
                        np.array(list_obs_away), actions_int_a, reward[1],
                        state_away_next,
                        np.array(list_obs_away_next), done
                    ]))
            else:
                buf.add(
                    np.array([
                        state_home,
                        np.array(list_obs_home), actions_int, reward,
                        state_home_next,
                        np.array(list_obs_home_next), done
                    ]))

            # train once every steps_per_train env steps after pretrain
            if (idx_episode >= pretrain_episodes) and (step % steps_per_train
                                                       == 0):
                batch = buf.sample_batch(batch_size)
                t_alg_start = time.time()
                if summarize and idx_episode % period == 0 and not summarized:
                    alg.train_step(sess,
                                   batch,
                                   step_train,
                                   summarize=True,
                                   writer=writer)
                    summarized = True
                else:
                    alg.train_step(sess,
                                   batch,
                                   step_train,
                                   summarize=False,
                                   writer=None)
                step_train += 1
                t_alg += time.time() - t_alg_start

            state_home = state_home_next
            list_obs_home = list_obs_home_next
            state_away = state_away_next
            list_obs_away = list_obs_away_next
            if self_play:
                reward_episode += reward[0]
            else:
                reward_episode += reward

        # decay exploration once per episode after pretraining
        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step

        reward_period += reward_episode

        # reprint the column headers periodically for console readability
        if idx_episode == 1 or idx_episode % (5 * period) == 0:
            print(
                '{:>10s}{:>10s}{:>12s}{:>8s}{:>8s}{:>15s}{:>15s}{:>10s}{:>12s}{:>12s}'
                .format(*(header.strip().split(','))))

        if idx_episode % period == 0:
            # Evaluation episodes
            r_avg_eval, steps_per_episode, win_rate, win_rate_opponent = evaluate.test(
                N_eval, env_eval, sess, alg)
            # extra checkpoint whenever the policy clears the threshold
            if win_rate >= config_main['save_threshold']:
                saver.save(
                    sess, '../results/%s/%s-%d' %
                    (dir_name, "model_good.ckpt", idx_episode))
            s = '%d,%d,%d,%.2f,%.2f,%d,%.2f,%.2f,%.5e,%.5e\n' % (
                idx_episode, step, step_train,
                reward_period / float(period), r_avg_eval, steps_per_episode,
                win_rate_opponent, win_rate, t_env, t_alg)
            with open('../results/%s/log.csv' % dir_name, 'a') as f:
                f.write(s)
            print(
                '{:10d}{:10d}{:12d}{:8.2f}{:8.2f}{:15d}{:15.2f}{:10.2f}{:12.5e}{:12.5e}\n'
                .format(idx_episode, step, step_train,
                        reward_period / float(period), r_avg_eval,
                        int(steps_per_episode), win_rate_opponent, win_rate,
                        t_env, t_alg))
            reward_period = 0

        if idx_episode % save_period == 0:
            saver.save(
                sess, '../results/%s/%s-%d' %
                (dir_name, "model.ckpt", idx_episode))

    # final checkpoint plus timing summary
    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
    with open('../results/%s/time.txt' % dir_name, 'a') as f:
        f.write('t_env_total,t_env_per_step,t_alg_total,t_alg_per_step\n')
        f.write('%.5e,%.5e,%.5e,%.5e' %
                (t_env, t_env / step, t_alg, t_alg / step))
def train(label_traindata, unlabel_traindata, unlabel_devdata,
          unlabel_testdata, epoches, model, critic, learnig_rate):
    '''
    Domain-adaptation training loop.

    - Extract LSTM features for labelled (source) and unlabelled (target)
      batches.
    - Measure the discrepancy between the two feature distributions
      (Wasserstein + cosine + MMD mixture).
    - Classify the labelled batch and compute its cross-entropy loss.
    - Back-propagate the classifier loss plus the discrepancy penalty,
      and separately train the domain discriminator (`critic`).
    - Mutual learning / ensembling may be added later (currently unused —
      see the commented-out block below).
    :return: None
    '''
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learnig_rate)  # classifier learning rate
    optimizer_critic = torch.optim.Adam(
        critic.parameters(),
        lr=learnig_rate)  # domain discriminator learning rate
    loss_history = []
    # start training
    print("{},Start training".format(
        datetime.now().strftime('%02y/%02m%02d %H:%M:%S')))
    loss = 0
    bestdevacc = 0.0
    besttestacc = 0.0
    for epoch in range(epoches):
        for (batch_labelx, batch_labely), (batch_unlabelx,
                                           batch_unlabely) in zip(
                                               label_traindata,
                                               unlabel_traindata):
            # print("feature ok")
            '''
            # deep mutual learning
            logits1, logits2, distance = model(batch_labelx, batch_unlabelx, mode)
            # 将 logits 从张量转换成 list 的形式
            logits_array1 = logits1.cpu().detach().numpy()
            logits_array2 = logits2.cpu().detach().numpy()
            kl = computewa(logits_array1, logits_array2)
            # 模型转换成 深度互学习的表现形式
            # 损失函数由 两个分类器的交叉熵损失函数 特征提取器的分布差异 分类器之间的kl散度 组成
            loss = criterion(logits1, batch_labely-1) + criterion(logits2, batch_labely-1) + 0.6*distance + 0.1*kl
            '''
            # domain labels for the discriminator: source=0, target=1
            batch_domains = torch.zeros_like(batch_labely)
            batch_domaint = torch.ones_like(batch_unlabely)
            feature_label, attnweight = model.feature_extractor(batch_labelx)
            feature_unlabel, _ = model.feature_extractor(batch_unlabelx)
            length = compare(feature_label, feature_unlabel)
            # weighted mixture of distribution distances between the two
            # feature sets (earth-mover / cosine / MMD)
            discrepancy = 0.5 * wasserstein(
                feature_label, feature_unlabel, length) + 0.2 * cosine(
                    feature_label, feature_unlabel, length) + 0.3 * mmd(
                        feature_label[:length], feature_unlabel[:length])
            logits_critics, logits_critict = critic(feature_label,
                                                    feature_unlabel)
            loss_critic = criterion(logits_critics,
                                    batch_domains) + criterion(
                                        logits_critict, batch_domaint)
            # logits_class, logits_domain, distance = model(feature_label, feature_unlabel, mode)  # distance is discrepancy between source and target
            logits_class = model(feature_label)  # sentiment classifier output
            loss_class = criterion(logits_class,
                                   batch_labely - 1)  # loss for classifier
            # attention weights act as an L2 penalty term
            loss_classes = loss_class + 0.8 * discrepancy + 0.1 * torch.norm(
                attnweight, p=2)
            optimizer.zero_grad()
            # retain_graph: the critic backward below reuses these features
            loss_classes.backward(retain_graph=True)
            # print("compute loss")
            # gradient clipping
            torch.nn.utils.clip_grad_norm(model.parameters(), 0.15)
            optimizer.step()
            optimizer_critic.zero_grad()
            loss_critic.backward()
            optimizer_critic.step()
            total_loss = loss_critic + loss_classes
            print('Epoch:', '%03d' % (epoch + 1), 'cost =',
                  '{:.6f}'.format(total_loss))
        # NOTE(review): `loss` is still the initial 0 here — presumably
        # `total_loss` was meant; confirm before relying on loss_history.
        loss_history.append(loss)
        print("test start")
        devacc = test(unlabel_devdata, model)
        testacc = test(unlabel_testdata, model)
        # NOTE(review): bestdevacc/besttestacc are never updated, so these
        # checkpoints are rewritten whenever accuracy > 0 — confirm intent.
        if devacc > bestdevacc:
            torch.save(model, 'bestdev.pth')
        if testacc > besttestacc:
            torch.save(model, 'besttest.pth')
def train():
    """Train the hourglass lane-detection model with the SGPN-style loss.

    Per batch: build point/instance ground truth, compute confidence,
    offset and instance-embedding losses over every hourglass output,
    back-propagate, and track the running loss. Periodically saves the
    best weights, renders a test image, and runs TuSimple-style evaluation.
    """
    print("Training...")
    print("Initializing hyperparameters...")
    # vis = visdom.Visdom()
    # loss_window = vis.line(X=torch.zeros((1,)).cpu(), Y=torch.zeros((1,)).cpu(),
    #                        opts=dict(xlabel='epoch', ylabel='Loss', title='Training Loss', legend=['Loss']))
    # Get dataset
    print("Get Dataset...")
    data_loader = Generator()
    model = HourglassModel()
    model = model.cuda(opt.cuda_devices)
    criterion = SGPNLoss()
    l_rate = opt.lr
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=l_rate,
                                 weight_decay=opt.weight_decay)
    step = 0
    best_loss = float('inf')
    best_model_params = copy.deepcopy(model.state_dict())
    loss_list = []
    record = open('record.txt', 'w')
    # loss-weighting constants; K1 is the margin for the instance
    # discrimination term (K2 appears unused here)
    feature_size = 4
    K1 = 1.0
    K2 = 2.0
    constant_exist = 1.0
    constant_nonexist = 1.0
    constant_offset = 1.0
    constant_alpha = 1.0
    constant_beta = 1.0
    constant_lane_loss = 1.0
    constant_instance_loss = 1.0
    current_epoch = 0
    print("Training loop...")
    for epoch in range(opt.epochs):
        print(f'Epoch: {epoch+1}/{opt.epochs}')
        print('-' * len(f'Epoch: {epoch+1}/{opt.epochs}'))
        model.train()
        point_loss = 0.0
        iteration = 0
        for inputs, target_lanes, target_h, test_image in tqdm.tqdm(
                data_loader.Generate()):
            real_batch_size = len(target_lanes)
            iteration += 1
            # print(real_batch_size)
            # generate ground truth
            ground_truth_point, ground_binary = make_ground_truth_point(
                target_lanes, target_h)
            ground_truth_instance = make_ground_truth_instance(
                target_lanes, target_h)
            # convert numpy array to torch tensor (targets need no grads)
            ground_truth_point = torch.from_numpy(ground_truth_point).float()
            ground_truth_point = Variable(ground_truth_point).cuda(
                opt.cuda_devices)
            ground_truth_point.requires_grad = False
            ground_binary = torch.LongTensor(ground_binary.tolist()).cuda(
                opt.cuda_devices)
            ground_binary.requires_grad = False
            ground_truth_instance = torch.from_numpy(
                ground_truth_instance).float()
            ground_truth_instance = Variable(ground_truth_instance).cuda(
                opt.cuda_devices)
            ground_truth_instance.requires_grad = False
            inputs = torch.from_numpy(inputs).float()
            inputs = Variable(inputs.cuda(opt.cuda_devices))
            result = model(inputs)
            lane_detection_loss = 0.0
            # accumulate the loss over every hourglass stage's output
            for (confidance, offset, feature) in result:
                #compute loss for point prediction
                offset_loss = 0
                exist_condidence_loss = 0
                nonexist_confidence_loss = 0
                #exist confidance loss
                confidance_gt = ground_truth_point[:, 0, :, :]
                confidance_gt = confidance_gt.view(real_batch_size, 1,
                                                   opt.grid_y, opt.grid_x)
                exist_condidence_loss = torch.sum(
                    (confidance_gt[confidance_gt == 1] -
                     confidance[confidance_gt == 1])**
                    2) / torch.sum(confidance_gt == 1)
                #non exist confidance loss
                nonexist_confidence_loss = torch.sum(
                    (confidance_gt[confidance_gt == 0] -
                     confidance[confidance_gt == 0])**
                    2) / torch.sum(confidance_gt == 0)
                #offset loss — only penalized at cells that contain a point
                offset_x_gt = ground_truth_point[:, 1:2, :, :]
                offset_y_gt = ground_truth_point[:, 2:3, :, :]
                predict_x = offset[:, 0:1, :, :]
                predict_y = offset[:, 1:2, :, :]
                x_offset_loss = torch.sum(
                    (offset_x_gt[confidance_gt == 1] -
                     predict_x[confidance_gt == 1])**
                    2) / torch.sum(confidance_gt == 1)
                y_offset_loss = torch.sum(
                    (offset_y_gt[confidance_gt == 1] -
                     predict_y[confidance_gt == 1])**
                    2) / torch.sum(confidance_gt == 1)
                offset_loss = (x_offset_loss + y_offset_loss) / 2
                #compute loss for similarity
                sisc_loss = 0
                disc_loss = 0
                # build pairwise feature-distance map between all grid cells
                feature_map = feature.view(real_batch_size, feature_size, 1,
                                           opt.grid_y * opt.grid_x)
                feature_map = feature_map.expand(
                    real_batch_size, feature_size, opt.grid_y * opt.grid_x,
                    opt.grid_y * opt.grid_x).detach()
                point_feature = feature.view(real_batch_size, feature_size,
                                             opt.grid_y * opt.grid_x, 1)
                point_feature = point_feature.expand(
                    real_batch_size, feature_size, opt.grid_y * opt.grid_x,
                    opt.grid_y * opt.grid_x)  #.detach()
                distance_map = (feature_map - point_feature)**2
                distance_map = torch.norm(distance_map,
                                          dim=1).view(real_batch_size, 1,
                                                      opt.grid_y * opt.grid_x,
                                                      opt.grid_y * opt.grid_x)
                # same instance: pull embeddings together
                sisc_loss = torch.sum(
                    distance_map[ground_truth_instance == 1]) / torch.sum(
                        ground_truth_instance == 1)
                # different instance, same class: hinge push-apart with margin K1
                disc_loss = K1 - distance_map[
                    ground_truth_instance ==
                    2]  #self.p.K1/distance_map[ground_truth_instance==2] + (self.p.K1-distance_map[ground_truth_instance==2])
                disc_loss[disc_loss < 0] = 0
                disc_loss = torch.sum(disc_loss) / torch.sum(
                    ground_truth_instance == 2)
                print(
                    "seg loss################################################################"
                )
                print(sisc_loss)
                print(disc_loss)
                print("point loss")
                print(exist_condidence_loss)
                print(nonexist_confidence_loss)
                print(offset_loss)
                print("lane loss")
                lane_loss = constant_exist * exist_condidence_loss + constant_nonexist * nonexist_confidence_loss + constant_offset * offset_loss
                print(lane_loss)
                print("instance loss")
                instance_loss = constant_alpha * sisc_loss + constant_beta * disc_loss
                print(instance_loss)
                lane_detection_loss = lane_detection_loss + constant_lane_loss * lane_loss + constant_instance_loss * instance_loss
            optimizer.zero_grad()
            lane_detection_loss.backward()
            optimizer.step()
            # free per-batch tensors promptly to keep GPU memory bounded
            del confidance, offset, feature
            del ground_truth_point, ground_binary, ground_truth_instance
            del feature_map, point_feature, distance_map
            del exist_condidence_loss, nonexist_confidence_loss, offset_loss, sisc_loss, disc_loss, lane_loss, instance_loss
            # once per 50th epoch: optionally reweight losses and halve the
            # learning rate (the inner guard limits it to epoch 1000)
            if epoch > 0 and epoch % 50 == 0 and current_epoch != epoch:
                current_epoch = epoch
                if epoch > 0 and (epoch == 1000):
                    constant_lane_loss += 0.5
                    constant_nonexist += 0.5
                    l_rate /= 2.0
                    optimizer = torch.optim.Adam(
                        model.parameters(),
                        lr=l_rate,
                        weight_decay=opt.weight_decay)
            point_loss += lane_detection_loss.item() * inputs.size(0)
            # render a sample prediction every 1000 global steps
            if step % 1000 == 0:
                testing(model, test_image, step, point_loss)
            step += 1
        training_loss = point_loss / (real_batch_size * iteration)
        loss_list.append(training_loss)
        print(f'training_loss: {training_loss:.4f}\n')
        if training_loss < best_loss:
            best_loss = training_loss
            best_model_params = copy.deepcopy(model.state_dict())
        # checkpoint the best weights every 50 epochs
        if (epoch + 1) % 50 == 0:
            model.load_state_dict(best_model_params)
            weight_path = Path(opt.save_path).joinpath('weights').joinpath(
                f'model-{epoch+1}epoch-{best_loss:.02f}-best_train_loss.pth')
            torch.save(model, str(weight_path))
            record.write(f'{epoch+1}\n')
            record.write(f'Best training loss: {best_loss:.4f}\n\n')
        # full evaluation pass every 10 epochs over several thresholds
        if epoch > 0 and epoch % 10 == 0:
            print("evaluaton...")
            model.eval()
            th_list = [0.3, 0.5, 0.7]
            for th in th_list:
                print("generate result")
                print(th)
                name = "test_result_" + str(epoch) + "_" + str(th) + ".json"
                save_path = Path(
                    opt.save_path).joinpath('json_test_result').joinpath(name)
                result_data = copy.deepcopy(data_loader.test_data)
                for test_image, target_h, ratio_w, ratio_h, testset_index in data_loader.Generate_Test(
                ):
                    x, y, _ = test(model, np.array([test_image]), thresh=0.81)
                    x, y = convert_to_original_size(x[0], y[0], ratio_w,
                                                    ratio_h)
                    x, y = find_target(x, y, target_h, ratio_w, ratio_h)
                    result_data = write_result_json(result_data, x, y,
                                                    testset_index)
                # NOTE(review): `name` is always a str here, so the else
                # branch looks unreachable — confirm intent.
                if name != None:
                    save_result(result_data, str(save_path))
                else:
                    save_result(result_data, name)
            for th in th_list:
                print("compute score")
                print(th)
                txt_file = Path(opt.save_path).joinpath(
                    'txt_eval_result').joinpath("eval_result_" + str(th) +
                                                "_.txt")
                with open(str(txt_file), 'a') as make_file:
                    make_file.write("epoch : " + str(epoch) + " loss : " +
                                    str(training_loss))
                    test_result_path = Path(
                        opt.save_path).joinpath('json_test_result').joinpath(
                            "test_result_" + str(epoch) + "_" + str(th) +
                            ".json")
                    make_file.write(
                        LaneEval.bench_one_submit(str(test_result_path),
                                                  "test_label.json"))
                    make_file.write("\n")
    loss_list = np.round(loss_list, 4)
    plt_loss(loss_list)
'batch_size': 128, 'num_epochs': 100, 'num_repeats': 1, 'k': 5, # only will be used in CV 'classifier_build': 'parallel', 'kernel_width': [4, 5, 6, 8, 10], # put 6 'sequence_length': sequence_length } clear_session() if cv: classifiers, histories, roc_auc_scores = te.cv_run(X, y, run_dict) else: classifiers, histories, roc_auc_scores = te.run(X, y, run_dict) else: import evaluate as ev classifier, sequence_length = u.load_model(model_path) df = pp.prepare_input(inputs, output_folder, False, usecols=columns, do_type=do_type) df, X, y = pp.process_dataset_for_test(df, sequence_length) df_out, roc_auc_scores = ev.test(classifier, df, X, y, output_folder) print(roc_auc_scores)
def train(self, cnn, train_iter, val_iter, text_field, label_field,
          num_epochs=10, clip=0.5, reg_lambda=0.0, cuda=False, best=True,
          model_dir='../model/', log_dir='./logs', verbose=False):
    """Train a text-classification CNN with L2 regularization.

    Runs `num_epochs` over `train_iter`, clipping gradients to `clip`,
    evaluates on `val_iter` each epoch and saves the weights to
    ``<model_dir>/cnn.pkl`` — only on validation-accuracy improvements
    when `best` is True, otherwise every epoch.
    """
    # Zero gradients of both optimizers
    optim = self.optim(cnn.parameters(), **self.optim_args)
    #self.tf_logger = Logger(log_dir)
    self._reset_histories()
    if cuda:
        cnn.cuda()
    self.logger.info('START TRAIN')
    self.logger.info('CUDA = ' + str(cuda))
    # save an initial checkpoint so the file always exists
    torch.save(cnn.state_dict(), os.path.join(model_dir, 'cnn.pkl'))
    best_val_acc = 0.0
    best_epoch = 0
    criterion = nn.CrossEntropyLoss()
    if cuda:
        criterion.cuda()
    ss = 0  # global iteration counter across epochs
    for epoch in range(num_epochs):
        self.logger.info('Epoch: %d start ...' % (epoch + 1))
        cnn.train()
        for batch in train_iter:
            ss += 1
            input, target = batch.text, batch.label  # input: len x N; target: N
            # Reset
            optim.zero_grad()
            loss = 0
            # Setup data: in-place transpose to batch-first
            input.data.t_()  # N x len
            if cuda:
                input = input.cuda()
                target = target.cuda()
            # Run words through cnn
            scores = cnn(input)
            # L2 penalty summed over all parameters
            l2_reg = None
            for W in cnn.parameters():
                if l2_reg is None:
                    l2_reg = W.norm(2)
                else:
                    l2_reg = l2_reg + W.norm(2)
            loss = criterion(scores, target) + l2_reg * reg_lambda
            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(cnn.parameters(), clip)
            optim.step()
            '''
            info = {
                'Loss': loss.data[0],
            }
            for tag, value in info.items():
                self.tf_logger.scalar_summary(tag, value, ss)
            '''
            if verbose:
                self.logger.info('Epoch: %d, Iteration: %d, loss: %f' %
                                 (epoch + 1, ss, loss.item()))
        # per-epoch validation accuracy
        val_acc = test(cnn,
                       val_iter,
                       text_field,
                       label_field,
                       cuda=cuda,
                       verbose=verbose)
        '''
        info = {
            'val_acc': val_acc
        }
        for tag, value in info.items():
            self.tf_logger.scalar_summary(tag, value, epoch)
        '''
        if best:
            # keep only the best-so-far checkpoint
            if val_acc > best_val_acc:
                torch.save(cnn.state_dict(),
                           os.path.join(model_dir, 'cnn.pkl'))
                best_val_acc = val_acc
        else:
            # overwrite the checkpoint every epoch
            torch.save(cnn.state_dict(), os.path.join(model_dir, 'cnn.pkl'))
def train(hyp, opt, device, tb_writer=None):
    """YOLOv5-style training loop.

    Builds the model (from a pretrained .pt checkpoint or from opt.cfg), sets up a
    three-group SGD optimizer with cosine LR schedule, optional DP/DDP/SyncBN/EMA,
    runs the epoch/batch loop with AMP and warmup, evaluates mAP per epoch, and
    checkpoints last/best weights under the log directory.

    Args:
        hyp (dict): hyperparameters (lr0, lrf, momentum, weight_decay, warmup_*, ...); mutated in place.
        opt: argparse-style options (epochs, batch_size, weights, global_rank, ...).
        device (torch.device): target device; cuda enabled when device.type != 'cpu'.
        tb_writer: optional TensorBoard SummaryWriter.

    Returns:
        tuple: the last `results` from evaluation — P, R, mAP, F1 and validation losses.
    """
    logger.info(f'Hyperparameters {hyp}')
    # NOTE(review): this print dereferences tb_writer before the None-check below —
    # it will raise AttributeError when tb_writer is None, defeating the 'evolve' fallback.
    print("tb_writer.log_dir: ", tb_writer.log_dir)
    # when resuming, state can be recovered from here
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    print("log_dir: ", log_dir)
    print("wdir: ", wdir)
    print("last: ", last)
    print("best: ", best)
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
    print("epochs: ", epochs)
    print("batch_size: ", batch_size)
    print("total_batch_size: ", total_batch_size)
    print("weights: ", weights)
    print("rank: ", rank)

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    print("cuda: ", cuda)
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    print("data_dict: ", data_dict)
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    print("pretrained: ", pretrained)
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        # ckpt keys: ['epoch'], ['best_fitness'], ['training_results'], ['model'], ['optimizer']
        """ckpt中,['epoch'], ['best_fitness'], ['training_results'], ['model'], ['optimizer']"""
        # print("ckpt: ", ckpt)
        print("ckpt: ['epoch'], ['best_fitness'], ['training_results'], ['optimizer']: ", ckpt['epoch'], ckpt['best_fitness'], ckpt['training_results'], ckpt['optimizer'])
        # print("ckpt['model']: ", ckpt['model'])
        # print("ckpt['model'].model: ", ckpt['model'].model)
        # print("ckpt['model'].state_dict(): ", ckpt['model'].state_dict())
        print("ckpt['model'].save: ", ckpt['model'].save)
        print("ckpt['model'].yaml: ", ckpt['model'].yaml)
        print("hyp.get('anchors'): ", hyp.get('anchors'))
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        print("opt.cfg: ", opt.cfg)
        # create; when both are truthy, `or` returns its left operand, so opt.cfg
        # (e.g. yolov5s.yaml, yolov5x.yaml, ...) takes precedence over ckpt['model'].yaml
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)
        # exclude keys: anchors from opt.cfg take precedence, then those from hyp
        exclude = (['anchor'] if opt.cfg or hyp.get('anchors') else [])
        print("exclude: ", exclude)
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        # print("state_dict: ", state_dict)
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['', ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    # Gradients are accumulated for (nbs / total_batch_size) batches before each
    # optimizer step, effectively enlarging the batch size.
    """当模型梯度累积了(nbs/total_batch_size)次之后,再更新一次模型参数,变相的扩大了batch_size"""
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    #optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Learning-rate decay: cosine annealing from lr0 down to lr0*lrf.
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # each epoch gets its own learning rate
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
            shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}')  # save previous weights
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    # Verify that train/test image sizes are multiples of the grid size (gs, typically 32).
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples
    print("imgsz: ", imgsz)
    print("imgsz_test: ", imgsz_test)

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        print("DP mode...............................................................................")
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average (only maintained on the main process)
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers)
    print("dataloader: ", dataloader)
    print("dataset: ", dataset)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)
    nb = len(dataloader)  # number of batches
    print("nb: ", nb)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        print("ema.updates: ", ema.updates)
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names
    #print("model: ", model)

    # Start training
    t0 = time.time()
    n_warmup = max(round(hyp['warmup_epochs'] * nb), 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # n_warmup = min(n_warmup, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    print("n_warmup: ", n_warmup)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)  # mixed-precision gradient scaler
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    logger.info('Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps)**2  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup: linearly ramp accumulate, per-group lr and momentum over the first n_warmup iterations.
            if ni <= n_warmup:
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, [0, n_warmup], [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, [0, n_warmup], [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, [0, n_warmup], [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                # NOTE(review): random.randrange with float arguments raises TypeError on
                # recent Python versions — likely intended random.randrange(int(...), int(...)).
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):  # automatic mixed precision
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            # Step only every `accumulate` batches; during warmup `accumulate` grows from 1
            # toward nbs/total_batch_size.
            """每accumulate个batch时更新一次, 在n_warmup之内时,accumulate从1逐渐增大到4"""
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if final_epoch:  # replot predictions
                    [os.remove(x) for x in glob.glob(str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x)]
                results, maps, times = evaluate.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                        'x/lr0', 'x/lr1', 'x/lr2']  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
def main(
    model=LeNetFC,
    dataset='mnist',
    batch_size=64,
    train_size=None,
    test_batch_size=1000,
    epochs=3,
    lr=1.0,
    gamma=0.7,
    no_cuda=False,
    rand_seed=42,
    save_model=False,
):
    """
    Train and evaluate ``model`` on ``dataset``, then run iterative pruning.

    Args:
        model (torch.nn.Module): model to train and prune
        dataset (str): 'mnist' or 'cifar10'
        batch_size (int): size of training mini-batch
        train_size (int): size of train set; only used for 'cifar10'
        test_batch_size (int): size of testing batch
        epochs (int): num epochs
        lr (float): learning rate
        gamma (float): rate at which to adjust lr with scheduler
        no_cuda (bool): disable CUDA even if available
        rand_seed (int): random seed
        save_model (bool): whether to save pytorch model
    """
    # Show the architecture being trained.
    print(model)

    # Build the data loaders for the requested dataset.
    if dataset == 'mnist':
        loaders = load_mnist(batch_size, test_batch_size, no_cuda, rand_seed)
    elif dataset == 'cifar10':
        loaders = load_cifar10(batch_size, train_size, test_batch_size, no_cuda, rand_seed)
    train_loader, val_loader, test_loader, use_cuda = loaders
    print(len(train_loader.dataset))

    # Device, model placement, optimizer, and per-epoch LR decay.
    device = torch.device('cuda' if use_cuda else 'cpu')
    print('device:', device)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    # Training loop; train() signals early stopping via (stop, stopping_iteration).
    for epoch in range(1, epochs + 1):
        stop, stopping_iteration = train(model, device, train_loader, val_loader, test_loader, optimizer, epoch)
        scheduler.step()
        # Evaluate on the test set after each epoch.
        test(model, device, test_loader)
        if stop:
            iters_per_epoch = len(train_loader.dataset) / batch_size
            overall_iteration = stopping_iteration + (iters_per_epoch * (epoch - 1))
            print('Stopped at overall iteration {}\n'.format(overall_iteration))
            break

    if save_model:
        checkpoint_name = model.__class__.__name__ + '_' + dataset + ".pt"
        torch.save(model.state_dict(), checkpoint_name)

    print('\nPruning...\n')
    pruner = PruneModel(model, batch_size, train_loader, val_loader, test_loader,
                        optimizer, epochs, scheduler, device, pruning_rounds=7)
    pruner.prune()
def train(modelWrapper, data, hyp, opt, device):
    """Simplified single-process YOLOv5-style training loop.

    Resumes from the checkpoint held by ``modelWrapper.config['ckpt']``, trains with
    SGD + cosine LR schedule, AMP and EMA, evaluates each epoch, and writes
    last/best checkpoints under ``opt.modelPath``.

    Args:
        modelWrapper: object exposing .model (the network) and .config['ckpt'] (resume checkpoint).
        data: unused in this function body — TODO confirm whether it can be dropped.
        hyp (dict): hyperparameters; mutated in place (weight_decay, cls scaling).
        opt: options namespace (modelPath, epochs, batch_size, data, img_size, ...).
        device (torch.device): target device.

    Returns:
        int: the training image size actually used (imgsz).
    """
    model = modelWrapper.model
    ckpt = modelWrapper.config['ckpt']
    logger.info(f'Hyperparameters {hyp}')
    log_dir = opt.modelPath
    wdir = log_dir + '/weights'
    os.makedirs(wdir, exist_ok=True)
    last = wdir + '/last.pt'
    best = wdir + '/best.pt'
    results_file = log_dir + '/results.txt'
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings for reproducibility.
    with open(log_dir + '/hyp-train.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir + '/opt-train.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (int(data_dict['nc']), data_dict['names'])
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)

    # Optimizer: three parameter groups (biases / decayed weights / everything else).
    nbs = 64  # nominal batch size for gradient accumulation
    accumulate = max(round(nbs / total_batch_size), 1)
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs
    pg0, pg1, pg2 = [], [], []
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)
        else:
            pg0.append(v)
    optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})
    optimizer.add_param_group({'params': pg2})
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    start_epoch, best_fitness = 0, 0.0

    # Optimizer (resume state)
    if ckpt['optimizer'] is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
        best_fitness = ckpt['best_fitness']

    # Results
    if ckpt.get('training_results') is not None:
        with open(results_file, 'w') as file:
            file.write(ckpt['training_results'])

    # Epochs
    start_epoch = ckpt['epoch'] + 1
    if epochs < start_epoch:
        logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs))
        epochs += ckpt['epoch']
    del ckpt

    # Image sizes must be multiples of the model's max stride.
    gs = int(max(model.stride))
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]

    # Exponential moving average
    ema = ModelEMA(model)

    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()
    nb = len(dataloader)
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)
    ema.updates = start_epoch * nb // accumulate
    labels = np.concatenate(dataset.labels, 0)
    c = torch.tensor(labels[:, 0])
    plot_labels(labels, save_dir=log_dir)
    check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc
    model.hyp = hyp
    model.gr = 1.0
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1e3)  # warmup iterations, at least 1k
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@0.5, mAP@0.5:0.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\n'
                'Using %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
    for epoch in range(start_epoch, epochs):
        logger.info('Epoch: ' + str(epoch))
        model.train()
        mloss = torch.zeros(4, device=device)  # mean losses
        pbar = enumerate(dataloader)
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup: ramp accumulate, per-group lr and momentum over the first nw iterations.
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize: step only every `accumulate` batches.
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])

            # Plot
            if ni < 3:
                # NOTE(review): 'log_dir/...' is a literal string, not the log_dir variable —
                # these images land in a directory literally named "log_dir". Likely a bug.
                f = str(('log_dir/train_batch%g.jpg' % ni))  # filename
                result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
            # end batch ------------------------------------------------------------------------------------------------
        logger.info(s)

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        # mAP
        if ema:
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
        final_epoch = epoch + 1 == epochs
        # NOTE(review): evaluation runs on `dataloader` (the training loader), not a
        # loader built from test_path — metrics are measured on training data. Verify intent.
        results, maps, times = evaluate.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=dataloader, save_dir=log_dir, plots=epoch == 0 or final_epoch)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP@0.5, mAP@0.5:0.95, val_loss(box, obj, cls)

        # Update best mAP
        fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@0.5, mAP@0.5:0.95]
        if fi > best_fitness:
            best_fitness = fi
            logger.info('Current Best Map: ' + str(fi))

        # Save model
        with open(results_file, 'r') as f:  # create checkpoint
            ckpt = {'epoch': epoch,
                    'best_fitness': best_fitness,
                    'training_results': f.read(),
                    'model': ema.ema,
                    'optimizer': None if final_epoch else optimizer.state_dict()}

        # Save last, best and delete
        torch.save(ckpt, last)
        if best_fitness == fi:
            torch.save(ckpt, best)
        del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    return imgsz
def test_function(config):
    """Restore a trained multi-agent policy from its checkpoint and run evaluation.

    Reads the 'env', 'main', 'alg' and 'h_params' sections of ``config``, seeds all
    RNG sources, builds the environment and the configured algorithm, restores TF
    variables from ../results/<dir_name>/<model_name>, and runs the matching
    evaluation routine (flat or hierarchical).
    """
    config_env = config['env']
    config_main = config['main']
    config_alg = config['alg']
    config_h = config['h_params']

    # Seed every RNG source so evaluation is reproducible.
    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    alg_name = config_main['alg_name']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    summarize = False
    N_test = config_main['N_test']
    measure = config_main['measure']
    test_filename = config_main['test_filename']
    N_roles = config_h['N_roles']
    steps_per_assign = config_h['steps_per_assign']

    # Environment in test mode; its dimensions parameterize the algorithm.
    env = env_wrapper.Env(config_env, config_main, test=True, N_roles=N_roles)
    l_state = env.state_dim
    l_action = env.action_dim
    l_obs = env.obs_dim
    N_home = config_env['num_home_players']

    # Instantiate the algorithm matching the configured name.
    if alg_name == 'qmix':
        alg = alg_qmix.Alg(config_alg, N_home, l_state, l_obs, l_action,
                           config['nn_qmix'])
    elif alg_name in ('hsd-scripted', 'mara-c'):
        alg = alg_hsd_scripted.Alg(alg_name, config_alg, N_home, l_state, l_obs,
                                   l_action, N_roles, config['nn_hsd_scripted'])
    elif alg_name == 'iql':
        alg = alg_iql.Alg(config_alg, N_home, l_state, l_obs, l_action,
                          config['nn_iql'])
    elif alg_name == 'hsd':
        alg = alg_hsd.Alg(config_alg, config_h, N_home, l_state, l_obs, l_action,
                          N_roles, config['nn_hsd'])

    # TF session with on-demand GPU memory growth, then restore the checkpoint.
    saver = tf.train.Saver()
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, '../results/%s/%s' % (dir_name, model_name))

    # Flat algorithms use evaluate.test; hierarchical ones use evaluate.test_hierarchy.
    if alg_name in ('qmix', 'iql'):
        result = evaluate.test(N_test, env, sess, alg, dir_name=dir_name,
                               log=True, test_filename=test_filename)
        r_avg_eval, steps_per_episode, win_rate_home, win_rate_away = result
    elif alg_name in ('hsd-scripted', 'mara-c', 'hsd'):
        result = evaluate.test_hierarchy(alg_name, N_test, env, sess, alg,
                                         steps_per_assign, dir_name=dir_name,
                                         log=True, measure=measure,
                                         test_filename=test_filename,
                                         fixed_idx_skill=config_h['fixed_idx_skill'])
        r_avg_eval, steps_per_episode, win_rate, win_rate_opponent = result
def train(opt, isbody=False):
    """Train the medical-extraction model with early stopping, then test the best checkpoint.

    Builds train/dev/test datasets, optionally resumes from ``opt.pretrain_model``,
    trains with AdamW + linear warmup schedule and masked BCE-with-logits loss,
    keeps the best checkpoint by validation loss (EarlyStopping), periodically saves
    epoch checkpoints, and finally reloads the best checkpoint and evaluates on test.

    Args:
        opt: options namespace (train_data, dev_data, test_data, lr, epochs,
            batch_size, dev_batch_size, num_worker, pretrain_model, model_name,
            checkpoint_dir, patience, save_model_freq, threshold, num_warmup_steps).
        isbody (bool): if True, train the body-only model on body_* inputs;
            otherwise train the joint subject/decorate/frequency model.
    """
    train_ds = MedicalExtractionDataset(opt.train_data)
    dev_ds = MedicalExtractionDataset(opt.dev_data)
    test_ds = MedicalExtractionDataset(opt.test_data)

    dev_dl = DataLoader(dev_ds, batch_size=opt.dev_batch_size, shuffle=False, num_workers=opt.num_worker)
    test_dl = DataLoader(test_ds, batch_size=opt.dev_batch_size, shuffle=False, num_workers=opt.num_worker)

    if isbody:
        logging('training for body')
        model = MedicalExtractionModelForBody(opt)
    else:
        logging('training for subject, decorate and body')
        model = MedicalExtractionModel(opt)
    # print(model.parameters)
    print_params(model)

    start_epoch = 1
    learning_rate = opt.lr
    total_epochs = opt.epochs
    pretrain_model = opt.pretrain_model
    model_name = opt.model_name  # base name for saved checkpoints

    # load pretrained model (resume; only for the joint model, not the body model)
    if pretrain_model != '' and not isbody:
        chkpt = torch.load(pretrain_model, map_location=torch.device('cpu'))
        model.load_state_dict(chkpt['checkpoints'])
        logging('load model from {}'.format(pretrain_model))
        start_epoch = chkpt['epoch'] + 1
        learning_rate = chkpt['learning_rate']
        logging('resume from epoch {} with learning_rate {}'.format(start_epoch, learning_rate))
    else:
        logging('training from scratch with learning_rate {}'.format(learning_rate))

    model = get_cuda(model)

    # Linear-warmup schedule length is the total number of optimizer steps.
    num_train_steps = int(len(train_ds) / opt.batch_size * opt.epochs)
    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters (standard BERT-style setup).
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = optim.AdamW(optimizer_parameters, lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=opt.num_warmup_steps, num_training_steps=num_train_steps)
    threshold = opt.threshold
    # reduction='none' so the loss can be masked per token before averaging.
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    checkpoint_dir = opt.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    es = EarlyStopping(patience=opt.patience, mode="min", criterion='val loss')
    for epoch in range(start_epoch, total_epochs + 1):
        train_loss = 0.0
        model.train()
        # Re-created each epoch so shuffling differs per epoch.
        train_dl = DataLoader(train_ds, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_worker)
        tk_train = tqdm(train_dl, total=len(train_dl))
        for batch in tk_train:
            optimizer.zero_grad()
            subject_target_ids = batch['subject_target_ids']
            decorate_target_ids = batch['decorate_target_ids']
            freq_target_ids = batch['freq_target_ids']
            body_target_ids = batch['body_target_ids']
            mask = batch['mask'].float().unsqueeze(-1)
            body_mask = batch['body_mask'].unsqueeze(-1)
            loss = None
            if isbody:
                # Body head only: masked mean of BCE over body tokens.
                body_logits = model(input_ids=batch['body_input_ids'], attention_mask=batch['body_mask'], token_type_ids=batch['body_token_type_ids'])
                loss = torch.sum(criterion(body_logits, body_target_ids) * body_mask) / torch.sum(body_mask)
            else:
                # Joint heads: summed BCE over subject/decorate/frequency, masked mean.
                subject_logits, decorate_logits, freq_logits = model(input_ids=batch['input_ids'], attention_mask=batch['mask'], token_type_ids=batch['token_type_ids'])
                loss = torch.sum((criterion(subject_logits, subject_target_ids) +
                                  criterion(decorate_logits, decorate_target_ids) +
                                  criterion(freq_logits, freq_target_ids)) * mask) / torch.sum(mask)
            loss.backward()
            optimizer.step()
            scheduler.step()
            tk_train.set_postfix(train_loss='{:5.3f} / 1000'.format(1000 * loss.item()), epoch='{:2d}'.format(epoch))
            train_loss += loss.item() * subject_target_ids.shape[0]
        avg_train_loss = train_loss * 1000 / len(train_ds)
        print('train loss per example: {:5.3f} / 1000'.format(avg_train_loss))

        avg_val_loss = test(model, dev_ds, dev_dl, criterion, threshold, 'val', isbody=isbody)

        # Keep the best model (by validation loss) for final evaluation.
        if isbody:
            save_model_path = os.path.join(checkpoint_dir, model_name + '_body_best.pt')
        else:
            save_model_path = os.path.join(checkpoint_dir, model_name + '_best.pt')
        es(avg_val_loss, model, model_path=save_model_path, epoch=epoch, learning_rate=learning_rate)
        if es.early_stop:
            print("Early stopping")
            break

        # Save a per-epoch checkpoint to allow resuming from an interruption.
        if epoch % opt.save_model_freq == 0:
            if isbody:
                save_model_path = os.path.join(checkpoint_dir, model_name + '_body_{}.pt'.format(epoch))
            else:
                save_model_path = os.path.join(checkpoint_dir, model_name + '_{}.pt'.format(epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'learning_rate': learning_rate,
                    'checkpoints': model.state_dict()
                }, save_model_path)

    # load best model and test
    if isbody:
        best_model_path = os.path.join(checkpoint_dir, model_name + '_body_best.pt')
    else:
        best_model_path = os.path.join(checkpoint_dir, model_name + '_best.pt')
    chkpt = torch.load(best_model_path, map_location=torch.device('cpu'))
    model.load_state_dict(chkpt['checkpoints'])
    if isbody:
        logging('load best body model from {} and test ...'.format(best_model_path))
    else:
        logging('load best model from {} and test ...'.format(best_model_path))
    test(model, test_ds, test_dl, criterion, threshold, 'test', isbody)
    # Move the model back to CPU so GPU memory is released after training.
    model.cpu()
def train_and_evaluate(cfg):
    """Fine-tune a (optionally SSL-pretrained) classifier and evaluate it.

    Creates the experiment directory, trains for ``cfg.num_epochs`` epochs
    while validating after each one, checkpoints every
    ``cfg.save_intermediate_weights`` epochs (and whenever validation loss
    improves), then reloads the best checkpoint and reports test metrics.

    Args:
        cfg: experiment configuration object (argparse/namespace-like) with
            fields such as exp_type, save_dir, log, use_cuda, cuda_num,
            use_pretrained, use_ssl, opt, lr, momentum, scheduler,
            num_epochs, save_intermediate_weights, network.
    """
    # Training settings / experiment bookkeeping
    experiment_dir = os.path.join('experiments', cfg.exp_type, cfg.save_dir)
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)

    utils.set_logger(os.path.join(experiment_dir, cfg.log))
    logging.info('-----------Starting Experiment------------')
    use_cuda = cfg.use_cuda and torch.cuda.is_available()
    cfg.use_cuda = use_cuda
    device = torch.device(
        "cuda:{}".format(cfg.cuda_num) if use_cuda else "cpu")

    # initialize the tensorboard summary writer
    logs = os.path.join('experiments', cfg.exp_type, 'tboard_sup_demo')
    writer = SummaryWriter(logs + '/rotnet_without_pretrain')

    # get the dataloaders
    dloader_train, dloader_val, dloader_test = dataloaders.get_dataloaders(cfg)

    # Load the model
    model = models.get_model(cfg)

    if cfg.use_pretrained:
        # Warm-start from supervised pretrained weights (non-strict: allows
        # head mismatch).
        pretrained_path = os.path.join('experiments', 'supervised',
                                       cfg.pretrained_dir,
                                       cfg.pretrained_weights)
        state_dict = torch.load(pretrained_path, map_location=device)
        model.load_state_dict(state_dict, strict=False)
        logging.info('loading pretrained_weights {}'.format(
            cfg.pretrained_weights))

    if cfg.use_ssl:
        ssl_exp_dir = os.path.join('experiments',
                                   'self-supervised',
                                   cfg.ssl_pretrained_exp_path)
        state_dict = torch.load(os.path.join(ssl_exp_dir, cfg.ssl_weight),
                                map_location=device)
        # the stored dict has 3 informations - epoch, state_dict and optimizer
        state_dict = state_dict['state_dict']
        print(state_dict.keys())
        # Drop the classifier head and the last two residual blocks so they
        # are re-initialized and learned on the downstream task.
        for key in ('fc.weight', 'fc.bias',
                    'layer4.0.conv1.weight', 'layer4.0.conv2.weight',
                    'layer4.1.conv1.weight', 'layer4.1.conv2.weight',
                    'layer3.0.conv1.weight', 'layer3.0.conv2.weight',
                    'layer3.1.conv1.weight', 'layer3.1.conv2.weight'):
            del state_dict[key]
        model.load_state_dict(state_dict, strict=False)

        # Only finetune fc + layer3/layer4 convs, freeze everything else.
        # BUG FIX: the original condition was
        #     if 'fc' or 'layer3.0.conv' ... in name:
        # which is always True ('fc' is a truthy string), so nothing was
        # ever frozen. Use explicit membership tests instead.
        finetune_keys = ('fc', 'layer3.0.conv', 'layer3.1.conv',
                         'layer4.0.conv', 'layer4.1.conv')
        for name, param in model.named_parameters():
            param.requires_grad = any(k in name for k in finetune_keys)

    model = model.to(device)

    # Log the model graph once, using a real training batch.
    images, _, _, _ = next(iter(dloader_train))
    images = images.to(device)
    writer.add_graph(model, images)

    # follow the same setting as RotNet paper
    if cfg.opt == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=float(cfg.lr),
                              momentum=float(cfg.momentum),
                              weight_decay=5e-4,
                              nesterov=True)
    elif cfg.opt == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=float(cfg.lr))

    if cfg.scheduler:
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=[60, 120, 160, 200],
                                             gamma=0.2)
    else:
        scheduler = None

    criterion = nn.CrossEntropyLoss()

    # iter_cnt is a module-level counter shared with train()/validate().
    global iter_cnt
    iter_cnt = 0

    best_loss = 1000
    for epoch in range(cfg.num_epochs):
        logging.info('\nTrain for Epoch: {}/{}'.format(epoch, cfg.num_epochs))
        train_loss, train_acc = train(epoch, model, device, dloader_train,
                                      optimizer, scheduler, criterion,
                                      experiment_dir, writer)

        # validate after every epoch
        logging.info('\nValidate for Epoch: {}/{}'.format(epoch,
                                                          cfg.num_epochs))
        val_loss, val_acc = validate(epoch, model, device, dloader_val,
                                     criterion, experiment_dir, writer)
        logging.info('Val Epoch: {} Avg Loss: {:.4f} \t Avg Acc: {:.4f}'.format(
            epoch, val_loss, val_acc))

        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        # Checkpoint on the periodic cadence, and always when val improves.
        if epoch % cfg.save_intermediate_weights == 0 or is_best:
            utils.save_checkpoint(
                {'Epoch': epoch,
                 'state_dict': model.state_dict(),
                 'optim_dict': optimizer.state_dict()},
                is_best, experiment_dir,
                checkpoint='{}_epoch{}_checkpoint.pth'.format(
                    cfg.network.lower(), str(epoch)),
                best_model='{}_best.pth'.format(cfg.network.lower()))
    writer.close()

    # Evaluate the best checkpoint on the test split.
    logging.info('\nEvaluate test result on best ckpt')
    state_dict = torch.load(
        os.path.join(experiment_dir,
                     '{}_best.pth'.format(cfg.network.lower())),
        map_location=device)
    model.load_state_dict(state_dict, strict=False)
    test_loss, test_acc = test(model, device, dloader_test, criterion,
                               experiment_dir)
    logging.info('Test: Avg Loss: {:.4f} \t Avg Acc: {:.4f}'.format(
        test_loss, test_acc))

    # save the configuration file within that experiment directory
    utils.save_yaml(cfg,
                    save_path=os.path.join(experiment_dir, 'config_sl.yaml'))
    logging.info('-----------End of Experiment------------')
# fitnesses = map(toolbox.evaluate, invalid_ind) # for ind, fit in zip(invalid_ind, fitnesses): # ind.fitness.values = fit # # # The population is entirely replaced by the offspring # pop[:] = offspring return pop, logbook, hof if __name__ == "__main__": # pop, logbook, hof = main() # pprint(logbook) # # # Best controller # print(hof.items[0]) # Save best as CSV # For some reason saves under _practice package # np.savetxt("./bestController_fully_10nodes2000.csv", hof.items[0], # delimiter=",") # agent.set_weights(hof.items[0]) # np.savetxt("./bestController_bu.csv", hof.items[0], delimiter=",") # agent.set_weights(hof.items[0]) agent.set_weights( np.loadtxt("./bestController_fully_10nodes2000.csv", delimiter=",")) from evaluate import test print(test(agent=agent))
def train_and_evaluate(cfg):
    """Fine-tune the fc head of an (optionally SSL-pretrained) model.

    Trains with SGD, validates every epoch, checkpoints periodically (and on
    validation-loss improvement), then evaluates on the test split and saves
    the config YAML into the experiment directory.

    Args:
        cfg: experiment configuration object with fields such as exp_type,
            save_dir, log, use_cuda, cuda_num, ssl_pretrained_exp_path,
            ssl_weight, lr, momentum, scheduler, num_epochs,
            save_intermediate_weights, network, num_rot.
    """
    # Training settings
    experiment_dir = os.path.join('experiments', cfg.exp_type, cfg.save_dir)
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)

    utils.set_logger(os.path.join(experiment_dir, cfg.log))
    logging.info('-----------Starting Experiment------------')
    use_cuda = cfg.use_cuda and torch.cuda.is_available()
    cfg.use_cuda = use_cuda
    device = torch.device(
        "cuda:{}".format(cfg.cuda_num) if use_cuda else "cpu")

    # initialize the tensorboard summary writer
    writer = SummaryWriter(experiment_dir + '/tboard')

    ## get the dataloaders
    dloader_train, dloader_val, dloader_test = dataloaders.get_dataloaders(
        cfg, val_split=.2)

    # Load the model
    model = models.get_model(cfg)

    if cfg.ssl_pretrained_exp_path:
        # BUG FIX: the original chained assignment
        #     ssl_exp_dir = experiment_dir = os.path.join(...)
        # clobbered experiment_dir, so every later checkpoint, log and the
        # saved YAML config landed inside the self-supervised pretrain
        # directory instead of this experiment's directory.
        ssl_exp_dir = os.path.join('experiments',
                                   'self-supervised',
                                   cfg.ssl_pretrained_exp_path)
        state_dict = torch.load(os.path.join(ssl_exp_dir, cfg.ssl_weight),
                                map_location=device)
        # the stored dict has 3 informations - epoch, state_dict and optimizer
        state_dict = state_dict['state_dict']
        # Drop the classifier head; it is re-initialized for this task.
        del state_dict['fc.weight']
        del state_dict['fc.bias']
        model.load_state_dict(state_dict, strict=False)

        # Only finetune fc layer
        for name, param in model.named_parameters():
            if 'fc' not in name:
                param.requires_grad = False

    model = model.to(device)

    # Log the model graph once, using a real training batch.
    images, _, _, _ = next(iter(dloader_train))
    images = images.to(device)
    writer.add_graph(model, images)

    # follow the same setting as RotNet paper
    optimizer = optim.SGD(model.parameters(),
                          lr=float(cfg.lr),
                          momentum=float(cfg.momentum),
                          weight_decay=5e-4,
                          nesterov=True)
    if cfg.scheduler:
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=[60, 120, 160, 200],
                                             gamma=0.2)
    else:
        scheduler = None

    criterion = nn.CrossEntropyLoss()

    best_loss = 1000
    # NOTE(review): range(num_epochs + 1) runs num_epochs + 1 epochs
    # (0..num_epochs inclusive) — kept as in the original; confirm intent.
    for epoch in range(cfg.num_epochs + 1):
        logging.info('\nTrain for Epoch: {}/{}'.format(epoch, cfg.num_epochs))
        train_loss, train_acc = train(epoch, model, device, dloader_train,
                                      optimizer, scheduler, criterion,
                                      experiment_dir, writer)

        # validate after every epoch
        logging.info('\nValidate for Epoch: {}/{}'.format(
            epoch, cfg.num_epochs))
        val_loss, val_acc = validate(epoch, model, device, dloader_val,
                                     criterion, experiment_dir, writer)
        logging.info(
            'Val Epoch: {} Avg Loss: {:.4f} \t Avg Acc: {:.4f}'.format(
                epoch + 1, val_loss, val_acc))

        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        # FIX: also checkpoint whenever validation improves; previously
        # is_best was computed but a new best between save intervals was
        # never written to disk.
        if epoch % cfg.save_intermediate_weights == 0 or is_best:
            utils.save_checkpoint(
                {'Epoch': epoch,
                 'state_dict': model.state_dict(),
                 'optim_dict': optimizer.state_dict()},
                is_best, experiment_dir,
                checkpoint='{}_{}rot_epoch{}_checkpoint.pth'.format(
                    cfg.network.lower(), str(cfg.num_rot), str(epoch)),
                best_model='{}_{}rot_epoch{}_best.pth'.format(
                    cfg.network.lower(), str(cfg.num_rot), str(epoch)))
    writer.close()

    # Evaluate the final model on the test split.
    logging.info('\nEvaluate on test')
    test_loss, test_acc = test(model, device, dloader_test, criterion,
                               experiment_dir)
    logging.info('Test: Avg Loss: {:.4f} \t Avg Acc: {:.4f}'.format(
        test_loss, test_acc))

    # save the configuration file within that experiment directory
    utils.save_yaml(cfg,
                    save_path=os.path.join(experiment_dir, 'config_sl.yaml'))
    logging.info('-----------End of Experiment------------')
def train_function(config):
    """Run curriculum-staged multi-agent RL training for one experiment.

    Supports three experiment types ('sumo', 'particle', 'checkers'),
    builds the chosen algorithm's TF1 graph, optionally restores weights
    from a previous curriculum stage, then trains for N_train episodes
    with epsilon-greedy exploration, a replay buffer (optionally a dual
    good/other buffer), periodic evaluation, CSV logging and checkpointing.

    Args:
        config (dict): parsed experiment configuration (see keys read below).

    Side effects: creates/overwrites '../log/<dir_name>/log.csv' and
    'log_century.csv', writes TF summaries and checkpoints under
    '../saved/<dir_name>'.
    """
    # ----------- Alg parameters ----------------- #
    experiment = config['experiment']
    if experiment == "particle":
        scenario_name = config['scenario']
    seed = config['seed']
    np.random.seed(seed)
    random.seed(seed)
    # Curriculum stage
    stage = config['stage']
    port = config['port']
    dir_name = config['dir_name']
    dir_restore = config['dir_restore']
    use_alg_credit = config['use_alg_credit']
    use_qmix = config['use_qmix']
    use_Q_credit = config['use_Q_credit']
    # If 1, then uses Q-net and global reward
    use_Q = config['use_Q']
    use_V = config['use_V']
    if experiment == "sumo":
        dimensions = config['dimensions_sumo']
    elif experiment == "particle":
        dimensions = config['dimensions_particle']
    # If 1, then restores variables from same stage
    restore_same_stage = config['restore_same_stage']
    # If 1, then does not restore variables, even if stage > 1
    train_from_nothing = config['train_from_nothing']
    # Name of model to restore
    model_name = config['model_name']
    # Total number of training episodes
    N_train = config['N_train']
    period = config['period']
    # Number of evaluation episodes to run every <period>
    N_eval = config['N_eval']
    summarize = config['summarize']
    alpha = config['alpha']
    lr_Q = config['lr_Q']
    lr_V = config['lr_V']
    lr_actor = config['lr_actor']
    dual_buffer = config['dual_buffer']
    buffer_size = config['buffer_size']
    threshold = config['threshold']
    batch_size = config['batch_size']
    pretrain_episodes = config['pretrain_episodes']
    steps_per_train = config['steps_per_train']
    max_steps = config['max_steps']
    # Probability of using random configuration
    prob_random = config['prob_random']
    epsilon_start = config['epsilon_start']
    epsilon_end = config['epsilon_end']
    epsilon_div = config['epsilon_div']
    # Linear epsilon decay: reaches epsilon_end after epsilon_div episodes.
    epsilon_step = (epsilon_start - epsilon_end) / float(epsilon_div)

    if experiment == "sumo":
        # ----------- SUMO parameters ---------------- #
        with open('config_sumo_stage%d.json' % stage) as f:
            config_sumo = json.load(f)
        n_agents = config_sumo["n_agents"]
        list_goals_fixed = config_sumo['goal_lane']
        list_routes_fixed = config_sumo['route']
        list_lanes_fixed = config_sumo['lane']
        list_goal_pos = config_sumo['goal_pos']
        list_speeds = config_sumo['speed']
        init_positions = config_sumo['init_position']
        list_id = config_sumo['id']
        list_vtypes = config_sumo['vtypes']
        depart_mean = config_sumo['depart_mean']
        depart_stdev = config_sumo['depart_stdev']
        total_length = config_sumo['total_length']
        total_width = config_sumo['total_width']
        save_threshold = config_sumo['save_threshold']
        map_route_idx = {'route_ramp': 0, 'route_straight': 1}
        sim = sumo_simulator.Simulator(port,
                                       list_id=list_id,
                                       other_lc_mode=0b1000000001,
                                       sublane_res=0.8,
                                       seed=seed)
        # Warm up the simulator for 2 simulated seconds before training.
        for i in range(int(2 / sim.dt)):
            sim.step()
    elif experiment == 'particle':
        with open(config["particle_config"]) as f:
            config_particle = json.load(f)
        n_agents = config_particle['n_agents']
        scenario = scenarios.load(scenario_name + ".py").Scenario()
        world = scenario.make_world(n_agents, config_particle, prob_random)
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, None, scenario.done,
                            max_steps=max_steps)
    elif experiment == 'checkers':
        with open("config_checkers_stage%d.json" % stage) as f:
            config_checkers = json.load(f)
        n_agents = config_checkers['n_agents']
        dimensions = config_checkers['dimensions']
        init = config_checkers['init']
        env = checkers.Checkers(init['n_rows'], init['n_columns'],
                                init['n_obs'], init['agents_r'],
                                init['agents_c'], n_agents, max_steps)

    l_action = dimensions['l_action']
    l_goal = dimensions['l_goal']

    # Create entire computational graph
    # Creation of new trainable variables for new curriculum
    # stage is handled by networks.py, given the stage number
    if use_alg_credit:
        if experiment == 'checkers':
            alg = alg_credit_checkers.Alg(experiment, dimensions, stage,
                                          n_agents, lr_V=lr_V, lr_Q=lr_Q,
                                          lr_actor=lr_actor,
                                          use_Q_credit=use_Q_credit,
                                          use_V=use_V,
                                          nn=config_checkers['nn'])
        else:
            alg = alg_credit.Alg(experiment, dimensions, stage, n_agents,
                                 lr_V=lr_V, lr_Q=lr_Q, lr_actor=lr_actor,
                                 use_Q_credit=use_Q_credit, use_V=use_V,
                                 nn=config['nn'])
    elif not use_qmix:
        if experiment == 'checkers':
            alg = alg_baseline_checkers.Alg(experiment, dimensions, stage,
                                            n_agents, lr_V=lr_V, lr_Q=lr_Q,
                                            lr_actor=lr_actor, use_Q=use_Q,
                                            use_V=use_V, alpha=alpha,
                                            nn=config_checkers['nn'],
                                            IAC=config['IAC'])
        else:
            alg = alg_baseline.Alg(experiment, dimensions, stage, n_agents,
                                   lr_V=lr_V, lr_Q=lr_Q, lr_actor=lr_actor,
                                   use_Q=use_Q, use_V=use_V, alpha=alpha,
                                   nn=config['nn'], IAC=config['IAC'])
    else:
        print("Using QMIX")
        if experiment == 'checkers':
            alg = alg_qmix_checkers.Alg(experiment, dimensions, stage,
                                        n_agents, lr_Q=lr_Q,
                                        nn=config_checkers['nn'])
        else:
            alg = alg_qmix.Alg(experiment, dimensions, stage, n_agents,
                               lr_Q=lr_Q)
    print("Initialized computational graph")

    # Choose which variables the restore-saver will load, depending on the
    # curriculum stage: stage-new variables are excluded from restoration.
    list_variables = tf.trainable_variables()
    if stage == 1 or restore_same_stage or train_from_nothing:
        saver = tf.train.Saver()
    elif stage == 2:
        # Stage 2 additionally excludes the policy target and Q_credit nets,
        # which are (re-)initialized rather than restored.
        to_restore = []
        for v in list_variables:
            list_split = v.name.split('/')
            if ('stage-%d' % stage not in list_split
                    ) and ('Policy_target' not in list_split) and (
                        'Q_credit_main' not in list_split) and (
                            'Q_credit_target' not in list_split):
                to_restore.append(v)
        saver = tf.train.Saver(to_restore)
    else:
        # restore only those variables that were not
        # just created at this curriculum stage
        to_restore = [
            v for v in list_variables
            if 'stage-%d' % stage not in v.name.split('/')
        ]
        saver = tf.train.Saver(to_restore)

    # NOTE: rebinding `config` here shadows the function's dict argument;
    # all config-dict reads above happen before this point.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf.set_random_seed(seed)
    sess = tf.Session(config=config)
    writer = tf.summary.FileWriter('../saved/%s' % dir_name, sess.graph)
    sess.run(tf.global_variables_initializer())
    print("Initialized variables")

    if train_from_nothing == 0:
        print("Restoring variables from %s" % dir_restore)
        saver.restore(sess, '../saved/%s/%s' % (dir_restore, model_name))
        if stage == 2 and use_alg_credit and use_Q_credit:
            # Copy weights of Q_global to Q_credit at the start of Stage 2
            sess.run(alg.list_initialize_credit_ops)
        # Debug print of one kernel from each Q net to verify the copy.
        for var in list_variables:
            if var.name == 'Q_global_main/Q_branch1/kernel:0':
                print("Q_global")
                print(sess.run(var))
                print("")
            if var.name == 'Q_credit_main/Q_branch1/kernel:0':
                print("Q_credit")
                print(sess.run(var))
                print("")

    # initialize target networks to equal main networks
    sess.run(alg.list_initialize_target_ops)

    # save everything without exclusion
    saver = tf.train.Saver(max_to_keep=None)

    epsilon = epsilon_start
    # For computing average over 100 episodes
    reward_local_century = np.zeros(n_agents)
    reward_global_century = 0

    # Write log headers
    header = "Episode,r_global"
    header_c = "Century,r_global_avg"
    for idx in range(n_agents):
        header += ',r_%d' % idx
        header_c += ',r_avg_%d' % idx
    header_c += ",r_global_eval"
    for idx in range(n_agents):
        header_c += ',r_eval_%d' % idx
    if experiment == 'sumo':
        for idx in range(n_agents):
            header += ',route_%d,lane_%d,goal_%d' % (idx, idx, idx)
    header_c += ',r_eval_local,duration (s)'
    header += '\n'
    header_c += '\n'
    if not os.path.exists('../log/%s' % dir_name):
        os.makedirs('../log/%s' % dir_name)
    with open('../log/%s/log.csv' % dir_name, 'w') as f:
        f.write(header)
    with open('../log/%s/log_century.csv' % dir_name, 'w') as f:
        f.write(header_c)

    # Dual buffer keeps "good" episodes separate from the rest.
    if dual_buffer:
        buf = replay_buffer_dual.Replay_Buffer(size=buffer_size)
    else:
        buf = replay_buffer.Replay_Buffer(size=buffer_size)

    t_start = time.time()
    dist_action = np.zeros(l_action)
    step = 0
    # Each iteration is a training episode
    for idx_episode in range(1, N_train + 1):
        if experiment == "sumo":
            t_ms = sim.traci.simulation.getCurrentTime()
            # SUMO time functions return negative values afer 24 days (in millisecond) of simulation time
            # Hence use 0 for departure time, essentially triggering an immediate departure
            if 0 < t_ms and t_ms < 2073600e3:
                depart_times = [
                    np.random.normal(t_ms / 1000.0 + depart_mean[idx],
                                     depart_stdev)
                    for idx in range(n_agents)
                ]
            else:
                depart_times = [0 for idx in range(n_agents)]
            # Goals for input to policy and value function
            goals = np.zeros([n_agents, l_goal])
            list_routes = ['route_straight'] * n_agents
            list_lanes = [0] * n_agents
            list_goal_lane = [0] * n_agents
            rand_num = random.random()
            if rand_num < prob_random:
                # Random settings for route, lane and goal
                init = 'Random'
                for idx in range(n_agents):
                    route = 'route_straight'
                    lane = np.random.choice([0, 1, 2, 3], p=np.ones(4) * 0.25)
                    goal_lane = np.random.choice(np.arange(l_goal),
                                                 p=np.ones(l_goal) /
                                                 float(l_goal))
                    list_routes[idx] = route
                    list_lanes[idx] = lane
                    list_goal_lane[idx] = goal_lane
                    goals[idx, goal_lane] = 1
            else:
                init = 'Preset'
                # Use predetermined values for route, lane, goal
                for idx in range(n_agents):
                    list_routes[idx] = list_routes_fixed[idx]
                    goal_lane = list_goals_fixed[idx]
                    list_goal_lane[idx] = goal_lane
                    list_lanes[idx] = list_lanes_fixed[idx]
                    goals[idx, goal_lane] = 1
            env = multicar_simple.Multicar(sim, n_agents, list_goal_lane,
                                           list_goal_pos, list_routes,
                                           list_speeds, list_lanes,
                                           init_positions, list_id,
                                           list_vtypes, depart_times,
                                           total_length=total_length,
                                           total_width=total_width,
                                           safety=True)
            global_state, local_others, local_self, done = env.reset()
        elif experiment == "particle":
            global_state, local_others, local_self, done = env.reset()
            # Goals are the landmark positions each agent must reach.
            goals = np.zeros([n_agents, l_goal])
            for idx in range(n_agents):
                goals[idx] = env.world.landmarks[idx].state.p_pos
        elif experiment == "checkers":
            if n_agents == 1:
                # Single agent: random one-hot goal (left or right).
                if np.random.randint(2) == 0:
                    goals = np.array([[1, 0]])
                else:
                    goals = np.array([[0, 1]])
            else:
                goals = np.eye(n_agents)
            global_state, local_others, local_self_t, local_self_v, done = env.reset(
                goals)
            # Checkers policy also conditions on the previous joint action.
            actions_prev = np.zeros(n_agents, dtype=np.int)

        reward_global = 0
        reward_local = np.zeros(n_agents)
        # step = 0
        # NOTE(review): `step` is intentionally NOT reset per episode, so the
        # (step % steps_per_train) training cadence is global — confirm.
        summarized = False
        if dual_buffer:
            buf_episode = []
        while not done:
            if idx_episode < pretrain_episodes and (stage == 1 or
                                                    train_from_nothing == 1):
                # Random actions when filling replay buffer
                actions = np.random.randint(0, l_action, n_agents)
            else:
                # Run actor network for all agents as batch
                if experiment == 'checkers':
                    actions = alg.run_actor(actions_prev, local_others,
                                            local_self_t, local_self_v,
                                            goals, epsilon, sess)
                else:
                    actions = alg.run_actor(local_others, local_self, goals,
                                            epsilon, sess)
            # Track the action distribution of agent 0 for logging.
            dist_action[actions[0]] += 1
            if experiment == 'sumo':
                # check feasible actions
                actions = env.check_actions(actions)

            # step environment
            if experiment == 'checkers':
                next_global_state, next_local_others, next_local_self_t, next_local_self_v, reward, local_rewards, done = env.step(
                    actions)
            else:
                next_global_state, next_local_others, next_local_self, reward, local_rewards, done = env.step(
                    actions)
            step += 1

            # store transition into memory
            if dual_buffer:
                if experiment == 'checkers':
                    buf_episode.append(
                        np.array([
                            global_state[0], global_state[1],
                            np.array(local_others),
                            np.array(local_self_t),
                            np.array(local_self_v), actions_prev, actions,
                            reward, local_rewards, next_global_state[0],
                            next_global_state[1],
                            np.array(next_local_others),
                            np.array(next_local_self_t),
                            np.array(next_local_self_v), done, goals
                        ]))
                else:
                    buf_episode.append(
                        np.array([
                            global_state,
                            np.array(local_others),
                            np.array(local_self), actions, reward,
                            local_rewards, next_global_state,
                            np.array(next_local_others),
                            np.array(next_local_self), done, goals
                        ]))
            else:
                if experiment == 'checkers':
                    buf.add(
                        np.array([
                            global_state[0], global_state[1],
                            np.array(local_others),
                            np.array(local_self_t),
                            np.array(local_self_v), actions_prev, actions,
                            reward, local_rewards, next_global_state[0],
                            next_global_state[1],
                            np.array(next_local_others),
                            np.array(next_local_self_t),
                            np.array(next_local_self_v), done, goals
                        ]))
                else:
                    buf.add(
                        np.array([
                            global_state,
                            np.array(local_others),
                            np.array(local_self), actions, reward,
                            local_rewards, next_global_state,
                            np.array(next_local_others),
                            np.array(next_local_self), done, goals
                        ]))

            if (idx_episode >= pretrain_episodes) and (step % steps_per_train
                                                       == 0):
                # Sample batch of transitions from replay buffer
                batch = buf.sample_batch(batch_size)
                if summarize and idx_episode % period == 0 and not summarized:
                    # Write TF summary every <period> episodes,
                    # at the first <steps_per_train> step
                    alg.train_step(sess, batch, epsilon, idx_episode,
                                   summarize=True, writer=writer)
                    summarized = True
                else:
                    alg.train_step(sess, batch, epsilon, idx_episode,
                                   summarize=False, writer=None)

            # Advance observations to the next timestep.
            global_state = next_global_state
            local_others = next_local_others
            if experiment == 'checkers':
                local_self_t = next_local_self_t
                local_self_v = next_local_self_v
                actions_prev = actions
            else:
                local_self = next_local_self
            reward_local += local_rewards
            reward_global += reward

        if dual_buffer:
            # Second arg flags whether this episode goes into the
            # "others" (bad) partition of the dual buffer.
            if experiment == 'sumo':
                buf.add(buf_episode, np.sum(reward_local) < threshold)
            elif experiment == 'particle':
                buf.add(buf_episode, scenario.collisions != 0)

        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step

        reward_local_century += reward_local
        reward_global_century += reward_global

        # ----------- Log performance --------------- #
        if idx_episode % period == 0:
            dist_action = dist_action / np.sum(dist_action)
            t_end = time.time()
            print("\n Evaluating")
            if experiment == 'sumo':
                r_local_eval, r_global_eval = evaluate.test(
                    N_eval, sim, sess, depart_mean, depart_stdev, n_agents,
                    l_goal, list_routes_fixed, list_lanes_fixed,
                    list_goals_fixed, prob_random, list_goal_pos, list_speeds,
                    init_positions, list_id, list_vtypes, alg)
                # Checkpoint whenever all agents beat the save threshold.
                if np.all(r_local_eval > save_threshold):
                    saver.save(
                        sess, '../saved/%s/model_good_%d.ckpt' %
                        (dir_name, idx_episode))
            elif experiment == 'particle':
                r_local_eval, r_global_eval = evaluate.test_particle(
                    N_eval, env, sess, n_agents, l_goal, alg, render=False)
            elif experiment == 'checkers':
                r_local_eval, r_global_eval = evaluate.test_checkers(
                    N_eval, env, sess, n_agents, alg)
                if stage == 1 and np.sum(r_local_eval) > 9.0:
                    saver.save(
                        sess, '../saved/%s/model_good_%d.ckpt' %
                        (dir_name, idx_episode))
            # Append one row of per-<period> averages + eval results.
            s = '%d,%.2f,' % (idx_episode,
                              reward_global_century / float(period))
            s += ','.join([
                '{:.2f}'.format(val / float(period))
                for val in reward_local_century
            ])
            s += ',%.2f,' % (r_global_eval)
            s += ','.join(['{:.2f}'.format(val) for val in r_local_eval])
            s += ',%.2f,%d' % (np.sum(r_local_eval), int(t_end - t_start))
            s += '\n'
            print(s)
            with open('../log/%s/log_century.csv' % dir_name, 'a') as f:
                f.write(s)
            reward_local_century = np.zeros(n_agents)
            reward_global_century = 0
            print("Action distribution ", dist_action)
            if dual_buffer:
                print(
                    "length buffer good %d, length buffer others %d, epsilon %.3f"
                    % (len(buf.memory_2), len(buf.memory_1), epsilon))
            else:
                print("epsilon %.3f" % epsilon)
            dist_action = np.zeros(l_action)
            t_start = time.time()

        # Per-episode log row.
        s = '%d,%.2f,' % (idx_episode, reward_global)
        s += ','.join(['{:.2f}'.format(val) for val in reward_local])
        if experiment == 'sumo':
            for idx in range(n_agents):
                s += ',%d,%d,%d' % (map_route_idx[list_routes[idx]],
                                    list_lanes[idx], list_goal_lane[idx])
        s += '\n'
        with open('../log/%s/log.csv' % dir_name, 'a') as f:
            f.write(s)

    print("Saving stage %d variables" % stage)
    if not os.path.exists('../saved/%s' % dir_name):
        os.makedirs('../saved/%s' % dir_name)
    saver.save(sess, '../saved/%s/model_final.ckpt' % dir_name)
# Top-level script section: build a triple-siamese (anchor/positive/negative)
# wrapper around a pretrained base network, load saved weights, and evaluate.
# `base_model`, `resnet_model`, `Input`, `Lambda`, `Model`, `K`,
# `euclidean_distance` and `evaluate` are defined/imported earlier in the file.
base_model.summary()
"""
Train just the new layers, let the pretrained ones be as they are (they'll be trained later)
"""
# Freeze every pretrained ResNet layer so only the newly added layers learn.
for layer in resnet_model.layers:
    layer.trainable = False
"""
Building triple siamese architecture
"""
input_shape = (224, 224, 3)
input_anchor = Input(shape=input_shape, name='input_anchor')
input_positive = Input(shape=input_shape, name='input_pos')
input_negative = Input(shape=input_shape, name='input_neg')

# The same base network embeds all three inputs (shared weights).
net_anchor = base_model(input_anchor)
net_positive = base_model(input_positive)
net_negative = base_model(input_negative)

# Distances between anchor-positive and anchor-negative embeddings.
positive_dist = Lambda(euclidean_distance,
                       name='pos_dist')([net_anchor, net_positive])
negative_dist = Lambda(euclidean_distance,
                       name='neg_dist')([net_anchor, net_negative])
# Stack the two distances into one tensor (used by a triplet-style loss).
stacked_dists = Lambda(lambda vects: K.stack(vects, axis=1),
                       name='stacked_dists')([positive_dist, negative_dist])

# The trainable triple-input model outputs the three raw embeddings.
model = Model([input_anchor, input_positive, input_negative],
              [net_anchor, net_positive, net_negative],
              name='gen')
model.load_weights('./model_weights.h5')
# Evaluate using only the shared embedding network.
evaluate.test(base_model)
def train(model, device, train_loader, val_loader, test_loader, optimizer,
          epoch, batch_log_interval=10, patience=20, min_delta=0.003):
    """
    This function runs the training script of the model

    Args:
        model (obj): which model to train
        device (torch.device): device to run on, cpu or whether to enable cuda
        train_loader (torch.utils.data.dataloader.DataLoader): dataloader object
        val_loader (torch.utils.data.dataloader.DataLoader): dataloader object
            for validation
        test_loader (torch.utils.data.dataloader.DataLoader): dataloader object
            for testing at would-be early stopping iteration
        optimizer (torch.optim obj): which optimizer to use
        epoch (int): which epoch we're on
        batch_log_interval (int): how often to log results
        patience (int): how many iterations/batches (not epochs) we will
            tolerate a val_loss improvement < min_delta
        min_delta (float): early stopping threshold; if val_loss improves by
            less than min_delta for patience # of iterations, early stopping
            is considered to have occurred

    Returns:
        (stop, batch_idx): stop (bool) is whether early stopping would have
        occurred; batch_idx (int or None) is the iteration it triggered at.
    """
    print('min_delta:', min_delta)
    no_improvement_count = 0
    stop = False
    print('IN TRAIN, DEVICE:', device)
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = model.loss(output, target)
        loss.backward()
        optimizer.step()

        # For each iteration (batch), get validation loss for early stopping.
        if batch_idx == 0:
            val_loss = validate(model, device, val_loader)
            prev_val_loss = val_loss
        else:
            prev_val_loss = val_loss
            val_loss = validate(model, device, val_loader)

        # BUG FIX: the original test was abs(prev_val_loss - val_loss) <
        # min_delta, which treats a large *increase* in validation loss as
        # an improvement and resets the counter. Use the signed improvement:
        # a decrease smaller than min_delta (or any increase) counts as no
        # improvement.
        if prev_val_loss - val_loss < min_delta:
            no_improvement_count += 1
        else:
            no_improvement_count = 0

        # Trigger early stopping (>= is robust even if the counter could
        # ever jump past the exact patience value).
        if no_improvement_count >= patience:
            print(
                'Early Stopping Triggered at iteration {} within epoch. '
                'Done Training. val_loss = {:.6f}, prev_val_loss = {:.6f}'
                .format(batch_idx, val_loss, prev_val_loss))
            test(model, device, test_loader)
            stop = True
            return stop, batch_idx

        if batch_idx % batch_log_interval == 0:
            print(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tTrain Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
    return stop, None
############################################### ## Predict ## ############################################### if not args.test: utils.load_model(args.modeldir, model) dev_output_file = None test_output_file = None if args.output_pred: dev_output_file = os.path.join(args.predout, 'predict_dev.txt') test_output_file = os.path.join(args.predout, 'predict_test.txt') dev_scores = test(model, dev_iter, beam_width=args.beam_width, output_file=dev_output_file, cuda=args.cuda, verbose=args.verbose, covered=False) logger.info("dev acc: %f, ed: %f" % (dev_scores['acc'], dev_scores['ed'])) test_scores = test(model, test_iter, beam_width=args.beam_width, output_file=test_output_file, cuda=args.cuda, verbose=args.verbose, covered=args.covered_test) if args.covered_test: logger.info("test acc: %f, ed: %f" %