def main():
    env = gym.make('CartPole-v0')
    gen_idx = 0
    nets = [
        model.CartPoleNet(env.observation_space.shape[0], env.action_space.n)
        for _ in range(POPULATION_SIZE)
    ]
    population = [(net, evaluate(env, net)) for net in nets]
    while True:
        # rank the population by fitness, best first
        population.sort(key=lambda p: p[1], reverse=True)
        rewards = [p[1] for p in population[:PARENTS_COUNT]]
        reward_mean = np.mean(rewards)
        reward_max = np.max(rewards)
        reward_std = np.std(rewards)
        print("%d: reward_mean=%.2f, reward_max=%.2f, reward_std=%.2f" % (
            gen_idx, reward_mean, reward_max, reward_std))
        if reward_mean > 199:
            print("Solved in %d generations" % gen_idx)
            break
        # elitism: carry the best individual over, then refill the population
        # with mutated copies of randomly chosen parents
        prev_population = population
        population = [population[0]]
        for _ in range(POPULATION_SIZE - 1):
            parent_idx = np.random.randint(0, PARENTS_COUNT)
            parent = prev_population[parent_idx][0]
            net = mutate_parent(parent)
            fitness = evaluate(env, net)
            population.append((net, fitness))
        gen_idx += 1
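# A minimal sketch of the two helpers the loop above relies on, assuming a
# PyTorch CartPoleNet that outputs action scores; NOISE_STD is a hypothetical
# mutation-strength constant, and `evaluate` plays one greedy episode.
import copy
import numpy as np
import torch

NOISE_STD = 0.01  # assumed mutation strength

def mutate_parent(parent):
    # mutate by adding Gaussian noise to a deep copy of the parent's weights
    net = copy.deepcopy(parent)
    for p in net.parameters():
        noise = np.random.normal(size=p.data.size()).astype(np.float32)
        p.data += NOISE_STD * torch.from_numpy(noise)
    return net

def evaluate(env, net):
    # fitness = total undiscounted reward of one greedy episode
    obs = env.reset()
    total_reward = 0.0
    while True:
        obs_v = torch.FloatTensor([obs])
        action = net(obs_v).max(dim=1)[1].item()
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            return total_reward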
def test_evaluate_runs(predictions_file, out_folder):
    df_preds_w_targets = pd.read_csv(predictions_file)
    evaluate(df_preds_w_targets, out_folder=out_folder)
# run one forward pass so parameter shapes are inferred before re-initializing
for val_w, val_d, val_r, val_t in val_loader:
    net([val_w, val_d, val_r])
    break
net.initialize(ctx=ctx, init=MyInit(), force_reinit=True)

# initialize a trainer to train the model
trainer = gluon.Trainer(net.collect_params(), optimizer,
                        {'learning_rate': learning_rate})

# initialize a SummaryWriter to write information into the logs dir
sw = SummaryWriter(logdir=params_path, flush_secs=5)

# compute validation loss before training
compute_val_loss(net, val_loader, loss_function, sw, 0)

# compute testing set MAE, RMSE, MAPE before training
evaluate(net, test_loader, true_value, num_of_vertices, sw, 0)

# train model
global_step = 1
for epoch in range(1, epochs + 1):
    for train_w, train_d, train_r, train_t in train_loader:
        start_time = time()
        with autograd.record():
            output = net([train_w, train_d, train_r])
            l = loss_function(output, train_t)
        l.backward()
        trainer.step(train_t.shape[0])
        training_loss = l.mean().asscalar()
            recent=24, K=3, Kt=3)
net.to(device)  # move the model to the GPU if available

optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=wdecay)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, decay)
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [20, 30], gamma=0.7, last_epoch=-1)

# compute the initial validation loss at epoch 0, before any training
compute_val_loss(net, val_loader, loss_function, supports, device, epoch=0)

# compute testing set MAE, RMSE, MAPE before training
evaluate(net, test_loader, true_value, supports, device, epoch=0)

clip = 5
his_loss = []
train_time = []
for epoch in range(1, epochs + 1):
    train_l = []
    start_time_train = time()
    for train_w, train_d, train_r, train_t in train_loader:
        train_w = train_w.to(device)
        train_d = train_d.to(device)
        train_r = train_r.to(device)
        train_t = train_t.to(device)
        net.train()            # switch to training mode
        optimizer.zero_grad()  # reset accumulated gradients
# initialize a trainer to train the model
trainer = gluon.Trainer(net.collect_params(), optimizer,
                        {'learning_rate': learning_rate})

# initialize a SummaryWriter to write information into the logs dir
sw = SummaryWriter(logdir=params_path, flush_secs=5)

# compute validation loss before training
compute_val_loss(net, val_loader, loss_function, sw, epoch=0, ctx=ctx)

# compute testing set MAE, RMSE, MAPE before training
evaluate(net, test_loader, true_value, num_of_vertices, sw, epoch=0, ctx=ctx)

# train model
global_step = 1
for epoch in range(1, epochs + 1):
    for train_w, train_d, train_r, train_t in train_loader:
        # running on a single GPU: copy each batch to the training context
        train_w = train_w.as_in_context(ctx)
        train_d = train_d.as_in_context(ctx)
        train_r = train_r.as_in_context(ctx)
        train_t = train_t.as_in_context(ctx)
        with autograd.record():
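# A minimal sketch of the `compute_val_loss` helper called in the two Gluon
# snippets above, assuming the same (week, day, recent, target) batch layout;
# the mxboard SummaryWriter API is real, but the exact signature in the
# original project may differ.
def compute_val_loss(net, val_loader, loss_function, sw, epoch, ctx=None):
    """Average the loss over the whole validation set and log it."""
    val_losses = []
    for val_w, val_d, val_r, val_t in val_loader:
        if ctx is not None:
            val_w = val_w.as_in_context(ctx)
            val_d = val_d.as_in_context(ctx)
            val_r = val_r.as_in_context(ctx)
            val_t = val_t.as_in_context(ctx)
        output = net([val_w, val_d, val_r])
        l = loss_function(output, val_t)
        val_losses.append(l.mean().asscalar())
    val_loss = sum(val_losses) / len(val_losses)
    print('epoch: %s, validation loss: %.4f' % (epoch, val_loss))
    sw.add_scalar(tag='validation_loss', value=val_loss, global_step=epoch)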
                  f'time: {batch_end_time - batch_start_time:.2f}')
            print(f's:[{epoch:d}, {i + 1:5d}] '
                  f'loss: {running_loss_s / group_num:.2f}, '
                  f'time: {batch_end_time - batch_start_time:.2f}')
            print('--------------------------------------------------------------------')
            running_loss = 0.0
            running_loss_f = 0.0
            running_loss_o = 0.0
            running_loss_s = 0.0
            batch_start_time = batch_end_time

    epoch_end_time = time.perf_counter()
    print(f'Epoch cost {epoch_end_time - epoch_start_time:.2f} seconds')

    # probably no need to run this after every epoch
    with torch.no_grad():
        # compute validation loss
        compute_val_loss(net, val_loader, loss_function, None, epoch, device,
                         all_data['stats']['stats'])
        # testing
        evaluate(net, test_loader, true_value, num_of_vertices, None, epoch,
                 device, all_data['stats']['stats'])

end_time = time.perf_counter()
print(f'Total running time is {end_time - start_time:.2f} seconds.')
        train_t = train_t.to(device)

        outputs = net([train_w, train_d, train_r])
        # loss is a tensor on the same device as outputs and train_t
        loss = loss_function(outputs, train_t)
        loss.backward()
        optimizer.step()

        # loss.item() returns a plain Python float on the CPU,
        # so running_loss stays a float
        running_loss += loss.item()
        if i % group_num == group_num - 1:
            batch_end_time = time.perf_counter()
            print(f'[{epoch:d}, {i + 1:5d}] '
                  f'loss: {running_loss / group_num:.2f}, '
                  f'time: {batch_end_time - batch_start_time:.2f}')
            running_loss = 0.0
            batch_start_time = batch_end_time

    epoch_end_time = time.perf_counter()
    print(f'Epoch cost {epoch_end_time - epoch_start_time:.2f} seconds')

    # probably no need to run this after every epoch
    with torch.no_grad():
        # compute validation loss
        compute_val_loss(net, val_loader, loss_function, None, epoch, device)
        # testing
        evaluate(net, test_loader, true_value, num_of_vertices, None, epoch, device)

end_time = time.perf_counter()
print(f'Total running time is {end_time - start_time:.2f} seconds.')
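# A minimal sketch of an `evaluate` helper matching the call above, assuming
# `true_value` is a NumPy array aligned with the concatenated test-set
# predictions; MAE / RMSE / MAPE follow their standard definitions, and the
# `sw` argument (None above) is accepted but unused here.
def evaluate(net, test_loader, true_value, num_of_vertices, sw, epoch, device):
    net.eval()
    preds = []
    with torch.no_grad():
        for test_w, test_d, test_r, _ in test_loader:
            test_w = test_w.to(device)
            test_d = test_d.to(device)
            test_r = test_r.to(device)
            preds.append(net([test_w, test_d, test_r]).cpu().numpy())
    prediction = np.concatenate(preds, axis=0)
    mae = np.mean(np.abs(prediction - true_value))
    rmse = np.sqrt(np.mean((prediction - true_value) ** 2))
    # small epsilon keeps MAPE finite when the true value is zero
    mape = np.mean(np.abs(prediction - true_value) / (np.abs(true_value) + 1e-5))
    print(f'epoch {epoch}, test MAE: {mae:.4f}, RMSE: {rmse:.4f}, MAPE: {mape:.4f}')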
          torch.cuda.get_device_name(0))
    model = model.cuda()  # move the model to the GPU if available
else:
    print('[STATUS] Nvidia GPU not found, training using CPU')

criterion = nn.CrossEntropyLoss()  # loss criterion
# model optimizer (see the torch.optim documentation to explore other optimizers)
optimizer = torch.optim.Adam(model.parameters(), lr=float(args['learning_rate']))

print('[INFO] Training and evaluating the model...')

# train and evaluate the model
if args['file_name'] is None:
    print('[WARNING] Trained model will not be saved because no path was '
          'specified. Use the -f or --file_name argument to save the trained model')
else:
    print('[INFO] Trained model will be saved as:', args['file_name'])

final_model, training_losses = train(int(args['epochs']), model, train_loader,
                                     criterion, optimizer, use_cuda,
                                     save_path=args['file_name'])
evaluate(model, test_loader, criterion, use_cuda)

print('[STATUS] Model training and evaluation completed')
print('Job completed!')
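# A minimal sketch of a `train` function compatible with the call above; the
# save-to-`save_path` behavior matches the warning message printed when no
# file name is given, but the return values and loop details are assumptions.
def train(epochs, model, train_loader, criterion, optimizer, use_cuda, save_path=None):
    training_losses = []
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        for inputs, targets in train_loader:
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        epoch_loss /= len(train_loader)
        training_losses.append(epoch_loss)
        print(f'[INFO] epoch {epoch + 1}/{epochs}, loss: {epoch_loss:.4f}')
    if save_path is not None:
        torch.save(model.state_dict(), save_path)
    return model, training_losses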
def main():
    cudnn.benchmark = True
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # use "!=" here: "is not 0" compares identity, not value, and raises a
    # SyntaxWarning on newer Pythons
    assert config.MISC.TEST_INTERVAL != 0, 'Illegal setting: config.MISC.TEST_INTERVAL = 0!'

    # set random seed
    if config.MISC.RANDOM_SEED:
        random.seed(config.MISC.RANDOM_SEED)
        np.random.seed(config.MISC.RANDOM_SEED)
        torch.manual_seed(config.MISC.RANDOM_SEED)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config.MISC.RANDOM_SEED)

    train_transformer = Mytransforms.Compose([
        Mytransforms.KeyAreaCrop(20),
        Mytransforms.RandomRotate(40),
        Mytransforms.TestResized(config.MODEL.IMG_SIZE),
        Mytransforms.RandomHorizontalFlip()
    ])
    test_transformer = Mytransforms.Compose([
        Mytransforms.KeyAreaCrop(20),
        Mytransforms.TestResized(config.MODEL.IMG_SIZE)
    ])

    # train
    source_dset = HandKptDataset(config.DATA.SOURCE.TRAIN.DIR, config.DATA.SOURCE.TRAIN.LBL_FILE,
                                 stride=config.MODEL.HEATMAP_STRIDE, transformer=train_transformer)
    # target_dset = HandKptDataset(config.DATA.TARGET.TRAIN.DIR, config.DATA.TARGET.TRAIN.LBL_FILE,
    #                              stride=config.MODEL.HEATMAP_STRIDE, transformer=train_transformer)
    source_val_dset = HandKptDataset(config.DATA.SOURCE.VAL.DIR, config.DATA.SOURCE.VAL.LBL_FILE,
                                     stride=config.MODEL.HEATMAP_STRIDE, transformer=test_transformer)
    target_val_dset = HandKptDataset(config.DATA.TARGET.VAL.DIR, config.DATA.TARGET.VAL.LBL_FILE,
                                     stride=config.MODEL.HEATMAP_STRIDE, transformer=test_transformer)

    # source only
    train_loader = torch.utils.data.DataLoader(
        source_dset, batch_size=config.TRAIN.BATCH_SIZE, shuffle=True,
        num_workers=config.MISC.WORKERS, pin_memory=True)
    # val
    source_val_loader = torch.utils.data.DataLoader(
        source_val_dset, batch_size=config.TRAIN.BATCH_SIZE, shuffle=False,
        num_workers=config.MISC.WORKERS, pin_memory=True)
    target_val_loader = torch.utils.data.DataLoader(
        target_val_dset, batch_size=config.TRAIN.BATCH_SIZE, shuffle=False,
        num_workers=config.MISC.WORKERS, pin_memory=True)

    logger = Logger(ckpt_path=os.path.join(config.DATA.CKPT_PATH, config.PROJ_NAME),
                    tsbd_path=os.path.join(config.DATA.VIZ_PATH, config.PROJ_NAME))

    net = pose_resnet.get_pose_net(config).to(device)
    optimizer = torch.optim.Adam(net.parameters(), config.TRAIN.BASE_LR,
                                 weight_decay=config.TRAIN.WEIGHT_DECAY)

    input_shape = (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMG_SIZE, config.MODEL.IMG_SIZE)
    logger.add_graph(net, input_shape, device)

    if len(config.MODEL.RESUME) > 0:
        print("=> loading checkpoint '{}'".format(config.MODEL.RESUME))
        resume_ckpt = torch.load(config.MODEL.RESUME)
        net.load_state_dict(resume_ckpt['net'])
        optimizer.load_state_dict(resume_ckpt['optim'])
        config.TRAIN.START_ITERS = resume_ckpt['iter']
        logger.global_step = resume_ckpt['iter']
        logger.best_metric_val = resume_ckpt['best_metric_val']

    net = torch.nn.DataParallel(net)

    if config.EVALUATE:
        pck05, pck2 = evaluate(net, target_val_loader, img_size=config.MODEL.IMG_SIZE,
                               vis=True, logger=logger,
                               disp_interval=config.MISC.DISP_INTERVAL)
        print("=> validate PCK@0.05 = {}, PCK@0.2 = {}".format(pck05 * 100, pck2 * 100))
        return

    criterion = nn.SmoothL1Loss(reduction='none').to(device)

    total_progress_bar = tqdm.tqdm(desc='Train iter', ncols=80,
                                   total=config.TRAIN.MAX_ITER,
                                   initial=config.TRAIN.START_ITERS)
    epoch = 0
    while logger.global_step < config.TRAIN.MAX_ITER:
        for (stu_inputs, stu_heatmap, _) in tqdm.tqdm(
                train_loader, total=len(train_loader),
                desc='Current epoch', ncols=80, leave=False):
            stu_inputs = stu_inputs.to(device)
            stu_heatmap = stu_heatmap.to(device)

            stu_heats = net(stu_inputs)
            # per-element SmoothL1, summed and averaged over the batch
            loss = criterion(stu_heats, stu_heatmap).sum() / stu_inputs.size(0)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # val (the source/target scalar tag names below are assumed)
            if logger.global_step % config.MISC.TEST_INTERVAL == 0:
                pck05, pck2 = evaluate(net, source_val_loader, img_size=config.MODEL.IMG_SIZE,
                                       vis=True, logger=logger,
                                       disp_interval=config.MISC.DISP_INTERVAL,
                                       show_gt=(logger.global_step == 0), is_target=False)
                logger.add_scalar('source_PCK@0.05', pck05 * 100)
                logger.add_scalar('source_PCK@0.2', pck2 * 100)

                pck05, pck2 = evaluate(net, target_val_loader, img_size=config.MODEL.IMG_SIZE,
                                       vis=True, logger=logger,
                                       disp_interval=config.MISC.DISP_INTERVAL,
                                       show_gt=(logger.global_step == 0), is_target=True)
                logger.add_scalar('target_PCK@0.05', pck05 * 100)
                logger.add_scalar('target_PCK@0.2', pck2 * 100)

                logger.save_ckpt(state={
                    'net': net.module.state_dict(),
                    'optim': optimizer.state_dict(),
                    'iter': logger.global_step,
                    'best_metric_val': logger.best_metric_val,
                }, cur_metric_val=pck05)

            logger.step(1)
            total_progress_bar.update(1)

            # log
            logger.add_scalar('regress_loss', loss.item())
        epoch += 1
    total_progress_bar.close()
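# A minimal sketch of the PCK ("Percentage of Correct Keypoints") metric that
# the pck05 / pck2 values above report: a predicted keypoint counts as correct
# when its distance to the ground truth falls below a threshold fraction of a
# reference size. Normalizing by the image size is an assumption here; PCK
# variants also normalize by bounding-box or head size.
import numpy as np

def pck(pred_kpts, gt_kpts, ref_size, threshold):
    """pred_kpts, gt_kpts: (N, K, 2) arrays of (x, y) keypoint coordinates."""
    dists = np.linalg.norm(pred_kpts - gt_kpts, axis=-1)  # (N, K) distances
    return float(np.mean(dists < threshold * ref_size))

# e.g. pck05 = pck(preds, gts, config.MODEL.IMG_SIZE, 0.05)
#      pck2  = pck(preds, gts, config.MODEL.IMG_SIZE, 0.2)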