Code Example #1
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the buffer
        self.buffer = ReplayBeffer(buffer_maxlen)
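
Note: the ValueNet definition itself is not shown in any of the examples on this page, and its constructor signature differs across projects (a single state_dim here, (n_state, 1) in the A3C examples, a config object elsewhere). As a rough reference only, a state-value network matching the single-argument usage in this SAC example might look like the minimal sketch below; the class name, hidden size, and layer structure are assumptions, not the original implementation.

import torch.nn as nn

class ValueNetSketch(nn.Module):
    # Minimal V(s) network: maps a state vector to a single scalar value.
    # The hidden size of 256 is an arbitrary assumption.
    def __init__(self, state_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, state):
        return self.net(state)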
Code Example #2
File: dqn.py  Project: paul-hyun/reinforceNLP
    def __init__(self, config):
        self.config = config
        self.epsilon = config.epsilon

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the value network
        self.model = ValueNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)
Code Example #3
File: a3c.py  Project: paul-hyun/reinforceNLP
    def __init__(self, config):
        self.config = config

        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
Code Example #4
File: a3c.py  Project: paul-hyun/reinforceNLP
    def __init__(self, config):
        self.config = config

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)
Code Example #5
File: qlearning.py  Project: Ienu/2048-CSharp
    def __init__(self):
        self.value_net = ValueNet(2)
        self.target_value_net = ValueNet(2)

        # self.value_net.load_state_dict(torch.load('./value_net.pkl'))
        torch.save(self.value_net.state_dict(), './value_net.pkl')
        self.target_value_net.load_state_dict(torch.load('./value_net.pkl'))

        self.episode = 0
        self.explore = 0.3

        self.buffer = []
        self.buffer_capacity = 20
        self.buffer_index = 0
Code Example #6
File: a3c.py  Project: paul-hyun/reinforceNLP
class A3CLocal:
    def __init__(self, config):
        self.config = config

        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)

    # Select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Add a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Read and clear the replay memory
    def get_replay(self):
        # Arrange the history as an array
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        return states, actions, rewards, next_states

    # Copy the global network's weights into the local network
    def update_local_model(self, actor_dict, critic_dict):
        self.actor.load_state_dict(actor_dict)
        self.critic.load_state_dict(critic_dict)

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
Code Example #7
File: train.py  Project: lumenghe/game
 def run_rl_game(i):
     print("Generating RL game number {} from generation {}".format(i, rl_step-1))
     np.random.seed(int.from_bytes(os.urandom(4), byteorder='little'))
     random.seed(int.from_bytes(os.urandom(4), byteorder='little'))
     value_net = ValueNet(rl_model_filepath, rl_step-1) # load from previous generation
     rl_player = RLValueMinimaxPlayer(value_net, rl_minimax_depth)
     _, _, trace = play_game(rl_player, rl_player, verbose=1, limit_to_draw=limit_to_draw, random_burn_in=random_burn_in, trace_min=trace_min)
     return trace
Code Example #8
def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch),
                            map_location=device)

    policy_net = PolicyNet(layer_sizes).to(device)
    value_net = ValueNet(input_size).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()
    value_net.load_state_dict(checkpoint["value_net"])
    value_net.train()

    policy_lr = checkpoint["policy_lr"]
    valuenet_lr = checkpoint["valuenet_lr"]

    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])
    valuenet_optim = optim.Adam(value_net.parameters(), lr=valuenet_lr)
    valuenet_optim.load_state_dict(checkpoint["valuenet_optim"])

    checkpoint.pop("policy_net")
    checkpoint.pop("value_net")
    checkpoint.pop("policynet_optim")
    checkpoint.pop("valuenet_optim")
    checkpoint.pop("i_epoch")
    checkpoint.pop("policy_lr")
    checkpoint.pop("valuenet_lr")

    return policy_net, value_net, policynet_optim, valuenet_optim, checkpoint
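
A call to load_checkpoint above might look like the following sketch; the directory, epoch index, and network sizes are illustrative assumptions, not values from the original project.

# Hypothetical usage of load_checkpoint; every argument value below is an assumption.
policy_net, value_net, policynet_optim, valuenet_optim, extra = load_checkpoint(
    file_dir="./checkpoints",    # assumed checkpoint directory
    i_epoch=500,                 # assumed epoch index (expects ckpt_eps500.pt)
    layer_sizes=[8, 64, 64, 2],  # assumed PolicyNet layer sizes
    input_size=8,                # assumed ValueNet input size
    device="cpu",
)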
Code Example #9
File: train.py  Project: lumenghe/game
 def run_test_game(i):
     np.random.seed(int.from_bytes(os.urandom(4), byteorder='little'))
     value_net = ValueNet(rl_model_filepath, rl_step-1) # load from previous generation
     rl_player = RLValueMinimaxPlayer(value_net, rl_minimax_depth)
     minimax_player = SimpleMinimaxPlayer(base_minimax_depth)
     if i % 2:
         win, step, _ = play_game(rl_player, minimax_player, verbose=0, limit_to_draw=limit_to_draw)
     else:
         win, step, _ = play_game(minimax_player, rl_player, verbose=0, limit_to_draw=limit_to_draw)
     return win, step
Code Example #10
File: ai.py  Project: lumenghe/game
def play(board, color):
    rl_minimax_depth = 8
    rl_step = 21
    rl_model_filepath = './mlp_200_model.h5'
    value_net = ValueNet(rl_model_filepath,
                         rl_step)  # load from previous generation
    rl_player = RLValueMinimaxPlayer(value_net, rl_minimax_depth)
    nboard = board if isinstance(board, np.ndarray) else board_to_numpy(board)
    best_move, _ = rl_player.play(nboard, color)
    return best_move
Code Example #11
def load_model_checkpoint(c):  # returns the model at the given checkpoint

    dir_name = tf.train.latest_checkpoint(c.model_dir)
    #if ver_name =='None':
    #    check_or_make_dir(dir_name)
        
    #else:
    #    dir_name = os.path.join(dir_name,ver_name)
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    time_step = dummy_env.reset()

    temp = ValueNet(**c.model_vars)
    # initialize model
    temp(time_step.observation)
    checkpoint2 = tf.train.Checkpoint(module=temp)
    status = checkpoint2.restore(dir_name)
    return temp, checkpoint2
Code Example #12
class SAC:
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the buffer
        self.buffer = ReplayBeffer(buffer_maxlen)

    def get_action(self, state):
        action = self.policy_net.action(state)
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                 (self.action_range[1] + self.action_range[0]) / 2.0

        return action

    def update(self, batch_size):
        state, action, reward, next_state, done = self.buffer.sample(batch_size)
        new_action, log_prob = self.policy_net.evaluate(state)

        # V value loss
        value = self.value_net(state)
        new_q1_value = self.q1_net(state, new_action)
        new_q2_value = self.q2_net(state, new_action)
        next_value = torch.min(new_q1_value, new_q2_value) - log_prob
        value_loss = F.mse_loss(value, next_value.detach())

        # Soft Q loss
        q1_value = self.q1_net(state, action)
        q2_value = self.q2_net(state, action)
        target_value = self.target_value_net(next_state)
        target_q_value = reward + done * self.gamma * target_value
        q1_value_loss = F.mse_loss(q1_value, target_q_value.detach())
        q2_value_loss = F.mse_loss(q2_value, target_q_value.detach())

        # Policy loss
        policy_loss = (log_prob - torch.min(new_q1_value, new_q2_value)).mean()

        # Update v
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update Soft q
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        q1_value_loss.backward()
        q2_value_loss.backward()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Update Policy
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target networks
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
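
As a rough illustration of how the class above might be driven, the sketch below runs an episode loop with a Gym-style environment. The ReplayBeffer API (push and len support) is not shown on this page, so those calls are assumptions, and all hyperparameter values are arbitrary.

# Hypothetical training loop for the SAC agent above (buffer API and hyperparameters are assumptions).
agent = SAC(env, gamma=0.99, tau=0.005, buffer_maxlen=100000,
            value_lr=3e-4, q_lr=3e-4, policy_lr=3e-4)
batch_size = 128

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.buffer.push((state, action, reward, next_state, done))  # assumed buffer method
        state = next_state
        if len(agent.buffer) > batch_size:  # assumes the buffer supports len()
            agent.update(batch_size)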
Code Example #13
File: qlearning.py  Project: Ienu/2048-CSharp
class Qlearning:
    def __init__(self):
        self.value_net = ValueNet(2)
        self.target_value_net = ValueNet(2)

        # self.value_net.load_state_dict(torch.load('./value_net.pkl'))
        torch.save(self.value_net.state_dict(), './value_net.pkl')
        self.target_value_net.load_state_dict(torch.load('./value_net.pkl'))

        self.episode = 0
        self.explore = 0.3

        self.buffer = []
        self.buffer_capacity = 20
        self.buffer_index = 0

    def value(self, board):
        board_np = np.array(board, dtype=np.float32)
        board_flat = board_np.flatten()
        board_tensor = torch.from_numpy(board_flat)
        values_tensor = self.value_net(board_tensor.detach())
        return values_tensor

    def target_value(self, board):
        board_np = np.array(board, dtype=np.float32)
        board_flat = board_np.flatten()
        board_tensor = torch.from_numpy(board_flat)
        target_values_tensor = self.target_value_net(board_tensor.detach())
        return target_values_tensor

    def action(self, board):
        epsilon = random.random()
        if self.explore <= 0.9:
            self.explore *= 1.001
        print('explore = %.3f' % self.explore)
        if epsilon > self.explore:
            action_num = random.randint(0, 3)
            return action_num

        board_np = np.array(board, dtype=np.float32)
        board_flat = board_np.flatten()
        board_tensor = torch.from_numpy(board_flat)
        values_tensor = self.value(board_tensor.detach())
        values_np = values_tensor.detach().numpy()
        max_idx = np.argmax(values_np.tolist())
        return max_idx

    def train(self):
        # randomly select samples as one batch
        train_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        sample_size = 10  #self.buffer_capacity

        indices = np.random.choice(range(10), sample_size)
        sampler1 = torch.utils.data.SubsetRandomSampler(indices)

        print('sampler1 = ', sampler1)
        exit()  # NOTE: execution stops here; the loss computation below references undefined names (v, v_, reward, idx)

        value_loss = self.value_net.loss_function(v, v_.detach(), reward)
        self.value_net.zero_grad()
        value_loss[idx].backward()
        self.value_net.opt_Adam.step()

    def update(self):
        self.episode += 1
        print('episode = ', self.episode)
        if self.episode % 2 == 0:
            torch.save(self.value_net.state_dict(), './value_net.pkl')
            self.target_value_net.load_state_dict(
                torch.load('./value_net.pkl'))
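
For context on the schedule in action() above: explore starts at 0.3 and is multiplied by 1.001 on each call until it reaches 0.9, and a random move is chosen only when the drawn epsilon exceeds explore, so random moves become less frequent as training proceeds. A quick check of how many calls the ramp-up takes:

import math

# Number of action() calls needed for explore to grow from 0.3 to 0.9 at a 1.001 multiplier.
steps = math.log(0.9 / 0.3) / math.log(1.001)
print(round(steps))  # ~1099 calls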
Code Example #14
File: run.py  Project: ajy8456/POMDP
def main():
    config = Settings()
    # |TODO| go to Setting()
    train_filename = config.train_file
    # train_filename_1 = config.train_file_1
    # train_filename_2 = config.train_file_2
    test_filename = config.test_file
    dataset_path = os.path.join(os.getcwd(), config.path)

    if not os.path.exists(config.exp_dir):
        os.mkdir(config.exp_dir)
    model_dir = os.path.join(config.exp_dir, config.model_name)

    logger = SummaryWriter(model_dir)

    if config.data_type == 'success':
        # with open(os.path.join(dataset_path, train_filename), 'rb') as f:
        #     train_dataset = pickle.load(f)
        # with open(os.path.join(dataset_path, test_filename), 'rb') as f:
        #     test_dataset = pickle.load(f)

        dataset = glob.glob(f'{dataset_path}/{train_filename}/*.pickle')
        # test_dataset = glob.glob(f'{dataset_path}/{test_filename}/*.pickle')
        # train_dataset = dataset[:1500000]
        # test_dataset = dataset[-200000:]
        train_dataset = dataset[:-20000]
        test_dataset = dataset[-20000:]

        print('#trajectories of train_dataset:', len(train_dataset))
        print('#trajectories of test_dataset:', len(test_dataset))

    elif config.data_type == 'mcts':
        dataset = glob.glob(f'{dataset_path}/{train_filename}/*.pickle')
        train_dataset = dataset[:-20000]
        test_dataset = dataset[-20000:]

        # train_dataset = glob.glob(f'{dataset_path}/{train_filename}/*.pickle')
        # test_dataset = glob.glob(f'{dataset_path}/{test_filename}/*.pickle')

        if config.filter:
            filtered_data_train = []
            filtered_data_test = []
            total_reward_filt = []
            total_reward_not_filt = []
            avg_total_reward_not_filt = 0
            avg_total_reward_filt = 0
            for data in train_dataset:
                with open(data, 'rb') as f:
                    traj = pickle.load(f)
                    avg_total_reward_not_filt += traj[-1]
                    total_reward_not_filt.append(traj[-1])
                    if traj[-1] > config.filter:
                        filtered_data_train.append(data)
                        avg_total_reward_filt += traj[-1]
                        total_reward_filt.append(traj[-1])

            for data in test_dataset:
                with open(data, 'rb') as f:
                    traj = pickle.load(f)
                    if traj[-1] > config.filter:
                        filtered_data_test.append(data)

            total_reward_not_filt_std = np.std(
                np.asarray(total_reward_not_filt))
            total_reward_filt_std = np.std(np.asarray(total_reward_filt))
            print('Average of total reward(not filtered):',
                  avg_total_reward_not_filt / len(train_dataset))
            print('std of total reward(not filtered):',
                  total_reward_not_filt_std)
            print('Average of total reward(filtered):',
                  avg_total_reward_filt / len(filtered_data_train))
            print('std of total reward(filtered):', total_reward_filt_std)

            train_dataset = filtered_data_train
            test_dataset = filtered_data_test

        print('#trajectories of train_dataset:', len(train_dataset))
        print('#trajectories of test_dataset:', len(test_dataset))

    # # For mixed dataset
    # train_dataset_1 = glob.glob(f'{dataset_path}/{train_filename_1}/*.pickle')

    # dataset_2 = glob.glob(f'{dataset_path}/{train_filename_2}/*.pickle')
    # train_dataset_2 = dataset_2[:100000]
    # test_dataset = dataset_2[100000:]

    # if config.filter:
    #     filtered_data_train = []
    #     filtered_data_test = []
    #     total_reward_filt = []
    #     total_reward_not_filt = []
    #     avg_total_reward_not_filt = 0
    #     avg_total_reward_filt = 0
    #     for data in train_dataset_2:
    #         with open(data, 'rb') as f:
    #             traj = pickle.load(f)
    #             avg_total_reward_not_filt += traj[-1]
    #             total_reward_not_filt.append(traj[-1])
    #             if traj[-1] > config.filter:
    #                 filtered_data_train.append(data)
    #                 avg_total_reward_filt += traj[-1]
    #                 total_reward_filt.append(traj[-1])

    #     for data in test_dataset:
    #         with open(data, 'rb') as f:
    #             traj = pickle.load(f)
    #             if traj[-1] > config.filter:
    #                 filtered_data_test.append(data)

    #     total_reward_not_filt_std = np.std(np.asarray(total_reward_not_filt))
    #     total_reward_filt_std = np.std(np.asarray(total_reward_filt))
    #     print('Average of total reward(not filtered):', avg_total_reward_not_filt/len(train_dataset_2))
    #     print('std of total reward(not filtered):', total_reward_not_filt_std)
    #     print('Average of total reward(filtered):', avg_total_reward_filt/len(filtered_data_train))
    #     print('std of total reward(filtered):', total_reward_filt_std)

    #     train_dataset = train_dataset_1 + filtered_data_train
    #     test_dataset = filtered_data_test

    # print('#trajectories of train_dataset:', len(train_dataset))
    # print('#trajectories of test_dataset:', len(test_dataset))

    # generate dataloader
    train_loader = get_loader(config, train_dataset)
    test_loader = get_loader(config, test_dataset)

    # model
    device = th.device(config.device)
    if config.model == 'GPT':
        model = GPT2(config).to(device)
    elif config.model == 'RNN':
        model = RNN(config).to(device)
    elif config.model == 'LSTM':
        model = LSTM(config).to(device)
    elif config.model == 'CVAE' or config.model == 'PolicyValueNet':
        model = CVAE(config).to(device)
    elif config.model == 'ValueNet':
        model = ValueNet(config).to(device)
    else:
        raise Exception(
            f'"{config.model}" is not support!! You should select "GPT", "RNN", "LSTM", "CVAE", "ValueNet", or "PolicyValueNet.'
        )

    # optimizer
    optimizer = th.optim.AdamW(model.parameters(),
                               lr=config.learning_rate,
                               weight_decay=config.weight_decay)

    # learning rate scheduler
    if config.optimizer == 'AdamW':
        scheduler = th.optim.lr_scheduler.LambdaLR(
            optimizer, lambda step: min((step + 1) / config.warmup_step, 1))
    elif config.optimizer == 'AdamWR':
        scheduler = CosineAnnealingWarmUpRestarts(optimizer=optimizer,
                                                  T_0=config.T_0,
                                                  T_mult=config.T_mult,
                                                  eta_max=config.lr_max,
                                                  T_up=config.warmup_step,
                                                  gamma=config.lr_mult)
    else:
        raise Exception(
            f'"{config.optimizer}" is not support!! You should select "AdamW" or "AdamWR".'
        )

    # Metric
    # |TODO| implement Chamfer distance
    if config.model == 'CVAE':
        loss_fn = ELBOLoss(config)
        eval_fn = ELBOLoss(config)
    elif config.model == 'ValueNet':
        loss_fn = RegressionLossValue(config)
        eval_fn = RegressionLossValue(config)
    elif config.model == 'PolicyValueNet':
        loss_fn = None
        eval_fn = None
    else:
        loss_fn = RegressionLossPolicy(config)
        eval_fn = RegressionLossPolicy(config)

    # Trainer & Evaluator
    trainer = Trainer(config=config,
                      loader=train_loader,
                      model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_fn=loss_fn,
                      eval_fn=eval_fn)
    evaluator = Evaluator(config=config,
                          loader=test_loader,
                          model=model,
                          eval_fn=eval_fn)

    # save configuration
    config.save(model_dir + '/config.yaml')
    # Logging model graph
    dummy = next(iter(test_loader))
    for k in dummy:
        dummy[k].to(device).detach()
    logger.add_graph(ModelAsTuple(config, model), dummy)

    start_epoch = 1
    best_error = 10000.

    # load checkpoint for resuming
    if config.resume is not None:
        filename = os.path.join(model_dir, config.resume)
        if os.path.isfile(filename):
            start_epoch, best_error, model, optimizer, scheduler = load_checkpoint(
                config, filename, model, optimizer, scheduler)
            start_epoch += 1
            print("Loaded checkpoint '{}' (epoch {})".format(
                config.resume, start_epoch))
        else:
            raise Exception("No checkpoint found at '{}'".format(
                config.resume))

    # load checkpoint for pre-trained
    if config.pre_trained is not None:
        pre_trained_path = os.path.join(config.exp_dir, config.pre_trained)
        if os.path.isfile(pre_trained_path):
            start_epoch, best_error, model, optimizer, scheduler = load_checkpoint(
                config, pre_trained_path, model, optimizer, scheduler)
            start_epoch = 1
            print("Loaded checkpoint '{}'".format(config.pre_trained))
        else:
            raise Exception("No checkpoint found at '{}'".format(
                config.resume))

    for epoch in range(start_epoch, config.epochs + 1):
        print(f'===== Start {epoch} epoch =====')

        # Training one epoch
        print("Training...")
        train_loss, train_val = trainer.train(epoch)

        # Logging
        if config.model == 'CVAE':
            logger.add_scalar('Loss(total)/train', train_loss['total'], epoch)
            logger.add_scalar('Loss(Reconstruction)/train',
                              train_loss['Recon'], epoch)
            logger.add_scalar('Loss(KL_divergence)/train',
                              train_loss['KL_div'], epoch)
        elif config.model == 'ValueNet':
            logger.add_scalar('Loss/train', train_loss['total'], epoch)
        elif config.model == 'PolicyValueNet':
            logger.add_scalar('Loss(total)/train', train_loss['total'], epoch)
            logger.add_scalar('Loss(action)/train', train_loss['action'],
                              epoch)
            logger.add_scalar('Loss(accumulated reward)/train',
                              train_loss['accumulated_reward'], epoch)
            # logger.add_scalar('Eval(action)/train', train_val['action'], epoch)
        else:
            logger.add_scalar('Loss(total)/train', train_loss['total'], epoch)
            logger.add_scalar('Loss(action)/train', train_loss['action'],
                              epoch)
            # if config.use_reward:
            #     logger.add_scalar('Loss(reward)/train', train_loss['reward'], epoch)

            # logger.add_scalar('Eval(action)/train', train_val['action'], epoch)
            # if config.use_reward:
            #     logger.add_scalar('Eval(reward)/train', train_val['reward'], epoch)

        # |FIXME| debug for eff_grad: "RuntimeError: Boolean value of Tensor with more than one value is ambiguous"
        log_gradients(model,
                      logger,
                      epoch,
                      log_grad=config.log_grad,
                      log_param=config.log_para,
                      eff_grad=config.eff_grad,
                      print_num_para=config.print_num_para)

        # evaluating
        if epoch % config.test_eval_freq == 0:
            print("Validating...")
            test_val = evaluator.eval(epoch)

            # save the best model
            # |TODO| change 'action' to 'total' @ trainer.py & evaluator.py -> merge 'CVAE' & others
            if config.model == 'CVAE' or config.model == 'ValueNet' or config.model == 'PolicyValueNet':
                if test_val['total'] < best_error:
                    best_error = test_val['total']

                    save_checkpoint('Saving the best model!',
                                    os.path.join(model_dir, 'best.pth'), epoch,
                                    best_error, model, optimizer, scheduler)
            else:
                if test_val['action'] < best_error:
                    best_error = test_val['action']

                    save_checkpoint('Saving the best model!',
                                    os.path.join(model_dir, 'best.pth'), epoch,
                                    best_error, model, optimizer, scheduler)

            # Logging
            if config.model == 'CVAE':
                logger.add_scalar('Eval(total)/test', test_val['total'], epoch)
                logger.add_scalar('Eval(Reconstruction)/test',
                                  test_val['Recon'], epoch)
                logger.add_scalar('Eval(KL_divergence)/test',
                                  test_val['KL_div'], epoch)
            elif config.model == 'ValueNet':
                logger.add_scalar('Eval/test', test_val['total'], epoch)
            elif config.model == 'PolicyValueNet':
                logger.add_scalar('Eval(total)/test', test_val['total'], epoch)
                logger.add_scalar('Eval(action)/test', test_val['action'],
                                  epoch)
                logger.add_scalar('Eval(accumulated reward)/test',
                                  test_val['accumulated_reward'], epoch)
            else:
                logger.add_scalar('Eval(action)/test', test_val['action'],
                                  epoch)
                # if config.use_reward:
                #     logger.add_scalar('Eval(reward)/test', test_val['reward'], epoch)

        # save the model
        if epoch % config.save_freq == 0:
            save_checkpoint('Saving...',
                            os.path.join(model_dir, f'ckpt_epoch_{epoch}.pth'),
                            epoch, best_error, model, optimizer, scheduler)

        print(f'===== End {epoch} epoch =====')
Code Example #15
File: main.py  Project: zxjzxj9/RLexp
import torch
from torch import optim, distributions
import torch.nn.functional as F

env = gym.make("CartPole-v1")
# observation = env.reset()
# print(observation)
# print(env.observation_space)

MAXSTEP = 100
BATCHSIZE = 16
EPOCH = 1000
GAMMA = 0.99

policy_net = PolicyNet()
value_net = ValueNet()

policy_net.cuda()
value_net.cuda()
opt1 = optim.Adam(policy_net.parameters(), lr=1e-3)
opt2 = optim.Adam(value_net.parameters(), lr=1e-3)


# train one epoch
def train_step():

    observ_batch = []
    reward_batch = []
    action_batch = []
    mask_batch = []
Code Example #16
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim),
                                         dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                dis_net,
                a_dim,
                beta,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device,
                conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance(
            )
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps        = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time     = {:.2f} sec".format(n_sec))
            print("FPS              = {:d}".format(fps))
            print("actor loss       = {:.6f}".format(pg_loss))
            print("critic loss      = {:.6f}".format(v_loss))
            print("dis loss         = {:.6f}".format(dis_loss))
            print("entropy          = {:.6f}".format(ent))
            print("avg_kl           = {:.6f}".format(avg_kl))
            print("beta             = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return      = {:.6f}".format(mean_return))
            print("mean length      = {:.2f}".format(mean_len))
            print("dis_real         = {:.3f}".format(dis_real))
            print("dis_fake         = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "beta": agent.beta,
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict(),
                    "DiscriminatorNet": dis_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Code Example #17
File: a3c.py  Project: paul-hyun/reinforceNLP
class A3CGlobal:
    def __init__(self, config):
        self.config = config

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)

    # Compute the returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards),
                              dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each time step
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states,
                                   dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)

        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])

        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Function that updates the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Function that updates the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
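
get_returns above accumulates discounted rewards from the end of the trajectory, bootstrapping from next_value when the episode is not done. The standalone snippet below reproduces the same recurrence with arbitrary numbers just to show the resulting values.

# Minimal standalone version of the discounted-return recurrence used by get_returns.
# The reward values and discount factor are arbitrary; done == True, so no bootstrap value.
rewards = [1.0, 1.0, 1.0]
discount_factor = 0.99
R = 0.0
returns = [0.0] * len(rewards)
for i in reversed(range(len(rewards))):
    R = rewards[i] + discount_factor * R
    returns[i] = R
print(returns)  # approximately [2.9701, 1.99, 1.0]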
Code Example #18
File: train.py  Project: lumenghe/game
 def learn(i):
     boards, values = zip(*data_buffer)
     value_net = ValueNet(rl_model_filepath, rl_step-1)
     value_net.learn(boards, values, epochs=epochs, batch_size=batch_size)
Code Example #19
File: dqn.py  Project: paul-hyun/reinforceNLP
class DQNAgent:
    def __init__(self, config):
        self.config = config
        self.epsilon = config.epsilon

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the value network
        self.model = ValueNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)

    # Select an action based on the network's output (epsilon-greedy)
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.config.n_action)
        else:
            state = torch.tensor(state,
                                 dtype=torch.float).to(self.config.device)
            output = self.model(state)
            return output.argmax().item()

    # Add a transition to the history
    def append_replay(self, state, action, reward, next_state, done):
        self.replay_memory.append((state, action, reward, next_state, done))

    # Update the value network at each time step
    def train_model(self):
        # Gradually reduce the exploration rate as training continues
        if self.epsilon > self.config.epsilon_min:
            self.epsilon *= self.config.epsilon_decay

        # Arrange the history as an array
        replay_memory = np.array(
            random.sample(self.replay_memory, self.config.n_batch))
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])
        dones = list(replay_memory[:, 4])

        states = torch.tensor(states, dtype=torch.float).to(device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(device)

        targets = self.model(states)
        next_values = self.model(next_states)

        for i in range(len(targets)):
            if dones[i]:
                targets[i][actions[i]] = rewards[i]  # Vt = Rt+1
            else:
                targets[i][
                    actions[i]] = rewards[i] + self.config.discount_factor * (
                        torch.max(next_values[i]))  # Vt = Rt+1 + rVt+1

        loss = self.train_value(states, targets)

        return loss

    # Function that updates the value network
    def train_value(self, states, targets):
        values = self.model(states)
        loss = torch.mean(torch.pow(targets - values, 2))

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # Load the model weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # Release GPU memory
    def close(self):
        del self.model
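
DQNAgent is constructed from a config object; the sketch below builds a hypothetical config using only attribute names that appear in the code above, with illustrative values (none of them come from the original project).

from types import SimpleNamespace

# Hypothetical config for DQNAgent; every field and value here is an assumption
# inferred from the attributes the class accesses above.
config = SimpleNamespace(
    epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995,
    n_replay_memory=10000, n_batch=32,
    n_state=4, n_action=2,
    discount_factor=0.99, learning_rate=1e-3,
    device="cpu", save_file="./dqn_model.pt",
)
agent = DQNAgent(config)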
Code Example #20
# Turn on pyplot's interactive mode
# VERY IMPORTANT because otherwise the training stats plot will halt
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes, action_lim).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)  # Value network

# Set up memory
memory = Memory(capacity, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
Code Example #21
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps    = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS          = {:d}".format(fps))
            print("actor loss   = {:.6f}".format(pg_loss))
            print("critic loss  = {:.6f}".format(v_loss))
            print("entropy      = {:.6f}".format(ent))
            print("mean return  = {:.6f}".format(mean_return))
            print("mean length  = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Code Example #22
class A2CAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Add a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each time step
    def train_model(self, done):
        # Arrange the history as an array
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)

        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])

        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss
    
    # Function that updates the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()
    
    # Function that updates the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")
    
    # Load the model weights from a file
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))
    
    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
Code Example #23
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    clip_val = 0.2
    sample_mb_size = 64
    sample_n_epoch = 4
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.01
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 10000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    runner = EnvRunner(s_dim,
                       a_dim,
                       gamma,
                       lamb,
                       max_step=2048,
                       device=device,
                       conti=not args.discrete)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()
    mean_total_reward = 0
    mean_length = 0

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(
                env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_obs)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(
            it, mb_rewards.sum(), len(mb_obs)))

        #Print the result
        if it % disp_step == 0:
            print("\n[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Elapsed time = {:.2f} sec".format(time.time() - t_start))
            print("actor loss   = {:.6f}".format(pg_loss))
            print("critic loss  = {:.6f}".format(v_loss))
            print("entropy      = {:.6f}".format(ent))
            print("mean return  = {:.6f}".format(mean_total_reward /
                                                 disp_step))
            print("mean length  = {:.2f}".format(mean_length / disp_step))
            print()

            agent.lr_decay(it, n_iter)
            mean_total_reward = 0
            mean_length = 0

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()