Example #1
def main():
    # Create the game environment
    env = create_train_env(world=1, stage=1, actions=COMPLEX_MOVEMENT)

    print(env.observation_space.shape)
    print(env.action_space.n)

    obs = env.reset()

    while True:
        # Sample a random action (an int) from the action space
        action = env.action_space.sample()
        # Step the environment
        obs, reward, terminal, info = env.step(action)
        obs = np.squeeze(obs)
        obses = obs[0]
        for i in range(1, obs.shape[0]):
            obses = np.hstack([obses, obs[i]])
        cv2.imshow('obs', obses)
        cv2.waitKey(1)  # needed so the OpenCV window actually refreshes
        env.render()
        print("=" * 50)
        print("action:", action)
        print("obs shape:", obs.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        if terminal:
            obs = env.reset()
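
The examples on this page rely on a create_train_env helper defined in each repository rather than shown here, and its signature varies (world/stage plus an action set in most examples, a single game name in Examples #2, #9 and #10). As a rough, hypothetical sketch of what such a helper looks like for the Super Mario Bros examples, here is a minimal version based on the inline JoypadSpace construction commented out in Example #11; the real helpers additionally apply frame preprocessing and frame stacking, which is why the observations above are stacks of frames.

# Minimal sketch only; not the exact helper used above. Assumes the
# gym_super_mario_bros and nes_py packages are installed.
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
from nes_py.wrappers import JoypadSpace


def create_train_env(world, stage, actions=COMPLEX_MOVEMENT):
    env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage))
    return JoypadSpace(env, actions)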
Example #2
def main():
    # Create the game environment
    env = create_train_env(game="SuperMarioBros-Nes")
    print(env.observation_space.shape)
    print(env.action_space.n)

    obs = env.reset()

    while True:
        # Sample a random action (an int) from the action space
        action = env.action_space.sample()
        # Step the environment
        obs, reward, terminal, info = env.step(action)
        # Display the stacked consecutive frames
        obs = np.squeeze(obs)
        obses = obs[0]
        for i in range(1, obs.shape[0]):
            obses = np.hstack([obses, obs[i]])
        cv2.imshow('obs', obses)
        cv2.waitKey(1)
        env.render()
        print("=" * 50)
        print("action:", action)
        print("obs shape:", obs.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        if terminal:
            obs = env.reset()
Example #3
def infer(args):
    # Fix the random seed for reproducible initialization
    paddle.seed(123)
    # Use the GPU for inference if available
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Select the action set for the game
    if args.action_type == "right":
        actions = RIGHT_ONLY
    elif args.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    # Create the game environment
    env = create_train_env(args.world, args.stage, actions)
    # Build the model
    model = Model(env.observation_space.shape[0], len(actions))
    # Load the model parameter file
    model_path = "{}/model_{}_{}_finish.pdparams".format(
        args.saved_path, args.world, args.stage)
    if not os.path.exists(model_path):
        model_path = "{}/model_{}_{}.pdparams".format(args.saved_path,
                                                      args.world, args.stage)
    model.load_dict(paddle.load(model_path))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = paddle.to_tensor(env.reset(), dtype="float32")
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Predict action logits and the state value
        logits, value = model(state)
        # Pick the action index
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the environment
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        # Convert each step's game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
        print(info)
        # Level cleared
        if info["flag_get"]:
            print("World {} stage {} cleared".format(args.world, args.stage))
            break
        if done:
            print("游戏结束,得分:%f, 未能通过!" % total_reward)
            break
Example #4
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = create_train_env(1, args.difficulty, args.macro, 'env1.mp4')
    input_size = env.observation_space.shape[0]
    output_size = env.action_space.n

    model = RNNActorCriticNetwork(input_size, output_size,
                                  args.noise_linear).to(device)
    model.eval()

    dummy_input = torch.rand(1, 1,
                             *env.observation_space.shape).to(device=device)
    writer = SummaryWriter(log_dir=args.log_dir)
    writer.add_graph(model, (dummy_input, ))
Example #5
def local_test(index, opt, global_model, model_type=None):
    torch.manual_seed(42 + index)
    env, num_states, num_actions = create_train_env(opt.layout,
                                                    index + 1,
                                                    index=index)
    if model_type:
        AC_NN_MODEL = getattr(model, model_type)
    else:
        AC_NN_MODEL = SimpleActorCriticLineal

    local_model = AC_NN_MODEL(num_states, num_actions)
    # Put the model into eval mode for testing (turn off dropout, no backward pass)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            # Copy global model to local model
            local_model.load_state_dict(global_model.state_dict(),
                                        strict=False)
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
                c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        value = value.clamp(-1., 1.)
        policy = F.softmax(logits, dim=0)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        state = torch.from_numpy(state)
        actions.append(action)

        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = torch.from_numpy(env.reset())
Example #6
def train(opt: argparse.ArgumentParser):
    torch.manual_seed(42)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    multi_processes = mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    global_model = ActorCritics(num_states, num_actions)
    if opt.use_gpu and torch.cuda.is_available():
        global_model.cuda()
    global_model.share_memory()
    if opt.load_from_stage:
        if opt.stage == 1:
            previous_world = opt.world - 1
            previous_stage = 4
        else:
            previous_world = opt.world
            previous_stage = opt.stage - 1
        file_ = f"{opt.saved_path}/a3c_super_mario_bros_{previous_world}_{previous_stage}"
        if os.path.isfile(file_):
            global_model.load_state_dict(torch.load(file_))
    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    processes = []
    for pid in range(opt.num_processes):
        if pid == 0:
            process = multi_processes.Process(target=local_train,
                                              args=(pid, opt, global_model,
                                                    optimizer, True))
        else:
            process = multi_processes.Process(target=local_train,
                                              args=(pid, opt, global_model,
                                                    optimizer))
        process.start()
        processes.append(process)
    process = multi_processes.Process(target=local_train,
                                      args=(opt.num_processes, opt,
                                            global_model, optimizer))
    process.start()
    processes.append(process)
    for process in processes:
        process.join()
Example #7
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)

    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)

        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True

        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()

        state = torch.from_numpy(state)
Example #8
def main():
    args = get_args()

    device = torch.device('cuda' if args.cuda else 'cpu')
    env = create_train_env(1, args.difficulty, args.macro, 'env1.mp4')

    input_size = env.observation_space.shape[0]
    output_size = env.action_space.n

    model_path = os.path.join(args.save_dir, 'policy.cpt')
    model = RNNActorCriticNetwork(input_size, output_size,
                                  args.noise_linear).to(device)
    if args.cuda:
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

    model.eval()
    print('Testing...')

    # looping
    obs = env.reset()
    hidden = None

    sample_rall = 0
    sample_step = 0
    sample_max_stage = 0
    done = False

    while not done:
        action, _, action_probs, hidden = get_action(model, device,
                                                     obs[None, None, :], hidden)
        obs, rew, done, info = env.step(int(action))

        sample_rall += rew
        sample_max_stage = max(sample_max_stage, info['stage'])
        sample_step += 1

    print('Max Stage: %d | Reward: %f | Total Steps: %d' \
            % (sample_max_stage, sample_rall, sample_step))
Example #9
def infer(args):
    # Fix the random seed for reproducible initialization
    paddle.seed(123)
    # Use the GPU for inference if available
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Create the game environment
    env = create_train_env(args.game)
    # Build the model
    model = Model(env.observation_space.shape[0], env.action_space.n)
    # Load the model parameter file
    model.load_dict(
        paddle.load("{}/model_best_{}.pdparams".format(args.saved_path,
                                                       args.game)))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = paddle.to_tensor(env.reset(), dtype="float32")
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Predict action logits and the state value
        logits, value = model(state)
        # Pick the action index
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the environment
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        print(info)
        # Convert each step's game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
        if done:
            print("游戏结束,得分:%f" % total_reward)
            break
Example #10
def eval(args, num_states, num_actions):
    log_writer = LogWriter(logdir='log')
    # Fix the random seed for reproducible initialization
    paddle.seed(123)
    # Use the GPU for inference if available
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Create the game environment
    env = create_train_env(args.game)
    # Get the network model
    local_model = Model(num_states, num_actions)
    # Switch to evaluation mode
    local_model.eval()
    # Convert the first frame to a Paddle tensor
    state = paddle.to_tensor(env.reset(), dtype="float32")
    # Start with done=True so the model parameters are loaded immediately
    done = True
    # Logging step counter
    step = 0
    # MD5 of the previously loaded model file
    old_model_file_md5 = ''
    # Total game score
    total_reward = 0
    max_reward = 0
    while True:
        # Reload the model parameters after every episode
        if done:
            try:
                model_path = "{}/model_{}.pdparams".format(
                    args.saved_path, args.game)
                # Use the file's MD5 so each checkpoint is evaluated only once
                with open(model_path, 'rb') as f:
                    file = f.read()
                file_md5 = hashlib.md5(file).hexdigest()
                if file_md5 == old_model_file_md5:
                    continue
                else:
                    model_dict = paddle.load(model_path)
                    old_model_file_md5 = file_md5
            except:
                continue
            total_reward = 0
            local_model.load_dict(model_dict)
        # Predict action logits and the state value
        logits, value = local_model(state)
        # Pick the action index
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the environment
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        # Render the game window
        if args.show_play:
            env.render()
        # Reset the game state
        if done:
            step += 1
            state = env.reset()
            print('Total score: %f' % total_reward)
            log_writer.add_scalar(tag='Eval reward',
                                  value=total_reward,
                                  step=step)
            if max_reward < total_reward:
                paddle.save(
                    local_model.state_dict(),
                    "{}/model_best_{}.pdparams".format(args.saved_path,
                                                       args.game))
                max_reward = total_reward
        # Convert each step's game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
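
Example #10 above reloads the checkpoint inside its evaluation loop only when the file on disk has actually changed, by comparing MD5 digests. A standalone sketch of that check follows; the checkpoint path used here is a hypothetical placeholder.

# Sketch of the MD5-based reload check from Example #10; the path is a placeholder.
import hashlib


def file_md5(path):
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


old_model_file_md5 = ''
current_md5 = file_md5('models/model_SuperMarioBros-Nes.pdparams')
if current_md5 != old_model_file_md5:
    old_model_file_md5 = current_md5
    # ...only now load the new parameters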
Example #11
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='LunarLander-v2')#'HalfCheetah-v2')
    parser.add_argument('--hid', type=int, default=64)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.999)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=1)
    parser.add_argument('--steps', type=int, default=4000)
    parser.add_argument('--epochs', type=int, default=350)
    parser.add_argument('--pretrain', type=str, default='/root/lele/spinningup/spinningup/data/ppo_0715/ppo_0715_s0/pyt_save/model.pt')
    parser.add_argument('--exp_name', type=str, default='ppo_lstm_1106')
    args = parser.parse_args()

    import os
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    mpi_fork(args.cpu)  # run parallel code with mpi

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    # from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    env_fn = lambda : create_train_env(1,1,'complex')
    # env_fn = SubprocVecEnv([])
    # env_fn = lambda : JoypadSpace(gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(1, 1)), gym_super_mario_bros.actions.COMPLEX_MOVEMENT)
    ppo(env_fn, actor=userActor, critic=userCritic,#core.MLPActorCritic, #gym.make(args.env)
        ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, 
        seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
        logger_kwargs=logger_kwargs, clip_ratio=0.2, pi_lr=0.001, vf_lr=0.001, pretrain=None)#args.pretrain)

Example #12
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            hidden = (torch.zeros((1, 512), dtype=torch.float).to(device),
                      torch.zeros((1, 512), dtype=torch.float).to(device))
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--fpath', '-f', type=str, default='./pretrain')
    parser.add_argument('--len', '-l', type=int, default=0)
    parser.add_argument('--episodes', '-n', type=int, default=100)
    parser.add_argument('--norender', '-nr', action='store_true')
    parser.add_argument('--itr', '-i', type=int, default=-1)
    parser.add_argument('--deterministic', '-d', action='store_true')
    args = parser.parse_args()
    # env, get_action = load_policy_and_env(args.fpath,
    #                                       args.itr if args.itr >= 0 else 'last',
    #                                       args.deterministic)
    env = create_train_env(1, 1, 'complex')
    get_action = load_pytorch_policy(args.fpath)  #itr='_50'
    run_policy(env, get_action, args.len, args.episodes, not (args.norender))
Example #13
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()

    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.train()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    curr_episode = 0

    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(
                    global_model.state_dict(),
                    f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}"
                )
            print(f"Now Process {index}. Episode {curr_episode}")
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())

        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        log_policies = []
        values = []
        rewards = []
        entropies = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()

            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            if curr_step > opt.num_global_steps:
                done = True

            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())

            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)

        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)

        gae = torch.zeros((1, 1), dtype=torch.float)
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R

        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy

        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        writer.add_scalar(f"Train_{index}/Loss", total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward()

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print(f"Training process {index} terminated")
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
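
Example #13 and the other local_train examples below accumulate the actor and critic losses by walking the collected local steps backwards with Generalized Advantage Estimation (GAE). A minimal sketch of the same backward recursion on plain Python floats (gamma, tau and the sample rewards/values are made-up numbers) may make the loop easier to follow.

# Backward GAE recursion, mirroring the loop in Example #13 on plain floats.
# gamma, tau and the sample inputs below are made-up illustration values.
def gae_advantages(rewards, values, next_value, gamma=0.9, tau=1.0):
    gae = 0.0
    advantages = []
    for reward, value in zip(reversed(rewards), reversed(values)):
        gae = gae * gamma * tau
        gae = gae + reward + gamma * next_value - value
        next_value = value
        advantages.append(gae)
    return list(reversed(advantages))


print(gae_advantages(rewards=[1.0, 0.0, 1.0], values=[0.5, 0.4, 0.6], next_value=0.3))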
Example #14
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(42 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    state = torch.from_numpy(env.reset())
    local_model = ActorCritics(num_states, num_actions)
    if opt.use_gpu and torch.cuda.is_available():
        local_model = local_model.cuda()
        state = state.cuda()
    local_model.train()
    done = True
    cur_step = 0
    cur_episode = 0
    while True:
        if save:
            if cur_episode % opt.save_interval == 0 and cur_episode > 0:
                torch.save(
                    global_model.state_dict(),
                    f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}"
                )
                print(f"Process {index}. Episode {cur_episode}")
        cur_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu and torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # predict the action and react with the environment
        for _ in range(opt.num_local_steps):
            cur_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            # get the next action from sampling
            m = Categorical(policy)
            action = m.sample().item()
            # react
            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)
            if opt.use_gpu and torch.cuda.is_available():
                state = state.cuda()
            # finishing of the episode
            if cur_step > opt.num_global_steps:
                done = True

            if done:
                cur_step = 0
                state = torch.from_numpy(env.reset())
                if opt.use_gpu and torch.cuda.is_available():
                    state = state.cuda()

            # aggregate the info
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                break

        # compute the discounted return R and the GAE-based losses
        R = torch.zeros((1, 1), dtype=torch.float)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu and torch.cuda.is_available():
            R = R.cuda()
            gae = gae.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)

        actor_loss, critic_loss, entropy_loss = 0, 0, 0
        next_value = R
        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy
        # backward
        total_loss = critic_loss - actor_loss - opt.beta * entropy_loss
        writer.add_scalar(f"Train_{index}/Loss", total_loss, cur_episode)
        optimizer.zero_grad()
        total_loss.backward()

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if cur_episode == int(opt.num_global_steps / opt.num_local_steps):
            print(f"Training process {index} terminated")
            if save:
                end_time = timeit.default_timer()
                print(f"The code runs for {end_time -start_time} s")
            return
Example #15
def local_train(index, opt, global_model, optimizer, save=False):

    torch.manual_seed(42 + index)
    if save:
        start_time = timeit.default_timer()
    if index==0:
        # Path for tensorboard log
        process_log_path = "{}/process-{}".format(opt.log_path, index)
        writer = SummaryWriter(process_log_path)#, max_queue=1000, flush_secs=10)
    # Creates training environment for this particular process
    env, num_states, num_actions = create_train_env(opt.layout, opt.num_processes_to_render, index=index)
    # local_model keeps local weights for each async process
    local_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    # Tell the model we are going to use it for training
    local_model.train()
    # env.reset and get first state
    state = torch.from_numpy(env.reset()) # to tensor

    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    if index == 0:
        interval = 100
        #reward_hist = np.zeros(interval)
        reward_hist = deque(maxlen=100)
        #queue_rewards = queue.Queue(maxsize=interval)
        record_tag = False
    while True:
        if save:
            # Save trained model at save_interval
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           "{}/gfootball_{}".format(opt.saved_path, opt.layout))
        if curr_episode%10==0:
            print("Process {}. Episode {}   ".format(index, curr_episode))
        curr_episode += 1
        episode_reward = 0
        # Synchronize thread-specific parameters theta'=theta and theta'_v=theta_v
        # (copy global params to local params (after every episode))
        local_model.load_state_dict(global_model.state_dict(), strict=True)
        # Follow gradients only after 'done' (end of episode)
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # Local steps
        for _ in range(opt.num_local_steps):
            curr_step += 1
            # Model prediction from state. Returns two functions:
            # * Action prediction (Policy function) -> logits (array with every action-value)
            # * Value prediction (Value function)   -> value (single value state-value)
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            value = value.clamp(-1.,1.)
            lstm_model = False
            if lstm_model:
                # Lstm model returns data with one more dimension
                dim=1
            else:
                dim=0
            # Softmax over action-values
            policy = F.softmax(logits, dim=dim)
            # Log-softmax over action-values, to get the entropy of the policy
            log_policy = F.log_softmax(logits, dim=dim)
            #print('logits.size():',   logits.size())
            #print('value.size():',    value.size())
            #print('log_policy.size()',log_policy.size())
            # Entropy acts as exploration rate
            entropy = -(policy * log_policy).sum(dim, keepdim=True)
            # From Async Methods for Deep RL:
            """ We also found that adding the entropy of the policy π to the
                objective function improved exploration by discouraging
                premature convergence to suboptimal deterministic poli-
                cies. This technique was originally proposed by (Williams
                & Peng, 1991), who found that it was particularly help-
                ful on tasks requiring hierarchical behavior."""
            # We sample one action given the policy probabilities
            m = Categorical(policy)
            action = m.sample().item()
            # Perform action_t according to policy pi
            # Receive reward r_t and new state s_t+1
            state, reward, done, _ = env.step(action)
            # state to tensor
            state = torch.from_numpy(state)
            episode_reward += reward
            
            if opt.use_gpu:
                state = state.cuda()
            # If last global step, reset episode
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
                print("Process {:2.0f}. acumR: {}     ".format(index, episode_reward))
                if opt.use_gpu:
                    state = state.cuda()
            # Save state-value, log-policy, reward and entropy of
            # every state we visit, to gradient-descent later
            values.append(value)
            if lstm_model:
                # Lstm model returns data with one more dimension
                log_policies.append(log_policy[0, action])
            else:
                log_policies.append(log_policy[action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                # All local steps done.
                break
        # Save history every n episodes as statistics (just from one process)
        if index==0: 
            #sample_size = 100
            # hist_idx = (curr_episode - 1)%sample_size
            # if hist_idx==0:
            #     reward_hist = np.zeros(sample_size)
            # reward_hist[hist_idx] = episode_reward
            reward_hist.append(episode_reward)
            if True:#hist_idx==sample_size-1:
                r_mean   = np.mean(reward_hist)
                r_median = np.median(reward_hist)
                r_std    = np.std(reward_hist)
                stand_median = (r_median - r_mean) / (r_std + 1e-9)
                writer.add_scalar("Process_{}/Last100Statistics_mean".format(index), r_mean, curr_episode)
                writer.add_scalar("Process_{}/Last100Statistics_median".format(index), r_median, curr_episode)
                writer.add_scalar("Process_{}/Last100Statistics_std".format(index), r_std, curr_episode)
                writer.add_scalar("Process_{}/Last100Statistics_stand_median".format(index), stand_median, curr_episode)
        # end of history saving
        # Baseline reward standardization over episode rewards.
        # Uncomment prints to see how rewards change
        #if index == 0: # only print first agent's process
        #    print("Rewards before:", rewards)
        mean_rewards = np.mean(rewards)
        std_rewards  = np.std(rewards)
        rewards = (rewards - mean_rewards) / (std_rewards + 1e-9)
        #if index == 0:
        #    print("Rewards after:", rewards)
        # Initialize R/G_t: Discounted reward over local steps
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
            # Standardize this reward estimation too
            #mean_rewards = np.mean([R, rewards])
            #std_rewards  = np.std([R, rewards])
            #R = (R - mean_rewards) / (std_rewards + 1e-9)
            # Simple value estimations between -1 and 1
            R = R.clamp(-1.,1.)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        # Gradient descent over minibatch of local steps, from last to first step
        for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
            # Generalized Advantage Estimator (GAE)
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            # Accumulate discounted reward
            R = reward + opt.gamma * R
            # Accumulate gradients wrt parameters theta'
            actor_loss = actor_loss + log_policy * gae
            # Accumulate gradients wrt parameters theta'_v
            critic_loss = critic_loss + ((R - value)**2) / 2.
            entropy_loss = entropy_loss + entropy
        # Clamp critic loss value if too big
        #max_critic_loss = 1./opt.lr
        #critic_loss = critic_loss.clamp(-max_critic_loss, max_critic_loss)
        # Total process' loss
        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        # Clamp loss value if too big
        #max_loss =  2 * max_critic_loss
        #total_loss = total_loss.clamp(-max_loss, max_loss)

        # Saving logs for TensorBoard
        if index==0:
            writer.add_scalar("Process_{}/Total_Loss".format(index), total_loss, curr_episode)
            writer.add_scalar("Process_{}/Acum_Reward".format(index), episode_reward, curr_episode)
            #writer.add_scalar("actor_{}/Loss".format(index), -actor_loss, curr_episode)
            #writer.add_scalar("critic_{}/Loss".format(index), critic_loss, curr_episode)
            #writer.add_scalar("entropyxbeta_{}/Loss".format(index), opt.beta * entropy_loss, curr_episode)
        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass
        total_loss.backward()
        # Perform asynchronous update of theta and theta_v
        for local_param, global_param in zip(local_model.parameters(), global_model.parameters()):
            if global_param.grad is not None:
                # Shared params. No need to copy again. Updated on optimizer.
                break
            # First update to global_param
            global_param._grad = local_param.grad
        # Step in the gradient direction, for the GLOBAL parameters
        optimizer.step()

        # End of training
        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if index==0:
                writer.close()
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
    return
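
All of the local_train examples push gradients from the per-process local model into the shared global model with the global_param._grad = local_param.grad idiom before stepping the shared optimizer. A condensed, self-contained sketch of that hand-off follows; the tiny nn.Linear models and the plain Adam optimizer are placeholders for the actor-critic networks and the GlobalAdam optimizer used above.

# Condensed sketch of the asynchronous gradient hand-off used in the
# local_train examples; the models and optimizer here are placeholders.
import torch
import torch.nn as nn

global_model = nn.Linear(4, 2)
global_model.share_memory()  # parameters shared across worker processes
local_model = nn.Linear(4, 2)
local_model.load_state_dict(global_model.state_dict())

loss = local_model(torch.randn(1, 4)).sum()
loss.backward()  # gradients are computed on the local copy

for local_param, global_param in zip(local_model.parameters(),
                                     global_model.parameters()):
    if global_param.grad is not None:
        break  # shared grads already populated; no need to copy again
    global_param._grad = local_param.grad

torch.optim.Adam(global_model.parameters(), lr=1e-3).step()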