Example 1
def local_train(index, opt, global_model, optimizer, save=False):
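    # A3C-style worker: repeatedly syncs a local copy of ActorCritic_seq with the
    # shared global_model, rolls out opt.num_local_steps steps, and writes the
    # resulting gradients back into global_model before stepping the shared optimizer.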
    #    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
#    writer = SummaryWriter(opt.log_path)
    if not opt.saved_path:
        if opt.game == "Supermario":
            saved_path = "{}_{}_{}_{}".format(opt.game, opt.num_sequence,
                                              opt.internal_reward, opt.world,
                                              opt.stage)
        else:
            saved_path = "{}_{}".format(opt.game, opt.num_sequence)
    else:
        saved_path = opt.saved_path
    if opt.game == "Supermario":
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:

        env, num_states, num_actions = create_train_env_atari(opt.game,
                                                              saved_path,
                                                              output_path=None)
    local_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model.cuda()
    local_model.train()
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0

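    # Logging buffers plus three extra env/model copies used only for periodic
    # evaluation via local_test_iter (each with a different action/gate
    # selection mode in the log_interval block below).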
    loss_matrix = []
    Cum_reward1 = []
    SCORE1 = []
    X1 = []
    Num_interaction1 = []

    if opt.game == "Supermario":
        env1, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env1, num_states, num_actions = create_train_env_atari(
            opt.game, saved_path, output_path=None)
    local_model1 = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model1.cuda()
    local_model1.eval()

    Cum_reward2 = []
    SCORE2 = []
    X2 = []
    Num_interaction2 = []

    if opt.game == "Supermario":
        env2, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env2, num_states, num_actions = create_train_env_atari(
            opt.game, saved_path, output_path=None)
    local_model2 = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model2.cuda()
    local_model2.eval()

    Cum_reward3 = []
    SCORE3 = []
    X3 = []
    Num_interaction3 = []
    if opt.game == "Supermario":
        env3, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env3, num_states, num_actions = create_train_env_atari(
            opt.game, saved_path, output_path=None)

    local_model3 = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model3.cuda()
    local_model3.eval()

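    # Main training loop: each iteration syncs the local model with the global
    # one, collects a rollout, computes the A3C losses, and applies one update
    # through the shared optimizer.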
    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                # Checkpoint the shared global model (both games use the same file).
                torch.save(global_model.state_dict(),
                           saved_path + "/trained_model")
#            print("Process {}. Episode {}".format(index, curr_episode),done)

        if curr_episode % opt.log_interval == 0:

            if opt.game == 'Supermario':
                #                local_model1.load_state_dict(global_model.state_dict())
                #                Cum_reward1,X1,Num_interaction1,x_arrive_all_pro = local_test_iter(opt,env1,local_model1,Cum_reward1,X1,Num_interaction1,save)
                #            local_model2.load_state_dict(global_model.state_dict())
                #           Cum_reward2,SCORE2,X2,Num_interaction2,x_arrive_all_pro = local_test_iter(opt,env2,local_model2,Cum_reward2,SCORE2,X2,Num_interaction2,videosave=False,action_max=False,gate_max=False)
                local_model2.load_state_dict(global_model.state_dict())
                Cum_reward2, SCORE2, X2, Num_interaction2, x_arrive_all_max = local_test_iter(
                    opt,
                    env2,
                    local_model2,
                    Cum_reward2,
                    SCORE2,
                    X2,
                    Num_interaction2,
                    videosave=False,
                    action_max=True,
                    gate_max=True)

                local_model3.load_state_dict(global_model.state_dict())
                Cum_reward3, SCORE3, X3, Num_interaction3, x_arrive_actionpro_gatemax = local_test_iter(
                    opt,
                    env3,
                    local_model3,
                    Cum_reward3,
                    SCORE3,
                    X3,
                    Num_interaction3,
                    videosave=False,
                    action_max=False,
                    gate_max=True)
                print(curr_episode, x_arrive_all_max,
                      x_arrive_actionpro_gatemax)
            else:
                local_model1.load_state_dict(global_model.state_dict())
                Cum_reward1, SCORE1, X1, Num_interaction1, x_arrive_all_pro = local_test_iter(
                    opt,
                    env1,
                    local_model1,
                    Cum_reward1,
                    SCORE1,
                    X1,
                    Num_interaction1,
                    videosave=False,
                    action_max=False,
                    gate_max=False)
                print(curr_episode, x_arrive_all_pro)

        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        #        g_0_cnt = 0
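        # At episode boundaries, reset the LSTM hidden/cell states and the gate
        # vector; otherwise carry them over, detached from the previous graph.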
        if done:
            g_0_ini = torch.ones((1))
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float)
            cum_r = 0
            g_0_cnt = 0
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
#            g_0 = g_0.detach()

        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            g_0_ini = g_0_ini.cuda()
            g_0 = g_0.cuda()

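        # Per-rollout buffers, consumed by the loss computation below.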
        log_policies = []
        log_gates = []
        values = []
        rewards = []
        reward_internals = []
        entropies = []

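        # Roll out up to opt.num_local_steps environment steps with the local model.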
        for _ in range(opt.num_local_steps):
            curr_step += 1
            g_pre = g_0
            g_pre_cnt = g_0_cnt

            logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag1, gate_flag2 = local_model(
                state, h_0, c_0, g_0, g_0_ini)

            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()
            state, reward, raw_reward, done, info = env.step(action)
            reward_internal = reward

            if g_0_ini == 1:

                log_gate = torch.zeros((), dtype=torch.float)
                if opt.use_gpu:
                    log_gate = log_gate.cuda()
            elif gate_flag1:

                #                log_gate = log_gate
                log_gate = torch.zeros((), dtype=torch.float)
                if opt.use_gpu:
                    log_gate = log_gate.cuda()
            elif gate_flag2:

                #                log_gate = log_gate + torch.log(1-g_pre[0,g_pre_cnt])
                log_gate = torch.log(1 - g_pre[0, g_pre_cnt])
            else:
                #                log_gate = log_gate+torch.log(g_0[0,g_0_cnt-1])
                log_gate = torch.log(g_0[0, g_0_cnt - 1])
                if reward > 0:
                    reward_internal = reward + opt.internal_reward
            g_0_ini = torch.zeros((1))
            if opt.use_gpu:
                g_0_ini = g_0_ini.cuda()
#            if save:
#                env.render()
#                print(reward)
#                time.sleep(1)
            state = torch.from_numpy(state)
            if opt.use_gpu:
                state = state.cuda()
            if curr_step > opt.num_global_steps:
                done = True
                print('max global step reached')

            if done:

                curr_step = 0

                env.reset()
                if opt.start_initial == 'random':
                    for i in range(opt.start_interval):
                        state, reward, _, done, info = env.step(
                            env.action_space.sample())
                        if done:
                            env.reset()
                    state = torch.from_numpy(state)
                else:
                    state = torch.from_numpy(env.reset())
                if opt.use_gpu:
                    state = state.cuda()

            values.append(value)
            log_policies.append(log_policy[0, action])
            log_gates.append(log_gate)
            rewards.append(reward)
            reward_internals.append(reward_internal)
            entropies.append(entropy)
            cum_r += reward
            if done:
                break
#        print(log_policies,log_gates)
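        # Bootstrap the return R with the critic's estimate of the last state when
        # the rollout did not end in a terminal state.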
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _, _, _, gate_flag1, gate_flag2 = local_model(
                state, h_0, c_0, g_0, g_0_ini, gate_update=False)

        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0

        #        next_value = R
        #        for value, log_policy, log_gate, reward, reward_internal, entropy in list(zip(values, log_policies, log_gates, rewards,reward_internals, entropies))[::-1]:
        #            gae = gae * opt.gamma * opt.tau
        #            gae = gae + reward_internal + opt.gamma * next_value.detach() - value.detach()
        #            next_value = value
        #            actor_loss = actor_loss + (log_policy+log_gate) * gae
        #            R = R * opt.gamma + reward
        #            critic_loss = critic_loss + (R - value) ** 2 / 2
        #            entropy_loss = entropy_loss + entropy

        # estimate internal reward directly
        if not (gate_flag1 or gate_flag2):
            if R > 0:
                R = R + opt.internal_reward
        next_value = R
        for value, log_policy, log_gate, reward, reward_internal, entropy in list(
                zip(values, log_policies, log_gates, rewards, reward_internals,
                    entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward_internal + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + (log_policy + log_gate) * gae
            R = R * opt.gamma + reward_internal
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy

# estimate external reward

#        next_value = R
#        for value, log_policy, log_gate, reward, reward_internal, entropy in list(zip(values, log_policies, log_gates, rewards,reward_internals, entropies))[::-1]:
#            gae = gae * opt.gamma * opt.tau
#            gae = gae + reward_internal-0.01* + opt.gamma * next_value.detach() - value.detach()
#            next_value = value
#            actor_loss = actor_loss + (log_policy+log_gate) * gae
#            R = R * opt.gamma + reward
#            critic_loss = critic_loss + (R - value) ** 2 / 2
#            entropy_loss = entropy_loss + entropy

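        # Combine the actor, critic, and entropy terms into the A3C objective.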
        if opt.value_loss_coef:
            total_loss = -actor_loss + critic_loss * opt.value_loss_coef - opt.beta * entropy_loss
        else:
            total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss


        # writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward(retain_graph=True)
        if opt.max_grad_norm:
            torch.nn.utils.clip_grad_norm_(local_model.parameters(),
                                           opt.max_grad_norm)

        loss_matrix.append(total_loss.detach().cpu().numpy())

        if curr_episode % opt.save_interval == 0:
            #            print('aaaaaaaaaaa',X,Cum_reward)
            if opt.game == 'Supermario':
                np.save(saved_path + "/X1{}".format(index), X1)
                np.save(saved_path + "/X2{}".format(index), X2)
                np.save(saved_path + "/X3{}".format(index), X3)

            np.save(saved_path + "/loss{}".format(index), loss_matrix)
            np.save(saved_path + "/Cum_reward1{}".format(index), Cum_reward1)
            np.save(saved_path + "/SCORE1{}".format(index), SCORE1)
            np.save(saved_path + "/Num_interaction1{}".format(index),
                    Num_interaction1)

            np.save(saved_path + "/Cum_reward2{}".format(index), Cum_reward2)
            np.save(saved_path + "/SCORE2{}".format(index), SCORE2)
            np.save(saved_path + "/Num_interaction2{}".format(index),
                    Num_interaction2)

            np.save(saved_path + "/Cum_reward3{}".format(index), Cum_reward3)
            np.save(saved_path + "/SCORE3{}".format(index), SCORE3)
            np.save(saved_path + "/Num_interaction3{}".format(index),
                    Num_interaction3)

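        # Copy the local gradients into the shared global model (unless its grads
        # are already populated) and take one step of the shared optimizer.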
        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
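
A minimal, hypothetical launch sketch for local_train, which is designed to run as one of several asynchronous workers sharing global_model and optimizer: it assumes the project's get_args, create_train_env, and ActorCritic_seq helpers plus an opt.num_processes field, and uses plain torch.optim.Adam in place of whatever shared optimizer the project actually provides.

import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    opt = get_args()  # assumed argparse helper from the surrounding project
    _, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                  opt.action_type, opt.final_step)
    global_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    global_model.share_memory()  # expose parameters to all worker processes
    optimizer = torch.optim.Adam(global_model.parameters(), lr=opt.lr)

    processes = []
    for index in range(opt.num_processes):  # opt.num_processes is an assumption
        # save=True only for worker 0 so a single process writes checkpoints/logs
        p = mp.Process(target=local_train,
                       args=(index, opt, global_model, optimizer, index == 0))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()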
Example 2
def local_test_certain(index, opt, global_model):
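    # Evaluation worker: resyncs from global_model at the start of every episode
    # and acts greedily (argmax over the policy, certain=True gating), logging
    # cumulative reward and progress/score.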
    torch.manual_seed(123 + index)

    if opt.game == "Supermario":
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        if not opt.saved_path:
            saved_path = "{}_{}_{}_{}".format(opt.game, opt.num_sequence,
                                              opt.internal_reward, opt.lr)
        else:
            saved_path = opt.saved_path
        env, num_states, num_actions = create_train_env_atari(opt.game,
                                                              saved_path,
                                                              output_path=None)

    local_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    local_model.eval()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    Cum_reward = []
    X = []
    i = 0

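    # Evaluation loop: an episode also ends when the step budget is exceeded or
    # when the same action has filled the whole deque of recent actions.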
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
                g_0_ini = torch.ones((1))
                state = torch.from_numpy(env.reset())
                cum_r = 0
                g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float)
                score = 0
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag, _ = local_model(
            state, h_0, c_0, g_0, g_0_ini, certain=True)
        #print(g_0,g_0_cnt)
        g_0_ini = torch.zeros((1))
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, raw_reward, done, info = env.step(action)
        score += raw_reward
        #        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        cum_r = cum_r + reward
        if done:

            i = i + 1
            curr_step = 0
            actions.clear()
            state = env.reset()
            if opt.game == "Supermario":
                x = info['x_pos']
            else:
                x = score

            print(i, 'test_certain', x)
            X.append(x)
            Cum_reward.append(cum_r)

        state = torch.from_numpy(state)

        if done and i % 100 == 0:
            np.save("{}/Cum_reward_test_certain".format(opt.saved_path),
                    Cum_reward)
            np.save("{}/X_test_certain".format(opt.saved_path), X)
Example 3
def test(opt):
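    # Offline evaluation: loads the trained checkpoint from opt.saved_path and
    # plays greedily (argmax actions, certain=True gating) while rendering.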
    gate_max = True
    action_max = True
    torch.manual_seed(123)
    if opt.game == "Supermario":
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env, num_states, num_actions = create_train_env_atari(opt.game,
                                                              opt.saved_path,
                                                              output_path=None)
#    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,
#                                                    "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if torch.cuda.is_available() and opt.use_gpu:
        model.load_state_dict(torch.load(opt.saved_path + "/trained_model"))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load(opt.saved_path + "/trained_model",
                       map_location=lambda storage, loc: storage))

    done = True
    while True:
        if done:

            curr_step_test = 0
            cum_r = 0
            with torch.no_grad():
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
                g_0_ini = torch.ones((1))

                g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float)

                env.reset()
                if opt.start_initial == 'random':
                    for i in range(opt.start_interval):
                        if opt.game == 'Supermario':
                            state, reward, _, done, info = env.step(
                                env.action_space.sample())
                        else:
                            state, reward, _, done, info = env.step(
                                env.action_space.sample(), 0, video_save=False)
                        if done:
                            env.reset()
                    state = torch.from_numpy(state)
                else:
                    state = torch.from_numpy(env.reset())
            if opt.use_gpu:
                state = state.cuda()
                h_0 = h_0.cuda()
                c_0 = c_0.cuda()
                g_0_ini = g_0_ini.cuda()
                g_0 = g_0.cuda()

            num_interaction = 1
            score = 0

        curr_step_test += 1
        with torch.no_grad():
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        if gate_max:
            logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag, _ = model(
                state, h_0, c_0, g_0, g_0_ini, certain=True)
        else:
            logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag, _ = model(
                state, h_0, c_0, g_0, g_0_ini)
        g_0_ini = torch.zeros((1))
        if opt.use_gpu:
            g_0_ini = g_0_ini.cuda()
        policy = F.softmax(logits, dim=1)
        if action_max:
            action = torch.argmax(policy).item()
        else:
            m = Categorical(policy)
            action = m.sample().item()

        if opt.game == 'Supermario':
            state, reward, raw_reward, done, info = env.step(action)
        else:
            state, reward, raw_reward, done, info = env.step(action,
                                                             g_0_cnt,
                                                             video_save=False)


        # if save:
        #     print(reward, raw_reward)
        env.render()
        #            time.sleep(0.5)
        score += raw_reward
        state = torch.from_numpy(state)
        if opt.use_gpu:
            state = state.cuda()
        cum_r = cum_r + reward
        #        actions.append(action)
        if g_0_cnt == 0:
            time.sleep(1)
            num_interaction += 1

        else:
            print(g_0_cnt, num_interaction)
        if done:
            if opt.game == "Supermario":
                x = info['x_pos']
                print(x, num_interaction)