Example #1
def test(opts):
    env = create_env(opts.env_name, 712)
    action_space = env.action_space
    state_space = env.state_space
    tp_matrix = env.tp_matrix
    agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount,
                           action_space, state_space, tp_matrix,
                           env.blocked_positions, 712)

    qvalues = np.load(os.path.join(opts.policy_dir, opts.env_name + '.npy'))
    agent.qvalues = qvalues

    env.render(agent.qvalues)
    state = env.get_state()

    for i in range(200):
        possible_actions = env.get_possible_actions()
        action = agent.get_best_action(state, possible_actions)
        time.sleep(0.1)
        next_state, reward, done, next_possible_states = env.step(action)
        env.render(agent.qvalues)

        next_state_possible_actions = env.get_possible_actions()
        state = next_state

        print(reward)
        if done:
            env.reset_state()
            env.render(agent.qvalues)
            time.sleep(0.5)
            state = env.get_state()
            continue
Example #2
    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS, id_num = 0):
        threading.Thread.__init__(self)

        self.render = render
        #self.env = #gym.make(ENV)
        self.env = create_env('mario', id_num, None)
        self.agent = Agent(eps_start, eps_end, eps_steps)
Example #3
def main():
    args = parser.parse_args()

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable nondeterministic ops (not sure if critical but better safe than sorry)
        #torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    # Environment
    env = create_env(args.environment_filename,
                     custom=False,
                     skip_frames=1,
                     realtime=args.render,
                     worker_id=args.worker,
                     device=args.device)

    agent = AgentEval(args, env)

    if env.unwrapped.is_grading():
        print('grading...')
        run_evaluation(env, agent)
    else:
        print('testing...')
        rewards = []
        for _ in range(10):
            episode_rewards = run_episode(env, agent)
            rewards.append(episode_rewards)
        env.close()
        print(sum(rewards) / len(rewards))
Example #4
def train(opts):
    env = create_env(opts.env_name)
    action_space = env.action_space
    state_space = env.state_space
    tp_matrix = env.tp_matrix

    agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount,
                           action_space, state_space, tp_matrix,
                           env.blocked_positions)

    env.render(agent.qvalues)
    state = env.get_state()

    for i in range(opts.num_iters):

        possible_actions = env.get_possible_actions()
        action = agent.get_action(state, possible_actions)
        next_state, reward, done, next_possible_states = env.step(action)
        env.render(agent.qvalues)

        next_state_possible_actions = env.get_possible_actions()
        agent.update(state, action, reward, next_state,
                     next_state_possible_actions, next_possible_states, done)
        state = next_state

        if done == True:
            env.reset_state()
            env.render(agent.qvalues)
            state = env.get_state()
            continue

    if not os.path.exists(opts.policy_dir):
        os.makedirs(opts.policy_dir)
    np.save(os.path.join(opts.policy_dir, opts.env_name + '.npy'),
            np.asarray(agent.qvalues))
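Both training examples above call `agent.update(...)` without showing the agent itself. As a rough sketch only (not the repository's `QLearningAgent` implementation), a tabular Q-learning update restricted to the allowed next actions can look like this; `alpha` and `discount` correspond to the constructor arguments used above, and the helper name `q_update` is made up for illustration:

import numpy as np

def q_update(qvalues, state, action, reward, next_state,
             next_possible_actions, done, alpha=0.1, discount=0.99):
    """One tabular Q-learning step: Q(s, a) += alpha * (target - Q(s, a))."""
    if done or not next_possible_actions:
        target = reward                       # terminal transition: no bootstrapping
    else:
        target = reward + discount * max(
            qvalues[next_state][a] for a in next_possible_actions)
    qvalues[state][action] += alpha * (target - qvalues[state][action])

# purely illustrative usage with a random 5-state, 4-action table
q_table = np.random.rand(5, 4)
q_update(q_table, state=0, action=2, reward=1.0, next_state=1,
         next_possible_actions=[0, 1, 2, 3], done=False)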
Example #5
def global_test(global_model, device, args, model_type, delay=0.03):
    world = args.world
    stage = args.stage
    env = create_env(world, stage)
    state = env.reset().to(device, dtype=torch.float)

    state = state.view(1, 1, 80, 80)
    done = True

    if (model_type == "LSTM"):
        model = ActorCritic_LSTM().to(device)
    else:
        model = ActorCritic().to(device)

    model.eval()
    model.load_state_dict(global_model.state_dict())

    while (True):
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        h_0 = h_0.to(device)
        c_0 = c_0.to(device)

        env.render()
        p, _, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(p, dim=1)
        action = torch.argmax(policy)

        next_state, _, done, info = env.step(action.item())

        next_state = (next_state).to(device, dtype=torch.float)
        next_state = next_state.view(1, 1, 80, 80)

        state = next_state
        if (done):
            if (info['flag_get']):
                break
            state = env.reset()
            state = state.to(device)
            state = state.view(1, 1, 80, 80)
            model.load_state_dict(global_model.state_dict())
        time.sleep(delay)
    print('Successfully cleared {}-{}'.format(world, stage))
Example #6
def main():
    args = parser.parse_args()
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable nondeterministic ops (not sure if critical but better safe than sorry)
        #torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    args.large = False
    args.skip_frames = 0
    args.random_aug = 0.

    # Environment
    train_env = create_env(args.environment_filename,
                           custom=True,
                           large=args.large,
                           skip_frames=args.skip_frames,
                           random_aug=args.random_aug,
                           docker=args.docker_training,
                           device=args.device)
    action_space = train_env.action_space

    test_env = create_env(
        args.environment_filename,
        custom=True,
        large=args.large,
        custom_reward=False,
        skip_frames=args.skip_frames,
        docker=args.docker_training,
        device=args.device,
        worker_id=1,
    )

    mem = ReplayMemory(args,
                       args.memory_capacity,
                       obs_space=train_env.observation_space)
    val_mem = ReplayMemory(args,
                           args.evaluation_size,
                           obs_space=test_env.observation_space)

    # for debugging environment issues
    if args.timeout_monitor:
        train_env = TimeoutMonitor(train_env, mem)
        test_env = TimeoutMonitor(test_env, val_mem)

    # Agent
    dqn = Agent(args, train_env)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)
    time_step = 0
    done = True
    state = None
    while time_step < args.evaluation_size:
        if done:
            state = train_env.reset()
            done = False

        next_state, _, done, _ = train_env.step(action_space.sample())
        val_mem.append(state, None, None, done)
        state = next_state
        time_step += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
        print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
    else:
        # Training loop
        dqn.train()
        done = True
        for time_step in tqdm(range(args.T_max)):
            if done:
                state = train_env.reset()
                done = False

            if time_step % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(
                state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done, info = train_env.step(action)  # Step
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip),
                             -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward,
                       done)  # Append transition to memory

            # Train and test
            if time_step >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(
                    mem.priority_weight + priority_weight_increase, 1)

                if time_step % args.replay_frequency == 0:
                    dqn.learn(
                        mem
                    )  # Train with n-step distributional double-Q learning

                if time_step % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args,
                                             time_step,
                                             dqn,
                                             val_mem,
                                             env=test_env)  # Test
                    log('T = ' + str(time_step) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                        str(avg_Q))
                    dqn.train(
                    )  # Set DQN (online network) back to training mode

                # Update target network
                if time_step % args.target_update == 0:
                    dqn.update_target_net()

            state = next_state

    train_env.close()
Example #7
#  * compute the minimal initial load needed in each state.
# In other words, all the solvers have the same guarantees. Where they differ are the strategies they produce. Our overall goal is to provide strategies that not only provide these guarantees but are also **usable in practical control case studies**.
#
# We use a simple gridworld underwater environment generated by [FiMDPEnv] to demonstrate the behavior we can obtain using different solvers of the [FiMDP] package. We have some pre-defined environments in the file [env.py](env.py). The goal of the agent is to reach the green target with sufficient energy so that it can reach it again and again.
#
# In each cell of the gridworld, the agent can choose one of eight possible actions. For each of the 4 directions (`NORTH`, `SOUTH`, `WEST`, `EAST`), the agent chooses whether to play a *weak* or a *strong* action. A strong action costs more energy, while a weak action has an uncertain outcome: the resulting direction of movement can be affected by pre-defined currents. For example, in most cases, picking `EAST` can, with a small probability, end up with the agent going `SOUTH` or `NORTH`.
#
# [FiMDP]: https://github.com/xblahoud/FiMDP
# [FiMDPEnv]: https://github.com/pthangeda/FiMDPEnv

import fimdpenv

fimdpenv.setup()
from env import create_env

e = create_env('2R-1T-simple', heading_sd=0.32, agent_capacity=40)
e

# The colors of the gridworld cells have the following semantics:
#  * <font color='blue'>Blue Cell</font>: Current location of the agent
#  * <font color='gray'>Gray Cells</font>: Trajectory of the agent
#  * <font color='green'>Green Cells</font>: Target States
#  * <font color='orange'>Orange Cells</font>: Reload states

# This package offers 2 solvers that generate strategies:
#  * Basic solver (class `BasicES`)
#  * Goal-leaning solver (class `GoalLeaningES`)

# +
import fimdp
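The snippet below is a minimal, hypothetical sketch of how the two solvers named above might be instantiated for the environment `e`. The module paths, the `get_consmdp()` accessor, the `agent_capacity` attribute, and `get_selector()` are assumptions based on the FiMDP/FiMDPEnv documentation rather than code from this notebook, so check the packages for the exact API.

from fimdp.energy_solvers import BasicES, GoalLeaningES   # assumed module path
from fimdp.objectives import BUCHI                         # assumed objective constant

mdp, targets = e.get_consmdp()                  # assumed accessor: consumption MDP + target states
basic = BasicES(mdp, e.agent_capacity, targets)
leaning = GoalLeaningES(mdp, e.agent_capacity, targets)

# Both solvers provide the same energy guarantees; they differ in the
# strategies (action selectors) they produce.
basic_selector = basic.get_selector(BUCHI)
leaning_selector = leaning.get_selector(BUCHI)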
Example #8
def train(bisimulation, opts):
    log_path = 'output_logs/Softmax-Logs/{}'.format(opts.env_name)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    curr_log_dir = os.path.join(log_path, str(datetime.datetime.now())[:-7])
    os.makedirs(curr_log_dir)
    with open(os.path.join(curr_log_dir, 'command.txt'), 'w') as f:
        json.dump(opts.__dict__, f, indent=2)

    lower_bound = np.zeros(
        (bisimulation.src_env.state_space, bisimulation.tgt_env.state_space))
    for t in range(bisimulation.tgt_env.state_space):
        for s in range(bisimulation.src_env.state_space):
            lower_bound[s, t] = np.max(bisimulation.src_agent.qvalues[s]
                                       ) - bisimulation.dist_matrix_final[s, t]

    plt_path = os.path.join(curr_log_dir,
                            opts.env_name + '_softmax_qlearn.png')
    policy_path = os.path.join(curr_log_dir,
                               opts.env_name + '_softmax_qlearn.npy')
    df_path = os.path.join(curr_log_dir, opts.env_name + '_softmax_qlearn.csv')
    heatmap_path = os.path.join(curr_log_dir,
                                opts.env_name + '_explore_softmax.png')

    seeds = random.sample(range(1, 10000), opts.num_seeds)
    print(seeds)
    # seeds = [1981, 8702, 3497, 8058, 4931, 1968, 6555, 8390, 8711, 7212] # np.arange(opts.num_seeds)
    cr_allvec = []
    tp_all = []
    pi_val_all = []
    avg_reward_all = []
    exp_q_all = []
    pbar = tqdm(total=opts.num_seeds)
    dummy_env = create_env(opts.env_name, 712)
    count_matrix_avg = np.zeros(
        (dummy_env.state_space, dummy_env.action_space))
    for seed in seeds:
        np.random.seed(seed)
        random.seed(seed)

        env = create_env(opts.env_name, seed)
        action_space = env.action_space
        state_space = env.state_space
        tp_matrix = env.tp_matrix
        count_matrix = np.zeros((env.state_space, env.action_space))

        gt_agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount,
                                  action_space, state_space, tp_matrix,
                                  env.blocked_positions, seed)
        gt_agent.qvalues = np.load(
            os.path.join(opts.policy_dir, opts.env_name + '.npy'))
        gt_policy_val = np.mean(gt_agent.qvalues)
        agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount,
                               action_space, state_space, tp_matrix,
                               env.blocked_positions, seed)
        agent.qvalues = np.random.rand(state_space, action_space)
        # env.render(agent.qvalues)

        cr_vec = []
        pi_val_vec = []
        exp_q_vec = []
        c_reward = 0
        avg_reward_vec = []

        state = env.get_state()
        for i in range(opts.num_iters):
            possible_actions = env.get_possible_actions()

            action = agent.get_action_softmax(state, possible_actions,
                                              opts.temp)
            # action = agent.get_action_trexsoftmax(state, possible_actions, lower_bound, bisimulation.d_sa_final, i, opts.temp)
            next_state, reward, done, next_possible_states = env.step(action)
            if i < exp_check_thresh:
                count_matrix[state][action] = 1
            # env.render(agent.qvalues)

            next_state_possible_actions = env.get_possible_actions()
            agent.update(state, action, reward, next_state,
                         next_state_possible_actions, done)
            state = next_state

            c_reward += reward
            # pival_diff = match_actions(gt_agent, agent, env)
            if i % plot_freq == 0:
                avg_r = evaluate_mean_avg_reward(dummy_env, agent)
                exp_q = exploration_quality(gt_agent.qvalues, count_matrix)
                avg_reward_vec.append(avg_r)
                exp_q_vec.append(exp_q)
            if done:
                env.reset_state()
                # env.render(agent.qvalues)
                state = env.get_state()
                continue
        # for _ in range(100):
        #     env.render(agent.qvalues)
        timesteps = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
        tp_all.append(timesteps)
        avg_reward_all.append(avg_reward_vec)
        exp_q_all.append(exp_q_vec)
        pbar.update(1)
        count_matrix_avg += count_matrix

    temp_tp = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
    count_matrix_avg /= opts.num_seeds
    timesteps = np.asarray(tp_all)
    avg_reward_array = np.asarray(avg_reward_all)
    exp_q_array = np.asarray(exp_q_all)
    timesteps = timesteps.reshape((opts.num_seeds * temp_tp.shape[0]))
    avg_reward_all = avg_reward_array.reshape(
        (opts.num_seeds * temp_tp.shape[0]))
    exp_q_all = exp_q_array.reshape((opts.num_seeds * temp_tp.shape[0]))
    df = pd.DataFrame({
        'Timesteps': timesteps,
        'Mean Average Reward': avg_reward_all,
        'Exp Quality': exp_q_all
    })

    mean_avg_reward = np.mean(avg_reward_array, axis=0)
    exp_q = np.mean(exp_q_array, axis=0)
    tp = timesteps[:temp_tp.shape[0]]
    poly = np.polyfit(tp, mean_avg_reward, 5)
    poly_y = np.poly1d(poly)(tp)

    plt.subplot(2, 1, 1)
    plt.title('Softmax')
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Average Reward')
    plt.plot(tp, poly_y)

    plt.subplot(2, 1, 2)
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Exploration Quality')
    plt.plot(tp[:int(exp_check_thresh / plot_freq)],
             exp_q[:int(exp_check_thresh / plot_freq)])

    plt.savefig(plt_path)
    df.to_csv(df_path)
    np.save(policy_path, agent.qvalues)
    env.generate_heatmap(count_matrix, heatmap_path)
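Example #8 picks actions with `agent.get_action_softmax(state, possible_actions, opts.temp)`, whose implementation is not shown. As a hedged sketch of what temperature-scaled softmax (Boltzmann) exploration over the allowed actions typically looks like (the function name and signature here are illustrative, not the repository's code):

import numpy as np

def softmax_action(qvalues, state, possible_actions, temp=1.0, rng=np.random):
    """Sample an action with probability proportional to exp(Q(s, a) / temp)."""
    q = np.array([qvalues[state][a] for a in possible_actions], dtype=float)
    q -= q.max()                              # shift for numerical stability
    probs = np.exp(q / temp)
    probs /= probs.sum()
    return possible_actions[rng.choice(len(possible_actions), p=probs)]

# purely illustrative usage: low temperatures behave greedily, high ones explore
q_table = np.random.rand(10, 4)
action = softmax_action(q_table, state=3, possible_actions=[0, 1, 2, 3], temp=0.5)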
Example #9
                           type=str)
    argparser.add_argument(
        '-ma',
        '--match-action',
        action='store_true',
        dest='debug',
        help='Match actions with ground truths and generate plots')
    argparser.add_argument('-v',
                           '--verbose',
                           action='store_true',
                           dest='debug',
                           help='print debug information')

    args = argparser.parse_args()

    env = create_env(args.env_name, 712)
    agent = QLearningAgent(args.alpha, args.epsilon, args.discount,
                           env.action_space, env.state_space, env.tp_matrix,
                           env.blocked_positions, 712)
    agent.qvalues = np.load(
        os.path.join(args.policy_dir, args.env_name + '.npy'))

    avg_reward_vec = []
    # for _ in range(args.num_iters):
    avg_r = evaluate_mean_avg_reward(env, agent)
    print(avg_r)
    # avg_reward_vec.append(avg_r)
    avg_reward_vec = np.asarray(
        (avg_r)).repeat(int(args.num_iters / plot_freq))
    timesteps = np.arange(start=0, stop=args.num_iters, step=plot_freq)
    avg_reward_array = np.asarray(avg_reward_vec)
Example #10
        return history.history["loss"][0]

    def eval2target(self):
        self.model_target.set_weights(self.model_eval.get_weights())

    def save(self, filepath):
        self.model_eval.save(filepath)

    def load(self, filepath):
        self.model_eval = load_model(filepath)
        self.eval2target()


if __name__ == "__main__":
    # load the gym env
    env = create_env('MsPacman-ram-v0')
    # set random seeds to get reproducible results (recommended)
    set_random_seed(0)
    # get size of state and action from environment
    state_size = env.observation_space.shape[0] * env.observation_space.shape[1]
    action_size = env.action_space.n
    # create the agent
    agent = DQNAgent(state_size, action_size)
    if os.path.exists("dqn.h5"):
        agent.load("dqn.h5")
    # log the training result
    scores, episodes = [], []
    graph_episodes = []
    graph_score = []
    avg_length = 10
    sum_score = 0
Example #11
def train(bisimulation, opts):
    log_path = 'output_logs/Count-Based-Q-Logs/{}'.format(opts.env_name)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    curr_log_dir = os.path.join(log_path, str(datetime.datetime.now())[:-7])
    os.makedirs(curr_log_dir)
    with open(os.path.join(curr_log_dir, 'command.txt'), 'w') as f:
        json.dump(opts.__dict__, f, indent=2)

    lower_bound = np.zeros(
        (bisimulation.src_env.state_space, bisimulation.tgt_env.state_space))
    for t in range(bisimulation.tgt_env.state_space):
        for s in range(bisimulation.src_env.state_space):
            lower_bound[s, t] = np.max(bisimulation.src_agent.qvalues[s]
                                       ) - bisimulation.dist_matrix_final[s, t]

    plt_path = os.path.join(curr_log_dir,
                            opts.env_name + '_countbased_qlearn.png')
    policy_path = os.path.join(curr_log_dir,
                               opts.env_name + '_countbased_qlearn.npy')
    df_path = os.path.join(curr_log_dir,
                           opts.env_name + '_countbased_qlearn.csv')
    heatmap_path = os.path.join(curr_log_dir,
                                opts.env_name + '_explore_countbased.png')

    epsilon_bisim = 0.5
    seeds = np.arange(opts.num_seeds)
    cr_allvec = []
    tp_all = []
    pi_val_all = []
    avg_reward_all = []
    exp_q_all = []
    pbar = tqdm(total=opts.num_seeds)
    dummy_env = create_env(opts.env_name, 712)
    temp = 1

    count_matrix_avg = np.zeros(
        (dummy_env.state_space, dummy_env.action_space))
    for s in seeds:
        np.random.seed(s)
        random.seed(s)

        env = create_env(opts.env_name, s)
        action_space = env.action_space
        state_space = env.state_space
        tp_matrix = env.tp_matrix

        gt_agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount,
                                  action_space, state_space, tp_matrix,
                                  env.blocked_positions, s)
        gt_agent.qvalues = np.load(
            os.path.join(opts.policy_dir, opts.env_name + '.npy'))
        gt_policy_val = np.mean(gt_agent.qvalues)
        agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount,
                               action_space, state_space, tp_matrix,
                               env.blocked_positions, s)
        agent.qvalues = np.random.rand(state_space, action_space)

        cr_vec = []
        pi_val_vec = []
        exp_q_vec = []
        c_reward = 0
        avg_reward_vec = []

        count_matrix = np.zeros((state_space, action_space))
        count_matrix_t = np.zeros((state_space, action_space))
        state = env.get_state()
        for i in range(opts.num_iters):
            possible_actions = env.get_possible_actions()
            action = agent.get_action(state, possible_actions)
            # action = agent.get_action_trail(state, possible_actions, lower_bound, bisimulation.d_sa_final, i, epsilon_bisim, temp)
            count_matrix[state][action] += 1
            if i < exp_check_thresh:
                count_matrix_t[state][action] = 1
            next_state, reward, done, next_possible_states = env.step(action)
            c_reward += copy.deepcopy(reward)
            reward += opts.cb_beta / np.sqrt(count_matrix[state][action])
            # if opts.render:
            # env.render(agent.qvalues)

            next_state_possible_actions = env.get_possible_actions()
            agent.update(state, action, reward, next_state,
                         next_state_possible_actions, done)
            state = next_state

            if i % plot_freq == 0:
                avg_r = evaluate_mean_avg_reward(dummy_env, agent)
                exp_q = exploration_quality(gt_agent.qvalues, count_matrix_t)
                avg_reward_vec.append(avg_r)
                exp_q_vec.append(exp_q)

            if done:
                env.reset_state()
                # if opts.render:
                # env.render(agent.qvalues)
                state = env.get_state()
                continue

        timesteps = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
        tp_all.append(timesteps)
        avg_reward_all.append(avg_reward_vec)
        exp_q_all.append(exp_q_vec)
        pbar.update(1)
        # count_matrix_avg += count_matrix

    temp_tp = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
    # count_matrix_avg /= opts.num_seeds
    timesteps = np.asarray(tp_all)
    avg_reward_array = np.asarray(avg_reward_all)
    exp_q_array = np.asarray(exp_q_all)
    timesteps = timesteps.reshape((opts.num_seeds * temp_tp.shape[0]))
    avg_reward_all = avg_reward_array.reshape(
        (opts.num_seeds * temp_tp.shape[0]))
    exp_q_all = exp_q_array.reshape((opts.num_seeds * temp_tp.shape[0]))
    df = pd.DataFrame({
        'Timesteps': timesteps,
        'Mean Average Reward': avg_reward_all,
        'Exp Quality': exp_q_all
    })

    mean_avg_reward = np.mean(avg_reward_array, axis=0)
    exp_q = np.mean(exp_q_array, axis=0)
    tp = timesteps[:temp_tp.shape[0]]
    poly = np.polyfit(tp, mean_avg_reward, 5)
    poly_y = np.poly1d(poly)(tp)

    plt.subplot(2, 1, 1)
    plt.title('MBIE-EB')
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Average Reward')
    plt.plot(tp, poly_y)

    plt.subplot(2, 1, 2)
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Exploration Quality')
    plt.plot(tp[:int(exp_check_thresh / plot_freq)],
             exp_q[:int(exp_check_thresh / plot_freq)])

    plt.savefig(plt_path)
    df.to_csv(df_path)
    np.save(policy_path, agent.qvalues)
    env.generate_heatmap(count_matrix, heatmap_path)
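Example #11 adds a count-based exploration bonus, `opts.cb_beta / sqrt(N(s, a))`, to the environment reward in the style of MBIE-EB. The tiny standalone snippet below (with a made-up `beta`) just illustrates how that bonus shrinks as a state-action pair is revisited:

import numpy as np

beta = 0.2                                  # hypothetical cb_beta value
counts = np.array([1, 4, 25, 100, 10000])   # visit counts N(s, a)
for n, bonus in zip(counts, beta / np.sqrt(counts)):
    print("N(s,a) = %5d  ->  bonus = %.4f" % (n, bonus))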
Example #12
    def run(self):
        #self.global_model=self.global_model.to(self.device)
        if(self.args.model_type == "LSTM"):
            self.AC=ActorCritic_LSTM()
        else:
            self.AC=ActorCritic()

        #optimizer_to(self.optimizer,self.device)
        env = create_env(self.world, self.stage)
        state = env.reset()
        #state=state.reshape(1,1,80,80)
        state = state.to(self.device, dtype=torch.float)

        #state=self.imageProcess(state) 
        
        i_epoch=self.epoch

        done=True
        while True:
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
                
            h_0 = h_0.to(self.device)
            c_0 = c_0.to(self.device)

            Timestamp=50
            for i in range((Timestamp)):
                env.render()
                    
                p,value,h_0,c_0=self.AC(state,h_0,c_0)


                
                policy=F.softmax(p,dim=1)
                log_prob=F.log_softmax(p,dim=1)
                entropy=-(policy*log_prob).sum(1,keepdim=True)
                
                m=Categorical(policy)

                action=m.sample()
                next_state, reward, done, info = env.step(action.item())

                #reward=reward/15
                

                #next_state=next_state.view(1,1,80,80)
                next_state=(next_state).to(self.device,dtype=torch.float)
                

                
                #self.states.append(state)
                self.log_probs.append(log_prob[0,action])
                self.rewards.append(reward)
                self.values.append(value)
                self.entropies.append(entropy)
                
                state=next_state
                

                
                if(done):
                    state=(env.reset())
                    #state=state.reshape(1,1,80,80)
                    state=state.to(self.device)
                    #state=self.imageProcess(state)
                    break

            """
            actor_loss=0
            critic_loss=0
            returns=[]
            R=0
            for reward in self.rewards[::-1]:
                R=reward+self.GAMMA*R
                returns.insert(0,R)
            """
            #td=torch.tensor([1],dtype=torch.float).to(device)
            
            R = torch.zeros((1, 1), dtype=torch.float)
            if not done:
                _, R, _, _ = self.AC(state, h_0, c_0)

            R=R.to(self.device)
            actor_loss=0
            critic_loss=0
            entropy_loss=0
            advantage=torch.zeros((1, 1), dtype=torch.float)
            advantage=advantage.to(self.device)
            next_value=R
                
            for log_prob,reward,value,entropy in list(zip(self.log_probs,self.rewards,self.values,self.entropies))[::-1]:
                advantage=advantage*self.GAMMA
                advantage=advantage+reward+self.GAMMA*next_value.detach()-value.detach()
                next_value=value
                actor_loss=actor_loss+(-log_prob*advantage)
                R=R*self.GAMMA+reward
                critic_loss=critic_loss+(R-value)**2/2
                entropy_loss=entropy_loss+entropy

            
            total_loss=actor_loss+critic_loss-0.01*entropy_loss
            
            
            push_and_pull(self.optimizer, self.AC, self.global_model, total_loss)

            #for name, parms in self.C.named_parameters():	
            #print('-->name:', name, '-->grad_requirs:',parms.requires_grad,' -->grad_value:',parms.grad)

            
            if i_epoch % 10 == 0:
                print(self.name + " | Episode %d | Actor loss: %f | Critic loss: %f | Total loss: %f"
                      % (i_epoch, actor_loss.item(), critic_loss.item(), total_loss.item()))
            
            

            """
            y.append(critic_loss.item())
            x.append(i_epoch)
            plt.plot(x,y) # plot the curve
            plt.show() # display the figure
            """                    
            i_epoch+=1
            
            del self.log_probs[:]
            del self.rewards[:]
            del self.values[:]
            del self.entropies[:]
            
            if(self.save):
                if(i_epoch%100==0):
                    PATH='./model/{}/A3C_{}_{}.pkl'.format(self.level,self.level,self.args.model_type)
                    torch.save({
                                'epoch': i_epoch,
                                'model_state_dict': self.global_model.state_dict(),
                                'optimizer_state_dict': self.optimizer.state_dict(),
                                'loss': total_loss,
                                'type':self.args.model_type,
                                }, PATH)
            if(i_epoch==Max_epoch):
                return
Example #13
def run(args, server):
    env = create_env(client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variables whose names start with 'local' (local variables) are not saved in the checkpoint file
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()

    # Saver used to write the variables to the checkpoint file
    saver = FastSaver(variables_to_save)

    # Collect the trainable variables
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)

    logger.info('Trainable variables:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("初始化所有参数。")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])

    logdir = os.path.join(args.log_dir, 'train')
    # FileWriter for the TensorBoard log files
    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)

    logger.info("存储 TensorBoard 文件的目录: %s_%s", logdir, args.task)

    # A high-level wrapper (Supervisor) that handles saving TensorBoard logs,
    # checkpoint files, and so on
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,  # directory where checkpoint files are stored
        saver=saver,  # Saver used to write the checkpoint files
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,  # FileWriter that stores the TensorBoard log files
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    # Total number of global steps to run; adjustable
    num_global_steps = 100000000

    logger.info("启动会话中...")
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("在第 %d 步开始训练", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Stop all services
    sv.stop()
    logger.info('Reached %s steps; worker stopped.', global_step)
Example #14
# Memory params
pretrain_length = batch_size                                # number of transitions used to pre-fill the memory
memory_size = 50000                                         # replay memory size

### Preprocessing params
stack_size = 4                                              # number of frames stacked into one state
stacked_frames = deque([np.zeros((84, 84), dtype=int) for _ in range(stack_size)], maxlen=4)

##################################################
##################################################
##################################################

net = CNN()
mem = Memory(memory_size)
game, possible_actions = create_env()


# We need to pre-populate memory with taking some random actions
game.new_episode()

for i in range(pretrain_length):
    if i == 0:
        state = game.get_state().screen_buffer
        state = stack_frames(stacked_frames, state)  # assumed frame-stacking helper (see sketch below)

    # take random action
    action = random.choice(possible_actions)

    # observe reward from that action
    reward = game.make_action(action)
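Example #14 relies on a frame-stacking helper that is not included in the snippet. The sketch below is one possible shape for such a helper, assuming 84x84 grayscale frames and a deque that keeps the last `stack_size` frames; the `preprocess` and `stack_frames` names are illustrative, not the project's actual code:

import numpy as np
from collections import deque

def preprocess(frame, out_size=(84, 84)):
    """Crude grayscale conversion + nearest-neighbour resize using only NumPy."""
    if frame.ndim == 3:                                     # RGB -> grayscale
        frame = frame.mean(axis=2)
    rows = np.linspace(0, frame.shape[0] - 1, out_size[0]).astype(int)
    cols = np.linspace(0, frame.shape[1] - 1, out_size[1]).astype(int)
    return frame[np.ix_(rows, cols)].astype(np.float32) / 255.0

def stack_frames(stacked_frames, frame, new_episode=False, stack_size=4):
    """Append a processed frame and return the stacked (84, 84, stack_size) state."""
    frame = preprocess(frame)
    if new_episode:
        stacked_frames.clear()
        for _ in range(stack_size):                         # repeat the first frame of an episode
            stacked_frames.append(frame)
    else:
        stacked_frames.append(frame)                        # deque(maxlen=stack_size) drops the oldest
    return np.stack(stacked_frames, axis=2)

# purely illustrative usage
frames = deque([np.zeros((84, 84), dtype=np.float32) for _ in range(4)], maxlen=4)
state = stack_frames(frames, np.random.randint(0, 255, (240, 320, 3)), new_episode=True)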
Example #15
    def __init__(self, opts):
        self.opts = opts
        self.src_env = create_env(opts.src_env, 712)
        self.tgt_env = create_env(opts.tgt_env, 712)
        self.src_agent = QLearningAgent(alpha, epsilon, discount, 4,
                                        self.src_env.state_space,
                                        self.src_env.tp_matrix,
                                        self.src_env.blocked_positions, 712)
        self.tgt_agent = QLearningAgent(alpha, epsilon, discount, 4,
                                        self.tgt_env.state_space,
                                        self.tgt_env.tp_matrix,
                                        self.tgt_env.blocked_positions, 712)
        self.transferred_agent = QLearningAgent(alpha, epsilon, discount, 4,
                                                self.tgt_env.state_space,
                                                self.tgt_env.tp_matrix,
                                                self.tgt_env.blocked_positions,
                                                712)
        self.action_space = self.src_env.action_space
        self.solver = opts.solver
        self.lfp_iters = opts.lfp_iters
        self.threshold = opts.threshold
        self.discount_kd = opts.discount_kd
        self.discount_r = opts.discount_r
        if self.solver == 'lp':
            m = self.src_env.state_space
            n = self.tgt_env.state_space
            A_r = np.zeros((m, m, n))
            A_t = np.zeros((n, m, n))

            for i in range(m):
                for j in range(n):
                    A_r[i, i, j] = 1

            for i in range(n):
                for j in range(m):
                    A_t[i, j, i] = 1
            self.A = np.concatenate((A_r.reshape(
                (m, m * n)), A_t.reshape((n, m * n))),
                                    axis=0)

        self.src_possible_actions = self.src_env.get_possible_actions()
        self.tgt_possible_actions = self.tgt_env.get_possible_actions()

        # Initialize Q-Values
        self.src_agent.qvalues = np.load(
            os.path.join(opts.policy_dir, opts.src_env + '.npy'))
        self.tgt_agent.qvalues = np.load(
            os.path.join(opts.policy_dir, opts.tgt_env + '.npy'))

        # Distance and reward matrices
        self.d_sa_final = np.zeros(
            (self.src_env.state_space, self.action_space,
             self.tgt_env.state_space, self.action_space))
        self.dist_matrix_final = np.zeros(
            (self.src_env.state_space, self.tgt_env.state_space))
        self.reward_matrix_tmp = np.zeros(
            (self.src_env.state_space, self.action_space,
             self.tgt_env.state_space, self.action_space))
        self.reward_matrix = np.zeros(
            (self.src_env.state_space, self.tgt_env.state_space))
        self.init_reward_matix()
        self.accuracy = None
Example #16
def test(args,
         T,
         dqn,
         val_mem,
         skip_frames=1,
         evaluate=False,
         realtime=False,
         env=None):

    global Ts, rewards, Qs, best_avg_reward

    if env is None:
        env = create_env(args.environment_filename,
                         custom=False,
                         skip_frames=skip_frames,
                         realtime=realtime,
                         docker=args.docker_training,
                         worker_id=1,
                         device=args.device)
        own_env = True
    else:
        own_env = False

    Ts.append(T)
    T_rewards = []
    T_Qs = []

    # Test performance over several episodes
    done = True
    for _ in range(args.evaluation_episodes):
        while True:
            if done:
                state = env.reset()
                reward_sum = 0
                done = False

            action = dqn.act_e_greedy(state)  # Choose an action ε-greedily
            state, reward, done, _ = env.step(action)  # Step
            reward_sum += reward
            if args.render:
                env.render()

            if done:
                T_rewards.append(reward_sum)
                break

    if own_env:
        env.close()

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))

    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_Q = sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Append to results
        rewards.append(T_rewards)
        Qs.append(T_Qs)

        # Plot
        _plot_line(Ts, rewards, 'Reward', path='results')
        _plot_line(Ts, Qs, 'Q', path='results')

        # Save model parameters if improved
        if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
            dqn.save('results')

    # Return average reward and Q-value
    return avg_reward, avg_Q
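The evaluation loop above selects actions with `dqn.act_e_greedy(state)`. As a rough sketch of what an ε-greedy wrapper around a greedy policy usually does (this is not the repository's `Agent` code, and the names here are illustrative):

import random

def act_e_greedy(greedy_act, num_actions, state, epsilon=0.001):
    """With probability epsilon take a random action, otherwise act greedily."""
    if random.random() < epsilon:
        return random.randrange(num_actions)
    return greedy_act(state)

# purely illustrative usage with a dummy greedy policy over 4 actions
action = act_e_greedy(lambda s: 0, num_actions=4, state=None, epsilon=0.05)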