Example 1
def train_model(num_frames):
    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    train_results = results.results(globals())

    cumulative_frames = 0
    best_score = -50
    games = 0
    full_loss = []
    rewards = []
    while 1:
        state = env.reset()
        done = False
        cum_reward = 0
        cum_loss = []
        while not done:
            action = select_action(torch.tensor(np.array(state).reshape(-1, 4, HEIGHT, WIDTH)).to(device), cumulative_frames)

            next_state, reward, done, _ = env.step(action)

            memory.add(state, action, reward, next_state, done)  # store the transition in the replay buffer

            state = next_state
            if cumulative_frames % TRAIN_FREQUENCY == 0 and cumulative_frames > LEARNING_STARTS:
                loss = optimize_model(cumulative_frames)
                cum_loss.append(loss)
            
            cum_reward += reward
            cumulative_frames += 1
        
            if cumulative_frames % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())

        if best_score < cum_reward:
            best_score = cum_reward
        if len(cum_loss) == 0:
            full_loss.append(0)
        else:
            full_loss.append(np.mean(cum_loss))
        rewards.append(cum_reward)
        games += 1

        if games % 10 == 0:
            print("=============================================")
            print("Game: {} | Frame {}".format(games, cumulative_frames))
            print("Final reward: {}".format(cum_reward))
            print("Epsilon after: {}".format(EPSILON))
            print("Best High Score: {}".format(best_score))
            print("Avg Loss Last 100 games: {}".format(
                np.mean(full_loss[-100:])))
            print("Avg Reward Last 100 games: {}".format(
                np.mean(rewards[-100:])))

        train_results.record(cumulative_frames, games, EPSILON, cum_reward, full_loss[-1])

        if np.mean(rewards[-100:]) >= 18 and cumulative_frames > LEARNING_STARTS:
            break

    torch.save(target_net.state_dict(), PATH)
    train_results.close()
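
The snippet above relies on helpers defined elsewhere in its repo (select_action, optimize_model, memory, and the EPSILON/TRAIN_FREQUENCY constants). Below is a minimal epsilon-greedy sketch of what select_action could look like, assuming the globals policy_net, device and EPSILON from the snippet and a hypothetical N_ACTIONS constant and decay schedule; it is not the repo's verified code.

import random

import torch

N_ACTIONS = 6                   # hypothetical; e.g. env.action_space.n for Pong
EPS_START, EPS_END = 1.0, 0.02  # hypothetical annealing endpoints
EPS_DECAY_FRAMES = 1_000_000    # hypothetical annealing horizon
EPSILON = EPS_START


def select_action(state, cumulative_frames):
    """Epsilon-greedy action selection with a linearly annealed exploration rate."""
    global EPSILON
    fraction = min(1.0, cumulative_frames / EPS_DECAY_FRAMES)
    EPSILON = EPS_START + fraction * (EPS_END - EPS_START)
    if random.random() < EPSILON:
        # Explore: uniform random action.
        return random.randrange(N_ACTIONS)
    # Exploit: greedy action w.r.t. the policy network's Q-values.
    with torch.no_grad():
        return policy_net(state.float()).argmax(dim=1).item()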
Example 2
def record():
    '''
    This function generates a GIF of a single episode; the process may take some time.
    To watch uninterrupted gameplay instead, run the test() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"

    list_obs = []
    list_reward = []

    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]

    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)

    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")

        total_reward = 0
        obs = env.reset()
        while True:
            list_obs.append(obs)
            list_reward.append(total_reward)
            env.render()
            # Get observation.
            obs = (obs - obs_mean) / obs_std
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))

            # Interact with the environment.
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                list_obs.append(obs)
                list_reward.append(total_reward)
                break
    env.close()

    # Record the gameplay.
    imageio.mimsave(
        figure_path + "gameplay.gif",
        [plot_obs(obs, reward) for obs, reward in zip(list_obs, list_reward)],
        fps=30)
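
plot_obs is not shown in this excerpt. A minimal sketch of such a helper, assuming it simply renders the raw observation with the running reward as a title and returns an RGB frame for imageio (not the repo's verified code):

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np


def plot_obs(obs, reward):
    """Render one observation plus the cumulative reward and return an (H, W, 3) uint8 frame."""
    fig, ax = plt.subplots(figsize=(2, 3), dpi=100)
    ax.imshow(obs)
    ax.set_title("reward: {:.0f}".format(reward))
    ax.axis("off")
    fig.canvas.draw()
    # Convert the rendered canvas into a uint8 RGB array.
    frame = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    frame = frame.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)
    return frame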
Example 3
def test():
    '''
    This function visualizes gameplay. The environment resets immediately after each episode ends, and nothing is recorded.
    To record the gameplay, run the record() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"

    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]

    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)

    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")

        total_step = 0
        total_reward = 0
        while True:
            # Get observation.
            if total_step == 0:
                obs = env.reset()
            else:
                obs = obs_next
            obs = (obs - obs_mean) / obs_std
            env.render()
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))

            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                # Reset environment.
                print("Episodic reward: ", total_reward, sep="")
                obs_next = env.reset()
                total_reward = 0
            # Update step counter.
            total_step += 1
        env.close()
Example 4
def inference(episodes, model, env_name):
    env = make_atari(env_name)
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            time.sleep(0.05)
            env.render()
            observation = torch.tensor(np.array(observation).reshape(-1, 4, HEIGHT, WIDTH)).to(device)
            with torch.no_grad():
                action = model(observation).max(1)[1].item()
                observation, reward, done, _ = env.step(action)
                if reward != 0:
                    print(reward)


def policy_backward(eph, epdlogp):
    """ backward pass. (eph is array of intermediate hidden states) """
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0  # backprop relu
    dW1 = np.dot(dh.T, epx)
    return {'W1': dW1, 'W2': dW2}


if __name__ == '__main__':
    # hyperparameters
    H = 200  # number of hidden layer neurons
    batch_size = 10  # every how many episodes to do a param update?
    learning_rate = 1e-4
    gamma = 0.99  # discount factor for reward
    decay_rate = 0.99  # decay factor for RMSProp leaky sum of grad^2
    resume = False  # resume from previous checkpoint?  #!!!!!
    render = True  #!!!!!

    env = make_atari(sys.argv[1])
    num_actions = env.action_space.n
    env = wrap_deepmind(env)

    # model initialization
    D = 84 * 84  # input dimensionality: 84x84 grid
    if resume:
        print('RESUMING')  #!!!!!
        model = pickle.load(open('save.p', 'rb'))
    else:
        model = {}
        model['W1'] = np.random.randn(H, D) / np.sqrt(D)  # "Xavier" initialization
        model['W2'] = np.random.randn(num_actions, H) / np.sqrt(H)

    grad_buffer = {
Example 6
name = "pong_nn_0"
env_name = "PongNoFrameskip-v4"

if len(sys.argv) > 1:
    env_name = sys.argv[1]
    name = sys.argv[2]

mode_file = name + ".h5"
npy = name + ".npy"
frame_name = name + "_frames.npy"

max_frames = int(1e7)
windows = 50
learn_delay = 80000

env = make_atari(env_name)
env = wrap_deepmind(env)

print(env.action_space)
print(env.observation_space, env.observation_space.shape)
agent = DQN_NN(env.action_space.n)
fs = []
avg_reward = []
best_avg_reward = -math.inf
rs = deque(maxlen=windows)
frames = 0

if load:
    try:
        agent.q_network.load_weights(mode_file)
        agent.q_target.load_weights(mode_file)
Example 7
# run python -i test.py for testing stuff in shell
import torch
import numpy as np
import gym
from wrappers import make_atari, wrap_deepmind
from utils import LinearSchedule, Replay

env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'))
state = env.reset()
state = np.array(state)
r = Replay(50, 3, False)
for i in range(100):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    r.add(state, action, reward, next_state, done)
    state = next_state
s, a, r, ns, d = r.sample_tensor()
    return p, h  # return probability of taking action 2, and hidden state


def policy_backward(eph, epdlogp):
    """ backward pass. (eph is array of intermediate hidden states) """
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0  # backprop relu
    dW1 = np.dot(dh.T, epx)
    return {'W1': dW1, 'W2': dW2}


if __name__ == '__main__':
    start_time = time.time()

    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    observation = env.reset()
    prev_x = None  # used in computing the difference frame
    xs, hs, dlogps, drs = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 0

    reward_log = open('pg pv4w 1e-4 500 no' + str(sys.argv[1]) + '.txt', 'w')

    while (episode_number < 500):
        # if render: env.render()

        # preprocess the observation, set input to network to be difference image
        cur_x = observation.astype(np.float).ravel()
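
The excerpt is cut off here. Scripts in this style (the policy_backward above matches Karpathy's pg-pong) usually also define a discounted-return helper; a sketch of that standard helper follows, with gamma as in the hyperparameters and the Pong-specific reset on non-zero rewards (a generic reconstruction, not this repo's verified code):

import numpy as np


def discount_rewards(r, gamma=0.99):
    """Take a 1D array of per-step rewards and return the discounted returns."""
    discounted = np.zeros_like(r, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(r))):
        if r[t] != 0:
            running_add = 0.0  # Pong-specific: reset the return at point boundaries
        running_add = running_add * gamma + r[t]
        discounted[t] = running_add
    return discounted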
Example 9
def get_env():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    return env
Example 10
def get_env(env_id, frame_stack):
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=frame_stack)
    env = wrap_pytorch(env)
    return env
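
wrap_pytorch is not part of gym or baselines; in repos like these it is typically a small observation wrapper that moves the channel axis first so frames match PyTorch's NCHW layout. A sketch of that common pattern (not necessarily these repos' exact code):

import gym
import numpy as np


class ImageToPyTorch(gym.ObservationWrapper):
    """Transpose observations from HWC to CHW."""

    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255,
            shape=(old_shape[-1], old_shape[0], old_shape[1]),
            dtype=np.uint8)

    def observation(self, observation):
        return np.transpose(np.array(observation), (2, 0, 1))


def wrap_pytorch(env):
    return ImageToPyTorch(env)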
Example 11
def train():
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    # Create folders.
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.isdir(figure_path):
        os.makedirs(figure_path)

    # Get observation space and action space.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Estimate the mean and standard deviation of observations.
    env.reset()
    list_obs = []
    for _ in range(RANDOM_STEP):
        action = action_space.sample()
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
        list_obs.append(obs)
    obs_mean = np.mean(list_obs, 0)
    obs_std = np.mean(np.std(list_obs, 0))
    np.savez_compressed(save_path + "obs_mean_std",
                        obs_mean=obs_mean,
                        obs_std=obs_std)
    env.close()
    del env

    # Build models.
    dynamics = Dynamics(obs_space,
                        action_space,
                        auxiliary_task=AUXILIARY_TASK,
                        is_training=True)
    policy = Policy(obs_space, action_space, is_training=True)

    variables_initializer = tf.global_variables_initializer()

    # Create environments.
    par_env = ParallelEnvironment(
        [make_atari(ENV_NAME) for _ in range(NUM_ENV)])

    with tf.Session() as sess:
        # Initialize variables.
        sess.run(variables_initializer)

        saver_dynamics = tf.train.Saver(dynamics.trainable_variables)
        saver_policy = tf.train.Saver(policy.trainable_variables)

        # Initialize the running estimate of rewards.
        sum_reward = np.zeros(NUM_ENV)
        reward_mean = 0.0
        reward_std = 1.0
        reward_count = 0

        # Initialize the counters.
        total_rollout_step = 0
        total_update_step = 0
        total_frame = 0

        # Initialize the recording of highest rewards.
        done_first = np.zeros(NUM_ENV)
        sum_ext_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
        list_highest_reward = []

        num_batch = int(np.ceil(NUM_ENV / BATCH_SIZE))

        # Each iteration of this loop performs one rollout: interact with the environments for ROLLOUT_STEP steps, then update the networks.
        while total_frame < MAX_FRAME:
            # Initialize buffers.
            buffer_obs = np.zeros(
                (NUM_ENV, ROLLOUT_STEP + 1, *obs_space.shape))
            buffer_action = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_ext_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_done = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_log_prob = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_v = np.zeros((NUM_ENV, ROLLOUT_STEP + 1))
            buffer_int_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_sum_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_adv = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_v_target = np.zeros((NUM_ENV, ROLLOUT_STEP))

            # Interact with the environment for ROLLOUT_STEP steps.
            for step in range(ROLLOUT_STEP):
                # Get observation.
                if total_frame == 0:
                    obs = par_env.reset()
                else:
                    obs, _, _, _ = par_env.get_last_response()
                obs = (obs - obs_mean) / obs_std
                # Sample action.
                action, log_prob, v = sess.run(
                    [policy.sampled_action, policy.sampled_log_prob, policy.v],
                    feed_dict={policy.Obs: np.expand_dims(obs, 1)})
                action = np.squeeze(action, 1)
                log_prob = np.squeeze(log_prob, 1)
                v = np.squeeze(v, 1)

                # Interact with the environment.
                obs_next, extrinsic_reward, done, _ = par_env.step(action)

                # Update buffers.
                buffer_obs[:, step] = obs
                buffer_action[:, step] = action
                buffer_ext_reward[:, step] = extrinsic_reward
                buffer_done[:, step] = done
                buffer_log_prob[:, step] = log_prob
                buffer_v[:, step] = v

                if step == ROLLOUT_STEP - 1:
                    # Extra operations for the last time step.
                    obs_next = (obs_next - obs_mean) / obs_std
                    v_next = sess.run(
                        policy.v,
                        feed_dict={policy.Obs: np.expand_dims(obs_next, 1)})
                    v_next = np.squeeze(v_next, 1)
                    buffer_obs[:, step + 1] = obs_next
                    buffer_v[:, step + 1] = v_next

                # Update frame counter.
                total_frame += NUM_ENV

            # Get the highest reward.
            for step in range(ROLLOUT_STEP):
                done_prev = done_first if step == 0 else buffer_done[:, step - 1]
                sum_ext_reward[:, step] = (buffer_ext_reward[:, step] +
                                           (1 - done_prev) * sum_ext_reward[:, step - 1])
            done_first[:] = buffer_done[:, ROLLOUT_STEP - 1]
            highest_reward = np.amax(sum_ext_reward)
            list_highest_reward.append(highest_reward)

            # Compute the intrinsic reward.
            buffer_int_reward[:] = sess.run(
                dynamics.intrinsic_reward,
                feed_dict={
                    dynamics.Obs: buffer_obs[:, :-1],
                    dynamics.ObsNext: buffer_obs[:, 1:],
                    dynamics.Action: buffer_action
                })
            # The total reward is a mixture of extrinsic reward and intrinsic reward.
            buffer_reward[:] = (COEF_EXT_REWARD * np.clip(buffer_ext_reward, -1.0, 1.0) +
                                COEF_INT_REWARD * buffer_int_reward)

            # Normalize reward by dividing it by a running estimate of the standard deviation of the sum of discounted rewards.
            # 1. Compute the sum of discounted rewards.
            for step in range(ROLLOUT_STEP):
                sum_reward = buffer_reward[:, step] + GAMMA * sum_reward
                buffer_sum_reward[:, step] = sum_reward
            # 2. Compute mean and standard deviation of the sum of discounted rewards.
            reward_batch_mean = np.mean(buffer_sum_reward)
            reward_batch_std = np.std(buffer_sum_reward)
            reward_batch_count = np.size(buffer_sum_reward)
            # 3. Update the running estimate of standard deviation.
            reward_mean, reward_std, reward_count = average_mean_std(
                reward_mean, reward_std, reward_count, reward_batch_mean,
                reward_batch_std, reward_batch_count)
            # 4. Normalize reward.
            buffer_reward = buffer_reward / reward_std

            # Compute advantage.
            # - gae_adv_t = sum((gamma * lambda)^i * adv_(t+i)) over i in [0, inf)
            # - adv_t = r_t + gamma * v_(t+1) - v_t
            adv = buffer_reward + GAMMA * buffer_v[:, 1:] - buffer_v[:, :-1]
            sum_adv = np.zeros(NUM_ENV)
            for step in range(ROLLOUT_STEP - 1, -1, -1):
                sum_adv = adv[:, step] + GAMMA * LAMBDA * sum_adv
                buffer_adv[:, step] = sum_adv

            # Compute target value.
            buffer_v_target[:] = buffer_adv + buffer_v[:, :-1]

            # Normalize advantage with zero mean and unit variance.
            adv_mean = np.mean(buffer_adv)
            adv_std = np.std(buffer_adv)
            buffer_adv = (buffer_adv - adv_mean) / adv_std

            # Update networks.
            for epoch in range(EPOCH):
                random_id = np.arange(NUM_ENV)
                np.random.shuffle(random_id)
                for i in range(num_batch):
                    batch_id = random_id[i * BATCH_SIZE:
                                         np.minimum(NUM_ENV, (i + 1) * BATCH_SIZE)]
                    _, auxiliary_loss, dyna_loss = sess.run(
                        [
                            dynamics.train_op, dynamics.auxiliary_loss,
                            dynamics.dyna_loss
                        ],
                        feed_dict={
                            dynamics.Obs: buffer_obs[batch_id, :-1],
                            dynamics.ObsNext: buffer_obs[batch_id, 1:],
                            dynamics.Action: buffer_action[batch_id]
                        })
                    _, value_loss, pg_loss, entropy_loss = sess.run(
                        [
                            policy.train_op, policy.value_loss, policy.pg_loss,
                            policy.entropy_loss
                        ],
                        feed_dict={
                            policy.Obs: buffer_obs[batch_id, :-1],
                            policy.Action: buffer_action[batch_id],
                            policy.Adv: buffer_adv[batch_id],
                            policy.VTarget: buffer_v_target[batch_id],
                            policy.LogProbOld: buffer_log_prob[batch_id]
                        })
                    total_update_step += 1

            # Update rollout step.
            total_rollout_step += 1

            # Print statistics (the losses shown are from the last update step of this rollout).
            print("Rollout Step ",
                  total_rollout_step,
                  ", Total Frame ",
                  total_frame,
                  ", Update Step ",
                  total_update_step,
                  ":",
                  sep="")
            print("  Auxiliary Loss = ",
                  format(auxiliary_loss, ".6f"),
                  ", Dynamics Loss = ",
                  format(dyna_loss, ".6f"),
                  ", Value Loss = ",
                  format(value_loss, ".6f"),
                  ", Policy Loss = ",
                  format(pg_loss, ".6f"),
                  sep="")
            print("  Highest Reward = ", highest_reward, sep="")

            if total_rollout_step % AUTOSAVE_STEP == 0:
                # Save network parameters.
                saver_dynamics.save(sess, save_path + "dynamics")
                saver_policy.save(sess, save_path + "policy")
                # Plot reward.
                interval = NUM_ENV * ROLLOUT_STEP
                list_frame = list(
                    range(interval, (total_rollout_step + 1) * interval,
                          interval))
                plot_reward(list_frame, list_highest_reward, figure_path)

        # Save network parameters.
        saver_dynamics.save(sess, save_path + "dynamics")
        saver_policy.save(sess, save_path + "policy")
        # Plot reward.
        interval = NUM_ENV * ROLLOUT_STEP
        list_frame = list(range(interval, total_frame + interval, interval))
        plot_reward(list_frame, list_highest_reward, figure_path)
    par_env.close()
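
average_mean_std is not shown in the excerpt. A plausible implementation, inferred only from how it is called above: merge the running mean/std/count with the batch statistics using the parallel-variance (Chan et al.) formula.

import numpy as np


def average_mean_std(mean_a, std_a, count_a, mean_b, std_b, count_b):
    """Combine two (mean, std, count) summaries into one."""
    count = count_a + count_b
    delta = mean_b - mean_a
    mean = mean_a + delta * count_b / count
    m2 = (std_a ** 2) * count_a + (std_b ** 2) * count_b \
        + (delta ** 2) * count_a * count_b / count
    std = np.sqrt(m2 / count)
    return mean, std, count

With the initial reward_count = 0 used in the training loop, this reduces to the first batch's statistics, which matches how the running estimate is initialized.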
Example 12
        if render:
            time_to_sleep = wait_time - (time.time() - start_time)
            if time_to_sleep > 0:
                time.sleep(time_to_sleep)

    return total_reward


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Render on graphics card(cuda:0).")
    parser.add_argument("--env",
                        default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    parser.add_argument("-m", "--model", help="DQN")
    args = parser.parse_args()

    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")

    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, False, False, True)  # positional flags: presumably episode_life=False, clip_rewards=False, frame_stack=True

    net = model.DQN(4, env.action_space.n).to(device)
    net.load_state_dict(torch.load(args.model))

    score = play(env, net, True, device)
    print(f"Score: {score}")
Example 13

def select_action(state, number_actions):
    eps = random.random()

    if eps < epsilon:
        action = random.randrange(number_actions)
    else:
        state_tensor = torch.from_numpy(state).to(device, torch.float32).unsqueeze(0)
        score = net(state_tensor)
        action = score.max(dim=1)[1].to(torch.int64).item()
    return action


# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)
env = gym.wrappers.Monitor(env,
                           directory='./movie',
                           force=True,
                           video_callable=lambda x: True)
number_actions = env.action_space.n

# Build the Q-network and load the trained weights
input_shape = env.reset().shape
net = QNet(input_shape, number_actions)
net.load_state_dict(torch.load(model))
net.eval().to(device)

for episode in range(10):
    state = env.reset()
Example 14
def train_dqn(env_name,
              save_path,
              double=False,
              dueling=False,
              notebook=False):
    env = wrap_deepmind(make_atari(env_name))
    num_actions = env.action_space.n
    print('Num actions: {}'.format(num_actions))
    if dueling:
        model = DuelingNet(out_size=num_actions)
        target_model = DuelingNet(out_size=num_actions)
    else:
        model = DQN(out_size=num_actions)
        target_model = DQN(out_size=num_actions)
    criterion = nn.SmoothL1Loss()
    print('Created models')

    cuda = False
    if torch.cuda.is_available():
        cuda = True
        model = model.cuda()
        target_model = target_model.cuda()
        print('GPU: {}'.format(torch.cuda.get_device_name(0)))

    model.apply(init_weights)
    target_model.apply(init_weights)
    optimizer = optim.Adam(model.parameters())  #, lr=0.00001)
    print('Initialized models')

    schedule = LinearSchedule(P.start_eps, P.end_eps, P.steps_eps)
    replay = Replay(P.replay_size, P.batch_size, cuda)
    state = env.reset()
    num_updates = 0
    eps_reward = 0
    rewards = []
    losses = []
    # populate replay with random policy
    print('Populating replay')
    for i in tqdm(range(P.replay_start_size), desc='Populating replay'):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        replay.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()
    print('Starting training')
    state = env.reset()
    last_eps = 0  # index of the first logged reward/loss not yet included in the printed averages
    for i in tqdm(range(P.num_steps), desc='Total steps'):
        if schedule.choose_random():
            action = env.action_space.sample()
        else:
            model_input = torch.from_numpy(np.array(state)[None, :]).type(
                torch.FloatTensor)
            if cuda:
                model_input = model_input.cuda()
            q_values = model(model_input)
            action = int(q_values.argmax(1)[0])
        next_state, reward, done, _ = env.step(action)
        eps_reward += reward
        replay.add(state, action, reward, next_state, done)
        state = next_state
        if i % P.update_freq == 0:
            loss = compute_loss(replay, optimizer, model, target_model,
                                P.gamma, criterion, double)
            num_updates += 1
            if num_updates % P.target_update_freq == 0:
                target_model.load_state_dict(model.state_dict())
        if done:
            rewards.append(eps_reward)
            losses.append(loss.item())
            eps_reward = 0
            state = env.reset()
        if i % P.print_every == 0 and i > 0:
            print('Step: {}'.format(i))
            print('Average episode reward: {}'.format(
                sum(rewards[last_eps:]) / len(rewards[last_eps:])))
            print('Loss: {}'.format(
                sum(losses[last_eps:]) / len(losses[last_eps:])))
            last_eps = len(losses)
        if i % P.plot_every == 0 and i > 0:
            plot(i, rewards, losses, notebook, save_path)
            # if i % P.save_every == 0 and i > 0:
            torch.save(model, 'experiments/{}/{}_model'.format(save_path, i))
            pickle.dump(
                losses,
                open("experiments/{}/{}_losses.p".format(save_path, i), "wb"))
            pickle.dump(
                rewards,
                open("experiments/{}/{}_rewards.p".format(save_path, i), "wb"))
Example 15
    config = Config()
    config.env = args.env
    config.gamma = 0.99
    config.epsilon = 1
    config.epsilon_min = 0.01
    config.eps_decay = 30000
    config.frames = 2000000
    config.learning_rate = 1e-4
    config.max_buff = 300000
    config.update_interval = 2000
    config.batch_size = 32
    config.print_interval = 5000
    config.checkpoint_interval = 50000

    # wrap the env
    env = make_atari(config.env)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    config.action_dim = env.action_space.n
    config.state_shape = env.observation_space.shape

    if args.train:
        agent = DDQNAgent(config)
        trainer = Trainer(agent, env, config)
        trainer.train()

    elif args.test:
        agent = DDQNAgent(config, training=False)
        tester = Tester(agent, env, args.model_path)
        tester.test()
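
The snippet above reads args.env, args.train, args.test and args.model_path; a plausible argparse setup covering those attributes (names inferred from the snippet, not verified against the repo):

import argparse

parser = argparse.ArgumentParser(description="DDQN on Atari")
parser.add_argument("--train", action="store_true", help="run training")
parser.add_argument("--test", action="store_true", help="run evaluation")
parser.add_argument("--env", default="PongNoFrameskip-v4", help="gym environment id")
parser.add_argument("--model_path", default=None, help="checkpoint to load for testing")
args = parser.parse_args()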
Example 16
def main():
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    observation_space = env.observation_space.shape
    action_space = env.action_space.n

    model = CnnDQN(observation_space, action_space)

    if USE_CUDA:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters())

    replay_buffer = ReplayBuffer(1000)

    batch_size = 32
    gamma = 0.99
    replay_initial = 100
    num_frames = 14000

    losses = []
    all_rewards = []
    x_axis1 = []
    x_axis2 = []
    episode_reward = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000

    # Decay the exploration rate as the frame index increases
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    state = env.reset()

    for frame_idx in range(1, num_frames + 1):
        # Render the game window
        env.render()
        epsilon = epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            x_axis1.append(frame_idx)
            all_rewards.append(episode_reward)
            episode_reward = 0

        if frame_idx+1 > replay_initial:
            loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
            x_axis2.append(frame_idx)
            losses.append(np.array(loss.data.cpu()))



        if frame_idx % 100 == 0:
            plt.figure(1)
            plt.subplot(121)
            plt.plot(x_axis1, all_rewards)
            plt.subplot(122)
            plt.plot(x_axis2, losses)
            plt.show()

    env.close()
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000
    num_frames = 1000000
    batch_size = 32
    learning_rate = 0.0001

    # create environment
    # env_id = "PongNoFrameskip-v4"
    # env_id = 'SpaceInvadersNoFrameskip-v4'
    # env_id = 'MsPacmanNoFrameskip-v4'
    # env_id = 'VideoPinballNoFrameskip-v4'
    # env_id = 'MontezumaRevengeNoFrameskip-v4'
    # env_id = 'QbertNoFrameskip-v4'
    env_id = sys.argv[1]
    env    = make_atari(env_id)
    # env = gym.wrappers.Monitor(env, 'stats', video_callable=lambda episode_id: False, force=True, resume=False)
    env    = wrap_deepmind(env)
    env    = wrap_pytorch(env)

    # create networks
    current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
    target_model  = CnnDQN(env.observation_space.shape, env.action_space.n)
    if USE_CUDA:
        current_model = current_model.cuda()
        target_model  = target_model.cuda()

    # setup optimizer
    optimizer = optim.Adam(current_model.parameters(), lr = learning_rate)

    # initialize replay memory
Example 18
                s = env.reset()
    return rewards


if __name__ == '__main__':
    logger.configure(f'{C.env_id}/logs_{time_stamp}')
    for k, v in C._asdict().items():
        logger.record_tabular(k, v)
    logger.dump_tabular()
    max_reward = tf.placeholder(tf.float32, name='max_reward')
    mean_reward = tf.placeholder(tf.float32, name='mean_reward')
    max_summary = tf.summary.scalar('max_rew', max_reward)
    mean_summary = tf.summary.scalar('mean_rew', mean_reward)

    with create_session(0) as sess:
        eval_env = make_atari(C.env_id, 113, 'eval')()
        envs = SubprocVecEnv(
            [make_atari(C.env_id, r + 1, 'train') for r in range(4)])
        model = Model(eval_env.observation_space.shape,
                      eval_env.action_space.n)
        runner = Runner(envs, model.policy, nb_rollout=C.nb_rollout)
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(
            './{}/summary/{}'.format(C.env_id, time_stamp), sess.graph)

        for i in range(C.iterations):
            if i % C.eval_freq == 0:
                rewards = evaluate(eval_env, model.policy, C.eval_episodes)
                logger.log(
                    f'Step: {i} | Max reward: {np.max(rewards)} | Mean reward: {np.mean(rewards):.2f} | Std: {np.std(rewards):.2f}'
                )
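
The evaluate() function is truncated at the top of this example; only its final s = env.reset() and return rewards survive. Below is a sketch consistent with that tail and with the call evaluate(eval_env, model.policy, C.eval_episodes); how the policy is queried is an assumption, here it is treated as a callable mapping a batched observation to an action.

import numpy as np


def evaluate(env, policy, episodes):
    """Roll out the given number of episodes on env and return the list of episodic rewards."""
    rewards = []
    for _ in range(episodes):
        s = env.reset()
        done, total = False, 0.0
        while not done:
            a = policy(np.array(s)[None])  # assumed policy interface
            s, r, done, _ = env.step(a)
            total += r
        rewards.append(total)
        s = env.reset()
    return rewards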