Example #1
def make_env():
    env = gym.make(env_id)
    if record_video:
        video_path = os.path.join(output_dir, 'video')
        ensure_dir(video_path)
        env = Monitor(env, video_path, video_callable=lambda episode_id: episode_id % record_video_freq == 0, force=True)
    return env
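
For context, here is a minimal, self-contained sketch of how a factory like make_env is typically used together with the legacy gym.wrappers.Monitor; the environment id 'CartPole-v1' and the /tmp output directory are placeholders, not values taken from the example above.

import os
import gym
from gym.wrappers import Monitor

env_id = 'CartPole-v1'          # placeholder environment id
output_dir = '/tmp/experiment'  # placeholder output directory
record_video_freq = 10

video_path = os.path.join(output_dir, 'video')
os.makedirs(video_path, exist_ok=True)  # stand-in for the ensure_dir helper

env = Monitor(gym.make(env_id), video_path,
              video_callable=lambda episode_id: episode_id % record_video_freq == 0,
              force=True)

obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()
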
Example #2
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    if record_video:
        video_path = os.path.join(output_dir, 'video/env-%d' % rank)
        ensure_dir(video_path)
        env = Monitor(env, video_path, video_callable=lambda episode_id: episode_id % record_video_freq == 0, force=True)
    return wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False)
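
As a rough sketch of how the per-rank thunk above is consumed (assuming OpenAI baselines is installed; the environment id 'BreakoutNoFrameskip-v4' and the worker count are placeholder choices), SubprocVecEnv expects a list of zero-argument callables so that each worker process builds and seeds its own environment.

from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(env_id, seed, rank):
    def _thunk():
        env = make_atari(env_id)
        env.seed(seed + rank)  # distinct seed per worker
        return wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False)
    return _thunk

env = SubprocVecEnv([make_env('BreakoutNoFrameskip-v4', seed=0, rank=i) for i in range(4)])
obs = env.reset()
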
Example #3
def make_env():
    env = gym.make(env_id)
    if record_video:
        print("RECORDING VIDEO")
        video_path = os.path.join(output_dir, 'video')
        ensure_dir(video_path)
        env = Monitor(env, video_path, video_callable=lambda episode_id: episode_id % record_video_freq == 0, force=True)
        # env.render()
    return env
Example #4
def run(env_id, model_path, record_video, video_path=None):

    env = make_atari(env_id)
    env = wrap_deepmind(env,
                        episode_life=True,
                        clip_rewards=True,
                        frame_stack=False)

    num_env = 1
    ob_space = env.observation_space
    ac_space = env.action_space

    obs = np.zeros((num_env, 84, 84, 4), dtype=np.uint8)
    next_obs = env.reset()
    obs = update_obs(obs, next_obs)

    ep = 1
    steps = 0
    total_reward = 0

    with tf.Session() as sess:

        print('Loading Model %s' % model_path)
        policy = CnnPolicy(sess, ob_space, ac_space, nbatch=1, nsteps=1)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        ts = ts_rand()
        if record_video:
            ensure_dir(video_path)
            video_recorder = VideoRecorder(env,
                                           path=ep_video_path(
                                               video_path, ts, env_id, ep))

        while True:
            env.render()

            if record_video:
                video_recorder.capture_frame()

            actions, values, _ = policy.step(obs)
            value = values[0]
            steps += 1
            next_obs, rewards, dones, info = env.step(actions)
            total_reward += rewards
            print('%d: total_reward=%f value=%f' % (steps, total_reward, value))
            obs = update_obs(obs, next_obs)

            if dones:
                print('DONE')
                ep += 1
                steps = 0
                total_reward = 0
                next_obs = env.reset()
                obs = np.zeros((num_env, 84, 84, 4), dtype=np.uint8)
                obs = update_obs(obs, next_obs)

                if record_video:
                    video_recorder.close()
                    video_recorder = VideoRecorder(env,
                                                   path=ep_video_path(
                                                       video_path, ts, env_id,
                                                       ep),
                                                   enabled=record_video)
                time.sleep(2)
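
Example #4 relies on an update_obs helper that is not shown; a plausible frame-stacking implementation (a guess for illustration only, the project's own helper may differ) would shift the four stacked 84x84 grayscale frames and write the newest frame into the last channel.

import numpy as np

def update_obs(obs, next_obs):
    # Hypothetical sketch: obs has shape (num_env, 84, 84, 4); drop the oldest
    # frame and place the newest 84x84 frame in the last channel.
    obs = np.roll(obs, shift=-1, axis=3)
    obs[..., -1] = np.asarray(next_obs).squeeze()
    return obs
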
Example #5
def learn(env, env_id, num_env, total_timesteps, output_dir,
          cuda_visible_devices, gpu_memory_fraction, load_model):

    valid_actions = valid_atari_actions(env, env_id)
    num_actions = len(valid_actions)

    report_summary_freq = 100
    save_model_freq = 2000

    num_steps = 5
    batch_size = num_env * num_steps

    model_path = os.path.join(output_dir, 'model')
    summary_path = os.path.join(output_dir, 'summary')

    ensure_dir(summary_path)
    ensure_dir(model_path)

    sess = create_session(cuda_visible_devices, gpu_memory_fraction)

    model = Model(sess,
                  num_env,
                  num_steps,
                  num_actions,
                  total_timesteps=total_timesteps)
    runner = Runner(env, num_env, model, valid_actions, num_steps=num_steps)

    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(summary_path)

    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

    timesteps = 0
    max_updates = total_timesteps // batch_size
    print("Number of updates: %d" % max_updates)

    for update in range(1, max_updates + 1):

        obs, rewards, actions, values = runner.run()

        timesteps = update * batch_size

        policy_loss, value_loss, policy_entropy, cur_lr = model.train(
            obs, rewards, actions, values)

        if update % report_summary_freq == 0 and update != 0:

            mean_reward = safe_mean(runner.running_rewards.copy())
            mean_steps = safe_mean(runner.running_steps.copy())
            mean_value = safe_mean(runner.running_values.copy())

            print("Updates: %d" % update)
            print("Timesteps: %d" % timesteps)
            print("Learn rate: %f" % cur_lr)
            print("Policy loss: %f" % float(policy_loss))
            print("Value loss: %f" % float(value_loss))
            print("Running rewards: %s" % runner.running_rewards)
            print("Mean reward: %s" % mean_reward)
            print("Mean steps: %s" % mean_steps)
            print("Mean values: %s" % mean_value)

            train_summary = tf.Summary()
            train_summary.value.add(tag='Train/Timesteps',
                                    simple_value=timesteps)
            train_summary.value.add(tag='Train/Policy loss',
                                    simple_value=policy_loss)
            train_summary.value.add(tag='Train/Policy entropy',
                                    simple_value=policy_entropy)
            train_summary.value.add(tag='Train/Value loss',
                                    simple_value=value_loss)
            train_summary.value.add(tag='Train/Mean steps',
                                    simple_value=mean_steps)
            train_summary.value.add(tag='Train/Mean reward',
                                    simple_value=mean_reward)
            train_summary.value.add(tag='Train/Mean values',
                                    simple_value=mean_value)
            summary_writer.add_summary(train_summary, update)
            summary_writer.flush()

        if update % save_model_freq == 0 and update != 0:
            print('Save model')
            saver.save(sess, model_path + '/model-' + str(update) + '.ckpt')
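
Example #5 also assumes a safe_mean helper; a minimal hypothetical sketch that guards against empty running-statistics buffers could look like this.

import numpy as np

def safe_mean(xs):
    # Hypothetical helper: return 0.0 instead of a NaN/warning while the
    # running reward, step, and value buffers are still empty.
    return float(np.mean(xs)) if len(xs) > 0 else 0.0
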
Example #6
def learn(policy,
          env,
          nsteps,
          sess,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=50,
          cuda_visible_devices='0',
          gpu_memory_fraction=0.5,
          output_dir=None,
          vec_normalize=None):

    # TODO DRY
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = Model(policy=policy,
                  sess=sess,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    model_path = os.path.join(output_dir, 'model')
    summary_path = os.path.join(output_dir, 'summary')
    ensure_dir(summary_path)
    ensure_dir(model_path)

    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(summary_path)

    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, ep_info = runner.run()

        mblossvals = []

        inds = np.arange(nbatch)
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            for start in range(0, nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            print('')
            print("nupdates", update)
            print("serial_timesteps", update * nsteps)
            print("total_timesteps", update * nbatch)
            print("fps", fps)
            print("explained_variance", float(ev))
            print('mean_episode_reward', ep_info['ep_mean_reward'])
            print('mean_episode_length', ep_info['ep_mean_length'])
            print('time_elapsed', tnow - tfirststart)

            policy_loss = lossvals[0]
            value_loss = lossvals[1]
            policy_entropy = lossvals[2]
            approxkl = lossvals[3]
            clipfrac = lossvals[4]

            print("policy_loss", policy_loss)
            print("value_loss", value_loss)

            train_summary = tf.Summary()
            train_summary.value.add(tag='Train/Episode Reward',
                                    simple_value=ep_info['ep_mean_reward'])
            train_summary.value.add(tag='Train/Episode Length',
                                    simple_value=ep_info['ep_mean_length'])
            train_summary.value.add(tag='Train/FPS', simple_value=fps)
            train_summary.value.add(tag='Train/Policy Loss',
                                    simple_value=policy_loss)
            train_summary.value.add(tag='Train/Value Loss',
                                    simple_value=value_loss)
            summary_writer.add_summary(train_summary, update)
            summary_writer.flush()

        if update % save_interval == 0 and update != 0:
            print('Save model: %s' % model_path)
            saver.save(
                sess, os.path.join(model_path,
                                   'model-' + str(update) + '.ckpt'))
            if vec_normalize:
                # save observation scaling inside VecNormalize
                vec_normalize.snapshot(
                    os.path.join(model_path,
                                 'vec_normalize-' + str(update) + '.pickle'))

    env.close()
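
The lr and cliprange arguments in Example #6 may be constants or schedules; a small sketch of the constfn wrapper the code assumes and of the linear annealing driven by frac (the base rate and update count below are placeholders).

def constfn(val):
    # Wrap a constant so that constants and schedules share the signature f(frac).
    def f(_):
        return val
    return f

lr = lambda frac: 2.5e-4 * frac   # linear decay; 2.5e-4 is a placeholder base rate
nupdates = 100
for update in range(1, nupdates + 1):
    frac = 1.0 - (update - 1.0) / nupdates  # decreases from 1.0 towards 0.0
    lrnow = lr(frac)
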
Example #7
def run(env_id, model_path, record_video, video_path=None):

    if env_id.startswith('Roboschool'):
        import roboschool

    gym_env = gym.make(env_id)

    def make_env():
        return gym_env

    dummy_vec_env = DummyVecEnv([make_env])
    vec_normalize = VecNormalize(dummy_vec_env)
    vec_env = vec_normalize
    ob_space = vec_env.observation_space
    ac_space = vec_env.action_space

    window = PygletWindow()

    obs = vec_env.reset()

    ep = 1
    steps = 0
    total_reward = 0

    with tf.Session() as sess:

        policy = MlpPolicy(sess, ob_space, ac_space, nbatch=1, nsteps=1)

        print('Loading Model {}'.format(model_path))
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(model_path)
        print(model_path)
        print(ckpt)
        saver.restore(sess, ckpt.model_checkpoint_path)

        vec_norm_path = last_vec_norm_path(model_path)
        print('Loading VecNormalize state %s' % vec_norm_path)
        vec_normalize.restore(vec_norm_path)

        ts = ts_rand()
        if record_video:
            ensure_dir(video_path)
            video_recorder = VideoRecorder(gym_env,
                                           path=ep_video_path(
                                               video_path, ts, env_id, ep))

        while True:
            actions, values, _ = policy.step(obs)

            img = gym_env.render("rgb_array")
            window.imshow(img)

            if record_video:
                video_recorder.capture_frame()

            if not window.still_open:
                if record_video:
                    video_recorder.close()
                break

            value = values[0]
            steps += 1
            obs, rewards, dones, info = vec_env.step(actions)

            total_reward += rewards
            print('%d: reward=%f value=%f total_reward=%f' %
                  (steps, rewards[0], value, total_reward))
            if dones[0]:
                print('Episode %d finished' % ep)
                ep += 1
                steps = 0
                total_reward = 0

                window.close()
                window = PygletWindow()

                if record_video:
                    video_recorder.close()
                    video_recorder = VideoRecorder(gym_env,
                                                   path=ep_video_path(
                                                       video_path, ts, env_id,
                                                       ep),
                                                   enabled=record_video)
                obs = vec_env.reset()
                time.sleep(2)
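
The snapshot()/restore() calls on VecNormalize in Examples #6 and #7 appear to be a project-specific extension; conceptually they persist the running observation statistics so that evaluation applies the same scaling as training. A purely illustrative sketch of such state handling:

import pickle
import numpy as np

class RunningObsStats:
    # Illustrative stand-in for the observation-scaling state that a
    # VecNormalize-style wrapper would save and restore between runs.
    def __init__(self, shape):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)

    def snapshot(self, path):
        with open(path, 'wb') as f:
            pickle.dump({'mean': self.mean, 'var': self.var}, f)

    def restore(self, path):
        with open(path, 'rb') as f:
            state = pickle.load(f)
        self.mean, self.var = state['mean'], state['var']
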
Example #8
def learn(env, sess, cuda_visible_devices, gpu_memory_fraction, output_dir):

    log_freq = 100
    save_freq = 500
    update_target_freq = 250

    total_timesteps = int(50e6)

    batch_size = 32  # How many experiences to use for each training step.

    exploration_start_eps = 1.0  # Starting probability of random action
    exploration_final_eps = 0.02  # End probability of random action
    exploration_max_steps = 200000  # Number of training steps over which epsilon is annealed

    hidden_size = 256
    pre_train_steps = 5000

    nsteps = 4  # How often to perform a training step.
    gamma = 0.99  # Discount factor on the target Q-values

    render = False
    load_model = False

    model_path = os.path.join(output_dir, 'model')
    summary_path = os.path.join(output_dir, 'summary')
    video_path = os.path.join(output_dir, 'video')
    ensure_dir(summary_path)
    ensure_dir(model_path)
    ensure_dir(video_path)

    valid_actions = [2, 3]  # TODO replace
    num_actions = len(valid_actions)

    mainQN = Qnetwork(hidden_size, num_actions, "main", add_summaries=True)
    targetQN = Qnetwork(hidden_size, num_actions, "target")

    saver = tf.train.Saver()

    # operation that copies a snapshot of the main network to the target network
    update_target_op = update_target("main", "target")

    replay_buffer = ReplayBuffer(50000)

    # Create an exploration schedule
    exploration = LinearSchedule(schedule_timesteps=exploration_max_steps,
                                 initial_p=exploration_start_eps,
                                 final_p=exploration_final_eps)

    with sess:

        sess.run(tf.global_variables_initializer())

        summary_writer = tf.summary.FileWriter(summary_path)

        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        print("Populating replay buffer")
        state = env.reset()
        i = 0
        while True:
            action = np.random.randint(0, num_actions)
            next_state, reward, done, _ = env.step(valid_actions[action])
            replay_buffer.add(state, action, reward, next_state, float(done))
            if done:
                state = env.reset()
            else:
                state = next_state
            i += 1
            if i > pre_train_steps and done:
                break

        runner = Runner(sess, env, replay_buffer, num_actions, nsteps,
                        exploration, mainQN, valid_actions)

        nupdates = total_timesteps // nsteps

        for update in range(1, nupdates + 1):

            runner.run()

            if update % update_target_freq == 0:
                sess.run(update_target_op)

            # Sample a batch of transitions from the replay buffer
            states, actions, rewards, next_states, dones = replay_buffer.sample(
                batch_size)

            # Calculate the maximizing action q-value for s_tp1 using 'Double Q-learning'

            # 1. Predict the action that maximizes the q-value for s_tp1 using the mainQN
            feed_dict = {
                mainQN.state_input:
                next_states  # shape: (batch_size, 84, 84, 4)
            }
            Q1 = sess.run(mainQN.max_q_action,
                          feed_dict=feed_dict)  # shape: (batch_size,)

            # 2. Predict the q-values for s_tp1 using the targetQN
            feed_dict = {targetQN.state_input: next_states}
            Q2 = sess.run(targetQN.q_out,
                          feed_dict=feed_dict)  # (batch_size, 2)

            # 3. Get the maximizing action's q-value for s_tp1 by selecting the Q1 indices from the Q2 array
            max_action_q = Q2[range(batch_size), Q1]  # (batch_size,)

            # Invert the 'done' flags in the batch, e.g. 000010000 -> 111101111.
            # For terminal transitions there are no future rewards, so the update rule reduces to: target_q = reward
            inverted_done_indicator = 1.0 - dones

            # Calculate the target-q value, that is what we think is the correct q-value for s_t and the
            # selected action and is used to calculate the td-error.
            target_q = rewards + (gamma * max_action_q *
                                  inverted_done_indicator)

            # Update the mainQN
            feed_dict = {
                mainQN.state_input: states,
                mainQN.actions: actions,
                mainQN.target_q: target_q
            }
            _, summaries = sess.run([mainQN.train_op, mainQN.summaries],
                                    feed_dict)

            if update % log_freq == 0:
                print(
                    'update %d: mean_ep_reward=%f, mean_ep_length=%d, total_steps=%d'
                    % (update, runner.mean_ep_reward or 0.0,
                       runner.mean_ep_length or 0.0, runner.total_steps))
                print('Report summaries')
                train_summary = tf.Summary()
                train_summary.value.add(tag='Train/Episode Reward',
                                        simple_value=runner.mean_ep_reward)
                train_summary.value.add(tag='Train/Episode Length',
                                        simple_value=runner.mean_ep_length)
                summary_writer.add_summary(train_summary, update)
                summary_writer.add_summary(summaries, update)
                summary_writer.flush()

            if update % save_freq == 0:
                print('Save model')
                saver.save(sess,
                           model_path + '/model-' + str(update) + '.ckpt')
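
To make the Double Q-learning target in Example #8 easier to follow, here is a standalone numpy illustration with random stand-ins for the two networks' outputs (batch size and values are arbitrary).

import numpy as np

batch_size, gamma = 4, 0.99
rewards = np.array([0.0, 1.0, 0.0, -1.0])
dones = np.array([0.0, 0.0, 1.0, 0.0])

q_main_next = np.random.rand(batch_size, 2)    # mainQN's Q-values for s_tp1
q_target_next = np.random.rand(batch_size, 2)  # targetQN's Q-values for s_tp1

best_actions = np.argmax(q_main_next, axis=1)                  # select actions with the main net
max_action_q = q_target_next[range(batch_size), best_actions]  # evaluate them with the target net

inverted_done = 1.0 - dones  # no bootstrapping on terminal transitions
target_q = rewards + gamma * max_action_q * inverted_done
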