Example no. 1
def make_obs_ph(name):
    return BatchInput(observation_space_shape, name=name)
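All of these examples pass a make_obs_ph callback to deepq.build_train, which uses it to create observation placeholders. For context, BatchInput from baselines.deepq.utils essentially wraps a TensorFlow placeholder whose leading dimension is the (unspecified) batch size. A minimal sketch of that idea, not the library's exact implementation:

import tensorflow as tf

def make_batch_placeholder(shape, dtype=tf.float32, name=None):
    # Hypothetical helper mirroring what BatchInput does: prepend a None
    # batch dimension to the per-observation shape.
    return tf.placeholder(dtype, [None] + list(shape), name=name)

# make_batch_placeholder((84, 84, 4), name="observation")
# -> placeholder of shape [None, 84, 84, 4]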
Example no. 2
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out,
                                     num_outputs=64,
                                     activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out,
                                     num_outputs=num_actions,
                                     activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: BatchInput(env.observation_space.shape,
                                                name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
                                     initial_p=1.0,
                                     final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
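The comment in the snippet describes annealing epsilon from 1.0 down to 0.02; baselines' LinearSchedule does this by linear interpolation over schedule_timesteps and then holding the final value. A minimal standalone sketch of the computation for illustration, not the library source:

def linear_schedule_value(t, schedule_timesteps=10000, initial_p=1.0, final_p=0.02):
    # Linearly interpolate from initial_p to final_p, then hold final_p.
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# linear_schedule_value(0)     -> 1.0   (every action random)
# linear_schedule_value(5000)  -> 0.51
# linear_schedule_value(10000) -> 0.02  (mostly greedy from here on)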
Example no. 3
def make_obs_ph(name):
    return BatchInput((64, 64), name=name)
Example no. 4
def make_obs_ph(name):
    #return BatchInput(observation_space_shape, name=name)
    return BatchInput((16, 16), name=name)
Example no. 5
def make_obs_ph(name):
    return BatchInput((16, 16), name=name)

def startTraining():
    # Create the environment
    print('START ENV', RC.GB_CLIENT_ID(), RC.gbRobotHandle())
    env = RobotOperationEnvironment(RC.GB_CLIENT_ID(), RC.GB_CSERVER_ROBOT_ID,
                                    RC.gbRobotHandle())
    #print('ACTION_SPACE', env.action_space.shape)
    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: BatchInput(env.observation_space.shape,
                                            name=name),
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    )
    # Create the replay buffer
    replay_buffer = ReplayBuffer(50000)
    # Create the schedule for exploration starting from 1 (every action is random) down to
    # 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=10000,
                                 initial_p=1.0,
                                 final_p=0.02)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    print("Manipulator DEEPQ Training Experiment Start.")
    for t in itertools.count():
        print('Episode ', len(episode_rewards), 'Step ', t, '--------------')
        print('Start waiting for the next action',
              env._robot.getOperationState())
        while (env._robot.getOperationState() != RC.CROBOT_STATE_READY):
            time.sleep(0.01)

        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        print('Generated action:', action)
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0)

        is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
        if is_solved:
            # Show off the result
            #env.render()
            pass
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    32)
                print('Generated actions:', actions)
                train(obses_t, actions, rewards, obses_tp1, dones,
                      np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        if done and len(episode_rewards) % 10 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", len(episode_rewards))
            logger.record_tabular("mean episode reward",
                                  round(np.mean(episode_rewards[-101:-1]), 1))
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()
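The train call above minimizes the Bellman error on the sampled batch; its final argument, np.ones_like(rewards), supplies uniform per-sample importance weights. For orientation, a generic one-step DQN target that such an update regresses toward, sketched in NumPy rather than baselines' exact implementation:

import numpy as np

def dqn_targets(rewards, dones, q_tp1_values, gamma=0.99):
    # One-step TD target: r + gamma * max_a' Q_target(s', a'), with the
    # bootstrap term masked out on terminal transitions.
    return rewards + gamma * (1.0 - dones) * np.max(q_tp1_values, axis=1)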
Example no. 7
def make_obs_ph(name):
    return BatchInput((84, 84, 4), name=name)
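For reference, the (84, 84, 4) shape used here matches the standard Atari DQN preprocessing: 84x84 grayscale frames with the last four frames stacked along the channel axis.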