Example 1
    def __init__(self, registry, env_creator, config, logdir):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()
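
The LinearSchedule constructed above is not shown in this snippet. A minimal sketch of a baselines-style linear annealing schedule, assuming the usual schedule_timesteps/initial_p/final_p semantics, looks roughly like this:

class LinearSchedule(object):
    """Linearly anneals from initial_p to final_p over schedule_timesteps,
    then stays at final_p (sketch of the conventional implementation)."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed so far, clamped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

With, say, exploration_fraction=0.1 and schedule_max_timesteps=1000000, epsilon would decay from 1.0 to exploration_final_eps over the first 100,000 steps and stay constant afterwards.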
Example 2
File: dqn.py Project: zcli/ray
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.set_weights_time = RunningStat(())
        self.sample_time = RunningStat(())
        self.grad_time = RunningStat(())

        # Note that workers don't need target vars to be synced
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)
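
The beta_schedule created above is consumed later, when batches are drawn from the prioritized buffer. A hedged sketch of that step, where global_timestep and the "train_batch_size" key are assumed names rather than values taken from this snippet:

# Sketch: how beta_schedule is typically used at sampling time. The
# importance-sampling correction strength is annealed from beta0 up to 1.0.
beta = self.beta_schedule.value(self.global_timestep)
(obses_t, actions, rewards, obses_tp1, dones, weights,
 batch_indexes) = self.replay_buffer.sample(
     self.config["train_batch_size"], beta=beta)
# After TD errors are computed, priorities are written back, e.g.:
# self.replay_buffer.update_priorities(batch_indexes, new_priorities)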
Example 3
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        # When env.action_space is a Box (e.g. Pendulum-v0, where
        # action_space.low is [-2.0] and action_space.high is [2.0]),
        # actions are passed as arrays, e.g. env.step([1.5]).
        if not isinstance(env.action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                config["noise_scale"] * 0.4 **
                (1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(config["exploration_fraction"] *
                                       config["schedule_max_timesteps"]),
                initial_p=config["noise_scale"] * 1.0,
                final_p=config["noise_scale"] *
                config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        # hard instead of soft
        self.ddpg_graph.update_target(self.sess, 1.0)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the policy and Q-value networks and
        # their corresponding target networks
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()
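
Every example wraps part of its graph in ray.experimental.TensorFlowVariables, which is what lets weights move between driver and workers as plain numpy arrays. A minimal usage sketch; the local_evaluator and remote_evaluator names are illustrative only, not taken from the snippets:

# On the driver: read the current weights as a dict of numpy arrays.
weights = local_evaluator.variables.get_weights()

# On a worker: overwrite the corresponding local variables with those arrays.
remote_evaluator.variables.set_weights(weights)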
Example 4
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.ddpg_graph.copy_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0
        nb_actions = env.action_space.shape[-1]
        stddev = config["exploration_noise"]
        self.exploration_noise = OUNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        self.action_range = (-1., 1.)

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.critic_loss, self.ddpg_graph.action_loss),
            self.sess)
        self.max_action = env.action_space.high
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        # Technically not needed when not remote
        self.obs_filter = get_filter(config["observation_filter"],
                                     env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }

        self.obs = self.env.reset()
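
OUNoise above is an Ornstein-Uhlenbeck process, the temporally correlated action noise commonly paired with DDPG. A minimal sketch, with conventional theta and dt defaults assumed (they are not taken from this snippet):

import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck process: noise that drifts back toward mu while
    staying correlated across steps, used for continuous-control exploration."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x_prev = np.zeros_like(mu)

    def __call__(self):
        # Mean-reverting drift toward mu plus scaled Gaussian noise.
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt)
             * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = np.zeros_like(self.mu)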
Example 5
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()
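
The per-worker ConstantSchedule in Examples 3 and 5 gives each worker a fixed epsilon of 0.4 ** (1 + worker_index / (num_workers - 1) * 7), the same shape of assignment described for Ape-X: worker 0 keeps a relatively high epsilon while the last worker acts almost greedily. A worked example with 8 workers:

# Per-worker exploration constants for num_workers = 8.
num_workers = 8
for worker_index in range(num_workers):
    eps = 0.4 ** (1 + worker_index / float(num_workers - 1) * 7)
    print(worker_index, eps)
# Prints roughly: 0 -> 0.4, 1 -> 0.16, 2 -> 0.064, 3 -> 0.0256, ...,
# 7 -> 0.00066. Example 3 additionally multiplies each value by noise_scale.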
Example 6
    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    cls = get_agent_class(args.run)
    agent = cls(env=args.env, config=args.config)
    agent.restore(args.checkpoint)
    num_steps = int(args.steps)

    if args.run == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env))
    if args.out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        if args.out is not None:
            rollout = []
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
Example 7
    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    cls = get_agent_class(args.run)
    agent = cls(env=args.env, config=args.config)
    agent.restore(args.checkpoint)
    num_steps = int(args.steps)

    if args.run == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(get_registry(), env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(get_registry(),
                                                       gym.make(args.env))
    if args.out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        if args.out is not None:
            rollout = []
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
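
The two rollout snippets above (Examples 6 and 7) are cut off inside the inner stepping loop. A self-contained sketch of how such a loop is typically completed follows; the bookkeeping and the pickle dump to an output file are assumptions, not the project's verbatim code:

import pickle

def run_rollouts(agent, env, num_steps, out=None):
    # Hypothetical helper mirroring the structure of the snippets above.
    rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        rollout = []
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            if out is not None:
                rollout.append([state, action, next_state, reward, done])
            steps += 1
            state = next_state
        if out is not None:
            rollouts.append(rollout)
        print("Episode reward:", reward_total)
    if out is not None:
        with open(out, "wb") as f:
            pickle.dump(rollouts, f)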