Code Example #1
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        # replay memory to play
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            state = state.reshape([1, -1, 1])

            while True:
                if self.config.render_test:
                    env.render()

                # store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                action = self.get_action(q_input)

                # perform action in env
                new_state, reward, done, info = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state
                state = state.reshape([1, -1, 1])

                # count reward
                total_reward += reward
                if done:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
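        # Note: sigma_reward is the standard error of the mean, sqrt(Var(R) / N),
        # so the message below reports "average +/- standard error", not the raw std dev.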

        if num_episodes >= 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
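
Every snippet on this page drives the same small ReplayBuffer surface: store_frame, encode_recent_observation, store_effect and, during training, can_sample / sample. The real implementations live in each project's dqn_utils / replay-buffer module and are not reproduced here. The class below is only a minimal sketch of that assumed interface (a naive list-backed buffer with no ring storage and no frame stacking), meant to make the call pattern concrete; it is not any project's actual implementation.

import random
import numpy as np

class MinimalReplayBuffer:
    """Illustrative stand-in for the ReplayBuffer API assumed by the snippets on this page.

    Call pattern mirrored from the examples:
        idx = buf.store_frame(obs)
        q_in = buf.encode_recent_observation()
        buf.store_effect(idx, action, reward, done)
        if buf.can_sample(batch_size):
            s, a, r, s2, done_mask = buf.sample(batch_size)
    """

    def __init__(self, size, frame_history_len):
        self.size = size                            # capacity (not enforced in this sketch)
        self.frame_history_len = frame_history_len  # real buffers stack this many frames
        self.frames, self.actions, self.rewards, self.dones = [], [], [], []

    def store_frame(self, frame):
        # Returns the index to pass to the matching store_effect call.
        self.frames.append(np.asarray(frame))
        return len(self.frames) - 1

    def encode_recent_observation(self):
        # A real buffer stacks the last frame_history_len frames along the channel axis;
        # this sketch just returns the newest frame.
        return self.frames[-1]

    def store_effect(self, idx, action, reward, done):
        # idx is unused here; real buffers write the effect into that slot.
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)

    def can_sample(self, batch_size):
        return len(self.actions) - 1 >= batch_size

    def sample(self, batch_size):
        idxs = random.sample(range(len(self.actions) - 1), batch_size)
        obs       = np.array([self.frames[i]       for i in idxs])
        actions   = np.array([self.actions[i]      for i in idxs])
        rewards   = np.array([self.rewards[i]      for i in idxs], dtype=np.float32)
        next_obs  = np.array([self.frames[i + 1]   for i in idxs])
        done_mask = np.array([float(self.dones[i]) for i in idxs], dtype=np.float32)
        return obs, actions, rewards, next_obs, done_mask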
Code Example #2
def run_episode(
    env,
    q_func,
    replay_buffer_size=1000000,
    frame_history_len=4,
    game=None,
    ):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space)      == gym.spaces.Discrete


    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n


    Q = q_func(input_arg, num_actions).type(dtype)
    Q.load_state_dict(torch.load("./models/PAL_{}.pth".format(game), map_location=lambda storage, loc: storage))

    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)


    all_obs = []

    last_obs = env.reset()

    for t in count():

        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        all_obs.append(recent_observations)


        torch_obs = torch.from_numpy(recent_observations).type(dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            Qvals = Q(torch_obs).data[0]
        max2val, max2idx = Qvals.topk(2)
        action = max2idx[0]
        

        obs, reward, done, _ = env.step(action)
        env.render()
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            break
        last_obs = obs
    
    return all_obs
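
run_episode expects a pre-trained checkpoint under ./models/ and relies on module-level globals (torch, gym, dtype) from the surrounding script. A hypothetical invocation might look like the following; the DQN model class is an assumption, not part of the snippet above.

import gym

# Hypothetical usage; DQN stands for whatever model class the project passes as q_func,
# and ./models/PAL_Pong.pth is assumed to exist:
env = gym.make("PongNoFrameskip-v4")            # any gym env with image observations
# observations = run_episode(env, q_func=DQN, game="Pong")
# print("frames collected:", len(observations))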
Code Example #3
File: model.py  Project: zachabarnes/slither-rl-agent
    def evaluate(self, env, num_episodes):
        replay_buffer = ReplayBuffer(self.FLAGS.state_hist,
                                     self.FLAGS.state_hist)
        rewards = []

        if num_episodes > 1: self.logger.info("Evaluating...")

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            while True:
                # Store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # Get greedy action
                action = self.network.get_best_action(q_input)[0]

                # Perform action in env
                new_state, reward, done, info = env.step(action)

                # Store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # count reward
                total_reward += reward
                if done: break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
Code Example #4
def playPoint(expert, state):
    experts_replay_buffer = ReplayBuffer(
        config.buffer_size, config.state_history)
    counter = 0
    initial_action = -1
    while True:
        idx = experts_replay_buffer.store_frame(state)
        q_input = experts_replay_buffer.encode_recent_observation()
        action, _ = expert.get_best_action(q_input)
        if counter == 0:
            initial_action = action
        # perform action in env
        new_state, reward, done, info = env.step(action)
        # store in replay memory
        state = new_state
        experts_replay_buffer.store_effect(idx, action, reward, done)
        # count reward
        if abs(reward) == 1:
            break
        counter += 1
    print("PLAY POINT ENDED")
    return (config.gamma**counter) * reward, initial_action
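
playPoint returns the point's terminal reward discounted back to the first action, gamma**counter * reward, together with that first action. A quick sanity check of the value it produces, assuming config.gamma = 0.99 (an illustrative number, not taken from the snippet):

gamma, counter, reward = 0.99, 25, 1.0   # a point won 25 steps after the first action
print((gamma ** counter) * reward)        # ~0.778: later wins are worth less than immediate ones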
Code Example #5
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Inference only: run the forward pass under torch.no_grad() so no autograd history is saved
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE
    Q = q_func(input_arg, num_actions)
    Q_target = q_func(input_arg, num_actions)

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        if (t > learning_starts):
            action = select_epilson_greedy_action(Q, encoded_obs, t)
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if (done):
            last_obs = env.reset()
        else:
            last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal state Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            #
            # Alpha (learning rate) from the q function update isn't present in our code -- it's in OptimizerSpec in main.
            # Move to GPU if possible
            # done flag in loop   ---- SKIPPED IF DONE IS TRUE
            # clipping the error between -1 and 1   -- OK
            # backward the error meaning?
            # Suggestion for changing parameters - change exploration schedule (main)
            #
            # Q.cuda()
            obs_batch, act_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size=batch_size)
            states = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            actions = Variable(torch.from_numpy(act_batch).long())
            rewards = Variable(torch.from_numpy(reward_batch).float())
            next_states = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_dones = Variable(torch.from_numpy(1 - done_mask).type(dtype))
            if USE_CUDA:
                states = states.cuda()
                actions = actions.cuda()
                rewards = rewards.cuda()
                next_states = next_states.cuda()
            Q.train()
            Q_target.eval()
            predicted_rewards = Q(states).gather(1,
                                                 actions.unsqueeze(1))  #Q(s,a)
            next_max_Q = Q_target(next_states).detach().max(1)[
                0]  #.unsqueeze(1) #Q_target(s,a)
            next_Q_values = not_dones * next_max_Q
            target_Q_values = rewards + (gamma * next_Q_values)  #r + Q_target
            bellman_error = target_Q_values - predicted_rewards.squeeze(1)
            clipped_bellman_error = bellman_error.clamp(-1, 1) * (-1.0)
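            # Why multiply by -1 and call backward() with this tensor instead of building
            # a scalar loss: for L = 0.5 * (target - Q)^2 the gradient w.r.t. Q is
            # -(target - Q), i.e. minus the Bellman error. Passing the clipped, negated
            # error as grad_output to predicted_rewards.backward() therefore injects
            # exactly dL/dQ into the backward pass, with the clipping to [-1, 1]
            # corresponding to a Huber-style loss.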
            optimizer.zero_grad()
            predicted_rewards.backward(clipped_bellman_error.data.unsqueeze(1))
            optimizer.step()
            num_param_updates += 1
            if (num_param_updates % target_update_freq == 0):
                Q_target.load_state_dict(Q.state_dict())

            # for obs,act,reward,next_obs,done in zip(obs_batch,act_batch,reward_batch,next_obs_batch,done_mask):
            #     if(done == 1.0):
            #         continue
            #     obs = Variable(torch.from_numpy(obs, ).type(dtype).unsqueeze(0) / 255.0, requires_grad=True)
            #     next_obs = Variable(torch.from_numpy(next_obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=False)
            #     current_Q = Q(obs)
            #     predicted_reward = Variable(current_Q[0][act].unsqueeze(0), requires_grad=True)
            #     target_reward = Q_target(next_obs).data.max(1)[0]
            #     loss = loss_fn(reward + gamma * target_reward, predicted_reward).clamp(-1, 1) * (-1.0)

            #     optimizer.zero_grad()
            #     # should be current.backward(d_error.data.unsqueeze(1))
            #     # but it crashes on misfitting dims
            #     predicted_reward.backward(loss.data.unsqueeze(1))

            #     optimizer.step()
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Code Example #6
File: q_learning.py  Project: CatalinVoss/music-morph
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            #state = self.env.reset()
            state = self.env.env_reset()
            while True:
                #               print t
                t += 1
                last_eval += 1
                #                print total_reward
                #if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                # (use a local name so the q_values deque above is not clobbered)
                best_action, q_vals = self.get_best_action(q_input)
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                #print t
                # TODO: log displays to tensorboard
                new_state, reward, done, info = self.env.env_step(
                    action, state)  #, display=(t % DISPLAY_FREQ == 0))

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)
                #                print t > self.config.learning_start
                #                print t % self.config.log_freq == 0
                #                print t % self.config.learning_freq == 0
                #print rewards
                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        self.update_averages(rewards, max_q_values, q_values,
                                             scores_eval)

                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
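
exp_schedule.get_action(best_action) in the loop above is the epsilon-greedy switch: with probability epsilon it discards the greedy action and samples a random one. The schedule class itself is not part of this snippet; the following is only a minimal sketch of the assumed behaviour.

import numpy as np

class LinearExploration:
    """Assumed epsilon-greedy exploration schedule (epsilon annealed linearly to eps_end)."""
    def __init__(self, env, eps_begin, eps_end, nsteps):
        self.env = env
        self.epsilon = eps_begin
        self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps

    def update(self, t):
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

    def get_action(self, best_action):
        # With probability epsilon act randomly, otherwise keep the greedy action.
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action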
Code Example #7
class DQNAgent(BaseAgent):
    non_terminal_reward = 0

    def __init__(self, env, config, exp_schedule, lr_schedule, is_training_agent, train_from_scratch=False,
                 reward_after_somebody_died=False,
                 logger=None):
        """
        Initialize Q Network and env

        :param env: Game environment
        :param config: config(hyper-parameters) instance
        :param logger: logger instance from logging module
        :param exp_schedule: exploration strategy for epsilon
        :param lr_schedule: schedule for learning rate
        """
        super(DQNAgent, self).__init__()

        # Variables initialized in _build
        self._states = None
        self._actions = None
        self._rewards = None
        self._next_states = None
        self._done_mask = None
        self._learning_rate = None
        self._q_values = None
        self._target_q_values = None
        self._next_q_values = None
        self._update_target_op = None
        self._loss = None
        self._train_op = None
        self._grad_norm = None

        # Variables initialized in init_agent
        self._session = None
        self._avg_reward_placeholder = None
        self._max_reward_placeholder = None
        self._std_reward_placeholder = None
        self._avg_q_placeholder = None
        self._max_q_placeholder = None
        self._std_q_placeholder = None
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward_placeholder = None
        self._merged = None
        self._file_writer = None
        self._saver = None
        self._train_replay_buffer = None
        self._train_rewards = None
        self._train_max_q_values = None
        self._train_q_values = None
        self._avg_reward = None
        self._max_reward = None
        self._std_reward = None
        self._avg_q = None
        self._max_q = None
        self._std_q = None
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward = None
        self._time_step = None
        self._progress_bar = None
        self._has_episode_started = None

        # Variables initialized in act.
        self._last_action = None
        self._last_idx = None
        self._enemy_count = None

        # Directory for training outputs
        if not os.path.exists(config.output_path):
            os.makedirs(config.output_path)

        self._logger = logger
        if logger is None:
            self._logger = get_logger(config.log_path)

        self._config = config
        self._env = env
        self._exp_schedule = exp_schedule
        self._lr_schedule = lr_schedule
        self._is_training_agent = is_training_agent
        self._train_from_scratch = train_from_scratch
        self._reward_after_somebody_died = reward_after_somebody_died
        self._total_reward = 0

        # Build model.
        self._build()

    def init_agent(self, id_, game_type):
        super(DQNAgent, self).init_agent(id_, game_type)

        # Assume the graph has been constructed.
        # Create a tf Session and run initializer of variables.
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        self._session = tf.Session(config=tf_config)

        # Tensorboard
        self._add_summary()

        # Initialize all variables.
        init = tf.global_variables_initializer()
        self._session.run(init)

        # Synchronise q and target_q networks.
        self._session.run(self._update_target_op)

        # for saving networks weights
        self._saver = tf.train.Saver()

        # Initialize replay buffer and variables.
        self._train_replay_buffer = ReplayBuffer(self._config.buffer_size, self._config.state_history)
        self._train_rewards = deque(maxlen=self._config.num_episodes_test)
        self._train_max_q_values = deque(maxlen=1000)
        self._train_q_values = deque(maxlen=1000)
        self._init_averages()

        self._time_step = 0
        self._progress_bar = Progbar(target=self._config.nsteps_train)

        self._has_episode_started = False

        if not self._train_from_scratch:
            self._load()

    def act(self, obs, action_space):
        state = obs['board'][:, :, None]

        if not self._is_training_agent:
            # Act greedily when testing.
            if self._has_episode_started:
                self._train_replay_buffer.store_effect(
                    self._last_idx,
                    self._last_action,
                    0,
                    done=False
                )

            self._last_idx = self._train_replay_buffer.store_frame(state)
            q_input = self._train_replay_buffer.encode_recent_observation()
            action = self._get_action(q_input)
            self._last_action = action

            return action

        if self._has_episode_started:
            reward = DQNAgent.non_terminal_reward

            if self._reward_after_somebody_died:
                if len(self._character.enemies) < self._enemy_count:
                    reward = 1

            self._train(reward, done=False)

        self._enemy_count = len(self._character.enemies)
        self._time_step += 1

        # Replay buffer
        idx = self._train_replay_buffer.store_frame(state)
        q_input = self._train_replay_buffer.encode_recent_observation()

        # Choose action according to current Q and exploration
        # Use a local name so the _train_q_values deque is not overwritten.
        best_action, q_vals = self._get_best_action(q_input)
        action = self._exp_schedule.get_action(best_action)

        self._train_max_q_values.append(max(q_vals))
        self._train_q_values += list(q_vals)

        self._last_action = action
        self._last_idx = idx

        if not self._has_episode_started:
            self._has_episode_started = True

        return action

    def episode_end(self, reward):
        """
        Updates to perform at the end of an episode
        """
        # Reset episode.
        self._has_episode_started = False

        if not self._is_training_agent:
            return

        self._train(reward, done=True)
        self._train_rewards.append(self._total_reward)

        # Reset total reward.
        self._total_reward = 0

        # TODO: Commented due to lack of evaluate() and record()
        # if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
        #     # evaluate our policy
        #     last_eval = 0
        #     print("")
        #     scores_eval += [self.evaluate()]
        #
        # if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq):
        #     self.logger.info("Recording...")
        #     last_record = 0
        #     self.record()

    def shutdown(self):
        """
        Save trained results
        """
        if not self._is_training_agent:
            return

        self._logger.info("- Training done.")
        self._save()

        # TODO: Commented due to lack of evaluate()
        # scores_eval += [self.evaluate()]
        # DQNAgent.export_plot(scores_eval, "Scores", self.config.plot_output)

    def _train(self, reward, done):
        # Store the transition.
        self._train_replay_buffer.store_effect(
            self._last_idx,
            self._last_action,
            reward,
            done=done
        )

        # Perform a training step.
        loss_eval, grad_eval = self._train_step(
            self._time_step,
            self._train_replay_buffer,
            self._lr_schedule.epsilon
        )

        # Logging
        if self._time_step > self._config.learning_start \
                and self._time_step % self._config.log_freq == 0 \
                and self._time_step % self._config.learning_freq == 0:

            self._update_averages(self._train_rewards, self._train_max_q_values, self._train_q_values)
            self._exp_schedule.update(self._time_step)
            self._lr_schedule.update(self._time_step)
            if len(self._train_rewards) > 0:
                self._progress_bar.update(
                    self._time_step + 1,
                    exact=[
                        ("Loss", loss_eval), ("Avg R", self._avg_reward),
                        ("Max R", np.max(self._train_rewards)),
                        ("eps", self._exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self._max_q),
                        ("lr", self._lr_schedule.epsilon)
                    ]
                )

        elif self._time_step < self._config.learning_start and self._time_step % self._config.log_freq == 0:
            sys.stdout.write("\rPopulating the memory {}/{}...".format(self._time_step, self._config.learning_start))
            sys.stdout.flush()

        # Accumulate reward
        self._total_reward += reward

    def _build(self):
        """
        Build model by adding all necessary variables.
        """
        # Add placeholders.
        self._add_placeholders_op()

        # Compute Q values of state.
        states = self._process_state(self._states)
        self._q_values = self._get_q_values_op(states, scope='q', reuse=False)

        # Compute Q values of next state.
        next_states = self._process_state(self._next_states)
        self._target_q_values = self._get_q_values_op(next_states, scope='target_q', reuse=False)

        # for Double DQN
        self._next_q_values = self._get_q_values_op(next_states, scope='q', reuse=True)

        # Add update operator for target network.
        self._add_update_target_op('q', 'target_q')

        # Add square loss.
        self._add_loss_op(self._q_values, self._target_q_values, self._next_q_values)

        # Add optimizer for the main networks.
        self._add_optimizer_op('q')

    def _add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building and will be fed
        data during training.  Note that when "None" is in a placeholder's shape, it's flexible
        (so we can use different batch sizes without rebuilding the model).
        """
        state_shape = list(self._env.observation_space.shape)

        self._states = tf.placeholder(tf.uint8, (None, 11, 11, self._config.state_history))
        self._actions = tf.placeholder(tf.int32, (None,))
        self._rewards = tf.placeholder(tf.float32, (None,))
        self._next_states = tf.placeholder(tf.uint8, (None, 11, 11, self._config.state_history))
        self._done_mask = tf.placeholder(tf.bool, (None,))
        self._learning_rate = tf.placeholder(tf.float32, ())

    def _process_state(self, state):
        """
        Processing of state

        State placeholders are tf.uint8 for fast transfer to GPU
        Need to cast it to float32 for the rest of the tf graph.

        :param state:
                Node of tf graph of shape = (batch_size, height, width, nchannels) and type tf.uint8;
                values between 0 and 255 are rescaled to the range [0, 1]
        """
        state = tf.cast(state, tf.float32)
        state /= self._config.high

        return state

    def _get_q_values_op(self, state, scope, reuse=False):
        """
        Returns Q values for all actions

        :param state: (tf tensor) shape = (batch_size, img height, img width, nchannels)
        :param scope: (string) scope name, that specifies if target network or not
        :param reuse: (bool) reuse of variables in the scope
        :return out: (tf tensor) of shape = (batch_size, num_actions)
        """
        num_actions = self._env.action_space.n
        out = state

        with tf.variable_scope(scope, reuse=reuse) as _:
            x = layers.conv2d(state, 32, 5, stride=2, padding='SAME')
            x = layers.conv2d(x, 64, 4, stride=2, padding='SAME')
            x = layers.conv2d(x, 64, 3, stride=1, padding='SAME')
            x = layers.flatten(x)
            x = layers.fully_connected(x, 512)
            out = layers.fully_connected(x, num_actions, activation_fn=None)

        return out

    def _add_update_target_op(self, q_scope, target_q_scope):
        """
        update_target_op will be called periodically
        to copy Q network weights to target Q network

        Remember that in DQN, we maintain two identical Q networks with
        two different sets of weights. In tensorflow, we distinguish them
        with two different scopes. One for the target network, one for the
        regular network. If you're not familiar with the scope mechanism
        in tensorflow, read the docs
        https://www.tensorflow.org/programmers_guide/variable_scope

        Periodically, we need to update all the weights of the Q network
        and assign them with the values from the regular network. Thus,
        what we need to do is to build a tf op, that, when called, will
        assign all variables in the target network scope with the values of
        the corresponding variables of the regular network scope.

        :param q_scope: (string) name of the scope of variables for q
        :param target_q_scope: (string) name of the scope of variables
                for the target network
        """
        tar_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_q_scope)
        q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope)
        self._update_target_op = tf.group(*[tf.assign(tar_vars[i], q_vars[i]) for i in range(len(tar_vars))])

    def _add_loss_op(self, q, target_q, next_q):
        """
        Sets the loss of a batch, self.loss is a scalar

        :param q: (tf tensor) shape = (batch_size, num_actions)(Q(s, a))
        :param target_q: (tf tensor) shape = (batch_size, num_actions)(Q_target(s', a'))
        :param next_q: Q(s', a') for Double DQN
        """
        num_actions = self._env.action_space.n
        not_done = 1 - tf.cast(self._done_mask, tf.float32)

        # Double DQN
        # need q_next(Q(s', a')), then find argmax in it
        max_a = tf.argmax(next_q, axis=1)
        q_max = tf.reduce_sum(target_q * tf.one_hot(max_a, num_actions), axis=1)
        q_samp = self._rewards + not_done * self._config.gamma * q_max
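        # Double DQN target: y = r + gamma * (1 - done) * Q_target(s', argmax_a Q(s', a)).
        # The action is chosen with the online network (next_q) but evaluated with the
        # target network (target_q), which reduces the overestimation bias of vanilla DQN.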

        # nature DQN
        q_s = tf.reduce_sum(q * tf.one_hot(self._actions, num_actions), axis=1)
        self._loss = tf.reduce_mean(tf.square(q_samp - q_s))

    def _add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm
        """
        optimizer = tf.train.AdamOptimizer(self._learning_rate)
        vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
        grads_and_vars = optimizer.compute_gradients(self._loss, vars)

        # Fall back to the unclipped gradients when grad_clip is disabled,
        # so apply_gradients never receives None.
        clip_grads_and_vars = grads_and_vars
        if self._config.grad_clip:
            clip_grads_and_vars = [(tf.clip_by_norm(gv[0], self._config.clip_val), gv[1]) for gv in grads_and_vars]
        self._train_op = optimizer.apply_gradients(clip_grads_and_vars)
        self._grad_norm = tf.global_norm([gv[0] for gv in clip_grads_and_vars])

    def _add_summary(self):
        """
        Tensorflow stuff
        """
        # extra placeholders to log stuff from python
        self._avg_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_reward")
        self._max_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="max_reward")
        self._std_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="std_reward")

        self._avg_q_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_q")
        self._max_q_placeholder = tf.placeholder(tf.float32, shape=(), name="max_q")
        self._std_q_placeholder = tf.placeholder(tf.float32, shape=(), name="std_q")

        # TODO: Commented due to lack of evaluate()
        # self._eval_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="eval_reward")

        # add placeholders from the graph
        tf.summary.scalar("loss", self._loss)
        tf.summary.scalar("grads norm", self._grad_norm)

        # extra summaries from python -> placeholders
        tf.summary.scalar("Avg Reward", self._avg_reward_placeholder)
        tf.summary.scalar("Max Reward", self._max_reward_placeholder)
        tf.summary.scalar("Std Reward", self._std_reward_placeholder)

        tf.summary.scalar("Avg Q", self._avg_q_placeholder)
        tf.summary.scalar("Max Q", self._max_q_placeholder)
        tf.summary.scalar("Std Q", self._std_q_placeholder)

        # TODO: Commented due to lack of evaluate()
        # tf.summary.scalar("Eval Reward", self._eval_reward_placeholder)

        # logging
        self._merged = tf.summary.merge_all()
        self._file_writer = tf.summary.FileWriter(self._config.output_path,
                                                  self._session.graph)

    def _init_averages(self):
        """
        Define extra attributes for tensorboard.
        """
        self._avg_reward = -21.
        self._max_reward = -21.
        self._std_reward = 0

        self._avg_q = 0
        self._max_q = 0
        self._std_q = 0

        # TODO: Commented due to lack of evaluate()
        # self._eval_reward = -21.

    def _get_action(self, obs):
        """
        Returns action with some epsilon strategy

        :param obs: observation from gym
        """
        if np.random.random() < self._config.soft_epsilon:
            return self._env.action_space.sample()
        else:
            return self._get_best_action(obs)[0]

    def _get_best_action(self, obs):
        """
        Return best action

        :param obs: 4 consecutive observations from gym
        :return action: (int)
        :return action_values: (np array) q values for all actions
        """
        action_values = self._session.run(self._q_values, feed_dict={self._states: [obs]})[0]
        return np.argmax(action_values), action_values

    def _train_step(self, t, replay_buffer, lr):
        """
        Perform training step

        :param t: (int) nth step
        :param replay_buffer: buffer for sampling
        :param lr: (float) learning rate
        """
        loss_eval, grad_eval = 0, 0

        # Perform training step
        if t > self._config.learning_start and t % self._config.learning_freq == 0:
            loss_eval, grad_eval = self._update_step(t, replay_buffer, lr)

        # Occasionally update target network with q network
        if t % self._config.target_update_freq == 0:
            self._update_target_params()

        # Occasionally save the weights
        if t % self._config.saving_freq == 0:
            self._save()

        return loss_eval, grad_eval

    def _update_step(self, t, replay_buffer, lr):
        """
        Performs an update of parameters by sampling from replay_buffer

        :param t: number of iteration (episode and move)
        :param replay_buffer: ReplayBuffer instance .sample() gives batches
        :param lr: (float) learning rate
        :return loss: (Q - Q_target) ^ 2
        """
        s_batch, a_batch, r_batch, sp_batch, done_mask_batch = replay_buffer.sample(self._config.batch_size)

        fd = {
            # Inputs
            self._states: s_batch,
            self._actions: a_batch,
            self._rewards: r_batch,
            self._next_states: sp_batch,
            self._done_mask: done_mask_batch,
            self._learning_rate: lr,

            # Extra info
            self._avg_reward_placeholder: self._avg_reward,
            self._max_reward_placeholder: self._max_reward,
            self._std_reward_placeholder: self._std_reward,
            self._avg_q_placeholder: self._avg_q,
            self._max_q_placeholder: self._max_q,
            self._std_q_placeholder: self._std_q,

            # TODO: Commented due to lack of evaluate()
            # self._eval_reward_placeholder: self.eval_reward,
        }

        loss_eval, grad_norm_eval, summary, _ = self._session.run(
            [self._loss, self._grad_norm, self._merged, self._train_op],
            feed_dict=fd
        )

        # Tensorboard
        self._file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval

    def _update_target_params(self):
        """
        Update parameters of the target Q network with the parameters of the Q network
        """
        self._session.run(self._update_target_op)

    def _load(self):
        """
        Loads session
        """
        ckpt = tf.train.get_checkpoint_state(self._config.model_output)
        self._saver.restore(self._session, ckpt.model_checkpoint_path)

    def _save(self):
        """
        Saves session
        """
        if not os.path.exists(self._config.model_output):
            os.makedirs(self._config.model_output)

        model_path = os.path.join(self._config.model_output, 'model.ckpt')
        self._saver.save(self._session, model_path)

    def _update_averages(self, rewards, max_q_values, q_values, scores_eval=None):
        """
        Update the averages

        :param rewards: deque
        :param max_q_values: deque
        :param q_values: deque
        :param scores_eval: list
        """
        self._avg_reward = np.mean(rewards)
        self._max_reward = np.max(rewards)
        self._std_reward = np.sqrt(np.var(rewards) / len(rewards))

        self._max_q = np.mean(max_q_values)
        self._avg_q = np.mean(q_values)
        self._std_q = np.sqrt(np.var(q_values) / len(q_values))

        # TODO: Commented due to lack of evaluate()
        # if len(scores_eval) > 0:
        #     self.eval_reward = scores_eval[-1]

    @staticmethod
    def export_plot(y, y_label, filename):
        """
        Export a plot in filename

        :param y: (list) of float / int to plot
        :param filename: (string) directory
        """
        plt.figure()
        plt.plot(range(len(y)), y)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.savefig(filename)
        plt.close()
Code Example #8
def dqn_learing(
    env,
    q_func,
    checkpoint_path,
    optimizer_spec,
    exploration,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000
    ):

    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space)      == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don't save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].view(1,1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # optionally resume from a checkpoint
    if checkpoint_path:
        if os.path.isfile(checkpoint_path):
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path)
            Q.load_state_dict(checkpoint['model_state_dict'])
            target_Q.load_state_dict(checkpoint['target_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}')".format(checkpoint_path))
        else:
            print("=> no checkpoint found at '{}'".format(checkpoint_path))



    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 1000
    episode_reward = 0
    episode_rewards = []

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can later be used to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not yet started
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0][0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        print("reward: %f" % reward)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            episode_reward = 0
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze() # squeeze the [batch_size x 1] Tensor to have a shape of batch_size
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)

#            # Compute Bellman error
#            bellman_error = target_Q_values - current_Q_values
#            # clip the bellman error between [-1 , 1]
#            clipped_bellman_error = bellman_error.clamp(-1, 1)
#            # Note: clipped_bellman_delta * -1 will be right gradient
#            d_error = clipped_bellman_error * -1.0

            # Compute Huber loss. Why not MSE? Because Huber loss is more robust to noisy Q estimates than plain MSE.
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
#            current_Q_values.backward(d_error.data.unsqueeze(1))

            loss.backward()
            # Clip the gradients to lie between -1 and +1
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by Q network to target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_reward += reward
#        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        episode_rewards.append(episode_reward)
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')

        ### 5. Save a checkpoint
        if t % SAVE_EVERY_N_STEPS == 0 and t > learning_starts:
            save_checkpoint({
                'epoch': t + 1,
                'model_state_dict': Q.state_dict(),
                'target_state_dict': target_Q.state_dict(),
                'optimizer' : optimizer.state_dict(),
            }, "checkpoints/checkpoint.%d.tar" % t)
Code Example #9
def dqn_learing(
    env,
    q_func,
    optimizer_spec,
    exploration,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000
    ):

    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.size

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return torch.IntTensor([[model(Variable(obs)).data.max(1)[1].cpu()]])
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])


    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)
    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    writer = SummaryWriter()

    for t in count():

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()
        
        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)

        # Advance one step
        obs, reward, done = env.step(action)

        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()

        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)

            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))

            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())


        # ### 4. Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1], len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward, len(episode_rewards))
            if len(episode_rewards) > 100:
                writer.add_scalar('data/DQN/best_mean_score', best_mean_episode_reward, len(episode_rewards))

        # Save logs
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')

    
    writer.close()
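For context, a hedged sketch of how the dqn_learing routine above might be driven. OptimizerSpec and LinearSchedule are local stand-ins for the utility module this code appears to rely on (not shown in this excerpt), and MyQNet/env in the commented call are placeholders:

from collections import namedtuple
import torch.optim as optim

# Stand-in matching how optimizer_spec is used above: constructor(Q.parameters(), **kwargs)
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

class LinearSchedule:
    """Stand-in linear epsilon schedule exposing the .value(t) API used above."""
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),  # commonly used DQN settings, assumed here
)
exploration = LinearSchedule(schedule_timesteps=1000000, final_p=0.1)

# dqn_learing(env, q_func=MyQNet, optimizer_spec=optimizer_spec, exploration=exploration)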
コード例 #10
0
    experts.append(model)
    # with model.graph.as_default():

print("LOADED ALL MODELS")

for i in range(len(experts)):
    guide = experts[i]
    guide_experience = [[]]
    num_points = 0
    state = env.reset()
    guide_replay_buffer = ReplayBuffer(
        config.buffer_size, config.state_history)
    while True:
        # store last state in buffer
        idx = guide_replay_buffer.store_frame(state)
        q_input = guide_replay_buffer.encode_recent_observation()
        action, _ = guide.get_best_action(q_input)
        # perform action in env
        new_state, reward, done, info = env.step(action)
        # store in replay memory
        guide_replay_buffer.store_effect(idx, action, reward, done)
        if len(guide_experience) <= num_points:
            guide_experience.append([])
        guide_experience[num_points].append((state, action, 0))
        state = new_state
        if abs(reward) == 1:
            cur_point_lis = guide_experience[num_points]
            for k in range(len(cur_point_lis)):
                index = int(len(cur_point_lis) - k - 1)
                if k == 0:
                    cur_point_lis[index] = (
コード例 #11
0
ファイル: main.py プロジェクト: penglaige/minerl
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.gpu and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # arguments
    LAMBDA = [1.0, 0.0, 1.0,
              10e-5]  # for [loss_dq, loss_n_dq, loss_jeq, loss_l2]
    CUDA_VISIBLE_DEVICES = 0
    seed = args.seed
    train = args.train
    demo = args.demo
    task = args.task
    iteration = 3
    convs = [(32, 7, 3), (64, 4, 2), (64, 3, 1)]
    non_pixel_layer = [64]
    in_feature = 7 * 7 * 64
    hidden_actions = [128]
    hidden_value = [128]
    aggregator = "reduceLocalMean"
    dtype = torch.cuda.FloatTensor if torch.cuda.is_available(
    ) else torch.FloatTensor

    #if not train:
    #    args.num_env_steps = 50000

    base_kwargs = {
        'non_pixel_layer': non_pixel_layer,
        'convs': convs,
        'frame_history_len': args.frame_history_len,
        'in_feature': in_feature,
        'hidden_actions': hidden_actions,
        'hidden_value': hidden_value,
        'aggregator': aggregator
    }

    # logger
    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # threads and device
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.gpu else "cpu")
    print("device:", device)
    gpu = args.gpu
    if gpu:
        print("current available gpu numbers: %d" % torch.cuda.device_count())
        if torch.cuda.is_available():
            torch.cuda.set_device(CUDA_VISIBLE_DEVICES)
            print("CUDA Device: %d" % torch.cuda.current_device())

    # envs

    #envs = gym.make(task)
    #obs_space = env.observation_space
    #act_space = env.action_space
    #action_template = env.action_space.noop()
    env = gym.make(args.task)
    obs_space = env.observation_space
    act_space = env.action_space
    action_template = env.action_space.noop()

    # policy
    actor_critic = Policy(obs_space, act_space, base_kwargs=base_kwargs)
    actor_critic.to(device)

    # algorithm
    if args.algo == 'ppo':
        agent = PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=7e-4,
            eps=1e-5,
            max_grad_norm=args.max_grad_norm,
        )
    else:
        raise NotImplementedError

    # storage
    replay_buffer = None
    if args.frame_history_len > 1:
        _, _, non_pixel_shape = parse_obs_space(obs_space)
        add_non_pixel = True if non_pixel_shape > 0 else False
        replay_buffer = ReplayBuffer(100000, args.frame_history_len,
                                     non_pixel_shape, add_non_pixel)

    rollouts = RolloutStorage(replay_buffer, args.frame_history_len,
                              args.num_steps, args.num_processes, obs_space,
                              act_space)

    obs = env.reset()
    #print("reset obs pov size: ",obs['pov'].shape)
    # obs: key: inventory.dirt...
    # (num_processes, size)

    pov, non_pixel_feature = get_obs_features(obs_space, obs)
    #pov, non_pixel_feature = multi_get_obs_features(obs)
    if args.frame_history_len > 1:
        last_stored_frame_idx = replay_buffer.store_frame(
            pov, non_pixel_feature)
        pov = replay_buffer.encode_recent_observation() / 255.0  # 12 h w
        pov = torch.from_numpy(pov.copy()).reshape(args.num_processes,
                                                   *pov.shape)
    elif args.frame_history_len == 1:
        pov = pov.transpose(2, 0, 1) / 255.0
        pov = torch.from_numpy(pov.copy()).reshape(args.num_processes,
                                                   *pov.shape)
    else:
        raise NotImplementedError

    non_pixel_feature = (torch.tensor(non_pixel_feature) / 180.0).reshape(
        args.num_processes, -1)

    rollouts.obs[0].copy_(pov)
    rollouts.non_pixel_obs[0].copy_(non_pixel_feature)
    rollouts.to(device)

    # ?
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print("Total steps: ", args.num_env_steps)

    ep = 0
    ep_rewards = []
    #mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    #total_rewards = [0 for i in range(args.num_processes)]
    total_rewards = 0

    for j in range(num_updates):

        for step in range(args.num_steps):
            # num_steps = 5
            # Sample actions
            with torch.no_grad():
                # actor_critic.act output size
                # actions: torch.Tensor, not list
                value, actions, action_log_probs = actor_critic.act(
                    rollouts.obs[step], rollouts.non_pixel_obs[step])

            # value size: batch x 1
            # actions size: torch.Tensor num_processes x num_branches
            # action_log_probs : torch.Tensor num_processes x num_branches
            #print(actions)
            actions_list = actions.squeeze().tolist()

            action = get_actions_continuous(actions_list, act_space,
                                            action_template)

            # step:
            #print(actions)
            obs, reward, done, infos = env.step(action)
            #print('.',end='')
            if args.num_env_steps <= 50000:
                env.render()

            pov, non_pixel_feature = get_obs_features(obs_space, obs)
            #pov, non_pixel_feature = multi_get_obs_features(obs)
            if args.frame_history_len > 1:
                last_stored_frame_idx = replay_buffer.store_frame(
                    pov, non_pixel_feature)
                pov = replay_buffer.encode_recent_observation(
                ) / 255.0  # 12 h w
                pov = torch.from_numpy(pov.copy()).reshape(
                    args.num_processes, *pov.shape)
            elif args.frame_history_len == 1:
                pov = pov.transpose(2, 0, 1) / 255.0
                pov = torch.from_numpy(pov.copy()).reshape(
                    args.num_processes, *pov.shape)
            else:
                raise NotImplementedError

            non_pixel_feature = (torch.tensor(non_pixel_feature) /
                                 180.0).reshape(args.num_processes, -1)

            total_rewards += reward
            #for i in range(len(reward)):
            #    total_rewards[i] += reward[i]
            reward = torch.tensor([reward]).reshape(args.num_processes,
                                                    -1).type(dtype)

            # TODO: may not need bad_masks
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            bad_masks = torch.FloatTensor([[1.0]])

            if done:
                ep += 1
                ep_rewards.append(total_rewards)
                best_mean_episode_reward = log(j, args.task, ep,
                                               np.array(ep_rewards),
                                               best_mean_episode_reward)

                obs = env.reset()
                pov, non_pixel_feature = get_obs_features(obs_space, obs)
                #pov, non_pixel_feature = multi_get_obs_features(obs)
                if args.frame_history_len > 1:
                    last_stored_frame_idx = replay_buffer.store_frame(
                        pov, non_pixel_feature)
                    pov = replay_buffer.encode_recent_observation(
                    ) / 255.0  # 12 h w
                    pov = torch.from_numpy(pov.copy()).reshape(
                        args.num_processes, *pov.shape)
                elif args.frame_history_len == 1:
                    pov = pov.transpose(2, 0, 1) / 255.0
                    pov = torch.from_numpy(pov.copy()).reshape(
                        args.num_processes, *pov.shape)
                else:
                    raise NotImplementedError
                non_pixel_feature = (torch.tensor(non_pixel_feature) /
                                     180.0).reshape(args.num_processes, -1)

                total_rewards = 0
            # ?
            rollouts.insert(pov, non_pixel_feature, actions, action_log_probs,
                            value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.non_pixel_obs[-1])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # TODO: mini-batch = 32; num_processes x num_steps should be larger than 32
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_model_dir != '':
            save_path = os.path.join(args.save_model_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save(actor_critic, os.path.join(save_path,
                                                  args.task + ".pt"))

        if j % args.log_interval == 0 and len(ep_rewards) >= 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print("----------- Logs -------------")
            if len(ep_rewards) == 0:
                print(
                    "Updates {}, num timesteps {}, FPS {} \nThe {}th training episodes,"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(ep_rewards)))
            else:
                print(
                    "Updates {}, num timesteps {}, FPS {} \nThe {}th training episodes,\nmean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(ep_rewards), np.mean(ep_rewards),
                            np.median(ep_rewards), np.min(ep_rewards),
                            np.max(ep_rewards)))

    print("-----------------------Training ends-----------------------")
    env.close()
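The get_obs_features helper used throughout the loop above is not part of this excerpt. Assuming observations are MineRL-style dicts with a 'pov' image plus scalar features (e.g. a compass angle, which the caller later normalizes by 180), a minimal hypothetical sketch might look like:

import numpy as np

def get_obs_features_sketch(obs_space, obs):
    # Hypothetical stand-in for get_obs_features (real implementation not shown):
    # split the observation dict into the pixel 'pov' image and a flat list of
    # the remaining scalar features.
    pov = np.asarray(obs['pov'])
    non_pixel = []
    for key in sorted(obs.keys()):
        if key == 'pov':
            continue
        value = obs[key]
        if isinstance(value, dict):  # e.g. nested 'inventory' counts
            non_pixel.extend(float(v) for v in value.values())
        else:
            non_pixel.extend(np.asarray(value, dtype=np.float32).ravel().tolist())
    return pov, non_pixel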
コード例 #12
0
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of chosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    ###############
    # BUILD MODEL #
    ###############

    # Set input_arg for the Q function according to the observation size
    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c

    # Get the number of actions from the simulator
    num_actions = env.action_space.size

    # Construct an epsilon-greedy policy with the given exploration schedule
    ## Compare a random sample against the exploration schedule and return either
    ## the greedy action or a random action accordingly
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        #
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return torch.IntTensor(
                [[model(Variable(obs)).data.max(1)[1].cpu()]])
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the Q function and the target Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer from optimizer_spec
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    # Initialize bookkeeping variables
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    # TensorboardX writer for monitoring
    writer = SummaryWriter()

    # t starts at 0 and increases by one each loop, so it tracks how many iterations have run
    for t in count():
        ### Step the env and store the transition
        # Store the latest observation in replay memory; the corresponding action,
        # reward, and done flag will be stored at last_idx
        last_idx = replay_buffer.store_frame(last_obs)

        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.

        # Take the most recent frames from replay_buffer and build the stacked input for the Q network
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts; once t exceeds learning_starts,
        # take the epsilon-greedy action from the learned Q network instead
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)

        # Advance one step: take the action, record the resulting observation (obs),
        # reward, and done flag, and push them into the replay buffer
        obs, reward, done = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)

        # Reset the environment when reaching an episode boundary
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken

        ## Learning starts once t exceeds learning_starts, t lines up with learning_freq,
        ## and the buffer holds enough samples for a batch
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target

            # Sample a batch_size worth of transitions from replay_buffer
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)

            # Convert numpy nd_array to torch variables for calculation

            # (converted from numpy arrays into torch tensors matching the model input)
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            # Compute the current Q values
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))

            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            # Set the next Q value according to which action gives the max Q value
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            # and minimize the loss via backward() and the optimizer step
            target_Q_values = rew_batch + (gamma * next_Q_values)
            loss = F.smooth_l1_loss(current_Q_values,
                                    target_Q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Perform the update, then increment the update counter
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            # (every target_update_freq parameter updates)
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        # ### 4. Log progress and keep track of statistics
        # Track episode rewards; after 100 episodes, also track the best mean alongside the mean
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        # Log to Tensorboard
        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1],
                              len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward,
                              len(episode_rewards))
            if len(episode_rewards) > 100:
                writer.add_scalar('data/DQN/best_mean_score',
                                  best_mean_episode_reward,
                                  len(episode_rewards))

        # Print training progress
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')
            ## Save to an appropriate file

    writer.close()
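All of these examples lean on ReplayBuffer.encode_recent_observation to stack the last frame_history_len frames into the network input. The buffer implementation is not shown here; the toy snippet below only illustrates the commonly used convention (zero-padding missing history and concatenating along the channel axis), which is an assumption about this particular ReplayBuffer:

import numpy as np

frame_history_len = 4
# Pretend only 3 frames have been observed so far, each 84x84 with 1 channel
frames = [np.full((84, 84, 1), i, dtype=np.uint8) for i in range(3)]

# Assumed behaviour: missing history is zero-padded, then frames are concatenated channel-wise
pad = [np.zeros((84, 84, 1), dtype=np.uint8)] * (frame_history_len - len(frames))
stacked = np.concatenate(pad + frames, axis=2)
print(stacked.shape)  # (84, 84, 4) -- the shape handed to the Q network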
コード例 #13
0
def dqn_learn(env, q_func, optimizer_spec, exploration, stopping_criterion,
              replay_buffer_size, batch_size, gamma, learning_starts,
              learning_freq, frame_history_len, target_update_freq,
              grad_norm_clipping, double_q):
    """Implements DQN training
    
    Parameters
    ----------
    env : gym.Env
        OpenAI gym environment
    q_func : torch.nn.Module
        DQN that computes q-values for each action: (state) -> (q-value, action)
    optimizer_spec : OptimizerSpec
        parameters for the optimizer
    exploration : Schedule
        schedule for epsilon-greedy exploration
    stopping_criterion : func
        when to stop training: (env, num_timesteps) -> bool
    replay_buffer_size : int
        experience replay memory size
    batch_size : int
        batch size to sample from replay memory
    gamma : float
        discount factor
    learning_starts : int
        number of environment steps before starting the training process
    learning_freq : int
        number of environment steps between updating DQN weights
    frame_history_len : int
        number of previous frames to include as DQN input
    target_update_freq : int
        number of experience replay steps to update the target network
    grad_norm_clipping : float
        maximum size of gradients to clip to
    double_q : bool
        enable double DQN learning
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    def select_action(dqn, obs, t):
        """Implements epsilon-greedy exploration
        
        Parameters
        ----------
        dqn : torch.nn.Module
            DQN model
        obs : np.ndarray
            Stacked input frames to evaluate
        t : int
            Current time step
        
        Returns
        -------
        np.ndarray (1,1)
            action to take
        """
        threshold = exploration.value(t)
        if random.random() > threshold:
            # take optimal action
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # DQN returns (q-value, action)
            q_values = dqn(obs)
            # returns (max, argmax) of q-values (max q-value, action which produces max q-value)
            _, action = q_values.data.max(1)
        else:
            # take a random action
            action = torch.IntTensor([random.randrange(num_actions)])
        return action

    # get input sizes and num actions
    img_h, img_w, img_c = env.observation_space.shape
    in_channels = frame_history_len * img_c
    input_shape = (img_h, img_w, in_channels)
    num_actions = env.action_space.n

    # construct online and target DQNs
    online_DQN = q_func(in_channels=in_channels, num_actions=num_actions)
    target_DQN = q_func(in_channels=in_channels, num_actions=num_actions)

    # construct optimizer
    optimizer = optimizer_spec.constructor(online_DQN.parameters(),
                                           **optimizer_spec.kwargs)

    # construct replay memory
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # initialize main loop variables
    num_param_updates = 0
    avg_episode_reward = float('-inf')
    best_avg_episode_reward = float('-inf')
    cumulative_avg_episode_reward = float('-inf')
    prev_obs = env.reset()

    # main training loop
    for t in count():
        # check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        # store transition and concatenate last frames
        last_idx = replay_buffer.store_frame(prev_obs)

        # stack previous frames into a tensor to give to DQN
        stacked_obs = replay_buffer.encode_recent_observation()

        # take random actions until we've officially started training
        if t > learning_starts:
            # select action according to epsilon-greedy
            action = select_action(online_DQN, stacked_obs, t)[0]
        else:
            # take a random action
            action = random.randrange(num_actions)

        # step environment
        obs, reward, done, _ = env.step(action)
        # clip reward
        reward = max(-1.0, min(reward, 1.0))
        # store effect of taking action in prev_obs into replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)

        # if game is finished, reset environment
        if done:
            obs = env.reset()
        prev_obs = obs

        # experience replay
        if t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(
                batch_size):

            # sample batches
            obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            obs_batch = torch.from_numpy(obs_batch).type(dtype) / 255.0
            action_batch = torch.from_numpy(action_batch).long()
            reward_batch = torch.from_numpy(reward_batch)
            next_obs_batch = torch.from_numpy(next_obs_batch).type(
                dtype) / 255.0
            not_done_mask = torch.from_numpy(1 - done_mask).type(dtype)

            if torch.cuda.is_available():
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()

            # Compute current q-values: Q(s, a)
            # Select q-values based on actions we would have taken for each state
            # shape: (BATCH_SIZE, 1)
            current_q_values = online_DQN(obs_batch).gather(
                1, action_batch.unsqueeze(1))

            # double DQN or vanilla DQN
            if double_q:
                # compute which actions to take according to online network: argmax_a Q(s', a)
                greedy_actions = online_DQN(next_obs_batch).detach().max(1)[1]
                # compute q-values of those actions using target network: Q_hat(s', argmax_a Q(s', a))
                next_q_values = target_DQN(next_obs_batch).gather(
                    1, greedy_actions.unsqueeze(1))
            else:
                # Compute next q-values using target network
                next_q_values = target_DQN(next_obs_batch).detach().max(1)[0]
                next_q_values = next_q_values.unsqueeze(1)

            # apply mask to retain q-values
            next_q_values = not_done_mask.unsqueeze(1) * next_q_values
            """
            Compute the target q-values (BATCH_SIZE, 1)
            y_j = r_j + gamma * max_a' Q(s', a')                for vanilla DQN
            y_j = r_j + gamma * Q_hat(s', argmax_a Q(s', a))    for double DQN
            """
            target_q_values = reward_batch + (gamma * next_q_values)
            """
            Use the huber loss instead of clipping the TD error.
            Huber loss intuitively means we assign a much larger loss where the error is large (quadratic)
            Smaller errors equate to smaller losses (linear)
            """
            loss = F.smooth_l1_loss(current_q_values, target_q_values)

            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            loss.backward()

            # clip gradients
            nn.utils.clip_grad_norm_(online_DQN.parameters(),
                                     grad_norm_clipping)

            # update weights of dqn
            optimizer.step()
            num_param_updates += 1

            # update target network weights
            if num_param_updates % target_update_freq == 0:
                target_DQN.load_state_dict(online_DQN.state_dict())

        # end experience replay

        # log progress so far by averaging last 100 episodes
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            avg_episode_reward = np.mean(episode_rewards[-100:])
            cumulative_avg_episode_reward = np.mean(episode_rewards)
        if len(episode_rewards) > 100:
            best_avg_episode_reward = max(best_avg_episode_reward,
                                          avg_episode_reward)

        if t % LOG_FREQ == 0 and t > learning_starts:
            print('-' * 64)
            print('Timestep {}'.format(t))
            print(
                'Average reward (100 episodes): {}'.format(avg_episode_reward))
            print('Best average reward: {}'.format(best_avg_episode_reward))
            print('Cumulative average reward: {}'.format(
                cumulative_avg_episode_reward))
            print('Episode {}'.format(len(episode_rewards)))
            print('Exploration {}'.format(exploration.value(t)))
            print('\n')
            sys.stdout.flush()
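As a side-by-side illustration of the double_q branch above, here is a minimal toy sketch (random tensors stand in for the real network outputs) contrasting the vanilla and double-DQN targets:

import torch

batch, num_actions, gamma = 4, 3, 0.99
reward = torch.zeros(batch, 1)
not_done = torch.ones(batch, 1)
online_q_next = torch.randn(batch, num_actions)   # Q(s', .) from the online net
target_q_next = torch.randn(batch, num_actions)   # Q_hat(s', .) from the target net

# Vanilla DQN: bootstrap with the target net's own maximum
vanilla_target = reward + gamma * not_done * target_q_next.max(1, keepdim=True)[0]

# Double DQN: pick the action with the online net, evaluate it with the target net
greedy_actions = online_q_next.argmax(1, keepdim=True)
double_target = reward + gamma * not_done * target_q_next.gather(1, greedy_actions)

print(vanilla_target.shape, double_target.shape)  # both (4, 1)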
コード例 #14
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        last_frames = deque(maxlen=4)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += []
        embeddings = []
        extractor = PongExtractor()

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < 2000:
            total_reward = 0
            state = self.env.reset()
            last_frame = state
            last_frames.append(state)
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                feats = extractor.extract(np.squeeze(state))
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # chose action according to current Q and exploration
                best_action, q_values = self.get_best_action(q_input)
                embedding = self.sess.run(self.hidden,
                                          feed_dict={self.s: [q_input]})[0]
                # embedding = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0]
                # print embedding.shape
                embeddings.append(embedding)

                action = best_action

                frame = np.squeeze(state)
                scipy.misc.imsave(
                    'embeddings/breakout/breakout{}.png'.format(t), frame)

                # store q values
                max_q_values.append(max(q_values))
                q_values += list(q_values)
                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state
                total_reward += reward
                if done or t >= 2000:
                    print(total_reward, t)
                    break
            # updates to perform at the end of an episode
            rewards.append(total_reward)

        # last words
        print('Saving embeddings')
        np.save('embeddings/breakout/breakout.npy', np.vstack(embeddings))
コード例 #15
0
ファイル: model.py プロジェクト: zachabarnes/slither-rl-agent
    def train(self, exp_schedule, lr_schedule):
        # Initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.FLAGS.buffer_size,
                                     self.FLAGS.state_hist)
        rewards = deque(maxlen=self.FLAGS.num_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = 0  # time control of nb of steps
        loss_eval = grad_eval = 0
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

        self.prog = Progbar(target=self.FLAGS.train_steps)

        # Train for # of train steps
        continual_crash = 0
        while t < self.FLAGS.train_steps:
            try:
                total_reward = 0
                ep_len = 0
                state = self.env.reset()

                # Run for 1 episode and update the buffer
                while True:
                    ep_len += 1

                    # replay memory stuff
                    idx = replay_buffer.store_frame(state)
                    q_input = replay_buffer.encode_recent_observation()

                    # chose action according to current Q and exploration
                    best_action, q_values = self.network.get_best_action(
                        q_input)
                    action = exp_schedule.get_action(best_action)

                    # store q values
                    max_q_values.append(max(q_values))
                    q_values += list(q_values)

                    # perform action in env
                    new_state, reward, done, info = self.env.step(action)

                    # store the transition
                    replay_buffer.store_effect(idx, action, reward, done)
                    state = new_state

                    # Count reward
                    total_reward += reward

                    # Stop at end of episode
                    if done: break

                #Store episodic rewards
                if ep_len > 1: rewards.append(total_reward)

                # Learn using replay
                while True:
                    t += 1
                    ep_len -= 1

                    # Make train step if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.learn_every == 0)):
                        loss_eval, grad_eval = self.network.update_step(
                            t, replay_buffer, lr_schedule.epsilon,
                            self.summary)
                        exp_schedule.update(t)
                        lr_schedule.update(t)

                    if (t % self.FLAGS.target_every == 0):
                        self.network.update_target_params()

                    # Update logs if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.log_every == 0)
                            and (len(rewards) > 0)):
                        self.update_averages(rewards, max_q_values, q_values,
                                             scores_eval)
                        self.update_logs(t, loss_eval, rewards,
                                         exp_schedule.epsilon, grad_eval,
                                         lr_schedule.epsilon)

                    # Update logs if necessary
                    elif (t < self.FLAGS.learn_start) and (
                            t % self.FLAGS.log_every == 0):
                        sys.stdout.write(
                            "\rPopulating the memory {}/{}...".format(
                                t, self.FLAGS.learn_start))
                        sys.stdout.flush()

                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.check_every == 0)):
                        # Evaluate current model
                        scores_eval += [
                            self.evaluate(self.env, self.FLAGS.num_test)
                        ]

                        # Save current Model
                        self.network.save()

                        # Record video of current model
                        if self.FLAGS.record:
                            self.record()

                    if ep_len <= 0 or t >= self.FLAGS.train_steps: break
                continual_crash = 0

            except Exception as e:
                continual_crash += 1
                self.logger.info(e)
                if continual_crash >= 10:
                    self.logger.info("Crashed 10 times -- stopping u suck")
                    raise e
                else:
                    t -= 1
                    self.logger.info("Env crash, making new env")
                    time.sleep(60)
                    self.env = create_slither_env(self.FLAGS.state_type)
                    self.env = Unvectorize(self.env)
                    self.env.configure(fps=self.FLAGS.fps,
                                       remotes=self.FLAGS.remotes,
                                       start_timeout=15 * 60,
                                       vnc_driver='go',
                                       vnc_kwargs={
                                           'encoding': 'tight',
                                           'compress_level': 0,
                                           'fine_quality_level': 50
                                       })
                    time.sleep(60)

        # End of training
        self.logger.info("- Training done.")
        self.network.save()
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
        export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
コード例 #16
0
ファイル: dqn_learn.py プロジェクト: YSLIU627/ML-project-2020
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own conv-net using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        print(env.observation_space.shape)
        img_h, img_w, img_c = env.observation_space.shape
        # input_arg = frame_history_len * img_c
        input_arg = frame_history_len * 1
    num_actions = env.action_space.n
    print(env.action_space)
    print(f"({input_arg}): ({img_h}X{img_w}X{img_c})")

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            with torch.no_grad():
                values = model(Variable(obs))
            return values.data.max(1)[1].cpu().unsqueeze(dim=1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    obs = cv.cvtColor(last_obs, cv.COLOR_BGR2GRAY)
    obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2))
    last_obs = obs[..., np.newaxis]
    print("Q model:")
    summary(Q, input_size=(input_arg, last_obs.shape[0], last_obs.shape[1]))
    print("Q-TARGET model:")
    summary(target_Q,
            input_size=(input_arg, last_obs.shape[0], last_obs.shape[1]))
    LOG_EVERY_N_STEPS = 10000

    rewards = 0.
    out_count = 0
    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break
        if t % 1e3 == 0:
            if out_count == 0:
                stdout.write("|")
                out_count += 1
            elif out_count % 10 == 0:
                stdout.write(f"{out_count}|")
                out_count += 1
            elif out_count >= 50:
                stdout.write("=> \n")
                out_count = 0
            else:
                stdout.write(".")
                out_count += 1
            stdout.flush()
        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            values = select_epilson_greedy_action(Q, recent_observations, t)
            action = values[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        rewards += reward
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
            print(len(episode_rewards), episode_rewards, rewards)
            rewards = 0.
        # print(obs.shape)
        # cv.imshow('now_color', obs)
        # cv.waitKey(1)
        obs = cv.cvtColor(obs, cv.COLOR_BGR2GRAY)
        obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2))
        obs = obs[..., np.newaxis]
        # cv.imshow('now', obs)
        # cv.waitKey(1)
        last_obs = obs
        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            values = Q(obs_batch)
            current_Q_values = values.gather(1,
                                             act_batch.unsqueeze(1)).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Code example #17
0
    def train(self, exp_schedule, lr_schedule, choose_teacher_strategy=None):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        allsteps = []
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                if self.config.state_subspace is not None:
                    out_of_bounds = False
                    if self.config.state_subspace in [
                            'ball_top_half', 'ball_bottom_half'
                    ]:
                        image = self.env.unwrapped._get_obs()
                        ball_position = ball_half_screen_position(image)
                        # check if ball is in top half but we're restricted to bottom half
                        if ball_position == 1 and self.config.state_subspace == 'ball_bottom_half':
                            out_of_bounds = True
                        # check if ball is in bottom half but we're restricted to top half
                        elif ball_position == 0 and self.config.state_subspace == 'ball_top_half':
                            out_of_bounds = True
                    else:
                        raise NotImplementedError
                    if out_of_bounds:  # current state is outside of this agent's state subspace
                        # step with the previously chosen action (fall back to a random one on the very first step)
                        if t == 0:
                            action = self.env.action_space.sample()
                        # perform action in env
                        state, reward, done, info = self.env.step(action)

                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()
                # self.q_inputs.append(q_input)

                # choose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                if choose_teacher_strategy is not None:
                    # store the reward with the teacher choice strategy
                    choose_teacher_strategy.store_reward(reward, q_input)

                # perform a training step
                loss_eval, grad_eval = self.train_step(
                    t, replay_buffer, lr_schedule.epsilon,
                    choose_teacher_strategy)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if choose_teacher_strategy is not None:
                        choose_teacher_strategy.update_schedule(t)
                    if len(rewards) > 0:
                        exact = [("Loss", loss_eval),
                                 ("Avg R", self.avg_reward),
                                 ("Max R", np.max(rewards)),
                                 ("eps", exp_schedule.epsilon),
                                 ("Grads", grad_eval), ("Max Q", self.max_q),
                                 ("lr", lr_schedule.epsilon)]
                        if choose_teacher_strategy is not None and hasattr(
                                choose_teacher_strategy, 'eps_schedule'):
                            exact.append(
                                ("Choose teacher eps",
                                 choose_teacher_strategy.eps_schedule.epsilon))
                        prog.update(t + 1, exact=exact)

                elif ((t > self.config.learning_start)
                      and (t % self.config.save_teacher_choice_freq == 0)
                      and (choose_teacher_strategy is not None)):
                    choose_teacher_strategy.save(
                        self.config.teacher_choice_output_path)

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
        if choose_teacher_strategy is not None:
            choose_teacher_strategy.save(
                self.config.teacher_choice_output_path)
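The loop above leans on an exp_schedule object exposing get_action(best_action), update(t) and an epsilon attribute, but the schedule class itself is not shown on this page. A minimal sketch of what such a schedule could look like, assuming linear annealing over a gym environment (the real Exploration class may differ):

# Hypothetical stand-in for exp_schedule: linearly annealed epsilon-greedy
# exploration with the get_action(best_action) / update(t) interface used above.
import numpy as np

class LinearExploration(object):
    def __init__(self, env, eps_begin=1.0, eps_end=0.1, nsteps=100000):
        self.env = env
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # linearly anneal epsilon from eps_begin to eps_end over nsteps, then hold
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

    def get_action(self, best_action):
        # with probability epsilon take a random action, otherwise the greedy one
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action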
Code example #18
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        ############
        # PLAYER 1 #
        ############
        replay_buffer_p1 = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards_p1 = deque(maxlen=self.config.num_episodes_test)
        max_q_values_p1 = deque(maxlen=1000)
        q_values_p1 = deque(maxlen=1000)

        ############
        # PLAYER 2 #
        ############
        replay_buffer_p2 = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards_p2 = deque(maxlen=self.config.num_episodes_test)
        max_q_values_p2 = deque(maxlen=1000)
        q_values_p2 = deque(maxlen=1000)

        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        scores_eval += [self.evaluate()]
        
        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward_p1 = 0
            total_reward_p2 = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx_p1      = replay_buffer_p1.store_frame(state)
                # should get observation from last frame of p2
                q_input_p1 = replay_buffer_p2.encode_recent_observation()

                ############
                # PLAYER 1 #
                ############

                # choose action according to current Q and exploration
                best_action, q_values = self.get_best_action(q_input_p1)
                action                = exp_schedule.get_action(best_action)

                # store q values
                max_q_values_p1.append(max(q_values))
                q_values_p1 += list(q_values)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)
                total_reward_p1 += reward

                # store the transition
                replay_buffer_p1.store_effect(idx_p1, action, reward, done)
                state = new_state


                # BEFORE MOVING TO PLAYER 2, need to check if terminal TODO

                ############
                # PLAYER 2 #
                ############

                idx_p2      = replay_buffer_p2.store_frame(state)
                q_input_p2 = replay_buffer_p1.encode_recent_observation()

                # TODO: need to flip the input board state
                print(q_input_p2)

                # choose action according to current Q and exploration
                best_action, q_values = self.get_best_action(q_input_p2)
                action                = exp_schedule.get_action(best_action)

                # store q values
                max_q_values_p2.append(max(q_values))
                q_values_p2 += list(q_values)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)
                total_reward_p2 += reward

                # store the transition
                replay_buffer_p2.store_effect(idx_p2, action, reward, done)
                state = new_state

                # perform a training step
                # loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                   (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards_p1, max_q_values_p1, q_values_p1, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards_p1) > 0:
                        # loss/grads are not reported while the train_step call above is commented out
                        prog.update(t + 1, exact=[("Avg R", self.avg_reward),
                                        ("Max R", np.max(rewards_p1)), ("eps", exp_schedule.epsilon),
                                        ("Max Q", self.max_q),
                                        ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                        self.config.learning_start))
                    sys.stdout.flush()

                # rewards are accumulated per player right after each env.step above
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards_p1.append(total_reward_p1)
            rewards_p2.append(total_reward_p2)

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
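The TODO above notes that the board state still needs to be flipped before player 2's network sees it. A hypothetical helper, assuming a square numpy board encoded with +1 for player 1, -1 for player 2 and 0 for empty (the project's actual state encoding is not shown here):

# Hypothetical sketch for the "flip the input board state" TODO above:
# present the board from player 2's point of view by negating piece ownership.
import numpy as np

def flip_board_perspective(board):
    # assumes +1 = player 1, -1 = player 2, 0 = empty
    return -np.asarray(board)

# example: a board seen by player 1 ...
board_p1_view = np.array([[ 1,  0, -1],
                          [ 0,  1,  0],
                          [-1,  0,  0]])
# ... becomes, for player 2, a board where "my" pieces are again +1
board_p2_view = flip_board_perspective(board_p1_view)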
Code example #19
0
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000,
                statistics_file_name="statistics.pkl"):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network

    statistics_file_name: str
        Where to store the statistics file
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    print("STATISTICS_FILE_NAME: {}".format(statistics_file_name))

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(
                torch_types.FloatTensor).unsqueeze(0) / 255.0
            # Inference only: no need to save the history, so evaluate under torch.no_grad()
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return random.randrange(num_actions)

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE
    policy_net = q_func(input_arg, num_actions).to(device).type(
        torch_types.FloatTensor)  # Q
    target_net = q_func(input_arg, num_actions).to(device).type(
        torch_types.FloatTensor)  # Q target
    target_net.load_state_dict(
        policy_net.state_dict())  # copies the state of policy Q into target

    ######

    # Construct policy_net network optimizer function
    optimizer = optimizer_spec.constructor(policy_net.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        # YOUR CODE HERE
        stored_frame_idx = replay_buffer.store_frame(last_obs)
        last_obs_encoded = replay_buffer.encode_recent_observation()
        action = select_epilson_greedy_action(policy_net, last_obs_encoded, t)

        obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(stored_frame_idx, action, reward, done)

        if done:
            obs = env.reset()

        last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       maskout post terminal status Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            sample = replay_buffer.sample(batch_size)
            obs_batch, actions_batch, rewards_batch, next_obs_batch, done_mask = sample

            # convert batches to pytorch tensors:
            obs_batch = torch.from_numpy(obs_batch).to(device).type(
                torch_types.FloatTensor) / 255.0
            next_obs_batch = torch.from_numpy(next_obs_batch).to(device).type(
                torch_types.FloatTensor) / 255.0
            actions_batch = torch.from_numpy(actions_batch).to(device).type(
                torch_types.LongTensor)
            rewards_batch = torch.from_numpy(rewards_batch).to(device).type(
                torch_types.FloatTensor)
            non_final_mask = 1 - torch.from_numpy(done_mask).to(device).type(
                torch_types.FloatTensor)

            # inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html:

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken
            state_action_values = policy_net(obs_batch).gather(
                1, actions_batch.unsqueeze(1)).squeeze(1)

            # Compute V(s_{t+1}) for all next states.
            next_state_values = target_net(next_obs_batch).max(
                1)[0].detach() * non_final_mask
            # Compute the expected Q values
            expected_state_action_values = (next_state_values *
                                            gamma) + rewards_batch

            # Compute loss
            d_error = state_action_values - expected_state_action_values  # = -bellman_error
            d_error.clamp_(-1, 1)

            # Optimize the model
            optimizer.zero_grad()
            state_action_values.backward(d_error)
            optimizer.step()

            num_param_updates += 1
            # Periodically update target network:
            if num_param_updates % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t >= learning_starts:
            print("Timestep %d" % (t, ))
            print("  mean reward (100 episodes) %f" % mean_episode_reward)
            print("  best mean reward %f" % best_mean_episode_reward)
            print("  episodes %d" % len(episode_rewards))
            print("  exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open(statistics_file_name, 'wb') as f:
                pickle.dump(Statistic, f)
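The docstring above describes optimizer_spec as bundling the optimizer constructor and its keyword arguments, and the function consumes it via optimizer_spec.constructor(policy_net.parameters(), **optimizer_spec.kwargs). A hedged sketch of how such a spec might be built at the call site, assuming OptimizerSpec is a two-field namedtuple (this mirrors common dqn_utils setups and is not copied from this page):

# Hedged sketch (assumption, not code from this page): an OptimizerSpec
# namedtuple holding the optimizer constructor and its keyword arguments.
from collections import namedtuple

import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
)
# inside dqn_learing this becomes:
#     optimizer = optimizer_spec.constructor(policy_net.parameters(), **optimizer_spec.kwargs)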
Code example #20
0
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    if not os.path.isdir("./models"):
        os.mkdir("./models")

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                ret = model(obs).data.max(1)[1].cpu()
                return ret
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    save_best_mean_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 20000
    SAVE_EVERY_N_STEPS = 2000000
    AL_ALPHA = 0.7

    for t in count():
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not yet started
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            cur_all_Q_values = Q(obs_batch)
            action_gap = cur_all_Q_values.max(
                dim=1)[0] * cur_all_Q_values.size(1) - cur_all_Q_values.sum(
                    dim=1)
            Statistic["mean_action_gap"].append(action_gap.mean().item())

            current_Q_values = cur_all_Q_values.gather(
                1, act_batch.unsqueeze(1)).squeeze()
            next_target_Q_values = target_Q(next_obs_batch).detach()
            next_max_q = next_target_Q_values.max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            target_Q_values = rew_batch + (gamma * next_Q_values)
            bellman_error = target_Q_values - current_Q_values

            cur_target_Q_values = target_Q(obs_batch).detach()

            cur_advantage = cur_target_Q_values.max(
                dim=1)[0] - cur_target_Q_values.gather(
                    1, act_batch.unsqueeze(1)).squeeze()
            next_advantage = next_target_Q_values.max(
                dim=1)[0] - next_target_Q_values.gather(
                    1, act_batch.unsqueeze(1)).squeeze()

            # Set up the error according to the operator you want
            al_error = bellman_error - AL_ALPHA * cur_advantage
            persistent_error = bellman_error - AL_ALPHA * next_advantage
            pal_error = torch.max(al_error, persistent_error)
            error = pal_error  # use whichever you want

            clipped_bellman_error = error.clamp(-1, 1)
            d_error = clipped_bellman_error * -1.0
            optimizer.zero_grad()
            current_Q_values.backward(d_error.data)

            optimizer.step()
            num_param_updates += 1

            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ## Log Progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % './models/statistics.pkl')

            if save_best_mean_reward < best_mean_episode_reward:
                save_best_mean_reward = best_mean_episode_reward
                torch.save(Q.state_dict(), './models/best_model.pth')

        if t % SAVE_EVERY_N_STEPS == 0:
            torch.save(Q.state_dict(), './models/n_steps_%d.pth' % t)
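The example above augments the Bellman error with advantage-learning (AL) and persistent advantage-learning (PAL) corrections before clipping. A toy numeric illustration of those operators on a single non-terminal transition (all names and numbers below are illustrative, not project code):

# Toy illustration (not project code) of the AL / PAL corrections used above,
# written out for one transition with 3 actions where action a=1 was taken.
import torch

AL_ALPHA = 0.7
gamma = 0.99

q_target_s = torch.tensor([1.0, 0.6, 0.2])       # target_Q(s)
q_target_s_next = torch.tensor([0.9, 0.4, 0.1])  # target_Q(s')
current_q = torch.tensor(0.55)                   # Q(s, a) for the taken action
reward = 0.5

bellman_target = reward + gamma * q_target_s_next.max()
bellman_error = bellman_target - current_q

cur_advantage = q_target_s.max() - q_target_s[1]             # advantage of the taken action in s
next_advantage = q_target_s_next.max() - q_target_s_next[1]  # advantage of the taken action in s'

al_error = bellman_error - AL_ALPHA * cur_advantage           # advantage learning
persistent_error = bellman_error - AL_ALPHA * next_advantage  # persistent variant
pal_error = torch.max(al_error, persistent_error)             # PAL takes the max of the two
print(al_error.item(), persistent_error.item(), pal_error.item())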
Code example #21
0
t = 0
while t < 100000:
    t = t + 1
    print(t)
    ### Check stopping criterion
    if stopping_criterion is not None and stopping_criterion(env):
        break

    ### Step the env and store the transition
    # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
    last_idx = replay_buffer.store_frame(last_obs)
    # encode_recent_observation will take the latest observation
    # that you pushed into the buffer and compute the corresponding
    # input that should be given to a Q network by appending some
    # previous frames.
    recent_observations = replay_buffer.encode_recent_observation()

    # Choose a random action if learning has not yet started
    if t > learning_starts:
        action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
    else:
        action = random.randrange(num_actions)
    # Advance one step
    obs, reward, done, _ = env.step(action)
    # clip rewards between -1 and 1
    reward = max(-1.0, min(reward, 1.0))
    # Store other info in replay memory
    replay_buffer.store_effect(last_idx, action, reward, done)
    # Resets the environment when reaching an episode boundary.
    if done:
        obs = env.reset()
Code example #22
0
def dqn_learing(
        #env,
        q_func,
        optimizer_spec,
        exploration,
        #stopping_criterion=None,
        replay_buffer_size=1000,
        batch_size=32,
        gamma=0.99,
        learning_starts=1,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000):

    #our own code
    read_image()
    rgb_data = depth_data.reshape(640, 480, 1)
    input_arg = rgb_data
    #input for the algorithm
    num_actions = 5
    last_obs = rgb_data

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # inference only: evaluate the network without tracking gradients
            # (Variable(..., volatile=True) was the pre-0.4 PyTorch way and is a no-op now)
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(1, num_actions).type(dtype)
    target_Q = q_func(1, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(1000, 1)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    for t in count():

        last_idx = replay_buffer.store_frame(last_obs)

        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not yet started
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step

        control_robot(action + 1)

        rgb_data = depth_data.reshape(640, 480, 1)
        obs = rgb_data
        ##evaluate the action
        dis_data = np.array(depth_data)
        dis_data[np.isnan(dis_data)] = 999999999999
        dis_data[dis_data == 0] = 999999999999
        dis = np.min(dis_data)
        print("MIN DISTANCE:" + str(dis) + "-------------")
        reward = 0
        if dis < 500:
            reward = 1
        else:
            reward = -1
        print("REWARD:" + str(reward) + "--------------")
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, False)
        # Resets the environment when reaching an episode boundary.
        #if done:
        #obs = env.reset()
        last_obs = obs

        if (t > 1 and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            print("Training")
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)

            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q values; q_func takes only the state and outputs a value for every state-action pair.
            # We choose Q based on the action taken.
            current_Q_values = Q(obs_batch).gather(
                1, act_batch.unsqueeze(1)).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach the variable from the current graph since we don't want gradients for next Q to be propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            print("next:", next_Q_values.shape)
            print("current:", current_Q_values.squeeze().shape)
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error  #.clamp(-1, 1)
            #print(clipped_bellman_error)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            #print(d_error.data)
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
Code example #23
0
File: dqn_learn.py Project: wwxFromTju/SV-RL
def dqn_learning(
        env,
        method,
        game,
        q_func,
        optimizer_spec,
        exploration,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        double=False,
        dueling=False,
        logdir=None,
        svrl=False,
        me_type=None,
        maskp=None,
        maskstep=None,
        maskscheduler=True
    ):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].view(1, 1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############

    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_MODEL_EVERY_N_STEPS = 1000000
    mask_scheduler_step = (1 - maskp) / maskstep

    for t in count():
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ################
        # STEP THE ENV #
        ################

        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, recent_observations, t)[0][0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            obs = env.reset()
        last_obs = obs

        ################
        #   TRAINING   #
        ################

        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):

            # mask scheduler
            if maskscheduler:
                maskp = min(maskp + mask_scheduler_step, 1)

            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)

            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            target_q_mat = target_Q(next_obs_batch).detach()

            # SV-RL scheme
            if svrl:
                target_q_mat = globals()[me_type](target_q_mat, target_q_mat.size(0), target_q_mat.size(1), maskp)

            if not double:
                next_max_q = target_q_mat.max(1)[0]
            else:
                q_temp = Q(next_obs_batch).detach()
                act_temp = np.argmax(q_temp.cpu(), axis=1)
                next_max_q = torch.sum(torch.from_numpy(np.eye(num_actions)[act_temp]).type(dtype) * target_q_mat.type(dtype), dim=1)

            next_Q_values = not_done_mask * next_max_q.type(dtype)
            target_Q_values = rew_batch + (gamma * next_Q_values)

            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

            optimizer.zero_grad()
            loss.backward()

            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)

            optimizer.step()
            num_param_updates += 1

            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ################
        # LOG PROGRESS #
        ################

        # save model
        if t % SAVE_MODEL_EVERY_N_STEPS == 0:
            if not os.path.exists("models"):
                os.makedirs("models")
            add_str = 'single'
            if double:
                add_str = 'double'
            if dueling:
                add_str = 'dueling'
            model_save_path = 'models/%s_%s_%s.ckpt' % (str(game[:-14]), add_str, method)
            torch.save(Q.state_dict(), model_save_path)

        # log process
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:

            logz.log_tabular('Timestep', t)
            logz.log_tabular('MeanReward100Episodes', mean_episode_reward)
            logz.log_tabular('BestMeanReward', best_mean_episode_reward)
            logz.log_tabular('Episodes', len(episode_rewards))
            logz.log_tabular('Exploration', exploration.value(t))
            logz.dump_tabular()

            sys.stdout.flush()
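The double-DQN branch above evaluates the online network's greedy action under the target network via a one-hot matrix product. An equivalent and arguably more direct formulation uses gather; the sketch below (not from this project) checks the two against each other on a toy batch:

# Sketch (not project code): gather-based double-DQN target, equivalent to the
# one-hot formulation above. Assumes q_next = Q(next_obs_batch).detach() and
# target_q_mat = target_Q(next_obs_batch).detach(), both of shape (batch, actions).
import torch

def double_dqn_next_values(q_next, target_q_mat):
    best_actions = q_next.argmax(dim=1, keepdim=True)        # action chosen by the online net
    return target_q_mat.gather(1, best_actions).squeeze(1)   # evaluated by the target net

# tiny check against the one-hot formulation used in the code above
q_next = torch.tensor([[0.1, 0.9], [0.7, 0.2]])
target_q_mat = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
one_hot = torch.eye(q_next.size(1))[q_next.argmax(dim=1)]
print(torch.allclose(double_dqn_next_values(q_next, target_q_mat),
                     (one_hot * target_q_mat).sum(dim=1)))   # expected: True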
Code example #24
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables

        if self.config.use_memory:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size,
                self.config.state_history,
                memory_size=self.config.memory_unit_size)
        else:
            replay_buffer = ReplayBuffer(self.config.buffer_size,
                                         self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()[0]]

        prog = Progbar(target=self.config.nsteps_train)

        evaluation_result_list = []
        oos_evalution_result_list = []

        # interact with environment
        prev_time = time.time()
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                if self.config.use_memory:
                    prev_memory = replay_buffer.encode_recent_memory()
                    best_action, q_vals, _, next_memory = self.get_best_action_with_memory(
                        q_input, prev_memory)
                    next_memory = np.squeeze(next_memory)
                else:
                    best_action, q_vals = self.get_best_action(q_input)
                # choose action according to current Q and exploration
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                if self.config.use_memory:
                    replay_buffer.store_memory(idx, next_memory)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                time_log_freq = 1000
                if t % time_log_freq == 0:
                    with open(self.config.output_path + 'time_log.txt',
                              'a') as of:
                        of.write('{}\n'.format(time.time() - prev_time))
                        of.write('\n')
                    prev_time = time.time()

                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg_R", self.avg_reward),
                                           ("Max_R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max_Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                score, complete, length = self.evaluate()
                if complete > 0:
                    evaluation_result_list += [(score, complete, length)]
                if score > self.config.extended_eval_threshold:
                    self.logger.info('Extended in-sample evaluation...')
                    self.evaluate(num_episodes=1000)
                    for _ in range(10):
                        self.logger.info(
                            'Extended out-of-sample evaluation...')
                        oos_result = self.evaluate(
                            EnvMaze(n=self.config.maze_size), num_episodes=100)
                        oos_evalution_result_list += [oos_result]
                scores_eval += [score]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()[0]]
        export_plot(scores_eval, "Scores", self.config.plot_output)

        return evaluation_result_list, oos_evalution_result_list
Code example #25
0
def dqn_learing(
    env,
    q_func,
    optimizer_spec,
    exploration,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000,
    num_actions1=31,
    num_actions2=27
    ):

    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    ###############
    # BUILD MODEL #
    ###############

    img_h, img_w, img_c = 32, 120, 1
    input_arg = frame_history_len * img_c

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0)
            # Inference only: evaluate the network without tracking gradients
            with torch.no_grad():
                out1, out2 = model(Variable(obs))
            out1 = out1.max(1)[1].data.cpu().numpy()[0]
            out2 = out2.max(1)[1].data.cpu().numpy()[0]
            return out1, out2
        else:
            return random.randrange(num_actions1), random.randrange(num_actions2)

    # Initialize target q function and q function
    Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype)
    target_Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    epoch_reward = []
    for t in count():

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not yet started
        if t > learning_starts:
            action1, action2 = select_epilson_greedy_action(Q, recent_observations, t)
        else:
            action1, action2 = random.randrange(num_actions1), random.randrange(num_actions2)
        # Advance one step
        obs, reward, done = env.step(action1, action2)
        epoch_reward.append(reward)
        if done:
            env.render()
        # clip rewards between -1 and 1
        # reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action1, action2, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
            print(np.mean(epoch_reward))
            epoch_reward = []
            torch.save(Q,'../../weights/Q' + str(num_actions1) + '.pt')
            torch.save(target_Q,'../../weights/target_Q' + str(num_actions1) + '.pt')
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act1_batch, act2_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype))
            act1_batch = Variable(torch.from_numpy(act1_batch).long())
            act2_batch = Variable(torch.from_numpy(act2_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype))
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act1_batch = act1_batch.cuda()
                act2_batch = act2_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values; q_func takes only the state and outputs
            # a value for every state-action pair. We then pick the Q value of the
            # action that was actually taken.
            q1, q2 = Q(obs_batch)
            current_Q1_values = q1.gather(1, act1_batch.unsqueeze(1))
            current_Q2_values = q2.gather(1, act2_batch.unsqueeze(1))
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for the next Q to be propagated
            tq1, tq2 = target_Q(next_obs_batch)
            next_max_q1 = tq1.detach().max(1)[0]
            next_max_q2 = tq2.detach().max(1)[0]
            next_Q1_values = not_done_mask * next_max_q1
            next_Q2_values = not_done_mask * next_max_q2
            # Compute the target of the current Q values
            target_Q1_values = rew_batch + (gamma * next_Q1_values)
            target_Q2_values = rew_batch + (gamma * next_Q2_values)
            # Compute Bellman error
            bellman_error1 = target_Q1_values.unsqueeze(1) - current_Q1_values
            bellman_error2 = target_Q2_values.unsqueeze(1) - current_Q2_values
            bellman_error = bellman_error1 + bellman_error2
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the right gradient to backpropagate
            d_error = clipped_bellman_error * -1.0
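            # Why this works: for the loss 0.5 * (current_Q - target_Q)^2 the
            # gradient w.r.t. current_Q is (current_Q - target_Q), i.e. minus the
            # Bellman error, so feeding clipped_bellman_error * -1 into backward()
            # below backpropagates the gradient of a clipped (Huber-style)
            # squared-error loss without materialising the loss tensor itself.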
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values = current_Q1_values + current_Q2_values
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
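
# Illustrative, self-contained sketch (not part of the example above): the
# one-step DQN target used in the update, r + gamma * max_a' Q_target(s', a'),
# with terminal transitions masked out, checked on toy tensors.
def _bellman_target_demo():
    import torch
    gamma = 0.99
    rew_batch = torch.FloatTensor([1.0, 0.0, -1.0])
    not_done_mask = torch.FloatTensor([1.0, 0.0, 1.0])  # 0 where the episode ended
    next_q = torch.FloatTensor([[0.2, 0.5], [0.1, 0.3], [0.0, 0.4]])
    next_max_q = next_q.max(1)[0]                        # max over the action dimension
    # expected result: approximately [1.495, 0.000, -0.604]
    return rew_batch + gamma * not_done_mask * next_max_q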
Code example #26
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        save_paths = False
        if num_episodes is None:
            self.logger.info("Evaluating...")
        else:
            save_paths = True

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env
            bfs_len = self.bfs_len
        else:
            bfs_len = env.get_bfs_length()

        # replay memory to play
        if self.config.use_memory:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size,
                self.config.state_history,
                memory_size=self.config.memory_unit_size)
        else:
            replay_buffer = ReplayBuffer(self.config.buffer_size,
                                         self.config.state_history)
        rewards = []
        steps = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            count = 0
            while True:
                if self.config.render_test: env.render()

                # store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                if self.config.use_memory:
                    prev_memory = replay_buffer.encode_recent_memory()
                    action, bottom_q, top_q, next_memory = self.get_action_with_memory(
                        q_input, prev_memory)
                    next_memory = np.squeeze(next_memory)
                else:
                    action = self.get_action(q_input)

                if i == 0 and self.config.use_memory:
                    with open(self.config.output_path + 'eval_example_log.txt',
                              'a') as of:
                        of.write('State = {}\n'.format(env.cur_state))
                        of.write('Taking action = {}\n'.format(action))
                        of.write('prev_memory = {}\n'.format(
                            prev_memory[0, :6]))
                        of.write('next_memory = {}\n'.format(next_memory[:6]))
                        of.write('bottom_q_values = {}\n'.format(bottom_q))
                        of.write('top_q_values = {}\n'.format(top_q))
                        of.write('\n')

                if save_paths:
                    with open(self.config.output_path + 'path_log.txt',
                              'a') as of:
                        of.write("(s, a) = ({}, {})\n".format(
                            env.cur_state, action))
                        of.write('\n')

                # perform action in env
                new_state, reward, done, info = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)

                if self.config.use_memory:
                    replay_buffer.store_memory(idx, next_memory)

                state = new_state

                count += 1

                # count reward
                total_reward += reward
                if done:
                    if save_paths:
                        with open(self.config.output_path + 'path_log.txt',
                                  'a') as of:
                            of.write('\n')
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)
            if total_reward <= 0:
                steps.append(np.nan)
            else:
                steps.append(count)

        steps = np.array(steps) - bfs_len  # adjust for shortest possible path
        avg_reward = np.mean(rewards)

        avg_length = np.nanmean(steps)
        sigma_length = np.sqrt(np.nanvar(steps) / len(steps))
        percent_completed = np.count_nonzero(~np.isnan(steps)) / float(
            len(steps))
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}, Percent completed: {:04.2f}, Average length: {:04.2f} +/- {:04.2f}, n = {}".format(
                avg_reward, sigma_reward, percent_completed, avg_length,
                sigma_length, len(rewards))
            self.logger.info(msg)

        return avg_reward, percent_completed, avg_length
Code example #27
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        last_frames = deque(maxlen=4)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time

        extractor = PongExtractor()

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            last_frame = state
            last_frames.append(state)
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                feats = extractor.extract(np.squeeze(state))
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)
                embedding = self.sess.run(self.hidden,
                                          feed_dict={self.s: [q_input]})[0]
                action = exp_schedule.get_action(best_action)

                # store q values (use a separate name so the q_values deque
                # defined above is not shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                if t % 100 == 0:
                    # print state.shape
                    # frame = np.zeros(np.squeeze(state).shape)
                    # for f in last_frames:
                    #     frame = frame + np.squeeze(f)
                    # frame = frame / len(last_frames)
                    frame = np.squeeze(state)
                    last_frame = np.squeeze(last_frame)
                    pickle.dump(
                        last_frames,
                        open('frames/embedding/atari{}.p'.format(t), 'wb'))
                    for i in range(4):
                        f = np.squeeze(last_frames[i])
                        scipy.misc.imsave(
                            'frames/embedding/atari{}.png'.format(t - 3 + i),
                            f)

                    # scipy.misc.imsave('frames/atari{}.png'.format(t-1),last_frame)
                    # posfile = open('frames/atari{}.txt'.format(t),'w')
                    # posfile.write('Opp Paddle:\t{}\n'.format(oppY))
                    # posfile.write('Player Paddle:\t{}\n'.format(playerY))
                    # posfile.write('ball x:\t{}\n'.format(ballX))
                    # posfile.write('ball y:\t{}\n'.format(ballY))
                    # posfile.close()
                    np.savetxt('frames/embedding/pong{}.txt'.format(t),
                               feats,
                               fmt='%.2f')

                # perform action in env
                new_state, reward, done, info = self.env.step(action)
                # print "state shape:",state.shape()

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                last_frame = state
                state = new_state
                last_frames.append(state)

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
Code example #28
class QN(object):
    """
    Abstract Class for implementing a Q Network
    """
    def __init__(self, env, config, logger=None, name=None):
        """
        Initialize Q Network and env

        Args:
            config: class with hyperparameters
            logger: logger instance from logging module
        """
        # directory for training outputs
        self.name = name
        self.action_space = 3
        if name is None:
            raise Exception("Must supply network name")
        name = time.strftime("_%m%d_%H%M") + "/" + name

        config.output_path = config.output_path.format(name)
        config.model_output = config.model_output.format(name)
        config.log_path = config.log_path.format(name)
        config.plot_output = config.plot_output.format(name)
        config.record_path = config.record_path.format(name)

        if not os.path.exists(config.output_path):
            os.makedirs(config.output_path)

        # store hyper params
        # Customise the config

        self.config = config
        self.logger = logger
        if logger is None:
            self.logger = get_logger(config.log_path)
        self.env = env

        # build model
        self.build()

    def build(self):
        """
        Build model
        """
        pass

    @property
    def policy(self):
        """
        model.policy(state) = action
        """
        return lambda state: self.get_action(state)

    def save(self):
        """
        Save model parameters

        Args:
            model_path: (string) directory
        """
        pass

    def initialize(self):
        """
        Initialize variables if necessary
        """
        pass

    def get_best_action(self, state):
        """
        Returns best action according to the network
    
        Args:
            state: observation from gym
        Returns:
            tuple: action, q values
        """
        raise NotImplementedError

    def get_action(self, state):
        """
        Returns action with some epsilon strategy

        Args:
            state: observation from gym
        """
        if np.random.random() < self.config.soft_epsilon:
            # soft epsilon-greedy: return a uniformly random action index
            return np.random.randint(self.action_space)
        else:
            return self.get_best_action(state)[0]

    def update_target_params(self):
        """
        Update params of Q' with params of Q
        """
        raise NotImplementedError

    def init_averages(self):
        """
        Defines extra attributes for tensorboard
        """
        self.avg_reward = -21.
        self.max_reward = -21.
        self.std_reward = 0

        self.avg_q = 0
        self.max_q = 0
        self.std_q = 0

        self.eval_reward = -21.

    def update_averages(self, rewards, max_q_values, q_values, scores_eval):
        """
        Update the averages

        Args:
            rewards: deque
            max_q_values: deque
            q_values: deque
            scores_eval: list
        """
        self.avg_reward = np.mean(rewards)
        self.max_reward = np.max(rewards)
        self.std_reward = np.sqrt(np.var(rewards) / len(rewards))

        self.max_q = np.mean(max_q_values)
        self.avg_q = np.mean(q_values)
        self.std_q = np.sqrt(np.var(q_values) / len(q_values))

        if len(scores_eval) > 0:
            self.eval_reward = scores_eval[-1]

    def train(self, exp_schedule, lr_schedule, env=None):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        if env is None:
            env = self.env
        # initialize replay buffer and variables
        rewards = deque(maxlen=self.config.num_episodes_test)
        self.init_averages()
        self.train_init()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1

                if self.config.render_train: env.render()

                action = self.train_step_pre(state, exp_schedule)
                cur_action = actions.trans_single(action)
                # perform action in env
                new_state, reward, done, info = env.step(cur_action)
                self.rewards = reward

                self.replay_buffer.store_effect(self.idx, self.action, reward,
                                                done)
                loss_eval, grad_eval = self.train_step(t, self.replay_buffer,
                                                       lr_schedule.epsilon)
                state = new_state

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, self.max_q_values,
                                         self.q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", np.mean(rewards)),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q",
                                            np.max(self.max_q_values)),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()
                self.save(t)

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)

    def train_init(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        self.replay_buffer = ReplayBuffer(self.config.buffer_size,
                                          self.config.state_history)
        self.max_q_values = deque(maxlen=1000)

    def train_step_pre(self, state, exp_schedule=None):
        self.idx = self.replay_buffer.store_frame(state)
        q_input = self.replay_buffer.encode_recent_observation()

        # choose action according to current Q and exploration
        best_action, q_values = self.get_best_action(q_input)
        if exp_schedule is None:
            self.action = best_action
        else:
            self.action = exp_schedule.get_action(best_action,
                                                  self.action_space)

        # store q values
        self.max_q_values.append(max(q_values))
        self.q_values = list(q_values)
        return self.action

    def train_step_post(self, reward, done, t, lr_schedule, train_model):
        self.replay_buffer.store_effect(self.idx, self.action, reward, done)
        if ((t > self.config.learning_start)
                and (t % self.config.log_freq == 0)
                and (t % self.config.learning_freq == 0)):
            self.update_averages(self.rewards, self.max_q_values,
                                 self.q_values, [0])
        # perform a training step
        if not train_model:
            return 0, 0
        return self.train_step(t, self.replay_buffer, lr_schedule.epsilon)

    def train_step(self, t, replay_buffer, lr):
        """
        Perform training step

        Args:
            t: (int) nth step
            replay_buffer: buffer for sampling
            lr: (float) learning rate
        """
        loss_eval, grad_eval = 0, 0

        # perform training step
        if (t > self.config.learning_start
                and t % self.config.learning_freq == 0):
            loss_eval, grad_eval = self.update_step(t, replay_buffer, lr)

        # occasionally update target network with q network
        if t % self.config.target_update_freq == 0:
            self.update_target_params()

        # occasionally save the weights
        if (t % self.config.saving_freq == 0):
            self.save()

        return loss_eval, grad_eval

    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        # preserve the training replay buffer, since train_init() below
        # re-creates self.replay_buffer for the evaluation rollouts
        has_replay = hasattr(self, 'replay_buffer')
        if has_replay:
            r0 = self.replay_buffer

        # replay memory to play
        rewards = []
        self.train_init()

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            while True:
                if self.config.render_test: env.render()

                action = self.train_step_pre(state)
                cur_action = actions.trans_single(action)
                # perform action in env
                new_state, reward, done, info = env.step(cur_action)
                self.train_step_post(reward, done, 0, None, False)

                # count reward
                total_reward += reward
                if done:
                    break

                state = new_state
            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        if has_replay:
            self.replay_buffer = r0

        return avg_reward

    def record(self):
        """
        Re create an env and record a video for one episode
        """
        env = gym.make(self.config.env_name)
        env = gym.wrappers.Monitor(env,
                                   self.config.record_path,
                                   video_callable=lambda x: True,
                                   resume=True)
        env = MaxAndSkipEnv(env, skip=self.config.skip_frame)
        env = PreproWrapper(env,
                            prepro=greyscale,
                            shape=(80, 80, 1),
                            overwrite_render=self.config.overwrite_render)
        self.evaluate(env, 1)

    def run(self, exp_schedule, lr_schedule):
        """
        Apply procedures of training for a QN

        Args:
            exp_schedule: exploration strategy for epsilon
            lr_schedule: schedule for learning rate
        """
        # initialize
        self.initialize()

        # record one game at the beginning
        if self.config.record:
            self.record()

        # model
        self.train(exp_schedule, lr_schedule)

        # record one game at the end
        if self.config.record:
            self.record()
Code example #29
File: dqn_learn.py  Project: kazizzad/GATS
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for the next Q to be propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = target_Q_values.unsqueeze(1) - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the right gradient to backpropagate
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Code example #30
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    print("running new version")
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of chosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    """ ---------------------------- OUR CODE ---------------------------- """
    Q = q_func(input_arg, num_actions)  # The parameters are random
    Qtag = q_func(input_arg, num_actions)
    if (USE_CUDA):
        Q.cuda()
        Qtag.cuda()
    Qtag.load_state_dict(Q.state_dict())

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)
    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
    """ ------------------------------------------------------------------ """

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    reward = None
    done = None
    info = None
    LOG_EVERY_N_STEPS = 10000

    startTime = time.time()

    for t in count():
        """ Tsuf: ---- Stuff for debigging times for various places --- """
        T1 = 0
        t1Tmp = 0
        T2 = 0
        t2Tmp = 0
        T3 = 0
        t3Tmp = 0
        T4 = 0
        t4Tmp = 0
        T5 = 0
        t5Tmp = 0
        T6 = 0
        t6Tmp = 0
        T7 = 0
        t7Tmp = 0
        T8 = 0
        t8Tmp = 0
        """ ----------------------------------------------------------- """
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        #if (t>1000000):
        #    break

        ### 2. Step the env and store the transition

        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        #     this steps the environment forward one step
        #     obs = env.reset()
        #     this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        """ -------------------------- OUR CODE -------------------------- """

        #store last_obs, and get latest obs's as the input for the n.n
        t1Tmp = time.time()
        cur_idx = replay_buffer.store_frame(last_obs)
        next_input = replay_buffer.encode_recent_observation()
        T1 += time.time() - t1Tmp

        #take random action or use the net
        t2Tmp = time.time()
        action = select_epilson_greedy_action(
            Q, next_input, t)  #the returned action is on the CPU
        T2 += time.time() - t2Tmp

        #see what happens after we take that action
        t3Tmp = time.time()
        last_obs, reward, done, info = env.step(
            action)  #the returned parameters are on the CPU
        T3 += time.time() - t3Tmp

        #     print(t)
        # env.render()
        #store the results on the replay buffer
        replay_buffer.store_effect(cur_idx, action, reward, done)  #on the CPU

        #if the simulation is done, reset the environment
        if (done):
            last_obs = env.reset()
        """ -------------------------------------------------------------- """

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       maskout post terminal status Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            """ ------------------------ OUR CODE ------------------------ """

            #sample a batch of history samples
            t4Tmp = time.time()
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)  #on CPU

            obs_batch = torch.from_numpy(obs_batch).type(
                dtype) / 255.0  # When available, move the samples batch to GPU
            next_obs_batch = torch.from_numpy(next_obs_batch).type(
                dtype) / 255.0  #GPU
            T4 += time.time() - t4Tmp

            #see which Q values the current network gives, for all obs's
            t5Tmp = time.time()
            inter_Qs = Q(
                Variable(obs_batch))  #input is on GPU, output is on GPU
            inter_Qs_chosen = Variable(
                torch.zeros(batch_size).type(dtype))  #GPU
            #take the action that was chosen before
            for i in range(batch_size):
                inter_Qs_chosen[i] = inter_Qs[i, act_batch[i]]
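            # Note (illustrative, not part of the original listing): the Python
            # loop above can be vectorised with gather; assuming act_batch is a
            # NumPy integer array, something like:
            #
            #   act_var = Variable(torch.from_numpy(act_batch).long().unsqueeze(1))
            #   if USE_CUDA:
            #       act_var = act_var.cuda()
            #   inter_Qs_chosen = inter_Qs.gather(1, act_var).squeeze(1)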
            #take only the intermediate (non-terminal) obs's
            inter_idx = np.where(done_mask == False)[0]  #CPU
            inter_next_obs_batch = next_obs_batch[inter_idx, :, :, :]
            T5 += time.time() - t5Tmp

            #see what the "target" (backuped) network says for the intermediate ones
            t6Tmp = time.time()
            inter_next_Qs = Qtag(
                Variable(inter_next_obs_batch,
                         volatile=True)).data.max(1)[0]  #All on GPU
            T6 += time.time() - t6Tmp

            #calculate the bellman errors
            t7Tmp = time.time()
            #for final obs's, the target is just the reward
            targets = torch.from_numpy(rew_batch).type(
                dtype)  #Moved rew_batch to GPU (as 'targets')
            for (i, idx) in enumerate(inter_idx):
                targets[idx] += gamma * inter_next_Qs[i]  #The bellman item
            # errors = -(inter_Qs_chosen.data - targets)**2 #EQUATION COULD BE WRONG!!   [on GPU]
            # for i in range(len(errors)):
            #     if errors[i]<-1:
            #         errors[i] = -1
            #     elif errors[i]>1:
            #         errors[i] = 1
            errors = inter_Qs_chosen.data - targets
            errors = errors.clamp(-1, 1)  # clamp returns a new tensor; keep the result
            T7 += time.time() - t7Tmp

            #train the network! (:
            t8Tmp = time.time()
            optimizer.zero_grad()
            inter_Qs_chosen.backward(
                errors)  #COULD BE WRONG WAY!!    [Everything is on GPU (: ]
            optimizer.step()
            T8 += time.time() - t8Tmp

            num_param_updates += 1
            if (num_param_updates % target_update_freq == 0):
                Qtag.load_state_dict(Q.state_dict())
            """ ---------------------------------------------------------- """

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)
        Statistic["running_times"].append(int(time.time() - startTime))

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            if (PRINT_TIMES):
                print("-----------------------")
                print(T1)
                print(T2)
                print(T3)
                print(T4)
                print(T5)
                print(T6)
                print(T7)
                print(T8)
                print("-----------------------")
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')