Example #1
    def test_observation_zeroing(self):
        """ Tests zeroing out of frames not from current episode """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for terminal_idx in range(5):
            obs_ = []
            obs_next_ = []
            for i in range(1, 6):
                partial_obs = np.ones(obs_shape) * i
                terminal = 1 if i == terminal_idx else 0
                er.append(partial_obs, 0, 0, terminal)

                if i <= terminal_idx:
                    partial_obs *= 0
                if i < 5:
                    obs_.append(partial_obs)
                if i > 1:
                    obs_next_.append(partial_obs)
            obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
            obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

            batch = er.sample(1)
            obs, rewards, actions, obs_next, terminals = batch
            assert np.array_equal(obs_, obs)
            assert np.array_equal(obs_next_, obs_next)
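
The test above assumes a replay buffer that stacks the most recent frames for each sampled index and zeroes out frames recorded before the last episode boundary. The class below is only an illustrative sketch of that contract (the name FrameStackReplay, the default stack size, and the sampling range are assumptions, not the tested implementation); ring-buffer wrap-around is omitted for brevity.

import numpy as np

class FrameStackReplay:
    """Illustrative sketch only, not the ExperienceReplay under test."""

    def __init__(self, capacity, obs_shape, stack=4):
        self.capacity, self.stack = capacity, stack
        self.obs = np.zeros((capacity,) + obs_shape, dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.int64)
        self.count = 0

    def append(self, obs, action, reward, terminal):
        i = self.count % self.capacity
        self.obs[i], self.actions[i] = obs, action
        self.rewards[i], self.terminals[i] = reward, terminal
        self.count += 1

    def _stacked(self, idx):
        # Gather the window of `stack` frames ending at idx.
        out = [self.obs[j] if j >= 0 else np.zeros_like(self.obs[0])
               for j in range(idx - self.stack + 1, idx + 1)]
        # Once an older frame carries a terminal flag, it and everything
        # before it belong to a previous episode, so zero them out.
        for offset in range(1, self.stack):
            j = idx - offset
            if j < 0 or self.terminals[j]:
                for m in range(self.stack - offset):
                    out[m] = np.zeros_like(out[m])
                break
        return np.concatenate(out, axis=-1)[np.newaxis]  # (1, H, W, stack)

    def sample(self, batch_size):
        # Sample indices that have a full frame window and a successor frame.
        high = min(self.count, self.capacity) - 1
        idx = np.random.randint(self.stack - 1, high, size=batch_size)
        obs = np.concatenate([self._stacked(i) for i in idx])
        obs_next = np.concatenate([self._stacked(i + 1) for i in idx])
        return obs, self.rewards[idx], self.actions[idx], obs_next, self.terminals[idx]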
Example #2
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(
                max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None, ))
        self.actions = tf.placeholder(tf.int32, shape=(None, ))
        self.is_weights = tf.placeholder(tf.float32, shape=(
            None, ))  # importance sampling weights for prioritized replay
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph(
        )  # build main network
        self.target_Q_out_op, _, _ = self.build_graph(
            'target')  # build identical target network

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()
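
The snippet above switches between uniform, proportional, and rank-based replay. For the proportional case only, here is a minimal sketch of proportional prioritized sampling (Schaul et al.): priorities are raised to the power alpha to form sampling probabilities, and importance-sampling weights correct for the bias. The class name and method signatures are assumptions for illustration, not the ProportionalReplay API used above.

import numpy as np

class TinyProportionalReplay:
    def __init__(self, capacity, alpha=0.6):
        self.capacity, self.alpha = capacity, alpha
        self.data, self.priorities = [], []

    def add(self, transition, priority=1.0):
        if len(self.data) >= self.capacity:   # drop the oldest entry when full
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(transition)
        self.priorities.append(priority)

    def sample(self, batch_size, beta=0.4):
        scaled = np.asarray(self.priorities) ** self.alpha
        probs = scaled / scaled.sum()                 # P(i) = p_i^alpha / sum_k p_k^alpha
        idx = np.random.choice(len(self.data), size=batch_size, p=probs)
        weights = (len(self.data) * probs[idx]) ** (-beta)
        weights /= weights.max()                      # normalize IS weights to [0, 1]
        return [self.data[i] for i in idx], idx, weights

    def update_priorities(self, idx, td_errors, eps=1e-6):
        for i, err in zip(idx, td_errors):
            self.priorities[i] = abs(err) + eps       # priority follows |TD error|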
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training over which to anneal from prob_random_start to prob_random_end
        self.max_num_episodes = 10000  # Maximum number of episodes allowed for training the game
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts.
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Mean episode reward we want to reach before stopping training.

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()
Example #4
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 saveAndLoad=True):
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model_a = RLModel()
        self.model_a.build((None, AGENT_INPUT_SIZE))

        self.model_b = RLModel()
        self.model_b.build((None, AGENT_INPUT_SIZE))

        self.saveAndLoad = saveAndLoad

        if os.path.isfile(SAVE_PATH_A) and os.path.isfile(
                SAVE_PATH_B) and saveAndLoad:
            print("Loading")
            self.model_a.load_weights(SAVE_PATH_A)
            self.model_b.load_weights(SAVE_PATH_B)

        self.exp_rep_a = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)
        self.exp_rep_b = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(lr=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)
Example #5
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 load_path=None):
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model = RLModel()
        self.model.build((None, AGENT_INPUT_SIZE))
        self.load_path = load_path
        if load_path is not None and os.path.isfile(load_path):
            print("Loading")
            self.model.load_weights(load_path)

        self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(lr=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)
Example #6
    def __init__(self):
        self.experience_replay = ExperienceReplay('BreakoutDeterministic-v0',
                                                  FLAGS.replay_buffer_size, 84,
                                                  84, 4, self.policy,
                                                  FLAGS.decay_to_epoch)

        config = DQNConfig()
        config.learning_rate = FLAGS.learning_rate
        config.gamma = FLAGS.gamma
        config.decay = FLAGS.decay
        config.momentum = FLAGS.momentum
        config.eps = FLAGS.eps
        config.input_width = FLAGS.image_width
        config.input_height = FLAGS.image_height
        config.skip = FLAGS.skip
        self.dqn = DQN(config, FLAGS.use_huber)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        logger.info('initializing variables...')
        self.sess.run(tf.global_variables_initializer())
        self.update_target()

        self.epoch = 0
        self.decay_epsilon()
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceReplay(num_frame_stack,
                                            capacity=experience_capacity,
                                            pic_size=pic_size)
        self.playing_cache = ExperienceReplay(num_frame_stack,
                                              capacity=num_frame_stack * 5 +
                                              10,
                                              pic_size=pic_size)
        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004,
                                                         epsilon=1e-7)
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.state_size = (self.num_frame_stack, ) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.q_values = []
        self.loss_his = []
Example #8
    def test_sampling(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            er.append(partial_obs, 1, 1, 0)

        batch = er.sample(1)
        _, rewards, actions, _, terminals = batch
        assert np.array_equal(rewards, np.array([1]))
        assert np.array_equal(actions, np.array([1]))
        assert np.array_equal(terminals, np.array([0]))
    def __init__(self):

        self.prob_random = 1.0  # Probability to play random action
        self.y = .99  # Discount factor
        self.batch_size = 64  # How many experiences to use for each training step
        self.prob_random_end = .01  # Ending chance of random action
        self.prob_random_decay = .996  # Multiplicative decay factor applied to prob_random
        self.max_episode = 300  # Maximum number of episodes allowed for training the game
        self.expected_goal = 200  # Expected goal

        self.dnn = DNN()
        self.env = gym.make('CartPole-v0')

        self.memory = ExperienceReplay(buffer_size=10000)

        self.metadata = [
        ]  # we will store here info score, at the end of each episode
Example #10
    def __init__(self):
        # gamma is the discount factor of the Q-learning algorithm
        self.gamma = 0.9

        # We use an epsilon-greedy exploration strategy
        self.epsilon = 1
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.01
        
        # Number of epochs (fully played games) used to train the agent
        self.epochs = 500

        # Game to play
        self.game = Game()

        # Number of hidden layer nodes
        self.hidden_layer_nodes = 20

        # Create keras model
        # _________________________________________________________________
        # Layer (type)                 Output Shape              Param #   
        # =================================================================
        # dense_1 (Dense)              (None, 20)                120       
        # _________________________________________________________________
        # dense_2 (Dense)              (None, 20)                420       
        # _________________________________________________________________
        # dense_3 (Dense)              (None, 5)                 105       
        # =================================================================
        # Total params: 645
        # Trainable params: 645
        # Non-trainable params: 0
        # _________________________________________________________________
        self.model = Sequential()
        self.model.add(Dense(self.hidden_layer_nodes, input_dim=self.game.state_size, activation='relu'))
        self.model.add(Dense(self.hidden_layer_nodes, activation='relu'))
        self.model.add(Dense(len(POSSIBLE_ACTIONS), activation='linear'))
        self.model.compile('Adam', loss='mse')

        # Initialize experience replay
        self.experience_replay = ExperienceReplay(size=2000)
        self.batch_size = 20
        self.max_turns = 100
Example #11
    def test_observation_construction(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
            er.append(partial_obs, 0, 0, 0)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
Example #12
    def __init__(self,
                 env,
                 net_update_rate: int = 25,
                 exploration_rate: float = 1.0,
                 exploration_decay: float = 0.00005):
        # set hyper parameters
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.net_updating_rate = net_update_rate

        # set environment
        self.env = env
        self.state_shape = env.get_state_shape()
        self.action_shape = env.get_action_shape()

        # the number of experience per batch for batch learning
        # Experience Replay for batch learning
        self.exp_rep = ExperienceReplay()

        # Deep Q Network
        self.net = None
Example #13
def init():
    train_env = SquigglesEnvironment(num_notes=2)
    evaluation_env = SquigglesEnvironment(num_notes=2)

    train_env = tf_py_environment.TFPyEnvironment(train_env)
    evaluation_env = tf_py_environment.TFPyEnvironment(evaluation_env)

    agent, _ = generic_dqn_agent(train_env)

    experience_replay = ExperienceReplay(agent, train_env, BATCH_SIZE)

    return agent, train_env, evaluation_env, experience_replay
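
The init() above wires an agent, a TFPyEnvironment, and an ExperienceReplay together. As a hypothetical sketch of what such a wrapper might look like on top of TF-Agents primitives (the real class may be organized differently; SketchExperienceReplay, max_length, and the method names are assumptions):

from tf_agents.drivers import dynamic_step_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer

class SketchExperienceReplay:
    def __init__(self, agent, env, batch_size, max_length=10000):
        # Uniform replay buffer sized for the agent's collect trajectories
        self.buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=env.batch_size,
            max_length=max_length)
        # Driver that runs the collect policy and pushes each step into the buffer
        self.driver = dynamic_step_driver.DynamicStepDriver(
            env, agent.collect_policy,
            observers=[self.buffer.add_batch], num_steps=1)
        # Dataset of two-step trajectories for training
        dataset = self.buffer.as_dataset(
            sample_batch_size=batch_size, num_steps=2, num_parallel_calls=3).prefetch(3)
        self.iterator = iter(dataset)

    def collect_step(self):
        self.driver.run()

    def sample_batch(self):
        experience, _ = next(self.iterator)
        return experience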
Example #14
 def __init__(self, FLAGS):
     """
      This class builds the model that implements the deep deterministic
      policy gradient (DDPG) algorithm.
     
     :param FLAGS: TensorFlow flags which contain the values for hyperparameters
     
     """
     
     self.FLAGS=FLAGS
     
     self.env = gym.make('Pendulum-v0')
     self.state_size = len(self.env.observation_space.sample())
     self.num_episodes=1000
     self.batch_size=64
     
     self.exp_replay=ExperienceReplay(50000,1500, FLAGS)
     
     self.action_noise=OrnsteinUhlenbeckActionNoise(self.env,mu= 0.0, sigma=0.2, theta=.15, dt=1e-2, x0=None)
     
     self.actor_target=Actor(scope='target',target_network=None,env=self.env, flags=FLAGS)
     self.actor=Actor(scope='actor',target_network=self.actor_target,env=self.env, flags=FLAGS)
     
     self.critic_target=Critic(scope='target',target_network=None,env=self.env, flags=FLAGS)
     self.critic=Critic(scope='critic',target_network=self.critic_target,env=self.env, flags=FLAGS)
     
     init = tf.global_variables_initializer()
     self.session = tf.InteractiveSession()
     self.session.run(init)
     
     self.critic.set_session(self.session)
     self.actor.set_session(self.session)
     self.actor_target.set_session(self.session)
     self.critic_target.set_session(self.session)
     
     self.critic.init_target_network()
     self.actor.init_target_network()
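
The setup above adds Ornstein-Uhlenbeck noise to the actor's actions for exploration. A minimal sketch of that process, assuming the same parameters as the OrnsteinUhlenbeckActionNoise call (the class below is illustrative, not the one imported above):

import numpy as np

class SimpleOUNoise:
    """x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1)"""

    def __init__(self, size, mu=0.0, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.full(size, x0 if x0 is not None else mu, dtype=np.float64)

    def __call__(self):
        # Mean-reverting drift toward mu plus scaled Gaussian diffusion
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        return self.x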
Example #15
    def __init__(self, s_size, a_size, seed):
        """

        Parameters:
            s_size (int): dimension of each state
            a_size (int): dimension of each action
            seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.seed = random.seed(seed)

        # Initialize both the Q-networks
        self.local_dqn = Model(s_size, a_size, seed).to(device)
        self.target_dqn = Model(s_size, a_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_dqn.parameters(),
                                    lr=c.LEARNING_RATE)

        # Initialize experience deque
        self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE,
                                       c.BATCH_SIZE, seed)

        # Time step counter used for updating as per UPDATE_FREQUENCY
        self.t_step = 0
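
The constructor above passes (a_size, c.REPLAY_BUFFER_SIZE, c.BATCH_SIZE, seed) to ExperienceReplay. A common shape for such a buffer is a deque of named tuples sampled uniformly; the sketch below assumes that design (it is not the project's actual implementation).

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class UniformReplay:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)   # oldest experiences fall off
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        as_tensor = lambda xs: torch.as_tensor(np.asarray(xs), dtype=torch.float32)
        # One stacked tensor per field: states, actions, rewards, next_states, dones.
        # Actions come back as float here for brevity; cast to long before gather().
        return tuple(as_tensor([getattr(e, f) for e in batch])
                     for f in Experience._fields)

    def __len__(self):
        return len(self.memory)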
Example #16
def run_episode(plan_step_fn,
                learner,
                dataset,
                cache_subtree,
                add_returns,
                preproc_obs_fn=None,
                render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    aux_replay = ExperienceReplay(
    )  # New auxiliary buffer to save current episode transitions
    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a,
                                                       cache_subtree,
                                                       render,
                                                       render_size=(512, 512))
        aux_replay.append({
            "observations": prev_root_data["obs"],
            "target_policy": tree_policy
        })
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"],
                                        dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards,
                                  discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    dataset.extend(
        aux_replay
    )  # Add transitions to the buffer that will be used for learning

    return episode_rewards
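
run_episode calls compute_returns(episode_rewards, discount_factor) to turn the reward sequence into discounted returns before adding them as a column. A sketch of that computation, assuming the usual backward accumulation (the real helper may differ):

import numpy as np

def compute_returns_sketch(rewards, discount_factor):
    # Accumulate discounted returns from the end of the episode backwards:
    # G_t = r_t + discount_factor * G_{t+1}
    returns = np.zeros(len(rewards), dtype=np.float32)
    acc = 0.0
    for t in reversed(range(len(rewards))):
        acc = rewards[t] + discount_factor * acc
        returns[t] = acc
    return returns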
Example #17
def main():
    hist_length = 50
    processor = Processor(history_length=hist_length)
    price_history = processor.fetchData()
    train_price_history = price_history['train']
    test_price_history = price_history['test']
    env = Environment(horizon=20,
                      train_price_history=train_price_history,
                      test_price_history=test_price_history,
                      history_length=hist_length)
    exp_replay = ExperienceReplay()
    agent = Agent(feature_size=6,
                  window=hist_length,
                  action_size=3,
                  experience_replay=exp_replay,
                  environment=env)
    agent.train()
    print("Agent done training, now testing: ")
    agent.test(test_price_history)
Example #18
 def __init__(self, game, memory_size = 100000, 
              batch_size = 1, epsilon_init = 1.0, alpha_init = .00025,
              anneal_alpha = True, anneal_epsilon = True, 
              batch_size_incr = 0):
     self.memories = ExperienceReplay(memory_size)
     self.nnet = LeNet(game.state_shape, dim_out = game.n_actions, 
                       batch_size = 1, fc_dim = 500,
                       nkerns = [16,32], filter_dims = [2,2],
                       out_type = 'linear')
     self.trainer = single_batch_trainer(self.nnet)
     self.game = game
     self.n_episodes = 0
     self.avg_rewards = []
     self.avg_action_vals = []
     self.alpha = alpha_init
     self.epsilon = epsilon_init
     self.anneal_ep = anneal_epsilon
     self.anneal_lr = anneal_alpha
     self.batch_size = batch_size
     self.batch_size_incr = batch_size_incr
     self._pct_invalids = []
     self._costs = []
Example #19
class DeepQlearner:
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 load_path=None):
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model = RLModel()
        self.model.build((None, AGENT_INPUT_SIZE))
        self.load_path = load_path
        if load_path is not None and os.path.isfile(load_path):
            print("Loading")
            self.model.load_weights(load_path)

        self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(lr=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)

    def getActions(self, agentInputs):
        rand_action = self.random_action_method.get_random_action()
        if rand_action is not None:
            return [rand_action] * agentInputs.shape[0]
        else:
            pred = self.model.call(agentInputs)
            #print(pred[0])
            return [ACTIONS[x] for x in np.argmax(pred, axis=1)]

    def update(self, oldAgentInputs, actions, newAgentInputs, rewards):
        # Add to the experience replay

        actions = np.array([ACTIONS.index(action) for action in actions])
        #print(["LEFT","RIGHT","JUMP","NONE"][actions[0]],rewards[0])
        self.exp_rep.add_experinces(oldAgentInputs, actions, newAgentInputs,
                                    rewards)

        self.n_since_last_train += oldAgentInputs.shape[0]

        if self.n_since_last_train > TRAIN_RATE:
            loss = self.train_on_random_minibatch()

            self.n_since_last_train = 0

    def train_on_random_minibatch(self):
        input, action, new_input, reward = self.exp_rep.get_random_minibatch(
            BATCH_SIZE)

        loss = self.train_on_batch(input, action, new_input, reward)

        #if self.load_path is not None:
        #    self.save(self.load_path)

        return loss.numpy()

    def train_on_batch(self, agent_input_before, action, agent_input_after,
                       reward):
        q_after = self.model(agent_input_after)
        wanted_q = reward + self.future_discount * tf.reduce_max(q_after,
                                                                 axis=1)
        #wanted_q = reward

        tvars = self.model.trainable_variables

        with tf.GradientTape() as tape:
            pred_q_for_all_actions = self.model(agent_input_before)

            # Index with the correct actions
            action_ind = tf.transpose(
                [tf.range(agent_input_before.shape[0]), action])
            pred_q_for_action = tf.gather_nd(pred_q_for_all_actions,
                                             action_ind)

            loss = self.loss_measure(wanted_q, pred_q_for_action)

            gradients = tape.gradient(loss, tvars)
        self.opt.apply_gradients(zip(gradients, tvars))

        self.latestLoss = loss
        return loss

    def save(self, path=SAVE_PATH):
        self.model.save_weights(path)
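
DeepQlearner only touches the buffer through exp_rep.add_experinces(...) and exp_rep.get_random_minibatch(BATCH_SIZE). Below is a minimal sketch compatible with those calls (assumed layout, not the project's ExperienceReplay; the misspelled method name is kept so the caller above would still work).

import numpy as np

class SketchReplay:
    def __init__(self, size, input_size):
        self.inputs = np.zeros((size, input_size), dtype=np.float32)
        self.actions = np.zeros(size, dtype=np.int32)
        self.new_inputs = np.zeros((size, input_size), dtype=np.float32)
        self.rewards = np.zeros(size, dtype=np.float32)
        self.size = size
        self.count = 0

    def add_experinces(self, old_inputs, actions, new_inputs, rewards):
        # Overwrite the oldest slots once the ring buffer is full
        for o, a, n, r in zip(old_inputs, actions, new_inputs, rewards):
            i = self.count % self.size
            self.inputs[i], self.actions[i] = o, a
            self.new_inputs[i], self.rewards[i] = n, r
            self.count += 1

    def get_random_minibatch(self, batch_size):
        high = min(self.count, self.size)
        idx = np.random.randint(0, high, size=batch_size)
        return (self.inputs[idx], self.actions[idx],
                self.new_inputs[idx], self.rewards[idx])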
class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training over which to anneal from prob_random_start to prob_random_end
        self.max_num_episodes = 10000  # Maximum number of episodes allowed for training the game
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts.
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Mean episode reward we want to reach before stopping training.

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        updated_weights = np.array(self.main_qn.model.get_weights())
        self.target_qn.model.set_weights(updated_weights)

    def choose_action(self, state, prob_random, num_episode):
        if np.random.rand() < prob_random or \
                num_episode < self.min_pre_train_episodes:
            # Act randomly based on prob_random or if we
            # have not accumulated enough pre_train episodes
            action = np.random.randint(self.env.actions)
        else:
            # Decide what action to take from the Q network
            # First add a batch dimension to the state to match the network's expected input shape
            state = np.expand_dims(state, axis=0)
            action = np.argmax(self.main_qn.model.predict(state))
        return action

    def run_one_episode(self, num_episode, prob_random):
        # Create an experience replay for the current episode.
        experiences_episode = []

        # Get the game state from the environment
        state = self.env.reset()

        done = False  # Game is complete
        cur_step = 0  # Running sum of number of steps taken in episode

        while cur_step < self.max_num_step and not done:
            cur_step += 1
            action = self.choose_action(state=state,
                                        prob_random=prob_random,
                                        num_episode=num_episode)

            # Take the action and retrieve the next state, reward and done
            next_state, reward, done = self.env.step(action)

            # Setup the experience to be stored in the episode buffer
            experience = [state, action, reward, next_state, done]

            # Store the experience in the episode buffer
            experiences_episode.append(experience)

            # Update the state
            state = next_state

        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Our predictions (actions to take) from the main Q network
        target_q = self.main_qn.model.predict(train_state)

        # Tells us whether the game is over or not
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.target_qn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(
            self.batch_size)],
                                         axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values *
                                        train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        # Train batch is [[state,action,reward,next_state,done],...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):

        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting complete randomly. As we gain experience and improve,
        # we will begin reducing the probability of acting randomly, and instead
        # take the actions that our Q network suggests
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start -
                            self.prob_random_end) / self.annealing_steps

        # Init variable
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracking training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode,
                                                       prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # If we have played enough episodes, start the training
            if num_episode > self.min_pre_train_episodes:

                # Decrease the probability of a random action until it reaches prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency iteration, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print Info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print(
                    "{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                    .format(dt_string, num_episode,
                            np.mean(rewards[-print_every:]), prob_random,
                            mean_loss))

            # Stop Condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print(
                    "{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                    .format(dt_string, num_episode,
                            np.mean(rewards[-print_every:]), prob_random,
                            mean_loss))
                print("Training complete because we reached goal rewards.")
                break
            if num_episode > self.max_num_episodes:
                print("Training Stop because we reached max num of episodes")
                break
Example #21
class DQN(object):
    """
    OOP for a Deep Q-Network (DQN). 
    """
    def __init__(self, game, memory_size = 100000, 
                 batch_size = 1, epsilon_init = 1.0, alpha_init = .00025,
                 anneal_alpha = True, anneal_epsilon = True, 
                 batch_size_incr = 0):
        self.memories = ExperienceReplay(memory_size)
        self.nnet = LeNet(game.state_shape, dim_out = game.n_actions, 
                          batch_size = 1, fc_dim = 500,
                          nkerns = [16,32], filter_dims = [2,2],
                          out_type = 'linear')
        self.trainer = single_batch_trainer(self.nnet)
        self.game = game
        self.n_episodes = 0
        self.avg_rewards = []
        self.avg_action_vals = []
        self.alpha = alpha_init
        self.epsilon = epsilon_init
        self.anneal_ep = anneal_epsilon
        self.anneal_lr = anneal_alpha
        self.batch_size = batch_size
        self.batch_size_incr = batch_size_incr
        self._pct_invalids = []
        self._costs = []
            
    def train(self, n_episodes = 3, max_iter = 500):
        g = self.game
        g.reset()
        # set anneal rate for epsilon.
        ep_anneal_rate = 0
        if self.anneal_ep:
            ep_anneal_rate = float(self.epsilon)/n_episodes
        alpha_anneal_rate = 0
        if self.anneal_lr:
            alpha_anneal_rate = float(self.alpha)/n_episodes
        for e_idx in range(n_episodes):
            s = g.get_state()
            print "Episode: %d, Exploration Rate: %f, Learning Rate: %f" %(e_idx, self.epsilon, self.alpha)
            while not g.is_terminal() and not self.game._num_moves >= max_iter and not self.game.iter_ctr >= 200:
                # epsilon-greedy action selection below
                if np.random.binomial(1,self.epsilon):
                    a_idx = np.random.randint(self.game.n_actions)
                else:
                    values = self.nnet.outputter(s.reshape(self.nnet.image_shape))
                    a_idx = np.argmax(values[0])
                r = g.take_action(a_idx)
                stp1 = g.get_state()
                # Reshape states into shape expected by convnet. 
                self.memories.insert(Memory(
                    s.transpose(2,0,1).reshape(self.nnet.image_shape), 
                    a_idx, 
                    r, 
                    stp1.transpose(2,0,1).reshape(self.nnet.image_shape)
                ))
                s = stp1

                # TEST CLOOJ
                if self.game.iter_ctr %200 == 0:
                    print "move_n: %d, action: %d, reward: %f, status: %d" %(
                        self.game.iter_ctr, a_idx, r, self.game._STATUS
                    )
                
                # Minibatch update. 
                if e_idx > 0:
                    costs = [] # local for this iter. 
                    data = self.memories.sample(self.batch_size) # random (state, action, reward, nxt_state) sample from memory replay. 
                    data = [m.target_pair(self.nnet) for m in data] # convert above tuple into training data, label pair. 
                    for i in range(self.batch_size):
                        d = data[i]
                        costs.append(self.trainer(d[0], d[1], self.alpha)) # call trainer func
                    self._costs.append(np.mean(costs))
#            print "Game %d ends in %d iterations with status %d, reward %d." %(e_idx, self.game.iter_ctr, self.game._STATUS, r)

            # compute percent invalid actions.
            n_moves = g.iter_ctr
            rs = g.episode_rewards
            n_invalid = len(np.where(rs == np.array([-.02 for _ in range(len(rs))]))[0])
            pct_invalid = float(n_invalid)/n_moves
            self._pct_invalids.append(pct_invalid)
            print "Pct Invalid: %f" %pct_invalid
            g.reset()
            self.epsilon -= ep_anneal_rate
            self.batch_size += self.batch_size_incr
            if e_idx > 0:
                self.alpha -= alpha_anneal_rate
Example #22
    def __init__(self,
            env,
            obs_size = (115,),
            num_frame_stack = 1,
            batch_size = 32,
            mdp_gamma = 0.95,
            initial_epsilon = 1.0,
            min_epsilon = 0.1,
            epsilon_decay_steps = int(1e6),
            replay_capacity = int(1e5),
            min_replay_size = int(1e3),
            train_freq = 4,
            network_update_freq = 5000,
            regularization = 1e-6,
            optimizer_params = None,
            render = False):

            """
            Initialization function
            
            param env:                object. a gym-like environment which our RL agent interacts with
            param obs_size:           list. the shape of the observation, e.g. (115,) for a vector observation or (32,32) for an image observation
            param num_frame_stack:    int. number of stacked frames for network input
            param batch_size:         int. batch size
            param mdp_gamma:          float. MDP discount factor
            param initial_epsilon:    float. epsilon parameter of epsilon-greedy policy
            param min_epsilon:        float. minimum epsilon parameter of epsilon-greedy policy
            param epsilon_decay_steps: int. how many steps to decay epsilon 
            param replay_capacity:    int. replay buffer size
            param min_replay_size:    int. minimum replay buffer size
            param train_freq:         int. training frequency
            param network_update_freq: int. network update frequency
            param regularization:     float. regularization coefficient
            param optimizer_params:   dict. optimizer-specific parameters, e.g. learning rate, momentum
            param render:             bool. is render mode on?
            """
            
            # experience replay buffer for training
            self.exp_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=replay_capacity,
                obs_size = obs_size
            )

            # experience replay buffer for playing/testing
            self.play_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=num_frame_stack * 10,
                obs_size = obs_size
            )

            self.env = env
            self.obs_size = obs_size
            self.num_frame_stack = num_frame_stack
            self.batch_size = batch_size
            self.mdp_gamma = mdp_gamma
            self.initial_epsilon = initial_epsilon
            self.min_epsilon = min_epsilon
            self.epsilon_decay_steps = epsilon_decay_steps
            self.replay_capacity = replay_capacity
            self.min_replay_size = min_replay_size
            self.train_freq = train_freq
            self.network_update_freq = network_update_freq
            self.regularization = regularization
            self.render = render

            self.dim_actions = env.action_space.n
            self.dim_state = (num_frame_stack,) + self.obs_size

            if optimizer_params:
                self.optimizer_params = optimizer_params
            else:
                self.optimizer_params = dict(learning_rate = 0.0001, epsilon = 1e-7)

            self.is_training = True
            # epsilon used for playing
            # if 0, means that we just use the Q-network's optimal action without any exploration
            self.playing_epsilon = 0.0
            
            self.session = None
            
            self.global_counter = 0
            self.episode_counter = 0
            self.loss_history = []
Example #23
class DQN:
    """
    DQN implementation. Note that it only supports gym-like environments (i.e. with reset, step, ...).
    """
    def __init__(self,
            env,
            obs_size = (115,),
            num_frame_stack = 1,
            batch_size = 32,
            mdp_gamma = 0.95,
            initial_epsilon = 1.0,
            min_epsilon = 0.1,
            epsilon_decay_steps = int(1e6),
            replay_capacity = int(1e5),
            min_replay_size = int(1e3),
            train_freq = 4,
            network_update_freq = 5000,
            regularization = 1e-6,
            optimizer_params = None,
            render = False):

            """
            Initialization function
            
            param env:                object. a gym-like environment which our RL agent interacts with
            param obs_size:           list. the shape of the observation, e.g. (115,) for a vector observation or (32,32) for an image observation
            param num_frame_stack:    int. number of stacked frames for network input
            param batch_size:         int. batch size
            param mdp_gamma:          float. MDP discount factor
            param initial_epsilon:    float. epsilon parameter of epsilon-greedy policy
            param min_epsilon:        float. minimum epsilon parameter of epsilon-greedy policy
            param epsilon_decay_steps: int. how many steps to decay epsilon 
            param replay_capacity:    int. replay buffer size
            param min_replay_size:    int. minimum replay buffer size
            param train_freq:         int. training frequency
            param network_update_freq: int. network update frequency
            param regularization:     float. regularization coefficient
            param optimizer_params:   dict. optimizer-specific parameters, e.g. learning rate, momentum
            param render:             bool. is render mode on?
            """
            
            # experience replay buffer for training
            self.exp_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=replay_capacity,
                obs_size = obs_size
            )

            # experience replay buffer for playing/testing
            self.play_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=num_frame_stack * 10,
                obs_size = obs_size
            )

            self.env = env
            self.obs_size = obs_size
            self.num_frame_stack = num_frame_stack
            self.batch_size = batch_size
            self.mdp_gamma = mdp_gamma
            self.initial_epsilon = initial_epsilon
            self.min_epsilon = min_epsilon
            self.epsilon_decay_steps = epsilon_decay_steps
            self.replay_capacity = replay_capacity
            self.min_replay_size = min_replay_size
            self.train_freq = train_freq
            self.network_update_freq = network_update_freq
            self.regularization = regularization
            self.render = render

            self.dim_actions = env.action_space.n
            self.dim_state = (num_frame_stack,) + self.obs_size

            if optimizer_params:
                self.optimizer_params = optimizer_params
            else:
                self.optimizer_params = dict(learning_rate = 0.0001, epsilon = 1e-7)

            self.is_training = True
            # epsilon used for playing
            # if 0, means that we just use the Q-network's optimal action without any exploration
            self.playing_epsilon = 0.0
            
            self.session = None
            
            self.global_counter = 0
            self.episode_counter = 0
            self.loss_history = []

    def get_variables(self,scope):
        """
        Get variables according to scope name
        """
        vars_list = []
        for var in tf.global_variables():
            if "%s/" % scope in var.name and "Adam" not in var.name:
                vars_list.append(var)
        return sorted(vars_list, key=lambda x: x.name)
    
    def get_epsilon(self):
        """
        Get current epsilon value.
        Note: with the training process, epsilon is decaying
        """
        if self.is_training == False:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # for simplicity, just use linear decay
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * (1.0 - self.global_counter / float(self.epsilon_decay_steps))
            

    def network(self, input, trainable, use_image = False):
        """
        Implementation of Q(s,a) network
        
        param input:  tensor. [Batch_Size, N_State] or [Batch_Size, Num_stack_frame, H, W]

        """

        regularizer = None
        if trainable:
            regularizer = slim.l2_regularizer(self.regularization)
        
        if not use_image:
            # here use vanilla 4-layer perceptron
            # 1st layer
            net = slim.fully_connected(input, 512, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            # 2nd layer
            net = slim.fully_connected(net, 1024, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            # 3rd layer
            net = slim.fully_connected(net,512, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            # 4th layer
            #net = slim.fully_connected(net, 256, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)

            # output layer
            q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn = None, weights_regularizer = regularizer, trainable = trainable)

        else:
            
            x = tf.transpose(input, [0,2,3,1])

            net = slim.conv2d(x, 8, (7,7),  stride = 3, data_format = "NHWC", activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            net = slim.max_pool2d(net, 2, 2)
            net = slim.conv2d(net, 16, (3,3), stride = 1, data_format = "NHWC", activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            net = slim.max_pool2d(net, 2, 2)
            net = slim.flatten(net)
            net = slim.fully_connected(net, 256, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn = None, weights_regularizer = regularizer, trainable = trainable)
        
        return q_state_action_values

    def sample_random_action(self):
        """
        Randomly sample an action for rollout
        """
        return np.random.choice(self.dim_actions)
    
    
    
    def setup_graph(self, use_image = False, if_soft = True):
        """
        Set up tensorflow computing graph
        """

        # define a bunch of placeholders
        if use_image:
            input_next_state_shape = (self.batch_size, self.num_frame_stack) + self.obs_size
            input_prev_state_shape = (None, self.num_frame_stack) + self.obs_size
        else:
            input_next_state_shape = (self.batch_size, self.obs_size[0])
            input_prev_state_shape = (None, self.obs_size[0])

        self.input_prev_state = tf.placeholder(tf.float32, input_prev_state_shape, name = "input_prev_state")
        self.input_next_state = tf.placeholder(tf.float32, input_next_state_shape, name = "input_next_state")
        self.input_actions = tf.placeholder(tf.int32, self.batch_size, name = "input_actions")
        self.input_reward = tf.placeholder(tf.float32, self.batch_size, name = "input_reward")
        self.is_done = tf.placeholder(tf.int32, self.batch_size, name = "is_done")

        self.optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        """
        Q-learning:
        1. take action a_t according to epsilon-greedy policy
        2. store transition (s_t, a_t, r_t+1, s_t+1) in replay buffer D
        3. sample random mini-batch of transitions (s,a,r,s') from D
        4. compute Q-learning targets w.r.t. old, fixed parameters w-
        5. optimise MSE between Q-network and Q-learning targets

        L(w) = E{s,a,r,s' ~ D} [(r + \gamma \max_a'  Q(s',a',w-) - Q(s,a,w))^2]

        6. use a variant of stochastic gradient descent
        """
        # Note: the following 2 networks need to have the same structure
        # fixed, old parameters Q-network for Q-target estimation
        with tf.variable_scope("target_q"):
            q_target = self.network(self.input_next_state, trainable=False, use_image = use_image)
        
        # trainable, new parameters Q-network for Q-learning
        with tf.variable_scope("update_q"):
            q_estimate = self.network(self.input_prev_state, trainable=True, use_image = use_image)
        # optimal action recovered by newest Q-network
        self.optimal_action = tf.argmax(q_estimate, axis = 1)
        
        not_done = tf.cast(tf.logical_not(tf.cast(self.is_done, "bool")), tf.float32)
        q_target_value = self.input_reward + not_done * self.mdp_gamma * tf.reduce_max(q_target, -1)

        # choose chosen self.input_actions from q_estimate to get values
        # first get indexes
        idx = tf.stack([tf.range(0, self.batch_size), self.input_actions], axis = 1)
        q_estimate_value = tf.gather_nd(q_estimate, idx)

        # MSE loss
        mse_loss = tf.nn.l2_loss(q_estimate_value - q_target_value) / self.batch_size
        # Regularization loss
        regularization_loss = tf.add_n(tf.losses.get_regularization_losses())

        self.loss = mse_loss + regularization_loss
        self.train_op = self.optimizer.minimize(self.loss)

        update_params = self.get_variables("update_q")
        target_params = self.get_variables("target_q")

        assert (len(update_params) == len(target_params))
        # weights copy op
        if if_soft:
            self.assign_op = [tf.assign(tp,0.001 * up + 0.999 * tp) for tp, up in zip(target_params, update_params)]
        else:
            self.assign_op = [tf.assign(tp,up) for tp, up in zip(target_params, update_params)]

    def train(self):
        """
        train step
        """
        # sample one mini-batch to compute mse
        batch = self.exp_buffer.sample_mini_batch(self.batch_size)
        if self.num_frame_stack > 1:
            # suppose use image observation
            feed_dict = {
                self.input_prev_state : batch["prev_state"],
                self.input_next_state : batch["next_state"],
                self.input_actions: batch["actions"],
                self.is_done: batch["done_mask"],
                self.input_reward: batch["reward"]
            }
        else:
            # reduce the axis 1
            feed_dict = {
                self.input_prev_state : batch["prev_state"][:,0,:],
                self.input_next_state : batch["next_state"][:,0,:],
                self.input_actions: batch["actions"],
                self.is_done: batch["done_mask"],
                self.input_reward: batch["reward"]
            }

        _, loss = self.session.run([self.train_op, self.loss], feed_dict=feed_dict)
        self.loss_history.append(loss)

        return loss

    def update_target_network(self):
        """
        Update target network
        """
        # no need for feed dicts
        self.session.run(self.assign_op)

    def play_episode(self):
        if self.is_training:
            rb = self.exp_buffer
        else:
            rb = self.play_buffer
        
        # total reward
        sum_reward = 0
        # total loss
        sum_loss = 0
        # steps
        steps_in_episode = 0

        first_obs = self.env.reset()
        rb.new_episode(first_obs)

        while True:
            if np.random.rand() > self.get_epsilon():
                if self.num_frame_stack > 1:
                    action = self.session.run(self.optimal_action, {self.input_prev_state: rb.current_state()[np.newaxis,:]})[0]
                else:
                    action = self.session.run(self.optimal_action, {self.input_prev_state: rb.current_state()})[0]
            else:
                action = self.sample_random_action()
             
            obs, reward, done, info = self.env.step(action)
            if self.render:
                self.env.render()
            else:
                pass
            
            sum_reward += reward
            steps_in_episode += 1

            # add one experience into buffer
            rb.add_experience(obs, action, done, reward)

            if self.is_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                if self.exp_buffer.counter >= self.min_replay_size and self.global_counter % self.train_freq == 0:
                    sum_loss += self.train()
            if done:
                if self.is_training:
                    self.episode_counter += 1
                
                return sum_reward, steps_in_episode, sum_loss / float(steps_in_episode)
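
play_episode and train above rely on four buffer calls: new_episode(first_obs), add_experience(obs, action, done, reward), current_state(), and sample_mini_batch(batch_size) returning a dict with prev_state / next_state / actions / reward / done_mask keys. The sketch below shows one way such a frame-stacking buffer could be laid out (assumed design, not the actual ExperienceReplay; wrap-around and episode boundaries inside a sampled window are ignored for brevity).

import numpy as np

class SketchFrameBuffer:
    def __init__(self, num_frame_stack, capacity, obs_size):
        self.num_frame_stack = num_frame_stack
        self.capacity = capacity
        self.frames = np.zeros((capacity,) + obs_size, dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.done = np.zeros(capacity, dtype=np.int64)
        self.counter = 0
        self._current = None   # indices of the frames that form the current state

    def new_episode(self, first_obs):
        # The initial state repeats the first observation num_frame_stack times
        idx = self.counter % self.capacity
        self.frames[idx] = first_obs
        self.counter += 1
        self._current = [idx] * self.num_frame_stack

    def current_state(self):
        return self.frames[self._current]            # (num_frame_stack,) + obs_size

    def add_experience(self, obs, action, done, reward):
        idx = self.counter % self.capacity
        self.frames[idx] = obs
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.done[idx] = done
        self.counter += 1
        self._current = self._current[1:] + [idx]    # slide the frame window

    def sample_mini_batch(self, batch_size):
        high = min(self.counter, self.capacity) - 1
        idx = np.random.randint(self.num_frame_stack, high, size=batch_size)
        stack = lambda i: self.frames[i - self.num_frame_stack + 1:i + 1]
        return {
            "prev_state": np.stack([stack(i - 1) for i in idx]),
            "next_state": np.stack([stack(i) for i in idx]),
            "actions": self.actions[idx],
            "reward": self.rewards[idx],
            "done_mask": self.done[idx],
        }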
class DQN:
    """ Implementation of deep q learning algorithm """
    def __init__(self):

        self.prob_random = 1.0  # Probability to play random action
        self.y = .99  # Discount factor
        self.batch_size = 64  # How many experiences to use for each training step
        self.prob_random_end = .01  # Ending chance of random action
        self.prob_random_decay = .996  # Multiplicative decay factor applied to prob_random
        self.max_episode = 300  # Maximum number of episodes allowed for training the game
        self.expected_goal = 200  # Expected goal

        self.dnn = DNN()
        self.env = gym.make('CartPole-v0')

        self.memory = ExperienceReplay(buffer_size=10000)

        self.metadata = [
        ]  # we will store here info score, at the end of each episode

    def choose_action(self, state, prob_random):
        if np.random.rand() <= prob_random:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.dnn.model.predict(state))
        return action

    def run_one_step(self, state):
        action = self.choose_action(state, self.prob_random)
        next_state, reward, done, _ = self.env.step(action)
        next_state = np.expand_dims(next_state, axis=0)
        return state, action, reward, next_state, done

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):

        # Our predictions (actions to take) from the main Q network
        target_q = self.dnn.model.predict(train_state)

        # Tells us whether the game is over or not
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.dnn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(
            self.batch_size)],
                                         axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values *
                                        train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):

        batch_data = self.memory.sample(self.batch_size)
        train_state = np.array([i[0] for i in batch_data])
        train_action = np.array([i[1] for i in batch_data])
        train_reward = np.array([i[2] for i in batch_data])
        train_next_state = np.array([i[3] for i in batch_data])
        train_done = np.array([i[4] for i in batch_data])

        # These lines remove the extra batch dimension from the arrays
        train_state = np.squeeze(train_state)
        train_next_state = np.squeeze(train_next_state)

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        loss = self.dnn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        scores = []
        for e in range(self.max_episode):
            # Init New episode
            state = self.env.reset()
            state = np.expand_dims(state, axis=0)
            episode_score = 0
            while True:
                state, action, reward, next_state, done = self.run_one_step(
                    state)
                self.memory.add(
                    experiences=[[state, action, reward, next_state, done]])
                episode_score += reward
                state = next_state
                if len(self.memory.buffer) > self.batch_size:
                    self.train_one_step()
                    if self.prob_random > self.prob_random_end:
                        self.prob_random *= self.prob_random_decay
                if done:
                    now = datetime.now()
                    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                    self.metadata.append(
                        [now, e, episode_score, self.prob_random])
                    print(
                        "{} - episode: {}/{}, score: {:.1f} - prob_random {:.3f}"
                        .format(dt_string, e, self.max_episode, episode_score,
                                self.prob_random))
                    break
            scores.append(episode_score)

            # Average score over the last 10 episodes
            means_last_10_scores = np.mean(scores[-10:])
            if means_last_10_scores >= self.expected_goal:
                print('\n Task Completed! \n')
                break
            print("Average over last 10 episodes: {0:.2f} \n".format(
                means_last_10_scores))
        print("Maximum number of episode played: %d" % self.max_episode)
Ejemplo n.º 25
0
from training_testing import test
# Imports assumed missing from this excerpt (the Keras API used by baseline_model
# below); Catch, ExperienceReplay, train, grid_size and num_actions come from the
# original project.
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

# parameters
epsilon = 0.1  # exploration
max_memory = 500  # Maximum number of experiences we are storing
hidden_size = 100  # Size of the hidden layers
batch_size = 1  # Number of experiences we use for training per batch
epoch = 50


def baseline_model(grid_size, num_actions, hidden_size):
    # setting up the model with Keras
    model = Sequential()
    model.add(
        Dense(hidden_size, input_shape=(grid_size**2, ), activation='relu'))
    model.add(Dense(hidden_size, activation='relu'))
    model.add(Dense(num_actions))
    model.compile(SGD(lr=.1), "mse")
    return model


# Define environment/game
env = Catch()

# Initialize experience replay object
exp_replay = ExperienceReplay(max_memory=max_memory)

model = baseline_model(grid_size, num_actions, hidden_size)
train(env, model, exp_replay, epoch, epsilon, num_actions, batch_size)
test(model)
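

# For reference, a minimal bounded replay buffer of the kind this script relies
# on. This is only a sketch with assumed method names (`remember`, `get_batch`);
# the ExperienceReplay class imported above may expose a different interface.
import random
from collections import deque


class MinimalReplay:
    def __init__(self, max_memory=500):
        # oldest experiences are discarded automatically once max_memory is reached
        self.memory = deque(maxlen=max_memory)

    def remember(self, experience):
        # experience = (state, action, reward, next_state, game_over)
        self.memory.append(experience)

    def get_batch(self, batch_size=1):
        # uniform random sample, never larger than the current buffer
        return random.sample(list(self.memory), min(batch_size, len(self.memory)))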
Ejemplo n.º 26
0
                               downsampling_pix_values=None,
                               atari_frameskip=args.atari_frameskip)
        eval_fn = get_evaluate_fn(env_eval=env_eval,
                                  preproc_obs_fn=preproc_obs_fn,
                                  policy_NN=call_model,
                                  args=args)

    process = psutil.Process()
    memory_usage_fn = lambda: process.memory_info().rss

    stats = Stats(use_tensorboard=args.use_tensorboard, log_path=log_path)
    experience_keys = ["observations", "target_policy"]
    if args.compute_value:
        experience_keys.append("returns")

    experience_replay = ExperienceReplay(keys=experience_keys,
                                         capacity=args.replay_capacity)

    run_episode_fn = get_episode_fn(
        actor=high_level_actor if args.hierarchical else low_level_actor,
        planner=high_level_planner if args.hierarchical else low_level_planner,
        train_fn=train_fn,
        dataset=experience_replay,
        add_returns=args.compute_value,
        stats=stats,
        memory_usage_fn=memory_usage_fn,
        preproc_obs_fn=preproc_obs_fn,
        eval_fn=eval_fn,
        n_actions=env.action_space.n,
        value_scalars_to_distrs=value_scalars_to_distrs,
        value_logits_to_scalars=value_logits_to_scalars,
        args=args)
Ejemplo n.º 27
0
gameDisplay = pygame.display.set_mode(DISPLAY_SHAPE)
pygame.display.set_caption('Bouncing Balls')
pygame.key.set_repeat(1, 1)

env = GameEnvironment(DISPLAY_SHAPE, 1.0 / float(FPS))


def action_vector(a):
    res = np.zeros(9)
    res[int(a)] = 1.0
    return res
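# Example: one-hot encoding of a discrete action index,
# e.g. action_vector(3) -> array([0., 0., 0., 1., 0., 0., 0., 0., 0.])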


# Define Experience Replay
if SAVE_EXPERIENCE:
    er = ExperienceReplay.load(EXP_REPLAY_FILE)
    if er is None:
        er = ExperienceReplay(BUFFER_SIZE)


def gameover(hero_score):

    gameDisplay.fill(WHITE)

    font = pygame.font.SysFont(None, 42)
    text = font.render("GAME OVER", True, BLACK)
    gameDisplay.blit(text, (DISPLAY_SHAPE[0] / 3, DISPLAY_SHAPE[1] / 3))

    pygame.display.update()

    pygame.time.delay(3000)
Ejemplo n.º 28
0
from experience_replay import ExperienceReplay
from logger import Logger

ACTIONS = {0: "UP", 1: "DOWN", 2: "RIGHT", 3: "LEFT"}
NUM_ACTIONS = len(ACTIONS)

NUM_GAMES = 30000
OBSERVE = 1000
MAX_TILE = 2048

epsilon = 0.1
min_epsilon = 1e-2
gamma_epsilon = 0.999
gamma_reward = 0.99

replay = ExperienceReplay(capacity=int(1e6))  # ensure an integer capacity (e.g. if the buffer uses deque(maxlen=...))
logger = Logger()

online = PolicyNetwork(batch_size=32)
target = PolicyNetwork(batch_size=32)


def preprocess(a: np.ndarray) -> np.ndarray:
    # Replace empty (<= 0) cells with 1 so that log2 maps them to 0,
    # then normalise tile values to [0, 1] on a log scale (MAX_TILE -> 1)
    a = np.where(a <= 0, 1, a)
    a = np.log2(a) / np.log2(MAX_TILE)
    return a
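
# Example: with MAX_TILE = 2048, log2(2048) = 11, so
#   preprocess(np.array([[0, 2], [4, 2048]]))
#   -> [[0., 0.0909...], [0.1818..., 1.]]  (empty cells map to 0, the max tile to 1)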


if __name__ == "__main__":

    best_score = 0
Ejemplo n.º 29
0
from __future__ import division, print_function

import gym
import gym_gazebo
import numpy as np
import sys
import os
from ddq_model import Qnet
from experience_replay import ExperienceReplay
from utils import Config

argv = sys.argv[1:]
config = Config(argv)
env = gym.make('GazeboTurtlebotMazeColor-v0')
replay = ExperienceReplay(config.args.output_dir,
                          config.args.replay_buffer_size)
qnet = Qnet(env.num_state, env.num_action)

if config.args.continue_from is not None:
    qnet.load(config.args.continue_from)
    replay.load(config.args.continue_from)

elif config.args.from_pretrain is not None:
    qnet.load(config.args.from_pretrain)

epsilon = config.args.start_epsilon
epsilon_decay = (config.args.start_epsilon -
                 config.args.end_epsilon) / config.args.annealing_steps
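
# A linear schedule like this is typically consumed inside the loop below with
# something like `epsilon = max(config.args.end_epsilon, epsilon - epsilon_decay)`
# after every environment step (an assumption here, since the loop body is
# truncated in this excerpt).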

while True:
    state = env.reset()
Ejemplo n.º 30
0
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Instead of env.step() and env.reset(), we'll use TreeActor helper class, which creates a tree and adds nodes to it
    env = gym.make(env_id)
    observe_fn = observe_pi_iw_dynamic if use_dynamic_feats else observe_pi_iw_BASIC
    actor = TreeActor(env, observe_fn=observe_fn)
    planner = RolloutIW(branching_factor=env.action_space.n, ignore_cached_nodes=True)

    model = Mnih2013(num_logits=env.action_space.n, add_value=False)

    optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate,
                                            rho=rmsprop_decay,
                                            epsilon=rmsprop_epsilon)
    learner = SupervisedPolicy(model, optimizer, regularization_factor=regularization_factor, use_graph=True)
    experience_replay = ExperienceReplay(capacity=replay_capacity)

    def network_policy(node, branching_factor):
        return node.data["probs"]

    # Initialize experience replay: run some steps until we have enough examples to form one batch
    print("Initializing experience replay", flush=True)
    actor.reset()
    while len(experience_replay) < batch_size:
        r, episode_done = planning_step(actor=actor,
                                        planner=planner,
                                        dataset=experience_replay,
                                        policy_fn=network_policy,
                                        tree_budget=tree_budget,
                                        cache_subtree=cache_subtree,
                                        discount_factor=discount_factor)
Ejemplo n.º 31
0
class DQN:
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceReplay(num_frame_stack,
                                            capacity=experience_capacity,
                                            pic_size=pic_size)
        self.playing_cache = ExperienceReplay(num_frame_stack,
                                              capacity=num_frame_stack * 5 +
                                              10,
                                              pic_size=pic_size)
        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004,
                                                         epsilon=1e-7)
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.state_size = (self.num_frame_stack, ) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n
        self.q_values = []
        self.loss_his = []

    @staticmethod
    def process_image(img):
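        # Crop rows 34:194 (the playing field of the frame), downscale by a
        # factor of 2, convert to grayscale, and map pixel values to [-1, 1].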
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    def build_graph(self):
        input_dim_with_batch = (self.batchsize,
                                self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size

        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general,
                                               "prev_state")
        self.input_next_state = tf.placeholder(tf.float32,
                                               input_dim_with_batch,
                                               "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize,
                                           "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize,
                                            "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize,
                                              "done_mask")

        with tf.variable_scope("target"):
            qsa_targets = self.create_network(self.input_next_state,
                                              trainable=False)

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state,
                                                trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)
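        # Bellman target: r + gamma * max_a' Q_target(s', a'); the bootstrap
        # term is zeroed on terminal transitions via the done mask.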
        not_done = tf.cast(
            tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(
            qsa_targets, -1) * self.gamma * not_done + self.input_reward
        self.q_value_mean = tf.reduce_mean(q_target)
        action_slice = tf.stack(
            [tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates,
                                                    action_slice)

        training_loss = tf.nn.l2_loss(
            q_target - q_estimates_for_input_action) / self.batchsize
        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.loss = tf.reduce_mean(reg_loss + training_loss)
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        target_params = self.get_variables("target")

        # Ops that copy the online ("train") weights into the target network
        self.copy_network_ops = [
            tf.assign(target_v, train_v)
            for train_v, target_v in zip(train_params, target_params)
        ]

    def get_variables(self, scope):
        vars = [
            t for t in tf.global_variables()
            if "%s/" % scope in t.name and "Adam" not in t.name
        ]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None

        input_t = tf.transpose(input, [0, 2, 3, 1])
        net = slim.conv2d(input_t,
                          8, (7, 7),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          stride=3,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net,
                          16, (3, 3),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        fc_1 = slim.fully_connected(net,
                                    256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=wr,
                                    trainable=trainable)
        fc_2 = slim.fully_connected(net,
                                    256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=wr,
                                    trainable=trainable)
        value = slim.fully_connected(fc_1,
                                     1,
                                     activation_fn=None,
                                     weights_regularizer=wr,
                                     trainable=trainable)
        advantage = slim.fully_connected(fc_2,
                                         self.dim_actions,
                                         activation_fn=None,
                                         weights_regularizer=wr,
                                         trainable=trainable)
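        # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a));
        # subtracting the mean advantage keeps the value/advantage split identifiable.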
        q_state_action_values = value + (
            advantage - tf.reduce_mean(advantage, axis=1, keepdims=True))

        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon -
                                       self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
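        # fd maps each placeholder to the name of the corresponding field in the
        # sampled batch; fd1 below turns that into the actual feed_dict of values.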

        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        _, action_value, loss = self.session.run(
            [self.train_op, self.q_value_mean, self.loss], fd1)
        self.q_values.append(action_value)
        self.loss_his.append(loss)

    def play_episode(self):
        eh = (self.exp_history if self.do_training else self.playing_cache)
        total_reward = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = self.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(self.best_action, {
                    self.input_prev_state:
                    eh.current_state()[np.newaxis, ...]
                })[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(
                reward, total_reward)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            frames_in_episode += 1

            eh.add_experience(self.process_image(observation), action_idx,
                              done, reward)

            if self.do_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size
                    and self.global_counter % self.train_freq == 0)
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                q_value = np.mean(self.q_values)
                loss = np.mean(self.loss_his)
                self.q_values = []
                self.loss_his = []
                return total_reward, frames_in_episode, q_value, loss

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
Ejemplo n.º 32
0
        plan_step_fn = get_pi_iw_planning_step_fn(
            actor=actor,
            planner=planner,
            policy_fn=network_policy,
            tree_budget=tree_budget,
            discount_factor=discount_factor,
            temp=policy_temp)
        learner = SupervisedPolicy(model,
                                   optimizer,
                                   regularization_factor=regularization_factor,
                                   use_graph=True)

    # Initialize experience replay: run complete episodes until we have at least
    # batch_size and replay_min_transitions transitions
    print("Initializing experience replay", flush=True)
    train_stats = TrainStats()
    experience_replay = ExperienceReplay(capacity=replay_capacity)
    while len(experience_replay) < batch_size or len(
            experience_replay) < replay_min_transitions:
        episode_rewards = run_episode(
            plan_step_fn=plan_step_fn,
            learner=None,
            dataset=experience_replay,
            cache_subtree=cache_subtree,
            add_returns=(args.algorithm == "AlphaZero"),
            preproc_obs_fn=preproc_obs_fn,
            render=args.render)
        train_stats.report(episode_rewards, actor.nodes_generated)

    # Interleave planning and learning steps
    print("\nInterleaving planning and learning steps.", flush=True)
    while actor.nodes_generated < max_simulator_steps: