    def learn_and_evaluate(self):
        workers_id = []
        # each collecting worker handles an equal share of the training episodes
        episodes_per_worker = self.parms['training_episodes'] // self.parms['workers'][0]
        for _ in range(self.parms['workers'][0]):
            workers_id.append(collecting_worker.remote(self.env, self.model_server, self.memory_server, episodes_per_worker))

        all_results = []
        if self.parms['do_test']:
            eval_model = DQNModel(len(self.env.reset()), len(ACTION_DICT))
            learn_done, filedir = False, ""
            workers_num = self.parms['workers'][1]
            interval = self.parms['test_interval']//workers_num
            while not learn_done:
                filedir, learn_done = ray.get(self.memory_server.get_evaluate_filedir.remote())
                if not filedir:
                    continue
                eval_model.load(filedir)
                start_time, total_reward = time.time(), 0
                eval_workers = []
                for _ in range(workers_num):
                    eval_workers.append(evaluation_worker_test2.remote(self.env, self.memory_server, eval_model, interval))
                    
                avg_reward = sum(ray.get(eval_workers))/workers_num
                print(filedir, avg_reward, (time.time() - start_time))
                all_results.append(avg_reward)

        return all_results
    def __init__(self, env, hyper_params, memory, action_space):
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.final_epsilon = hyper_params['final_epsilon']
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.beta = hyper_params['beta']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.learning_rate = hyper_params['learning_rate']
        self.training_episodes = hyper_params['training_episodes']
        self.test_interval = hyper_params['test_interval']
        self.memory = memory

        self.episode = 0
        self.steps = 0
        self.result_count = 0
        self.next = 0
        self.batch_num = self.training_episodes // self.test_interval

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)

        self.results = [0] * (self.batch_num + 1)
        self.previous_q_networks = []

        self.collector_done = False
        self.evaluator_done = False
Example #3
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
Example #4
    def __init__(self, env, hyper_params, batch_size, update_steps, memory_size, beta, model_replace_freq,
                 learning_rate, use_target_model=True, memory=Memory_Server, action_space=2,
                 training_episodes=7000, test_interval=50):
        # super().__init__(update_steps, memory_size, model_replace_freq, learning_rate, beta=0.99, batch_size = 32, use_target_model=True)
        self.batch_size = batch_size

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=learning_rate)
        self.target_model = DQNModel(input_len, output_len)
        self.steps = 0
        self.memory = memory
        # self.memory = ReplayBuffer(hyper_params['memory_size'])
        self.prev = 0
        self.next = 0
        self.model_dq = deque()
        self.result = [0] * ((training_episodes // test_interval) + 1)
        self.previous_q_networks = []
        self.result_count = 0
        self.learning_episodes = training_episodes
        self.episode = 0
        self.is_collection_completed = False
        self.evaluator_done = False
        self.batch_num = training_episodes // test_interval
        self.use_target_model = use_target_model
        self.beta = beta
        self.test_interval = test_interval
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
#         memory: Store and sample experience replay.
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini batch size for training model.
            update_steps: The frequence of traning model
            model_replace_freq: The frequence of replacing 'target_model' by 'eval_model'
        """
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

        print("agent initialized")
class Model_Server(object):
    def __init__(self, env, hyper_params, memory_server):
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        self.beta = hyper_params['beta']

        state = env.reset()
        action_space = len(ACTION_DICT)
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory_server = memory_server

    def update_batch(self, batch_size):

        batch = ray.get(self.memory_server.sample.remote(batch_size))

        (states, actions, reward, next_states, is_terminal) = batch

        states = states
        next_states = next_states
        # note: despite the name, this is a non-terminal mask (1 if the episode continues, 0 if it ended)
        terminal = FloatTensor([0 if t else 1 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
            q_next = q_next[batch_index, actions]
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)
            q_next = q_next[batch_index, actions]

        # Bellman target: q_target = reward + beta * max_a Q(next_state, a),
        # with the bootstrap term zeroed out for terminal transitions
        q_max = q_next * terminal
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    def replace_target(self):
        self.target_model.replace(self.eval_model)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)
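# A tiny worked example of the target computation above, under the convention that
# 'terminal' is really a non-terminal mask (1 when the episode continues, 0 when it ends).
# The numbers are made up for illustration and are not part of the original code.
import torch

reward = torch.tensor([1.0, 1.0])          # rewards for two sampled transitions
q_next_max = torch.tensor([10.0, 10.0])    # max_a Q_target(next_state, a) for each transition
non_terminal = torch.tensor([1.0, 0.0])    # first transition continues, second one ends the episode
beta = 0.99

q_target = reward + beta * q_next_max * non_terminal
# -> tensor([10.9000, 1.0000]): the terminal transition's target is just its reward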
def evaluation_worker(env, mem_server, trials):
    eval_model = DQNModel(len(env.reset()), len(ACTION_DICT))
    learn_done, filedir = False, ""
    while not learn_done:
        filedir, learn_done = ray.get(mem_server.get_evaluate_filedir.remote())
        if not filedir:
            continue
        eval_model.load(filedir)
        start_time, total_reward = time.time(), 0
        for _ in range(trials):
            state, done, steps = env.reset(), False, 0
            while steps < env._max_episode_steps and not done:
                steps += 1
                state, reward, done, _ = env.step(eval_model.predict(state))
                total_reward += reward
        mem_server.add_results.remote(total_reward / trials)
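# A minimal, self-contained sketch of the Ray fan-out pattern the workers above rely on:
# launch several remote tasks, then block on ray.get to collect their results. The trivial
# 'rollout' task is a stand-in, not part of the original code.
import ray

@ray.remote
def rollout(trials):
    # stand-in for an evaluation loop; a real worker would run 'trials' episodes
    return float(trials)

ray.init(ignore_reinit_error=True)
refs = [rollout.remote(10) for _ in range(4)]      # schedule 4 workers in parallel
print(sum(ray.get(refs)) / len(refs))              # gather and average, as learn_and_evaluate does above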
Example #8
    def __init__(self, env, memory, action_space=2, test_interval=50):

        self.collector_done = False
        self.evaluator_done = False

        self.env = env
        # self.max_episode_steps = env._max_episode_steps
        self.max_episode_steps = 200

        self.beta = hyperparams_CartPole['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyperparams_CartPole['final_epsilon']
        self.epsilon_decay_steps = hyperparams_CartPole['epsilon_decay_steps']
        self.batch_size = hyperparams_CartPole['batch_size']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        self.previous_q_models = []
        self.results = [0] * (self.batch_size + 1)
        self.result_count = 0
        self.episode = 0
        self.test_interval = test_interval
        self.memory = memory

        state = env.reset()
        input_len = len(state)
        output_len = action_space

        self.eval_model = DQNModel(input_len, output_len, learning_rate=hyperparams_CartPole['learning_rate'])

        self.use_target_model = hyperparams_CartPole['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        # #         memory: Store and sample experience replay.
        #         self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyperparams_CartPole['batch_size']
        self.update_steps = hyperparams_CartPole['update_steps']
        self.model_replace_freq = hyperparams_CartPole['model_replace_freq']
    def __init__(self, env, hyper_params, memory_server):
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        self.beta = hyper_params['beta']

        state = env.reset()
        action_space = len(ACTION_DICT)
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory_server = memory_server
Example #10
    def __init__(self,
                 hyper_params,
                 memory_server,
                 nb_agents,
                 nb_evaluators,
                 action_space=len(ACTION_DICT)):
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.hyper_params = hyper_params
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.action_space = action_space
        self.batch_size = hyper_params['batch_size']
        self.memory_server = memory_server
        self.nb_agents = nb_agents
        self.nb_evaluators = nb_evaluators
        env = CartPoleEnv()
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)

        self.agents = [
            DQN_agent_remote.remote(CartPoleEnv(), memory_server, hyper_params,
                                    action_space, i) for i in range(nb_agents)
        ]
        self.evaluators = [
            EvalWorker.remote(self.eval_model, CartPoleEnv(),
                              hyper_params['max_episode_steps'],
                              hyper_params['eval_trials'], i)
            for i in range(nb_evaluators)
        ]
Example #11
    def __init__(self,
                 learning_rate,
                 training_episodes,
                 memory,
                 env,
                 test_interval=50,
                 batch_size=32,
                 action_space=len(ACTION_DICT),
                 beta=0.99):

        self.env = env
        #self.max_episode_steps = env._max_episode_steps

        self.batch_num = training_episodes // test_interval
        self.steps = 0

        self.collector_done = False
        self.evaluator_done = False
        self.training_episodes = training_episodes
        self.episode = 0
        #self.esults = []
        self.batch_size = batch_size
        self.previous_q_models = []
        self.results = [0] * (self.batch_num + 1)
        self.result_count = 0
        self.memory = memory
        self.use_target_model = True

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=learning_rate)
        self.target_model = DQNModel(input_len, output_len)

        # NOTE: 'hyper_params' is not a parameter of this __init__; it is assumed to be a
        # module-level dict, as in the other examples in this listing.
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
    def __init__(self, env, hyper_params, memo_server):
        self.memory_server = memo_server
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.training_episodes = hyper_params['training_episodes']
        self.test_interval = hyper_params['test_interval']

        action_space = len(ACTION_DICT)
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.collector_done = False
        self.results = []

        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.replace_target_cnt = 0
        self.epsilon = 1
        self.eval_models_seq = 1
Example #13
    def __init__(self, name):
        """
        :param name: name of the rl_component
        """
        # name of the rl_component
        self.name = name
        # True if the model was set up
        self.is_model_init = False
        # Service for communicating the activations
        self._get_activation_service = rospy.Service(
            name + 'GetActivation', GetActivation,
            self._get_activation_state_callback)
        # choose appropriate model
        self.model = DQNModel(self.name)

        # save the last state
        self.last_state = None
        # the dimensions of the model
        self.number_outputs = -1
        self.number_inputs = -1

        self._unregistered = False
        rospy.on_shutdown(
            self.unregister)  # cleanup hook also for saving the model.
class DQN_server():
    def __init__(self, env, hyper_params, action_space):

        #self.env = env
        #self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        # self.episode = 0
        # self.steps = 0
        # self.best_reward = 0
        # self.learning = True
        # self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)


        # memory: stores and samples experience replay (the replay buffer is passed into update_batch).
        # self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini-batch size for training the model.
            update_steps: How often (in steps) the model is trained.
            model_replace_freq: How often (in steps) 'target_model' is replaced by 'eval_model'.
        """
        self.batch_size = hyper_params['batch_size']
        #self.update_steps = hyper_params['update_steps']
        #self.model_replace_freq = hyper_params['model_replace_freq']

        print("server initialized")

    def replace_target_model(self):
        self.target_model.replace(self.eval_model)

    def eval_model_predict(self, state):
        return self.eval_model.predict(state)

    # This next function is called in the main RL loop to update the neural network model given a batch of experience:
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-values from the 'eval_model' based on (states, actions).
    # 3) Predict the Q-values from the 'target_model' based on (next_states), and take the max of each Q-value vector, Q_max.
    # 4) If is_terminal == 1, q_target = reward; otherwise, q_target = reward + discount factor * Q_max.
    # 5) Call fit() to do the back-propagation for 'eval_model'.
    def update_batch(self, memory):
        current_memory_size = memory.get_current_size()
        if current_memory_size < self.batch_size:
            return

        #print("fetching minibatch from replay memory")
        batch = memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        states = states
        next_states = next_states
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)

        #q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            #print("target_model.predict")
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_max = q_next[batch_index, best_actions]

        terminal = 1 - terminal
        q_max *= terminal
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
Example #15
class DQN_server(object):
    def __init__(self,
                 learning_rate,
                 training_episodes,
                 memory,
                 env,
                 test_interval=50,
                 batch_size=32,
                 action_space=len(ACTION_DICT),
                 beta=0.99):

        self.env = env
        #self.max_episode_steps = env._max_episode_steps

        self.batch_num = training_episodes // test_interval
        self.steps = 0

        self.collector_done = False
        self.evaluator_done = False
        self.training_episodes = training_episodes
        self.episode = 0
        #self.esults = []
        self.batch_size = batch_size
        self.previous_q_models = []
        self.results = [0] * (self.batch_num + 1)
        self.result_count = 0
        self.memory = memory
        self.use_target_model = True

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=learning_rate)
        self.target_model = DQNModel(input_len, output_len)

        # NOTE: 'hyper_params' is not a parameter of this __init__; it is assumed to be a
        # module-level dict, as in the other examples in this listing.
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.beta = beta
        self.test_interval = test_interval

    def get_eval_model(self):
        print(self.episode)
        if self.episode >= self.training_episodes:
            self.collector_done = True

        return self.collector_done

    def add_episode(self):
        self.episode += 1
        return self.episode

    def update_batch(self):
        if self.collector_done:
            return
        if ray.get(self.memory.__len__.remote()
                   ) < self.batch_size or self.steps % self.update_steps != 0:
            return

        batch = ray.get(self.memory.sample.remote(self.batch_size))

        (states, actions, reward, next_states, is_terminal) = batch

        self.steps += self.update_steps
        states = states
        next_states = next_states
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)

        # Compute the Bellman targets: q_target = reward for terminal transitions,
        # otherwise q_target = reward + beta * max_a Q(next_state, a)
        q_targets = [0] * self.batch_size
        for i in range(self.batch_size):
            if terminal[i] == 1:
                q_targets[i] = reward[i]
            else:
                max_value = torch.max(q_next, dim=1).values[i].data
                q_targets[i] = reward[i] + self.beta * max_value

        q_target = FloatTensor(q_targets)

        # update model
        self.eval_model.fit(q_values, q_target)

        if self.episode // self.test_interval + 1 > len(self.previous_q_models):
            model_id = ray.put(self.eval_model)
            self.previous_q_models.append(model_id)
        return self.steps

    # evaluator
    def add_result(self, result, num):
        #print(num)
        self.results[num] = result

    def get_results(self):
        return self.results

    def ask_evaluation(self):
        if len(self.previous_q_models) > self.result_count:
            num = self.result_count
            evaluation_q_model = self.previous_q_models[num]
            self.result_count += 1
            return evaluation_q_model, False, num
        else:
            if self.episode >= self.training_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def replace(self):
        self.target_model.replace(self.eval_model)

    def predict(self, state):
        return self.eval_model.predict(state)
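# A small sketch of the snapshot-sharing idea behind ask_evaluation above: the server
# publishes a copy of the current model into the Ray object store with ray.put, and an
# evaluator later fetches it with ray.get. The dict stands in for a real DQNModel.
import ray

ray.init(ignore_reinit_error=True)
snapshot_ref = ray.put({"weights": [0.1, 0.2, 0.3]})   # server side: publish a snapshot
model_copy = ray.get(snapshot_ref)                     # evaluator side: fetch an immutable copy
print(model_copy["weights"])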
Example #16
class RLComponent(object):
    """
    The RL component as a class; it functions as a bridge between the manager and the RL algorithm.
    It can also be used in a separated node through its service interface.
    """
    def __init__(self, name):
        """
        :param name: name of the rl_component
        """
        # name of the rl_component
        self.name = name
        # True if the model was set up
        self.is_model_init = False
        # Service for communicating the activations
        self._get_activation_service = rospy.Service(
            name + 'GetActivation', GetActivation,
            self._get_activation_state_callback)
        # choose appropriate model
        self.model = DQNModel(self.name)

        # save the last state
        self.last_state = None
        # the dimensions of the model
        self.number_outputs = -1
        self.number_inputs = -1

        self._unregistered = False
        rospy.on_shutdown(
            self.unregister)  # cleanup hook also for saving the model.

    def _get_activation_state_callback(self, request_msg):
        """
        answers the RL activation service and responds with the activations/reinforcements
        :param request_msg: GetActivation 
        :return: Service Response
        """
        input_state = request_msg.input_state
        negative_states = request_msg.negative_states
        try:

            activation_state = self.get_activation_state(
                input_state, negative_states)
            return GetActivationResponse(activation_state)
        except Exception as e:
            rhbplog.logerr(e.message)
            return None

    def get_activation_state(self, input_state, negative_states=None):
        """
        Determine the activation/reinforcement for the given input states, save the state (combined with last
        state for training)
        :param input_state:
        :type input_state: InputState
        :param negative_states:
        :return: ActivationState
        """
        if negative_states is None:
            negative_states = []

        try:
            self.check_if_model_is_valid(input_state.num_inputs,
                                         input_state.num_outputs)

            if input_state.last_action:  # only save state if we have a valid prior action.
                # save current input state
                self.save_state(input_state)
                # update the last state, which would also be the starting point for the negative states
                self.last_state = input_state.input_state
                # save negative states if available
                for state in negative_states:
                    self.save_state(state, is_extra_state=True)
                # update the model
                self.model.train_model()

            # transform the input state and get activation
            transformed_input = numpy.array(input_state.input_state).reshape(
                ([1, len(input_state.input_state)]))
            activations = self.model.feed_forward(transformed_input)
            # return the activation via the service
            activations = activations.tolist()[0]
            activation_state = ActivationState(
                **{
                    "name": self.
                    name,  # this is sent for sanity check and planner status messages only
                    "activations": activations,
                })
            return activation_state
        except Exception as e:
            rhbplog.logerr(e.message)
            return None

    def save_state(self, input_state, is_extra_state=False):
        """
        save the old_state,new_state,action,reward tuple for batch updating of the model
        :param input_state: current state input (positive or negative)
        :type input_state: InputState
        :param is_extra_state: set to True if this is a special extra state (e.g. negative states) that is recorded but
                               has not necessarily been explored/executed
        """
        if self.last_state is None:
            return
        last = numpy.array(self.last_state).reshape(([1,
                                                      len(self.last_state)]))
        new = numpy.array(input_state.input_state).reshape(
            ([1, len(input_state.input_state)]))
        reward_tuple = (last, new, input_state.last_action, input_state.reward)

        self.model.add_sample(tuple=reward_tuple,
                              consider_reward=not is_extra_state)

    def check_if_model_is_valid(self, num_inputs, num_outputs):
        """
        Checks whether the in-/output dimensions match those of the current model.
        If not, a new model is started.
        :param num_inputs: 
        :param num_outputs: 
        :return: 
        """
        if not self.is_model_init:
            self.init_model(num_inputs, num_outputs)
        else:
            if (not self.number_outputs
                    == num_outputs) or (not self.number_inputs == num_inputs):
                self.init_model(num_inputs, num_outputs)

    def init_model(self, num_inputs, num_outputs):
        """
        inits the model with the specified parameters
        :param num_inputs: 
        :param num_outputs: 
        :return: 
        """
        self.number_inputs = num_inputs

        self.number_outputs = num_outputs

        self.last_state = None

        self.model.start_nn(num_inputs, num_outputs)

        self.is_model_init = True

    def unregister(self):
        if not self._unregistered:
            self._unregistered = True
            if self.model:
                self.model.save_model()

    def __del__(self):
        self.unregister()
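# A hedged sketch of how another node might call the 'GetActivation' service registered
# in __init__ above. Only rospy.wait_for_service / rospy.ServiceProxy are standard rospy
# calls; GetActivation comes from the surrounding project, and the request layout
# (an InputState message plus a list of negative states) is inferred from the callback
# above, so treat the details as assumptions.
import rospy

def request_activation(component_name, input_state_msg, negative_states=None):
    # input_state_msg: an already-populated InputState message (the fields this class
    # reads are input_state, num_inputs, num_outputs, last_action and reward)
    rospy.wait_for_service(component_name + 'GetActivation')
    get_activation = rospy.ServiceProxy(component_name + 'GetActivation', GetActivation)
    return get_activation(input_state_msg, negative_states or [])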
Example #17
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)

        if p < epsilon:
            # return action
            return randint(0, self.action_space - 1)
        else:
            # return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def update_batch(self):
        pass

    def learn(self):
        pass
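# update_batch() and learn() above are left as stubs. Below is a hedged sketch of what
# update_batch typically does in the fuller examples of this listing (sample a batch,
# build the Bellman target, fit eval_model). It reuses the torch / FloatTensor helpers
# and the DQNModel / ReplayBuffer interfaces assumed throughout this page; it is written
# as a plain function taking 'self' only for readability.
def update_batch_sketch(self):
    if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
        return
    states, actions, reward, next_states, is_terminal = self.memory.sample(self.batch_size)

    batch_index = torch.arange(self.batch_size, dtype=torch.long)
    _, q_values = self.eval_model.predict_batch(states)
    q_values = q_values[batch_index, actions]                 # Q(s, a) of the taken actions

    model = self.target_model if self.use_target_model else self.eval_model
    best_actions, q_next = model.predict_batch(next_states)
    q_max = q_next[batch_index, best_actions]                 # max_a Q(s', a)

    non_terminal = FloatTensor([0 if t else 1 for t in is_terminal])
    q_target = FloatTensor(reward) + self.beta * q_max * non_terminal
    self.eval_model.fit(q_values, q_target)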
class RLAgent_model_server():
    def __init__(self, env, hyper_params, memo_server):
        self.memory_server = memo_server
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.training_episodes = hyper_params['training_episodes']
        self.test_interval = hyper_params['test_interval']

        action_space = len(ACTION_DICT)
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.collector_done = False
        self.results = []

        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.replace_target_cnt = 0
        self.epsilon = 1
        self.eval_models_seq = 1
        # queue of (model, is_done) pairs consumed by ask_evaluate(); initialized here so
        # the method does not fail before any model is queued
        self.eval_models = []

    def update_batch(self):
        # Get memory sample
        batch = ray.get(self.memory_server.sample.remote(self.batch_size))
        if not batch:
            return
        (states, actions, reward, next_states, is_terminal) = batch

        # Setting torch value
        states = states
        next_states = next_states
        terminal = FloatTensor([0 if t else 1 for t in is_terminal])  # 1 if the episode continues, 0 if it ended
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)
        max_q_next, index = torch.max(q_next, dim=1)
        q_target = reward + self.beta * max_q_next * terminal
        # Update model
        self.eval_model.fit(q_values, q_target)

    def replace_target_model(self):
        if self.use_target_model and self.steps % self.model_replace_freq == 0:
            self.target_model.replace(self.eval_model)

    def evaluate_result(self):
#         print(self.episode, self.training_episodes)
        self.episode += 1
        if self.episode % self.test_interval == 0:
            self.save_model()
#             evaluation_worker_gg.remote(self.env, self.memory_server, self.eval_model, self.test_interval)

    def save_model(self):
        filename = "/best_model{0}.pt".format(self.eval_models_seq)
        self.eval_model.save(result_floder + filename)
        self.memory_server.add_evamodel_dir.remote(result_floder + filename)
        self.eval_models_seq += 1

    def ask_evaluate(self):
        if len(self.eval_models) == 0:
            return None, self.episode >= self.training_episodes

        eval_model, is_done = self.eval_models[0]
        del self.eval_models[0]
        return eval_model, is_done

    def get_collector_done(self):
        return self.episode >= self.training_episodes

    def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        self.epsilon = self.linear_decrease(self.initial_epsilon, 
                                            self.final_epsilon, 
                                            self.steps,
                                            self.epsilon_decay_steps)
        return randint(0, self.action_space - 1) if uniform(0, 1) < self.epsilon else self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def add_results(self, result):
        self.results.append(result)

    def get_results(self):
        return self.results

    def update_and_replace_model(self):
        self.steps += 1
        if self.steps % self.update_steps == 0:
            self.update_batch()
        self.replace_target_model()
Example #19
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)

        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def update_batch(self):
        if len(self.memory
               ) < self.batch_size or self.steps % self.update_steps != 0:
            return
        # 1) Sample a 'batch_size' batch of experiences from the memory.
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        states = states
        next_states = next_states
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values --- 2) Predict the Q-value from the 'eval_model' based on (states, actions)
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target --- 3) Predict the Q-value from the 'target model' based on (next_states), and take max of each Q-value vector, Q_max
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)

        q_next = q_next[batch_index, actions]
        q_target = FloatTensor([
            reward[index] if is_terminal[index] else reward[index] +
            self.beta * q_next[index] for index in range(self.batch_size)
        ])

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:

                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, _ = self.env.step(action)
                # Store history
                self.memory.add(state, action, reward, next_state, done)
                # Update the model
                if self.steps % self.update_steps == 0:
                    self.update_batch()
                # Update the target network if DQN uses it
                if self.use_target_model:
                    if self.steps % self.model_replace_freq == 0:
                        self.target_model.replace(self.eval_model)
                # Update information for the next loop
                state = next_state
                steps += 1
                self.steps += 1

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        f = open(result_file, "a+")
        f.write(str(avg_reward) + "\n")
        f.close()
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
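# A hedged sketch of driving the DQN_agent above end-to-end, assuming the older Gym API
# (env.reset() returns the state, env.step() returns a 4-tuple) that this class expects.
# The hyper-parameter values are illustrative, and the 'result_file' / 'result_floder'
# globals used by evaluate()/save_model() are assumed to be defined elsewhere.
import gym

hyper_params = {
    'beta': 0.99,                  # discount factor
    'learning_rate': 0.0003,
    'final_epsilon': 0.05,
    'epsilon_decay_steps': 100000,
    'use_target_model': True,
    'memory_size': 2000,
    'batch_size': 32,
    'update_steps': 10,
    'model_replace_freq': 2000,
}

env = gym.make('CartPole-v0')
agent = DQN_agent(env, hyper_params, action_space=env.action_space.n)
results = agent.learn_and_evaluate(training_episodes=10000, test_interval=50)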
Example #20
def train_main(exp_prefix="",
               fc_units=[128, 64, 64],
               env_list=[],
               num_envs=10,
               num_obstacls_ratio=[0.2, 0.3, 0.3, 0.2],
               n_step=1,
               max_episodes=10000,
               max_steps=120,
               per_num_envs=8,
               replay_buffer_len=400,
               no_replay=False,
               batch_size=64,
               learning_rate=1e-4,
               epsilon_min=0.05,
               epsilon_max=0.10,
               gamma=0.98,
               without_map_info=False,
               save_interval=1000,
               show=False):
    # create envs
    if len(env_list) == 0:
        env_list = create_or_load_envs(num_envs, num_obstacls_ratio)
    # create model
    if without_map_info:
        state_dims = 2 + 1
    else:
        state_dims = 4 * (2 + 2) + 6 + 2 + 2
    act_dims = 5
    model = DQNModel(state_dims=state_dims,
                     act_dims=act_dims,
                     fc_units=fc_units)
    print("create model done")
    # optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    # create replay buffer
    buffer = ReplayBuffer(replay_buffer_len)
    print("create buffer done")

    # construct save path suffix
    weight_dir = os.path.join("weights", exp_prefix)
    dir_util.mkpath(weight_dir)
    log_dir = os.path.join("logs", exp_prefix)
    dir_util.mkpath(log_dir)
    summary_writer = tf.summary.create_file_writer(log_dir)

    # run simulations
    mean_loss_vals = []
    mean_ep_rewards = []
    last_save_ep_idx = 0
    for ep in range(max_episodes // per_num_envs):
        if no_replay:
            buffer.clear()
        num_new_samples = 0
        ep_rewards = []
        # randomly select an env and run rollout
        envs = np.random.choice(env_list, size=(per_num_envs))
        env_indices = np.random.randint(len(env_list), size=(per_num_envs))
        for roll_idx, env_idx in enumerate(env_indices):
            env = env_list[env_idx]
            episode_index = ep * per_num_envs + roll_idx
            epsilon = epsilon_max - (
                epsilon_max - epsilon_min) / max_episodes * episode_index
            ship_state_trace, input_states, action_list, reward_list, done_list, is_random_act_list, qvals = run_one_episodes(
                env, model, epsilon, max_steps, without_map_info)
            # td_errors = (reward_list + qvals[1:] * gamma) - qvals[:-1]
            td_errors = get_n_step_estimated_qvals(reward_list, qvals[1:],
                                                   gamma, n_step) - qvals[:-1]
            buffer.add_items(input_states, action_list, reward_list, done_list,
                             td_errors)
            num_new_samples += len(input_states)
            ep_rewards.append(np.sum(reward_list))
            print(
                "episode {:4d}, env-{:03d}, epsilon: {:4.2f}, episode length: {:3d}, ep_reward: {:8.2f}"
                .format(episode_index, env_idx, epsilon, len(input_states),
                        np.sum(reward_list)))
            tot_ep_reward = np.sum(reward_list)
            avg_ep_reward = np.mean(reward_list)
            with summary_writer.as_default():
                tf.summary.scalar('tot_ep_reward_trn',
                                  tot_ep_reward,
                                  step=episode_index)
                tf.summary.scalar('avg_ep_reward_trn',
                                  avg_ep_reward,
                                  step=episode_index)
            if episode_index % 100 == 0:
                # run an evaluation
                (eval_ship_state_trace, eval_input_states, eval_action_list,
                 eval_reward_list, eval_done_list, eval_is_random_act_list,
                 eval_qval_list) = run_one_episodes(env, model, 0, max_steps,
                                                    without_map_info)
                # log episode reward
                with summary_writer.as_default():
                    eval_tot_ep_reward = np.sum(eval_reward_list)
                    eval_avg_ep_reward = np.mean(eval_reward_list)
                    tf.summary.scalar('tot_ep_reward_evl',
                                      eval_tot_ep_reward,
                                      step=episode_index)
                    tf.summary.scalar('avg_ep_reward_evl',
                                      eval_avg_ep_reward,
                                      step=episode_index)
                # eval the loss
                eval_states_curr = np.array(eval_input_states[:-1])
                eval_states_next = np.array(eval_input_states[1:])
                eval_qvals_next = model(eval_states_next,
                                        training=False).numpy()
                eval_qvals_next_max = np.amax(
                    eval_qvals_next, axis=1) * (1 - np.array(eval_done_list))
                eval_qvals_esti = get_n_step_estimated_qvals(
                    eval_reward_list, eval_qvals_next_max, gamma, n_step)
                # to tensor
                eval_states_curr = tf.convert_to_tensor(
                    eval_states_curr, tf.float32)
                eval_action_list_tf = tf.convert_to_tensor(eval_action_list)
                eval_qvals_esti = tf.convert_to_tensor(eval_qvals_esti,
                                                       tf.float32)
                # eval to get loss
                eval_loss = eval_step_v0(model, eval_states_curr,
                                         eval_action_list_tf,
                                         eval_qvals_esti).numpy()
                with summary_writer.as_default():
                    tf.summary.scalar('loss_evl',
                                      eval_loss,
                                      step=episode_index)
                # draw map and state trace
                env.show(eval_ship_state_trace,
                         np.sum(eval_reward_list),
                         eval_loss,
                         eval_action_list,
                         eval_is_random_act_list,
                         save_path="pictures",
                         prefix=exp_prefix,
                         count=episode_index)
        # run update
        avg_ep_reward = float(np.mean(ep_rewards))
        mean_ep_rewards.append(avg_ep_reward)
        curr_update_loss_vals = []
        if no_replay:
            num_updates = 1
        else:
            num_updates = max(
                1,
                min(num_new_samples, replay_buffer_len) // batch_size)
        for _ in range(num_updates):
            # get qvals of next states
            if no_replay:
                batch_size = max(1, int(num_new_samples *
                                        0.8))  # overwrite batch_size
            states_curr, states_next, actions, rewards, dones = buffer.sample(
                batch_size)
            states_next = tf.convert_to_tensor(states_next, tf.float32)
            qvals_next = model(states_next, training=False).numpy()
            qvals_next = np.amax(qvals_next, axis=1) * (1 - dones)
            qvals_esti = get_n_step_estimated_qvals(rewards, qvals_next, gamma,
                                                    n_step)
            # to tensor
            states_curr = tf.convert_to_tensor(states_curr, tf.float32)
            actions = tf.convert_to_tensor(actions)
            qvals_esti = tf.convert_to_tensor(qvals_esti, tf.float32)
            # do an update
            loss_trn = train_step_v0(model, optimizer, states_curr, actions,
                                     qvals_esti).numpy()
            with summary_writer.as_default():
                tf.summary.scalar('loss_trn', loss_trn, step=episode_index)
            curr_update_loss_vals.append(loss_trn)
            print("episode {:4d}, bs: {:4d}, loss_trn: {:6.2f}".format(
                episode_index, batch_size, loss_trn))
        mean_loss_vals.append(float(np.mean(curr_update_loss_vals)))

        # draw loss
        if ep > 0 and ep % 10 == 0:
            draw_vals(mean_ep_rewards,
                      mean_loss_vals,
                      per_num_envs,
                      exp_prefix=exp_prefix)
            # save to file for further use
            json.dump([mean_loss_vals, mean_ep_rewards],
                      open("logs/{}_logs_info.json".format(exp_prefix), "w"))

        # Save the weights using the `checkpoint_path` format
        if (episode_index - last_save_ep_idx) > save_interval:
            save_path = os.path.join(
                weight_dir, "weights_{:05d}.ckpt".format(episode_index))
            model.save_weights(save_path)
            last_save_ep_idx = episode_index
            print("episode-{}, save weights to: {}".format(
                episode_index, save_path))
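# 'get_n_step_estimated_qvals' is used above but not shown in this listing. A hedged
# sketch of one common definition: an n-step discounted return bootstrapped with the
# next-state Q-values (which, as in the caller above, are already masked for done steps).
# Treat it as an assumption about the missing helper, not the original implementation.
import numpy as np

def n_step_estimated_qvals(rewards, next_qvals, gamma, n_step):
    rewards = np.asarray(rewards, dtype=np.float32)
    next_qvals = np.asarray(next_qvals, dtype=np.float32)   # next_qvals[t] ~ max_a Q(s_{t+1}, a)
    T = len(rewards)
    estimates = np.zeros(T, dtype=np.float32)
    for t in range(T):
        horizon = min(n_step, T - t)
        discounts = gamma ** np.arange(horizon)
        estimates[t] = np.sum(discounts * rewards[t:t + horizon])
        # bootstrap with the Q-value of the state reached after the last reward in the window
        estimates[t] += (gamma ** horizon) * next_qvals[t + horizon - 1]
    return estimates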
Example #21
class DQN_Model_Server():
    def __init__(self, env, hyper_params, batch_size, update_steps, memory_size, beta, model_replace_freq,
                 learning_rate, use_target_model=True, memory=Memory_Server, action_space=2,
                 training_episodes=7000, test_interval=50):
        # super().__init__(update_steps, memory_size, model_replace_freq, learning_rate, beta=0.99, batch_size = 32, use_target_model=True)
        self.batch_size = batch_size

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=learning_rate)
        self.target_model = DQNModel(input_len, output_len)
        self.steps = 0
        self.memory = memory
        # self.memory = ReplayBuffer(hyper_params['memory_size'])
        self.prev = 0
        self.next = 0
        self.model_dq = deque()
        self.result = [0] * ((training_episodes // test_interval) + 1)
        self.previous_q_networks = []
        self.result_count = 0
        self.learning_episodes = training_episodes
        self.episode = 0
        self.is_collection_completed = False
        self.evaluator_done = False
        self.batch_num = training_episodes // test_interval
        self.use_target_model = use_target_model
        self.beta = beta
        self.test_interval = test_interval


    def get_evaluation_model(self):

        if self.episode >= self.learning_episodes:
            self.is_collection_completed = True

        return self.is_collection_completed


    def replace(self):
        self.target_model.replace(self.eval_model)

    def get_total_steps(self):
        return self.steps


    def predict_next(self, state, e_model):
        return e_model.predict(state)

    def get_predict(self, state):
        return self.eval_model.predict(state)

    def set_collect_count(self):
        self.next += 1

    def set_collector_count(self):
        self.episode += 1


    def get_evaluation_count(self):
        return self.result_count

    def get_evaluator_count(self):
        return self.episode

    def ask_evaluation(self):
        if len(self.previous_q_networks) > self.result_count:
            num = self.result_count
            evaluation_q_network = self.previous_q_networks[num]
            self.result_count += 1
            self.episode += self.test_interval
            return evaluation_q_network, False, num
        else:
            if self.episode >= self.learning_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def update_batch(self):

        self.steps += 10

        if ray.get(self.memory.__len__.remote()) < self.batch_size:  # or self.steps % self.update_steps != 0:
            return

        if self.is_collection_completed:
            return

        batch = ray.get(self.memory.sample.remote(self.batch_size))

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q-values for the actions that were actually taken
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Q-values of the next states (target network if enabled)
        if self.use_target_model:
            _, q_next = self.target_model.predict_batch(next_states)
        else:
            _, q_next = self.eval_model.predict_batch(next_states)

        # Bellman targets: reward for terminal transitions, otherwise reward + beta * max_a Q(s', a)
        q_next_max = torch.max(q_next, 1).values
        q_targets = []
        for i in range(len(terminal)):
            if terminal[i] == 1:
                q_targets.append(reward[i])
            else:
                q_targets.append(reward[i] + self.beta * q_next_max[i].data)

        q_target = FloatTensor(q_targets)

        self.eval_model.fit(q_values, q_target)

        if self.episode // self.test_interval + 1 > len(self.previous_q_networks):
            model_id = ray.put(self.eval_model)
            self.previous_q_networks.append(model_id)
        return self.steps

    def set_results(self, result, num):
        self.result[num] = result

    def get_results(self):
        return self.result
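The per-element loop in update_batch above forms a standard Bellman backup; the same targets can also be computed in one vectorized expression. A minimal sketch with dummy tensors (the numbers are illustrative only):

import torch

# Dummy batch of 4 transitions: rewards, terminal flags and next-state Q-values.
reward   = torch.tensor([1.0, 1.0, 0.0, 1.0])
terminal = torch.tensor([0.0, 1.0, 0.0, 0.0])    # 1 where the episode ended
q_next   = torch.tensor([[0.5, 2.0], [3.0, 1.0],
                         [0.2, 0.4], [1.5, 1.0]])
beta = 0.99

# q_target = r                        if terminal
#          = r + beta * max_a Q(s')   otherwise
q_target = reward + beta * (1.0 - terminal) * q_next.max(dim=1).values
print(q_target)    # tensor([2.9800, 1.0000, 0.3960, 2.4850])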
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        # memory: Stores and samples experience replay.
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini-batch size for training the model.
            update_steps: The frequency of training the model.
            model_replace_freq: The frequency of replacing 'target_model' with 'eval_model'.
        """
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

        print("agent initialized")

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)
        #if(np.random.randint(1000)==4):
        #print("epsilon",epsilon)
        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    # This function is called in the main RL loop to update the neural network model given a batch of experience:
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-values from the 'eval_model' based on (states, actions).
    # 3) Predict the Q-values from the 'target_model' based on (next_states), and take the max of each Q-value vector, Q_max.
    # 4) If is_terminal == 1, q_target = reward; otherwise, q_target = reward + discount factor * Q_max.
    # 5) Call fit() to do the back-propagation for 'eval_model'.
    def update_batch(self):
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return

        #print("fetching minibatch from replay memory")
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)

        #q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            #print("target_model.predict")
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_max = q_next[batch_index, best_actions]

        # Zero out Q_max for terminal transitions, then form the Bellman target
        terminal = 1 - terminal
        q_max *= terminal
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                # add experience from the explore-exploit policy to memory
                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, info = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done)

                # update the model every 'update_steps' of experience
                self.update_batch()

                # update the target network (if the target network is being used) every 'model_replace_freq' of experiences
                if self.use_target_model and (self.steps %
                                              self.model_replace_freq == 0):
                    self.target_model.replace(self.eval_model)

                self.steps += 1
                steps += 1
                state = next_state

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        with open(result_file, "a+") as f:
            f.write(str(avg_reward) + "\n")
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
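A minimal usage sketch for the DQN_agent above. The hyper-parameter values are illustrative, and it assumes CartPoleEnv, ACTION_DICT, result_file and result_floder are defined as in the rest of these examples:

hyper_params = {
    'beta': 0.99,                  # discount factor (all values here are illustrative)
    'final_epsilon': 0.1,
    'epsilon_decay_steps': 100000,
    'learning_rate': 0.0003,
    'use_target_model': True,
    'memory_size': 2000,
    'batch_size': 32,
    'update_steps': 10,
    'model_replace_freq': 2000,
}

env = CartPoleEnv()    # assumed to expose _max_episode_steps, which DQN_agent.__init__ reads
agent = DQN_agent(env, hyper_params, action_space=len(ACTION_DICT))
all_results = agent.learn_and_evaluate(training_episodes=10000, test_interval=2000)
print(all_results)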
Example #23
def print_board(env):
    for row in range(3):
        s = ''
        for col in range(3):
            if env.board[row, col] == 0:
                s += (str(row * 3 + col))
            elif env.board[row, col] == 1:
                s += 'X'
            else:
                s += 'O'
            if col < 2:
                s += '|'
        print(s)


e = TTT()
e.reset()

model = DQNModel.load('awesome2.pb')

is_finished = False
output = None
winner = None
while not is_finished:
    print_board(e)
    user_input = input('\nChoose your move ')
    output = e.step(int(user_input))

    if not is_finished:
        predicted_qs = model.predict(e.board)[0]
        for index, q in enumerate(predicted_qs):
            print('{}: {}'.format(index, q))
        ai_action = model.get_top_action(e.board)
        output = e.step(ai_action)
Example #24
class DQN_model_server(object):
    def __init__(self, env, memory, action_space=2, test_interval=50):

        self.collector_done = False
        self.evaluator_done = False

        self.env = env
        # self.max_episode_steps = env._max_episode_steps
        self.max_episode_steps = 200

        self.beta = hyperparams_CartPole['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyperparams_CartPole['final_epsilon']
        self.epsilon_decay_steps = hyperparams_CartPole['epsilon_decay_steps']
        self.batch_size = hyperparams_CartPole['batch_size']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        self.previous_q_models = []
        self.results = [0] * (self.batch_size + 1)
        self.result_count = 0
        self.test_interval = test_interval
        self.memory = memory

        state = env.reset()
        input_len = len(state)
        output_len = action_space

        self.eval_model = DQNModel(input_len, output_len, learning_rate=hyperparams_CartPole['learning_rate'])

        self.use_target_model = hyperparams_CartPole['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        # #         memory: Store and sample experience replay.
        #         self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyperparams_CartPole['batch_size']
        self.update_steps = hyperparams_CartPole['update_steps']
        self.model_replace_freq = hyperparams_CartPole['model_replace_freq']


    def get_steps(self):
        return self.steps

    def update_batch(self):

        # if len(memory) < self.batch_size or self.steps % self.update_steps != 0:
        #     return
        # print(len(self.memory.remote()))
        batch = self.memory.sample.remote(self.batch_size)
        (states, actions, reward, next_states, is_terminal) = ray.get(batch)

        states = states
        next_states = next_states
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, self.q_values = self.eval_model.predict_batch(states)
        self.q_values = self.q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            actions, self.q_next = self.target_model.predict_batch(next_states)
            self.q_next = self.q_next[batch_index, actions]
        else:
            actions, self.q_next = self.eval_model.predict_batch(next_states)
            self.q_next = self.q_next[batch_index, actions]

        # Compute the Bellman targets: reward for terminal transitions, otherwise reward + beta * Q(s', a*)
        self.q_target = []

        for i in range(len(reward)):

            if terminal[i] == 1:
                self.q_target.append(reward[i])
            else:
                self.q_target.append(reward[i] + self.beta * self.q_next[i])
    
        self.q_target = FloatTensor(self.q_target)

        # update model
        self.eval_model.fit(self.q_values, self.q_target)

        if(np.random.randint(100)==4):
            print("==========",self.q_values[0],self.q_target[0])
            # print("..................................................", self.evaluate())

        # score = self.evaluate()
        # f_results = open("./results_8_4.txt", "a+")
        # f_results.write(str(score) + "\n")
        # f_results.close()

        if self.episode // self.test_interval + 1 > len(self.previous_q_models):
            model_id = ray.put(self.eval_model)
            self.previous_q_models.append(model_id)

        self.steps += 10
        return self.steps

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def replace_target(self):

        return self.target_model.replace(self.eval_model)

    # evaluator

    # def add_result(self, result):
    #     self.results[num] = result

    def get_reuslts(self):
        return self.results


    def ask_evaluation(self):
        if len(self.previous_q_models) > self.result_count:
            num = self.result_count
            evaluation_q_model = self.previous_q_models[num]
            self.result_count += 1
            return evaluation_q_model, False, num
        else:
            if self.episode >= training_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def add_episode(self):

        self.episode += 1
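A sketch of how a server like DQN_model_server might be wired up with Ray. The TinyReplayActor below is a hypothetical stand-in for the project's own memory server (only sample(...) and __len__() are exercised here), and the update/replace cadence is illustrative:

import random
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class TinyReplayActor:
    # Hypothetical replay-buffer actor; collector workers would call add.remote(...) on it.
    def __init__(self, capacity=10000):
        self.buffer = []
        self.capacity = capacity

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
        self.buffer = self.buffer[-self.capacity:]

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(list, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

memory = TinyReplayActor.remote()
RemoteModelServer = ray.remote(DQN_model_server)    # wrap the class above as a Ray actor
server = RemoteModelServer.remote(CartPoleEnv(), memory)

batch_size = hyperparams_CartPole['batch_size']
for step in range(1000):
    # Collector workers elsewhere fill `memory`; only update once enough experience exists.
    if ray.get(memory.__len__.remote()) >= batch_size:
        ray.get(server.update_batch.remote())
    if step % 100 == 0:
        ray.get(server.replace_target.remote())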
Example #25
def train(environment, starting_model_path=None, episodes=15000):
    if starting_model_path:
        policy_model = DQNModel.load(starting_model_path)
        target_model = DQNModel.load(starting_model_path)
        print('loaded model {}'.format(starting_model_path))
    else:
        print('starting model from scratch')
        policy_model = DQNModel()
        target_model = DQNModel()
        target_model.set_weights(policy_model.get_weights())

    print('Begin training...')
    replay_memory = []
    epsilon = 0.0

    for episode_i in range(episodes):
        replay_memory += play_out_episode(policy_model, environment, epsilon)
        replay_memory = replay_memory[-hparams['max_mem_size']:]

        epsilon = max(hparams['min_epsilon'], epsilon*hparams['epsilon_decay'])
        if len(replay_memory) >= hparams['min_mem_size']:
            do_training_step(policy_model, target_model, random.sample(replay_memory, hparams['batch_size']))

        if episode_i % hparams['target_model_update_every'] == 0:
            target_model.set_weights(policy_model.get_weights())
        if episode_i % hparams['evaluation_every'] == 0:
            info = evaluate_model(policy_model, environment)
            print('===================== episode {}, epsilon {}'.format(episode_i, epsilon))
            print(info)
            print('======================================')
            policy_model.save('checkpoint-{}'.format(episode_i))
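train() above decays epsilon multiplicatively once per episode (and, since it starts from 0.0, epsilon stays clamped at min_epsilon). The sketch below shows the same geometric schedule with an assumed starting value of 1.0 and illustrative hyper-parameters:

def decayed_epsilon(episode_i, start=1.0, min_epsilon=0.05, epsilon_decay=0.999):
    # Geometric decay, clamped from below.
    return max(min_epsilon, start * epsilon_decay ** episode_i)

for episode_i in (0, 100, 1000, 5000):
    print(episode_i, round(decayed_epsilon(episode_i), 4))
# 0 1.0   100 0.9048   1000 0.3677   5000 0.05 (clamped at min_epsilon)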
class Model_Server():
    def __init__(self, env, hyper_params, memory, action_space):
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.final_epsilon = hyper_params['final_epsilon']
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.beta = hyper_params['beta']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.learning_rate = hyper_params['learning_rate']
        self.training_episodes = hyper_params['training_episodes']
        self.test_interval = hyper_params['test_interval']
        self.memory = memory

        self.episode = 0
        self.steps = 0
        self.result_count = 0
        self.next = 0
        self.batch_num = self.training_episodes // self.test_interval

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)

        self.results = [0] * (self.batch_num + 1)
        self.previous_q_networks = []

        self.collector_done = False
        self.evaluator_done = False

    def ask_evaluation(self):
        if len(self.previous_q_networks) > self.result_count:
            num = self.result_count
            evaluation_q_network = self.previous_q_networks[num]
            self.result_count += 1
            return evaluation_q_network, False, num
        else:
            if self.episode >= self.training_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def get_evaluation_model(self):
        if self.episode >= self.training_episodes:
            self.collector_done = True
        return self.eval_model, self.collector_done

    def replace_with_eval_model(self):
        self.target_model.replace(self.eval_model)

    def get_model_steps(self):
        return self.steps

    def predict_next_eval(self, state, eval_model):
        return eval_model.predict(state)

    def get_predict(self, state):
        return self.eval_model.predict(state)

    def increment_episode(self):
        self.episode += 1

    def increment_model_steps(self):
        self.steps += 1
        return self.steps

    def update_batch(self):

        self.steps += self.update_steps

        if ray.get(self.memory.__len__.remote()) < self.batch_size:  # or self.steps % self.update_steps != 0:
            return

        if self.collector_done:
            return

        batch = ray.get(self.memory.sample.remote(self.batch_size))

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target using the target network
        _, q_next = self.target_model.predict_batch(next_states)

        q_max, _ = torch.max(q_next, dim=1)

        # Compute the Bellman targets: reward for terminal transitions, otherwise reward + beta * q_max
        q_targets = []
        for i, is_term in enumerate(terminal):
            if is_term == 1:
                q_targets.append(reward[i])
            else:
                q_targets.append(reward[i] + self.beta * q_max[i])
        q_targets_tensor = FloatTensor(q_targets)

        # update model
        self.eval_model.fit(q_values, q_targets_tensor)

        if self.episode // self.test_interval + 1 > len(self.previous_q_networks):
            model_id = ray.put(self.eval_model)
            self.previous_q_networks.append(model_id)
        return self.steps

    def add_result(self, reward, num):
        self.results[num] = reward

    def get_results(self):
        return self.results
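A sketch of the evaluator loop that ask_evaluation / add_result above are built for: a Ray task polls the server for the next model snapshot, runs greedy roll-outs, and reports the average reward back. It assumes Model_Server has been wrapped as a Ray actor and that CartPoleEnv follows the gym-style reset/step interface used elsewhere in these examples; the trial count and step limit are illustrative:

import time
import ray

@ray.remote
def evaluation_worker(model_server, trials=30, max_steps=200):
    env = CartPoleEnv()
    while True:
        model_ref, done, num = ray.get(model_server.ask_evaluation.remote())
        if done:
            break
        if num is None:                  # no new snapshot yet; wait and poll again
            time.sleep(1)
            continue
        eval_model = ray.get(model_ref)  # ask_evaluation hands back a ray.put() object ref
        total_reward = 0
        for _ in range(trials):
            state, steps, finished = env.reset(), 0, False
            while steps < max_steps and not finished:
                action = eval_model.predict(state)
                state, reward, finished, _ = env.step(action)
                total_reward += reward
                steps += 1
        model_server.add_result.remote(total_reward / trials, num)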
Example #27
class ModelServer():
    def __init__(self,
                 hyper_params,
                 memory_server,
                 nb_agents,
                 nb_evaluators,
                 action_space=len(ACTION_DICT)):
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.hyper_params = hyper_params
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.action_space = action_space
        self.batch_size = hyper_params['batch_size']
        self.memory_server = memory_server
        self.nb_agents = nb_agents
        self.nb_evaluators = nb_evaluators
        env = CartPoleEnv()
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)

        self.agents = [
            DQN_agent_remote.remote(CartPoleEnv(), memory_server, hyper_params,
                                    action_space, i) for i in range(nb_agents)
        ]
        self.evaluators = [
            EvalWorker.remote(self.eval_model, CartPoleEnv(),
                              hyper_params['max_episode_steps'],
                              hyper_params['eval_trials'], i)
            for i in range(nb_evaluators)
        ]

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def update_batch(self):
        batch = self.memory_server.sample.remote(self.batch_size)
        (states, actions, reward, next_states, is_terminal) = ray.get(batch)
        if len(states) < self.batch_size:
            return
        nonterminal_x_beta = FloatTensor(
            [0 if t else self.beta for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]
        # Calculate target
        actions, q_next = self.target_model.predict_batch(next_states)
        q_targets = reward + nonterminal_x_beta * torch.max(q_next, 1).values
        # update model
        self.eval_model.fit(q_values, q_targets)

    def learn(self, test_interval, epsilon):
        # determine which collectors are idle
        ready_ids, _ = ray.wait(
            [agent.pingback.remote() for agent in self.agents], num_returns=1)
        ready_agents = ray.get(ready_ids)
        # send eval model to idle collectors, initiate collection
        for agent_id in ready_agents:
            self.agents[agent_id].collect.remote(self.eval_model,
                                                 test_interval, epsilon)

    def evaluate(self, all_results):
        # determine which evaluators are idle
        ready_ids, _ = ray.wait(
            [evaluator.pingback.remote() for evaluator in self.evaluators],
            num_returns=1)
        ready_evaluators = ray.get(ready_ids)
        # send eval model to idle evaluators, get results
        for evaluator_id in ready_evaluators:
            avg_reward = ray.get(
                self.evaluators[evaluator_id].evaluate.remote())
            all_results.append(avg_reward)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []
        for i in range(test_number):
            self.steps = i * test_interval
            # Get decreased epsilon
            epsilon = self.linear_decrease(self.initial_epsilon,
                                           self.final_epsilon, self.steps,
                                           self.epsilon_decay_steps)
            # send eval model to collectors, have them collect experience
            self.learn(test_interval, epsilon)
            # sample experience from memory server, perform batch update on eval model
            if self.steps % self.update_steps == 0:
                self.update_batch()
            # replace target model
            if self.steps % self.model_replace_freq == 0:
                self.target_model.replace(self.eval_model)
            # send eval model to evaluators, record results
            self.evaluate(all_results)
        return all_results
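A minimal driver sketch for the ModelServer above. hyperparams_CartPole is assumed to include the max_episode_steps and eval_trials keys read in __init__, DQN_agent_remote and EvalWorker are assumed to be defined elsewhere in the project, the Memory_Server constructor arguments are a guess, and the worker counts are placeholders:

ray.init(ignore_reinit_error=True)

memory_server = Memory_Server.remote()    # assumed to already be a Ray actor class; constructor args are a guess
server = ModelServer(hyperparams_CartPole,
                     memory_server,
                     nb_agents=4,
                     nb_evaluators=2)

results = server.learn_and_evaluate(training_episodes=10000, test_interval=50)
print("evaluation curve:", results)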