Code Example #1
import random

# DQNModel, play_out_episode, do_training_step, evaluate_model, and the
# hparams dictionary are assumed to be defined elsewhere in the project.
def train(environment, starting_model_path=None, episodes=15000):
    if starting_model_path:
        policy_model = DQNModel.load(starting_model_path)
        target_model = DQNModel.load(starting_model_path)
        print('loaded model {}'.format(starting_model_path))
    else:
        print('starting model from scratch')
        policy_model = DQNModel()
        target_model = DQNModel()
        target_model.set_weights(policy_model.get_weights())

    print('Begin training...')
    replay_memory = []
    # Start fully exploratory; with multiplicative decay, starting from 0.0
    # would pin epsilon at the minimum and disable exploration.
    epsilon = 1.0

    for episode_i in range(episodes):
        replay_memory += play_out_episode(policy_model, environment, epsilon)
        replay_memory = replay_memory[-hparams['max_mem_size']:]

        epsilon = max(hparams['min_epsilon'], epsilon*hparams['epsilon_decay'])
        if len(replay_memory) >= hparams['min_mem_size']:
            do_training_step(policy_model, target_model, random.sample(replay_memory, hparams['batch_size']))

        if episode_i % hparams['target_model_update_every'] == 0:
            target_model.set_weights(policy_model.get_weights())
        if episode_i % hparams['evaluation_every'] == 0:
            info = evaluate_model(policy_model, environment)
            print('===================== episode {}, epsilon {}'.format(episode_i, epsilon))
            print(info)
            print('======================================')
            policy_model.save('checkpoint-{}'.format(episode_i))
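
The train() function above reads all of its tunables from a module-level hparams dictionary that is not shown. A minimal sketch of that dictionary, with placeholder values (only the key names are taken from the code above):

# Hypothetical values -- only the key names come from train() above.
hparams = {
    'max_mem_size': 50000,             # replay memory capacity
    'min_mem_size': 1000,              # experiences required before training starts
    'batch_size': 64,                  # minibatch size per training step
    'min_epsilon': 0.05,               # floor for the exploration rate
    'epsilon_decay': 0.999,            # multiplicative decay applied per episode
    'target_model_update_every': 100,  # episodes between target-network syncs
    'evaluation_every': 500,           # episodes between evaluations/checkpoints
}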
Code Example #2
    def learn_and_evaluate(self):
        workers_id = []
        batch_size = self.parms['training_episodes'] // self.parms['workers'][0]
        for _ in range(self.parms['workers'][0]):
            workers_id.append(collecting_worker.remote(self.env, self.model_server, self.memory_server, batch_size))

        all_results = []
        if self.parms['do_test']:
            eval_model = DQNModel(len(self.env.reset()), len(ACTION_DICT))
            learn_done, filedir = False, ""
            workers_num = self.parms['workers'][1]
            interval = self.parms['test_interval']//workers_num
            while not learn_done:
                filedir, learn_done = ray.get(self.memory_server.get_evaluate_filedir.remote())
                if not filedir:
                    continue
                eval_model.load(filedir)
                start_time, total_reward = time.time(), 0
                eval_workers = []
                for _ in range(workers_num):
                    eval_workers.append(evaluation_worker_test2.remote(self.env, self.memory_server, eval_model, interval))
                    
                avg_reward = sum(ray.get(eval_workers))/workers_num
                print(filedir, avg_reward, (time.time() - start_time))
                all_results.append(avg_reward)

        return all_results
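
learn_and_evaluate() above pulls its configuration from self.parms. A sketch of the shape that dictionary appears to have, with placeholder values (only the key names and the two-slot 'workers' entry are taken from the code):

# Hypothetical values; key names match the lookups in learn_and_evaluate().
parms = {
    'training_episodes': 10000,  # total episodes, split across collecting workers
    'workers': (4, 2),           # (collecting workers, evaluation workers)
    'do_test': True,             # whether to evaluate checkpoints while training
    'test_interval': 50,         # evaluation trials, split across evaluation workers
}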
Code Example #3
def evaluation_worker(env, mem_server, trials):
    eval_model = DQNModel(len(env.reset()), len(ACTION_DICT))
    learn_done, filedir = False, ""
    while not learn_done:
        filedir, learn_done = ray.get(mem_server.get_evaluate_filedir.remote())
        if not filedir:
            continue
        eval_model.load(filedir)
        start_time, total_reward = time.time(), 0
        for _ in range(trials):
            state, done, steps = env.reset(), False, 0
            while steps < env._max_episode_steps and not done:
                steps += 1
                state, reward, done, _ = env.step(eval_model.predict(state))
                total_reward += reward
        mem_server.add_results.remote(total_reward / trials)
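
evaluation_worker above (like the collecting and evaluation workers in example #2) is meant to run as a Ray task against a shared memory-server actor. A hypothetical launch sketch, assuming env and memory_server are the environment and memory-server handle from example #2:

import ray

ray.init()

# Hypothetical: wrap the plain function so Ray can schedule it as a task.
remote_eval_worker = ray.remote(evaluation_worker)

# Launch several evaluators in parallel; each one polls the memory server
# for new checkpoints and reports average rewards back via add_results.
handles = [remote_eval_worker.remote(env, memory_server, 10)
           for _ in range(4)]
ray.get(handles)  # block until the workers see learn_done and exit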
Code Example #4
class DQN_server():
    def __init__(self, env, hyper_params, action_space):

        #self.env = env
        #self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        # self.episode = 0
        # self.steps = 0
        # self.best_reward = 0
        # self.learning = True
        # self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)


        # memory: Store and sample experience replay.
        # self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini batch size for training model.
            update_steps: The frequence of traning model
            model_replace_freq: The frequence of replacing 'target_model' by 'eval_model'
        """
        self.batch_size = hyper_params['batch_size']
        #self.update_steps = hyper_params['update_steps']
        #self.model_replace_freq = hyper_params['model_replace_freq']

        print("server initialized")

    def replace_target_model(self):
        self.target_model.replace(self.eval_model)

    def eval_model_predict(self, state):
        return self.eval_model.predict(state)

    # This function will be called in the main RL loop to update the neural network model, given a batch of experience:
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-values from the 'eval_model' for (states, actions).
    # 3) Predict the Q-values from the 'target_model' for (next_states), and take the max of each Q-value vector, Q_max.
    # 4) If the transition is terminal (is_terminal == 1), q_target = reward; otherwise q_target = reward + discount factor * Q_max.
    # 5) Call fit() to do the back-propagation for 'eval_model'.
    def update_batch(self, memory):
        current_memory_size = memory.get_current_size()
        if current_memory_size < self.batch_size:
            return

        #print("fetching minibatch from replay memory")
        batch = memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)

        #q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            #print("target_model.predict")
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_max = q_next[batch_index, best_actions]

        # Zero out Q_max for terminal transitions, then form the Bellman target.
        q_max *= (1 - terminal)
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
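
DQN_server.update_batch() above, and the workers in examples #2 and #3, talk to a shared replay/coordination server through a small implied interface: get_current_size(), sample(), add_results(), and get_evaluate_filedir(). A hypothetical stub of that interface, for orientation only (the real implementation is not shown in these examples):

import random
from collections import deque

# Hypothetical stub; only the method names are inferred from the call sites above.
class MemoryServer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # experience replay storage
        self.results = []                      # evaluation results reported by workers
        self.evaluate_filedir = ""             # path of the latest checkpoint to evaluate
        self.learn_done = False                # set when training is finished

    def get_current_size(self):
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Column-wise tuples: (states, actions, rewards, next_states, is_terminal)
        return tuple(zip(*batch))

    def add_results(self, avg_reward):
        self.results.append(avg_reward)

    def get_evaluate_filedir(self):
        return self.evaluate_filedir, self.learn_done

Under the Ray setup in examples #2 and #3, a class like this would be declared as an actor with @ray.remote and called through handles, e.g. ray.get(memory_server.get_evaluate_filedir.remote()).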
Code Example #5
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        # memory: Store and sample experience replay.
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini batch size for training model.
            update_steps: The frequence of traning model
            model_replace_freq: The frequence of replacing 'target_model' by 'eval_model'
        """
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

        print("agent initialized")

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)
        #if(np.random.randint(1000)==4):
        #print("epsilon",epsilon)
        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    # This function will be called in the main RL loop to update the neural network model, given a batch of experience:
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-values from the 'eval_model' for (states, actions).
    # 3) Predict the Q-values from the 'target_model' for (next_states), and take the max of each Q-value vector, Q_max.
    # 4) If the transition is terminal (is_terminal == 1), q_target = reward; otherwise q_target = reward + discount factor * Q_max.
    # 5) Call fit() to do the back-propagation for 'eval_model'.
    def update_batch(self):
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return

        #print("fetching minibatch from replay memory")
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)

        #q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            #print("target_model.predict")
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_max = q_next[batch_index, best_actions]

        # Zero out Q_max for terminal transitions, then form the Bellman target.
        q_max *= (1 - terminal)
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                # Add experience from the explore-exploit policy to memory.
                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, info = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done)

                # update the model every 'update_steps' of experience
                self.update_batch()

                # update the target network (if the target network is being used) every 'model_replace_freq' of experiences
                if self.use_target_model and (self.steps %
                                              self.model_replace_freq == 0):
                    self.target_model.replace(self.eval_model)

                self.steps += 1
                steps += 1
                state = next_state

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        with open(result_file, "a+") as f:
            f.write(str(avg_reward) + "\n")
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
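
A minimal sketch of how DQN_agent might be driven, assuming a gym-style environment; the hyper-parameter values below are placeholders, and only the key names come from DQN_agent.__init__:

# Hypothetical values; the key names match the lookups in DQN_agent.__init__.
hyperparams = {
    'beta': 0.99,                   # discount factor
    'final_epsilon': 0.05,          # exploration floor
    'epsilon_decay_steps': 100000,  # steps over which epsilon decays linearly
    'learning_rate': 0.0003,
    'use_target_model': True,
    'memory_size': 50000,
    'batch_size': 64,
    'update_steps': 10,             # train every 10 environment steps
    'model_replace_freq': 2000,     # sync the target network every 2000 steps
}

agent = DQN_agent(env, hyperparams)
results = agent.learn_and_evaluate(training_episodes=10000, test_interval=2000)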
Code Example #6
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)

        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def update_batch(self):
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return
        # 1) Sample a 'batch_size' batch of experiences from the memory.
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values --- 2) Predict the Q-value from the 'eval_model' based on (states, actions)
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target --- 3) Predict the Q-values from the 'target_model' for (next_states), and take the max of each Q-value vector, Q_max
        if self.use_target_model:
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_next = q_next[batch_index, best_actions]
        q_target = FloatTensor([
            reward[index] if is_terminal[index] else reward[index] +
            self.beta * q_next[index] for index in range(self.batch_size)
        ])

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:

                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, _ = self.env.step(action)
                # Store history
                self.memory.add(state, action, reward, next_state, done)
                # Update the model
                if self.steps % self.update_steps == 0:
                    self.update_batch()
                # Update the target network if DQN uses it
                if self.use_target_model:
                    if self.steps % self.model_replace_freq == 0:
                        self.target_model.replace(self.eval_model)
                # Update information for the next loop
                state = next_state
                steps += 1
                self.steps += 1

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        with open(result_file, "a+") as f:
            f.write(str(avg_reward) + "\n")
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
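
Examples #2 through #6 all lean on a DQNModel class whose interface is only implied by its call sites: a constructor taking (input_len, output_len, learning_rate), plus predict, predict_batch, fit, replace, save, and load. A hypothetical PyTorch sketch of that interface, illustrative rather than the project's actual implementation:

import torch
import torch.nn as nn

FloatTensor = torch.FloatTensor  # CPU-only alias matching the name used above

# Hypothetical sketch; the method names and signatures are inferred from the
# call sites in the examples above, everything else is illustrative.
class DQNModel:
    def __init__(self, input_len, output_len, learning_rate=0.0003):
        self.net = nn.Sequential(
            nn.Linear(input_len, 128), nn.ReLU(),
            nn.Linear(128, 128), nn.ReLU(),
            nn.Linear(128, output_len),
        )
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()

    def predict(self, state):
        # Greedy action for a single state.
        with torch.no_grad():
            q = self.net(FloatTensor(state))
        return int(q.argmax().item())

    def predict_batch(self, states):
        # Returns (greedy actions, full Q-value matrix) for a batch of states.
        q_values = self.net(FloatTensor(states))
        best_actions = q_values.argmax(dim=1)
        return best_actions, q_values

    def fit(self, q_values, q_target):
        # One gradient step pulling the predicted Q-values toward the targets.
        loss = self.loss_fn(q_values, q_target.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def replace(self, other):
        # Copy weights from 'other' (used to sync the target network).
        self.net.load_state_dict(other.net.state_dict())

    def save(self, path):
        torch.save(self.net.state_dict(), path)

    def load(self, path):
        self.net.load_state_dict(torch.load(path))

Note that examples #1 and #7 use a DQNModel with a different interface (DQNModel.load(path) as a class-level loader, get_weights/set_weights, get_top_action), so this sketch only covers the usage in examples #2 through #6.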
Code Example #7
def print_board(env):
    # Render the 3x3 tic-tac-toe board; empty cells show their move index.
    for row in range(3):
        s = ''
        for col in range(3):
            if env.board[row, col] == 0:
                s += (str(row * 3 + col))
            elif env.board[row, col] == 1:
                s += 'X'
            else:
                s += 'O'
            if col < 2:
                s += '|'
        print(s)


e = TTT()
e.reset()

model = DQNModel.load('awesome2.pb')

is_finished = False
output = None
winner = None
while not is_finished:
    print_board(e)
    user_input = input('\nChoose your move ')
    # Assuming TTT.step follows the gym-style return convention, with the
    # terminal flag at index 2; otherwise is_finished would never be updated.
    output = e.step(int(user_input))
    is_finished = output[2]

    if not is_finished:
        predicted_qs = model.predict(e.board)[0]
        for index, q in enumerate(predicted_qs):
            print('{}: {}'.format(index, q))
        ai_action = model.get_top_action(e.board)
        output = e.step(ai_action)
        is_finished = output[2]