def _init_replay_memory(args):
    if MODEL not in args:
        info("Using empty replay memory.")
        return ReplayMemory(DEFAULT_PARAMS[REPLAY_MEMORY_SIZE])

    replay_memory = load(args[MODEL], REPLAY_MEMORY_EXT)
    if replay_memory is None:
        info("Using empty replay memory.")
        return ReplayMemory(DEFAULT_PARAMS[REPLAY_MEMORY_SIZE])
    info("Successfully loaded saved replay memory of model %s." % args[MODEL])
    return replay_memory
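For reference, here is a minimal, self-contained sketch of the push/sample/__len__ interface that most of the ReplayMemory examples on this page rely on. It is an illustrative assumption, not the exact class used by any of the projects below.

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        # a deque silently drops the oldest transition once capacity is reached
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # store one Transition(state, action, next_state, reward)
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        # uniform random minibatch for the learning step
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)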
Example no. 2
    def __init__(self, h, w, num_actions, device):

        self.device = device
        self.num_actions = num_actions
        self.memory = ReplayMemory(Config.CAPACITY)
        self.main_q_network = Net(h, w, num_actions).to(self.device)
        self.target_q_network = Net(h, w, num_actions).to(self.device)
        self.loss = 0.
        self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)
        self.epsilon = 1.0
        print(self.main_q_network)
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])

        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(), self.p['LEARNING_RATE'])

        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except Exception:
            print("No saved data found")
    def __init__(self, args, n_actions):
        self.model = DQN(args.img_width, args.img_height, args.channels, n_actions).to(device)
        self.n_action = n_actions
        self.epsilon_start = args.epsilon
        self.epsilon = args.epsilon
        self.decay_start = args.decay_start
        self.decay_end = args.n_epochs * 0.8
        self.memory = ReplayMemory(args)
        self.batch_size = args.batch_size
        self.actions = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
Example no. 5
    def __init__(self, config, session):
        # build the net
        self.config = config
        self.sess = session
        self.RM = ReplayMemory(config)
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        self.game_state = np.zeros((1, 84, 84, self.config.buff_size),
                                   dtype=np.uint8)
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)

        # if the new agent needs other action modes define a different dict
        self.action_modes = {
            str(config.testing_epsilon) + "_greedy": self.e_greedy_action
        }
        self.default_action_mode = next(iter(self.action_modes))
        self.action_mode = self.default_action_mode
Example no. 6
    def __init__(self, input_size, nb_action, gamma):
        """
        Initialize Deep Q Learning 
        
        @param input_size: input size of Neural Network
        @param nb_action: possible actions
        @param gamma: gamma paramemter of Deep Q-learning equation
        """
        self.gamma = gamma
        self.reward_window = []
        self.model = Network(input_size, nb_action)
        self.memory = ReplayMemory(100000)

        #optimization algorithm
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        #unsqueeze(0) => torch tensor of size 1 x input_size
        self.last_state = torch.Tensor(input_size).unsqueeze(0)
        self.last_action = 0
        self.last_reward = 0
Example no. 7
class Brain:
    def __init__(self, h, w, num_actions, device):

        self.device = device
        self.num_actions = num_actions
        self.memory = ReplayMemory(Config.CAPACITY)
        self.main_q_network = Net(h, w, num_actions).to(self.device)
        self.target_q_network = Net(h, w, num_actions).to(self.device)
        self.loss = 0.
        self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)
        self.epsilon = 1.0
        print(self.main_q_network)

    def replay(self):
        if len(self.memory) < Config.LEARNING_START:
            return

        self.batch, self.state_batch, self.action_batch, self.reward_batch, \
        self.non_final_next_states = self.make_minibatch()

        self.state_batch = self.state_batch.to(self.device)
        self.action_batch = self.action_batch.to(self.device)
        self.reward_batch = self.reward_batch.to(self.device)
        self.non_final_next_states = self.non_final_next_states.to(self.device)

        self.expected_state_action_values = self.get_expected_state_action_values()

        self.update_main_q_network()

    def decide_action(self, state):

        state = state.to(self.device)

        if self.epsilon <= np.random.uniform(0, 1):
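            # exploit: greedy action from the main Q-network (epsilon-greedy policy)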
            self.main_q_network.eval()
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)

        else:
            action = torch.LongTensor([[random.randrange(self.num_actions)]])
        return action

    def make_minibatch(self):

        transitions = self.memory.sample(Config.BATCH_SIZE)

        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                           if s is not None])

        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):

        self.main_q_network.eval()
        self.target_q_network.eval()

        self.state_action_values = self.main_q_network(self.state_batch).gather(
            1, self.action_batch)

        non_final_mask = torch.BoolTensor(tuple(map(lambda s: s is not None, self.batch.next_state)))
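        # Double DQN: the main Q-network picks the greedy next action (a_m below),
        # and the target Q-network evaluates that action for next_state_values.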

        a_m = torch.zeros(Config.BATCH_SIZE).type(torch.LongTensor).to(self.device)

        a_m[non_final_mask] = self.main_q_network(self.non_final_next_states).detach().max(1)[1]

        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        next_state_values = torch.zeros(Config.BATCH_SIZE).to(self.device)

        next_state_values[non_final_mask] = self.target_q_network(
            self.non_final_next_states).gather(1, a_m_non_final_next_states).detach().squeeze()

        expected_state_action_values = self.reward_batch + Config.GAMMA * next_state_values

        return expected_state_action_values

    def update_main_q_network(self):

        self.main_q_network.train()

        self.loss = F.smooth_l1_loss(self.state_action_values,
                                     self.expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def update_target_q_function(self):

        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_epsilon(self):
        if self.epsilon > Config.EPSILON_MIN:
            self.epsilon -= 1 / (Config.NUM_EPISODES - Config.START_TRAIN_EP)

    def model_save(self):
        torch.save(self.main_q_network, 'puckworkd_model.pth')
Example no. 8
def main():
    parser = argparse.ArgumentParser(
        description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(80, 100), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00001,
                        help='learning rate')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time',
                        default=4,
                        type=int,
                        help='Number of steps in RNN')
    parser.add_argument('--num_actions',
                        default=7,
                        type=int,
                        help='Number of actions')
    parser.add_argument('--batch_size',
                        default=64,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_iteration',
                        default=500000,
                        type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.01,
        type=float,
        help='What fraction of num_iteration to run between evaluations')

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                 args.input_shape)
    online_model, online_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'online_model',
                                               create_duel_q_network,
                                               trainable=True)
    target_model, target_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'target_model',
                                               create_duel_q_network,
                                               trainable=False)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory,
                     args.num_actions, args.gamma, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size,
                     args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # saving and loading networks
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # initialize replay buffer
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # start training:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
                sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(
                sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" %
                  (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
                   reward_max, reward_min))
            # train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
            sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" %
              (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
               reward_max, reward_min))
Example no. 9
class Dqn():
    """
    Whole process of Deep Q Learning Algorithm
    
    """
    def __init__(self, input_size, nb_action, gamma):
        """
        Initialize Deep Q Learning 
        
        @param input_size: input size of Neural Network
        @param nb_action: possible actions
        @param gamma: gamma paramemter of Deep Q-learning equation
        """
        self.gamma = gamma
        self.reward_window = []
        self.model = Network(input_size, nb_action)
        self.memory = ReplayMemory(100000)

        #optimization algorithm
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        #unsqueeze(0) => torch tensor of size 1 x input_size
        self.last_state = torch.Tensor(input_size).unsqueeze(0)
        self.last_action = 0
        self.last_reward = 0

    def select_action(self, state):
        """
        Select the next action of the car. 
        Use softmax function to get the best action while exploring different actions
        
        1.Generate Q values for all of possible actions
        2.It generate the probability distribution of all Q values
        3.Choose the action according to the probability distribution of each action
        
        @param state: input state of neural network
        @return: final action to play
        """
        # Convert the Q-values into action probabilities with a softmax.
        # T=100 temperature parameter: a higher T makes the distribution
        # sharper, i.e. more certain about the best action.
        with torch.no_grad():
            probs = F.softmax(self.model(state) * 100, dim=1)

        # Draw one action at random from the probability distribution
        action = probs.multinomial(num_samples=1)

        return action.data[0, 0]

    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        """
        Train the deep neural network
      
        1.Get the output by forward propagation
        2.Get the target
        3.Compare output and target to computer last error
        4.Back propagate the last error to neural network
        5.Use stochastic gradient descent to update the weight 
            according how much contribute the last error
        
        @param batch_state: current state
        @param batch_next_state: next state
        @param batch_reward: reward
        @param batch_action: action
        """
        # Q-values of all possible actions;
        # gather(1, batch_action.unsqueeze(1)) keeps only the action actually taken,
        # and squeeze(1) removes the size-1 dimension to get a plain vector.
        outputs = self.model(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        # Detach so no gradients flow through the next-state Q-values
        next_outputs = self.model(batch_next_state).detach().max(1)[0]

        #Bellman equation
        target = self.gamma * next_outputs + batch_reward

        #Temporal Difference
        td_loss = F.smooth_l1_loss(outputs, target)

        # reset the gradients
        self.optimizer.zero_grad()
        # backpropagate the TD loss
        td_loss.backward(retain_graph=True)
        #Update the weight of neural network
        self.optimizer.step()

    def update(self, reward, new_signal):
        """
        Update everything once AI reaches to new state
        
        @param reward: last reward
        @param new_signal: last signal
        @return action to play
        """
        new_state = torch.Tensor(new_signal).float().unsqueeze(0)

        self.memory.push((self.last_state, new_state,
                          torch.LongTensor([int(self.last_action)]),
                          torch.Tensor([self.last_reward])))
        action = self.select_action(new_state)

        if len(self.memory.memory) > 100:
            batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(
                100)
            self.learn(batch_state, batch_next_state, batch_reward,
                       batch_action)

        self.last_action = action
        self.last_state = new_state
        self.last_reward = reward

        self.reward_window.append(reward)

        if len(self.reward_window) > 1000:
            del self.reward_window[0]

        return action

    def score(self):
        """
        Get the current score
        
        @return score
        """
        return sum(self.reward_window) / (len(self.reward_window) + 1.)

    def save(self):
        """
        Save the current Neural Network into file
        
        """
        torch.save(
            {
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
            }, 'last_brain.pth')

    def load(self):
        """
        Load the existed Neural Network
        
        """

        if os.path.isfile('last_brain.pth'):
            print("=> loading checkpoint... ")
            checkpoint = torch.load('last_brain.pth')
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("done !")
        else:
            print("no checkpoint found...")
class AgentAtari:
    def __init__(self, p):
        self.p = p

        self.target_cnn = Convolutional()
        self.eval_cnn = Convolutional()

        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4, 84, 84])
        self.optimizer = torch.optim.Adam(self.eval_cnn.parameters(),
                                          self.p['LEARNING_RATE'])

        try:
            self.eval_cnn.load_state_dict(torch.load("Model/eval_cnn4ac.data"))
            self.target_cnn.load_state_dict(
                torch.load("Model/eval_cnn4ac.data"))
            print("Data has been loaded successfully")
        except Exception:
            print("No saved data found")

    def act(self, state):
        r = random.random()

        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state).to(device)
            q_value = self.eval_cnn(x)

            return torch.argmax(q_value).item()
        else:

            action = random.randint(0, self.p['N_ACTIONS'] - 1)

            return action

    def learn(self, losss):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Get the current state dicts of the two networks
        eval_dict = self.eval_cnn.state_dict()
        target_dict = self.target_cnn.state_dict()

        # Soft update (Polyak averaging) of the target DQN parameters
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] + self.p[
                'ALPHA'] * eval_dict[w]
        self.target_cnn.load_state_dict(target_dict)

        # Get a sample of size BATCH
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.pop(
            self.p['BATCH_SIZE'])

        # Decay the exploration threshold used by act() every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()

        # Compute q values for the current evaluation
        q_eval = self.eval_cnn(batch_state).gather(
            1,
            batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])

        # Compute the next state q values
        q_next = self.target_cnn(batch_next_state).detach()

        # Compute the targetted q values
        q_target = batch_reward + q_next.max(1)[0].reshape(
            [self.p["BATCH_SIZE"]]) * self.p["GAMMA"]
        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        losss.append(l)
        l.backward()
        self.optimizer.step()

    def atari(self):
        env = gym.make('BreakoutNoFrameskip-v4')
        env = env.unwrapped
        env = AtariPreprocessing(env,
                                 frame_skip=4,
                                 grayscale_obs=True,
                                 scale_obs=True)
        env = FrameStack(env, 4)

        rewards = []
        losss = []
        print(env.get_action_meanings())
        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            env.step(1)
            actual_life = 5
            for s in range(self.p['N_STEPS']):
                env.render()
                action = self.act(state)

                n_state, reward, done, _ = env.step(action)

                if env.env.ale.lives() != actual_life:
                    reward = -1
                    actual_life -= 1
                    env.step(1)
                rewards[-1] += reward

                self.memory.push(state, action, n_state, reward, done)
                self.learn(losss)
                state = n_state
            print("Episode : ", i, ", Rewards : ", rewards[-1])

        torch.save(self.eval_cnn.state_dict(), "Model/eval_cnn4ac.data")

        # Display result
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.grid()
        plt.show()
        plt.ylabel("loss")
        plt.xlabel("Episode")
        plt.plot(losss)
        plt.grid()
        plt.show()
        env.close()
Example no. 11
def main():
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(84, 84), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00025,
                        help='Training learning rate.')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_process',
                        default=3,
                        type=int,
                        help='Number of parallel environment')
    parser.add_argument('--num_iteration',
                        default=20000000,
                        type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.001,
        type=float,
        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel',
                        default=1,
                        type=int,
                        help='Whether use duel DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_double',
        default=1,
        type=int,
        help='Whether use double DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_per',
        default=1,
        type=int,
        help='Whether use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_distributional',
        default=1,
        type=int,
        help='Whether use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step',
                        default=1,
                        type=int,
                        help='Num Step for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy',
                        default=1,
                        type=int,
                        help='Whether use NoisyNet, 0 means no, 1 means yes.')

    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)
    print('Environment: %s.' % (args.env, ))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process,
                                         args.window_size, args.input_shape,
                                         NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE,
                                                 args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                     args.input_shape)

    create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network
    create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model
    noisy = True if args.is_noisy == 1 else False
    online_model, online_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'online_model',
                                                  create_network_fn,
                                                  trainable=True,
                                                  noisy=noisy)
    target_model, target_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'target_model',
                                                  create_network_fn,
                                                  trainable=False,
                                                  noisy=noisy)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions,
                     args.gamma, UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size, args.is_double,
                     args.is_per, args.is_distributional, args.num_step,
                     args.is_noisy, args.learning_rate, RMSP_DECAY,
                     RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions,
                                          NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train:
        fit_iteration = int(args.num_iteration * args.eval_every)

        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
Example no. 12
    def train(self, env, steps_per_epoch=128, epochs=10000):
        # Every four actions a gradient descent step is performed
        UPDATE_FREQ = 4
        # Number of chosen actions between updating the target network.
        NETW_UPDATE_FREQ = 10000
        # Replay mem
        REPLAY_MEMORY_START_SIZE = 33
        # Create network model
        self.main_network.compile(optimizer=tf.keras.optimizers.Adam(),
                                  loss='mse')
        # Replay memory
        my_replay_memory = ReplayMemory()
        # Metrics
        loss_avg = tf.keras.metrics.Mean()
        train_reward_tot = tf.keras.metrics.Sum()
        train_rew_comf_tot = tf.keras.metrics.Sum()
        train_rew_eff_tot = tf.keras.metrics.Sum()
        train_rew_safe_tot = tf.keras.metrics.Sum()
        train_coll_rate = tf.keras.metrics.Mean()
        train_speed_rate = tf.keras.metrics.Mean()

        # Training loop: collect samples, send them to the optimizer, repeat for the given number of epochs.
        next_obs = env.reset(gui=True, numVehicles=40)
        first_epoch = 0
        try:
            for epoch in range(first_epoch, epochs):
                ep_rewards = 0
                for step in range(steps_per_epoch):
                    # curr state
                    state = next_obs.copy()
                    # get action
                    action = self.act(state, self.main_network)
                    # do step
                    next_obs, rewards_info, done, collision = env.step(action)
                    # process obs and get rewards
                    avg_speed_perc = env.speed / env.target_speed
                    rewards_tot, R_comf, R_eff, R_safe = rewards_info
                    # Add experience
                    my_replay_memory.add_experience(action=action,
                                                    frame=next_obs,
                                                    reward=rewards_tot,
                                                    terminal=done)
                    # Update metrics
                    train_reward_tot.update_state(rewards_tot)
                    train_rew_comf_tot.update_state(R_comf)
                    train_rew_eff_tot.update_state(R_eff)
                    train_rew_safe_tot.update_state(R_safe)
                    train_coll_rate.update_state(collision)
                    train_speed_rate.update_state(avg_speed_perc)

                    # Train every UPDATE_FREQ times
                    if self.steps_done > REPLAY_MEMORY_START_SIZE:
                        loss_value = self.train_step_(my_replay_memory)
                        loss_avg.update_state(loss_value)
                        self.update_network()
                    else:
                        loss_avg.update_state(-1)
                    # Copy network from main to target every NETW_UPDATE_FREQ
                    if (self.steps_done % NETW_UPDATE_FREQ == 0
                            and self.steps_done > REPLAY_MEMORY_START_SIZE):
                        self.target_network.set_weights(
                            self.main_network.get_weights())

                    self.steps_done += 1

                # Write
                with self.train_summary_writer.as_default():
                    tf.summary.scalar('loss', loss_avg.result(), step=epoch)
                    tf.summary.scalar('reward_tot',
                                      train_reward_tot.result(),
                                      step=epoch)
                    tf.summary.scalar('rewards_comf',
                                      train_rew_comf_tot.result(),
                                      step=epoch)
                    tf.summary.scalar('rewards_eff',
                                      train_rew_eff_tot.result(),
                                      step=epoch)
                    tf.summary.scalar('rewards_safe',
                                      train_rew_safe_tot.result(),
                                      step=epoch)
                    tf.summary.scalar('collision_rate',
                                      train_coll_rate.result(),
                                      step=epoch)
                    tf.summary.scalar('avg speed wrt maximum',
                                      train_speed_rate.result(),
                                      step=epoch)

                # Reset
                train_reward_tot.reset_states()
                train_rew_comf_tot.reset_states()
                train_rew_eff_tot.reset_states()
                train_rew_safe_tot.reset_states()
                train_coll_rate.reset_states()
                train_speed_rate.reset_states()

                # Save model
                if epoch % 1000 == 0:
                    tf.keras.models.save_model(self.main_network,
                                               self.model_dir + "/" +
                                               str(epoch) +
                                               "_main_network.hp5",
                                               save_format="h5")
                    tf.keras.models.save_model(self.target_network,
                                               self.model_dir + "/" +
                                               str(epoch) +
                                               "_target_network.hp5",
                                               save_format="h5")
        except KeyboardInterrupt:
            # self.model.save_weights(self.model_dir+"/model.ckpt")
            tf.keras.models.save_model(self.main_network,
                                       self.model_dir + "/" + str(epoch) +
                                       "_main_network.hp5",
                                       save_format="h5")
            tf.keras.models.save_model(self.target_network,
                                       self.model_dir + "/" + str(epoch) +
                                       "_target_network.hp5",
                                       save_format="h5")

        env.close()

        return 0
Example no. 13
def train(sess, env, args, actors, critics, noise):
    summary_ops, summary_vars = build_summaries()
    # summary_ops,episode_reward1 = build_summaries()
    init = tf.initialize_all_variables()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    for a in actors:
        a.update_target()
    for b in critics:
        b.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes']) + 1):
        print('starting running')
        print('this is {} of epoch'.format(ep))
        s = env.reset()
        episode_reward = np.zeros((env.n, ))

        if ep % 1000 == 0:
            for k in range(env.n):
                file1 = 'results/actor' + str(k) + str(ep) + '.h5'
                # file2 = 'results/actor'+str(k)+'/target'+str(ep)+'.h5'
                file3 = 'results/critic' + str(k) + str(ep) + '.h5'
                # file4 = 'results/critic'+str(k)+'/target'+str(ep)+'.h5'
                actor = actors[k]
                critic = critics[k]
                actor.mainModel.save(file1)
                # actor.targetModel.save(file2)
                critic.mainModel.save(file3)
                # critic.targetModel.save(file4)
        plt.close()
        plt.figure()
        for stp in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render(s)
                plt.clf()
            a = []  # shape=(n,actor.action_dim)
            for i in range(env.n):
                actor = actors[i]
                a.append(
                    actor.act(np.reshape(s[i], (-1, actor.state_dim)),
                              noise[i]()).reshape(actor.action_dim, ))
            s2, r, done = env.step(
                a)  # a is a list with each element being an array
            replayMemory.add(s, a, r, done, s2)
            s = s2
            action_dims_done = 0
            for i in range(env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):

                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['minibatch_size']))
                    a = []
                    for j in range(env.n):
                        state_batch_j = np.asarray(
                            [x for x in s_batch[:, j]]
                        )  # batch processing will be much more efficient even though reshaping will have to be done
                        a.append(actors[j].predict_target(state_batch_j))

                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([
                        x for x in s2_batch[:, i]
                    ])  # Checked till this point, should be fine.
                    targetQ = critic.predict_target(
                        s2_batch_i, a_for_critic)  # Should  work, probably

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(
                        s_batch_i, np.asarray([x.flatten() for x in a_batch]),
                        np.reshape(yi, (int(args['minibatch_size']), 1)))

                    actions_pred = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(
                            actors[j].predict(state_batch_j)
                        )  # Should work till here, roughly, probably

                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(
                        s_batch_i,
                        a_for_critic_pred)[:,
                                           action_dims_done:action_dims_done +
                                           actor.action_dim]
                    actor.train(s_batch_i, grads)
                    actor.update_target()
                    critic.update_target()

                action_dims_done = action_dims_done + actor.action_dim
            episode_reward += r
            # print(done)
            if sum(done):
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward[0],
                                           summary_vars[1]: episode_reward[2]
                                       })
                writer.add_summary(summary_str, ep)
                writer.flush()
                break
class AgentCartpole:
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])

        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(), self.p['LEARNING_RATE'])

        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except Exception:
            print("No saved data found")

    def act(self, state):
        r = random.random()

        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state)
            q_value = self.eval_dqn(x)
            action = torch.argmax(q_value).item()
            return action
        else:
            action = random.randint(0, self.p['N_ACTIONS']-1)
            return action

    def learn(self):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Get the current state dicts of the two networks
        eval_dict = self.eval_dqn.state_dict()
        target_dict = self.target_dqn.state_dict()

        # Soft update (Polyak averaging) of the target DQN parameters
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] + self.p['ALPHA'] * eval_dict[w]
        self.target_dqn.load_state_dict(target_dict)

        # Get a sample of size BATCH
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.pop(self.p['BATCH_SIZE'])

        # Decay the exploration threshold used by act() every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()

        # Compute q values for the current evaluation
        q_eval = self.eval_dqn(batch_state).gather(1, batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])

        # Compute the next state q values
        q_next = self.target_dqn(batch_next_state).detach()

        # Compute the targetted q values
        q_target = batch_reward + q_next.max(1)[0].reshape([self.p["BATCH_SIZE"]]) * self.p["GAMMA"]
        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        l.backward()
        self.optimizer.step()

    def random(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        env.reset()
        rewards = []
        while True:
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                break

        env.close()
        plt.ylabel("Rewards")
        plt.xlabel("Nb interactions")
        plt.plot(rewards)
        plt.grid()
        plt.show()

    def dqn_cartpole(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        rewards = []
        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            for s in range(self.p['N_STEPS']):
                # env.render()
                action = self.act(state)
                n_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1
                rewards[-1] += reward

                self.memory.push(state, action, n_state, reward, done)
                self.learn()
                state = n_state

            print('Episode : ', i, ', Rewards : ', rewards[-1])

            # Save the eval model after each episode
            torch.save(self.eval_dqn.state_dict(), "Model/eval_dqn.data")

        # Display result
        n = 50
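        # average the rewards over blocks of 50 episodes and repeat each block
        # average 50 times so it can be plotted against the per-episode curve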
        res = sum(([a]*n for a in [sum(rewards[i:i+n])//n for i in range(0,len(rewards),n)]), [])
        print(rewards)
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.plot(res)
        plt.grid()
        plt.legend(['Rewards per episode', 'Last 50 runs average'])
        plt.show()
        env.close()
Example no. 15
def algorithmImpl():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = gym.make(
        "BreakoutDeterministic-v4")  # Deterministic-v4; frameskip = 4

    numActions = env.action_space.n
    mem = ReplayMemory(MEM_CAPACITY)
    agent = Agent(EP_START, EP_END, EP_DECAY, numActions, device)
    policyNet = DQN(numActions).to(device)
    targetNet = DQN(numActions).to(device)
    targetNet.load_state_dict(policyNet.state_dict())
    targetNet.eval()
    optimizer = optim.Adam(params=policyNet.parameters(), lr=LEARNIN_RATE)

    stepCount = 0

    for ep in range(EPISODES):
        print('episode: ', ep + 1)
        done = False
        obv = env.reset()
        preproObv = preprocessing(obv)
        frames = [preproObv]
        nextFrames = []
        lastAction = 0
        totalReward = 0

        for t in count():
            if len(frames) == 4:
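                # a new action is chosen only once every 4 stacked frames;
                # otherwise the previous action is repeated below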
                state = torch.cat(frames, dim=1).to(
                    device)  # returns tensor of 1x4x84x84
                action = agent.selectAction(state, policyNet)
                frames = []
            else:
                action = lastAction

            obv, r, done, _ = env.step(action)
            preproObv = preprocessing(obv)
            frames.append(preproObv)
            nextFrames.append(preproObv)
            totalReward += r  # for evaluation
            if done:
                r = -1.0
            reward = torch.tensor(r).reshape(1, 1).to(device)

            lastAction = action

            if len(nextFrames) == 4:
                nextState = torch.cat(nextFrames, dim=1).to(
                    device)  # returns tensor of 1x4x84x84
                nextFrames = []
                mem.push(Experience(state, action, reward, nextState))
                state = nextState

                if mem.canProvideSample(BATCH_SIZE):
                    exps = mem.sample(BATCH_SIZE)
                    states, actions, rewards, nextStates = extractTensors(exps)
                    qPred = policyNet(states).gather(1, actions)

                    qTarget = targetNet(nextStates).max(
                        dim=1, keepdim=True)[0].detach()
                    target = GAMMA * qTarget + rewards

                    loss = functional.mse_loss(qPred, target)
                    policyNet.zero_grad()
                    loss.backward()
                    optimizer.step()

                stepCount += 1
                if stepCount == TARGET_UPDATE:
                    stepCount = 0
                    targetNet.load_state_dict(policyNet.state_dict())
                    print("SavedModels/Saved model")
                    torch.save(policyNet, "Policy.pt")

            if ep % 20 == 0:
                env.render()

            if done:
                break
    torch.save(policyNet, "SavedModels/Policy.pt")
Example no. 16
print("device", device)

if not os.path.exists(file_path):
    os.makedirs(file_path)

write_lr(lr)  # create a text file for lr

now = datetime.datetime.now()
print('{0:%Y%m%d}'.format(now))

# tensorboardX
writer_x = SummaryWriter('tfbx2/' + '_' + '{0:%Y%m%d%H%M%S_}'.format(now) +
                         model_filename + MEMO + '/')

ban = Env(BANHEN, WINREN)
memory = ReplayMemory(CAPACITY, ban)
brain = Brain_dqn(NeuralNet_cnn, device, ban.size, ban, memory, GAMMA,
                  BATCH_SIZE, lr, T, BANHEN, BANSIZE)

match_is_continue = True  # whether the match is still in progress
train_is_continue = True  # whether to continue training
reward = 0  # reward
step = 0  # current move number
step_sum = 0
gen_num = 0  # initial model value
episode_sum = 0  # cumulative number of episodes
search_depth = 3
ep_random_data = 0

log_print("lr is read from the text file")
Example no. 17
weapons = ["Candlestick", "Knife", "Lead Pipe", "Revolver", "Rope", "Wrench"]
characters = [
    "Mr. Green", "Colonel Mustard", "Mrs. Peacock", "Professor Plum",
    "Ms. Scarlet", "Mrs. White"
]

if numQlearnPlayers > 0:
    qtbl = QTable(rooms, weapons, characters)
else:
    qtbl = {}

if numDeepQLearnPlayers > 0:
    if os.path.exists("QNetworks.pickle"):
        nets = pickle.load(open("QNetworks.pickle", "rb"))
        rm = ReplayMemory(
            100000,
            namedtuple('Transition',
                       ('state', 'action', 'next_state', 'reward')))
        qNetworks = (nets[0], nets[1], rm)
    else:
        n1 = QNetwork(6, 6, 67220).to(
            torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        n2 = QNetwork(6, 6, 67220).to(
            torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        rm = ReplayMemory(
            100000,
            namedtuple('Transition',
                       ('state', 'action', 'next_state', 'reward')))
        qNetworks = (n1, n2, rm)
else:
    qNetworks = ()
Example no. 18
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #Setup Tensorboard
    tsb_folder = "tensorboard"
    tsb_dirname = "DQN_%d" % (args.target_update)
    tsb_path = os.path.join(tsb_folder, tsb_dirname)
    writer = SummaryWriter(tsb_path)

    BATCH_SIZE = 32
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Transition = namedtuple('Transition',
                            ('state', 'action', 'next_state', 'reward'))
    print("TRAINING ON RANDOM INSTANCES")

    memory = ReplayMemory(args.memory_capacity)
    brain = DQNet(args)
    i_episode = 0
    steps_done = 0
    trainPath = "./Data/MediumData/"
    print("Start Initialization")
    while len(memory) < 5 * BATCH_SIZE:
        #generate random dataset
        randomFile = random.choice(os.listdir(trainPath))
        randomFile = os.path.join(trainPath, randomFile)
        print(randomFile)
        instance = readMatInstance(randomFile)
        loss, reward, real_reward = run_episode(args,
                                                i_episode,
                                                instance,
                                                memory_initialization=True)
Example no. 19
gamma = 0.999
eps_start = 1
eps_end = 0.01
eps_decay = 0.001
target_update = 10  #update the target network every 10 episodes
memory_size = 100000
lr = 0.001  #learning rate
num_episodes = 1000

#set the device (cpu or gpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#environment manager
em = CartPoleEnvManager(device)
#create the strategy
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)

#create agent
agent = Agent(strategy, em.num_actions_available(), device)
#create replay memory
memory = ReplayMemory(memory_size)

#create policy network and target network
#pass height and width to create appropriate input shape

policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)

target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
Example no. 20
File: train.py Project: cgel/DQN_tf
    device = args.device
    gamma = args.gamma
    learning_rate = args.learning_rate
    exploration_steps = args.exploration_steps
    initial_epsilon = args.initial_epsilon
    final_epsilon = args.final_epsilon
    sync_rate = args.sync_rate
    save_summary_rate = args.save_summary_rate

def get_epsilon():
    if global_step < config.exploration_steps:
        return config.initial_epsilon - (
            (config.initial_epsilon - config.final_epsilon) /
            config.exploration_steps) * global_step
    else:
        return config.final_epsilon
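
# For illustration (hypothetical numbers): with initial_epsilon = 1.0,
# final_epsilon = 0.1 and exploration_steps = 1e6, get_epsilon() decays
# linearly from 1.0 to 0.1 over the first million steps, then stays at 0.1.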

RM = ReplayMemory(config)

def flush_print(str):
    print(str)
    sys.stdout.flush()

def preprocess(new_frame, state):
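    # shift the frame stack left by one along the buffer axis and write the
    # resized newest frame into the last slot (config.buff_size - 1)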
    frame = cv2.resize(new_frame, (84, 84))
    new_state = np.roll(state, -1, axis=3)
    new_state[0, :, :, config.buff_size - 1] = frame
    return new_state



with tf.device(config.device):
    input_state_ph = tf.placeholder(tf.float32,[config.batch_size,84,84,4], name="input_state_ph")
Example no. 21
class BaseAgent:

    # must be implemented by each agent
    def update(self):
        return

    def __init__(self, config, session):
        # build the net
        self.config = config
        self.sess = session
        self.RM = ReplayMemory(config)
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        self.game_state = np.zeros(
            (1, 84, 84, self.config.buff_size), dtype=np.uint8)
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)

        # if the new agent needs other action modes define a different dict
        self.action_modes = {
            str(config.epsilon) + "_greedy": self.e_greedy_action}
        self.default_action_mode = next(iter(self.action_modes))
        self.action_mode = self.default_action_mode

        self.representations = []

    def step(self, screen, reward):
        # clip the reward
        if not self.isTesting:
            # add the last transition
            self.RM.add(self.game_state[:, :, :, -1],
                        self.game_action, self.game_reward, False)
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config.steps_before_training:
                self.update()
            self.step_count += 1
        else:
            # if the agent is testing
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action

    # Add the final transition to the RM and reset the internal state for the next
    # episode
    def terminal(self):
        if not self.isTesting:
            self.RM.add(
                self.game_state[:, :, :, -1],
                self.game_action, self.game_reward, True)
        self.reset_game()

    def observe(self, screen, reward):
        self.game_reward = max(-1, min(1, reward))
        screen = cv2.resize(screen, (84, 84))
        screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = screen

    def e_greedy_action(self, epsilon):
        ops = [self.Q] + self.representations
        res = self.sess.run(ops, feed_dict={
            self.state_ph: self.game_state})

        self.Q_np = res[0]
        self.representations_np = res[1:]

        action = np.argmax(self.Q_np)
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.config.action_num - 1)
        return action

    def testing(self, t=True):
        self.isTesting = t

    def set_action_mode(self, mode):
        if mode not in self.action_modes:
            raise Exception(str(mode) + " is not a valid action mode")
        self.select_action = self.action_modes[mode]

    def reset_game(self):
        self.game_state.fill(0)
        self.game_action = 0
        self.game_reward = 0
        if not self.isTesting:
            # add initial black screens for next episode
            for i in range(self.config.buff_size - 1):
                self.RM.add(np.zeros((84, 84)), 0, 0, False)

    def epsilon(self):
        if self.step_count < self.config.exploration_steps:
            return self.config.initial_epsilon - \
                ((self.config.initial_epsilon - self.config.final_epsilon) /
                 self.config.exploration_steps) * self.step_count
        else:
            return self.config.final_epsilon
Example no. 22

if config.h_to_h not in ["oh_concat", "expanded_concat", "conditional"]:
    raise "Not valid transition function"


def get_epsilon():
    if global_step < config.exploration_steps:
        return config.initial_epsilon - (
            (config.initial_epsilon - config.final_epsilon) /
            config.exploration_steps) * global_step
    else:
        return config.final_epsilon


RM = ReplayMemory(config)


def flush_print(str):
    print(str)
    sys.stdout.flush()


def preprocess(new_frame, state):
    frame = cv2.resize(new_frame, (84, 84))
    new_state = np.roll(state, -1, axis=3)
    new_state[0, :, :, config.buff_size - 1] = frame
    return new_state


with tf.device(config.device):