Example #1
 def __init__(self,
              q_network,
              q_network2,
              preprocessor: Preprocessor,
              RLmemory: ReplayMemory,
              SLmemory: ReplayMemory,
              policy,
              gamma,
              target_update_freq,
              num_burn_in,
              train_freq,
              batch_size,
              algorithm='DoubleDQN',
              render=False):
     self.net = q_network
     self.net2 = q_network2
     self.pre = preprocessor
     self.rl_mem = RLmemory
     self.sl_mem = SLmemory
     self.policy = policy
     self.gamma = gamma
     self.renew = target_update_freq
     self.burn_in = num_burn_in
     self.train_freq = train_freq
     self.batch_size = batch_size
     self.algorithm = algorithm
     self.render = render
Example #2
    def __init__(self, clip_grad=True,
                 num_episodes=50,
                 trajectory_len=MAX_STEPS,
                 custom_func=None,
                 custom_func_args=None
                 ):
        '''
        Initialization
        :param clip_grad: bool: flag for clipping gradients with value 1
        :param num_episodes: number of episodes to run
        :param trajectory_len: maximal number of steps in each trajectory
        :param custom_func: custom reward function
        :param custom_func_args: custom reward function arguments
        '''
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.build_nn()
        self.num_episodes = num_episodes
        self.trajectory_len = trajectory_len
        self.model = DQN()
        self.replay = ReplayMemory(10000)
        self.steps_done = 0
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.clip_grad = clip_grad

        self.rewards = []
        self.modules = []
        self.env = ChainAgent(inventory_level=10,
                              fix_delay=1,
                              max_num_steps=MAX_STEPS + 10,
                              demand_generation_function=self.demand_generation_function,
                              custom_func=custom_func,
                              custom_func_args=custom_func_args)
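
A usage sketch (not from the source) of the custom reward hooks above, assuming this __init__ belongs to the LearnerDQN class shown in Example #8 and that ChainAgent calls custom_func with the current state and base reward; the function name, signature, and penalty value are hypothetical:

def shortage_penalty_reward(state, reward, penalty=0.5):
    # Hypothetical custom reward: keep the environment reward but subtract a
    # penalty per unit of unmet demand (assumes state[0] tracks inventory).
    shortage = max(0, -state[0])
    return reward - penalty * shortage

learner = LearnerDQN(clip_grad=True,
                     num_episodes=50,
                     custom_func=shortage_penalty_reward,
                     custom_func_args={'penalty': 0.5})
learner.run()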
Example #3
    def sample_from_Replay_Memory(self, batches, replay_memory, net):
        current_states = []
        actions = []
        q_values = []

        for samples in replay_memory.sample(batches):
            state, action, reward, next_state, is_done = [samples.state,
                                                          samples.action,
                                                          samples.reward,
                                                          samples.next_state,
                                                          samples.done]

            next_state = np.expand_dims(np.asarray(next_state).astype(np.float64), axis=0)

            current_states.append(state)
            actions.append(action)
            target = reward
            if not is_done:
                target = reward + self.gamma * np.amax(net.predict(next_state)[0])
            target_f = net.predict(state)[0]
            target_f[action] = target
            q_values.append(target_f)
        current_states = np.reshape(current_states, (-1, DIM_STATES))
        q_values = np.reshape(q_values, (-1, NUM_ACTIONS))
        return current_states, actions, q_values
Example #4
    def __init__(self,
                 num_episodes=NUM_EPISODES,
                 trajectory_len=MAX_STEPS,
                 clip_grad=True):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_episodes = num_episodes
        self.trajectory_len = trajectory_len
        self.replay = ReplayMemory(10000)
        self.steps_done = 0
        self.lr = 1e-3
        self.actor = Actor()
        self.optimizer = optim.Adam(self.actor.parameters(), lr=self.lr)
        self.clip_grad = clip_grad
        self.trajectory = []
        self.trajectories = []
        self.optimize_each = 5
        self.gamma = 0.99

        self.env = ChainAgent(
            inventory_level=10,
            fix_delay=1,
            max_num_steps=MAX_STEPS + 10,
            demand_generation_function=self.demand_generation_function)
Example #5
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description="Run DQN on iLOCuS")
    parser.add_argument("--network_name",
                        default="deep_q_network",
                        type=str,
                        help="Type of model to use")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size")
    parser.add_argument("--map_shape",
                        default=(15, 15),
                        type=tuple,
                        help="map size")
    parser.add_argument("--num_actions",
                        default=4,
                        type=int,
                        help="level of pricing")

    parser.add_argument("--gamma",
                        default=0.8,
                        type=float,
                        help="Discount factor")
    parser.add_argument("--alpha",
                        default=0.0001,
                        type=float,
                        help="Learning rate")
    parser.add_argument("--epsilon",
                        default=0.5,
                        type=float,
                        help="Exploration probability for epsilon-greedy")
    parser.add_argument("--target_update_freq",
                        default=10000,
                        type=int,
                        help="Frequency for copying weights to target network")
    parser.add_argument(
        "--num_iterations",
        default=5000000,
        type=int,
        help="Number of overal interactions to the environment")
    parser.add_argument("--max_episode_length",
                        default=200000,
                        type=int,
                        help="Terminate earlier for one episode")
    parser.add_argument("--train_freq",
                        default=4,
                        type=int,
                        help="Frequency for training")
    parser.add_argument("--num-burn-in",
                        default=10000,
                        type=int,
                        help="number of memory before train")

    parser.add_argument("-o",
                        "--output",
                        default="ilocus-v0",
                        type=str,
                        help="Directory to save data to")
    parser.add_argument("--seed", default=0, type=int, help="Random seed")
    parser.add_argument("--train",
                        default=True,
                        type=bool,
                        help="Train/Evaluate, set True if train the model")
    parser.add_argument("--model_path",
                        default="atari-v0",
                        type=str,
                        help="specify model path to evaluation")
    parser.add_argument("--max_grad",
                        default=1.0,
                        type=float,
                        help="Parameter for huber loss")
    parser.add_argument("--log_dir",
                        default="log",
                        type=str,
                        help="specify log folder to save evaluate result")
    parser.add_argument(
        "--flip_coin",
        default=False,
        type=str,
        help="specify whether or not choosing double q learning")
    parser.add_argument("--eval_num",
                        default=100,
                        type=int,
                        help="number of evaluation to run")
    parser.add_argument("--save_freq",
                        default=100000,
                        type=int,
                        help="model save frequency")

    # memory related args
    parser.add_argument("--buffer_size",
                        default=100000,
                        type=int,
                        help="reply memory buffer size")
    parser.add_argument(
        "--look_back_steps",
        default=4,
        type=int,
        help="how many previous pricing tables will be fed into RL")

    args = parser.parse_args()
    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Initialize the policy used for both training and evaluation
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0.1, 1000000,
                                            args.num_actions)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluate")
            exit(1)

        # specific log file to save result
        log_file = os.path.join(args.log_dir, args.network_name,
                                str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name,
                                 str(args.model_num))

        with tf.Session() as sess:
            # load model
            # with open(model_dir + ".json", 'r') as json_file:
            #     loaded_model_json = json_file.read()
            #     q_network_online = model_from_json(loaded_model_json)
            #     q_network_target = model_from_json(loaded_model_json)
            #
            # sess.run(tf.global_variables_initializer())
            #
            # # load weights into model
            # q_network_online.load_weights(model_dir + ".h5")
            # q_network_target.load_weights(model_dir + ".h5")

            driver_sim = DriverSim()
            env = Environment(driver_sim=driver_sim)

            memory = ReplayMemory(args.buffer_size, args.look_back_steps)
            q_network = create_model(args.look_back_steps, args.map_shape,
                                     args.num_actions)
            dqn_agent = DQNAgent(q_network=q_network,
                                 memory=memory,
                                 policy=policy,
                                 gamma=args.gamma,
                                 target_update_freq=args.target_update_freq,
                                 num_burn_in=args.num_burn_in,
                                 train_freq=args.train_freq,
                                 batch_size=args.batch_size)
        exit(0)
    '''Train the model'''

    with tf.Session() as sess:
        # with tf.device('/cpu:0'):
        print("created model")

        driver_sim = DriverSim()
        env = Environment(driver_sim=driver_sim)
        print("set up environment")

        # # create output dir; raise an error if it already exists to avoid overwriting
        # os.mkdir(args.output + "/" + args.network_name)

        memory = ReplayMemory(args.buffer_size, args.look_back_steps)
        q_network = create_model(args.look_back_steps, args.map_shape,
                                 args.num_actions)
        dqn_agent = DQNAgent(q_network=q_network,
                             memory=memory,
                             policy=policy,
                             gamma=args.gamma,
                             target_update_freq=args.target_update_freq,
                             num_burn_in=args.num_burn_in,
                             train_freq=args.train_freq,
                             batch_size=args.batch_size)
        print("defined dqn agent")

        optimizer = Adam(learning_rate=args.alpha)
        q_network.compile(optimizer, mean_huber_loss)

        sess.run(tf.global_variables_initializer())

        print("initializing environment")
        env.reset()

        print("in fit")
        if os.path.exists(args.output):
            shutil.rmtree(args.output)
        os.mkdir(args.output)
        dqn_agent.fit(env=env,
                      num_iterations=args.num_iterations,
                      output_dir=os.path.join(args.output),
                      max_episode_length=args.max_episode_length)
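
The script compiles q_network with mean_huber_loss and documents --max_grad as the Huber-loss parameter, but the loss itself is not shown. A minimal TensorFlow sketch of such a loss (the function name matches the call above; the signature and default max_grad are assumptions):

import tensorflow as tf

def mean_huber_loss(y_true, y_pred, max_grad=1.0):
    # Quadratic within +/- max_grad of the target, linear outside, averaged over
    # the batch; this is the usual Huber loss used to clip large Q-learning updates.
    err = tf.abs(y_true - y_pred)
    quadratic = 0.5 * tf.square(err)
    linear = max_grad * (err - 0.5 * max_grad)
    return tf.reduce_mean(tf.where(err < max_grad, quadratic, linear))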
Example #6
#initialize neural network to store policy

#load environment
env = gym.make('SpaceInvaders-v0')

#make the q_network
q_network_online = Sequential()
q_network_target = Sequential()

#make the preprocessors
history_preproc = HistoryPreprocessor(4)
atari_preproc = AtariPreprocessor()
preprocessor = PreprocessorSequence(atari_preproc, history_preproc)
#make the replay memory
memory = ReplayMemory()

#make the policy
policy = LinearDecayGreedyEpsilonPolicy(0, 0, 6, 0.8, 0.05, 100000)

#set the discount factor gamma
gamma = 0.99

#target_update_freq
target_update_freq = 10000

#num_burn_in : DUMMY
num_burn_in = 10

#train_freq : DUMMY
train_freq = 10
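
LinearDecayGreedyEpsilonPolicy is constructed above with what appear to be start/end epsilon values and a decay horizon; its implementation is not shown. A plain-Python sketch of the schedule it names (parameter order and names are assumptions), decaying epsilon linearly from 0.8 to 0.05 over 100000 steps:

def linear_decay_epsilon(step, start_value=0.8, end_value=0.05, num_steps=100000):
    # Linearly interpolate from start_value to end_value, then hold at end_value.
    fraction = min(step, num_steps) / float(num_steps)
    return start_value + fraction * (end_value - start_value)

# e.g. step 0 -> 0.8, step 50000 -> 0.425, step >= 100000 -> 0.05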
Example #7
    learning_rate = 1e-4
    epsilon = 0.05
    num_training_samples = int(5e6)
    buffer_size = int(1e6)
    target_update_freq = int(1e4)
    batch_size = 32
    num_burn_in = int(5e4)
    train_freq = 1
    nA = env.action_space.n

    # create preprocessor class
    preprocessor = AtariPreprocessor(84)
    print('created preprocessor')

    # create replay buffer
    replay_buffer = ReplayMemory(buffer_size, history_length, 84)
    print('created replay buffer')

    # create DQN agent
    agent = DQNAgent(DQN, preprocessor, replay_buffer,
                     policy.GreedyEpsilonPolicy, gamma, target_update_freq,
                     num_burn_in, train_freq, batch_size, history_length, nA,
                     dtype, epsilon, model_name)
    print('created DQN agent')

    if mode == 'train':
        env = wrappers.Monitor(env,
                               '/tmp/SpaceInvaders-DQN-expt-train.' +
                               model_name,
                               force=True)
        agent.fit(env, num_training_samples)
Example #8
class LearnerDQN:
    '''
    Learner class: an abstraction bundling the experiment configuration, the necessary models, and everything needed to run the experiment.
    '''
    def __init__(self, clip_grad=True,
                 num_episodes=50,
                 trajectory_len=MAX_STEPS,
                 custom_func=None,
                 custom_func_args=None
                 ):
        '''
        Initialization
        :param clip_grad: bool: flag for clipping gradients with value 1
        :param num_episodes: number of episodes to run
        :param trajectory_len: maximal number of steps in each trajectory
        :param custom_func: custom reward function
        :param custom_func_args: custom reward function arguments
        '''
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.build_nn()
        self.num_episodes = num_episodes
        self.trajectory_len = trajectory_len
        self.model = DQN()
        self.replay = ReplayMemory(10000)
        self.steps_done = 0
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.clip_grad = clip_grad

        self.rewards = []
        self.modules = []
        self.env = ChainAgent(inventory_level=10,
                              fix_delay=1,
                              max_num_steps=MAX_STEPS + 10,
                              demand_generation_function=self.demand_generation_function,
                              custom_func=custom_func,
                              custom_func_args=custom_func_args)

    def select_action(self, state):
        '''
        Implementation of e-greedy approach
        :param state: input state to choose appropriate action
        :return: Tensor: action
        '''
        state = torch.Tensor(state)[None, :]
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            # print('greedy')
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            # print('random')
            return torch.tensor([[random.randrange(N_ACTIONS)]], device=self.device, dtype=torch.long)


    def demand_generation_function(self):
        '''
        Default function to generate demand
        :return: int: demand level
        '''
        return np.random.randint(0, 10)


    def optimize_model(self):
        '''
        Method of optimizing parameters of neural net
        :return:
        '''

        if len(self.replay) < BATCH_SIZE:
            return

        transitions = self.replay.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        # next_state_values = self.target_net(next_states).max(1)[0].detach()
        next_state_values = self.policy_net(next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        if self.clip_grad:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()


    def build_nn(self):
        '''
        Building torch graph
        :return:
        '''
        self.policy_net = DQN().to(self.device)
        # self.target_net = DQN().to(self.device)
        # self.target_net.load_state_dict(self.policy_net.state_dict())
        # self.target_net.eval()


    def get_stat(self, state, next_state, reward, action):
        '''
        Some visualisation
        :param state:
        :param next_state:
        :param reward:
        :param action:
        :return:
        '''
        print('=====')
        print('DEM: ', self.env.demand_next, 'ST: ', state, ' -> ', action)
        print('NXST: ', next_state, 'REW: ', reward)
        print('=====')
        print()


    def run(self):
        '''
        Main training loop. Iterates over at most num_episodes * trajectory_len steps
        :return:
        '''
        for i_episode in range(self.num_episodes):
            state = self.env.reset()
            rewards = 0
            for step in range(self.trajectory_len):
                action = self.select_action(torch.Tensor(state))
                next_state, reward, done, _ = self.env.step(action.item())
                reward *= 1.
                rewards += reward
                reward = torch.tensor([reward], device=self.device)
                self.replay.push(torch.Tensor([state]), action, torch.Tensor([next_state]), reward)
                state = next_state
                self.optimize_model()

                if done:
                    break

            self.rewards.append(rewards)

            if i_episode % TARGET_UPDATE == 0:
                print(i_episode, ' : ', np.array(self.rewards[-100:]).mean())
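
build_nn leaves the target network commented out, so optimize_model bootstraps from policy_net directly. If the target network were enabled, a minimal hard-sync helper (a sketch, assuming the DQN module and TARGET_UPDATE constant from this example) could be used in run():

import torch

def sync_target(policy_net: torch.nn.Module, target_net: torch.nn.Module) -> None:
    # Hard update: copy the online network's weights into the target network
    # and keep the target in eval mode, since it is only used for bootstrapping.
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

# inside run(), e.g.:
#     if i_episode % TARGET_UPDATE == 0:
#         sync_target(self.policy_net, self.target_net)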
Example #9
    def fit_nash(self, env: VehicleFollowingENV, num_iterations, episode, total_step, max_episode_length=None):
        """
        Fit with Nash Equilibrium
        """
        # RL network: LSTM
        self.p1_net = self.net  # target network
        self.p1_net2 = self.net2

        self.p2_net = deepcopy(self.net)
        self.p2_net2 = deepcopy(self.net2)

        # SL network: NN
        self.p1_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
        self.p2_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)

        self.p1_policy.compile('Adam', categorical_crossentropy)
        self.p2_policy.compile('Adam', mse)

        # ReplayMemory
        self.p1_RL_mem = ReplayMemory(max_size=100000)
        self.p2_RL_mem = ReplayMemory(max_size=100000)
        self.p1_SL_mem = ReplayMemory(max_size=100000)
        self.p2_SL_mem = ReplayMemory(max_size=100000)

        # MainLoop
        state = env.reset()
        total_reward = 0
        done = False

        for i in range(num_iterations):
            total_step += 1

            # if self.render:
            #     env.render()
            if max_episode_length and i > max_episode_length:
                break

            if np.random.random() < ETA:
                best_response = True
            else:
                best_response = False

            if best_response:
                p1_action = self.select_action(state, net=self.p1_net)
                p2_action = self.select_action(state, net=self.p2_net)
            else:
                p1_action = self.select_action(state, net=self.p1_policy)
                p2_action = self.select_action(state, net=self.p2_policy)

            next_state, reward, done = env.step(action_weight=p1_action, action_attacker=p2_action)

            self.p1_RL_mem.append((state, p1_action, RC - reward, next_state, done))
            self.p2_RL_mem.append((state, p2_action, reward, next_state, done))
            self.p1_SL_mem.append((state, p1_action))
            self.p2_SL_mem.append((state, p2_action))

            total_reward += reward

            if done:
                with open(self.algorithm + 'total_reward.txt', 'a') as f:
                    f.write('Episode ({}), reward: ({})\n'.format(episode, total_reward))
                print("Episode finished after {} time steps, total_reward is {}...".format(i, total_reward))
                break

            if total_step % self.renew == 0 and total_step != 0:
                self.p1_net2 = self.p1_net
                self.p2_net2 = self.p2_net

            # if total_step % 100000 == 0:
            #     self.save(total_step)

            if total_step >= self.burn_in and total_step % self.train_freq == 0:
                batches = min(self.batch_size, len(self.p1_RL_mem))
                p1_states, p1_actions, p1_q_values = self.sample_from_Replay_Memory(batches, self.p1_RL_mem,
                                                                                    self.p1_net)
                p2_states, p2_actions, p2_q_values = self.sample_from_Replay_Memory(batches, self.p2_RL_mem,
                                                                                    self.p2_net)

                self.p1_net.fit(p1_states, p1_q_values)
                self.p2_net.fit(p2_states, p2_q_values)
                self.p1_policy.fit(p1_states, p1_actions)
                self.p2_policy.fit(p2_states, p2_actions)

            state = next_state
        return total_step, done
Example #10
class DQNAgent:
    """
    Class implementing DQN.

    This is a basic outline of the functions/parameters you will need
    in order to implement the DQNAgent. This is just to get you
    started. You may need to tweak the parameters, add new ones, etc.


    Parameters
    ----------
    q_network: keras.models.Model
      Your Q-network model.
    preprocessor: deeprl_hw2.core.Preprocessor
      The preprocessor class. See the associated classes for more
      details.
    memory: deeprl_hw2.core.Memory
      Your replay memory.
    gamma: float
      Discount factor.
    target_update_freq: float
      Frequency to update the target network. You can either provide a
      number representing a soft target update (see utils.py) or a
      hard target update (see utils.py and the Atari paper); a sketch of
      both update schemes follows this example.
    num_burn_in: int
      Before you begin updating the Q-network your replay memory has
      to be filled up with some number of samples. This number says
      how many.
    train_freq: int
      How often you actually update your Q-Network. Sometimes
      stability is improved if you collect a couple samples for your
      replay memory, for every Q-network update that you run.
    batch_size: int
      How many samples in each minibatch.
    """

    def __init__(self,
                 q_network,
                 q_network2,
                 preprocessor: Preprocessor,
                 RLmemory: ReplayMemory,
                 SLmemory: ReplayMemory,
                 policy,
                 gamma,
                 target_update_freq,
                 num_burn_in,
                 train_freq,
                 batch_size,
                 algorithm='DoubleDQN',
                 render=False):
        self.net = q_network
        self.net2 = q_network2
        self.pre = preprocessor
        self.rl_mem = RLmemory
        self.sl_mem = SLmemory
        self.policy = policy
        self.gamma = gamma
        self.renew = target_update_freq
        self.burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.algorithm = algorithm
        self.render = render

    def create_SL_model(self, state_shape, num_actions):
        model = Sequential()
        model.add(Dense(32, input_shape=(state_shape,), activation='relu'))
        model.add(Dense(num_actions, activation='softmax'))
        return model

    def compile(self, optimizer, loss_func):
        """Setup all of the TF graph variables/ops.

        This is inspired by the compile method on the
        keras.models.Model class.

        This is a good place to create the target network, setup your
        loss function and any placeholders you might need.
        
        You should use the mean_huber_loss function as your
        loss_function. You can also experiment with MSE and other
        losses.

        The optimizer can be whatever class you want. We used the
        keras.optimizers.Optimizer class. Specifically the Adam
        optimizer.
        """
        self.net.compile(optimizer=optimizer, loss=loss_func)
        self.net2.compile(optimizer=optimizer, loss=loss_func)

    def calc_q_values(self, state, net):
        """Given a state (or batch of states) calculate the Q-values.

        Basically run your network on these states.

        Return
        ------
        Q-values for the state(s)
        """
        # with tf.Session() as f:
        #     print(state.eval())

        q_value = net.predict(state, steps=32)
        return q_value

    def select_action(self, state, net, process='training'):
        """Select the action based on the current state.

        You will probably want to vary your behavior here based on
        which stage of training your in. For example, if you're still
        collecting random samples you might want to use a
        UniformRandomPolicy.

        If you're testing, you might want to use a GreedyEpsilonPolicy
        with a low epsilon.

        If you're training, you might want to use the
        LinearDecayGreedyEpsilonPolicy.

        This would also be a good place to call
        process_state_for_network in your preprocessor.

        Returns
        --------
        selected action
        """
        assert process in ['sampling', 'testing', 'training'], 'Unsupported process.'

        epsilon = 0.1
        start_value = 1
        end_value = 0.1
        num_steps = 10 ** 6

        q_values = self.calc_q_values(state, net)

        if process == 'sampling':
            action = UniformRandomPolicy(len(q_values)).select_action()
        elif process == 'testing':
            action = GreedyEpsilonPolicy(epsilon).select_action(q_values)
        else:
            action = LinearDecayGreedyEpsilonPolicy(start_value, end_value, num_steps).select_action(q_values)

        return action

    def fit(self, env, num_iterations, max_episode_length=None):
        """Fit your model to the provided environment.

        Its a good idea to print out things like loss, average reward,
        Q-values, etc to see if your agent is actually improving.

        You should probably also periodically save your network
        weights and any other useful info.

        This is where you should sample actions from your network,
        collect experience samples and add them to your replay memory,
        and update your network parameters.

        Parameters
        ----------
        env: VehicleFollowingEnv
          This is your Atari environment. You should wrap the
          environment using the wrap_atari_env function in the
          utils.py
        num_iterations: int
          How many samples/updates to perform.
        max_episode_length: int
          How long a single episode should last before the agent
          resets. Can help exploration.
        """
        # state = self.pre.process_state_for_memory(env.reset())  # the preprocessor method body is just pass
        state = env.reset()  # get the initial state
        tmp = 0
        prev_action = np.zeros(4)  # initial previous action
        states = [state]
        state_ = np.zeros(4)
        for i in range(num_iterations):
            # env.render()
            if max_episode_length and i > max_episode_length:
                break
            if state_.all() <= 0:
                action = np.random.random(4)  # random action weights for the initial state
            else:
                # state_ = tf.squeeze(state_)
                # state_ = tf.reshape(state_, 1)
                action = self.select_action(state_, net=self.net, process='testing')
            # print(action)
            # print('action', action)
            next_state, reward, done = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(i + 1))
                break
            # next_state = self.pre.process_state_for_memory(next_state)
            states.append(next_state)
            tmp += 1
            self.rl_mem.append(state, prev_action, reward, next_state, done)
            # if tmp >= 6:
            #     # frames = states[-5:-1]
            #     # frames2 = states[-4:]
            #     # state_ = tf.concat([tf.expand_dims(i, 2) for i in frames], 2)
            #     # next_state_ = tf.concat([tf.expand_dims(i, 2) for i in frames2], 2)
            #     print(state, next_state)
            #
            #     states = states[-5:]
            prev_action = action
            if i % self.renew == 0 and i != 0:
                self.net2 = self.net
            if i != 0 and i % self.train_freq == 0:
                print('{}th iteration, {}th train starts.'.format(i, i // self.train_freq))
                batches = min(self.batch_size, len(self.rl_mem))
                current_states = []
                q_values = []
                for samples in self.rl_mem.sample(batches):
                    current_state, action, reward, next_state, is_done = [samples.state,
                                                                          samples.action,
                                                                          samples.reward,
                                                                          samples.next_state,
                                                                          samples.done]
                    # state = tf.reshape(tf.squeeze(current_state), 4)
                    # next_state = tf.reshape((tf.squeeze(current_state)), 4)
                    current_states.append(current_state)
                    target = reward
                    if not is_done:
                        if self.algorithm == 'NDQN':
                            target = reward + self.gamma * np.amax(self.net2.predict(next_state, steps=32)[0])
                        elif self.algorithm == 'DQN':
                            target = reward + self.gamma * np.amax(self.net.predict(next_state, steps=32)[0])
                        elif self.algorithm == 'DoubleDQN':
                            target = reward
                            # TODO
                        elif self.algorithm == 'DuelingDQN':
                            target = reward
                    print(state)
                    target_f = self.net.predict(states[-10:], steps=32)
                    print(len(target_f))
                    print(action)
                    target_f[action] = target
                    q_values.append(target_f)
                # current_states = tf.reshape(current_states, 4)
                current_states = np.asarray(current_states)
                q_values = np.reshape(q_values, (-1, 6))
                print(current_states.shape, q_values.shape)
                self.net.fit(current_states, q_values, steps_per_epoch=self.batch_size)

    def fit_nash(self, env: VehicleFollowingENV, num_iterations, episode, total_step, max_episode_length=None):
        """
        Fit with Nash Equilibrium
        """
        # RL network: LSTM
        self.p1_net = self.net  # target network
        self.p1_net2 = self.net2

        self.p2_net = deepcopy(self.net)
        self.p2_net2 = deepcopy(self.net2)

        # SL network: NN
        self.p1_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
        self.p2_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)

        self.p1_policy.compile('Adam', categorical_crossentropy)
        self.p2_policy.compile('Adam', mse)

        # ReplayMemory
        self.p1_RL_mem = ReplayMemory(max_size=100000)
        self.p2_RL_mem = ReplayMemory(max_size=100000)
        self.p1_SL_mem = ReplayMemory(max_size=100000)
        self.p2_SL_mem = ReplayMemory(max_size=100000)

        # MainLoop
        state = env.reset()
        total_reward = 0
        done = False

        for i in range(num_iterations):
            total_step += 1

            # if self.render:
            #     env.render()
            if max_episode_length and i > max_episode_length:
                break

            if np.random.random() < ETA:
                best_response = True
            else:
                best_response = False

            if best_response:
                p1_action = self.select_action(state, net=self.p1_net)
                p2_action = self.select_action(state, net=self.p2_net)
            else:
                p1_action = self.select_action(state, net=self.p1_policy)
                p2_action = self.select_action(state, net=self.p2_policy)

            next_state, reward, done = env.step(action_weight=p1_action, action_attacker=p2_action)

            self.p1_RL_mem.append((state, p1_action, RC - reward, next_state, done))
            self.p2_RL_mem.append((state, p2_action, reward, next_state, done))
            self.p1_SL_mem.append((state, p1_action))
            self.p2_SL_mem.append((state, p2_action))

            total_reward += reward

            if done:
                with open(self.algorithm + 'total_reward.txt', 'a') as f:
                    f.write('Episode ({}), reward: ({})\n'.format(episode, total_reward))
                print("Episode finished after {} time steps, total_reward is {}...".format(i, total_reward))
                break

            if total_step % self.renew == 0 and total_step != 0:
                self.p1_net2 = self.p1_net
                self.p2_net2 = self.p2_net

            # if total_step % 100000 == 0:
            #     self.save(total_step)

            if total_step >= self.burn_in and total_step % self.train_freq == 0:
                batches = min(self.batch_size, len(self.p1_RL_mem))
                p1_states, p1_actions, p1_q_values = self.sample_from_Replay_Memory(batches, self.p1_RL_mem,
                                                                                    self.p1_net)
                p2_states, p2_actions, p2_q_values = self.sample_from_Replay_Memory(batches, self.p2_RL_mem,
                                                                                    self.p2_net)

                self.p1_net.fit(p1_states, p1_q_values)
                self.p2_net.fit(p2_states, p2_q_values)
                self.p1_policy.fit(p1_states, p1_actions)
                self.p2_policy.fit(p2_states, p2_actions)

            state = next_state
        return total_step, done

    def sample_from_Replay_Memory(self, batches, replay_memory, net):
        current_states = []
        actions = []
        q_values = []

        for samples in replay_memory.sample(batches):
            state, action, reward, next_state, is_done = [samples.state,
                                                          samples.action,
                                                          samples.reward,
                                                          samples.next_state,
                                                          samples.done]

            next_state = np.expand_dims(np.asarray(next_state).astype(np.float64), axis=0)

            current_states.append(state)
            actions.append(action)
            target = reward
            if not is_done:
                target = reward + self.gamma * np.amax(net.predict(next_state)[0])
            target_f = net.predict(state)[0]
            target_f[action] = target
            q_values.append(target_f)
        current_states = np.reshape(current_states, (-1, DIM_STATES))
        q_values = np.reshape(q_values, (-1, NUM_ACTIONS))
        return current_states, actions, q_values

    def evaluate(self, env, num_episodes, max_episode_length=None):
        """Test your agent with a provided environment.
        
        You shouldn't update your network parameters here. Also if you
        have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test.

        Basically run your policy on the environment and collect stats
        like cumulative reward, average episode length, etc.

        You can also call the render function here if you want to
        visually inspect your policy.
        """
        for i in range(num_episodes):
            total = 0
            state = np.zeros(4)
            tmp = 0
            prev_action = 0
            states = [state]
            state_ = -1
            while True:
                if max_episode_length and i > max_episode_length:
                    break
                if state_ == -1:
                    action = np.random.randint(6)
                else:
                    action = self.select_action(state_, net=self.net)
                next_state, reward, done, _ = env.step(action)
                if tmp < 6:
                    # next_state = self.pre.process_state_for_memory(next_state)
                    states.append(next_state)
                    tmp += 1
                if tmp >= 6:
                    # frames = states[-5:-1]
                    # frames2 = states[-4:]
                    # state_ = tf.concat([tf.expand_dims(i, 2) for i in frames], 2)
                    # next_state_ = tf.concat([tf.expand_dims(i, 2) for i in frames2], 2)
                    self.rl_mem.append(state, prev_action, reward, next_state, done)
                    states.append(state)
                    states = states[-5:]
                prev_action = action
                state = next_state
                total += reward
                if done:
                    break
            print('Episode {}, total reward is {}'.format(i, total))
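
The class docstring mentions that target_update_freq may select either a hard or a soft target-network update, but the utils.py helpers it refers to are not included here. A sketch of both schemes for Keras models (the function names and the tau default are assumptions):

def hard_update(target_model, online_model):
    # Hard update: overwrite the target weights with the online weights.
    target_model.set_weights(online_model.get_weights())

def soft_update(target_model, online_model, tau=0.001):
    # Soft update: target <- tau * online + (1 - tau) * target, per weight tensor.
    blended = [tau * w_on + (1.0 - tau) * w_tgt
               for w_on, w_tgt in zip(online_model.get_weights(),
                                      target_model.get_weights())]
    target_model.set_weights(blended)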
Example #11
                                           args.num_atoms, -10, 10)
    else:
        if args.dueling:
            q_net_builder = model.build_dueling_network
        else:
            q_net_builder = model.build_basic_network

        q_net = q_net_builder(args.num_frames, args.frame_size,
                              train_env.num_actions, args.noisy_net,
                              args.sigma0, args.net)

        q_net.cuda()
        agent = dqn.DQNAgent(q_net, args.double_dqn, train_env.num_actions)

    if args.noisy_net:
        train_policy = GreedyEpsilonPolicy(0, agent)
    else:
        train_policy = LinearDecayGreedyEpsilonPolicy(args.train_start_eps,
                                                      args.train_final_eps,
                                                      args.train_eps_num_steps,
                                                      agent)

    eval_policy = GreedyEpsilonPolicy(args.eval_eps, agent)
    replay_memory = ReplayMemory(args.replay_buffer_size)
    replay_memory.burn_in(train_env, agent, args.burn_in_frames)

    evaluator = lambda logger: evaluate(eval_env, eval_policy, 10, logger)
    train(agent, train_env, train_policy, replay_memory, args.gamma,
          args.batch_size, args.num_iters, args.frames_per_update,
          args.frames_per_sync, args.frames_per_eval, evaluator, args.output)
Example #12
            args.net)

        q_net.cuda()
        agent = dqn.DQNAgent(q_net, args.double_dqn, train_env.num_actions)

    if args.noisy_net:
        train_policy = GreedyEpsilonPolicy(0, agent)
    else:
        train_policy = LinearDecayGreedyEpsilonPolicy(
            args.train_start_eps,
            args.train_final_eps,
            args.train_eps_num_steps,
            agent)

    eval_policy = GreedyEpsilonPolicy(args.eval_eps, agent)
    replay_memory = ReplayMemory(args.replay_buffer_size)
    replay_memory.burn_in(train_env, agent, args.burn_in_frames)

    evaluator = lambda logger: evaluate(eval_env, eval_policy, 10, logger)
    train(agent,
          train_env,
          train_policy,
          replay_memory,
          args.gamma,
          args.batch_size,
          args.num_iters,
          args.frames_per_update,
          args.frames_per_sync,
          args.frames_per_eval,
          evaluator,
          args.output)