Example no. 1
    def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
        super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
        self.agent_name = 'DQN' + str(self.agent_no)
        self.memory = ReplayMemory()

        self.network = DeepQNetwork(self.obs_space[0], self.action_space)

        if self.mode == 'play':
            self.network.load_params(pre_trained_model)
            self.network.eval()

        elif self.mode == 'train':

            self.eval_network = DeepQNetwork(self.obs_space[0],
                                             self.action_space)
            self.eval_network.eval()

            if pre_trained_model:
                self.eval_network.load_params(pre_trained_model)

            self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
            self.loss_func = SmoothL1Loss()
        else:
            raise ValueError(
                'Please set a valid mode for the agent (play or train)')
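
Note: every snippet in this listing assumes a ReplayMemory class defined elsewhere in its own repository. As a rough, hypothetical reference only (the real implementations above differ in capacity handling, transition layout, and prioritization), a minimal uniform-sampling buffer exposing the push / sample / __len__ interface they rely on could look like this:

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayMemory:
    """Minimal uniform-sampling replay buffer (interface sketch, not the code above)."""

    def __init__(self, capacity=100000):
        # deque drops the oldest transition once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition."""
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        """Return batch_size transitions sampled uniformly at random."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
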
Example no. 2
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads,self.grad_norms = tf.clip_by_norm(self.gradients,40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())
Example no. 3
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            The policy network used to select actions.
        target_net : DQN
            The target network, initialized from the policy network's weights.
        durability : int
            [description]
        optimizer : [type]
            [description]
        name : str
            The name of agent
        constants: Constants
            The hyper-parameters from Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False
Example no. 4
    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
            is_double_q (bool): whether to use Double DQN updates
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0

        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
Example no. 5
    def __init__(self,
                 load_checkpoint,
                 n_states,
                 n_actions,
                 checkpoint_file,
                 mem_size=10**6,
                 batch_size=64,
                 n_hid1=400,
                 n_hid2=300,
                 alpha=1e-4,
                 beta=1e-3,
                 gamma=0.99,
                 tau=0.99):
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  alpha,
                                  checkpoint_file,
                                  name='actor')
        self.critic = CriticNetwork(n_states,
                                    n_actions,
                                    n_hid1,
                                    n_hid2,
                                    beta,
                                    checkpoint_file,
                                    name='critic')

        self.actor_target = ActorNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         alpha,
                                         checkpoint_file,
                                         name='actor_target')
        self.critic_target = CriticNetwork(n_states,
                                           n_actions,
                                           n_hid1,
                                           n_hid2,
                                           beta,
                                           checkpoint_file,
                                           name='critic_target')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.memory = ReplayMemory(mem_size, n_states, n_actions)
        self.update_network_parameters_phil(tau=1)
        if load_checkpoint:
            self.actor.eval()
        self.load_checkpoint = load_checkpoint
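
Note: Example 5 draws exploration noise from an Ornstein-Uhlenbeck process (OUActionNoise; a similar OUNoise appears in Example 8), whose implementation is not shown. A minimal sketch of such a process, assuming only the mu=np.zeros(n_actions) constructor call used above (the sigma, theta, and dt defaults are assumptions):

import numpy as np

class OUActionNoise:
    """Temporally correlated noise for continuous-action exploration (sketch)."""

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
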
Example no. 6
 def __init__(self, env, name, s_size, a_size, trainer, model_path,
              global_episodes):
     self.name = "worker_" + str(name)
     self.number = name
     self.model_path = model_path
     self.trainer = trainer
     self.global_episodes = global_episodes
     self.increment = self.global_episodes.assign_add(1)
     self.episode_rewards = []
     self.episode_lengths = []
     self.episode_mean_values = []
     # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
     self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
     self.update_local_ops = update_target_graph('global', self.name)
     self.env = env
     self.replaymemory = ReplayMemory(max_memory)
Example no. 7
 def __init__(self, num_states, num_actions, Double, Dueling, PER):
     self.num_actions = num_actions  # number of possible actions (2)
     self.Double = Double
     self.Dueling = Dueling
     self.PER = PER

     # create a memory object for storing transitions
     self.memory = ReplayMemory(CAPACITY)

     # build the neural networks
     n_in, n_mid, n_out = num_states, 32, num_actions
     self.main_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
     self.target_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
     print(self.main_q_network)  # print the network architecture

     # choose the optimization method
     self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)

     # PER - create a memory object for storing TD errors
     if self.PER:
         self.td_error_memory = TDerrorMemory(CAPACITY)
Example no. 8
    def __init__(self, dim):
        self.critic_path = cst.CN_CKPT_PATH
        self.actor_path = cst.AN_CKPT_PATH
        self.replaymemory_path = cst.RM_PATH

        self.dim_body = dim[0]
        self.dim_sensor = dim[1]
        self.dim_state = dim[0] + dim[1] * 3
        self.dim_action = dim[2]

        self.sess = tf.InteractiveSession()
        self.act_lr = cst.ACT_LEARNING_RATE
        self.cri_lr = cst.CRI_LEARNING_RATE
        self.tau = cst.TAU
        self.batch_size = cst.BATCH_SIZE
        self.gamma = cst.REWARD_DECAY

        self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                    self.act_lr, self.tau, self.batch_size)
        self.criticNN = CriticNetwork(self.sess, self.dim_state,
                                      self.dim_action, self.cri_lr, self.tau,
                                      self.gamma,
                                      self.actorNN.get_num_trainable_vars())

        self.sess.run(tf.global_variables_initializer())

        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

        self.rm = ReplayMemory('DDPG')

        self.agent_count = cst.AGENT_COUNT
        self.exploration_rate = cst.EXPLORATION_RATE
        self.epsilon = cst.CRITIC_EPSILON
        self.LOSS_ITERATION = cst.LOSS_ITERATION

        self.expl_noise = OUNoise(self.dim_action)

        self.expl = False
        self.expl_decay = cst.EXPLORATION_DECAY
Example no. 9
 def __init__(self,env,name,s_size,a_size,trainer,model_path,global_episodes):
     self.name = "worker_" + str(name)
     self.number = name
     self.model_path = model_path
     self.trainer = trainer
     self.global_episodes = global_episodes
     self.increment = self.global_episodes.assign_add(1)
     self.episode_rewards = []
     self.episode_lengths = []
     self.episode_mean_values = []
     # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
     self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
     self.update_local_ops = update_target_graph('global', self.name)
     self.env = env
     self.replaymemory = ReplayMemory(max_memory)
Example no. 10
def main(game, episodes, training_mode=False, log=False, no_ops=30):
    env = gym.make(game)
    num_actions = env.action_space.n
    dqn = DeepQNetwork(num_actions, (4, 84, 84))
    replay = ReplayMemory(100000)
    obs = env.reset()
    h, w, c = obs.shape
    phi = Phi(4, 84, 84, c, h, w)
    agent = Agent(replay, dqn, training_mode=training_mode)
    stats = Stats('results/results.csv')

    for i_episode in range(episodes):
        env.reset()

        for i in range(random.randint(1, no_ops)):
            observation, _, _, _ = env.step(0)
            pre_state = phi.add(observation)

        game_score = 0
        done = False
        t = 0

        while not done:
            t += 1
            env.render()
            action = agent.get_action(pre_state)
            observation, reward, done, _ = env.step(action)
            post_state = phi.add(observation)

            if training_mode:
                agent.update_replay_memory(pre_state, action, reward,
                                           post_state, done)
                if agent.time_step > agent.replay_start_size:
                    stats.log_time_step(agent.get_loss())

            pre_state = post_state
            game_score += reward

        print("Episode {} finished after {} time steps with score {}".format(
            i_episode, t, game_score))
        phi.reset()
        if agent.time_step > agent.replay_start_size:
            stats.log_game(game_score, t)

    stats.close()

    if log:
        dqn.save_model('results/model_weights.hdf5')
Example no. 11
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus one other action, so 2 actions

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))

    agent.save('./model_dir')
Example no. 12
EPISODES = 500
START_RANDOM = False
MAX_EPISODE_COUNTER = 3600 * 24 * 2.0 / PERIOD
ACTION_DIM = 1
STATE_DIM = 6
ACTION_MAX = 1.0
MAX_BUFFER = 100000
MAX_TOTAL_REWARD = 300
EPISODE_PLOT = 25

# -------------------------------------------- #
# LOAD USEFUL CLASSES.
# -------------------------------------------- #

# Load the memory.
memory = ReplayMemory(MAX_BUFFER)

# Load the environment.
env = Environment(FILENAME, QUOTE_QTY, TRADE_QTY)

# Load the trainer.
trainer = Trainer(STATE_DIM, ACTION_DIM, ACTION_MAX, memory)

# Load the window.
window = Window(LOOK_BACK)
window.add_norm("#t", method="log_change", ref="close_price_#t")

# Load the tensorboard writer.
writer = SummaryWriter("tensorboard/runs")

# -------------------------------------------- #
Example no. 13
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            print(
                'Error! This version is for playing Breakout and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
Example no. 14
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 update_actor_interval=2,
                 warmup=1000,
                 mem_size=10**6,
                 batch_size=100,
                 n_hid1=400,
                 n_hid2=300,
                 lr_alpha=1e-3,
                 lr_beta=1e-3,
                 gamma=0.99,
                 tau=5e-3,
                 noise_mean=0,
                 noise_sigma=0.1):

        self.load_checkpoint = load_checkpoint
        self.checkpoint_file = checkpoint_file
        # needed for clamping in the learn function
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.n_actions = n_actions
        # to keep track of how often we call "learn" function, for the actor network
        self.learn_step_counter = 0
        # to handle countdown to the end of the warmup period, incremented every time we call an action
        self.time_step = 0
        self.update_actor_interval = update_actor_interval
        self.warmup = warmup
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_mean = noise_mean
        self.noise_sigma = noise_sigma

        self.actor = TD3ActorNetwork(n_states,
                                     n_actions,
                                     n_hid1,
                                     n_hid2,
                                     lr_alpha,
                                     checkpoint_file,
                                     name='actor')
        self.target_actor = TD3ActorNetwork(n_states,
                                            n_actions,
                                            n_hid1,
                                            n_hid2,
                                            lr_alpha,
                                            checkpoint_file,
                                            name='target_actor')

        self.critic_1 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_1')
        self.critic_2 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_2')
        self.target_critic_1 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_1')
        self.target_critic_2 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_2')

        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.actor, self.target_actor, tau=1)
        self.update_network_parameters(self.critic_1,
                                       self.target_critic_1,
                                       tau=1)
        self.update_network_parameters(self.critic_2,
                                       self.target_critic_2,
                                       tau=1)
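
Note: update_network_parameters is called above but its body is not shown. Based only on the comment that tau=1 performs an exact copy, a plausible free-function sketch of the usual soft (Polyak) target update it refers to, assuming both arguments are torch.nn.Module instances, is:

import torch

def update_network_parameters(source, target, tau):
    """Soft update: target <- tau * source + (1 - tau) * target.
    With tau=1 this copies the source network exactly, as the comment above notes."""
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)
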
Example no. 15
image_dimensions = 210 * 160 * 3
num_episodes = 50
target_episode_update = 5
action_threshold = 250
train_batch_size = 64

GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
steps_done = 0
n_actions = env.action_space.n
screen_height = 210
screen_width = 160

memory = ReplayMemory(10000)

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())


def optimize_model():
    if len(memory) < train_batch_size:
        return

    transitions = memory.sample(train_batch_size)
    print('Training on:', len(transitions))
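
Note: Example 15's optimize_model is truncated after sampling the batch. For reference, a standard PyTorch-style DQN optimization step in the same spirit could continue roughly as sketched below; this assumes tutorial-style Transition namedtuples ('state', 'action', 'next_state', 'reward') stored in memory and is not the original code:

from collections import namedtuple
import torch
import torch.nn.functional as F

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

def optimize_model_sketch(memory, policy_net, target_net, optimizer,
                          batch_size, gamma, device):
    """Sketch of one full DQN optimization step (hypothetical, for illustration)."""
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # Mask of non-terminal next states
    non_final_mask = torch.tensor(tuple(s is not None for s in batch.next_state),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network, zero for terminal states
    next_state_values = torch.zeros(batch_size, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Huber loss between predicted and expected Q values
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
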
Example no. 16
class SingleAgent(object):
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or an action proposed
        by the neural network, depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent noops
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env.sample_action()
            action = LongTensor([[action]])

        return action


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask]= next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False  # game-end indicator variable
            score = 0
            # Reset game
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]

                screen, reward, _, done, _ = self.env.step(action, mode='play')
                score += reward

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                self.state = state
            print('Game ({}/{}) - Final score {}: {}'.format(i+1, n, self.game, score))
        self.env.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history
        reward_history = []
        reward_clamped_history = []

        # Number of actions to sample from
        n_actions = self.env.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen = self.env.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = gray2pytorch(screen)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # game-end indicator variable
            done = False

            # reset score with initial lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]

                screen, reward, reward_clamped, done, _ = self.env.step(action)
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                if mode=='evaluation':
                    #   save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0



            # Print current result
            print('Episode: {:6}/{:6} |  '.format(i_episode, n_games),
                  'score: ({:4}/{:4})'.format(total_reward_clamped,total_reward))

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward = np.sum(reward_history)/len(reward_history)
        avg_reward_clamped = np.sum(reward_clamped_history)/len(reward_clamped_history)

        # Print final result
        print('\n\n=============================================\n' +
              'avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))
        with open(sub_dir + mode + '_reward.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)
        with open(sub_dir + mode + '_reward_clamped.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.txt'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game:                       ' + str(self.game) + '\n' +
                     'Learning rate:                      ' + str(self.learning_rate) + '\n' +
                     'Batch size:                         ' + str(self.batch_size) + '\n' +
                     'Memory size(replay):                ' + str(self.mem_size) + '\n' +
                     'Pretrained:                         ' + str(self.pretrained_model) + '\n' +
                     'Started training after k frames:    ' + str(self.start_train_after) + '\n' +
                     'Optimized after k frames:           ' + str(self.optimize_each_k) + '\n' +
                     'Target net update after k frame:    ' + str(self.update_target_net_each_k_steps) + '\n\n' +
                     '------------------------------------------------------' +
                     '--------------------------------------------------\n')

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            done = False  # game-end indicator variable
            # reset score with initial lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            # Loop over one game
            while not done:
                self.steps +=1

                action = self.select_action(state)

                # perform selected action on game
                screen, reward, reward_clamped, done, _ = self.env.step(action[0,0])
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                # Wrap into tensor
                reward = torch.Tensor([reward_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity, action, reward, done)

                #	only optimize each kth step
                if self.steps%self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print('Episode: {:6} |  '.format(i_episode),
                  'steps {:8} |  '.format(self.steps),
                  'loss: {:.2E} |  '.format(loss if loss else 0),
                  'score: ({:4}/{:4}) |  '.format(total_reward_clamped,total_reward),
                  'best score: ({:4}/{:4}) |  '.format(best_score_clamped,best_score),
                  'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('----------------------------------------------------------------'
                      '-----------------------------------------------------------------',
                      '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                      'steps: {:8}   '.format(self.steps),
                      'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(log_avg_episodes, avg_score_clamped,avg_score),
                      'best score: ({:4}/{:4})'.format(best_score_clamped, best_score),
                      '\n---------------------------------------------------------------'
                      '------------------------------------------------------------------')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('Episode: {:6} |  '.format(i_episode) +
                             'steps: {:8} |  '.format(self.steps) +
                             'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(log_avg_episodes, avg_score_clamped,avg_score) +
                             'best score: ({:4}/{:4})\n'.format(best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + self.game + '-' + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')
Example no. 17
class Agent(object):
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            print(
                'Error! This version is for playing Breakout and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500

    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or an action proposed
        by the neural network, depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (
                EPSILON_START - EPSILON_END) / EPSILON_DECAY
        else:
            epsilon = EPSILON_END

        if epsilon < random() or mode == 'play':
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1, 1)

            # Prevent noops
            if action[0, 0] == 0:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0, 0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:

            # Random action
            action = self.env_space.sample_action()
            action = LongTensor([[action]])

        return action

    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition

        batch = self.replay.sampleTransition(self.batch_size)
        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(
            list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(
            torch.cat([ns for ns in batch.next_state if ns is not None]),
            volatile=True
        )  # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(
            self.batch_size).type(FloatTensor),
                                     volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(
            1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates % self.update_target_net_each_k_steps == 0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]

    def play(self):
        """
        Play a game with the current net and render it
        """
        done = False  # game-end indicator variable
        score = 0
        # Reset game
        screen_break = self.env_break.reset()
        screen_space = self.env_space.reset()

        # list of k last frames
        ############old version:
        #breakout part
        #last_k_frames_break = []
        #for j in range(self.num_stored_frames):
        #    last_k_frames_break.append(None)
        #    last_k_frames_break[j] = gray2pytorch(screen_break)
        #spaceinvaders part
        #last_k_frames_space = []
        #for j in range(self.num_stored_frames):
        #    last_k_frames_space.append(None)
        #    last_k_frames_space[j] = gray2pytorch(screen_space)
        #################
        last_k_frames = []
        for j in range(self.num_stored_frames):
            last_k_frames.append(None)
            last_k_frames[j] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

        # frame is saved as ByteTensor -> convert to gray value between 0 and 1
        ############old version:
        #state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0
        #state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0
        #state = torch.cat((state_break,state_space), 2)
        state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0

        while not done:
            action = self.select_action(state, mode='play')

            # Render game
            self.env_break.game.render(mode='human')
            self.env_space.game.render(mode='human')

            # maps actions from space invaders to breakout (shot-left to left, shot-right to right)
            if action[0, 0] == 4:
                action_break = 2
            elif action[0, 0] == 5:
                action_break = 3
            else:
                action_break = action[0, 0]

            screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                action_break, mode='play')
            screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                action[0, 0], mode='play')
            score += reward_break
            score += reward_space
            done = done_break or done_space

            ############old
            #   save latest frame, discard oldest
            #for j in range(self.num_stored_frames - 1):
            #    last_k_frames_break[j] = last_k_frames_break[j + 1]
            #    last_k_frames_space[j] = last_k_frames_space[j + 1]
            #last_k_frames_break[self.num_stored_frames - 1] = gray2pytorch(screen_break)
            #last_k_frames_space[self.num_stored_frames - 1] = gray2pytorch(screen_space)

            # convert frames to range 0 to 1 again
            #state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0
            #state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0
            #state = torch.cat((state_break, state_space), 2)
            #############old_end

            #   save latest frame, discard oldest
            for j in range(self.num_stored_frames - 1):
                last_k_frames[j] = last_k_frames[j + 1]
            last_k_frames[self.num_stored_frames - 1] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

            # convert frames to range 0 to 1 again
            state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0
            done = done_break or done_space

        print('Final score:', score)
        # close both environments
        self.env_break.game.close()
        self.env_space.game.close()

    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime(
            '%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.log'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(
                datetime.now().strftime('%Y%m%d_%H%M%S') + '\n' +
                'Trained game:    ' + str(self.game) + '\n' +
                'Learning rate:    ' + str(self.learning_rate) + '\n' +
                'Batch size:    ' + str(self.batch_size) + '\n' +
                'Pretrained:    ' + str(self.pretrained_model) + '\n' +
                'Started training after k frames:    ' +
                str(self.start_train_after) + '\n' +
                'Optimized after k frames:    ' + str(self.optimize_each_k) +
                '\n' + 'Target net update after k frame:    ' +
                str(self.update_target_net_each_k_steps) + '\n\n' +
                '--------------------------------------------------------------------------------\n'
            )

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1, num_episodes):

            # reset game at the start of each episode
            screen_break = self.env_break.reset()
            screen_space = self.env_space.reset()

            # list of k last frames
            last_k_frames_break = []
            last_k_frames_space = []
            for j in range(self.num_stored_frames):
                last_k_frames_break.append(None)
                last_k_frames_space.append(None)
                last_k_frames_break[j] = gray2pytorch(screen_break)
                last_k_frames_space[j] = gray2pytorch(screen_space)

            if i_episode == 1:
                frames_both = torch.cat((last_k_frames_break[0].cpu(),
                                         last_k_frames_space[0].cpu()), 2)
                #self.replay.pushFrame(last_k_frames_break[0].cpu())
                #self.replay.pushFrame(last_k_frames_space[0].cpu())
                self.replay.pushFrame(frames_both)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state_break = torch.cat(last_k_frames_break,
                                    1).type(FloatTensor) / 255.0
            state_space = torch.cat(last_k_frames_space,
                                    1).type(FloatTensor) / 255.0
            state = torch.cat((state_break, state_space), 2)

            done = False  # game end indicator variable
            loss = None  # last optimization loss, used for end-of-episode logging
            # reset score with initial lives, because every lost life adds -1
            total_reward = self.env_break.get_lives()
            total_reward += self.env_space.get_lives()
            total_reward_clamped = self.env_break.get_lives()
            total_reward_clamped += self.env_space.get_lives()

            # Loop over one game
            while not done:
                self.steps += 1
                action = self.select_action(state)
                # perform selected action on game
                # screen, reward, done, info = self.env.step(action[0,0])#envTest.step(action[0,0])
                # map actions from Space Invaders to Breakout (shoot+left -> left, shoot+right -> right)

                screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                    action[0, 0])

                action_break = action[0, 0]
                if action_break > 3:  #shoot+right/left --> right/left
                    action_break = action_break - 2
                screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                    action_break)

                total_reward += int(reward_break)
                total_reward += int(reward_space)
                done = done_break or done_space

                #   clamp rewards
                reward_break = torch.Tensor([np.clip(reward_break, -1, 1)])
                reward_space = torch.Tensor([np.clip(reward_space, -1, 1)])
                reward = reward_break + reward_space
                total_reward_clamped += int(reward_break[0])
                total_reward_clamped += int(reward_space[0])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames - 1):
                    last_k_frames_break[j] = last_k_frames_break[j + 1]
                    last_k_frames_space[j] = last_k_frames_space[j + 1]
                last_k_frames_break[self.num_stored_frames -
                                    1] = gray2pytorch(screen_break)
                last_k_frames_space[self.num_stored_frames -
                                    1] = gray2pytorch(screen_space)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state_break = torch.cat(last_k_frames_break,
                                                 1).type(FloatTensor) / 255.0
                    next_state_space = torch.cat(last_k_frames_space,
                                                 1).type(FloatTensor) / 255.0
                    next_state = torch.cat(
                        (next_state_break, next_state_space), 2)
                else:
                    next_state = None

                # Store transition: the two games' frames are concatenated into one
                # frame, while action/reward/done are stored as a single transition
                frame_break = last_k_frames_break[self.num_stored_frames -
                                                  1].cpu()
                frame_space = last_k_frames_space[self.num_stored_frames -
                                                  1].cpu()
                frame_both = torch.cat((frame_break, frame_space), 2)
                self.replay.pushFrame(frame_both)
                self.replay.pushTransition(
                    (self.replay.getCurrentIndex() - 1) % self.replay.capacity,
                    action, reward, done)

                # only optimize every k-th step
                if self.steps % self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print(
                'Episode: {:6} |  '.format(i_episode),
                'steps {:8} |  '.format(self.steps),
                'loss: {:.2E} |  '.format(loss if loss else 0),
                'score: ({:4}/{:4}) |  '.format(total_reward_clamped,
                                                total_reward),
                'best score: ({:4}/{:4}) |  '.format(best_score_clamped,
                                                     best_score),
                'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode != 0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print(
                    '----------------------------------------------------------------'
                    '-----------------------------------------------------------------',
                    '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                    'steps: {:8}   '.format(self.steps),
                    'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(
                        log_avg_episodes, avg_score_clamped, avg_score),
                    'best score: ({:4}/{:4})'.format(best_score_clamped,
                                                     best_score),
                    '\n---------------------------------------------------------------'
                    '------------------------------------------------------------------'
                )
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write(
                        'Episode: {:6} |  '.format(i_episode) +
                        'steps: {:8} |  '.format(self.steps) +
                        'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(
                            log_avg_episodes, avg_score_clamped, avg_score) +
                        'best score: ({:4}/{:4})\n'.format(
                            best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) +
                             '...\n')
                self.target_net.save(sub_dir + self.game + '-' +
                                     str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')
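
The agents in this listing rely on a ReplayMemory with pushFrame, pushTransition, getCurrentIndex, sampleTransition, capacity and __len__, but its implementation is never shown. Below is a minimal, assumption-based sketch of that interface (a ring buffer of single frames, with stacked states rebuilt at sampling time); it is not the original implementation.

from collections import namedtuple
import random
import torch

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class MinimalReplayMemory(object):
    def __init__(self, capacity=100000, num_history_frames=4):
        self.capacity = capacity
        self.num_history_frames = num_history_frames
        self.frames = [None] * capacity   # single (1, 1, H, W) ByteTensor frames
        self.transitions = {}             # frame index -> (action, reward, done)
        self.index = 0                    # next write position

    def pushFrame(self, frame):
        self.frames[self.index] = frame
        self.index = (self.index + 1) % self.capacity

    def getCurrentIndex(self):
        return self.index

    def pushTransition(self, frame_index, action, reward, done):
        self.transitions[frame_index] = (action, reward, done)

    def __len__(self):
        return len(self.transitions)

    def _stack(self, last_index):
        # stack the num_history_frames frames ending at last_index along the channel dim
        idxs = [(last_index - k) % self.capacity
                for k in reversed(range(self.num_history_frames))]
        return torch.cat([self.frames[i] for i in idxs], dim=1).float() / 255.0

    def sampleTransition(self, batch_size):
        # Note: buffer wrap-around and episode boundaries are ignored in this sketch.
        keys = random.sample(list(self.transitions.keys()), batch_size)
        states, actions, rewards, next_states = [], [], [], []
        for k in keys:
            action, reward, done = self.transitions[k]
            states.append(self._stack(k))
            actions.append(action)
            rewards.append(reward)
            next_states.append(None if done else self._stack((k + 1) % self.capacity))
        return Transition(states, actions, rewards, next_states)
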
Esempio n. 18
0
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game1: str, name of the first game
        - game2: str, name of the second game
        - mem_size: int, capacity of the replay memory
        - state_buffer_size: int, number of recent frames used as network input
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str, path to a pretrained full model
        - pretrained_subnet1: str, path to the pretrained model of subnet 1
        - pretrained_subnet2: str, path to the pretrained model of subnet 2
        - frameskip: int, number of frames to skip per step
        - frozen: bool, freeze the pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
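
The helper gray2pytorch is used throughout these examples but never defined here. A minimal sketch, under the assumption that screens arrive as 2-D uint8 grayscale numpy arrays and should become ByteTensors with batch and channel dimensions:

import numpy as np
import torch

def gray2pytorch_sketch(screen):
    # (H, W) uint8 array -> ByteTensor of shape (1, 1, H, W), values kept in 0..255
    return torch.from_numpy(np.ascontiguousarray(screen)).unsqueeze(0).unsqueeze(0)
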
Esempio n. 19
0
class DoubleAgent(object):
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game1: str, name of the first game
        - game2: str, name of the second game
        - mem_size: int, capacity of the replay memory
        - state_buffer_size: int, number of recent frames used as network input
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str, path to a pretrained full model
        - pretrained_subnet1: str, path to the pretrained model of subnet 1
        - pretrained_subnet2: str, path to the pretrained model of subnet 2
        - frameskip: int, number of frames to skip per step
        - frozen: bool, freeze the pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select either a random action from the action space or the action
        proposed by the neural network, depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        - action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent noops
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env2.sample_action()
            action = LongTensor([[action]])

        return action
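
A quick check of the linear epsilon schedule used above. Note that the original expression keeps decreasing past EPSILON_DECAY steps; the clamped variant below is an assumption, not taken from the original code:

def linear_epsilon(steps, start=1.0, end=0.1, decay=1000000):
    # clamp at `end` so epsilon never drops below the final exploration rate
    return max(end, start - steps * (start - end) / decay)

# linear_epsilon(0) == 1.0, linear_epsilon(500000) == 0.55, linear_epsilon(2000000) == 0.1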


    def map_action(self, action):
        """
        Maps an action from the game with more actions
        to the game with fewer actions

        Inputs:
        - action: int
        Returns:
        - action: int
        """
        # Map SpaceInvaders on Breakout
        if self.game1=='Breakout' and self.game2=='SpaceInvaders':
            if action>3: # shoot+right/left --> right/left
                return action-2

        # Map Assault on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Assault':
            if action!=0: # all actions except 2nd idle
                return action-1

        # Map Phoenix on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 1
            if action>4: # shoot+right/left --> shoot+right/left
                return action-1

        # Map Phoenix on Assault
        if self.game1=='Assault' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 2
            if 1<= action and action<=3: # shot/right/left --> shot/right/left
                return action+1

        # No mapping necessary
        return action
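
A small sanity check of the SpaceInvaders-to-Breakout branch above; the action-id labels are taken from the comments in the code and are otherwise an assumption:

# SpaceInvaders ids: 0 noop, 1 fire, 2 right, 3 left, 4 fire+right, 5 fire+left
space_invaders_actions = [0, 1, 2, 3, 4, 5]
mapped_to_breakout = [a - 2 if a > 3 else a for a in space_invaders_actions]
assert mapped_to_breakout == [0, 1, 2, 3, 2, 3]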


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float, or None while the replay memory is still being filled
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask]= next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()


        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]
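
The update above is the standard DQN target y = r + gamma * max_a' Q_target(s', a') (y = r for terminal states) combined with a Huber loss. A tiny numeric sketch with made-up values:

import torch
import torch.nn.functional as F

q_sa = torch.tensor([1.2, 0.4])        # Q(s, a) from the online net
next_max = torch.tensor([1.0, 0.0])    # max_a' Q_target(s', a'); zero for terminal s'
reward = torch.tensor([1.0, -1.0])
target = reward + 0.99 * next_max      # -> tensor([1.9900, -1.0000])
loss = F.smooth_l1_loss(q_sa, target)  # scalar Huber loss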


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable

            # Score counter
            total_reward_game1 = 0
            total_reward_game2 = 0
            total_reward = 0

            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                # perform selected action on game
                screen1, reward1, _, done1, _ = self.env1.step(action1, mode='play')
                screen2, reward2, _, done2, _ = self.env2.step(action2, mode='play')

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)

                # save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2
            print('Final scores Game ({}/{}): {}: {}    '.format(i+1, n, self.game1, total_reward_game1) +
                  '{}: {}    '.format(self.game2, total_reward_game2) +
                  'total: {}'.format(total_reward))
        self.env1.game.close()
        self.env2.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game1 + '+' + self.game2 + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history total
        reward_history = []
        reward_clamped_history = []
        # Store history game 1
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Store history game 2
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Number of actions to sample from
        n_actions = self.env2.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score with initial lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                if mode=='evaluation':
                    # save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2

            # Print current result
            print('Episode: {:6}/{:6} |   '.format(i_episode, n_games) +
                  'score total: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped,total_reward) +
                  'score game1: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped_game1,total_reward_game1) +
                  'score game2: ({:6.1f}/{:7.1f})'.format(total_reward_clamped_game2,total_reward_game2))

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward_total = np.sum(reward_history) / len(reward_history)
        avg_reward_total_clamped = np.sum(reward_clamped_history) / len(reward_clamped_history)
        avg_reward_game1 = np.sum(reward_history_game1) / len(reward_history_game1)
        avg_reward_game1_clamped = np.sum(reward_clamped_history_game1) / len(reward_clamped_history_game1)
        avg_reward_game2 = np.sum(reward_history_game2) / len(reward_history_game2)
        avg_reward_game2_clamped = np.sum(reward_clamped_history_game2) / len(reward_clamped_history_game2)

        # Print final result
        print('\n\n===========================================\n' +
              'avg score after {:6} episodes:\n'.format(n_games) +
              'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
              'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
              'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes:\n'.format(n_games) +
                     'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
                     'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
                     'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Dump reward
        with open(sub_dir + mode + '_reward_game1.pickle', 'wb') as fp:
            pickle.dump(reward_history_game1, fp)
        with open(sub_dir + mode + '_reward_game2.pickle', 'wb') as fp:
            pickle.dump(reward_history_game2, fp)
        with open(sub_dir + mode + '_reward_total.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)

        with open(sub_dir + mode + '_reward_clamped_game1.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history_game1, fp)
        with open(sub_dir + mode + '_reward_clamped_game2.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history_game2, fp)
        with open(sub_dir + mode + '_reward_clamped_total.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game1 + '+' + self.game2 + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + 'train.txt'
        reward_file = sub_dir + 'reward.pickle'
        reward_file_game1 = sub_dir + 'reward_game1.pickle'
        reward_file_game2 = sub_dir + 'reward_game2.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        reward_clamped_file_game1 = sub_dir + 'reward_clamped_game1.pickle'
        reward_clamped_file_game2 = sub_dir + 'reward_clamped_game2.pickle'
        log_avg_episodes = 50

        # Total scores
        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        reward_history = []
        reward_clamped_history = []
        # Scores game 1
        avg_score_game1 = 0
        avg_score_clamped_game1 = 0
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Scores game 2
        avg_score_game2 = 0
        avg_score_clamped_game2 = 0
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game (first):               {}\n'.format(self.game1) +
                     'Trained game (second):              {}\n'.format(self.game2) +
                     'Learning rate:                      {:.2E}\n'.format(self.learning_rate) +
                     'Batch size:                         {:d}\n'.format(self.batch_size) +
                     'Memory size(replay):                {:d}\n'.format(self.mem_size) +
                     'Pretrained:                         {}\n'.format(self.pretrained_model) +
                     'Pretrained subnet 1:                {}\n'.format(self.pretrained_subnet1) +
                     'Pretrained subnet 2:                {}\n'.format(self.pretrained_subnet2) +
                     'Started training after k frames:    {:d}\n'.format(self.start_train_after) +
                     'Optimized after k frames:           {:d}\n'.format(self.optimize_each_k) +
                     'Target net update after k frame:    {:d}\n\n'.format(self.update_target_net_each_k_steps) +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                     'Episode | Steps     | ' +
                     '{:3} games avg total  | '.format(log_avg_episodes) +
                     '{:3} games avg game1  | '.format(log_avg_episodes) +
                     '{:3} games avg game2  | '.format(log_avg_episodes) +
                     'best score total \n' +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n')

        print('Started training...\nLogging to {}\n'.format(sub_dir) +
              'Episode | Steps     |   score total        |   score game 1       |   ' +
              'score game 2       | best score total')

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),
                                              gray2pytorch(screen2)), dim=1)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done1 = False
            done2 = False

            # reset score with initial lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            # Loop over one game
            while not done1 and not done2:
                self.steps +=1

                action = self.select_action(state)
                action1 = self.map_action(action[0,0])
                action2 = action[0,0]

                # perform selected action on game
                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                # Bake reward into tensor
                reward = torch.FloatTensor([reward1_clamped+reward2_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),
                                                                     gray2pytorch(screen2)), dim=1)

                # convert frames to range 0 to 1 again
                if not done1 and not done2:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity,
                                            action, reward, done1 or done2)

                # only optimize every k-th step
                if self.steps%self.optimize_each_k == 0:
                    self.optimize(net_updates)
                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done1 or done2:
                    break

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

            # Sum up for averages
            avg_score_clamped_game1 += total_reward_clamped_game1
            avg_score_clamped_game2 += total_reward_clamped_game2
            avg_score_clamped += total_reward_clamped
            avg_score_game1 += total_reward_game1
            avg_score_game2 += total_reward_game2
            avg_score += total_reward

            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            print('{:7} | '.format(i_episode) +
                  '{:9} |     '.format(self.steps) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped,total_reward) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped_game1,total_reward_game1) +
                  '({:6.1f}/{:7.1f}) |  '.format(total_reward_clamped_game2,total_reward_game2) +
                  '({:6.1f}/{:8.1f})'.format(best_score_clamped, best_score))

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped_game1 /= log_avg_episodes
                avg_score_clamped_game2 /= log_avg_episodes
                avg_score_clamped /= log_avg_episodes
                avg_score_game1 /= log_avg_episodes
                avg_score_game2 /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                      'Episode | Steps     | ' +
                      '{:3} games avg total  | '.format(log_avg_episodes) +
                      '{:3} games avg game1  | '.format(log_avg_episodes) +
                      '{:3} games avg game2  | '.format(log_avg_episodes) +
                      'best score total \n' +
                      '{:7} | '.format(i_episode) +
                      '{:9} |     '.format(self.steps) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                      '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                      '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score) +
                      '\nLogging to file...\n\n'
                      '--------+-----------+----------------------+------------' +
                      '----------+----------------------+--------------------\n' +
                      'Episode | Steps     |   score total        |   score game 1       |   ' +
                      'score game 2       | best score total')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('{:7} | '.format(i_episode) +
                             '{:9} |     '.format(self.steps) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                             '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                             '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score))
                # Dump reward
                with open(reward_file_game1, 'wb') as fp:
                    pickle.dump(reward_history_game1, fp)
                with open(reward_file_game2, 'wb') as fp:
                    pickle.dump(reward_history_game2, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)

                with open(reward_clamped_file_game1, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game1, fp)
                with open(reward_clamped_file_game2, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game2, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped_game1 = 0
                avg_score_clamped_game2 = 0
                avg_score_clamped = 0
                avg_score_game1 = 0
                avg_score_game2 = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + 'final.model')
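
A hedged usage sketch for the DoubleAgent above; game_name and dimensions are the lookup tables the constructor already relies on, and the game strings follow the pairs handled in map_action:

agent = DoubleAgent(game1='Breakout', game2='SpaceInvaders',
                    mem_size=1000000, batch_size=64, learning_rate=1e-5)
agent.play_stats(10, mode='random')   # random baseline for later comparison
agent.train()                         # logs and pickles go into a timestamped subdirectory
# agent.play(3)                       # render a few games with the trained net
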
Esempio n. 20
0
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 mem_size=10**6,
                 batch_size=256,
                 n_hid1=256,
                 n_hid2=256,
                 lr=3e-4,
                 gamma=0.99,
                 tau=5e-3,
                 reward_scale=2):

        self.load_checkpoint = load_checkpoint

        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reward_scale = reward_scale

        self.memory_counter = 0
        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  self.max_action,
                                  lr,
                                  checkpoint_file,
                                  name='_actor')
        self.critic_1 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic1')
        self.critic_2 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic2')

        self.value_net = ValueNetwork(n_states,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_value')
        self.target_value_net = ValueNetwork(n_states,
                                             n_hid1,
                                             n_hid2,
                                             lr,
                                             checkpoint_file,
                                             name='_value_target')

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.value_net,
                                       self.target_value_net,
                                       tau=1)
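
The update_network_parameters(..., tau=1) call above performs an exact copy of the value network into its target; with the default tau=5e-3 the target only trails the online network. A tiny numeric sketch of the Polyak average target <- tau*online + (1-tau)*target:

import torch

online_w = torch.tensor([1.0, 2.0])
target_w = torch.tensor([0.0, 0.0])
tau = 5e-3
target_w = tau * online_w + (1 - tau) * target_w   # -> tensor([0.0050, 0.0100])
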
Esempio n. 21
0
class Agent:
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 mem_size=10**6,
                 batch_size=256,
                 n_hid1=256,
                 n_hid2=256,
                 lr=3e-4,
                 gamma=0.99,
                 tau=5e-3,
                 reward_scale=2):

        self.load_checkpoint = load_checkpoint

        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reward_scale = reward_scale

        self.memory_counter = 0
        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  self.max_action,
                                  lr,
                                  checkpoint_file,
                                  name='_actor')
        self.critic_1 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic1')
        self.critic_2 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic2')

        self.value_net = ValueNetwork(n_states,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_value')
        self.target_value_net = ValueNetwork(n_states,
                                             n_hid1,
                                             n_hid2,
                                             lr,
                                             checkpoint_file,
                                             name='_value_target')

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.value_net,
                                       self.target_value_net,
                                       tau=1)
        # self.update_network_parameters_phil(tau=1)

    def store_transition(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # the device is the same for all networks, so self.actor.device works for every tensor here
        state_batch = torch.tensor(state_batch,
                                   dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch,
                                    dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch,
                                       dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def update_network_parameters(self, network, target_network, tau=None):
        # Polyak averaging: target <- tau*online + (1-tau)*target
        if tau is None:
            tau = self.tau
        for par, target_par in zip(network.parameters(),
                                   target_network.parameters()):
            target_par.data.copy_(tau * par.data + (1 - tau) * target_par.data)

    def choose_action(self, obs):
        obs = torch.tensor([obs], dtype=torch.float).to(self.actor.device)
        actions, _ = self.actor.sample_normal(obs, reparametrize=False)
        return actions.cpu().detach().numpy()[0]

    def learn_phil(self):
        if self.memory.mem_counter < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = torch.tensor(reward,
                              dtype=torch.float).to(self.critic_1.device)
        done = torch.tensor(done).to(self.critic_1.device)
        state_ = torch.tensor(new_state,
                              dtype=torch.float).to(self.critic_1.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic_1.device)
        action = torch.tensor(action,
                              dtype=torch.float).to(self.critic_1.device)

        value = self.value_net(state).view(-1)
        value_ = self.target_value_net(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparametrize=False)
        # actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value_net.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * (F.mse_loss(value, value_target))
        value_loss.backward(retain_graph=True)
        self.value_net.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparametrize=True)
        # actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.reward_scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()
        self.update_network_parameters(self.value_net, self.target_value_net,
                                       self.tau)
        # self.update_network_parameters_phil()

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions(
        )
        # state_batch, action_batch, reward_batch, new_state_batch, done_batch = \
        #     self.memory.sample_buffer(self.batch_size)
        #
        # reward_batch = torch.tensor(reward_batch, dtype=torch.float).to(self.critic_1.device)
        # done_batch = torch.tensor(done_batch).to(self.critic_1.device)
        # new_state_batch = torch.tensor(new_state_batch, dtype=torch.float).to(self.critic_1.device)
        # state_batch = torch.tensor(state_batch, dtype=torch.float).to(self.critic_1.device)
        # action_batch = torch.tensor(action_batch, dtype=torch.float).to(self.critic_1.device)
        '''Compute Value Network loss'''
        self.value_net.optimizer.zero_grad()
        val = self.value_net(state_batch).view(-1)
        val_ = self.target_value_net(new_state_batch).view(-1)
        val_[done_batch] = 0.0

        actions, log_probs = self.actor.sample_normal(state_batch,
                                                      reparametrize=False)
        log_probs = log_probs.view(-1)
        q1 = self.critic_1(state_batch, actions)
        q2 = self.critic_2(state_batch, actions)
        q = torch.min(q1, q2).view(-1)
        value_target = q - log_probs
        value_loss = 0.5 * F.mse_loss(val, value_target)

        value_loss.backward(retain_graph=True)
        self.value_net.optimizer.step()
        # val = val - q + log_prob
        '''Compute Actor loss'''
        self.actor.optimizer.zero_grad()
        # here we need to reparametrize and thus use rsample to make the distribution differentiable
        # because the log prob of the chosen action will be part of our loss.
        actions, log_probs = self.actor.sample_normal(state_batch,
                                                      reparametrize=True)
        log_probs = log_probs.view(-1)
        q1 = self.critic_1(state_batch, actions)
        q2 = self.critic_2(state_batch, actions)
        q = torch.min(q1, q2).view(-1)
        actor_loss = log_probs - q
        actor_loss = torch.mean(actor_loss)

        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()
        '''Compute Critic loss'''
        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        val_ = self.target_value_net(new_state_batch).view(
            -1)  # value for the critic update
        val_[done_batch] = 0.0
        q_hat = self.reward_scale * reward_batch + self.gamma * val_
        q1_old_policy = self.critic_1(state_batch, action_batch).view(-1)
        q2_old_policy = self.critic_2(state_batch, action_batch).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters(self.value_net, self.target_value_net,
                                       self.tau)
        # self.update_network_parameters_phil()

    def save_models(self):
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value_net.save_checkpoint()
        self.target_value_net.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value_net.load_checkpoint()
        self.target_value_net.load_checkpoint()

    def update_network_parameters_phil(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value_net.named_parameters()
        value_params = self.value_net.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                    (1-tau)*target_value_state_dict[name].clone()

        self.target_value_net.load_state_dict(value_state_dict)
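
A hedged training-loop sketch for the Agent above, assuming a classic Gym continuous-control environment with the old 4-tuple step API; the environment name, checkpoint name and episode count are illustrative only:

import gym

env = gym.make('Pendulum-v1')
agent = Agent(load_checkpoint=False,
              checkpoint_file='sac_pendulum',
              env=env,
              n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0])

for episode in range(10):
    obs, done, score = env.reset(), False, 0.0
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, _ = env.step(action)
        agent.store_transition(obs, action, reward, obs_, done)
        agent.learn()
        obs, score = obs_, score + reward
    print('episode', episode, 'score', round(score, 1))
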
class DDPGAgent():
    def __init__(self,
                 load_checkpoint,
                 n_states,
                 n_actions,
                 checkpoint_file,
                 mem_size=10**6,
                 batch_size=64,
                 n_hid1=400,
                 n_hid2=300,
                 alpha=1e-4,
                 beta=1e-3,
                 gamma=0.99,
                 tau=0.99):
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  alpha,
                                  checkpoint_file,
                                  name='actor')
        self.critic = CriticNetwork(n_states,
                                    n_actions,
                                    n_hid1,
                                    n_hid2,
                                    beta,
                                    checkpoint_file,
                                    name='critic')

        self.actor_target = ActorNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         alpha,
                                         checkpoint_file,
                                         name='actor_target')
        self.critic_target = CriticNetwork(n_states,
                                           n_actions,
                                           n_hid1,
                                           n_hid2,
                                           beta,
                                           checkpoint_file,
                                           name='critic_target')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.memory = ReplayMemory(mem_size, n_states, n_actions)
        self.update_network_parameters_phil(tau=1)
        if load_checkpoint:
            self.actor.eval()
        self.load_checkpoint = load_checkpoint

    def reset_noise(self):
        self.noise.reset()

    def __copy_param(self, net_param_1, net_param_2, tau):
        # a.copy_(b) reads the content of b and copies it into a
        for par, target_par in zip(net_param_1, net_param_2):
            with torch.no_grad():
                val_to_copy = tau * par.weight + (1 - tau) * target_par.weight
                target_par.weight.copy_(val_to_copy)
                if target_par.bias is not None:
                    val_to_copy = tau * par.bias + (1 - tau) * target_par.bias
                    target_par.bias.copy_(val_to_copy)

    def update_network_parameters(self, tau=None):
        # TODO: check that this is equivalent to Phil's method (update_network_parameters_phil)
        # During initialization this method is called with tau=1 to copy the networks into the targets exactly;
        # afterwards, when called without specifying tau, the stored self.tau is used for a soft update.
        if tau is None:
            tau = self.tau

        actor_params = self.actor.children()
        actor_target_params = self.actor_target.children()
        self.__copy_param(actor_params, actor_target_params, tau)

        critic_params = self.critic.children()
        critic_target_params = self.critic_target.children()
        self.__copy_param(critic_params, critic_target_params, tau)

    def choose_action(self, obs):
        # eval() disables train-time behaviour such as batchnorm statistics and dropout;
        # with plain layer norm it makes no difference, but we switch modes anyway for safety
        self.actor.eval()
        obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
        # compute actions
        mu = self.actor(obs)
        # add some random noise for exploration
        mu_prime = mu
        if not self.load_checkpoint:
            mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(
                self.actor.device)
            self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def store_transitions(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # no need to worry about the device: it is the same (cuda or cpu) for every network in this class
        state_batch = torch.tensor(state_batch,
                                   dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch,
                                    dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch,
                                       dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def save_models(self):
        self.actor.save_checkpoint()
        self.actor_target.save_checkpoint()
        self.critic.save_checkpoint()
        self.critic_target.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.actor_target.load_checkpoint()
        self.critic.load_checkpoint()
        self.critic_target.load_checkpoint()

    def learn(self):
        # skip learning until the memory holds at least one batch of transitions
        if self.memory.mem_counter < self.batch_size:
            return
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions(
        )
        ''' compute actor_target actions and critic_target values, then use obtained values to compute target y_i '''
        target_actions = self.actor_target(
            new_state_batch
        )  #  + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        target_critic_value_ = self.critic_target(new_state_batch,
                                                  target_actions)
        # target_critic_value_[done_batch == 1] = 0.0  # if done_batch were integer valued
        target_critic_value_[
            done_batch] = 0.0  # done_batch is a bool mask: zero the bootstrap value for terminal states
        target_critic_value_ = target_critic_value_.view(
            -1)  # necessary for operations on matching shapes
        target = reward_batch + self.gamma * target_critic_value_
        target = target.view(self.batch_size, 1)
        ''' zero out gradients '''
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()
        ''' compute critic loss '''
        critic_value = self.critic(state_batch, action_batch)
        critic_loss = F.mse_loss(target, critic_value)
        ''' compute actor loss'''
        # we cannot reuse critic_value directly, because it evaluates the sampled (s, a) pairs;
        # the DDPG paper evaluates the critic on the actions produced by the current actor
        # for those same states
        # actor_loss = torch.mean(critic_value)
        actor_loss = -self.critic(state_batch, self.actor(state_batch))
        actor_loss = torch.mean(actor_loss)

        critic_loss.backward()
        actor_loss.backward()

        self.actor.optimizer.step()
        self.critic.optimizer.step()

        self.update_network_parameters_phil()

    def __copy_params_phil(self, net_a, net_b, tau):
        net_a_params = net_a.named_parameters()
        net_b_params = net_b.named_parameters()
        net_a_state_dict = dict(net_a_params)
        net_b_state_dict = dict(net_b_params)
        for name in net_a_state_dict:
            net_a_state_dict[name] = tau * net_a_state_dict[name].clone() + (
                1 - tau) * net_b_state_dict[name].clone()
        return net_a_state_dict

    def update_network_parameters_phil(self, tau=None):
        if tau is None:
            tau = self.tau

        updated_actor_state_dict = self.__copy_params_phil(
            self.actor, self.actor_target, tau)
        updated_critic_state_dict = self.__copy_params_phil(
            self.critic, self.critic_target, tau)

        self.actor_target.load_state_dict(updated_actor_state_dict)
        self.critic_target.load_state_dict(updated_critic_state_dict)
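
The learn() method above builds the standard DDPG bootstrap target y = r + gamma * Q'(s', mu'(s')), with the bootstrap value zeroed on terminal transitions. Here is a self-contained sketch of that target computation on dummy tensors; the linear layers are stand-ins for illustration only, not the ActorNetwork/CriticNetwork used above.

import torch
import torch.nn as nn

gamma = 0.99
batch, n_states, n_actions = 4, 3, 1

# stand-in target networks for illustration only
actor_target = nn.Linear(n_states, n_actions)
critic_target = nn.Linear(n_states + n_actions, 1)

new_states = torch.randn(batch, n_states)
rewards = torch.randn(batch)
dones = torch.tensor([False, True, False, False])

with torch.no_grad():
    target_actions = actor_target(new_states)
    q_next = critic_target(torch.cat([new_states, target_actions], dim=1)).view(-1)
    q_next[dones] = 0.0              # no bootstrapping from terminal states
    y = rewards + gamma * q_next     # shape: (batch,)
print(y.shape)                       # torch.Size([4])
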
class Agent:
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            [description]
        target_net : DQN
            [description]
        durability : int
            [description]
        optimizer : [type]
            [description]
        name : str
            The name of agent
        constants: Constants
            The hyper-parameters from Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False

    def select_action(self, state, is_first=False):
        sample = random.random()
        eps_threshold = self.CONSTANTS.EPS_END + (self.CONSTANTS.EPS_START - self.CONSTANTS.EPS_END) * \
                        math.exp(-1. * self.steps_done / self.CONSTANTS.EPS_DECAY)
        self.steps_done += 1
        if is_first:
            self.writer.add_graph(self.policy_net,
                                  input_to_model=state.to(
                                      self.CONSTANTS.DEVICE),
                                  profile_with_cuda=True)
        if sample > eps_threshold:
            with torch.no_grad():
                self.policy_net_flag = True
                return self.policy_net(state.to(
                    self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.CONSTANTS.N_ACTIONS)]],
                                device=self.CONSTANTS.DEVICE,
                                dtype=torch.long)

    def select_core_action(self, best_agent_state, flag, best_agent_action):
        self.steps_done += 1
        if flag:
            with torch.no_grad():
                if best_agent_state is None:
                    return self.policy_net(self.state.to(
                        self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
                else:
                    return self.policy_net(
                        best_agent_state.to(
                            self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return best_agent_action

    def optimize_model(self):
        if len(self.memory) < self.CONSTANTS.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.CONSTANTS.BATCH_SIZE)

        # zip(*transitions) unzips the list of transitions into per-field tuples,
        # and Transition(*) packs them into a new named tuple:
        # batch.state - tuple of all the states (each state is a tensor)
        # batch.next_state - tuple of all the next states (each state is a tensor)
        # batch.reward - tuple of all the rewards (each reward is a float)
        # batch.action - tuple of all the actions (each action is an int)

        # Transition = ReplayMemory.get_transition()
        transition = self.CONSTANTS.TRANSITION
        batch = transition(*zip(*transitions))

        actions = tuple(
            (map(lambda a: torch.tensor([[a]], device=self.CONSTANTS.DEVICE),
                 batch.action)))
        rewards = tuple(
            (map(lambda r: torch.tensor([r], device=self.CONSTANTS.DEVICE),
                 batch.reward)))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=utils.get_device(),
                                      dtype=torch.bool)

        non_final_next_states = torch.cat([
            s for s in batch.next_state if s is not None
        ]).to(self.CONSTANTS.DEVICE)

        state_batch = torch.cat(batch.state).to(self.CONSTANTS.DEVICE)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(self.CONSTANTS.BATCH_SIZE,
                                        device=self.CONSTANTS.DEVICE)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.CONSTANTS.GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def set_tf_writer(self, path):
        self.writer = self._set_tf_writer(path)

    def _set_tf_writer(self, path):
        if self.name == "core":
            writer = SummaryWriter(log_dir="{}/tf-board/core/".format(path))
        else:
            writer = SummaryWriter(
                log_dir="{}/tf-board/{}".format(path, self.name))
        return writer

    def get_state(self):
        return self.state

    def get_next_state(self):
        return self.next_state

    def get_init_state(self):
        return self.init_state

    def get_name(self):
        return self.name

    def get_policy_net_flag(self):
        return self.policy_net_flag

    def set_init_state(self, state):
        self.init_state = state

    def set_state(self, state):
        self.state = state
        self.next_state = state

    def set_env(self, env):
        self.env = env

    def get_env(self):
        return self.env

    def set_action(self, action):
        self.action = action

    def get_action(self):
        return self.action

    def get_durability(self):
        return self.durability

    def get_policy_net(self):
        return self.policy_net

    def reduce_durability(self, value):
        self.durability = self.durability - value

    def heal_durability(self, value):
        self.durability = self.durability + value

    def set_done_state(self, done):
        self.done = done

    def set_total_reward(self, reward):
        self.reward = reward
        if reward > 0.0:
            self.obtained_reward += reward
        self.total_reward += reward

    def reset_total_reward(self):
        self.total_reward = 0.0
        self.obtained_reward = 0.0

    def get_reward(self):
        return self.reward

    def get_obtained_reward(self):
        return self.obtained_reward

    def best_counter(self):
        self.n_best += 1

    def get_n_best(self):
        return self.n_best

    def get_total_reward(self):
        return self.total_reward

    def set_step_retrun_value(self, obs, done, info):
        self.obs = obs
        self.done = done
        self.info = info

    def is_done(self):
        return self.done
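
select_action above anneals exploration with an exponential schedule, eps = EPS_END + (EPS_START - EPS_END) * exp(-steps / EPS_DECAY). A quick standalone sketch of that schedule follows; the constant values are illustrative, not the ones from the Constants class.

import math

# illustrative values only; the real ones live in the Constants class
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200


def eps_threshold(steps_done: int) -> float:
    """Exponentially annealed epsilon for epsilon-greedy action selection."""
    return EPS_END + (EPS_START - EPS_END) * math.exp(-1.0 * steps_done / EPS_DECAY)


for step in (0, 200, 1000):
    print(step, round(eps_threshold(step), 3))  # 0.9 -> ~0.363 -> ~0.056
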
Esempio n. 24
0
class Worker():
    def __init__(self, env, name, s_size, a_size, trainer, model_path,
                 global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

        Q_target = sess.run(
            self.local_Q.Q,
            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action,
                            feed_dict={
                                self.local_Q.inputs:
                                np.vstack(next_observations),
                                self.local_Q.actions_q: action_next
                            })
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]  # * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)
        #print q_target_batch
        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]
        feed_dict = {
            self.local_Q.inputs: np.vstack(observations),
            self.local_Q.actions_q: action_now,
            self.local_Q.q_target: q_target_batch,
            self.local_Q.ISWeights: isweight
        }
        u, l, g_n, v_n, _ = sess.run([
            self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms,
            self.local_Q.var_norms, self.local_Q.apply_grads
        ],
                                     feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2

        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                #episode_buffer = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    #self.env.render()
                    GLOBAL_STEP += 1
                    #Choose an action epsilon-greedily from the local Q-network output.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(
                            self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)

                    s1, r, d, _ = self.env.step(a)
                    if d == False:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1
                    if total_steps % 2 == 0 and d != True and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                            batch_size)
                        l, g_n, v_n, Q_target, u = self.train(
                            episode_buffer, sess, gamma, ISWeights)
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                        #sess.run(self.update_local_ops)
                    if d == True:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:', \
                              GLOBAL_STEP, 'mean episode reward: ', np.mean(self.episode_rewards[-10:]), \
                              'epsilon: ', epsilon)

                    print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    #print 'p_target', p_target
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(
                            sess, self.model_path + '/qr-dqn-' +
                            str(episode_count) + '.cptk')
                        print("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                    #if episode_count%1==0:
                    #print('\r {} {}'.format(episode_count, episode_reward),end=' ')
                episode_count += 1
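
The train() method above builds the per-quantile target z = r + gamma * q and the one-hot action masks with nested Python loops; the same arrays can be produced with NumPy broadcasting. A hedged sketch of that vectorization follows, where batch_size, a_size and N are illustrative stand-ins for the globals used above.

import numpy as np

batch_size, a_size, N = 5, 6, 8          # illustrative sizes
gamma = 0.99

actions = np.random.randint(0, a_size, size=batch_size)       # chosen actions at s
actions_next = np.random.randint(0, a_size, size=batch_size)  # argmax actions at s'
q_next = np.random.randn(batch_size, N)                       # quantiles of Q(s', a')
rewards = np.random.randn(batch_size)

# one-hot action masks broadcast over the N quantile dimension
action_now = np.zeros((batch_size, a_size, N))
action_next = np.zeros((batch_size, a_size, N))
action_now[np.arange(batch_size), actions, :] = 1.0
action_next[np.arange(batch_size), actions_next, :] = 1.0

# per-quantile Bellman target: r + gamma * q, broadcasting rewards over the quantiles
q_target_batch = rewards[:, None] + gamma * q_next
print(action_now.shape, q_target_batch.shape)  # (5, 6, 8) (5, 8)
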
Esempio n. 25
0
class Brain:
    def __init__(self, num_actions, Double, Dueling, PER):
        self.num_actions = num_actions  # number of possible actions (2)
        self.Double = Double
        self.Dueling = Dueling
        self.PER = PER

        # memory object that stores transitions
        self.memory = ReplayMemory(CAPACITY)

        # build the networks
        n_out = num_actions
        self.main_q_network = Net_CNN(n_out, Dueling)  # uses the Net class
        self.target_q_network = Net_CNN(n_out, Dueling)  # uses the Net class
        print(self.main_q_network)  # print the network architecture

        # choose the optimizer
        self.optimizer = optim.Adam(self.main_q_network.parameters(),
                                    lr=0.0001)

        # PER - memory object that stores the TD errors
        if self.PER == True:
            self.td_error_memory = TDerrorMemory(CAPACITY)

    def replay(self, episode=0):
        ''' Learn the network weights with Experience Replay '''

        # 1. check the number of stored transitions
        if len(self.memory) < BATCH_SIZE:
            return

        # 2. build a mini-batch
        if self.PER == True:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch(
                episode)
        else:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch(
            )

        # 3. compute Q(s_t, a_t) to be used as the target signal
        self.expected_state_action_values = self.get_expected_state_action_values(
        )

        # 4. update the network weights
        self.update_main_q_network()

    def decide_action(self, state, episode):
        '''Decide an action from the current state'''
        # epsilon-greedy: gradually increase the share of greedy actions
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()  # switch the network to inference mode
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
            # max(1)[1] gives the index of the largest network output
            # .view(1,1) turns a [torch.LongTensor of size 1] into size 1*1

        else:
            # return a random action (0 or 1)
            action = torch.LongTensor([[random.randrange(self.num_actions)]
                                       ])  # random action (0 or 1)
            # action ends up with shape [torch.LongTensor of size 1*1]

        return action

    def make_minibatch(self, episode=0):
        '''2. build a mini-batch'''

        if self.PER == True:
            # 2.1 PER - draw a mini-batch from the memory object
            # def make_minibatch(self, episode):
            if episode < 30:
                transitions = self.memory.sample(BATCH_SIZE)
            else:
                # draw the mini-batch using the TD errors instead
                indexes = self.td_error_memory.get_prioritized_indexes(
                    BATCH_SIZE)
                transitions = [self.memory.memory[n] for n in indexes]
        else:
            # 2.1 draw a mini-batch from the memory object
            transitions = self.memory.sample(BATCH_SIZE)

        # 2.2 reshape each variable to fit the mini-batch
        # transitions stores BATCH_SIZE tuples of the form (state, action, state_next, reward),
        # i.e. (state, action, state_next, reward) * BATCH_SIZE.
        # To turn this into a mini-batch we convert it to
        # (state*BATCH_SIZE, action*BATCH_SIZE, state_next*BATCH_SIZE, reward*BATCH_SIZE).

        batch = Transition(*zip(*transitions))

        # 2.3 reshape the elements of each variable to mini-batch form so the network can handle them.
        # For state, for example, we have BATCH_SIZE elements of shape [torch.FloatTensor of size 1*4]
        # and turn them into a torch.FloatTensor of size BATCH_SIZE*4.
        # Build the mini-batch tensors for states, actions, rewards and non-final next states.
        # cat stands for concatenate.
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):
        ''' Compute Q(s_t, a_t) to be used as the target signal '''

        # 3.1 switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # 3.2 compute Q(s_t, a_t) with the network
        # self.main_q_network(state_batch) outputs the Q values for left and right,
        # a [torch.FloatTensor of size BATCH_SIZEx2].
        # From here on we need the Q value of the action a_t that was actually taken, so we take
        # the index of that action (left or right) from action_batch and collect the matching
        # Q value with the gather method.
        self.state_action_values = self.main_q_network(
            self.state_batch).gather(1, self.action_batch)

        # 3.3 compute max{Q(s_t+1, a)}, being careful about whether a next state exists

        # index mask that checks whether cartpole is not done and a next_state exists
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, self.batch.next_state)))

        # first initialize everything to 0
        next_state_values = torch.zeros(BATCH_SIZE)

        # Double DQN
        if self.Double == True:
            a_m = torch.zeros(BATCH_SIZE).type(torch.LongTensor)

            # compute the action a_m that maximizes the Q value in the next state with the Main Q-Network
            # the trailing [1] picks the index of that action
            a_m[non_final_mask] = self.main_q_network(
                self.non_final_next_states).detach().max(1)[1]

            # keep only entries that have a next state, and reshape size 32 to 32*1
            a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

            # for indices that have a next state, compute the Q value of action a_m with the target Q-Network
            # detach() pulls the value out of the graph
            # squeeze() turns size [minibatch*1] into [minibatch]
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).gather(
                    1, a_m_non_final_next_states).detach().squeeze()
        else:
            # take the maximum Q value over indices that have a next state:
            # access the output and take [value, index] of the column-wise maximum (max(1)),
            # keep the Q value (index 0)
            # and pull it out with detach
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).max(1)[0].detach()

        # 3.4 compute the target Q(s_t, a_t) from the Q-learning update rule
        expected_state_action_values = self.reward_batch + GAMMA * next_state_values

        return expected_state_action_values

    def update_main_q_network(self):
        ''' 4. update the network weights '''

        # 4.1 switch the network to training mode
        self.main_q_network.train()

        # 4.2 compute the loss (smooth_l1_loss is the Huber loss)
        # expected_state_action_values has size [minibatch], so unsqueeze it to [minibatch*1]
        loss = F.smooth_l1_loss(self.state_action_values,
                                self.expected_state_action_values.unsqueeze(1))

        # 4.3 update the weights
        self.optimizer.zero_grad()  # reset the gradients
        loss.backward()  # backpropagate
        self.optimizer.step()  # update the weights

    def update_target_q_network(self):  # added for DDQN
        ''' Sync the Target Q-Network with the Main Q-Network '''
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_td_error_memory(self):  # added for Prioritized Experience Replay
        ''' Update the TD errors stored in the TD error memory '''

        # switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # build a mini-batch from all stored transitions
        transitions = self.memory.memory
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        # compute the network output Q(s_t, a_t)
        state_action_values = self.main_q_network(state_batch).gather(
            1, action_batch)

        # index mask that checks whether cartpole is not done and a next_state exists
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # first initialize everything to 0, sized to the number of stored transitions
        next_state_values = torch.zeros(len(self.memory))
        a_m = torch.zeros(len(self.memory)).type(torch.LongTensor)

        # compute the action a_m that maximizes the Q value in the next state with the Main Q-Network
        # the trailing [1] picks the index of that action
        a_m[non_final_mask] = self.main_q_network(
            non_final_next_states).detach().max(1)[1]

        # keep only entries that have a next state, and reshape size 32 to 32*1
        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        # for indices that have a next state, compute the Q value of action a_m with the target Q-Network
        # detach() pulls the value out of the graph
        # squeeze() turns size [minibatch*1] into [minibatch]
        next_state_values[non_final_mask] = self.target_q_network(
            non_final_next_states).gather(
                1, a_m_non_final_next_states).detach().squeeze()

        # compute the TD errors
        td_errors = (reward_batch + GAMMA *
                     next_state_values) - state_action_values.squeeze()
        # state_action_values has size [minibatch*1], so squeeze it to size [minibatch]

        # update the TD error memory: detach the tensor, convert it to a NumPy array
        # and then to a plain Python list
        self.td_error_memory.memory = td_errors.detach().numpy().tolist()
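
get_prioritized_indexes above lives in the TDerrorMemory class, which is not shown here; it is typically implemented by sampling indices with probability proportional to the absolute TD error plus a small constant. A minimal sketch of that idea, assuming a plain list of stored TD errors (the epsilon constant and function name below are illustrative):

import numpy as np

TD_EPSILON = 1e-4  # illustrative constant that keeps every transition sampleable


def prioritized_indexes(td_errors, batch_size, rng=None):
    """Sample indices with probability proportional to |TD error| + epsilon."""
    if rng is None:
        rng = np.random.default_rng()
    priorities = np.abs(np.asarray(td_errors, dtype=np.float64)) + TD_EPSILON
    probs = priorities / priorities.sum()
    return rng.choice(len(td_errors), size=batch_size, p=probs)


td_errors = [0.5, -0.1, 0.0, 2.0, -0.7]
print(prioritized_indexes(td_errors, batch_size=3))  # e.g. array([3, 0, 3])
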
Esempio n. 26
0
class DDPG:
    def __init__(self, dim):
        self.critic_path = cst.CN_CKPT_PATH
        self.actor_path = cst.AN_CKPT_PATH
        self.replaymemory_path = cst.RM_PATH

        self.dim_body = dim[0]
        self.dim_sensor = dim[1]
        self.dim_state = dim[0] + dim[1] * 3
        self.dim_action = dim[2]

        self.sess = tf.InteractiveSession()
        self.act_lr = cst.ACT_LEARNING_RATE
        self.cri_lr = cst.CRI_LEARNING_RATE
        self.tau = cst.TAU
        self.batch_size = cst.BATCH_SIZE
        self.gamma = cst.REWARD_DECAY

        self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                    self.act_lr, self.tau, self.batch_size)
        self.criticNN = CriticNetwork(self.sess, self.dim_state,
                                      self.dim_action, self.cri_lr, self.tau,
                                      self.gamma,
                                      self.actorNN.get_num_trainable_vars())

        self.sess.run(tf.global_variables_initializer())

        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

        self.rm = ReplayMemory('DDPG')

        self.agent_count = cst.AGENT_COUNT
        self.exploration_rate = cst.EXPLORATION_RATE
        self.epsilon = cst.CRITIC_EPSILON
        self.LOSS_ITERATION = cst.LOSS_ITERATION

        self.expl_noise = OUNoise(self.dim_action)

        self.expl = False
        self.expl_decay = cst.EXPLORATION_DECAY

    #=====================action===========================

    def Action(self, obs, action_type, run_type):
        if action_type == 'GREEDY':
            return self.action_greedy(obs)

        self.isExploration(run_type == 'TRAIN')

        action_list = []
        agent_num = len(obs['agent'])
        for i in range(0, agent_num):
            agent_obs = obs['agent'][i]
            if np.linalg.norm(agent_obs['d'] -
                              agent_obs['p']) < cst.AGENT_RADIUS + 10:
                action = {}
                action['theta'] = 0
                action['velocity'] = 0
                action['stop'] = True
            else:
                action = self.get_action(agent_obs, run_type == 'TEST')
                if self.expl:
                    action = self.action_random(action)

            action_list.append(action)

        return action_list

    def action(self, obs, action_type, run_type):
        if action_type == 'GREEDY':
            return self.action_greedy(obs)

        self.isExploration(run_type == 'TRAIN')

        action_list = []
        for i in range(0, self.agent_count):
            agent_obs = obs['agent'][i]
            if np.linalg.norm(agent_obs['d'] -
                              agent_obs['p']) < agent_obs['r'] + 10:
                action = {}
                action['theta'] = 0
                action['velocity'] = 0
                action['stop'] = True
            else:
                action = self.get_action(agent_obs, run_type == 'TEST')
                if self.expl:
                    action = self.action_random(action)

            action_list.append(action)

        # for i in range(self.agent_count):
        #   agent_obs = obs['agent'][i]
        #   if np.linalg.norm(agent_obs['d']-agent_obs['p']) < agent_obs['r'] + 5:
        #       action = {}
        #       action['theta'] = 0
        #       action['velocity'] = 0
        #       action['stop'] = True
        #   else:
        #       if i == 0:
        #           action = self.get_action(agent_obs, run_type=='TEST')
        #           if self.expl:
        #               action = self.action_random()
        #       else:
        #           action = self.get_action_greedy(agent_obs)

        #   action_list.append(action)

        return action_list

    def get_action(self, agent_obs, action_target=False):
        state_ = {}
        state_ = self.preprocess(agent_obs)
        state_body = np.reshape(state_['body'], (1, self.dim_body))
        state_sensor = np.reshape(state_['sensor'], (1, self.dim_sensor))

        if action_target:
            prediction = self.actorNN.predict_target(state_body, state_sensor)
        else:
            prediction = self.actorNN.predict(state_body, state_sensor)

        action = {}
        action['theta'] = prediction[0][0]
        action['velocity'] = prediction[0][1]
        action['stop'] = False

        return action

    def action_greedy(self, obs):
        action_list = []
        agent_num = len(obs['agent'])
        for i in range(agent_num):
            agent_obs = obs['agent'][i]

            action = self.get_action_greedy(agent_obs)
            action_list.append(action)

        return action_list

    def get_action_greedy(self, agent_obs):
        if np.linalg.norm(agent_obs['d'] - agent_obs['p']) < 10 + 10:
            action = {}
            action['theta'] = 0
            action['velocity'] = 0
            action['stop'] = True
            return action

        greedy_dis = None

        angle_num = 20
        next_angle = (190 / 2.0)

        offset = 2
        direction = np.array(agent_obs['d']) - np.array(agent_obs['p'])
        direction /= np.linalg.norm(direction)

        greedy_dir = 0
        if random.random() < 0.5:
            greedy_dir = 1

        for angle in range(angle_num):
            if agent_obs['d_map'][angle] < 10 + offset:
                continue

            curr_angle = 190 / 2 - angle * 10
            curr_q = mMath.AngleToCoor(
                curr_angle + agent_obs['front']) * agent_obs['d_map'][angle]
            curr_dis = direction[0] * curr_q[0] + direction[1] * curr_q[1]
            if greedy_dir == 0:
                if (greedy_dis is None) or (greedy_dis < curr_dis):
                    next_angle = curr_angle
                    greedy_dis = curr_dis
                    next_q = curr_q
            else:
                if (greedy_dis is None) or (greedy_dis <= curr_dis):
                    next_angle = curr_angle
                    greedy_dis = curr_dis
                    next_q = curr_q

        action = {}
        action['theta'] = np.clip(next_angle, -10, 10) / 10.0

        if greedy_dis is None:
            action['velocity'] = -1
        else:
            action['velocity'] = 1

        action['stop'] = False

        return action

    def action_random(self, action=None):
        if action is None:
            action = dict()
            action['theta'] = np.random.normal()
            action['velocity'] = np.random.normal()
        else:
            noise_theta, noise_vel = self.expl_noise.noise()
            action['theta'] = action['theta'] + noise_theta
            action['velocity'] = action['velocity'] + noise_vel

        action['stop'] = False

        return action

    #=====================update==========================

    def Update(self):
        if len(self.rm.memory['critic']) > 0 and len(
                self.rm.memory['actor']) > 0:
            self.update_network()

    def update_network(self):
        rm_critic_batch = self.rm.getRandomMemories('critic')

        s_body_batch, s_sensor_batch, a_batch, r_batch, t_batch, s2_body_batch, s2_sensor_batch = [], [], [], [], [], [], []
        for m in rm_critic_batch:
            state_ = copy.copy(self.preprocess(m['state']['agent'][0]))
            state_body = copy.copy(state_['body'])
            state_sensor = copy.copy(state_['sensor'])
            action = copy.copy(
                np.array([m['action'][0]['theta'],
                          m['action'][0]['velocity']]))
            next_state_ = copy.copy(
                self.preprocess(m['next_state']['agent'][0]))
            next_state_body = copy.copy(next_state_['body'])
            next_state_sensor = copy.copy(next_state_['sensor'])

            s_body_batch.append(state_body[0])
            s_sensor_batch.append(state_sensor[0])
            a_batch.append(action)
            r_batch.append(m['reward'])
            t_batch.append(m['term'])
            s2_body_batch.append(next_state_body[0])
            s2_sensor_batch.append(next_state_sensor[0])

        target_q = self.criticNN.predict_target(
            s2_body_batch, s2_sensor_batch,
            self.actorNN.predict_target(s2_body_batch, s2_sensor_batch))

        y_i = []
        c_batch_size = len(rm_critic_batch)
        for k in range(c_batch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.criticNN.train(
            s_body_batch, s_sensor_batch, a_batch,
            np.reshape(y_i, (int(c_batch_size), 1)))

        # Update the actor policy using the sampled gradient
        rm_actor_batch = self.rm.getRandomMemories('actor')

        actor_body_batch, actor_sensor_batch, actor_a_batch = [], [], []
        for m in rm_actor_batch:
            state_ = copy.copy(self.preprocess(m['state']['agent'][0]))
            state_body = copy.copy(state_['body'])
            state_sensor = copy.copy(state_['sensor'])
            action = copy.copy(
                np.array([m['action'][0]['theta'],
                          m['action'][0]['velocity']]))
            actor_body_batch.append(state_body[0])
            actor_sensor_batch.append(state_sensor[0])
            actor_a_batch.append(action)

        act_batch = self.actorNN.predict(actor_body_batch, actor_sensor_batch)
        grads = self.criticNN.action_gradients(actor_body_batch,
                                               actor_sensor_batch, act_batch)
        self.actorNN.train(actor_body_batch, actor_sensor_batch, grads[0])

        # Update target networks
        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

    #===================evaluate===========================

    def evaluate(self, obs, agent_idx, action, run_type='TRAIN'):
        state_ = {}
        agent_obs = obs['agent'][agent_idx]

        state_['body'] = np.array(
            self.preprocess_body(agent_obs['p'], agent_obs['q'],
                                 agent_obs['v'], agent_obs['d']))
        state_['action'] = np.array([action['theta'], action['velocity']])
        state_['sensor'] = np.array(
            self.preprocess_sensor(agent_obs['d_map'], agent_obs['v_map'],
                                   agent_obs['q_lim'], agent_obs['v_depth']))

        state_body = np.reshape(state_['body'], (1, self.dim_body))
        state_sensor = np.reshape(state_['sensor'], (1, self.dim_sensor))
        action = np.reshape(state_['action'], (1, self.dim_action))

        if run_type == 'TEST':
            prediction = self.criticNN.predict_target(state_body, state_sensor,
                                                      action)[0]
        else:
            prediction = self.criticNN.predict(state_body, state_sensor,
                                               action)[0]

        return prediction

    def expl_rate_decay(self):
        if self.exploration_rate > 0.2:
            self.exploration_rate *= self.expl_decay
            print "exploration rate : ", self.exploration_rate

    #=====================replay_memory===========================

    def addMemory(self, is_greedy, obs, act, next_state, reward, is_term):
        if is_greedy:
            self.rm.addMemory('actor', obs, act, next_state, reward, is_term)
            self.rm.addMemory('critic', obs, act, next_state, reward, is_term)
        else:
            if self.expl:
                self.rm.addMemory('actor', obs, act, next_state, reward,
                                  is_term)
                self.expl = False
            else:
                self.rm.addMemory('critic', obs, act, next_state, reward,
                                  is_term)

    #==================save & load==========================

    def save(self, m_replay=False, training_time=0, eval_list=None):
        cur_time = strftime("%Y%m%d_%I%M.ckpt", localtime())

        print "Save Critic Network : ",
        self.criticNN.save(self.critic_path, cur_time)

        print "Save Actor Network : ",
        self.actorNN.save(self.actor_path, cur_time)

        print "Parameters Saved...!"
        self.save_parameters(cur_time, training_time)

        print "Networks Saved...!"

        if m_replay:
            print "Replay Memories Saved...!"
            self.save_replaymemory(cur_time)

        if eval_list != None:
            print "Evaluation List Saved...!"
            self.save_evaluation(cur_time, eval_list)

    def save_replaymemory(self, cur_time):
        f = open(cst.RM_PATH + "checkpoint", 'w')
        f.write(cur_time)
        f.close()

        f = open(cst.RM_PATH + "rm_" + cur_time, 'w')
        pickle.dump(self.rm, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    def save_evaluation(self, cur_time, eval_list=None):
        f = open(cst.EVAL_PATH + "checkpoint", 'w')
        f.write(cur_time)
        f.close()

        f = open(cst.EVAL_PATH + "eval_" + cur_time, 'wb')  # binary mode for pickle
        pickle.dump(eval_list, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    def save_parameters(self, cur_time, training_time):
        f_read = open(cst.PM_READ_PATH, 'r')
        f_write = open(cst.PM_WRITE_PATH + "pm_" + cur_time + ".txt", 'w')
        f_write.write("traning time : " + str(training_time))
        while True:
            line = f_read.readline()
            if not line:
                break
            f_write.write(line)
        f_read.close()
        f_write.close()

    def load_network(self, type):
        if type == 'actor':
            print "Load Recent Actor Network : ",
            self.actorNN.load(self.actor_path)
        elif type == 'critic':
            print "Load Recent Critic Network : ",
            self.criticNN.load(self.critic_path)

    def load_memory(self):
        f = open(cst.RM_PATH + "checkpoint", 'r')
        recent_file_name = f.readline()
        f.close()

        f_rm = open(cst.RM_PATH + "rm_" + recent_file_name, 'rb')  # binary mode for pickle
        self.rm = pickle.load(f_rm)
        f_rm.close()

        print("Load Replay Memory :  ", cst.RM_PATH, "rm_", recent_file_name)

    def load_eval(self):
        f = open(cst.EVAL_PATH + "checkpoint", 'r')
        recent_file_name = f.readline()
        f.close()

        f_eval = open(cst.EVAL_PATH + "eval_" + recent_file_name, 'rb')  # binary mode for pickle
        self.eval = pickle.load(f_eval)
        f_eval.close()

        print("Load Eval List :  ", cst.EVAL_PATH, "eval_", recent_file_name)

    #=================other===============================

    def preprocess(self, agent_obs):
        state = {}
        state['body'] = np.array(
            self.preprocess_body(agent_obs['p'], agent_obs['q'],
                                 agent_obs['v'], agent_obs['d'])).reshape(
                                     (1, self.dim_body))
        state['sensor'] = np.array(
            self.preprocess_sensor(agent_obs['d_map'], agent_obs['delta'], 20,
                                   cst.VISION_DEPTH)).reshape((1, 40))

        return state

    def preprocess_body(self, p, q, v, d):
        p_ = np.array(p)
        q_ = np.array(q)
        d_ = np.array(d)

        width = cst.WINDOW_WIDTH / 2.0
        height = cst.WINDOW_HEIGHT / 2.0

        p_[0] = p_[0] / width
        p_[1] = p_[1] / height

        d_[0] = d_[0] / width
        d_[1] = d_[1] / height

        q_norm = np.linalg.norm(q_)
        q_ = (q_ / q_norm)

        pd = np.array(d_ - p_)
        pd_len = np.linalg.norm(pd)
        pd_vec = pd / pd_len

        inner = mMath.InnerProduct(q_, pd_vec)
        cross = mMath.CrossProduct(q_, pd_vec)

        cross_val = 1.0
        if cross < 0:
            cross_val = 0.0

        return [v, inner, cross_val, pd_len]

    def preprocess_sensor(self, d_map, delta_map, q_lim, vision_depth):
        depth = [d / float(vision_depth) for d in d_map]
        delta = [d / float(vision_depth) for d in delta_map]

        # print "depth : ", depth
        # print "delta : ", delta

        sensor = depth + delta
        return np.array(sensor)

    def get_agent_count(self, is_train, obs):
        if is_train:
            return 1
        else:
            return len(obs['agent'])

    def isExploration(self, flag):
        self.expl = (flag and random.random() < self.exploration_rate)
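
The exploration noise used above (OUNoise here, OUActionNoise earlier in the DDPGAgent) is usually an Ornstein-Uhlenbeck process, x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1), which gives temporally correlated noise. A minimal sketch under those assumptions; the class name and parameter defaults are illustrative, not the ones in the original implementation.

import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: temporally correlated noise for exploration."""

    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(dim)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.reset()

    def reset(self):
        # restart the process from its long-run mean
        self.x = np.copy(self.mu)

    def noise(self):
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape)
        self.x = self.x + dx
        return self.x


ou = OUNoiseSketch(dim=2)
print([ou.noise() for _ in range(3)])  # slowly drifting, correlated samples
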
Esempio n. 27
0
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - frameskip: int number of times each chosen action is repeated
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
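
The state_buffer_size most recent frames kept as "short term memory" above are typically stacked along the channel dimension to form the network input. A small sketch of that buffering with collections.deque; the frame shape and buffer length are illustrative values, not taken from the original code.

from collections import deque

import numpy as np

num_stored_frames = 4                      # illustrative, mirrors state_buffer_size
frame_shape = (84, 84)                     # illustrative preprocessed frame size

buffer = deque(maxlen=num_stored_frames)
first_frame = np.zeros(frame_shape, dtype=np.float32)
for _ in range(num_stored_frames):         # seed the buffer with the first frame
    buffer.append(first_frame)

new_frame = np.random.rand(*frame_shape).astype(np.float32)
buffer.append(new_frame)                   # oldest frame is dropped automatically

state = np.stack(buffer, axis=0)           # (channels_in, H, W) input for the DQN
print(state.shape)                         # (4, 84, 84)
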
class Agent():
    def __init__(self, game, agent_type, display, load_model, record, test):
        self.name = game
        self.agent_type = agent_type
        self.ale = ALEInterface()
        self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
        self.ale.setBool(str.encode('display_screen'), display or record)
        if record:
            self.ale.setString(str.encode('record_screen_dir'), str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

        self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
        self.action_list = list(self.ale.getMinimalActionSet())
        self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
        if test:
            self.name += '_test'

        if 'space_invaders' in self.name:
            # Account for blinking bullets
            self.frameskip = 2
        else:
            self.frameskip = 3

        self.frame_buffer = deque(maxlen=4)
        if load_model and not record:
            self.load_replaymemory()
        else:
            self.replay_memory = ReplayMemory(500000, 32)

        model_input_shape = self.frame_shape + (4,)
        model_output_shape = len(self.action_list)

        if agent_type == 'dqn':
            self.model = DeepQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        elif agent_type == 'double':
            self.model = DoubleDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        else:
            self.model = DuelingDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
        print('Displaying: ', display)
        print('Frame Shape: ', self.frame_shape)
        print('Frame Skip: ', self.frameskip)
        print('Action Set: ', self.action_list)
        print('Model Input Shape: ', model_input_shape)
        print('Model Output Shape: ', model_output_shape)
        print('Agent: ', agent_type)

    def training(self, steps):
        '''
        Trains the agent for :steps number of weight updates.

        Returns the average model loss
        '''

        loss = []

        # Initialize frame buffer. np.squeeze removes empty dimensions e.g. if shape=(210,160,__)
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))

        try:
            for step in range(steps):
                gameover = False
                initial_state = np.stack(self.frame_buffer, axis=-1)
                action = self.model.predict_action(initial_state)

                # Backup data
                if step % 5000 == 0:
                    self.model.save_model()
                    self.model.save_hyperparams()
                    self.save_replaymemory()

                # If using a target model check for weight updates
                if hasattr(self.model, 'tau'):
                    if self.model.tau == 0:
                        self.model.update_target_model()
                        self.model.tau = 10000
                    else:
                        self.model.tau -= 1

                # Frame skipping technique https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/
                lives_before = self.ale.lives()
                for _ in range(self.frameskip):
                    self.ale.act(action)

                reward = self.ale.act(action)
                self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
                lives_after = self.ale.lives()

                if lives_after < lives_before:
                    gameover = True  # Taking advice from dude on reddit
                    reward = -1

                if self.ale.game_over():
                    gameover = True
                    reward = -1
                    self.ale.reset_game()

                new_state = np.stack(self.frame_buffer, axis=-1)

                # Experiment with clipping rewards for stability purposes
                reward = np.clip(reward, -1, 1)
                self.replay_memory.add(
                    initial_state,
                    action,
                    reward,
                    gameover,
                    new_state
                )

                loss += self.model.replay_train()
        except:
            # Save progress (e.g. on Ctrl+C), then re-raise the original exception
            self.model.save_model()
            self.model.save_hyperparams()
            self.save_replaymemory()
            raise

        return np.mean(loss, axis=0)

    def simulate_random(self):
        print('Simulating game randomly')
        done = False
        total_reward = 0
        while not done:
            action = np.random.choice(self.ale.getMinimalActionSet())
            reward = self.ale.act(action)
            total_reward += reward
            if self.ale.game_over():
                reward = -1
                done = True

            reward = np.clip(reward, -1, 1)
            if reward != 0:
                print(reward)

        frames_survived = self.ale.getEpisodeFrameNumber()
        self.ale.reset_game()
        return total_reward, frames_survived

    def simulate_intelligent(self, evaluating=False):
        done = False
        total_score = 0

        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        while not done:
            state = np.stack(self.frame_buffer, axis=-1)
            action = self.model.predict_action(state, evaluating)

            for _ in range(self.frameskip):
                self.ale.act(action)

            # Remember, ale.act returns the increase in game score with this action
            total_score += self.ale.act(action)

            # Pushes oldest frame out
            self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
            if self.ale.game_over():
                done = True

        frames_survived = self.ale.getEpisodeFrameNumber()
        print('   Game Over')
        print('   Frames Survived: ', frames_survived)
        print('   Score: ', total_score)
        print('===========================')
        self.ale.reset_game()
        return total_score, frames_survived

    def save_replaymemory(self):
        with bz2.BZ2File('./data/{}/{}_replaymem.obj'.format(self.agent_type, self.name), 'wb') as f:
            pickle.dump(self.replay_memory, f, protocol=pickle.HIGHEST_PROTOCOL)
            print('Saved replay memory at ', datetime.now())

    def load_replaymemory(self):
        try:
            with bz2.BZ2File('./data/{}/{}_replaymem.obj'.format(self.agent_type, self.name), 'rb') as f:
                self.replay_memory = pickle.load(f)
                print('Loaded replay memory at ', datetime.now())
        except FileNotFoundError:
            print('No replay memory file found')
            raise KeyboardInterrupt
Example 29
0
class Agent():
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 update_actor_interval=2,
                 warmup=1000,
                 mem_size=10**6,
                 batch_size=100,
                 n_hid1=400,
                 n_hid2=300,
                 lr_alpha=1e-3,
                 lr_beta=1e-3,
                 gamma=0.99,
                 tau=5e-3,
                 noise_mean=0,
                 noise_sigma=0.1):

        self.load_checkpoint = load_checkpoint
        self.checkpoint_file = checkpoint_file
        # needed for clamping in the learn function
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.n_actions = n_actions
        # counts calls to learn(); used to schedule the delayed actor updates
        self.learn_step_counter = 0
        # counts chosen actions; used to detect the end of the warmup period
        self.time_step = 0
        self.update_actor_interval = update_actor_interval
        self.warmup = warmup
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_mean = noise_mean
        self.noise_sigma = noise_sigma

        self.actor = TD3ActorNetwork(n_states,
                                     n_actions,
                                     n_hid1,
                                     n_hid2,
                                     lr_alpha,
                                     checkpoint_file,
                                     name='actor')
        self.target_actor = TD3ActorNetwork(n_states,
                                            n_actions,
                                            n_hid1,
                                            n_hid2,
                                            lr_alpha,
                                            checkpoint_file,
                                            name='target_actor')

        self.critic_1 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_1')
        self.critic_2 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_2')
        self.target_critic_1 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_1')
        self.target_critic_2 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_2')

        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.actor, self.target_actor, tau=1)
        self.update_network_parameters(self.critic_1,
                                       self.target_critic_1,
                                       tau=1)
        self.update_network_parameters(self.critic_2,
                                       self.target_critic_2,
                                       tau=1)

    def choose_action(self, obs):
        if self.time_step < self.warmup:
            self.time_step += 1
            action = torch.tensor(self.env.action_space.sample())
        else:
            obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
            action = self.actor(obs)

            # exploratory noise, scaled wrt action scale (max_action)
            noise = torch.tensor(
                np.random.normal(self.noise_mean,
                                 self.noise_sigma * self.max_action,
                                 size=self.n_actions)).to(self.actor.device)
            action += noise
        action = torch.clamp(action, self.low_action, self.max_action)
        return action.cpu().detach().numpy()

    def choose_action_eval(self, obs):
        obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
        action = self.actor(obs)
        action = torch.clamp(action, self.low_action, self.max_action)
        return action.cpu().detach().numpy()

    def store_transition(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # all networks share the same device, so the actor's device can be used for every tensor
        state_batch = torch.tensor(state_batch,
                                   dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch,
                                    dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch,
                                       dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def __copy_param(self, net_param_1, net_param_2, tau):
        # a.copy_(b) reads content from b and copy it to a
        for par, target_par in zip(net_param_1, net_param_2):
            #with torch.no_grad():
            val_to_copy = tau * par.weight + (1 - tau) * target_par.weight
            target_par.weight.copy_(val_to_copy)
            if target_par.bias is not None:
                val_to_copy = tau * par.bias + (1 - tau) * target_par.bias
                target_par.bias.copy_(val_to_copy)

    def update_network_parameters(self, network, target_network, tau=None):
        # Polyak soft update: theta_target <- tau*theta + (1 - tau)*theta_target.
        # tau defaults to the value stored at construction time.
        if tau is None:
            tau = self.tau
        for par, target_par in zip(network.parameters(),
                                   target_network.parameters()):
            target_par.data.copy_(tau * par.data + (1 - tau) * target_par.data)

        #
        # # TODO: check equivalence with Phil's method
        # # during the class initialization we call this method with tau=1, to perform an exact copy of the nets to targets
        # # then when we call this without specifying tau, we use the field stored
        # if tau is None:
        #     tau = self.tau
        #
        # actor_params = self.actor.children()
        # target_actor_params = self.target_actor.children()
        # self.__copy_param(actor_params, target_actor_params, tau)
        #
        # critic_params1 = self.critic_1.children()
        # target_critic_1_params = self.target_critic_1.children()
        # self.__copy_param(critic_params1, target_critic_1_params, tau)
        #
        # critic_params2 = self.critic_2.children()
        # target_critic_2_params = self.target_critic_2.children()
        # self.__copy_param(critic_params2, target_critic_2_params, tau)

    def learn(self):
        self.learn_step_counter += 1

        # skip learning until the memory holds at least one full batch
        if self.memory.mem_counter < self.batch_size:
            return
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions(
        )
        # Target policy smoothing: noise clipped to +/-0.5 as in the TD3 paper; the fixed range may need adjusting when the action bounds are asymmetric (e.g. -2 and 1)
        noise = torch.tensor(
            np.clip(
                np.random.normal(self.noise_mean, 0.2, size=self.n_actions),
                -0.5, 0.5)).to(self.actor.device)
        target_next_action = torch.clamp(
            self.target_actor(new_state_batch) + noise, self.low_action,
            self.max_action)

        target_q1_ = self.target_critic_1(new_state_batch, target_next_action)
        target_q2_ = self.target_critic_2(new_state_batch, target_next_action)
        target_q_ = torch.min(
            target_q1_,
            target_q2_)  # clipped double-Q: take the min q_value for every element in the batch
        target_q_[done_batch] = 0.0
        target = target_q_.view(-1)  # probably not needed
        target = reward_batch + self.gamma * target
        target = target.view(self.batch_size, 1)  # probably not needed

        q_val1 = self.critic_1(state_batch, action_batch)
        q_val2 = self.critic_2(state_batch, action_batch)

        critic_loss1 = F.mse_loss(q_val1, target)
        critic_loss2 = F.mse_loss(q_val2, target)
        critic_loss = critic_loss1 + critic_loss2

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss.backward()
        #critic_loss1.backward()
        #critic_loss2.backward()

        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        # Delayed policy update: refresh the actor and the target networks only
        # every update_actor_interval learning steps.
        if self.learn_step_counter % self.update_actor_interval == 0:
            action = self.actor(state_batch)
            # compute actor loss proportional to the estimated value from q1 given state, action pairs, where the action
            # is recomputed using the new policy
            actor_loss = -torch.mean(self.critic_1(state_batch, action))

            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            self.update_network_parameters(self.actor, self.target_actor,
                                           self.tau)
            self.update_network_parameters(self.critic_1, self.target_critic_1,
                                           self.tau)
            self.update_network_parameters(self.critic_2, self.target_critic_2,
                                           self.tau)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
    def __init__(self, game, agent_type, display, load_model, record, test):
        self.name = game
        self.agent_type = agent_type
        self.ale = ALEInterface()
        self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
        self.ale.setBool(str.encode('display_screen'), display or record)
        if record:
            self.ale.setString(str.encode('record_screen_dir'), str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

        self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
        self.action_list = list(self.ale.getMinimalActionSet())
        self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
        if test:
            self.name += '_test'

        if 'space_invaders' in self.name:
            # Account for blinking bullets
            self.frameskip = 2
        else:
            self.frameskip = 3

        self.frame_buffer = deque(maxlen=4)
        if load_model and not record:
            self.load_replaymemory()
        else:
            self.replay_memory = ReplayMemory(500000, 32)

        model_input_shape = self.frame_shape + (4,)
        model_output_shape = len(self.action_list)

        if agent_type == 'dqn':
            self.model = DeepQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        elif agent_type == 'double':
            self.model = DoubleDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        else:
            self.model = DuelingDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
        print('Displaying: ', display)
        print('Frame Shape: ', self.frame_shape)
        print('Frame Skip: ', self.frameskip)
        print('Action Set: ', self.action_list)
        print('Model Input Shape: ', model_input_shape)
        print('Model Output Shape: ', model_output_shape)
        print('Agent: ', agent_type)
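# --- Hypothetical usage sketch (not part of the source) ----------------------
# A minimal driver loop for the TD3-style Agent defined at the top of this
# example, assuming a continuous-action Gym environment with the older
# 4-tuple step API; the environment name, episode count and checkpoint file
# below are illustrative only.
import gym

env = gym.make('Pendulum-v1')
agent = Agent(load_checkpoint=False,
              checkpoint_file='td3_demo',
              env=env,
              n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0])

for episode in range(10):
    obs = env.reset()
    done = False
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        agent.store_transition(obs, action, reward, obs_, done)
        agent.learn()
        obs = obs_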
Example 31
0
class Learner():
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads,self.grad_norms = tf.clip_by_norm(self.gradients,40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        epi_q = []
        self.env = env
        while True:
            if self.queue.empty():
                pass
            else:
                while not self.queue.empty():
                    t_error = self.queue.get()
                    step += 1
                    self.replaymemory.add(t_error)

            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            if step >= 10000:
                train1 = True
                step = 0

            if train1:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                    batch_size)
                episode_buffer = np.array(episode_buffer)
                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]
                Q_target = self.sess.run(self.Q,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next)
                                         })

                actions_ = np.argmax(Q_target, axis=1)

                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1
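                # Tile the one-hot action masks (current and next action) across
                # the N quantile atoms so they can select per-quantile outputs
                # element-wise.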
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]
                q_target = self.sess.run(self.q_action,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next),
                                             self.actions_q:
                                             action_next
                                         })
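                # Distributional Bellman backup: each of the N quantile values
                # is mapped to r + gamma * z_j, with the bootstrap term zeroed
                # on terminal transitions.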

                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) +
                                             rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]
                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }

                l, abs_errors, _ = self.sess.run(
                    [self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                #print abs_errors
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6

                self.replaymemory.update_priorities(tree_idx, abs_errors)
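# --- Hypothetical wiring sketch (not part of the source) ---------------------
# The Learner above consumes transitions from queues[0] and publishes its
# current parameters on queues[1]; an actor process on the other end would do
# roughly the following (message formats are assumptions):
#
#     queue.put(transition)                  # hand experience to the learner
#     if not param_queue.empty():
#         params = param_queue.get()         # pull the latest learner weights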
Example 32
0
class Worker():
    def __init__(self,env,name,s_size,a_size,trainer,model_path,global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)
        
    def train(self,rollout,sess,gamma,ISWeights):
        rollout = np.array(rollout)
        observations      = rollout[:,0]
        actions           = rollout[:,1]
        rewards           = rollout[:,2]
        next_observations = rollout[:,3]
        dones             = rollout[:,4]
        
        Q_target = sess.run(self.local_Q.Q, feed_dict={self.local_Q.inputs:np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)
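        # actions_ are the greedy next actions chosen from Q (the scalar value
        # head, presumably the quantile mean); the per-quantile targets for
        # those actions are gathered below.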
        
        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action, feed_dict={self.local_Q.inputs:np.vstack(next_observations),
                                                               self.local_Q.actions_q:action_next})
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]# * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)
        #print q_target_batch
        isweight = np.zeros((batch_size,N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i,j] = ISWeights[i]
        feed_dict = {self.local_Q.inputs:np.vstack(observations),
                     self.local_Q.actions_q:action_now,
                     self.local_Q.q_target:q_target_batch,
                     self.local_Q.ISWeights:isweight}
        u,l,g_n,v_n,_ = sess.run([self.local_Q.u,
                                  self.local_Q.loss,
                                  self.local_Q.grad_norms,
                                  self.local_Q.var_norms,
                                  self.local_Q.apply_grads],feed_dict=feed_dict)
        return l/len(rollout), g_n, v_n, Q_target, u

    def work(self,gamma,sess,coord,saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2
        
        print ("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                #episode_buffer = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
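                # Epsilon-greedy exploration: epsilon decays by a factor of
                # 0.997 per episode down to a floor of roughly 0.01.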
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    #self.env.render()
                    GLOBAL_STEP += 1
                    #Take an action using probabilities from policy network output.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(self.local_Q.Q, feed_dict={self.local_Q.inputs:[s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)
                    
                    s1, r, d, _ = self.env.step(a)
                    if not d:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s,a,r,s1,d])
                    episode_reward += r
                    s = s1                    
                    total_steps += 1
                    episode_step_count += 1
                    # Train every other environment step once 50,000 warm-up
                    # steps have been collected and the episode is still running.
                    if total_steps % 2 == 0 and not d and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(batch_size)
                        l,g_n,v_n,Q_target,u = self.train(episode_buffer,sess,gamma,ISWeights)
                        u = np.mean(u,axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx,u)
                        #sess.run(self.update_local_ops)
                    if d:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:', \
                              GLOBAL_STEP, 'mean episode reward: ', np.mean(self.episode_rewards[-10:]), \
                              'epsilon: ', epsilon)
                    
                    print ('loss', l, 'Qtargetmean', np.mean(Q_target))
                    #print 'p_target', p_target
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(sess,self.model_path+'/qr-dqn-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                    #if episode_count%1==0:
                        #print('\r {} {}'.format(episode_count, episode_reward),end=' ')
                episode_count += 1