Example #1
    def __init__(self, device, data):
        self.data = data
        self.actor = Actor().to(device)
        self.critic = Critic().to(device)
        #self.ctarget = Critic().to(device)
        self.actor_opt = torch.optim.Adam(itertools.chain(
            self.actor.parameters()),
                                          lr=0.0001,
                                          betas=(0.0, 0.9))
        self.critic_opt = torch.optim.Adam(itertools.chain(
            self.critic.parameters()),
                                           lr=0.001,
                                           betas=(0.0, 0.9))

        def init_weights(m):
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                torch.nn.init.xavier_uniform_(m.weight.data)

        self.actor.apply(init_weights)
        self.critic.apply(init_weights)
        #self.ctarget.apply(init_weights)

        self.device = device
        self.memory = ReplayMemory(1000, device=device)
        self.batch_size = 5
        self.GAMMA = 0.99
        self.count = 0
Example #2
 def __init__(self,device,actionsize):
     self.samplenet = DQN(actionsize).to(device)
     self.targetnet = DQN(actionsize).to(device)
     self.opt = torch.optim.Adam(itertools.chain(self.samplenet.parameters()),lr=0.00001,betas=(0.0,0.9))
     self.device = device
     self.memory = ReplayMemory(1000,device=device)
     self.BATCH_SIZE = 10
     self.GAMMA = 0.99
     self.count = 0
Example #3
class Sampler():

    def __init__(self,device,actionsize):
        self.samplenet = DQN(actionsize).to(device)
        self.targetnet = DQN(actionsize).to(device)
        self.opt = torch.optim.Adam(itertools.chain(self.samplenet.parameters()),lr=0.00001,betas=(0.0,0.9))
        self.device = device
        self.memory = ReplayMemory(1000,device=device)
        self.BATCH_SIZE = 10
        self.GAMMA = 0.99
        self.count = 0

    def select_action(self, model):
        self.samplenet.eval()
        action = self.samplenet(model.conv2.weight.data.view(-1,5,5).unsqueeze(0))
        return torch.max(action,1)[1]

    def step(self,state,action,next_state,reward,done):
        self.memory.push(state,action,next_state,reward,done)

        #don't bother if you don't have enough in memory
        if len(self.memory) >= self.BATCH_SIZE:
            self.optimize()

    def optimize(self):
        self.samplenet.train()
        self.targetnet.eval()
        s1,actions,r1,s2,d = self.memory.sample(self.BATCH_SIZE)

        # current Q-values and target Q-values for the Bellman update
        qvals = self.samplenet(s1)
        state_action_values = qvals.gather(1,actions[:,0].unsqueeze(1))
        with torch.no_grad():
            qvals_t = self.targetnet(s2)
            q1_t = qvals_t.max(1)[0].unsqueeze(1)

        expected_state_action_values = (q1_t * self.GAMMA) * (1-d) + r1

        # loss is the L2 (MSE) error of the Bellman equation
        loss = torch.nn.MSELoss()(state_action_values,expected_state_action_values)

        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        self.count += 1  # advance the update counter used to schedule target-network syncs
        if self.count % 20 == 0:
            self.targetnet.load_state_dict(self.samplenet.state_dict())

        return loss.item()
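
The ReplayMemory class used throughout these examples is not shown on this page. A minimal sketch that would satisfy the interface assumed above (a capacity argument, a device keyword, push, sample, and __len__) could look like the following; the tuple ordering returned by sample mirrors how the optimize() method above unpacks it, everything else is an assumption rather than the original implementation.

import random
from collections import deque

import torch


class ReplayMemory:
    """Minimal fixed-size transition buffer (illustrative sketch, not the original implementation)."""

    def __init__(self, capacity, device='cpu'):
        self.buffer = deque(maxlen=capacity)
        self.device = device

    def push(self, state, action, next_state, reward, done):
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)
        # Stack into batched tensors, returned in the (s1, actions, r1, s2, d)
        # order that the optimize() method above unpacks.
        s1 = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states]).to(self.device)
        a = torch.stack([torch.as_tensor(x, dtype=torch.long).view(-1) for x in actions]).to(self.device)
        r1 = torch.tensor(rewards, dtype=torch.float32, device=self.device).unsqueeze(1)
        s2 = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in next_states]).to(self.device)
        d = torch.tensor(dones, dtype=torch.float32, device=self.device).unsqueeze(1)
        return s1, a, r1, s2, d

    def __len__(self):
        return len(self.buffer)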
Example #4
	def __init__(self, session=None, arguments=None):

		self.sess = session
		self.args = arguments

		# Initialize Gym environment.
		self.environment = gym.make(self.args.env)        

		if self.args.env=='MountainCarContinuous-v0':
			input_dimensions = 2
			output_dimensions = 1
		elif self.args.env=='InvertedPendulum-v2':
			input_dimensions = 4
			output_dimensions = 1
		elif self.args.env=='FetchReach-v0':
			input_dimensions = 16
			output_dimensions = 4
		elif self.args.env=='FetchPush-v0':
			input_dimensions = 31
			output_dimensions = 4

		# Initialize a policy network.
		self.ACModel = ActorCriticModel(input_dimensions,output_dimensions,number_layers=4,hidden_units=40,sess=session,to_train=self.args.train, env=self.args.env)

		# Create the actual network
		if self.args.weights:
			self.ACModel.create_policy_network(session, pretrained_weights=self.args.weights,to_train=self.args.train)
		else:
			self.ACModel.create_policy_network(session, to_train=self.args.train)

		# Initialize a memory replay. 
		self.memory = ReplayMemory()

		# Create a trainer instance. 
		self.trainer = Trainer(sess=session,policy=self.ACModel, environment=self.environment, memory=self.memory,args=self.args)
Example #5
    def __init__(self, **config):
        self.config = config
        self.n_actions = self.config["n_actions"]
        self.state_shape = self.config["state_shape"]
        self.batch_size = self.config["batch_size"]
        self.gamma = self.config["gamma"]
        self.initial_mem_size_to_train = self.config[
            "initial_mem_size_to_train"]
        torch.manual_seed(self.config["seed"])

        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            torch.cuda.empty_cache()
            torch.cuda.manual_seed(self.config["seed"])
            torch.cuda.manual_seed_all(self.config["seed"])
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.memory = ReplayMemory(self.config["mem_size"],
                                   self.config["alpha"], self.config["seed"])
        self.v_min = self.config["v_min"]
        self.v_max = self.config["v_max"]
        self.n_atoms = self.config["n_atoms"]
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.n_atoms).to(self.device)
        self.delta_z = (self.v_max - self.v_min) / (self.n_atoms - 1)
        self.offset = torch.linspace(0, (self.batch_size - 1) * self.n_atoms, self.batch_size).long() \
            .unsqueeze(1).expand(self.batch_size, self.n_atoms).to(self.device)

        self.n_step = self.config["n_step"]
        self.n_step_buffer = deque(maxlen=self.n_step)

        self.online_model = Model(self.state_shape, self.n_actions,
                                  self.n_atoms, self.support,
                                  self.device).to(self.device)
        self.target_model = Model(self.state_shape, self.n_actions,
                                  self.n_atoms, self.support,
                                  self.device).to(self.device)
        self.hard_update_target_network()

        self.optimizer = Adam(self.online_model.parameters(),
                              lr=self.config["lr"],
                              eps=self.config["adam_eps"])
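
The offset tensor built above is later used (see Example #14) to turn per-sample atom indices into flat indices over a (batch_size * n_atoms)-long view for index_add_. A small standalone illustration of its contents, with arbitrary batch_size and n_atoms values chosen only for this sketch:

import torch

batch_size, n_atoms = 3, 4
offset = torch.linspace(0, (batch_size - 1) * n_atoms, batch_size).long() \
    .unsqueeze(1).expand(batch_size, n_atoms)
print(offset)
# tensor([[0, 0, 0, 0],
#         [4, 4, 4, 4],
#         [8, 8, 8, 8]])
# Adding offset to the per-sample lower/upper atom indices shifts sample i's
# indices by i * n_atoms, so index_add_ on the flattened distribution scatters
# each sample's probability mass into its own row.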
Example #6
def get_player(pred_model: keras.Model, strat_model: keras.Model) -> RnnPlayer:
    small_replay_memory = ReplayMemory(1)
    small_rnn_replay_memory = RnnReplayMemory(1)

    prediction_network = PredictionNetwork(pred_model, small_replay_memory, 1,
                                           False)
    strategy_network = RnnStrategyNetwork(strat_model, small_rnn_replay_memory,
                                          1, False)

    return RnnPlayer(prediction_network, strategy_network, 0.0, 0.0)
Example #7
    def __init__(self, session=None, arguments=None):

        self.sess = session
        self.args = arguments

        # Initialize Gym environment.
        self.environment = gym.make(self.args.env)

        if self.args.env == 'FetchReach-v0':
            input_dimensions = 16
        elif self.args.env == 'FetchPush-v0':
            input_dimensions = 31

        output_dimensions = 4

        # Initialize a policy network.
        # self.ACModel = ActorCriticModel(input_dimensions,output_dimensions,sess=session,to_train=self.args.train)
        self.PolicyModel = DAggerPolicy(input_dimensions,
                                        output_dimensions,
                                        name_scope='PolicyModel',
                                        sess=session,
                                        to_train=self.args.train)

        # Create the actual network
        if self.args.weights:
            self.PolicyModel.create_policy_network(
                session,
                pretrained_weights=self.args.weights,
                to_train=self.args.train)
        else:
            self.PolicyModel.create_policy_network(session,
                                                   to_train=self.args.train)

        # Initialize a memory replay.
        self.memory = ReplayMemory()

        # Create a trainer instance.
        self.trainer = Trainer(sess=session,
                               policy=self.PolicyModel,
                               environment=self.environment,
                               memory=self.memory,
                               args=self.args)
Example #8
class Generator():
    def __init__(self, device, data):
        self.data = data
        self.actor = Actor().to(device)
        self.critic = Critic().to(device)
        #self.ctarget = Critic().to(device)
        self.actor_opt = torch.optim.Adam(itertools.chain(
            self.actor.parameters()),
                                          lr=0.0001,
                                          betas=(0.0, 0.9))
        self.critic_opt = torch.optim.Adam(itertools.chain(
            self.critic.parameters()),
                                           lr=0.001,
                                           betas=(0.0, 0.9))

        def init_weights(m):
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                torch.nn.init.xavier_uniform_(m.weight.data)

        self.actor.apply(init_weights)
        self.critic.apply(init_weights)
        #self.ctarget.apply(init_weights)

        self.device = device
        self.memory = ReplayMemory(1000, device=device)
        self.batch_size = 5
        self.GAMMA = 0.99
        self.count = 0

    def select_action(self, imgs):
        with torch.no_grad():
            self.actor.eval()
            action = self.actor(imgs)
            return action

    def step(self, state, action, next_state, reward, done):
        self.memory.push(state, action, next_state, reward, done)

        if len(self.memory) >= self.batch_size:
            self.optimize()

    def optimize(self):
        self.actor.train()
        self.critic.train()
        #self.ctarget.eval()

        s1, a, r, s2, d = self.memory.sample(self.batch_size)

        #train the critic
        for reward, action in zip(r, a):
            qval = self.critic(action)
            avgQ = qval.mean().unsqueeze(0)
            loss = torch.nn.L1Loss()(avgQ, reward)
            self.critic_opt.zero_grad()
            loss.backward()
            self.critic_opt.step()

        #train the actor
        img, target = self.data[random.randint(0, len(self.data) - 1)]
        batch = self.actor(img)
        score = self.critic(batch)
        actor_loss = -score.mean()
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        #if self.count % 5 == 0:
        #    self.ctarget.load_state_dict(self.critic.state_dict())
        #self.count += 1

    def save(self):
        torch.save(self.actor.state_dict(), os.path.join('model', 'actor.pth'))
        torch.save(self.critic.state_dict(),
                   os.path.join('model', 'critic.pth'))
Example #9
def main():
    # create replay memories
    pred_memories = [ReplayMemory(prediction_replay_memory_size) for _ in range(1)]
    strat_memories = [RnnReplayMemory(strategy_replay_memory_size) for _ in range(1)]

    # create Networks
    pred_networks = [
        PredictionNetwork(prediction_resnet(), pred_memories[0], prediction_net_batch_size, True),
    ]
    print(pred_networks[0]._neural_network.summary())
    strat_networks = [
        RnnStrategyNetwork(strategy_deep_lstm_resnet(), strat_memories[0], strategy_net_batch_size, True),
    ]
    print(strat_networks[0]._neural_network.summary())

    # make pairs of the networks
    networks = list(sum(zip(pred_networks, strat_networks), ()))

    # give each network a name
    pred_network_names = [
        'normal_prediction'
    ]
    strat_network_names = [
        'normal_strategy'
    ]

    # make the same pairs as above
    network_names = list(sum(zip(pred_network_names, strat_network_names), ()))

    # create players
    players = [
        [RnnPlayer(pred_networks[0], strat_networks[0], prediction_exploration_rate, strategy_exploration_rate)
         for _ in range(4)],
    ]

    # flatten players
    players = sum(players, [])

    # create one PlayerInterlayer for each player
    players = [
        [RnnPlayerInterlayer(player, normal_pred_y_func, normal_strat_y_func) for player in players]
    ]
    players = sum(players, [])

    # create one Sitting
    sitting = Sitting(debugging)
    last_stop = datetime.datetime.now()
    r = random.Random()
    with open('stats_dev.txt', 'w') as f:
        f.write("// interval to print stats: " + str(interval_to_print_stats) + "\n")
        total_diff = 0
        total_losses = [0.0 for _ in range(len(networks))]
        for i in range(start_offset, total_rounds, 10):
            sitting.set_players(r.sample(players, 4))
            for _ in range(10):
                total_diff += sitting.play_full_round()
            i += 9
            if only_train_in_turn:
                index_to_train = i // turn_size % len(networks)
                total_losses[index_to_train] += networks[index_to_train].train()
            else:
                for net_i, network in enumerate(networks):
                    total_losses[net_i] += network.train()
            if (i + 1) % interval_to_print_stats == 0:
                print(str(i + 1), "rounds have been played")
                avg = total_diff / 4 / interval_to_print_stats
                print("Average difference of one player:\t", avg)
                losses_string = ', '.join([str(l) for l in np.array(total_losses) / interval_to_print_stats])
                print("The losses are:\t", losses_string)
                print("It took:", datetime.datetime.now() - last_stop)
                last_stop = datetime.datetime.now()
                print('')
                f.write(str(i + 1) + "\n")
                f.write(str(avg) + "\n")
                f.write(losses_string + "\n")
                total_diff = 0
                total_losses = [0.0 for _ in range(len(networks))]
            if (i + 1) % rounds_until_save == 0:
                for keras_net, net_name in zip(networks, network_names):
                    if 'random' in net_name:
                        continue
                    elif 'pred' in net_name:
                        full_name = prediction_save_path
                    elif 'strat' in net_name:
                        full_name = strategy_save_path
                    else:
                        assert 0, net_name
                    full_name += net_name + '_' + str(i + 1) + '.h5'
                    keras_net.save_network(full_name)
            if i + 1 == round_when_adding_players:
                print('adding players')
                # add 2 more normal players
                nps = [RnnPlayer(networks[-2], networks[-1], prediction_exploration_rate, strategy_exploration_rate)
                       for _ in range(2)]
                inps = [RnnPlayerInterlayer(nps[i], normal_pred_y_func, normal_strat_y_func) for i in range(2)]
                players += inps

                # add 2 static versions of the current normal player
                pred_mem = ReplayMemory(1)
                strat_mem = RnnReplayMemory(1)
                pred_net = load_model(prediction_save_path + 'normal_prediction_' + str(i + 1) + '.h5')
                strat_net = load_model(strategy_save_path + 'normal_strategy_' + str(i + 1) + '.h5')
                p_net = PredictionNetwork(pred_net, pred_mem, 1, False)
                s_net = RnnStrategyNetwork(strat_net, strat_mem, 1, False)
                ps = [RnnPlayer(p_net, s_net, 0.02, 0.02) for _ in range(2)]
                ips = [RnnPlayerInterlayer(ps[i], normal_pred_y_func, normal_strat_y_func) for i in range(2)]
                players += ips
Example #10
 def __init__(self,
              observation_space,
              action_space,
              device,
              gamma=0.99,
              actor_lr=1e-4,
              critic_lr=1e-3,
              batch_size=64,
              memory_size=50000,
              tau=1e-3,
              weight_decay=1e-2,
              writer=None,
              is_image=False):
     super(DdpgAgent, self).__init__()
     self.num_state = observation_space.shape[0]
     self.num_action = action_space.shape[0]
     self.state_mean = None
     self.state_halfwidth = None
     if abs(observation_space.high[0]) != math.inf:
         self.state_mean = 0.5 * (observation_space.high +
                                  observation_space.low)
         self.state_halfwidth = 0.5 * (observation_space.high -
                                       observation_space.low)
     self.gamma = gamma
     self.batch_size = batch_size
     self.device = device
     self.actor = ActorNetwork(self.num_state,
                               action_space,
                               device,
                               is_image=is_image).to(self.device)
     self.actor_target = ActorNetwork(self.num_state,
                                      action_space,
                                      device,
                                      is_image=is_image).to(self.device)
     self.actor_target.load_state_dict(self.actor.state_dict())
     self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
     self.critic = CriticNetwork(self.num_state,
                                 action_space,
                                 device,
                                 is_image=is_image).to(self.device)
     self.critic_target = CriticNetwork(self.num_state,
                                        action_space,
                                        device,
                                        is_image=is_image).to(self.device)
     self.critic_target.load_state_dict(self.critic.state_dict())
     self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                        lr=critic_lr,
                                        weight_decay=weight_decay)
     self.memory = ReplayMemory(observation_space,
                                action_space,
                                device,
                                num_state=self.num_state,
                                memory_size=memory_size,
                                is_image=is_image)
     self.criterion = nn.SmoothL1Loss()
     self.device = torch.device(
         "cuda:0" if torch.cuda.is_available() else "cpu")
     self.tau = tau
     self.writer = writer
     self.update_step = 0
     self.is_image = is_image
Example #11
class Td3Agent:
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.99,
                 actor_lr=5e-3,
                 critic_lr=5e-3,
                 batch_size=100,
                 memory_size=50000,
                 tau=1e-3,
                 weight_decay=1e-2,
                 sigma=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 writer=None,
                 is_image=False):
        super(Td3Agent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 *
                                 (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high +
                                     observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high -
                                          observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.actor = ActorNetwork(self.num_state,
                                  action_space,
                                  device,
                                  is_image=is_image).to(self.device)
        self.actor_target = ActorNetwork(self.num_state,
                                         action_space,
                                         device,
                                         is_image=is_image).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = CriticNetwork(self.num_state,
                                    action_space,
                                    device,
                                    is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state,
                                           action_space,
                                           device,
                                           is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

        self.memory = ReplayMemory(observation_space,
                                   action_space,
                                   device,
                                   num_state=self.num_state,
                                   memory_size=memory_size,
                                   is_image=is_image)
        self.criterion = nn.SmoothL1Loss()
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

    def normalize_state(self, state):
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self):
        self.update_step += 1
        with torch.no_grad():
            batch, indices, probability_distribution = self.memory.random_sample(
            )
            # fetch the state and action values for each sample in the batch
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            next_state_batch = batch['next_obs'].clone().to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)

            noise = (torch.randn_like(action_batch) * self.sigma).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_state_batch) + noise).clamp(
                -self.action_mean - self.action_halfwidth,
                self.action_mean + self.action_halfwidth)

            target_q1, target_q2 = self.critic_target(next_state_batch,
                                                      next_action)
            target_q = torch.min(target_q1, target_q2)

            target_q_values = reward_batch + self.gamma * target_q * (
                1 - terminate_batch)

        self.actor.train()
        self.critic.train()

        current_q1, current_q2 = self.critic(state_batch, action_batch)
        # compute the critic loss
        critic_loss = self.criterion(current_q1,
                                     target_q_values) + self.criterion(
                                         current_q2, target_q_values)
        # reset the gradients to zero
        self.critic_optimizer.zero_grad()
        # backpropagate the error
        critic_loss.backward()
        # apply the gradient update
        self.critic_optimizer.step()

        if self.writer and self.update_step % 1000 == 0:
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   self.update_step / 1000)
            #print("loss/critic", critic_loss.item())

        if self.update_step % self.policy_freq == 0:
            actor_loss = -self.critic.q1_forward(
                state_batch, self.actor(state_batch)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            if self.writer and self.update_step % 1000 == 0:
                self.writer.add_scalar("loss/actor", actor_loss.item(),
                                       self.update_step / 1000)
                #print("loss/actor", actor_loss.item())
            self.soft_update(self.actor_target, self.actor)
            self.soft_update(self.critic_target, self.critic)
        self.actor.eval()
        self.critic.eval()

    # select an action from the deterministic policy (optionally with exploration noise)
    def get_action(self, state, is_noise=True):
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(
                                            -1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(
                                            self.device)
        with torch.no_grad():
            action = self.actor(state_tensor).view(self.num_action)
            noise = np.random.normal(loc=0.0, scale=self.sigma)
            action_with_noise = np.clip(
                action.to('cpu').detach().numpy().copy() + noise, -1, 1)
            action = action.to('cpu').detach().numpy().copy()
        if not is_noise:
            return action
        return action_with_noise
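
The no_grad block in update() above forms the TD3 target y = r + gamma * (1 - done) * min(Q1', Q2'), where the target action is the target actor's output perturbed by clipped noise (target policy smoothing). Below is a self-contained numeric sketch of that target computation; every tensor is a made-up stand-in, not an output of the networks in this example:

import torch

gamma, sigma, noise_clip = 0.99, 0.2, 0.5
reward = torch.tensor([[1.0], [0.0]])
terminate = torch.tensor([[0.0], [1.0]])

# Stand-in for actor_target(next_state_batch); in the agent this comes from the network.
next_action = torch.tensor([[0.3], [-0.7]])
noise = (torch.randn_like(next_action) * sigma).clamp(-noise_clip, noise_clip)
smoothed_action = (next_action + noise).clamp(-1.0, 1.0)

# Stand-ins for critic_target(next_state_batch, smoothed_action); the clipped
# double-Q trick takes the element-wise minimum of the two heads.
target_q1 = torch.tensor([[2.0], [3.0]])
target_q2 = torch.tensor([[1.5], [3.5]])
target_q = torch.min(target_q1, target_q2)

target_q_values = reward + gamma * target_q * (1 - terminate)
# -> [[1.0 + 0.99 * 1.5], [0.0]] == [[2.485], [0.0]]
print(smoothed_action, target_q_values)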
Example #12
    def __init__(
            self,
            observation_space,
            action_space,
            device,
            gamma=0.995,
            actor_lr=5e-4,
            critic_lr=5e-4,
            batch_size=128,
            memory_size=50000,
            tau=5e-3,
            weight_decay=1e-2,
            sigma=0.2,
            noise_clip=0.5,
            alpha=0.2,
            alpha_lr=3e-4,
            rollout_length=2048,
            lambda_=0.95,
            beta_clone=1.0,
            coef_ent=0.01,
            num_updates=32,
            policy_epoch=1,
            value_epoch=1,
            aux_num_updates=6,
            aux_epoch_batch=64,
            max_grad_norm=0.5,
            aux_critic_loss_coef=1.0,
            clip_eps=0.2,
            writer=None,
            is_image=False,
            clip_aux_critic_loss=None,
            clip_aux_multinet_critic_loss=None,
            multipleet_upadte_clip_grad_norm=None,
            summary_interval=1,
            debug_no_aux_phase=False):
        super(PpgAgent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 * (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.multipleNet = MultipleNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.multipleNet_target = MultipleNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.multipleNet_target.load_state_dict(self.multipleNet.state_dict())
        self.multipleNet_optimizer = optim.Adam(self.multipleNet.parameters(), lr=actor_lr)

        self.critic = CriticNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr = critic_lr, weight_decay=weight_decay)

        self.alpha = alpha
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr = alpha_lr)

        self.memory = ReplayMemory(observation_space, action_space, device, num_state = self.num_state, memory_size = memory_size, is_image = is_image)
        self.criterion = nn.MSELoss()
        self.device = device
        self.tau = tau
        self.writer = writer
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.rollout_length = rollout_length
        self.lambda_ = lambda_
        self.coef_ent = coef_ent
        self.aux_critic_loss_coef = aux_critic_loss_coef
        self.max_grad_norm = max_grad_norm
        self.aux_num_updates = aux_num_updates
        self.clip_eps = clip_eps
        self.beta_clone = beta_clone
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.num_updates = num_updates
        self.aux_epoch_batch = aux_epoch_batch
        self.clip_aux_critic_loss = clip_aux_critic_loss
        self.clip_aux_multinet_critic_loss = clip_aux_multinet_critic_loss
        self.multipleet_upadte_clip_grad_norm = multipleet_upadte_clip_grad_norm
        self.summary_interval = summary_interval
        self.debug_no_aux_phase = debug_no_aux_phase
        self.update_step = 0
Example #13
class PpgAgent:
    def __init__(
            self,
            observation_space,
            action_space,
            device,
            gamma=0.995,
            actor_lr=5e-4,
            critic_lr=5e-4,
            batch_size=128,
            memory_size=50000,
            tau=5e-3,
            weight_decay=1e-2,
            sigma=0.2,
            noise_clip=0.5,
            alpha=0.2,
            alpha_lr=3e-4,
            rollout_length=2048,
            lambda_=0.95,
            beta_clone=1.0,
            coef_ent=0.01,
            num_updates=32,
            policy_epoch=1,
            value_epoch=1,
            aux_num_updates=6,
            aux_epoch_batch=64,
            max_grad_norm=0.5,
            aux_critic_loss_coef=1.0,
            clip_eps=0.2,
            writer=None,
            is_image=False,
            clip_aux_critic_loss=None,
            clip_aux_multinet_critic_loss=None,
            multipleet_upadte_clip_grad_norm=None,
            summary_interval=1,
            debug_no_aux_phase=False):
        super(PpgAgent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 * (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.multipleNet = MultipleNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.multipleNet_target = MultipleNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.multipleNet_target.load_state_dict(self.multipleNet.state_dict())
        self.multipleNet_optimizer = optim.Adam(self.multipleNet.parameters(), lr=actor_lr)

        self.critic = CriticNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image = is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr = critic_lr, weight_decay=weight_decay)

        self.alpha = alpha
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr = alpha_lr)

        self.memory = ReplayMemory(observation_space, action_space, device, num_state = self.num_state, memory_size = memory_size, is_image = is_image)
        self.criterion = nn.MSELoss()
        self.device = device
        self.tau = tau
        self.writer = writer
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.rollout_length = rollout_length
        self.lambda_ = lambda_
        self.coef_ent = coef_ent
        self.aux_critic_loss_coef = aux_critic_loss_coef
        self.max_grad_norm = max_grad_norm
        self.aux_num_updates = aux_num_updates
        self.clip_eps = clip_eps
        self.beta_clone = beta_clone
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.num_updates = num_updates
        self.aux_epoch_batch = aux_epoch_batch
        self.clip_aux_critic_loss = clip_aux_critic_loss
        self.clip_aux_multinet_critic_loss = clip_aux_multinet_critic_loss
        self.multipleet_upadte_clip_grad_norm = multipleet_upadte_clip_grad_norm
        self.summary_interval = summary_interval
        self.debug_no_aux_phase = debug_no_aux_phase
        self.update_step = 0

    def normalize_state(self, state):
        """return normalized state
        """
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        """Polyark update
        """
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(
                self.tau * param.data + (1 - self.tau) * target_param.data
            )

    def is_update(self, steps):
        """update in rollout length interval
        """
        return steps % self.rollout_length == 0

    def update(self, state = None):
        """Training process, according to the original paper, trainign process is follow:
        initialize replay buffer B
        1. perform rollout N_{\pi} times \\ Policy phase
        2. update multiplenet, loss =  L^{clip} + Bhevior Cloning Loss
           update critic, loss = L^{value} (using GAE)
        3. update multiplenet, loss =  L^{joint} \\ Auxiliary Phase
           update critic, loss = L^{value} (using GAE)
        4. reset B
        """
        if not self.is_update(self.memory.index):
            return
        self.update_step += 1
        # sample from replay buffer
        with torch.no_grad():
            batch = self.memory.sample(state)
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            log_pis_batch = batch['log_pis'].to(self.device)
            values = self.critic(state_batch)

        # calculate the value target (\hat{V}_t^{targ}) for each state
        targets, advantages = util.calculate_advantage(values, reward_batch, terminate_batch, self.gamma, self.lambda_)

        # 2. policy phase, update multiplenet and critic
        # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, line 6-7
        loss_critic = 0
        for i in range(self.value_epoch):
            indices = np.arange(self.rollout_length)
            np.random.shuffle(indices)
            for start in range(0, self.rollout_length, self.batch_size):
                idxes = indices[start:start + self.batch_size]
                # accumulate minibatch critic losses; they are averaged below
                loss_critic += self.update_critic(state_batch[idxes], targets[idxes])

        n_train_iteration = self.value_epoch * (self.rollout_length // self.batch_size)
        loss_critic = loss_critic / n_train_iteration
        self.writer.add_scalar('/critic/loss/policy_phase', loss_critic, self.update_step)

        # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, line 8-9
        loss_actor, l_clip, bc_loss = 0, 0, 0
        for i in range(self.policy_epoch):
            indices = np.arange(self.rollout_length)
            np.random.shuffle(indices)
            for start in range(0, self.rollout_length, self.batch_size):
                idxes = indices[start:start + self.batch_size]
                batch_loss_actor, batch_l_clip, batch_bc_loss = self.update_MultipleNet(state_batch[idxes], action_batch[idxes], log_pis_batch[idxes], advantages[idxes])
                # accumulate minibatch losses; they are averaged below
                loss_actor += batch_loss_actor
                l_clip += batch_l_clip
                bc_loss += batch_bc_loss

        # average the losses and write them to TensorBoard
        n_train_iteration = self.policy_epoch * (self.rollout_length // self.batch_size)
        loss_actor = loss_actor / n_train_iteration
        l_clip = l_clip / n_train_iteration
        bc_loss = bc_loss / n_train_iteration
        self.writer.add_scalar('/multiplenet/policy_phase/actor', loss_actor, self.update_step)
        self.writer.add_scalar('/multiplenet/policy_phase/l_clip', l_clip, self.update_step)
        self.writer.add_scalar('/multiplenet/policy_phase/bc_loss', bc_loss, self.update_step)

        with torch.no_grad():
            log_pis_old = self.multipleNet.evaluate_log_pi(state_batch[:-1], action_batch)

        # 3. auxiliary phase, update multiplenet and critic
        # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, line 12-14
        # if self.debug_no_aux_phase is True, skip this phase, which makes this code roughly equivalent to PPO
        loss_critic_multi, bc_loss, loss_joint, loss_critic_aux = 0, 0, 0, 0
        if (self.update_step % self.num_updates == 0) and (not self.debug_no_aux_phase):
            for _ in range(self.aux_num_updates):
                indices = np.arange(self.rollout_length)
                np.random.shuffle(indices)
                for start in range(0, self.rollout_length, self.batch_size):
                    idxes = indices[start:start + self.batch_size]
                    batch_critic_multi, batch_bc_loss, batch_joint = self.update_actor_Auxiliary(state_batch[idxes], action_batch[idxes], log_pis_old[idxes], targets[idxes], advantages[idxes])
                    batch_critic_aux = self.update_critic_Auxiliary(state_batch[idxes], targets[idxes])
                    # accumulate minibatch losses; they are averaged below
                    loss_critic_multi += batch_critic_multi
                    bc_loss += batch_bc_loss
                    loss_joint += batch_joint
                    loss_critic_aux += batch_critic_aux
            # 4. initialize replay buffer to empty
            # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, line 2
            self.memory.reset()

            # average the losses and write them to TensorBoard
            n_train_iteration = self.aux_num_updates * (self.rollout_length // self.batch_size)
            loss_critic_multi = loss_critic_multi / n_train_iteration
            bc_loss = bc_loss / n_train_iteration
            loss_joint = loss_joint / n_train_iteration
            loss_critic_aux = loss_critic_aux / n_train_iteration
            self.writer.add_scalar('/multiplenet/loss/auxialry_phase/critic', loss_critic_multi, self.update_step)
            self.writer.add_scalar('/multiplenet/loss/auxialry_phase/bc_loss', bc_loss, self.update_step)
            self.writer.add_scalar('/multiplenet/loss/auxialry_phase/loss_joint', loss_joint, self.update_step)
            self.writer.add_scalar('/critic/loss/auxialry_phase/critic', loss_critic_aux, self.update_step)
        self.multipleNet.eval()
        self.critic.eval()

    def update_actor_Auxiliary(self, states, actions, log_pis_old, targets, advantages):
        """loss = L^{joint}
        L^{joint} = L^{aux} + \beta_{clone} * KL(\pi_{old}, \pi_{current})
        In the original paper, L^{aux} = mse(v_{\pi}(s_t), v_targ) \\ task for V_{\theta_\pi}
        """
        loss_critic = (self.multipleNet.q_forward(states) - targets).pow_(2).mean() * 0.5
        if self.clip_aux_multinet_critic_loss is not None:
            loss_critic = torch.clamp(loss_critic, min=0, max=self.clip_aux_multinet_critic_loss)
        loss_critic = self.aux_critic_loss_coef * loss_critic
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        pis_old = log_pis_old.exp_()
        kl_loss = (pis_old * (log_pis - log_pis_old)).mean()
        
        loss_joint = loss_critic + self.beta_clone * kl_loss
        self.multipleNet_optimizer.zero_grad()
        loss_joint.backward(retain_graph=False)
        self.multipleNet_optimizer.step()
        return loss_critic, self.beta_clone * kl_loss, loss_joint
        
    def update_critic_Auxiliary(self, states, targets):
        """loss = L^{value} = mse(v(s) - v_targ)
        """
        # add * 0.5 according to https://arxiv.org/pdf/2009.04416.pdf page 2
        loss_critic_aux = (self.critic(states) - targets).pow_(2).mean() * 0.5
        if self.clip_aux_critic_loss is not None:
            loss_critic_aux = torch.clamp(loss_critic_aux, min=0, max=self.clip_aux_critic_loss)
        self.critic_optimizer.zero_grad()
        loss_critic_aux.backward(retain_graph=False)
        # nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        return loss_critic_aux

    def update_critic(self, states, targets):
        """loss = L^{value} = mse(v(s) - v_targ)
        """
        # add * 0.5 according to https://arxiv.org/pdf/2009.04416.pdf page 2
        loss_critic = (self.critic(states) - targets).pow_(2).mean() * 0.5
        self.critic_optimizer.zero_grad()
        loss_critic.backward(retain_graph=False)
        # nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        return loss_critic

    def update_MultipleNet(self, states, actions, log_pis_old, advantages):
        """policy phase, update multiplenet.
        loss =  L^{clip} + behavir_cloing_loss
        """
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        mean_ent = log_pis.mean()
        ratios = (log_pis - log_pis_old).exp_()
        loss_actor1 = -ratios * advantages
        loss_actor2 = -torch.clamp(
            ratios,
            1.0 - self.clip_eps,
            1.0 + self.clip_eps
        ) * advantages
        l_clip = torch.max(loss_actor1, loss_actor2).mean()
        bc_loss = self.coef_ent * mean_ent
        loss_actor = l_clip + bc_loss
        self.multipleNet_optimizer.zero_grad()
        loss_actor.backward(retain_graph=False)
        if self.multipleet_upadte_clip_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.multipleNet.parameters(),
                                           self.multipleet_upadte_clip_grad_norm)
        #nn.utils.clip_grad_norm_(self.multipleNet.parameters(), self.max_grad_norm)
        self.multipleNet_optimizer.step()
        return loss_actor, l_clip, bc_loss

    def get_action(self, state):
        """select action that has maximus Q value.
        """
        self.multipleNet.eval()
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state), dtype=torch.float).view(-1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255., dtype=torch.float).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action, log_pis = self.multipleNet.sample(state_tensor)
            action = action.view(self.num_action).to('cpu').detach().numpy().copy()
        return action, log_pis
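
util.calculate_advantage is referenced in update() above but is not shown on this page. Below is a sketch of a GAE(lambda) computation with the same call signature and return order (targets, advantages); it assumes values holds one extra bootstrap value at the end, which is an assumption about the original helper rather than something this example states:

import torch


def calculate_advantage_sketch(values, rewards, terminates, gamma, lambda_):
    """GAE(lambda) sketch; values is assumed to have length T + 1 (bootstrap value appended)."""
    T = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(T)):
        non_terminal = 1.0 - terminates[t]
        # one-step TD error
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        # exponentially weighted sum of TD errors
        gae = delta + gamma * lambda_ * non_terminal * gae
        advantages[t] = gae
    targets = advantages + values[:-1]
    return targets, advantages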
Example #14
class Agent:
    def __init__(self, **config):
        self.config = config
        self.n_actions = self.config["n_actions"]
        self.state_shape = self.config["state_shape"]
        self.batch_size = self.config["batch_size"]
        self.gamma = self.config["gamma"]
        self.initial_mem_size_to_train = self.config[
            "initial_mem_size_to_train"]
        torch.manual_seed(self.config["seed"])

        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            torch.cuda.empty_cache()
            torch.cuda.manual_seed(self.config["seed"])
            torch.cuda.manual_seed_all(self.config["seed"])
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.memory = ReplayMemory(self.config["mem_size"],
                                   self.config["alpha"], self.config["seed"])
        self.v_min = self.config["v_min"]
        self.v_max = self.config["v_max"]
        self.n_atoms = self.config["n_atoms"]
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.n_atoms).to(self.device)
        self.delta_z = (self.v_max - self.v_min) / (self.n_atoms - 1)
        self.offset = torch.linspace(0, (self.batch_size - 1) * self.n_atoms, self.batch_size).long() \
            .unsqueeze(1).expand(self.batch_size, self.n_atoms).to(self.device)

        self.n_step = self.config["n_step"]
        self.n_step_buffer = deque(maxlen=self.n_step)

        self.online_model = Model(self.state_shape, self.n_actions,
                                  self.n_atoms, self.support,
                                  self.device).to(self.device)
        self.target_model = Model(self.state_shape, self.n_actions,
                                  self.n_atoms, self.support,
                                  self.device).to(self.device)
        self.hard_update_target_network()

        self.optimizer = Adam(self.online_model.parameters(),
                              lr=self.config["lr"],
                              eps=self.config["adam_eps"])

    def choose_action(self, state):
        state = np.expand_dims(state, axis=0)
        state = from_numpy(state).byte().to(self.device)
        with torch.no_grad():
            self.online_model.reset()
            action = self.online_model.get_q_value(state).argmax(-1)
        return action.item()

    def store(self, state, action, reward, next_state, done):
        """Save I/O s to store them in RAM and not to push pressure on GPU RAM """
        assert state.dtype == "uint8"
        assert next_state.dtype == "uint8"
        assert isinstance(reward, int)
        assert isinstance(done, bool)

        self.n_step_buffer.append((state, action, reward, next_state, done))
        if len(self.n_step_buffer) < self.n_step:
            return

        reward, next_state, done = self.get_n_step_returns()
        state, action, *_ = self.n_step_buffer.popleft()

        self.memory.add(state, np.uint8(action), reward, next_state, done)

    def soft_update_target_network(self, tau):
        for target_param, local_param in zip(self.target_model.parameters(),
                                             self.online_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        # self.target_model.train()
        for param in self.target_model.parameters():
            param.requires_grad = False

    def hard_update_target_network(self):
        self.target_model.load_state_dict(self.online_model.state_dict())
        # self.target_model.train()
        for param in self.target_model.parameters():
            param.requires_grad = False

    def unpack_batch(self, batch):
        batch = self.config["transition"](*zip(*batch))

        states = from_numpy(np.stack(batch.state)).to(self.device)
        actions = from_numpy(np.stack(batch.action)).to(self.device).view(
            (-1, 1))
        rewards = from_numpy(np.stack(batch.reward)).to(self.device).view(
            (-1, 1))
        next_states = from_numpy(np.stack(batch.next_state)).to(self.device)
        dones = from_numpy(np.stack(batch.done)).to(self.device).view((-1, 1))
        return states, actions, rewards, next_states, dones

    def train(self, beta):
        if len(self.memory) < self.initial_mem_size_to_train:
            return 0, 0  # as no loss
        batch, weights, indices = self.memory.sample(self.batch_size, beta)
        states, actions, rewards, next_states, dones = self.unpack_batch(batch)
        weights = from_numpy(weights).float().to(self.device)

        with torch.no_grad():
            self.online_model.reset()
            self.target_model.reset()
            q_eval_next = self.online_model.get_q_value(next_states)
            selected_actions = torch.argmax(q_eval_next, dim=-1)
            q_next = self.target_model(next_states)[range(self.batch_size),
                                                    selected_actions]

            projected_atoms = rewards + (self.gamma**
                                         self.n_step) * self.support * (~dones)
            projected_atoms = projected_atoms.clamp(min=self.v_min,
                                                    max=self.v_max)

            b = (projected_atoms - self.v_min) / self.delta_z
            lower_bound = b.floor().long()
            upper_bound = b.ceil().long()
            lower_bound[(upper_bound > 0) * (lower_bound == upper_bound)] -= 1
            upper_bound[(lower_bound < (self.n_atoms - 1)) *
                        (lower_bound == upper_bound)] += 1

            projected_dist = torch.zeros(q_next.size(),
                                         dtype=torch.float64).to(self.device)
            projected_dist.view(-1).index_add_(
                0, (lower_bound + self.offset).view(-1),
                (q_next * (upper_bound.float() - b)).view(-1))
            projected_dist.view(-1).index_add_(
                0, (upper_bound + self.offset).view(-1),
                (q_next * (b - lower_bound.float())).view(-1))

        eval_dist = self.online_model(states)[range(self.batch_size),
                                              actions.squeeze().long()]
        dqn_loss = -(projected_dist * torch.log(eval_dist + 1e-6)).sum(-1)
        td_error = dqn_loss.abs() + 1e-6
        self.memory.update_priorities(indices, td_error.detach().cpu().numpy())
        dqn_loss = (dqn_loss * weights).mean()

        self.optimizer.zero_grad()
        dqn_loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(
            self.online_model.parameters(), self.config["clip_grad_norm"])
        self.optimizer.step()

        return dqn_loss.item(), grad_norm.item()

    def ready_to_play(self, state_dict):
        self.online_model.load_state_dict(state_dict)
        self.online_model.eval()

    def get_n_step_returns(self):
        reward, next_state, done = self.n_step_buffer[-1][-3:]

        for transition in reversed(list(self.n_step_buffer)[:-1]):
            r, n_s, d = transition[-3:]

            reward = r + self.gamma * reward * (1 - d)
            next_state, done = (n_s, d) if d else (next_state, done)

        return reward, next_state, done
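
For reference, get_n_step_returns above folds the buffered rewards backwards, computing R = r_0 + gamma * r_1 + ... + gamma^(n-1) * r_(n-1) and truncating at the first terminal transition. A small self-contained check of that recurrence with made-up transitions:

# Illustrative values only; the (state, action, reward, next_state, done) tuples are invented.
gamma = 0.99
n_step_buffer = [
    (0, 1, 1.0, 1, False),
    (1, 0, 0.5, 2, False),
    (2, 1, 2.0, 3, False),
]

reward, next_state, done = n_step_buffer[-1][-3:]
for transition in reversed(n_step_buffer[:-1]):
    r, n_s, d = transition[-3:]
    reward = r + gamma * reward * (1 - d)
    next_state, done = (n_s, d) if d else (next_state, done)

print(reward, next_state, done)
# reward == 1.0 + 0.99 * (0.5 + 0.99 * 2.0), roughly 3.4552; next_state == 3, done == False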
Example #15
TARGET_UPDATE = 10
learningrate = 0.001

# Creates the environment, search strategy and agent
env = EnvironmentManager(device, "CarRacing-v0", actionDict)
strat = EpsilonGreedyStrategy(EPS_END, EPS_END, EPS_DECAY)
agent = Agent(strat, env.num_actions_available(), device)

# Creates the policy and target network
policy_net = DQN(env.get_screen_height(), env.get_screen_width(), env.num_actions_available(), n_latent_var).to(device)
target_net = DQN(env.get_screen_height(), env.get_screen_width(), env.num_actions_available(), n_latent_var).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=learningrate)
memory = ReplayMemory(10000)

InputLayer = keras.layers.Input(batch_shape=(None, 224, 224, 3))
road = keras.applications.MobileNetV2(input_tensor=InputLayer, weights=None, classes=2)
Nadam = keras.optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999)
road.compile(optimizer=Nadam, loss='mean_squared_error', metrics=['accuracy'])
road.load_weights('Unitygym.h5')
print("Loaded keras weights")

writer = open("DQNRoad.csv", mode="a")

def runner(num_episodes, max_timestep, BATCH_SIZE, env):
    episodeRew = []
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
Example #16

def print_play_message(card_tnr: np.ndarray):
    output_str = "The Network plays: "
    if card_tnr[1] == 0:
        tmp = rank_strings[3:4] + rank_strings[5:6] + rank_strings[:3] + rank_strings[4:5] + rank_strings[6:]
        output_str += tmp[card_tnr[0]]
    else:
        output_str += rank_strings[card_tnr[0]]
    output_str += suit_strings[card_tnr[1]]
    print(output_str)


assert len(sys.argv) == 3, sys.argv

pred_memory = ReplayMemory(1)
strat_memory = RnnReplayMemory(1)

pred_network = PredictionNetwork(keras.models.load_model(sys.argv[1]), pred_memory, batch_size=1, can_train=False)
strat_network = RnnStrategyNetwork(keras.models.load_model(sys.argv[2]), strat_memory, batch_size=1, can_train=False)

player = RnnPlayer(pred_network, strat_network, 0, 0)

absolute_position = int(input("What is the index of the player? "))
assert 0 <= absolute_position < 4, "the given absolute position is " + str(absolute_position)
player_inter = RnnPlayerInterlayer(player, sum, sum)
player_inter.set_absolute_position(absolute_position)

# get the trump suit
trump_string = None
while trump_string not in suit_strings:
Example #17
class DdpgAgent:
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.99,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 batch_size=64,
                 memory_size=50000,
                 tau=1e-3,
                 weight_decay=1e-2,
                 writer=None,
                 is_image=False):
        super(DdpgAgent, self).__init__()
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high +
                                     observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high -
                                          observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.actor = ActorNetwork(self.num_state,
                                  action_space,
                                  device,
                                  is_image=is_image).to(self.device)
        self.actor_target = ActorNetwork(self.num_state,
                                         action_space,
                                         device,
                                         is_image=is_image).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic = CriticNetwork(self.num_state,
                                    action_space,
                                    device,
                                    is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state,
                                           action_space,
                                           device,
                                           is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)
        self.memory = ReplayMemory(observation_space,
                                   action_space,
                                   device,
                                   num_state=self.num_state,
                                   memory_size=memory_size,
                                   is_image=is_image)
        self.criterion = nn.SmoothL1Loss()
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image

    def normalize_state(self, state):
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
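        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        # With the default tau = 1e-3 the target network trails the online
        # network with a time constant of roughly 1 / tau = 1000 update calls.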
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self):
        self.update_step += 1
        with torch.no_grad():
            batch, indices, probability_distribution = self.memory.random_sample()
            # pull the stored states, actions, rewards, etc. for each sampled transition
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            next_obs_batch = batch['next_obs'].clone().to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            next_q_value_index = self.actor_target(next_obs_batch)
            # evaluate the target Q-network at the next states for the target actor's actions
            next_q_value = self.critic_target(next_obs_batch,
                                              next_q_value_index)
            # build the TD target: r + gamma * Q_target(s', a') for non-terminal transitions
            target_q_values = reward_batch + self.gamma * next_q_value * (
                1 - terminate_batch)
        self.actor.train()
        self.critic.train()
        q_values = self.critic(state_batch, action_batch)
        # compute the critic loss
        critic_loss = self.criterion(q_values, target_q_values)
        # reset the gradients to zero
        self.critic_optimizer.zero_grad()
        # backpropagate
        critic_loss.backward()
        # apply the gradient step
        self.critic_optimizer.step()
        if self.writer and self.update_step % 1000 == 0:
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   self.update_step / 1000)
            #print("loss/critic", critic_loss.item())
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        if self.writer and self.update_step % 1000 == 0:
            self.writer.add_scalar("loss/actor", actor_loss.item(),
                                   self.update_step / 1000)
            #print("loss/actor", actor_loss.item())
        self.soft_update(self.actor_target, self.actor)
        self.soft_update(self.critic_target, self.critic)
        self.actor.eval()
        self.critic.eval()

    # select an action from the deterministic policy (plus optional exploration noise)
    def get_action(self, state, noise=None, timestep=0):
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(
                                            -1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(
                                            self.device)
        with torch.no_grad():
            action = self.actor(state_tensor).view(self.num_action)
            if noise is not None:
                noise = noise(timestep)
                action = np.clip(
                    action.to('cpu').detach().numpy().copy() + noise, -1, 1)
            else:
                action = np.clip(
                    action.to('cpu').detach().numpy().copy(), -1, 1)
        return action
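
# --- Minimal usage sketch (not part of the original example) -----------------
# Assumptions: the gym id "Pendulum-v1", the classic reset()/step() API, the
# exploration-noise helper and the memory.add(...) call are illustrative
# placeholders; the custom ReplayMemory's real storage interface is not shown
# above, so adapt these calls to it.
import gym
import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v1")
agent = DdpgAgent(env.observation_space, env.action_space, device)

def exploration_noise(timestep, sigma=0.1):
    # simple Gaussian stand-in for a decaying noise schedule
    return np.random.normal(0.0, sigma, size=env.action_space.shape)

total_steps = 0
for episode in range(200):
    state, done = env.reset(), False
    while not done:
        action = agent.get_action(state, noise=exploration_noise)
        # scale the agent's [-1, 1] output to the environment's action range
        next_state, reward, done, _ = env.step(action * env.action_space.high)
        agent.memory.add(state, action, next_state, reward, float(done))  # assumed API
        total_steps += 1
        if total_steps >= agent.batch_size:  # wait until a batch can be sampled
            agent.update()
        state = next_state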
예제 #18
0
class PpgAgent:
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.995,
                 actor_lr=5e-4,
                 critic_lr=5e-4,
                 batch_size=128,
                 memory_size=50000,
                 tau=5e-3,
                 weight_decay=1e-2,
                 sigma=0.2,
                 noise_clip=0.5,
                 alpha=0.2,
                 alpha_lr=3e-4,
                 rollout_length=2048,
                 lambda_=0.95,
                 beta_clone=1.0,
                 coef_ent=0.01,
                 num_updates=32,
                 policy_epoch=1,
                 value_epoch=1,
                 aux_num_updates=6,
                 aux_epoch_batch=16,
                 max_grad_norm=0.5,
                 clip_eps=0.2,
                 writer=None,
                 is_image=False):
        super(PpgAgent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 *
                                 (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high +
                                     observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high -
                                          observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.multipleNet = MultipleNetwork(self.num_state,
                                           action_space,
                                           device,
                                           is_image=is_image).to(self.device)
        self.multipleNet_target = MultipleNetwork(self.num_state,
                                                  action_space,
                                                  device,
                                                  is_image=is_image).to(
                                                      self.device)
        self.multipleNet_target.load_state_dict(self.multipleNet.state_dict())
        self.multipleNet_optimizer = optim.Adam(self.multipleNet.parameters(),
                                                lr=actor_lr)

        self.critic = CriticNetwork(self.num_state,
                                    action_space,
                                    device,
                                    is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state,
                                           action_space,
                                           device,
                                           is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

        self.alpha = alpha
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)

        self.memory = ReplayMemory(observation_space,
                                   action_space,
                                   device,
                                   num_state=self.num_state,
                                   memory_size=memory_size,
                                   is_image=is_image)
        self.criterion = nn.MSELoss()
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.rollout_length = rollout_length
        self.lambda_ = lambda_
        self.coef_ent = coef_ent
        self.max_grad_norm = max_grad_norm
        self.aux_num_updates = aux_num_updates
        self.clip_eps = clip_eps
        self.beta_clone = beta_clone
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.num_updates = num_updates
        self.aux_epoch_batch = aux_epoch_batch

    def normalize_state(self, state):
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def is_update(self, steps):
        return steps % self.rollout_length == 0

    def update(self, state=None):
        if not self.is_update(self.memory.index):
            return
        self.update_step += 1
        with torch.no_grad():
            batch = self.memory.sample(state)
            # pull the stored states, actions, rewards, terminals and log-probabilities for each sample
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            log_pis_batch = batch['log_pis'].to(self.device)

            values = self.critic(state_batch)
        targets, advantages = util.calculate_advantage(values, reward_batch,
                                                       terminate_batch,
                                                       self.gamma,
                                                       self.lambda_)
        for j in range(self.num_updates):
            for i in range(max(self.policy_epoch, self.value_epoch)):
                indices = np.arange(self.rollout_length)
                np.random.shuffle(indices)
                for start in range(0, self.rollout_length, self.batch_size):
                    idxes = indices[start:start + self.batch_size]
                    if self.policy_epoch > i:
                        self.update_MultipleNet(state_batch[idxes],
                                                action_batch[idxes],
                                                log_pis_batch[idxes],
                                                advantages[idxes])
                    if self.value_epoch > i:
                        self.update_critic(state_batch[idxes], targets[idxes])
        with torch.no_grad():
            log_pis_old = self.multipleNet.evaluate_log_pi(
                state_batch[:-1], action_batch)
        for _ in range(self.aux_num_updates):
            indices = np.arange(self.rollout_length)
            np.random.shuffle(indices)
            for start in range(0, self.rollout_length, self.aux_epoch_batch):
                idxes = indices[start:start + self.aux_epoch_batch]
                self.update_actor_Auxiliary(state_batch[idxes],
                                            action_batch[idxes],
                                            log_pis_old[idxes], targets[idxes],
                                            advantages[idxes])
                self.update_critic_Auxiliary(state_batch[idxes],
                                             targets[idxes])
        self.multipleNet.eval()
        self.critic.eval()

    def update_actor_Auxiliary(self, states, actions, log_pis_old, targets,
                               advantages):
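        # PPG auxiliary phase: refit the value head that shares parameters with
        # the policy while a beta_clone-weighted KL-style penalty keeps the
        # policy close to its pre-auxiliary behaviour.  Note that loss_bc below
        # is computed but never added to loss_joint in this implementation.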
        loss_critic = (self.multipleNet.q_forward(states) -
                       targets).pow_(2).mean()
        loss_bc = (self.multipleNet.p_forward(states) - actions).pow_(2).mean()
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        # use exp() (not in-place exp_()) so log_pis_old stays intact for the KL term below
        pis_old = log_pis_old.exp()
        kl_loss = (pis_old * (log_pis - log_pis_old)).mean()

        loss_joint = loss_critic + self.beta_clone * kl_loss
        self.multipleNet_optimizer.zero_grad()
        loss_joint.backward(retain_graph=False)
        self.multipleNet_optimizer.step()
        if self.update_step % 10 == 0:
            print("aux actor loss:", loss_joint.item())

    def update_critic_Auxiliary(self, states, targets):
        loss_critic_aux = (self.critic(states) - targets).pow_(2).mean()
        self.critic_optimizer.zero_grad()
        loss_critic_aux.backward(retain_graph=False)
        #nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        #if self.update_step % 50 == 0:
        #    print("aux critic loss:", loss_critic_aux.item())

    def update_critic(self, states, targets):
        loss_critic = (self.critic(states) - targets).pow_(2).mean()
        self.critic_optimizer.zero_grad()
        loss_critic.backward(retain_graph=False)
        #nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        #if self.update_step % 50 == 0:
        #    print("critic loss:", loss_critic.item())

    def update_MultipleNet(self, states, actions, log_pis_old, advantages):
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        if self.update_step % 50 == 0:
            print("log_pis:", log_pis)
        mean_ent = -log_pis.mean()
        ratios = (log_pis - log_pis_old).exp_()
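        # PPO-style clipped surrogate: take the pessimistic of the clipped and
        # unclipped objectives, minus an entropy bonus weighted by coef_ent
        # (so minimizing the loss also raises the policy's entropy)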
        loss_actor1 = -ratios * advantages
        loss_actor2 = -torch.clamp(ratios, 1.0 - self.clip_eps,
                                   1.0 + self.clip_eps) * advantages
        loss_actor = torch.max(loss_actor1,
                               loss_actor2).mean() - self.coef_ent * mean_ent
        self.multipleNet_optimizer.zero_grad()
        loss_actor.backward(retain_graph=False)
        #nn.utils.clip_grad_norm_(self.multipleNet.parameters(), self.max_grad_norm)
        self.multipleNet_optimizer.step()
        if self.update_step % 50 == 0:
            print("actor loss:", loss_actor.item())

    # sample an action and its log-probability from the current policy
    def get_action(self, state):
        self.multipleNet.eval()
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(
                                            -1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(
                                            self.device)
        with torch.no_grad():
            action, log_pis = self.multipleNet.sample(state_tensor)
            action = action.view(
                self.num_action).to('cpu').detach().numpy().copy()
        return action, log_pis
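
# --- Minimal rollout/update sketch (not part of the original example) --------
# Assumptions: the gym id "Pendulum-v1", the classic reset()/step() API and the
# memory.add(...) call are illustrative placeholders; adapt them to the real
# ReplayMemory interface used above.
import gym
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v1")
agent = PpgAgent(env.observation_space, env.action_space, device)

state = env.reset()
for step in range(100000):
    action, log_pi = agent.get_action(state)
    # scale the agent's [-1, 1] output to the environment's action range
    next_state, reward, done, _ = env.step(action * env.action_space.high)
    agent.memory.add(state, action, reward, done, log_pi)  # assumed API
    # update() returns immediately until memory.index is a multiple of rollout_length
    agent.update(next_state)
    state = env.reset() if done else next_state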