Example no. 1
0
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(
            state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)
Example no. 2
0
    def __init__(self, params):
        super(BCAgent, self).__init__(params)

        # Initialize policy network
        pol_params = self.params['p-bc']['pol_params']
        pol_params['input_size'] = self.N
        pol_params['output_size'] = self.M
        if 'final_activation' not in pol_params:
            pol_params['final_activation'] = torch.tanh

        self.pol = MLP(pol_params)

        # Create policy optimizer
        ppar = self.params['p-bc']['pol_optim']
        self.pol_optim = torch.optim.Adam(self.pol.parameters(),
                                          lr=ppar['lr'],
                                          weight_decay=ppar['reg'])

        # Use a replay buffer that will save planner actions
        self.pol_buf = ReplayBuffer(self.N, self.M,
                                    self.params['p-bc']['buf_size'])

        # Logging (store cum_rew, cum_emp_rew)
        self.hist['pols'] = np.zeros((self.T, 2))

        self.has_pol = True

        self.pol_cache = ()
Example no. 3
0
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))
        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)
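All of these snippets depend on a ReplayBuffer class whose implementation is not included here, and whose constructor signature varies from example to example. Purely as a sketch, assuming the add/sample/__len__ interface used in Example no. 3 above (and not the actual class from any of these repositories), a circular FIFO buffer could look like:

import random
from collections import deque


class ReplayBuffer:
    """Minimal FIFO replay buffer sketch; interface assumed, not taken from the source repos."""

    def __init__(self, max_size):
        # a deque with maxlen evicts the oldest transition once the buffer is full
        self.storage = deque(maxlen=max_size)

    def add(self, transition):
        # in Example no. 3 a transition is an (s, a, sp, r, gamma) tuple
        self.storage.append(transition)

    def sample(self, batch_size):
        # return the sampled transitions together with their indices, matching
        # the `samples, idcs = self.buffer.sample(n)` call sites in these examples
        idcs = random.sample(range(len(self.storage)), batch_size)
        return [self.storage[i] for i in idcs], idcs

    def __len__(self):
        return len(self.storage)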
Example no. 4
0
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64,
                                    self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(),
                                             self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(),
                                             self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)

        self.global_steps = 0
Example no. 5
0
  def __init__(self, network, prep, exp_policy, state_dim, action_dim, name, learning_rate=1e-3,
               hard_update_frequency=500, soft_update_rate=None, buffer_size=50000, batch_size=32, num_steps=200000,
               discount=0.99, use_huber_loss=True, detailed_summary=False, max_reward=200, steps_before_learn=1000,
               train_freq=1, save_end=True):

    self.network = network
    self.prep = prep
    self.exp_policy = exp_policy
    self.greedy_policy = policy.Greedy()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.discount = discount
    self.summary_dir = os.path.join(name, "summary")
    self.use_huber_loss = use_huber_loss
    self.detailed_summary = detailed_summary

    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.hard_update_frequency = hard_update_frequency
    self.soft_update_rate = soft_update_rate
    self.num_steps = num_steps
    self.step = 0
    self.steps_before_learn = steps_before_learn
    self.train_freq = train_freq
    self.solved = False
    self.max_reward = max_reward
    self.save_end = save_end

    self.actions = None
    self.rewards = None
    self.done = None
    self.action_q_values = None
    self.max_target_q_values = None
    self.targets = None
    self.global_step = None
    self.inc_global_step = None
    self.train_op = None
    self.states = None
    self.q_values = None
    self.next_states = None
    self.target_q_values = None
    self.target_update = None

    self.build_all()
  
    self.merged = tf.summary.merge_all()

    self.session = tf.Session()

    self.summary_dir = utils.new_summary_dir(self.summary_dir)
    self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)

    self.saver = tf.train.Saver(max_to_keep=None)

    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)
Example no. 6
0
    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)
        self.back_values = []
        self.stay_values = []
        self.forward_values = []
        self.td_loss = []
        self.state_array = state_array
Example no. 7
0
    def __init__(self, args):
        self.args = args
        self.policy = [Q_net(args) for _ in range(args.n_agents)]
        self.hyperNet = HyperNet(args)
        self.policy_target = [copy.deepcopy(p) for p in self.policy]
        self.hyperNet_target = copy.deepcopy(self.hyperNet)
        self.replayBuffer = ReplayBuffer(args)
        self.preference_pool = Preference(args)
        policy_param = [policy.parameters() for policy in self.policy]
        self.optim = torch.optim.Adam(
            itertools.chain(*policy_param, self.hyperNet.parameters()),
            lr=self.args.learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optim,
                                                            step_size=10,
                                                            gamma=0.95,
                                                            last_epoch=-1)
Example no. 8
0
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size,
                                      random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size,
                                       random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size,
                                         random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        #Statistics
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }
Example no. 9
0
    def __init__(self, features: int, actions: int, params: Dict, seed: int, collector: Collector):
        self.features = features
        self.actions = actions
        self.params = params
        self.collector = collector
        self.seed = seed

        # define parameter contract
        self.gamma = params['gamma']
        self.epsilon = params.get('epsilon', 0)
        # the mellowmax parameter
        self.omega = params.get('omega', 1.0)

        # set up network for estimating Q(s, a)
        self.value_net = Network(features, actions, params, seed).to(device)

        # build the optimizer
        self.optimizer_params = params['optimizer']
        self.optimizer = deserializeOptimizer(self.value_net.parameters(), self.optimizer_params)

        self.steps = 0

        # set up the replay buffer
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch']
        self.buffer_type = params.get('buffer', 'standard')

        if self.buffer_type == 'per':
            prioritization = params['prioritization']
            self.buffer = PrioritizedReplayMemory(self.buffer_size, prioritization)
        else:
            self.buffer = ReplayBuffer(self.buffer_size)

        # build a target network
        self.target_refresh = params.get('target_refresh', 1)
        self.target_net = copy.deepcopy(self.value_net)
        self.initializeTargetNet()

        def getValues(x: torch.Tensor):
            qs = self.values(x).detach().cpu().squeeze(0).numpy()
            return qs

        self.policy = createEpsilonGreedy(seed, self.epsilon, getValues)
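For illustration, the parameter contract read by this constructor would be satisfied by a dictionary along the following lines. The concrete values are placeholders, and the format of the 'optimizer' entry depends on deserializeOptimizer, which is not shown here.

# Hypothetical `params` for the agent above; keys mirror the reads in __init__,
# values are placeholders, and the 'optimizer' spec is an assumption.
example_params = {
    'gamma': 0.99,
    'epsilon': 0.1,                # optional, defaults to 0
    'omega': 1.0,                  # mellowmax parameter, optional, defaults to 1.0
    'optimizer': {'name': 'adam', 'alpha': 1e-3},  # consumed by deserializeOptimizer
    'buffer_size': 10000,
    'batch': 32,
    'buffer': 'per',               # or 'standard' (the default)
    'prioritization': 0.6,         # only read when buffer == 'per'
    'target_refresh': 8,           # optional, defaults to 1
}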
Example no. 10
0
    def __init__(self, params):
        super(POLOAgent, self).__init__(params)
        self.H_backup = self.params['polo']['H_backup']

        # Create ensemble of value functions
        model_params = params['polo']['ens_params']['model_params']
        model_params['input_size'] = self.N
        model_params['output_size'] = 1

        params['polo']['ens_params']['dtype'] = self.dtype
        params['polo']['ens_params']['device'] = self.device

        self.val_ens = Ensemble(self.params['polo']['ens_params'])

        # Learn from replay buffer
        self.polo_buf = ReplayBuffer(self.N, self.M,
                                     self.params['polo']['buf_size'])

        # Value (from forward), value mean, value std
        self.hist['vals'] = np.zeros((self.T, 3))
Example no. 11
0
	def __init__(self, args, env = None):
		self.args = args
		# actor
		self.actor = DeterministicPolicy(128).to(device)
		self.actor_target = DeterministicPolicy(128).to(device)
		self.actor_target.load_state_dict(self.actor.state_dict())
		self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)
		# critics
		self.critic = QNetwork(128).to(device)
		self.critic_target = QNetwork(128).to(device)
		self.critic_target.load_state_dict(self.critic.state_dict())
		self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

		self.replay_buffer = ReplayBuffer(self.args.capacity)
		self.num_critic_update_iteration = 0
		self.num_actor_update_iteration = 0
		self.num_training = 0
		self.global_steps = 0

		self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
		self.env = env
Example no. 12
0
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2,
                                   actions).to(device)
        self.bpolicy_net.load_state_dict(
            torch.load(
                "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
            ))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)
Example no. 13
0
    def init_run(self):
        self.log("Starting init")
        self.r_sum = 0

        if self.state_rep == SQUARE:
            self.state_proc = SquareAroundHeadState(radius=self.state_radius,
                                                    step_forward=self.step_forward, flatten=self.flatten)
        elif self.state_rep == DIAMOND:
            self.state_proc = DiamondAroundHeadState(radius=self.state_radius,
                                                     step_forward=self.step_forward, flatten=self.flatten)
        elif self.state_rep == RADAR:
            self.state_proc = RadarState(num_per_type=NUM_PER_TYPE)

        elif self.state_rep == RADAR_PLUS:
            self.state_proc = DoubleStateWrapper(
                SquareAroundHeadState(radius=self.state_radius, step_forward=self.step_forward, flatten=self.flatten),
                RadarState(num_per_type=NUM_PER_TYPE))

        self.input_shape = self.state_proc.get_shape()

        self.model = self._build_model()
        self.model.summary()

        if self.huber_loss:
            loss = huber_loss
        else:
            loss = 'mse'

        opt = Adam(self.learning_rate)
        self.model.compile(loss=loss, optimizer=opt)

        self.old_model = keras.models.clone_model(self.model)
        self._save_model()

        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.log("Init finished!")

        self.num_of_samples = 0
        self.sum_of_loss = 0
Example no. 14
0
    def __init__(self, features, actions, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)
        self.back_values = []
        self.stay_values = []
        self.forward_values = []
        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']
Example no. 15
0
    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))

        self.back_values = []
        self.stay_values = []
        self.forward_values = []

        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []

        self.td_loss = []
        self.state_array = state_array
        self.penultimate_features = []

        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']
Example no. 16
0
    def __init__(self,
                 prep,
                 build,
                 policy,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 buffer_size=10000,
                 batch_size=32,
                 steps_before_train=100,
                 train_freq=1,
                 num_steps=1000000,
                 learning_rate=1e-3,
                 update_rate=1e-3,
                 max_reward=None,
                 detailed_summary=False):

        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps

        self.step = 0
        self.solved = False

        self.state_layers = [64, 32]

        self.mu_layers = [16, 8, self.action_dim]

        self.l_layers = [16, 8, (self.action_dim * (self.action_dim + 1)) // 2]

        self.v_layers = [16, 8, 1]

        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.build()

        self.merged = tf.summary.merge_all()

        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)
Example no. 17
0
class SAC(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64,
                                    self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(),
                                             self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(),
                                             self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)

        self.global_steps = 0

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # get the next action and compute target Q
            with torch.no_grad():
                next_action, log_prob, _ = self.actor.sample(next_state)
                target_Q1 = self.critic_target_1(next_state, next_action)
                target_Q2 = self.critic_target_2(next_state, next_action)
                target_Q = torch.min(target_Q1,
                                     target_Q2) - self.args.alpha * log_prob
                y_Q = reward + self.args.gamma * (1 - done) * target_Q

            # update critic
            current_Q1 = self.critic_1(state, action)
            critic_loss1 = F.mse_loss(current_Q1, y_Q)
            self.critic_optimizer_1.zero_grad()
            critic_loss1.backward()
            self.critic_optimizer_1.step()

            current_Q2 = self.critic_2(state, action)
            critic_loss2 = F.mse_loss(current_Q2, y_Q)
            self.critic_optimizer_2.zero_grad()
            critic_loss2.backward()
            self.critic_optimizer_2.step()

            # update actor
            actor_action, actor_log_prob, _ = self.actor.sample(state)
            Q1 = self.critic_1(state, actor_action)
            Q2 = self.critic_2(state, actor_action)
            actor_loss = -(torch.min(Q1, Q2) -
                           self.args.alpha * actor_log_prob).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target network
            for param, target_param in zip(self.critic_1.parameters(),
                                           self.critic_target_1.parameters()):
                target_param.data.copy_((1 - self.args.tau) *
                                        target_param.data +
                                        self.args.tau * param.data)

            for param, target_param in zip(self.critic_2.parameters(),
                                           self.critic_target_2.parameters()):
                target_param.data.copy_((1 - self.args.tau) *
                                        target_param.data +
                                        self.args.tau * param.data)

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    ep_r = 0
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()

        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            state = self.env.reset()
            done = False
            total_rews = 0
            time_step = 0
            while not done:
                with torch.no_grad():
                    # use the mean action
                    action, _, _ = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        return rewards.max(), rewards.min(), rewards.mean()

    def save(self, episode):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
                'critic_target_1': self.critic_target_1.state_dict(),
                'critic_target_2': self.critic_target_2.state_dict()
            }, file_name)
        print("save model to " + file_name)

    def load(self, episode):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        self.critic_target_1.load_state_dict(checkpoint['critic_target_1'])
        self.critic_target_2.load_state_dict(checkpoint['critic_target_2'])
        print("successfully load model from " + file_name)
Example no. 18
0
                       MINI_BATCH, TAU, 0.001, L2C)

##################
# graph auxiliries
##################
saver = tf.train.Saver()
init = tf.global_variables_initializer()
summary = tf.summary.merge_all()

logger = tf.summary.FileWriter(OUT_DIR, sess.graph)

# initialize mdp state structure
mdp = MDP_state(STATE_SIZE, FRAMES)

# initialize replay buffer
R = ReplayBuffer(MDP_STATE_SIZE, ACTION_SIZE, BUFFER_SIZE)
buf = R.LoadBuffer(OUT_DIR + BUFFER_FILE)
if buf:
    EXP_PROB = EPSILON
    populated = R.GetOccupency()
    print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
    print("Creating new replay buffer")

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# define action discretization
Example no. 19
0
class DQN(BaseAgent):
    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))

        self.back_values = []
        self.stay_values = []
        self.forward_values = []

        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []

        self.td_loss = []
        self.state_array = state_array
        self.penultimate_features = []

        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        # organize the mini-batch so that we can request "columns" from the data
        # e.g. we can get all of the actions, or all of the states with a single call
        batch = getBatchColumns(samples)

        # compute Q(s, a) for each sample in mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze()

        self.penultimate_features.append(x)

        # by default Q(s', a') = 0 unless the next states are non-terminal

        Qspap = torch.zeros(batch.size, device=device)
        # for i in range(len(batch.actions.numpy())):
        #     if batch.actions.numpy()[i][0] == 0:
        #         self.back_values.append(Qsa.detach().numpy()[i])
        #     elif batch.actions.numpy()[i][0] == 1:
        #         self.stay_values.append(Qsa.detach().numpy()[i])
        #     elif batch.actions.numpy()[i][0] == 2:
        #         self.forward_values.append(Qsa.detach().numpy()[i])

        # if we don't have any non-terminal next states, then no need to bootstrap
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        self.td_loss.append(td_loss.detach().numpy())

        # self.td_loss = self.td_loss + list(td_loss.detach().numpy())

        Qs_state_array, _ = self.policy_net(self.state_array)

        Qsa_mean_states = torch.mean(Qs_state_array, 0)

        self.back_values.append(Qsa_mean_states[0].detach().numpy())
        self.stay_values.append(Qsa_mean_states[1].detach().numpy())
        self.forward_values.append(Qsa_mean_states[2].detach().numpy())

        # update the *policy network* using the combined gradients
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        batch = getBatchColumns(samples)
        Qs, x = q_net(batch.states)

        Qsa = Qs.squeeze()  # per-sample Q estimates used for the TD loss below
        # for i in range(len(batch.actions)):
        #     storeList.append(Qsa.detach().numpy()[i])
        Qspap = torch.zeros(batch.size, device=device)

        ############  ============  CHECK ================= ###############################
        if batch.nterm_sp.shape[0] > 0:
            ##  Qsp, _ = target_q_net(batch.nterm_sp) #### Is this correct ????

            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)

            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        ############  ============  CHECK ================= ###############################
        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        Qs_state_array, _ = q_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].detach().numpy())

        # update the *policy network* using the combined gradients
        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        if a.cpu().numpy() == 0:
            self.buffer_BACK.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 1:
            self.buffer_STAY.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 2:
            self.buffer_FORWARD.add((s, a, sp, r, gamma))

        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)
            self.back_q_net.cloneWeightsTo(self.back_target_q_net)
            self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)
            self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        back_sample_count = math.floor(
            self.ratioMap.backward_ratio * self.sampleSize)
        stay_sample_count = math.floor(
            self.ratioMap.stay_ratio * self.sampleSize)
        forward_sample_count = math.floor(
            self.ratioMap.forward_ratio * self.sampleSize)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer_BACK) > back_sample_count \
                and len(self.buffer_STAY) > stay_sample_count \
                and len(self.buffer_FORWARD) > forward_sample_count:

            samplesBack, idcs = self.buffer_BACK.sample(back_sample_count)
            samplesStay, idcs = self.buffer_STAY.sample(stay_sample_count)
            samplesForward, idcs = self.buffer_FORWARD.sample(forward_sample_count)
            self.updateActionNet(samplesBack, self.back_q_net, self.back_target_q_net, self.optimizerBack,
                                 self.back_values_baseline)
            self.updateActionNet(samplesStay, self.stay_q_net, self.stay_target_q_net, self.optimizerStay,
                                 self.stay_values_baseline)
            self.updateActionNet(samplesForward, self.forward_q_net, self.forward_target_q_net, self.optimizerForward,
                                 self.forward_values_baseline)
            samples = samplesBack + samplesStay + samplesForward

            self.updateNetwork(samples)
Example no. 20
0
class BCAgent(POLOAgent):
    """
    An agent extending upon POLO that uses behavior cloning on the planner
    predicted actions as a prior to MPC.
    """
    def __init__(self, params):
        super(BCAgent, self).__init__(params)

        # Initialize policy network
        pol_params = self.params['p-bc']['pol_params']
        pol_params['input_size'] = self.N
        pol_params['output_size'] = self.M
        if 'final_activation' not in pol_params:
            pol_params['final_activation'] = torch.tanh

        self.pol = MLP(pol_params)

        # Create policy optimizer
        ppar = self.params['p-bc']['pol_optim']
        self.pol_optim = torch.optim.Adam(self.pol.parameters(),
                                          lr=ppar['lr'],
                                          weight_decay=ppar['reg'])

        # Use a replay buffer that will save planner actions
        self.pol_buf = ReplayBuffer(self.N, self.M,
                                    self.params['p-bc']['buf_size'])

        # Logging (store cum_rew, cum_emp_rew)
        self.hist['pols'] = np.zeros((self.T, 2))

        self.has_pol = True

        self.pol_cache = ()

    def get_action(self):
        """
        BCAgent generates a planned trajectory using the behavior-cloned policy
        and then optimizes it via MPC.
        """
        self.pol.eval()

        # Run a rollout using the policy starting from the current state
        infos = self.get_traj_info()

        self.hist['pols'][self.time] = infos[3:5]
        self.pol_cache = (infos[0], infos[2])

        self.prior_actions = infos[1]

        # Generate trajectory via MPC with the prior actions as a prior
        action = super(BCAgent, self).get_action(prior=self.prior_actions)

        # Add final planning trajectory to BC buffer
        fin_states, fin_rews = self.cache[2], self.cache[3]
        fin_states = np.concatenate(([self.prev_obs], fin_states[1:]))
        pb_pct = self.params['p-bc']['pb_pct']
        pb_len = int(pb_pct * fin_states.shape[0])
        for t in range(pb_len):
            self.pol_buf.update(fin_states[t], fin_states[t + 1], fin_rews[t],
                                self.planned_actions[t], False)

        return action

    def do_updates(self):
        """
        Learn from the saved buffer of planned actions.
        """
        super(BCAgent, self).do_updates()

        if self.time % self.params['p-bc']['update_freq'] == 0:
            self.update_pol()

    def update_pol(self):
        """
        Update the policy via BC on the planner actions.
        """
        self.pol.train()

        params = self.params['p-bc']

        # Generate batches for training
        size = min(self.pol_buf.size, self.pol_buf.total_in)
        num_inds = params['batch_size'] * params['grad_steps']
        inds = np.random.randint(0, size, size=num_inds)

        states = self.pol_buf.buffer['s'][inds]
        acts = self.pol_buf.buffer['a'][inds]

        states = torch.tensor(states, dtype=self.dtype)
        actions = torch.tensor(acts, dtype=self.dtype)

        for i in range(params['grad_steps']):
            bi, ei = i * params['batch_size'], (i + 1) * params['batch_size']

            # Train based on L2 distance between actions and predictions
            preds = self.pol.forward(states[bi:ei])
            preds = torch.squeeze(preds, dim=-1)
            targets = torch.squeeze(actions[bi:ei], dim=-1)

            loss = torch.nn.functional.mse_loss(preds, targets)

            self.pol_optim.zero_grad()
            loss.backward()
            self.pol_optim.step()

    def get_traj_info(self):
        """
        Run the policy for a full trajectory and return details about the
        trajectory.
        """
        env_state = self.env.sim.get_state() if self.mujoco else None

        infos = traj.eval_traj(copy.deepcopy(self.env),
                               env_state,
                               self.prev_obs,
                               mujoco=self.mujoco,
                               perturb=self.perturb,
                               H=self.H,
                               gamma=self.gamma,
                               act_mode='deter',
                               pt=(self.pol, 0),
                               terminal=self.val_ens,
                               tvel=self.tvel)

        return infos

    def print_logs(self):
        """
        BC-specific logging information.
        """
        bi, ei = super(BCAgent, self).print_logs()

        self.print('BC metrics', mode='head')

        self.print('policy traj rew', self.hist['pols'][self.time - 1][0])
        self.print('policy traj emp rew', self.hist['pols'][self.time - 1][1])

        return bi, ei

    def test_policy(self):
        """
        Run the BC action selection mechanism.
        """
        env = copy.deepcopy(self.env)
        obs = env.reset()

        if self.tvel is not None:
            env.set_target_vel(self.tvel)
            obs = env._get_obs()

        env_state = env.sim.get_state() if self.mujoco else None
        infos = traj.eval_traj(env,
                               env_state,
                               obs,
                               mujoco=self.mujoco,
                               perturb=self.perturb,
                               H=self.eval_len,
                               gamma=1,
                               act_mode='deter',
                               pt=(self.pol, 0),
                               tvel=self.tvel)

        self.hist['pol_test'][self.time] = infos[3]
Example no. 21
0
    def __init__(self,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 actor_learning_rate=1e-5,
                 critic_learning_rate=1e-3,
                 critic_target_update_rate=1e-3,
                 actor_target_update_rate=1e-3,
                 discount=0.99,
                 l2_decay=1e-2,
                 buffer_size=1000000,
                 batch_size=64,
                 detail_summary=False,
                 tanh_action=True,
                 input_batch_norm=True,
                 all_batch_norm=True,
                 log_frequency=10):

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.critic_learning_rate = critic_learning_rate
        self.actor_learning_rate = actor_learning_rate
        self.critic_target_update_rate = critic_target_update_rate
        self.actor_target_update_rate = actor_target_update_rate
        self.discount = discount
        self.batch_size = batch_size
        self.l2_decay = l2_decay
        self.buffer_size = buffer_size
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detail_summary = detail_summary
        self.tanh_action = tanh_action
        self.input_batch_norm = input_batch_norm
        self.all_batch_norm = all_batch_norm
        self.log_frequency = log_frequency

        self.step = 0
        self.solved = False

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.__build()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "actor learning rate": self.actor_learning_rate,
                "critic learning rate": self.critic_learning_rate,
                "batch size": self.batch_size,
                "actor update rate": self.actor_target_update_rate,
                "critic update rate": self.critic_target_update_rate,
                "buffer size": self.buffer_size,
            })

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session = tf.Session()

        self.merged = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.session.run(init_op)
Example no. 22
0
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2,
                                   actions).to(device)
        self.bpolicy_net.load_state_dict(
            torch.load(
                "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
            ))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # take a random action about epsilon percent of the time
        q_s, _ = self.bpolicy_net(x)

        if q_s.shape[0] == 3:
            q_s = q_s.unsqueeze(0)
            #act = q_s.argmax().detach()

        # else:
        act = torch.max(q_s, 1).indices.detach().numpy()

        for i in range(act.shape[0]):
            action = act[i]
            if action == 1:
                if np.random.rand() < self.epsilon:
                    act[i] = np.random.choice([0, 2])

        # if act.cpu().numpy() == 1:
        #     if np.random.rand() < self.epsilon:
        #         a = np.random.randint(self.actions-1)

        # if np.random.rand() < self.epsilon:
        #     a = np.random.randint(self.actions)
        #     return torch.tensor(a, device=device)

        # # otherwise take a greedy action
        # q_s, _ = self.bpolicy_net(x)
        # # print(q_s)
        # return q_s.argmax().detach()
        act_tensor = torch.from_numpy(act).detach().to(device)

        return act_tensor

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, sp, r, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 200:
            samples, idcs = self.buffer.sample(200)
            self.updateNetwork(samples)
Example no. 23
0
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))
        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            a = np.random.randint(self.actions)
            return torch.tensor(a, device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        # print(q_s.detach().numpy()[0][3])
        print(q_s.argmax().detach())

        return q_s.argmax().detach()

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, r, sp, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1
        a = a.numpy()
        s = s.numpy()

        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 32:
            samples, idcs = self.buffer.sample(32)
            self.updateNetwork(samples)
Example no. 24
0
class NAF:

    MODEL_NAME = "NAF"
    TARGET_MODEL_NAME = "target-NAF"

    class Build(Enum):
        SINGLE = 1
        MULTIPLE = 2
        HYDRA = 3

    def __init__(self,
                 prep,
                 build,
                 policy,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 buffer_size=10000,
                 batch_size=32,
                 steps_before_train=100,
                 train_freq=1,
                 num_steps=1000000,
                 learning_rate=1e-3,
                 update_rate=1e-3,
                 max_reward=None,
                 detailed_summary=False):

        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps

        self.step = 0
        self.solved = False

        self.state_layers = [64, 32]

        self.mu_layers = [16, 8, self.action_dim]

        self.l_layers = [16, 8, (self.action_dim * (self.action_dim + 1)) // 2]

        self.v_layers = [16, 8, 1]

        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.build()

        self.merged = tf.summary.merge_all()

        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    def build(self):
        self.action_inputs = tf.placeholder(tf.float32,
                                            (None, self.action_dim))
        self.reward_inputs = tf.placeholder(tf.float32, (None, ))
        self.done = tf.placeholder(tf.float32, (None, ))

        self.state_inputs, self.state_outputs, self.mu_outputs, self.l_outputs, self.value_outputs = \
          self.build_network(self.MODEL_NAME)

        self.next_state_inputs, self.next_state_outputs, _, _, self.target_value_outputs = \
          self.build_network(self.TARGET_MODEL_NAME)

        self.target = tf.expand_dims(self.reward_inputs, 1) + self.discount * (
            1 - tf.expand_dims(self.done, 1)) * self.target_value_outputs

        # taken from https://github.com/carpedm20/NAF-tensorflow/blob/master/src/network.py
        pivot = 0
        rows = []
        for idx in range(self.action_dim):
            count = self.action_dim - idx

            diag_elem = tf.exp(tf.slice(self.l_outputs, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(self.l_outputs, (0, pivot + 1),
                                      (-1, count - 1))
            row = tf.pad(tf.concat((diag_elem, non_diag_elems), 1),
                         ((0, 0), (idx, 0)))
            rows.append(row)

            pivot += count

        L = tf.transpose(tf.stack(rows, axis=1), (0, 2, 1))
        P = tf.matmul(L, tf.transpose(L, (0, 2, 1)))

        adv_term = tf.expand_dims(self.action_inputs - self.mu_outputs, -1)
        self.advantages = -tf.matmul(tf.transpose(adv_term, [0, 2, 1]),
                                     tf.matmul(P, adv_term)) / 2
        self.advantages = tf.reshape(self.advantages, [-1, 1])

        self.q_values = self.advantages + self.value_outputs

        self.loss = tf.reduce_mean(
            architect.huber_loss(self.q_values -
                                 tf.stop_gradient(self.target)))

        tf.summary.scalar("training_loss", self.loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        self.create_target_update_op()

    def build_network(self, name):

        detailed_summary = self.detailed_summary
        if name == self.TARGET_MODEL_NAME:
            detailed_summary = False

        with tf.variable_scope(name):

            state_inputs = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim))

            if self.build_mode == self.Build.SINGLE:
                state_outputs = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs, [self.mu_layers[-1]],
                    "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs, [self.l_layers[-1]],
                    "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs, [self.v_layers[-1]],
                    "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.MULTIPLE:
                state_outputs = None
                mu_state = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="mu_state",
                    detailed_summary=detailed_summary)
                l_state = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="l_state",
                    detailed_summary=detailed_summary)
                value_state = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="value_state",
                    detailed_summary=detailed_summary)

                mu_outputs = architect.dense_block(
                    mu_state, [self.mu_layers[-1]],
                    "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    l_state, [self.l_layers[-1]],
                    "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    value_state, [self.v_layers[-1]],
                    "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.HYDRA:
                state_outputs = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs,
                    self.mu_layers,
                    "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs,
                    self.l_layers,
                    "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs,
                    self.v_layers,
                    "value_branch",
                    detailed_summary=detailed_summary)
            else:
                raise ValueError("Wrong build type.")

            return state_inputs, state_outputs, mu_outputs, l_outputs, value_outputs

    def create_target_update_op(self):
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.MODEL_NAME)
        target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.TARGET_MODEL_NAME)

        self.target_update = []
        for v_source, v_target in zip(net_vars, target_net_vars):
            # this is equivalent to target = (1-alpha) * target + alpha * source
            update_op = v_target.assign_sub(self.target_update_rate *
                                            (v_target - v_source))
            self.target_update.append(update_op)

        self.target_update = tf.group(*self.target_update)

    def learn(self):
        # learn
        batch = self.buffer.sample(self.batch_size)

        merged, targets, _ = self.session.run(
            [self.merged, self.target, self.train_op],
            feed_dict={
                self.state_inputs: batch["states"],
                self.action_inputs: batch["actions"],
                self.reward_inputs: batch["rewards"],
                self.next_state_inputs: batch["next_states"],
                self.done: batch["done"]
            })

        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update
        self.session.run(self.target_update)

    def run_episode(self, env):

        self.policy.reset()

        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play
            if skip:
                action = env.action_space.sample()
            else:
                action = self.session.run(self.mu_outputs,
                                          feed_dict={self.state_inputs:
                                                     state})[0]
                action = self.policy.add_noise(action)

            tmp_state = state
            tmp_skip = skip

            state, reward, done, _ = env.step(action)
            state, skip = self.prep.process(state)

            total_reward += reward

            if not tmp_skip and not skip:
                self.buffer.add({
                    "state": tmp_state[0],
                    "action": action,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_train and not self.solved:
                # learn
                for _ in range(self.train_freq):
                    self.learn()
                    _, self.step = self.session.run(
                        [self.inc_global_step, self.global_step])
            else:
                _, self.step = self.session.run(
                    [self.inc_global_step, self.global_step])

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        if self.max_reward is not None:
            if total_reward >= self.max_reward:
                self.solved = True
            else:
                self.solved = False

        if self.step == self.max_iters:
            self.saver.save(self.session,
                            self.summary_dir,
                            global_step=self.step)

        return total_reward, self.step

    def close(self):
        self.session.close()
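
The loop over action dimensions in build() above is the core NAF construction: the network emits a flat vector of length action_dim * (action_dim + 1) // 2, which is unpacked into a lower-triangular matrix L with an exponentiated diagonal, so that P = L L^T is positive definite and the advantage A(s, a) = -1/2 (a - mu)^T P (a - mu) peaks exactly at a = mu. A minimal NumPy sketch of the same unpacking for a single sample, for illustration only (it is not the TensorFlow graph built above):

import numpy as np

def naf_advantage(l_flat, mu, action):
    # unpack the flat vector into a lower-triangular matrix L,
    # exponentiating the diagonal so P = L @ L.T is positive definite
    d = mu.shape[0]
    L = np.zeros((d, d))
    pivot = 0
    for i in range(d):
        count = d - i
        L[i, i] = np.exp(l_flat[pivot])
        L[i + 1:, i] = l_flat[pivot + 1:pivot + count]
        pivot += count
    P = L @ L.T
    diff = (action - mu).reshape(-1, 1)
    # quadratic advantage, maximal (zero) at action == mu
    return (-0.5 * diff.T @ P @ diff).item()

# example: 2-D action -> 2 * 3 // 2 = 3 entries in l_flat
adv = naf_advantage(np.array([0.1, -0.3, 0.2]), mu=np.zeros(2), action=np.array([0.5, -0.2]))
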
Esempio n. 25
0
class DDPG:

    CRITIC_NAME = "critic"
    TARGET_CRITIC_NAME = "target_critic"

    ACTOR_NAME = "actor"
    TARGET_ACTOR_NAME = "target_actor"

    def __init__(self,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 actor_learning_rate=1e-5,
                 critic_learning_rate=1e-3,
                 critic_target_update_rate=1e-3,
                 actor_target_update_rate=1e-3,
                 discount=0.99,
                 l2_decay=1e-2,
                 buffer_size=1000000,
                 batch_size=64,
                 detail_summary=False,
                 tanh_action=True,
                 input_batch_norm=True,
                 all_batch_norm=True,
                 log_frequency=10):

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.critic_learning_rate = critic_learning_rate
        self.actor_learning_rate = actor_learning_rate
        self.critic_target_update_rate = critic_target_update_rate
        self.actor_target_update_rate = actor_target_update_rate
        self.discount = discount
        self.batch_size = batch_size
        self.l2_decay = l2_decay
        self.buffer_size = buffer_size
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detail_summary = detail_summary
        self.tanh_action = tanh_action
        self.input_batch_norm = input_batch_norm
        self.all_batch_norm = all_batch_norm
        self.log_frequency = log_frequency

        self.step = 0
        self.solved = False

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.__build()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "actor learning rate": self.actor_learning_rate,
                "critic learning rate": self.critic_learning_rate,
                "batch size": self.batch_size,
                "actor update rate": self.actor_target_update_rate,
                "critic update rate": self.critic_target_update_rate,
                "buffer size": self.buffer_size,
            })

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session = tf.Session()

        self.merged = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.session.run(init_op)

    """
  PUBLIC
  """

    def learn(self):

        batch = self.buffer.sample(self.batch_size)
        self.__train_critic(batch["states"], batch["actions"],
                            batch["rewards"], batch["next_states"],
                            batch["done"])
        self.__train_actor(batch["states"])

        self.session.run([
            self.target_critic_update, self.target_actor_update,
            self.inc_global_step
        ])

    def act(self, state):
        a = self.session.run(self.action,
                             feed_dict={
                                 self.state_input: state,
                                 self.is_training: False
                             })[0]
        return a

    def perceive(self, transition):
        self.buffer.add(transition)

    def log_scalar(self, name, value, index):
        summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=index)

    def save(self):
        self.saver.save(self.session,
                        self.summary_dir,
                        global_step=self.session.run(self.global_step))

    def close(self):
        self.session.close()

    """
  PRIVATE
  """

    def __build_critic(self, name, state_input, action_input):

        bn_training = self.is_training
        if name == self.TARGET_CRITIC_NAME:
            bn_training = False

        with tf.variable_scope(name):

            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")

            W2 = self.__get_weights((400, 300),
                                    400 + self.action_dim,
                                    name="W2")
            b2 = self.__get_weights((300, ), 400 + self.action_dim, name="b2")

            W2_action = self.__get_weights((self.action_dim, 300),
                                           400 + self.action_dim,
                                           name="W2_action")

            W3 = tf.Variable(tf.random_uniform((300, 1), -3e-3, 3e-3),
                             name="W3")
            b3 = tf.Variable(tf.random_uniform((1, ), -3e-3, 3e-3), name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            layer_1 = tf.matmul(state_input, W1) + b1

            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1,
                                                        training=bn_training)

            layer_1 = tf.nn.relu(layer_1)

            layer_2 = tf.nn.relu(
                tf.matmul(layer_1, W2) + tf.matmul(action_input, W2_action) +
                b2)

            output_layer = tf.matmul(layer_2, W3) + b3

            # summary
            if name == self.CRITIC_NAME:
                self.critic_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W2_action", W2_action),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            # weight decay
            weights = [W1, b1, W2, b2, W2_action, W3, b3]
            weight_decay = tf.add_n(
                [self.l2_decay * tf.nn.l2_loss(var) for var in weights])

            return output_layer, weight_decay

    def __build_actor(self, name, state_input):

        bn_training = self.is_training
        if name == self.TARGET_ACTOR_NAME:
            bn_training = False

        with tf.variable_scope(name):

            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")

            W2 = self.__get_weights((400, 300), 400, name="W2")
            b2 = self.__get_weights((300, ), 400, name="b2")

            W3 = tf.Variable(tf.random_uniform((300, self.action_dim),
                                               minval=-3e-3,
                                               maxval=3e-3),
                             name="W3")
            b3 = tf.Variable(tf.random_uniform((self.action_dim, ), -3e-3,
                                               3e-3),
                             name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            layer_1 = tf.matmul(state_input, W1) + b1

            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1,
                                                        training=bn_training)

            layer_1 = tf.nn.relu(layer_1)

            layer_2 = tf.matmul(layer_1, W2) + b2

            if self.all_batch_norm:
                layer_2 = tf.layers.batch_normalization(layer_2,
                                                        training=bn_training)

            layer_2 = tf.nn.relu(layer_2)

            output_layer = tf.matmul(layer_2, W3) + b3

            # summary
            if name == self.ACTOR_NAME:
                self.actor_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            if self.tanh_action:
                return tf.nn.tanh(output_layer)
            else:
                return output_layer

    def __build(self):

        self.state_input = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim),
                                          name="state_input")
        self.next_state_input = tf.placeholder(tf.float32,
                                               shape=(None, self.state_dim),
                                               name="next_state_input")
        self.action_input = tf.placeholder(tf.float32,
                                           shape=(None, self.action_dim),
                                           name="action_input")
        self.reward_input = tf.placeholder(tf.float32,
                                           shape=(None, ),
                                           name="reward_input")
        self.done_input = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name="done_input")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # inputs summary
        if self.detail_summary:
            self.input_summaries = [
                tf.summary.histogram("state", self.state_input),
                tf.summary.histogram("next_state", self.next_state_input),
                tf.summary.histogram("action", self.action_input),
                tf.summary.histogram("reward", self.reward_input),
                tf.summary.histogram("done", self.done_input)
            ]

        self.target_action = self.__build_actor(self.TARGET_ACTOR_NAME,
                                                self.next_state_input)

        self.q_value, weight_decay = self.__build_critic(
            self.CRITIC_NAME, self.state_input, self.action_input)
        self.target_q_value, _ = self.__build_critic(self.TARGET_CRITIC_NAME,
                                                     self.next_state_input,
                                                     self.target_action)

        self.tmp = tf.expand_dims(self.reward_input, 1)

        self.targets = tf.expand_dims(self.reward_input, 1) + self.discount * (
            1 - tf.expand_dims(self.done_input, 1)) * self.target_q_value
        self.diff = self.targets - self.q_value

        self.loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.targets) -
                      self.q_value)) + weight_decay
        self.loss_summary = tf.summary.scalar("critic_loss", self.loss)

        self.critic_train_op = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(self.loss)

        # add critic batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.critic_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.CRITIC_NAME)
            self.critic_bn_update_op = tf.group(*self.critic_bn_update_op)
            self.critic_train_op = tf.group(self.critic_train_op,
                                            self.critic_bn_update_op)

        self.action = self.__build_actor(self.ACTOR_NAME, self.state_input)
        self.actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope=self.ACTOR_NAME)
        self.action_gradients = tf.gradients(self.q_value,
                                             self.action_input)[0]
        self.actor_params_gradient = tf.gradients(self.action,
                                                  self.actor_params,
                                                  -self.action_gradients)

        # actor gradients summary
        if self.detail_summary:
            self.actor_summaries.append(
                tf.summary.histogram("action_gradient", self.action_gradients))
            for grad in self.actor_params_gradient:
                self.actor_summaries.append(
                    tf.summary.histogram("actor_parameter_gradients", grad))

        self.actor_train_op = tf.train.AdamOptimizer(
            self.actor_learning_rate).apply_gradients(
                zip(self.actor_params_gradient, self.actor_params))

        # add actor batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.actor_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.ACTOR_NAME)
            self.actor_bn_update_op = tf.group(*self.actor_bn_update_op)
            self.actor_train_op = tf.group(self.actor_train_op,
                                           self.actor_bn_update_op)

        self.target_critic_update = architect.create_target_update_ops(
            self.CRITIC_NAME, self.TARGET_CRITIC_NAME,
            self.critic_target_update_rate)
        self.target_actor_update = architect.create_target_update_ops(
            self.ACTOR_NAME, self.TARGET_ACTOR_NAME,
            self.actor_target_update_rate)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        # group summaries
        self.critic_summaries = tf.summary.merge(self.critic_summaries)

        if self.detail_summary:
            self.actor_summaries = tf.summary.merge(self.actor_summaries)
            self.input_summaries = tf.summary.merge(self.input_summaries)

    @staticmethod
    def __get_weights(shape, input_shape, name="var"):
        return tf.Variable(tf.random_uniform(shape,
                                             -1 / math.sqrt(input_shape),
                                             1 / math.sqrt(input_shape)),
                           name=name)

    def __train_actor(self, states):

        actions = self.session.run(self.action,
                                   feed_dict={
                                       self.state_input: states,
                                       self.is_training: True
                                   })

        self.session.run(self.actor_train_op,
                         feed_dict={
                             self.state_input: states,
                             self.action_input: actions,
                             self.is_training: True
                         })

    def __train_critic(self, states, actions, rewards, next_states, done):
        feed_dict = {
            self.state_input: states,
            self.action_input: actions,
            self.reward_input: rewards,
            self.next_state_input: next_states,
            self.done_input: done,
            self.is_training: True
        }
        step = self.session.run(self.global_step)

        if step % self.log_frequency == 0:

            ops = [self.critic_train_op, self.loss_summary]

            if self.detail_summary:
                ops.append(self.actor_summaries)
                ops.append(self.input_summaries)

            res = self.session.run(ops, feed_dict=feed_dict)

            self.summary_writer.add_summary(res[1], global_step=step)

            if self.detail_summary:
                self.summary_writer.add_summary(res[2], global_step=step)
                self.summary_writer.add_summary(res[3], global_step=step)
        else:
            self.session.run(self.critic_train_op, feed_dict=feed_dict)
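
A note on the actor update in __build() above: the explicit tf.gradients chain applies the sampled deterministic policy gradient, pushing -dQ/da through dmu/dtheta before apply_gradients. In an autograd framework the same update is usually written as a scalar loss, as the PyTorch examples below do; a minimal sketch, assuming actor and critic are ordinary differentiable nn.Module instances (the names here are placeholders, not identifiers from the snippet above):

import torch

def ddpg_actor_step(actor, critic, actor_optimizer, states):
    # maximize Q(s, mu(s)) by minimizing its negation; autograd carries
    # dQ/da into the actor parameters, the same chain the explicit
    # tf.gradients call builds above. Only the actor optimizer steps,
    # so the critic weights are left untouched here.
    actor_loss = -critic(states, actor(states)).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()
    return actor_loss.item()
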
Esempio n. 26
0
class DDPG_Agent(Agent):
    """Interacts with and learns from the environment."""
    policy_type = "DDPG"

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size,
                                      random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size,
                                       random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size,
                                         random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        #Statistics
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        action = self.actor_local.select_action(state)
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        #tmp = np.array((critic_loss.item(), actor_loss.item()))
        #print(tmp)
        # --------------------------- for the plot ----------------------------- #

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        with torch.no_grad():
            actions_pred_target = self.actor_target(states)
            actor_loss_target = -self.critic_target(
                states, actions_pred_target).mean()
            Q_expected_target = self.critic_target(states, actions)
            critic_loss_target = F.mse_loss(Q_expected_target, Q_targets)
            with open("saveDDPG_critic-actor_loss.csv", "a") as f:
                tmp = str(critic_loss_target.item()) + "," + str(
                    actor_loss_target.item()) + "\n"
                f.write(tmp)
            self.save_stats(actor_loss=actor_loss.item(),
                            critic_loss=critic_loss.item(),
                            reward_sum=rewards.sum().item())

    def store_policy(self, env_name, score):
        traced = torch.jit.script(self.actor_target)
        torch.jit.save(
            traced, "data/policies/" + "DDPGAgent" + str(env_name) + "#" +
            str(score) + ".zip")

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
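
A minimal usage sketch for the step/act/reset interface above, assuming a classic Gym environment whose reset() returns only the observation, and that the module-level constants the class relies on (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) are defined as in the original project:

import gym

env = gym.make("Pendulum-v1")  # placeholder environment id
agent = DDPG_Agent(state_size=env.observation_space.shape[0],
                   action_size=env.action_space.shape[0],
                   random_seed=0)

for episode in range(10):
    state = env.reset()
    agent.reset()                      # reset the OU noise process
    episode_reward = 0.0
    for t in range(200):
        action = agent.act(state)      # noisy action clipped to [-1, 1]
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(episode, episode_reward)
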
Esempio n. 27
0
class DDPG(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(
            state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # compute the target Q value
            next_action, _, _ = self.actor_target.sample(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (
                (1 - done) * self.args.gamma * target_Q).detach()

            # get current Q estimate
            current_Q = self.critic(state, action)

            # compute critic loss and update
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute actor loss
            actor_action, _, _ = self.actor.sample(state)
            actor_loss = -self.critic(state, actor_action).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target model
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]

                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()
        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            state = self.env.reset()
            while not done:
                with torch.no_grad():
                    # use the mean action
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        print("mean reward {}, max reward {}".format(rewards.mean(),
                                                     rewards.max()))

    def load(self, episode=None):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode=None):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("save model to " + file_name)
Esempio n. 28
0
class DDPG():
	def __init__(self, args, env = None):
		self.args = args
		# actor
		self.actor = DeterministicPolicy(128).to(device)
		self.actor_target = DeterministicPolicy(128).to(device)
		self.actor_target.load_state_dict(self.actor.state_dict())
		self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)
		# critics
		self.critic = QNetwork(128).to(device)
		self.critic_target = QNetwork(128).to(device)
		self.critic_target.load_state_dict(self.critic.state_dict())
		self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

		self.replay_buffer = ReplayBuffer(self.args.capacity)
		self.num_critic_update_iteration = 0
		self.num_actor_update_iteration = 0
		self.num_training = 0
		self.global_steps = 0

		self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
		self.env = env
		#self.load()

	def update(self):
		for it in range(self.args.update_iteration):
			# sample from replay buffer
			obs, local_goal, next_obs, next_goal, action, reward, done = self.replay_buffer.sample(self.args.batch_size)
			obs = torch.FloatTensor(obs).to(device)
			local_goal = torch.FloatTensor(local_goal).to(device)
			next_obs = torch.FloatTensor(next_obs).to(device)
			next_goal = torch.FloatTensor(next_goal).to(device)
			action = torch.FloatTensor(action).to(device)
			reward = torch.FloatTensor(reward).to(device)
			done = torch.FloatTensor(done).to(device)

			# compute the target Q value
			next_action, _ = self.actor_target.sample(next_obs, next_goal)
			target_Q = self.critic_target(next_obs, next_goal, next_action / self.action_scale)
			target_Q = reward + ((1-done) * self.args.gamma * target_Q).detach()

			# get current Q estimate
			current_Q = self.critic(obs, local_goal, action)

			# compute critic loss and update
			critic_loss = F.mse_loss(current_Q, target_Q)
			self.critic_optimizer.zero_grad()
			critic_loss.backward()
			self.critic_optimizer.step()

			# compute actor loss
			actor_action, _ = self.actor.sample(obs, local_goal)
			actor_loss = -self.critic(obs, local_goal, actor_action / self.action_scale).mean()
			self.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor_optimizer.step()

			# update target model 
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data)

			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data)

			self.num_actor_update_iteration += 1
			self.num_critic_update_iteration += 1

	def train(self):
		for i in range(self.args.max_episode):
			obs, local_goal = self.env.reset()
			ep_r = 0

			for t in count():
				action, _ = self.actor.sample(torch.FloatTensor(obs).to(device), torch.FloatTensor(local_goal).to(device))
				action = action.cpu().detach().numpy()[0]

				next_obs, next_goal, done, reward = self.env.step(action)
				self.global_steps += 1
				ep_r += reward
				self.replay_buffer.push((obs / 4.0, local_goal / 20., next_obs / 4.0, next_goal / 20., action / np.array([20, 1]), reward, float(done)))
				obs = next_obs
				local_goal = next_goal

				if done or t > self.args.max_length_trajectory:
					if i % self.args.print_log == 0:
						print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps))
						self.evaluate(10, False)
					break

			if len(self.replay_buffer.storage) >= self.args.capacity * 0.2:
				self.update()

		self.save()

	def evaluate(self, number = 1, render = True):
		rewards = []
		for _ in range(number):
			total_rews = 0
			time_step = 0
			done = False
			obs, local_goal = self.env.reset()
			while not done:
				action = self.predict(obs / 4., local_goal / 20.)
				# with torch.no_grad():
				# 	# use the mean action
				# 	_, action = self.actor.sample(torch.FloatTensor(obs).to(device) / 4., torch.FloatTensor(local_goal).to(device) / 20)
				# 	action = action.cpu().detach().numpy()[0]

				obs, local_goal, done, reward = self.env.step(action)
				
				if render:
					self.env.render()
				total_rews += reward
				time_step += 1
				if time_step > self.args.max_length_trajectory:
					break
				#print(str(action) + "  " + str(local_goal))
				if done:
					break

			rewards.append(total_rews)
		rewards = np.array(rewards)
		print("mean reward {}, max reward {}, min reward {}".format(rewards.mean(), rewards.max(), rewards.min()))

	def predict(self, obs, local_goal):
		with torch.no_grad():
			action = self.actor.forward(torch.FloatTensor(obs).to(device), torch.FloatTensor(local_goal).to(device))
		action = action.cpu().detach().numpy()[0]
		return action

	def load(self, episode = None):
		file_name = "weights/DDPG.pt"
		checkpoint = torch.load(file_name)
		self.actor.load_state_dict(checkpoint['actor'])
		self.actor_target.load_state_dict(checkpoint['actor_target'])
		self.critic.load_state_dict(checkpoint['critic'])
		self.critic_target.load_state_dict(checkpoint['critic_target'])
		print("successfully load model from " + file_name)

	def save(self, episode = None):
		file_name = "weights/DDPG.pt"
		torch.save({'actor' : self.actor.state_dict(),
					'critic' : self.critic.state_dict(),
					'actor_target' : self.actor_target.state_dict(),
					'critic_target' : self.critic_target.state_dict()}, file_name)
		print("save model to " + file_name)
Esempio n. 29
0
File: DQN.py Project: ataitler/DQN
# initialize variables (and target network)
sess.run(init)
Ws,bs = Q.get_weights()
Q_target.assign(sess, Ws,bs)

ann_fric = (1-EPSILON)/ANNEALING
EXP_PROB = 1

# initialize environment
env = gym.make(ENVIRONMENT)

# initialize mdp state structure
mdp = MDP_state(STATE_SIZE, FRAMES)

# initialize replay buffer
R = ReplayBuffer(MDP_STATE_SIZE, 1, BUFFER_SIZE)
buf = R.LoadBuffer(OUT_DIR+BUFFER_FILE)
if buf:
	EXP_PROB = EPSILON
	populated = R.GetOccupency()
	print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
	print("Creating new replay buffer")

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
	saver.restore(sess,ckpt.model_checkpoint_path)
	print("Model loaded from disk")

# define action discretization
class BaseAgent:
    def __init__(self, features: int, actions: int, params: Dict, seed: int, collector: Collector):
        self.features = features
        self.actions = actions
        self.params = params
        self.collector = collector
        self.seed = seed

        # define parameter contract
        self.gamma = params['gamma']
        self.epsilon = params.get('epsilon', 0)
        # the mellowmax parameter
        self.omega = params.get('omega', 1.0)

        # set up network for estimating Q(s, a)
        self.value_net = Network(features, actions, params, seed).to(device)

        # build the optimizer
        self.optimizer_params = params['optimizer']
        self.optimizer = deserializeOptimizer(self.value_net.parameters(), self.optimizer_params)

        self.steps = 0

        # set up the replay buffer
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch']
        self.buffer_type = params.get('buffer', 'standard')

        if self.buffer_type == 'per':
            prioritization = params['prioritization']
            self.buffer = PrioritizedReplayMemory(self.buffer_size, prioritization)
        else:
            self.buffer = ReplayBuffer(self.buffer_size)

        # build a target network
        self.target_refresh = params.get('target_refresh', 1)
        self.target_net = copy.deepcopy(self.value_net)
        self.initializeTargetNet()

        def getValues(x: torch.Tensor):
            qs = self.values(x).detach().cpu().squeeze(0).numpy()
            return qs

        self.policy = createEpsilonGreedy(seed, self.epsilon, getValues)

    # return the Q(s, a) values from the value network
    def values(self, x):
        return self.value_net(x)[0]

    # sample an action according to our policy
    def selectAction(self, x):
        return self.policy.selectAction(x)

    def initializeTargetNet(self):
        # if we aren't using target nets, then save some compute
        if self.target_refresh > 1:
            self.target_net = copy.deepcopy(self.value_net)
            cloneNetworkWeights(self.value_net, self.target_net)
        else:
            self.target_net = self.value_net

    @abstractmethod
    def updateNetwork(self, batch: Batch, predictions: Dict):
        pass

    @abstractmethod
    def forward(self, batch: Batch) -> Dict[str, torch.Tensor]:
        pass

    @abstractmethod
    def bootstrap(self, batch: Batch, next_values: torch.Tensor) -> Dict[str, torch.Tensor]:
        pass

    # a helper method that lets us bypass combining gradients whenever
    # target networks are disabled
    def combineTargetGrads(self):
        if self.target_net == self.value_net:
            return

        addGradients_(self.value_net, self.target_net)

    def update(self, s, a, sp, r, gamma):
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        if self.steps % self.target_refresh == 0 and self.target_refresh > 1:
            cloneNetworkWeights(self.value_net, self.target_net)

        if len(self.buffer) > self.batch_size + 1:
            samples, idcs = self.buffer.sample(self.batch_size)
            batch = getBatchColumns(samples)
            predictions = self.forward(batch)
            tde = self.updateNetwork(batch, predictions)

            self.buffer.update_priorities(idcs, tde)
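
The final update_priorities call refreshes each sampled transition's priority with its new TD error. In proportional prioritized replay the sampling probability of transition i is p_i^alpha / sum_k p_k^alpha with p_i = |delta_i| + eps; a minimal sketch of that weighting, independent of the PrioritizedReplayMemory implementation used here (its internals are not shown in this snippet):

import numpy as np

def proportional_probs(td_errors, alpha=0.6, eps=1e-5):
    # |TD error| plus a small constant, raised to alpha and normalized
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()

probs = proportional_probs(np.array([0.5, 0.1, 2.0]))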