Example #2
    def __init__(self, algo, optimizer, env, num_actions, memory_size=10000):
        self.algo = algo  # currently not used. Future work to make learning algorithms modular.
        self.env = env
        self.policy = Feedforward(env.observation_space.shape[0],
                                  env.action_space.n)

        # Reusing the ReplayMemory class for policy-gradient learning. To clarify:
        # policy gradient is an on-policy method, so experiences generated by an
        # older policy cannot be reused. The memory is therefore flushed at the
        # beginning of every epoch (see the train function) so that the policy is
        # updated only on trajectories from the current policy; a minimal sketch
        # of this pattern follows this example.
        self.memory = ReplayMemory(memory_size)

        self.n_actions = env.action_space.n
        if optimizer == 'Adam':
            self.optim = optim.Adam(self.policy.parameters())
        else:
            raise NotImplementedError
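As sketched below, the pattern described in the comments is simply: empty the buffer at the start of each epoch, fill it with rollouts from the current policy, then run the update on that data. This is illustrative only; collect_episode and policy_gradient_step are toy stand-ins, not functions from this project.

import random

# Toy stand-ins so the sketch runs on its own; in the real project these are the
# environment rollout and the policy-gradient update step.
def collect_episode(episode_len=5):
    return [(random.random(), random.randrange(2), random.random())
            for _ in range(episode_len)]

def policy_gradient_step(batch):
    return len(batch)  # placeholder for the actual REINFORCE/PPO update

def train(n_epochs=3, episodes_per_epoch=4):
    for _ in range(n_epochs):
        memory = []                           # "flush": each epoch starts empty
        for _ in range(episodes_per_epoch):
            memory.extend(collect_episode())  # only current-policy trajectories
        policy_gradient_step(memory)          # the update sees on-policy data only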
Example #3
def main(args):

    traj_txt = load_txt(args.traj_path)
    force_txt = load_txt(args.force_path)

    memory = ReplayMemory(500000)

    traj = sort_traj(traj_txt)
    force = sort_force(force_txt, traj)

    # x_1 , sr_1 = librosa.load(args.audio_path)

    # memory.push(state, action, next_state, reward)

    a = 1
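Every example on this page constructs a ReplayMemory and pushes (state, action, next_state, reward) transitions into it, but the class itself is never shown, and its constructor signature varies between projects. A minimal deque-based sketch of the most common variant, given purely as an assumption about the interface these snippets rely on:

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    """Fixed-capacity FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are evicted first

    def push(self, state, action, next_state, reward):
        self.buffer.append(Transition(state, action, next_state, reward))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)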
Example #4
class GAIL:
    def __init__(self, vail_sample, reward_shift, reward_aug, gae_norm,
                 global_norm, actor_lr, critic_lr, disc_lr, actor_units,
                 critic_units, disc_units, disc_reduce_units, gamma, lambd,
                 clip, entropy, epochs, batch_size, update_rate, data_dir,
                 demo_list):
        # build network
        self.actor = Actor(lr=actor_lr, hidden_units=actor_units)
        self.critic = Critic(lr=critic_lr, hidden_units=critic_units)
        self.discriminator = Discriminator(lr=disc_lr,
                                           hidden_units=disc_units,
                                           reduce_units=disc_reduce_units)
        self.encoder = VAE_Encoder(latent_num=64)

        # set hyperparameters
        self.vail_sample = vail_sample
        self.reward_shift = reward_shift
        self.reward_aug = reward_aug
        self.gae_norm = gae_norm
        self.gamma = gamma
        self.lambd = lambd
        self.gam_lam = gamma * lambd
        self.clip = clip
        self.entropy = entropy
        self.epochs = epochs
        self.batch_size = batch_size
        self.half_batch_size = batch_size // 2
        self.update_rate = update_rate
        self.grad_global_norm = global_norm
        self.beta = BETA_INIT

        # build memory
        self.memory = HorizonMemory(use_reward=reward_aug)
        self.replay = ReplayMemory()

        # build expert demonstration Pipeline
        self.data_dir = data_dir
        self.demo_list = os.listdir(data_dir)
        self.demo_group_num = 500
        self.demo_rotate = 5
        assert len(demo_list) >= self.demo_group_num
        self.set_demo()

        # ready
        self.dummy_forward()
        self.actor_vars = self.actor.trainable_variables + self.encoder.trainable_variables
        self.critic_vars = self.critic.trainable_variables + self.encoder.trainable_variables
        self.disc_vars = self.discriminator.trainable_variables + self.encoder.trainable_variables

    def dummy_forward(self):
        # connect networks
        dummy_state = np.zeros([1] + STATE_SHAPE, dtype=np.float32)
        dummy_action = np.zeros([1] + ACTION_SHAPE, dtype=np.float32)
        self.encoder(dummy_state)
        self.actor(self.encoder, dummy_state)
        self.critic(self.encoder, dummy_state)
        self.discriminator(self.encoder, dummy_state, dummy_action)

    def set_demo(self):
        self.demo_list = os.listdir(self.data_dir)
        selected_demos = random.sample(self.demo_list, self.demo_group_num)

        expert_states = []
        expert_actions = []
        for demo_name in selected_demos:
            demo = np.load(self.data_dir + demo_name)
            states = demo['state']
            actions = demo['action']

            expert_states.append(states)
            expert_actions.append(actions)
        self.expert_states = np.concatenate(expert_states, axis=0)
        self.expert_actions = np.concatenate(expert_actions, axis=0)
        del demo

    def get_demonstration(self, sample_num):
        # Refresh the demonstration pool if it cannot cover the requested sample.
        if len(self.expert_states) < sample_num:
            self.set_demo()
        index = np.arange(len(self.expert_states))
        np.random.shuffle(index)
        index = index[:sample_num]
        return self.expert_states[index], self.expert_actions[index]

    def memory_process(self, next_state, done):
        # [[(1,64,64,3)], [], ...], [[(1,2),(1,9),(1,3),(1,4)], [], ...], [[c_pi, d_pi, s_pi, a_pi], [], ...]
        if self.reward_aug:
            states, actions, log_old_pis, rewards = self.memory.rollout()
        else:
            states, actions, log_old_pis = self.memory.rollout()
        np_states = np.concatenate(states + [next_state], axis=0)
        np_actions = np.concatenate(actions, axis=0)

        np_rewards = self.get_reward(np_states[:-1], np_actions)  # (N, 1)
        if self.reward_aug:
            np_env_rewards = np.stack(rewards, axis=0).reshape(-1, 1)
            np_rewards = np_rewards + np_env_rewards
        gae, oracle = self.get_gae_oracle(np_states, np_rewards,
                                          done)  # (N, 1), (N, 1)
        self.replay.append(states, actions, log_old_pis, gae, oracle)
        self.memory.flush()
        if len(self.replay) >= self.update_rate:
            self.update()
            self.replay.flush()

    def get_action(self, state):
        policy = self.actor(self.encoder, state).numpy()[0]
        action = np.random.choice(ACTION_NUM, p=policy)
        # action = np.argmax(policy)
        action_one_hot = np.eye(ACTION_NUM,
                                dtype=np.float32)[[action]]  # (1, 4)
        log_old_pi = [[np.log(policy[action] + 1e-8)]]  # (1, 1)
        return action, action_one_hot, log_old_pi, policy

    def get_reward(self, states, actions):
        d = self.discriminator(self.encoder, states, actions).numpy()  # (N, 1)
        # rewards = 0.5 - d       # linear reward
        # rewards = np.tan(0.5 - d)     # tan reward
        if self.reward_shift:
            rewards = -np.log(2.0 * d + 1e-8)  # shifted log reward (zero at the D = 0.5 equilibrium)
        else:
            rewards = -np.log(d + 1e-8)  # standard log reward
        # rewards = 0.1 * np.where(rewards>1, 1, rewards)
        return rewards

    def get_gae_oracle(self, states, rewards, done):
        # states include next state
        values = self.critic(self.encoder, states).numpy()  # (N+1, 1)
        if done:
            values[-1] = np.float32([0])
        N = len(rewards)
        gae = 0
        gaes = np.zeros((N, 1), dtype=np.float32)
        oracles = np.zeros((N, 1), dtype=np.float32)
        for t in reversed(range(N)):
            oracles[t] = rewards[t] + self.gamma * values[t + 1]
            delta = oracles[t] - values[t]
            gae = delta + self.gam_lam * gae
            gaes[t][0] = gae

        # oracles = gaes + values[:-1]        # (N, 1)
        if self.gae_norm:
            gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + 1e-8)
        return gaes, oracles
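get_gae_oracle runs the standard backward GAE(gamma, lambda) recursion: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and A_t = delta_t + gamma * lambda * A_{t+1}, with the "oracle" being the one-step TD target the critic regresses to. A self-contained NumPy sketch of the same recursion on toy numbers, not tied to this class:

import numpy as np

def gae_and_td_targets(rewards, values, gamma=0.99, lambd=0.95):
    # rewards: shape (N,); values: shape (N+1,), last entry is the bootstrap
    # value of the state after the final transition (0 if the episode ended).
    N = len(rewards)
    gaes = np.zeros(N, dtype=np.float32)
    targets = np.zeros(N, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(N)):
        targets[t] = rewards[t] + gamma * values[t + 1]  # one-step TD target ("oracle")
        delta = targets[t] - values[t]                   # TD error
        gae = delta + gamma * lambd * gae                # discounted sum of TD errors
        gaes[t] = gae
    return gaes, targets

# Toy check: three steps, constant reward 1, constant value 0.5, terminal bootstrap 0.
gaes, targets = gae_and_td_targets(np.ones(3), np.array([0.5, 0.5, 0.5, 0.0]))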

    def update(self):
        # load & calculate data
        states, actions, log_old_pis, gaes, oracles \
            = self.replay.rollout()

        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        log_old_pis = np.concatenate(log_old_pis, axis=0)
        gaes = np.concatenate(gaes, axis=0)
        oracles = np.concatenate(oracles, axis=0)
        N = len(states)
        # update discriminator
        # load expert demonstration
        s_e, a_e = self.get_demonstration(N)

        batch_num = N // self.half_batch_size
        index = np.arange(N)
        np.random.shuffle(index)
        for i in range(batch_num):
            idx = index[i * self.half_batch_size:(i + 1) *
                        self.half_batch_size]
            s_concat = np.concatenate([states[idx], s_e[idx]], axis=0)
            a_concat = np.concatenate([actions[idx], a_e[idx]], axis=0)

            with tf.GradientTape(persistent=True) as tape:
                mu, std, sampled = self.discriminator.encode(
                    self.encoder, s_concat, a_concat)

                discs = self.discriminator.decode(
                    sampled if self.vail_sample else mu)
                kld_loss = tf.reduce_mean(tf_gaussian_KL(mu, 0, std, 1))
                agent_loss = -tf.reduce_mean(
                    tf.math.log(discs[:self.half_batch_size] + 1e-8))
                expert_loss = -tf.reduce_mean(
                    tf.math.log(1 + 1e-8 - discs[self.half_batch_size:]))
                disc_loss = agent_loss + expert_loss
                discriminator_loss = disc_loss + self.beta * kld_loss
            disc_grads = tape.gradient(discriminator_loss, self.disc_vars)
            if self.grad_global_norm > 0:
                disc_grads, _ = tf.clip_by_global_norm(disc_grads,
                                                       self.grad_global_norm)
            self.discriminator.opt.apply_gradients(
                zip(disc_grads, self.disc_vars))
            del tape

        # TODO: update posterior
        # L1 loss = logQ(code|s,prev_a,prev_code)
        # update actor & critic
        # batch_num = math.ceil(len(states) / self.batch_size)
        batch_num = len(gaes) // self.batch_size
        index = np.arange(len(gaes))
        for _ in range(self.epochs):
            np.random.shuffle(index)
            for i in range(batch_num):
                # if i == batch_num - 1:
                #     idx = index[i*self.batch_size : ]
                # else:
                idx = index[i * self.batch_size:(i + 1) * self.batch_size]
                state = states[idx]
                action = actions[idx]
                log_old_pi = log_old_pis[idx]
                gae = gaes[idx]
                oracle = oracles[idx]

                # update critic
                with tf.GradientTape(persistent=True) as tape:
                    values = self.critic(self.encoder, state)  # (N, 1)
                    critic_loss = tf.reduce_mean(
                        (oracle - values)**2)  # MSE loss
                critic_grads = tape.gradient(critic_loss, self.critic_vars)
                if self.grad_global_norm > 0:
                    critic_grads, _ = tf.clip_by_global_norm(
                        critic_grads, self.grad_global_norm)
                self.critic.opt.apply_gradients(
                    zip(critic_grads, self.critic_vars))
                del tape

                # update actor
                with tf.GradientTape(persistent=True) as tape:
                    pred_action = self.actor(self.encoder, state)

                    # RL (PPO) term
                    log_pi = tf.expand_dims(tf.math.log(
                        tf.reduce_sum(pred_action * action, axis=1) + 1e-8),
                                            axis=1)  # (N, 1)
                    ratio = tf.exp(log_pi - log_old_pi)
                    clip_ratio = tf.clip_by_value(ratio, 1 - self.clip,
                                                  1 + self.clip)
                    clip_loss = -tf.reduce_mean(
                        tf.minimum(ratio * gae, clip_ratio * gae))
                    entropy = tf.reduce_mean(tf.exp(log_pi) * log_pi)
                    actor_loss = clip_loss + self.entropy * entropy

                actor_grads = tape.gradient(
                    actor_loss, self.actor_vars)  # NOTE: freeze posterior
                if self.grad_global_norm > 0:
                    actor_grads, _ = tf.clip_by_global_norm(
                        actor_grads, self.grad_global_norm)
                self.actor.opt.apply_gradients(
                    zip(actor_grads, self.actor_vars))

                del tape
            # print('%d samples trained... D loss: %.4f C loss: %.4f A loss: %.4f\t\t\t'
            #     % (len(gaes), disc_loss, critic_loss, actor_loss), end='\r')

    def save_model(self, dir, tag=''):
        self.actor.save_weights(dir + tag + 'actor.h5')
        self.critic.save_weights(dir + tag + 'critic.h5')
        self.discriminator.save_weights(dir + tag + 'discriminator.h5')
        self.encoder.save_weights(dir + tag + 'encoder.h5')

    def load_model(self, dir, tag=''):
        if os.path.exists(dir + tag + 'actor.h5'):
            self.actor.load_weights(dir + tag + 'actor.h5')
            print('Actor loaded... %s%sactor.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'critic.h5'):
            self.critic.load_weights(dir + tag + 'critic.h5')
            print('Critic loaded... %s%scritic.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'discriminator.h5'):
            self.discriminator.load_weights(dir + tag + 'discriminator.h5')
            print('Discriminator loaded... %s%sdiscriminator.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('encoder loaded... %s%sencoder.h5' % (dir, tag))

    def load_encoder(self, dir, tag=''):
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('encoder loaded... %s%sencoder.h5' % (dir, tag))
Example #5
        device = torch.device(args.device)
        eval_net = GraphNet(hidden_size=args.hidden_size, n_head=args.nhead,
                            nlayers=args.nlayer, duel_dqn=args.duel_dqn,
                            n_gat_head=args.n_gat_head).to(device)
        target_net = GraphNet(hidden_size=args.hidden_size, n_head=args.nhead,
                              nlayers=args.nlayer, duel_dqn=args.duel_dqn,
                              n_gat_head=args.n_gat_head).to(device)
        optimizer = torch.optim.Adam(eval_net.parameters(), lr=args.lr)
        gamma = args.gamma
        epsilon = args.epsilon
        batch_size = args.batch_size
        max_step = args.max_step
        num_env = args.num_env
        time_last = time.time()
        loss_func = nn.MSELoss()
        learn_step_counter = 0
        q_network_iteration = 10

        memory = ReplayMemory(4096)

        performance = []
        envs = Envs(10000, 100000, num_env)

        state = envs.reset()

        for _i in range(max_step):
            eval_net.eval()
            values = eval_net(*state)
            # print(values)
            # print(state)
            if random.random() > epsilon:
                prob_uniform = (values > -9999999).float()
                dist = Categorical(prob_uniform)
                action = dist.sample()
Example #6
def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        help='Select mode',
        choices=['train', 'test', 'demo'],
        default='train',
    )
    args = parser.parse_args()

    with open("config.yml") as config_file:
        config = yaml.safe_load(config_file)

    model = DQN(
        in_channels=config['IN_CHANNELS'],
        out_dim=config['OUT_DIM'],
    )
    if config['LOAD_MODEL']:
        model.load_model(config['LOAD_MODEL'])

    if args.mode == 'test':
        test(
            device=config['DEVICE'],
            n_games=config['TEST_GAMES'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    elif args.mode == 'demo':
        demo(
            device=config['DEVICE'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    else:
        memory = ReplayMemory(capacity=config['N'])

        optimizer_name = config['OPTIMIZER']
        if optimizer_name == 'adam':
            optimizer = torch.optim.Adam(lr=config['LEARNING_RATE'],
                                         betas=(0.9, 0.999),
                                         eps=1e-8,
                                         amsgrad=False,
                                         params=model.model.parameters())
        elif optimizer_name == 'sgd':
            optimizer = torch.optim.SGD(lr=config['LEARNING_RATE'],
                                        momentum=0.9,
                                        params=model.model.parameters())
        else:
            raise ValueError(f'Unknown optimizer name: {optimizer_name}')

        experiment = Experiment(
            api_key=os.environ['COMET_ML_API_KEY'],
            project_name=config['COMET_ML_PROJECT_NAME'],
            workspace=config['COMET_ML_WORKSPACE'],
        )

        experiment.set_name(config['COMET_ML_NAME'])
        experiment.add_tag(config['COMET_ML_TAG'])
        experiment.log_parameters({
            'n_games': config['M'],
            'minibatch_size': config['MINIBATCH_SIZE'],
            'eps': config['EPS'],
            'eps_n_frames': config['EPS_N_FRAMES'],
            'gamma': config['GAMMA'],
            'frame_skipping': config['FRAME_SKIPPING'],
            'save_model_every': config['SAVE_MODEL_EVERY'],
        })
        experiment.set_model_graph(str(model.model))

        train(
            device=config['DEVICE'],
            n_games=config['M'],
            memory=memory,
            optimizer=optimizer,
            model=model,
            experiment=experiment,
            minibatch_size=config['MINIBATCH_SIZE'],
            eps=config['EPS'],
            eps_n_frames=config['EPS_N_FRAMES'],
            gamma=config['GAMMA'],
            frame_skipping=config['FRAME_SKIPPING'],
            update_model_target_every=config['UPDATE_MODEL_TARGET_EVERY'],
            save_model_every=config['SAVE_MODEL_EVERY'],
            save_model_as=config['SAVE_MODEL_AS'],
            save_average_metrics_every=config['SAVE_AVERAGE_METRICS_EVERY'],
        )
Example #7
class MDP:
    def __init__(self, args):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up',
                        'down']  # 'open', 'close']
        self.P_START = 0.999
        self.P_END = 0.05
        self.P_DECAY = 500
        self.max_iter = args.max_iter
        self.gripping_force = args.grip_force
        self.breaking_threshold = args.break_thresh

        # Prepare the drawing figure
        fig, (ax1, ax2) = plt.subplots(1, 2)
        self.figure = (fig, ax1, ax2)

    # Function to select an action from our policy or a random one
    def select_action(self, state):
        sample = random.random()
        p_threshold = self.P_END + (self.P_START - self.P_END) * math.exp(
            -1. * self.steps_done / self.P_DECAY)
        self.steps_done += 1

        if sample > p_threshold:
            with torch.no_grad():
                # .max(1) returns (values, indices) for each row; [1] picks the
                # argmax index, i.e. the action with the largest expected reward.
                self.policy_net_1.eval()
                torch_state = torch.from_numpy(state).float().to(
                    self.args.device)
                action = self.policy_net_1(torch_state.unsqueeze(0)).max(1)[1]
                self.policy_net_1.train()
                return action.item()
        else:
            return random.randrange(self.args.outdim)
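The exploration threshold used above decays exponentially from P_START toward the floor P_END with time constant P_DECAY steps. A quick standalone check of that schedule with the same constants (printed values are approximate):

import math

P_START, P_END, P_DECAY = 0.999, 0.05, 500

def p_threshold(steps_done):
    return P_END + (P_START - P_END) * math.exp(-1. * steps_done / P_DECAY)

for steps in (0, 250, 500, 1000, 2000):
    print(steps, round(p_threshold(steps), 3))
# ~0.999 at step 0 (actions are almost always random), ~0.40 after 500 steps,
# and the threshold approaches the 0.05 floor as steps_done grows.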

    def optimize_model(self):
        args = self.args
        if len(self.memory) < args.batch_size:
            return

        transitions = self.memory.sample(args.batch_size)

        state_batch, action_batch, reward_batch, nextstate_batch = [], [], [], []
        for transition in transitions:
            state_batch.append(transition.state)
            action_batch.append(transition.action)
            reward_batch.append(transition.reward)
            nextstate_batch.append(transition.next_state)

        state_batch = torch.from_numpy(np.array(state_batch)).float().to(
            args.device)
        action_batch = torch.from_numpy(np.array(action_batch)).to(
            args.device).unsqueeze(1)
        reward_batch = torch.from_numpy(np.array(reward_batch)).float().to(
            args.device).unsqueeze(1)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, nextstate_batch)),
                                      device=args.device,
                                      dtype=torch.bool).unsqueeze(1)
        non_final_next_states = torch.cat([
            torch.from_numpy(s).float().to(args.device).unsqueeze(0)
            for s in nextstate_batch if s is not None
        ])

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values_1 = self.policy_net_1(state_batch).gather(
            1, action_batch)
        state_action_values_2 = self.policy_net_2(state_batch).gather(
            1, action_batch)
        state_action_values_3 = self.policy_net_3(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values_1 = torch.zeros((args.batch_size, 1),
                                          device=args.device)
        next_state_values_2 = torch.zeros((args.batch_size, 1),
                                          device=args.device)
        next_state_values_3 = torch.zeros((args.batch_size, 1),
                                          device=args.device)
        next_state_values_1[non_final_mask] = self.policy_net_1(
            non_final_next_states).max(1)[0].detach()
        next_state_values_2[non_final_mask] = self.policy_net_2(
            non_final_next_states).max(1)[0].detach()
        next_state_values_3[non_final_mask] = self.policy_net_3(
            non_final_next_states).max(1)[0].detach()

        next_state_values = torch.min(
            torch.min(next_state_values_1, next_state_values_2),
            next_state_values_3)

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        args.gamma) + reward_batch

        # Compute Huber loss
        loss_1 = F.smooth_l1_loss(state_action_values_1,
                                  expected_state_action_values)
        loss_2 = F.smooth_l1_loss(state_action_values_2,
                                  expected_state_action_values)
        loss_3 = F.smooth_l1_loss(state_action_values_3,
                                  expected_state_action_values)

        # Optimize the model
        self.optimizer_1.zero_grad()
        self.optimizer_2.zero_grad()
        self.optimizer_3.zero_grad()
        loss_1.backward()
        loss_2.backward()
        loss_3.backward()
        for param in self.policy_net_1.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_3.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_1.step()
        self.optimizer_2.step()
        self.optimizer_3.step()
        return [loss_1, loss_2, loss_3]

    def train_MDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

        # Create our policy net and a target net
        self.policy_net_1 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DQN(args.indim, args.outdim).to(args.device)

        self.target_net = DQN(args.indim, args.outdim).to(args.device)
        self.target_net.load_state_dict(self.policy_net_1.state_dict())
        self.target_net.eval()

        # Set up the optimizer
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                         args.lr)
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                         args.lr)
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters(),
                                         args.lr)
        self.memory = ReplayMemory(500000)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Normalizer(args.indim, device=args.device)
        print_variables = {'durations': [], 'rewards': [], 'loss': []}

        # Load old checkpoint if provided
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(
                    checkpoint['model_state_dict'])
                self.policy_net_2.load_state_dict(
                    checkpoint['model_state_dict'])
                self.policy_net_3.load_state_dict(
                    checkpoint['model_state_dict'])
                self.target_net.load_state_dict(checkpoint['model_state_dict'])
                start_episode = checkpoint['epoch']
                self.steps_done = start_episode
                self.optimizer_1.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                self.optimizer_2.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                self.optimizer_3.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_geom_mdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        action_space = ActionSpace(dp=0.06, df=10)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            if args.sim:
                # Create robot, reset simulation and grasp handle
                model, model_params = init_model(args.model_path)
                sim = MjSim(model)
                sim.step()
                viewer = MjViewer(sim) if args.render else None

                sim_param = SimParameter(sim)
                robot = RobotSim(sim, viewer, sim_param, args.render,
                                 self.breaking_threshold)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue

                # Get current state
                state_space = Observation(
                    robot.get_gripper_jpos(),
                    robot.get_shear_buffer(args.hap_sample),
                    robot.get_all_touch_buffer(args.hap_sample))
                broken_so_far = 0

            for t in count():
                if not args.quiet and t % 20 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                state = state_space.get_state()
                if args.position:
                    state = state[6:]
                if args.shear:
                    indices = np.ones(len(state), dtype=bool)
                    indices[6:166] = False
                    state = state[indices]
                if args.force:
                    state = state[:166]
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = self.select_action(state)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(state_space.get_current_position(),
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))

                if args.sim:
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Get reward
                    done, num = robot.update_tendons()
                    failure = robot.check_slippage()
                    if num > broken_so_far:
                        reward = num - broken_so_far
                        broken_so_far = num
                    else:
                        reward = 0

                    # # Add a movement reward
                    # reward -= 0.1 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

                    # Observe new state
                    state_space.update(
                        robot.get_gripper_jpos(),
                        robot.get_shear_buffer(args.hap_sample),
                        robot.get_all_touch_buffer(args.hap_sample))

                # Enforce the per-episode iteration limit
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_state = state_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None

                # Push new Transition into memory
                self.memory.push(state, action, next_state, reward)

                # Optimize the model
                loss = self.optimize_model()
                #        if loss:
                #            print_variables['loss'].append(loss.item())

                # If we are done, reset the model
                if done or failure:
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   'Training MDP')
                    print("Model parameters: {}".format(model_params))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))
                    break

            # Update the target network every 10 episodes
            if ii % 10 == 0:
                self.target_net.load_state_dict(self.policy_net_1.state_dict())

            # Save a checkpoint every few episodes
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epoch': ii,
                        'model_state_dict': self.target_net.state_dict(),
                        'optimizer_state_dict': self.optimizer_1.state_dict(),
                    }, save_path)

        # Save normalizer state for inference
        normalizer.save_state(
            os.path.join(args.output_dir, 'normalizer_state.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
Example #9
    def run(self):
        BufferRecord = namedtuple(
            'BufferRecord',
            ['x', 's', 'm', 'f', 'a', 'old_logp_a', 'v_target', 'adv'])
        buffer = ReplayMemory(tuple_class=BufferRecord,
                              capacity=self.args.buffer_size)

        reward_history = []
        reward_averaged = []
        best_reward = -np.inf
        step = 0
        total_rec = 0

        for n_iter in range(self.args.iters):
            clip = self._ratio_clip_fn(n_iter)
            if self.args.clean_buffer:
                buffer.clean()
            ep_reward, n_rec = self._generate_rollout(buffer)
            reward_history.append(ep_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            total_rec += n_rec

            for batch in buffer.loop(self.args.record_size, self.args.epochs):
                if self.args.finetune_model and n_iter >= self.args.finetune_warmup:
                    self.model.finetune(batch)

                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary],
                    feed_dict={
                        self.lr_a: self.args.lr_a,
                        self.lr_c: self.args.lr_c,
                        self.clip_range: clip,
                        self.state: batch['s'],
                        self.mask: batch['m'],
                        self.auxilary: batch['f'],
                        self.action: batch['a'],
                        self.old_logp_a: batch['old_logp_a'],
                        self.v_target: batch['v_target'],
                        self.adv: batch['adv'],
                        self.ep_reward: (np.mean(reward_history[-10:])
                                         if reward_history else 0.0),
                    })
                self.writer.add_summary(summ_str, step)
                step += 1

            if self.args.log_freq > 0 and (n_iter +
                                           1) % self.args.log_freq == 0:
                logger.info(
                    "[iteration:{}/step:{}], best:{}, avg:{:.2f}, clip:{:.2f}; {} transitions."
                    .format(n_iter, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), clip, total_rec))

            if self.args.eval_freq > 0 and n_iter % self.args.eval_freq == 0:
                self.evaluate(folder=f'{n_iter}', load=False)

            if self.args.save_freq > 0 and (n_iter +
                                            1) % self.args.save_freq == 0:
                self.save()

            if np.mean(reward_history[-10:]) > best_reward:
                best_reward = np.mean(reward_history[-10:])
                self.save('best')

        # FINISH
        self.save()
        logger.info(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))
        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_dict(f'{self.args.exp_dir}/learning_curve.png',
                  data_dict,
                  xlabel='episode')
Example #11
class Agent:
    def __init__(self, action_spec: dm_env.specs.DiscreteArray,
                 observation_spec: dm_env.specs.Array, device: torch.device,
                 settings: dict) -> None:
        """
        Initializes the agent,  constructs the qnet and the q_target, initializes the optimizer and ReplayMemory.
        Args:
            action_spec(dm_env.specs.DiscreteArray): description of the action space of the environment
            observation_spec(dm_env.specs.Array): description of observations form the environment
            device(str): "gpu" or "cpu"
            settings(dict): dictionary with settings
        """
        self.device = device
        action_size = action_spec.num_values
        state_size = np.prod(observation_spec.shape)
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']
        self.noisy_nets = settings['qnet_settings']['noisy_nets']

        self.qnet = Dqn(state_size, action_size,
                        settings['qnet_settings']).to(device)
        self.q_target = Dqn(state_size, action_size,
                            settings['qnet_settings']).to(device)

        self.q_target.load_state_dict(self.qnet.state_dict())
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=settings['lr'])

        self.epsilon = settings["epsilon_start"]
        self.decay = settings["epsilon_decay"]
        self.epsilon_min = settings["epsilon_min"]
        self.gamma = settings['gamma']

        self.start_optimization = settings["start_optimization"]
        self.update_qnet_every = settings["update_qnet_every"]
        self.update_target_every = settings["update_target_every"]
        self.number_steps = 0
        self.ddqn = settings["ddqn"]

        # Initialize replay memory
        self.prioritized_replay = settings["prioritized_buffer"]
        if self.prioritized_replay:
            self.memory = PrioritizedReplayMemory(
                device, settings["buffer_size"], self.gamma,
                settings["n_steps"], settings["alpha"], settings["beta0"],
                settings["beta_increment"])
        else:
            self.memory = ReplayMemory(device, settings["buffer_size"],
                                       self.gamma, settings["n_steps"])

        # Density Estimator
        self.features = 'd'
        self.DE_type = 'KDE'

        if self.DE_type == 'flow':
            self.density_estimator = MAFMOGDensityEstimator(
                batch_size=50,
                n_components=3,
                n_blocks=5,
                lr=1e-4,
                use_log_density=True,
                use_density_scaling=True)
        elif self.DE_type == 'KDE':
            # self.density_estimator = FixedKernelDensityEstimator('gaussian', 0.1, use_log_density = True)
            self.density_estimator = CVKernelDensityEstimator(
                use_log_density=True)

        # Epistemic predictor
        self.enet = Epn((state_size + len(self.features)) -
                        1 if "x" in self.features else len(self.features),
                        action_size, settings['qnet_settings']).to(device)
        self.e_optimizer = optim.Adam(self.enet.parameters(),
                                      lr=settings['lr'])

        self.burn_in_density = 10000
        return
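The constructor reads everything from a single settings dictionary. The sketch below lists the keys it accesses with placeholder values; the values are assumptions for illustration, not the ones used in the original experiments, and Dqn itself may expect further entries inside 'qnet_settings'.

settings = {
    'batch_size': 64,
    'lr': 1e-3,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_decay': 0.995,
    'epsilon_min': 0.05,
    'start_optimization': 1000,
    'update_qnet_every': 1,
    'update_target_every': 500,
    'ddqn': True,
    'buffer_size': 100_000,
    'n_steps': 1,
    'prioritized_buffer': False,   # if True, also supply 'alpha', 'beta0', 'beta_increment'
    'qnet_settings': {'noisy_nets': False},   # plus whatever Dqn itself expects
}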

    def select_action(self, timestep: dm_env.TimeStep) -> int:
        """
        Returns an action following an epsilon-greedy policy.
        Args:
            timestep(dm_env.TimeStep): An observation from the environment

        Returns:
            int: The chosen action.
        """
        observation = np.array(timestep.observation).flatten()
        observation = torch.from_numpy(observation).float().to(self.device)
        self.number_steps += 1

        if not self.noisy_nets:
            self.update_epsilon()

        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            if self.number_steps <= self.burn_in_density:
                qvals = self.qnet.forward(observation)
            else:
                qvals = self.qnet.forward(
                    observation) + 0.1 * self._epistemic_uncertainty(
                        observation.unsqueeze(0))
            return int(torch.argmax(qvals, dim=-1).cpu().detach().numpy())

    def update_epsilon(self) -> None:
        """
        Decays epsilon until self.epsilon_min
        Returns:
            None
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.decay

    @staticmethod
    def calc_loss(
            q_observed: torch.Tensor, q_target: torch.Tensor,
            weights: torch.Tensor) -> typing.Tuple[torch.Tensor, np.float64]:
        """
        Returns the mean weighted MSE loss and the loss for each sample
        Args:
            q_observed(torch.Tensor): calculated q_value
            q_target(torch.Tensor):   target q-value
            weights: weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.float64): mean squared error loss, loss for each individual sample
        """
        # print('q_observed is cuda', q_observed.is_cuda)
        # print('q_target is cuda', q_target.is_cuda)

        losses = functional.mse_loss(q_observed, q_target, reduction='none')
        loss = (weights * losses).sum() / weights.sum()
        return loss, losses.cpu().detach().numpy() + 1e-8
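calc_loss is a weighted MSE: the per-sample losses are kept so a prioritized buffer can update its priorities, and the scalar loss is the importance-weighted mean. A tiny self-contained sketch of the same computation on dummy tensors:

import torch
import torch.nn.functional as functional

q_observed = torch.tensor([1.0, 2.0, 3.0])
q_target = torch.tensor([1.5, 2.0, 2.0])
weights = torch.tensor([1.0, 1.0, 2.0])   # e.g. importance-sampling weights

losses = functional.mse_loss(q_observed, q_target, reduction='none')  # per-sample
loss = (weights * losses).sum() / weights.sum()                       # weighted mean
# losses -> [0.25, 0.0, 1.0]; loss -> 0.5625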

    def update(self, step: dm_env.TimeStep, action: int,
               next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization_step and updates the q_target neural network.
        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment
        Returns:
            None
        """

        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward,
                         next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = self.memory.sample_batch(
                self.batch_size)
            self.optimization_step(s0, a0, n_step_reward, discount, s1,
                                   indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return

    def optimization_step(self, s0: torch.Tensor, a0: torch.Tensor,
                          n_step_reward: torch.Tensor, discount: torch.Tensor,
                          s1: torch.Tensor,
                          indices: typing.Optional[torch.Tensor],
                          weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """

        with torch.no_grad():
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target values
            next_q_vals = self.q_target(s1)
            if self.ddqn:
                a1 = torch.argmax(self.qnet(s1), dim=1).unsqueeze(-1)
                next_q_val = next_q_vals.gather(1, a1).squeeze()
            else:
                next_q_val = torch.max(next_q_vals, dim=1).values
            q_target = n_step_reward.squeeze(
            ) + self.gamma * discount.squeeze() * next_q_val

        # Getting the observed q-values
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0).gather(1, a0.long()).squeeze()

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size)
        critic_loss, batch_loss = self.calc_loss(q_observed, q_target,
                                                 weights.to(self.device))

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update density estimator
        if self.number_steps % self.burn_in_density == 0:
            s0_for_d, a0_for_d, _, _, _, _, _, _, _ = self.memory.sample_batch(
                self.burn_in_density)
            self.density_estimator.fit(s0_for_d.cpu())
            if hasattr(self.density_estimator, 'kde'):
                print('steps: {}, DE fitted: {}, bandwidth: {}'.format(
                    self.number_steps, self.density_estimator.kde,
                    self.density_estimator.kde.bandwidth))

        # Update Enet
        if self.memory.number_samples() > self.burn_in_density:
            e_observed = self._epistemic_uncertainty(s0).gather(
                1, a0.long()).squeeze()
            e_loss, e_batch_loss = self.calc_loss(
                e_observed,
                torch.tensor(batch_loss).to(self.device),
                weights.to(self.device))
            if self.number_steps % self.burn_in_density == 0:
                # print("steps, Top k samples from Qnet: batch_loss:", self.number_steps, torch.topk(torch.tensor(batch_loss), 10))
                # print("Top k samples from Enet, e_observed:", torch.topk(e_observed, 10))
                print('steps, e_loss', self.number_steps, e_loss)
                # print('density', self.density_estimator.score_samples(s0.cpu()).to(self.device))

            self.e_optimizer.zero_grad()
            e_loss.backward()

            torch.nn.utils.clip_grad_norm_(self.enet.parameters(), 5)
            self.e_optimizer.step()

        # Update replay memory priorities
        self.memory.update_priorities(indices, batch_loss)
        return

    def _epistemic_uncertainty(self, x):
        """
        Computes the epistemic uncertainty estimate for a batch of input
        samples, using the features selected in self.features.
        """
        u_in = []
        if 'x' in self.features:
            u_in.append(x)
        if 'd' in self.features:
            density_feature = self.density_estimator.score_samples(x.cpu()).to(
                self.device)
            u_in.append(density_feature)
        u_in = torch.cat(u_in, dim=1)
        return self.enet.forward(u_in)

    def pretrain_density_estimator(self, x):
        """
        Trains density estimator on input samples
        """

        self.density_estimator.fit(x.cpu())
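
The agent above adds a scaled epistemic-uncertainty bonus to the Q-values at action-selection time (after the density burn-in) and trains Epn to regress the per-sample Q-loss returned by calc_loss. A minimal self-contained sketch of that idea follows; TinyEpistemicNet, the toy dimensions, and the placeholder density feature are illustrative assumptions, not part of the example above.

import torch
import torch.nn as nn

# Hypothetical stand-in for the Epn network above: it maps an uncertainty
# feature vector (state and/or a density score) to one value per action.
class TinyEpistemicNet(nn.Module):
    def __init__(self, in_dim, n_actions):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, 32), nn.ReLU(),
                                 nn.Linear(32, n_actions))

    def forward(self, x):
        return self.net(x)

state_size, n_actions = 4, 2
enet = TinyEpistemicNet(state_size + 1, n_actions)  # state + one density feature

obs = torch.randn(1, state_size)
density_feature = torch.zeros(1, 1)        # placeholder for a log-density score
u_in = torch.cat([obs, density_feature], dim=1)

# Greedy action: Q-values plus a scaled uncertainty bonus, as in select_action.
q_vals = torch.randn(1, n_actions)         # stand-in for qnet(observation)
action = int(torch.argmax(q_vals + 0.1 * enet(u_in), dim=-1))

# Training signal: in the example above, the prediction gathered at the taken
# action is regressed onto the per-sample Q-loss from calc_loss; here a random
# toy target just shows the shape of the update.
toy_target = torch.rand(1, n_actions)
e_loss = nn.functional.mse_loss(enet(u_in), toy_target)
e_loss.backward()
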
Ejemplo n.º 12
0
    def __init__(self, algo, env, num_actions, memory_size=10000):
        self.algo = algo
        self.env = env
        self.policy = CNN(3, 96, 96, 1)
        self.memory = ReplayMemory(memory_size)
Ejemplo n.º 13
0
class Agent():
    def __init__(self, algo, optimizer, env, num_actions, memory_size=10000):
        self.algo = algo  # currently not used. Future work to make learning algorithms modular.
        self.env = env
        self.policy = Feedforward(env.observation_space.shape[0],
                                  env.action_space.n)

        # Reusing the ReplayMemory class for Q-learning. This may be a bit confusing, so to clarify (I'm just being lazy here):
        # policy gradient is an on-policy method, so you cannot use experiences generated by a different policy.
        # The memory is flushed at the beginning of every epoch (see the train function), so the policy is updated
        # only on trajectories collected by the current policy.
        self.memory = ReplayMemory(memory_size)

        self.n_actions = env.action_space.n
        if optimizer == 'Adam':
            self.optim = optim.Adam(self.policy.parameters())
        else:
            raise NotImplementedError

    def update_policy(self, gamma):
        memory = self.memory.get_memory()
        episode_length = len(memory)
        memory = list(zip(*memory))
        discounted_rewards = []
        s = memory[0]
        a = memory[1]
        ns = memory[2]
        r = memory[3]
        a_one_hot = torch.nn.functional.one_hot(torch.tensor(a),
                                                num_classes=self.n_actions).float()

        for t in range(episode_length):
            r_forward = r[t:]
            G = 0
            # Compute the discounted cumulative reward from time step t
            for i, reward in enumerate(r_forward):
                G += gamma**i * reward
            discounted_rewards.append(G)

        rewards_t = torch.tensor(discounted_rewards).detach()
        s_t = torch.tensor(s).float()
        selected_action_probs = self.policy(s_t)
        prob = torch.sum(selected_action_probs * a_one_hot, axis=1)
        # A small hack to prevent inf when log(0)
        clipped = torch.clamp(prob, min=1e-10, max=1.0)
        J = -torch.log(clipped) * rewards_t
        loss = torch.sum(J)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def train(self, num_epochs, gamma):
        # initialization

        for e in range(num_epochs):
            self.memory.flush()
            state = self.env.reset()
            done = False
            total_reward = 0
            self.env.render()
            t = 0

            while not done:
                action_prob = self.policy(torch.from_numpy(state).float())
                #print(action_prob.detach().numpy())
                #action = torch.argmax(action_prob).item()
                action = np.random.choice(range(self.n_actions),
                                          p=action_prob.detach().numpy())
                next_state, reward, done, _ = self.env.step(action)
                self.memory.push(state, action, next_state, reward)
                self.env.render()

                total_reward += reward
                state = next_state
                t += 1

            print("Episode: ", e)
            print("Reward: ", total_reward)
            writer.add_scalar('./runs/rewards', total_reward, e)

            self.update_policy(gamma)
        self.env.close()
        writer.close()
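
The nested loop in update_policy above recomputes the discounted return from scratch for every time step, which is O(T^2) in the episode length. An equivalent single backward pass (a sketch, not part of the example above) produces the same values in O(T):

import torch

def discounted_returns(rewards, gamma):
    # G_t = sum_k gamma**k * r_{t+k}, accumulated from the end of the episode
    returns = []
    G = 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    return torch.tensor(returns)

# Same values the nested loop above would produce:
print(discounted_returns([1.0, 1.0, 1.0], gamma=0.99))  # tensor([2.9701, 1.9900, 1.0000])
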
Ejemplo n.º 14
0
    def __init__(self,
                 n_agents,
                 dim_obs,
                 dim_act,
                 batch_size,
                 capacity,
                 episodes_before_train,
                 use_approx=False):
        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        self.critics = [
            Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)
        ]
        self.actors_target = deepcopy(self.actors)
        self.critics_target = deepcopy(self.critics)

        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.use_cuda = torch.cuda.is_available()
        self.episodes_before_train = episodes_before_train

        self.GAMMA = 0.95
        self.tau = 0.01

        self.use_approx = use_approx

        # for gaussian noise
        self.var = [1.0 for i in range(n_agents)]

        self.critic_optimizer = [
            Adam(x.parameters(), lr=0.001) for x in self.critics
        ]
        self.actor_optimizer = [
            Adam(x.parameters(), lr=0.0001) for x in self.actors
        ]

        if (self.use_approx):
            self.approx_policies = [[
                ApproxPolicy(dim_obs, dim_act) if i != j else None
                for i in range(self.n_agents)
            ] for j in range(self.n_agents)]
            self.approx_targets = deepcopy(self.approx_policies)
            self.approx_optimizer = [[
                Adam(x.parameters(), lr=0.001) if x is not None else None
                for x in approx_actor
            ] for approx_actor in self.approx_policies]

        if self.use_cuda:
            for x in self.actors:
                x.cuda()
            for x in self.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()

            if self.use_approx:
                for i in range(self.n_agents):
                    for j in range(self.n_agents):
                        if self.approx_policies[i][j] is not None:
                            self.approx_policies[i][j].cuda()
                            self.approx_targets[i][j].cuda()

        self.steps_done = 0
        self.episode_done = 0
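
The per-agent exploration noise scale self.var is initialized to 1.0 here; in the full class in the next example, select_action multiplies it by 0.999998 per step (once training has started) down to a floor of 0.05. A quick check of what that schedule implies, as a worked example rather than code from the repository:

import math

var_init, var_min, decay = 1.0, 0.05, 0.999998
# Number of decay steps until the floor is reached: decay**n = var_min / var_init
steps_to_floor = math.log(var_min / var_init) / math.log(decay)
print(round(steps_to_floor))  # roughly 1.5 million steps
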
Ejemplo n.º 15
0
class MADDPG:
    def __init__(self,
                 n_agents,
                 dim_obs,
                 dim_act,
                 batch_size,
                 capacity,
                 episodes_before_train,
                 use_approx=False):
        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        self.critics = [
            Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)
        ]
        self.actors_target = deepcopy(self.actors)
        self.critics_target = deepcopy(self.critics)

        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.use_cuda = torch.cuda.is_available()
        self.episodes_before_train = episodes_before_train

        self.GAMMA = 0.95
        self.tau = 0.01

        self.use_approx = use_approx

        # for gaussian noise
        self.var = [1.0 for i in range(n_agents)]

        self.critic_optimizer = [
            Adam(x.parameters(), lr=0.001) for x in self.critics
        ]
        self.actor_optimizer = [
            Adam(x.parameters(), lr=0.0001) for x in self.actors
        ]

        if (self.use_approx):
            self.approx_policies = [[
                ApproxPolicy(dim_obs, dim_act) if i != j else None
                for i in range(self.n_agents)
            ] for j in range(self.n_agents)]
            self.approx_targets = deepcopy(self.approx_policies)
            self.approx_optimizer = [[
                Adam(x.parameters(), lr=0.001) if x is not None else None
                for x in approx_actor
            ] for approx_actor in self.approx_policies]

        if self.use_cuda:
            for x in self.actors:
                x.cuda()
            for x in self.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()

            if self.use_approx:
                for i in range(self.n_agents):
                    for j in range(self.n_agents):
                        if self.approx_policies[i][j] is not None:
                            self.approx_policies[i][j].cuda()
                            self.approx_targets[i][j].cuda()

        self.steps_done = 0
        self.episode_done = 0

    def update_policy(self):
        # do not train until enough exploration episodes have been collected
        if self.episode_done <= self.episodes_before_train:
            return None, None

        BoolTensor = torch.cuda.BoolTensor if self.use_cuda else torch.BoolTensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor

        c_loss = []
        a_loss = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = BoolTensor(
                list(map(lambda s: s is not None, batch.next_states)))

            # state_batch: batch_size x n_agents x dim_obs
            state_batch = torch.stack(batch.states).type(FloatTensor)
            action_batch = torch.stack(batch.actions).type(FloatTensor)
            reward_batch = torch.stack(batch.rewards).type(FloatTensor)

            # non_final_next_states: (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = torch.stack([
                s for s in batch.next_states if s is not None
            ]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)

            # calculating current_Q : Q(x, a1, ..., an)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            # calculating target_Q : y = r + gamma * Q(x', a'1, ..., a'n), for all non-final states
            non_final_next_actions = None

            if self.use_approx:
                self.update_approx_policy(agent)

                param_list = [
                    self.approx_targets[agent][i](
                        non_final_next_states[:, i, :]) if i != agent else None
                    for i in range(self.n_agents)
                ]
                param_list = [
                    list(torch.chunk(param, 2 * self.n_actions))
                    if param is not None else None for param in param_list
                ]
                param_list = [
                    [torch.split(x, self.n_actions, dim=1)
                     for x in param] if param is not None else None
                    for param in param_list
                ]

                act_pd_n = [[Normal(loc=x[0], scale=x[1])
                             for x in param] if param is not None else None
                            for param in param_list]
                non_final_next_actions = [
                    torch.cat([x.sample() for x in act_pd])
                    if act_pd is not None else None for act_pd in act_pd_n
                ]
                non_final_next_actions[agent] = self.actors_target[agent](
                    non_final_next_states[:, agent, :])
            else:
                non_final_next_actions = [
                    self.actors_target[i](non_final_next_states[:, i, :])
                    for i in range(self.n_agents)
                ]

            non_final_next_actions = torch.stack(non_final_next_actions)
            non_final_next_actions = (non_final_next_actions.transpose(
                0, 1).contiguous())

            target_Q = torch.zeros(self.batch_size).type(FloatTensor)

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1, self.n_agents *
                                            self.n_actions)).squeeze()

            target_Q = (target_Q.unsqueeze(1) *
                        self.GAMMA) + (reward_batch[:, agent].unsqueeze(1))

            # calculating critic loss from current_Q and target_Q
            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            # calculating actor loss
            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)

        return c_loss, a_loss

    def select_action(self, state_batch):
        # state_batch dimension: n_agents x state_dim

        # Define type of tensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor

        # create actions tensor
        actions = torch.zeros(self.n_agents, self.n_actions)

        # iterate over all agents
        for i in range(self.n_agents):
            # get agent i's observation
            sb = state_batch[i, :].detach()

            # forward pass through agent i's actor
            act = self.actors[i](sb.unsqueeze(0)).squeeze()

            # add Gaussian exploration noise
            act += torch.from_numpy(
                np.random.randn(self.n_actions) *
                self.var[i]).type(FloatTensor)

            # decay the noise scale once training has started
            if self.episode_done > self.episodes_before_train and self.var[
                    i] > 0.05:
                self.var[i] *= 0.999998
            act = torch.clamp(act, -1.0, 1.0)

            actions[i, :] = act
        self.steps_done += 1

        return actions

    def update_approx_policy(self, agent_idx):
        # Define type of tensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor

        # inferring the other agents' policies
        # get the latest sample from memory
        latest_sample = self.memory.latest_sample()
        experience = Experience(*zip(*latest_sample))

        latest_state = torch.stack(
            experience.states).type(FloatTensor).squeeze()
        latest_action = torch.stack(
            experience.actions).type(FloatTensor).squeeze()

        # update for each approx policy
        for i in range(self.n_agents):
            if i == agent_idx: continue
            # run the approx network to get the distribution parameters
            self.approx_optimizer[agent_idx][i].zero_grad()
            param = self.approx_policies[agent_idx][i](latest_state[i, :])
            param = param.unsqueeze(0)

            # create a Normal distribution from the parameters
            param = torch.split(param, self.n_actions, dim=1)

            act_pd = Normal(loc=param[0], scale=param[1])

            # get sample act
            act_sample = act_pd.sample()

            # calculate entropy loss
            p_reg = -torch.mean(act_pd.entropy())

            # calculate log prob loss
            act_target = latest_action
            pg_loss = -torch.mean(act_pd.log_prob(act_target))

            loss = pg_loss + p_reg * 1e-3
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                self.approx_policies[agent_idx][i].parameters(), 1)
            self.approx_optimizer[agent_idx][i].step()

            # target network
            soft_update(self.approx_targets[agent_idx][i],
                        self.approx_policies[agent_idx][i], self.tau)

            # TODO: calculate the KL divergence. Not possible right now because the approximation
            # and target networks have different output types: the approx network outputs
            # distribution parameters, while the target network outputs actions directly.

    def save(self, time, episode):
        # check path exists
        cwd = os.getcwd()
        save_dir = os.path.join(cwd, 'checkpoint')
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

        # create filename
        time = time.replace(' ', '_')
        filename = 'Time_{}_NAgent_{}_Episode_{}.pth'.format(
            time, self.n_agents, episode)
        save_dir = os.path.join(save_dir, filename)

        # create saving dictionary
        checkpoint = dict()

        # saving model
        for i in range(self.n_agents):
            checkpoint['actor_{}'.format(i)] = self.actors[i].state_dict()
            checkpoint['critic_{}'.format(i)] = self.critics[i].state_dict()
            checkpoint['actor_target_{}'.format(
                i)] = self.actors_target[i].state_dict()
            checkpoint['critic_target_{}'.format(
                i)] = self.critics_target[i].state_dict()
            checkpoint['actor_optimizer_{}'.format(
                i)] = self.actor_optimizer[i].state_dict()
            checkpoint['critic_optimizer_{}'.format(
                i)] = self.critic_optimizer[i].state_dict()
            checkpoint['var_{}'.format(i)] = self.var[i]

            if self.use_approx:
                for j in range(self.n_agents):
                    if i != j:
                        checkpoint['approx_policy_{}_{}'.format(
                            i, j)] = self.approx_policies[i][j].state_dict()
                        checkpoint['approx_target_{}_{}'.format(
                            i, j)] = self.approx_targets[i][j].state_dict()
                        checkpoint['approx_optimizer_{}_{}'.format(
                            i, j)] = self.approx_optimizer[i][j].state_dict()

        # saving model info
        checkpoint['n_agents'] = self.n_agents
        checkpoint['episode'] = episode
        checkpoint['time'] = str(datetime.now())

        # save
        torch.save(checkpoint, save_dir)

    def load(self, path, map_location):
        checkpoint = torch.load(path, map_location=map_location)

        # loading model
        for i in range(self.n_agents):
            self.actors[i].load_state_dict(checkpoint['actor_{}'.format(i)])
            self.critics[i].load_state_dict(checkpoint['critic_{}'.format(i)])
            self.actors_target[i].load_state_dict(
                checkpoint['actor_target_{}'.format(i)])
            self.critics_target[i].load_state_dict(
                checkpoint['critic_target_{}'.format(i)])
            self.actor_optimizer[i].load_state_dict(
                checkpoint['actor_optimizer_{}'.format(i)])
            self.critic_optimizer[i].load_state_dict(
                checkpoint['critic_optimizer_{}'.format(i)])
            self.var[i] = checkpoint['var_{}'.format(i)]

            if self.use_approx:
                for j in range(self.n_agents):
                    if i != j:
                        self.approx_policies[i][j].load_state_dict(
                            checkpoint['approx_policy_{}_{}'.format(i, j)])
                        self.approx_targets[i][j].load_state_dict(
                            checkpoint['approx_target_{}_{}'.format(i, j)])
                        self.approx_optimizer[i][j].load_state_dict(
                            checkpoint['approx_optimizer_{}_{}'.format(i, j)])

    def load_all_agent(self, path, model_number, map_location):
        '''Strictly for testing; do not use to resume training because the critic networks' sizes differ.'''
        checkpoint = torch.load(path, map_location=map_location)

        # fall back to agent 0 if the requested model number is out of range
        if model_number >= self.n_agents or model_number < 0:
            model_number = 0

        for i in range(self.n_agents):
            self.actors[i].load_state_dict(
                checkpoint['actor_{}'.format(model_number)])
            self.actors_target[i].load_state_dict(
                checkpoint['actor_target_{}'.format(model_number)])
            self.actor_optimizer[i].load_state_dict(
                checkpoint['actor_optimizer_{}'.format(model_number)])
            self.var[i] = checkpoint['var_{}'.format(model_number)]

    def load_agent(self, path, agent_number, model_number, map_location):
        '''Strictly for testing; do not use to resume training because the critic networks' sizes differ.'''
        checkpoint = torch.load(path, map_location=map_location)

        # fall back to index 0 if the requested agent or model number is out of range
        if agent_number >= self.n_agents or agent_number < 0:
            agent_number = 0

        if model_number >= self.n_agents or model_number < 0:
            model_number = 0

        self.actors[agent_number].load_state_dict(
            checkpoint['actor_{}'.format(model_number)])
        self.actors_target[agent_number].load_state_dict(
            checkpoint['actor_target_{}'.format(model_number)])
        self.actor_optimizer[agent_number].load_state_dict(
            checkpoint['actor_optimizer_{}'.format(model_number)])
        self.var[agent_number] = checkpoint['var_{}'.format(model_number)]
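
Both MADDPG examples call a soft_update(target, source, tau) helper and unpack transitions through an Experience container, neither of which is shown here. As a point of reference, a minimal version consistent with how they are used above (an assumption about the missing code, not the authors' implementation) could look like this:

from collections import namedtuple

import torch

# Field names match the attributes accessed in update_policy above; the field
# order is an assumption, since the ReplayMemory implementation is not shown.
Experience = namedtuple('Experience',
                        ('states', 'actions', 'next_states', 'rewards'))

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)
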