def __init__(self, env, policy_lr, value_lr, tau, gamma, buffer_size,
                 max_time_step, observate_time, batch_size, path,
                 soft_update_step, use_cuda):
        self.env = env
        self.policy_lr = policy_lr
        self.value_lr = value_lr
        self.use_cuda = bool(use_cuda)
        self.tau = tau
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.max_time_step = max_time_step
        self.observate_time = observate_time
        self.batch_size = batch_size
        self.global_time_step = 0
        self.path = path
        self.soft_update_step = soft_update_step

        print('IF USE CUDA: ' + str(self.use_cuda))

        num_inputs = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        # the scale of the action space....
        self.action_scale = self.env.action_space.high[0]

        # build up the network....
        # build the actor_network firstly...
        self.actor_net = models.Policy(num_inputs, self.num_actions)
        self.actor_target_net = models.Policy(num_inputs, self.num_actions)

        # build the critic_network....
        self.critic_net = models.Critic(num_inputs, self.num_actions)
        self.critic_target_net = models.Critic(num_inputs, self.num_actions)

        # if use cuda...
        if self.use_cuda:
            self.actor_net.cuda()
            self.actor_target_net.cuda()

            self.critic_net.cuda()
            self.critic_target_net.cuda()

        # init the same parameters....
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())

        # define the optimizers.... add the L2 reg in the critic optimizer here...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.policy_lr)
        self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                                 lr=self.value_lr,
                                                 weight_decay=1e-2)

        # init the filter...
        self.running_state = ZFilter((num_inputs, ), clip=5)
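A minimal sketch of the Polyak soft update that `tau` and `soft_update_step` imply for the target networks above (hypothetical helper, not shown in this snippet):

def soft_update(target_net, source_net, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)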
Example #2
    def __init__(self):
        self.replay_buffer = replaybuffer.ReplayBuffer(5000)

        self.env = PendulumEnv()

        observation = self.env.reset()

        # fall back to CPU when CUDA is unavailable
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # INSTANTIATE MODELS
        state_size = 3
        action_size = 1
        self.state_dreamer = models.StateDreamer(state_size, action_size)
        self.reward_dreamer = models.RewardDreamer(state_size)
        self.actor = models.Actor(state_size, action_size)
        self.critic = models.Critic(state_size, action_size)

        #put models on device
        self.state_dreamer.to(self.device)
        self.reward_dreamer.to(self.device)
        self.actor.to(self.device)
        self.critic.to(self.device)

        #create optimiser for each model
        self.state_dreamer_optimizer = optim.SGD(
            self.state_dreamer.parameters(), lr=0.01, momentum=0.9)
        self.reward_dreamer_optimizer = optim.SGD(
            self.reward_dreamer.parameters(), lr=0.01, momentum=0.9)
        self.actor_optimizer = optim.SGD(self.actor.parameters(),
                                         lr=0.0001,
                                         momentum=0.9)
        self.critic_optimizer = optim.SGD(self.critic.parameters(),
                                          lr=0.001,
                                          momentum=0.9)
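A hedged sketch of one model-learning step these pieces suggest; the buffer's `sample(batch_size)` method and the `StateDreamer(state, action)` call signature are assumptions, not confirmed by the source:

import torch.nn.functional as F

def dreamer_update(self, batch_size=64):
    # Assumed buffer API: returns (state, action, reward, next_state) tensors
    state, action, reward, next_state = self.replay_buffer.sample(batch_size)
    state, action = state.to(self.device), action.to(self.device)
    reward, next_state = reward.to(self.device), next_state.to(self.device)

    # One-step dynamics loss: predict the next state from (state, action)
    state_loss = F.mse_loss(self.state_dreamer(state, action), next_state)
    self.state_dreamer_optimizer.zero_grad()
    state_loss.backward()
    self.state_dreamer_optimizer.step()

    # Reward-model loss: predict the reward from the state alone
    reward_loss = F.mse_loss(self.reward_dreamer(state).squeeze(-1), reward)
    self.reward_dreamer_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_dreamer_optimizer.step()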
Example #3
    def __init__(self, config, out_dir):
        super().__init__(config)

        def env_make_fn():
            return gym.make(config['env'])

        self.env = env_make_fn()
        self.device = config['device']
        self.storage = StorageWrapper.remote(storage.ReplayBuffer, [config['replay_buffer_size']], {})
        critic_kwargs = {
            'num_inputs': self.env.observation_space.shape[0],
            'actions_dim': self.env.action_space.shape[0]
        }
        policy_kwargs = critic_kwargs  # note: aliases the same dict object
        self.critic = models.Critic(**critic_kwargs).to(self.device)
        self.policy = models.Policy(**policy_kwargs).to(self.device)
        self.target_policy = copy.deepcopy(self.policy)
        self.target_critic = copy.deepcopy(self.critic)

        self.params_server = ParamServer.remote(utils.get_cpu_state_dict(self.policy))
        self.evaluator = workers.Evaluator.as_remote(num_gpus=config['gpu_per_runner'], num_cpus=config['cpu_per_runner'])
        self.evaluator = self.evaluator.remote(models.Policy,
                                               policy_kwargs,
                                               env_make_fn,
                                               self.params_server,
                                               self.config)

        self.runners = [workers.Runner.as_remote(num_gpus=config['gpu_per_runner'], 
                                                 num_cpus=config['cpu_per_runner']).remote(models.Policy,
                                                                                           policy_kwargs,
                                                                                           env_make_fn,
                                                                                           self.params_server,
                                                                                           self.storage,
                                                                                           self.config)
                        for _ in range(self.config['n_runners'])]

        self.critic.train()
        self.policy.train()
        self.target_policy.eval()
        self.target_critic.eval()
        self.opt_policy = torch.optim.Adam([{'params': self.policy.parameters(), 'lr': self.config['policy_lr']}])
        self.opt_critic = torch.optim.Adam([{'params': self.critic.parameters(), 'lr': self.config['critic_lr']}])
        self.critic_loss = None
        self.policy_loss = None
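With the target networks frozen in eval mode, the usual DDPG bootstrap target this setup implies looks like the sketch below; the `critic(state, action)` and `policy(state)` call signatures are assumptions:

import torch
import torch.nn.functional as F

def ddpg_critic_loss(critic, target_critic, target_policy,
                     state, action, reward, next_state, done, gamma=0.99):
    # Bootstrap target computed with the frozen target networks
    with torch.no_grad():
        next_action = target_policy(next_state)
        td_target = reward + gamma * (1.0 - done) * \
            target_critic(next_state, next_action)
    return F.mse_loss(critic(state, action), td_target)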
Example #4
def single_run(args,
               logger,
               env,
               eval_env,
               num_episodes=100,
               num_eval_episodes=25,
               max_steps=200,
               γ=1.0,
               lr_actor=0.01,
               lr_critic=0.05,
               pol_ent=1):
    """Main algo to train an RL agent"""
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    return_run = np.zeros(num_episodes)
    samples_run = np.zeros(num_episodes)
    actor = models.SigmoidPolicy(num_states, num_actions)
    actor_opt = optim.Adam(actor.parameters(), lr=lr_actor)
    critic = models.Critic(num_states, num_actions)
    critic_opt = optim.Adam(critic.parameters(), lr=lr_critic)

    actor_params_sizes = torch.tensor(
        np.cumsum([0] + [len(t.flatten()) for t in list(actor.parameters())]))
    gradient_network = models.GradientNetwork(num_states,
                                              actor_params_sizes[-1])
    gradient_network_opt = optim.Adam(gradient_network.parameters(),
                                      lr=lr_critic)

    evaluations = []
    # bar = pyprind.ProgBar(num_episodes)
    for episode in range(num_episodes):
        print("episode", episode)
        # bar.update()
        obs = env.reset()
        obs_hist = deque()
        log_prob_a_hist = deque()
        adv_hist = deque()
        q_sa_target_hist = deque()
        q_sa_hist = deque()
        return_all_eval_episodes = np.zeros(num_eval_episodes)
        total_vae_loss = 0
        scale_gamma = 1.0
        actor_params_list = []

        gradient_td_error_loss = deque()

        # Detached params and pointers
        actor_params = (torch.cat([t.flatten() \
                for t in list(actor.parameters())]).view(1,-1)).clone().detach().requires_grad_(True)
        actor_params_list = list(actor.parameters())

        # *** COLLECT DATA ***
        for step in range(max_steps):

            # Predict gradient
            grad_output_current_state = gradient_network(
                one_hot_ify(obs, num_states), actor_params)

            # Get actor and critic values
            prob_a = actor(one_hot_ify(obs, num_states))
            q_s = critic(one_hot_ify(obs, num_states))
            a_dist = torch.distributions.Categorical(probs=prob_a)
            action = int(a_dist.sample().numpy()[0])

            # Log: action prob, advantage, q values
            log_prob_a_hist.append(
                (a_dist.log_prob(torch.tensor(action))).view(1, -1))
            adv_hist.append(
                (q_s.data[0, action] -
                 (q_s.data[0, :] * prob_a.data[0, :]).sum()).view(1, -1))
            q_sa_hist.append((q_s[0, action]).view(1, -1))

            obs, rew, done, _ = env.step(action)
            obs_hist.append(obs)

            # add a policy-entropy bonus to the reward
            rew = rew + pol_ent * entropy(prob_a.data[0])

            # Get log_prob with grad function, for gradient network
            log_prob = a_dist.log_prob(torch.tensor(action))
            with torch.no_grad():
                # Next actor critic values
                q_s_next = critic(one_hot_ify(obs, num_states))
                prob_a_next = actor(one_hot_ify(obs, num_states))
                v_next = (q_s_next * prob_a_next).sum()
                q_target = rew + γ * v_next
                q_sa_target_hist.append((q_target).view(1, -1))

                # Predict next gradient
                # TODO: experiment with conditioning on either logits or params
                # Also, since we are taking the params, we cannot do a max over
                # actions for the next grad state
                grad_output_next_state = gradient_network(
                    one_hot_ify(obs, num_states), actor_params)

                # Compute next gradient target
                # gradient_reward = a_dist.log_prob(torch.tensor(action)) * (q_s.data[0,action] - (q_s.data[0,:]*prob_a.data[0,:]).sum())
                adv = (q_s.data[0, action] -
                       (q_s.data[0, :] * prob_a.data[0, :]).sum())
                gradient_reward = torch.autograd.grad(log_prob,
                                                      actor_params_list,
                                                      retain_graph=True)
                gradient_reward = (torch.cat([t.flatten() \
                        for t in list(gradient_reward)]).view(1,-1))
                gradient_target = gradient_reward * adv + γ * grad_output_next_state

            gradient_td_error = nn.MSELoss()(gradient_target,
                                             grad_output_current_state)
            gradient_td_error_loss.append(gradient_td_error.view(1, -1))

            samples_run[episode] = step + 1

            if done:
                break

        # *** POLICY UPDATE ***

        critic_loss = nn.MSELoss()(torch.cat(list(q_sa_hist)),
                                   torch.cat(list(q_sa_target_hist)))
        critic_opt.zero_grad()
        critic_loss.backward()
        critic_opt.step()

        actor_opt.zero_grad()

        # Update via the gradient network when using the Bellman policy gradient
        if args.pg_bellman:
            gradient_network_opt.zero_grad()
            gradient_loss = torch.cat(list(gradient_td_error_loss)).sum()
            gradient_loss.backward()
            gradient_network_opt.step()
            # for t_index, t in enumerate(list(actor.parameters())):
            #     t.grad = - lamda_ent * (actor_params.grad[0, actor_params_sizes[t_index]:actor_params_sizes[t_index+1]]).view(t.shape)


            # Policy param Update
            # TODO: Now that the Gradient Network is updated, we should loop
            # through the state/action history and update the policy params?

            # Loop state/action history and collect grads
            grads = torch.zeros_like(grad_output_current_state)
            for obs in obs_hist:
                grads += gradient_network(one_hot_ify(obs, num_states),
                                          actor_params)
            grads = grads / len(obs_hist)
            grads = grads.flatten()
            start = 0
            # Grab new param grads, reshape to same size
            for p in actor_params_list:
                stop = start + p.nelement()
                g = grads[start:stop].view(p.size())
                p.grad = -g.clone()  # clone otherwise opt won't work
                start = stop
            actor_opt.step()

        # If using standard ac policy gradient:
        else:
            actor_loss = -(torch.cat(list(log_prob_a_hist)) *
                           torch.cat(list(adv_hist))).sum()
            actor_loss.backward()
            actor_opt.step()

        # *** EVALUATION ***
        for eval_episode in range(num_eval_episodes):
            eval_obs = eval_env.reset()
            return_eval_episode = 0
            scale = 1.0
            for eval_step in range(max_steps):
                with torch.no_grad():
                    eval_prob_a = actor(one_hot_ify(eval_obs, num_states))
                    eval_a = torch.distributions.Categorical(
                        probs=eval_prob_a).sample().numpy()[0]
                eval_obs, eval_rew, eval_done, _ = eval_env.step(eval_a)
                return_eval_episode += scale * eval_rew
                scale *= γ
                if eval_done:
                    break
            return_all_eval_episodes[eval_episode] = return_eval_episode
        return_run[episode] = np.mean(return_all_eval_episodes)
        print("EvalRewards : ", episode, ":",
              np.mean(return_all_eval_episodes))
        evaluations.append(np.mean(return_all_eval_episodes))
        logger.record_reward(evaluations)

    logger.save()
    return return_run, samples_run, actor, critic
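`one_hot_ify` is not defined in this snippet; a plausible implementation, consistent with how it is called above (a 1 x num_states float tensor indexed as `[0, action]`):

import torch

def one_hot_ify(obs, num_states):
    # Encode a discrete observation as a (1, num_states) one-hot float tensor
    vec = torch.zeros(1, num_states)
    vec[0, int(obs)] = 1.0
    return vec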
Example #5
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # * Step 1: init data folders
    print("init data folders")

    # * Init character folders for dataset construction
    metatrain_character_folders, metatest_character_folders = tg.mini_imagenet_folders()

    # * Step 2: init neural networks
    print("init neural networks")

    feature_encoder = models.CNNEncoder()
    actor = models.Actor(FEATURE_DIM, RELATION_DIM, CLASS_NUM)
    critic = models.Critic(FEATURE_DIM, RELATION_DIM)

    #feature_encoder = torch.nn.DataParallel(feature_encoder)
    #actor = torch.nn.DataParallel(actor)
    #critic = torch.nn.DataParallel(critic)

    feature_encoder.train()
    actor.train()
    critic.train()

    feature_encoder.apply(models.weights_init)
    actor.apply(models.weights_init)
    critic.apply(models.weights_init)

    feature_encoder.to(device)
    actor.to(device)
    critic.to(device)

    agent = a2cAgent.A2CAgent(actor, critic, GAMMA, ENTROPY_WEIGHT,
                              FEATURE_DIM, RELATION_DIM, CLASS_NUM, device)

    #feature_encoder.eval()
    #relation_network.eval()

    feature_encoder_path = f"./models/miniimagenet_feature_encoder_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl"
    if os.path.exists(feature_encoder_path):
        feature_encoder.load_state_dict(torch.load(feature_encoder_path))
        print("load feature encoder success")

    actor_path = f"./models/miniimagenet_actor_network_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl"
    if os.path.exists(actor_path):
        actor.load_state_dict(torch.load(actor_path))
        print("load actor network success")

    critic_path = f"./models/miniimagenet_critic_network_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl"
    if os.path.exists(critic_path):
        critic.load_state_dict(torch.load(critic_path))
        print("load critic network success")

    max_accuracy_list = []
    mean_accuracy_list = []
    for episode in range(1):
        total_accuracy = []
        for i in range(TEST_EPISODE):
            # * Generate env
            env_states_list = []
            env_labels_list = []
            number_of_query_image = 15
            task = tg.MiniImagenetTask(metatest_character_folders, CLASS_NUM,
                                       SAMPLE_NUM_PER_CLASS,
                                       number_of_query_image)
            sample_dataloader = tg.get_mini_imagenet_data_loader(
                task,
                num_per_class=SAMPLE_NUM_PER_CLASS,
                split="train",
                shuffle=False)
            test_dataloader = tg.get_mini_imagenet_data_loader(
                task,
                num_per_class=number_of_query_image,
                split="test",
                shuffle=True)

            sample_images, sample_labels = next(iter(sample_dataloader))
            test_images, test_labels = next(iter(test_dataloader))

            sample_images, sample_labels = sample_images.to(
                device), sample_labels.to(device)
            test_images, test_labels = test_images.to(device), test_labels.to(
                device)

            # * calculate features
            sample_features = feature_encoder(sample_images)
            sample_features = sample_features.view(CLASS_NUM,
                                                   SAMPLE_NUM_PER_CLASS,
                                                   FEATURE_DIM, 19, 19)
            sample_features = torch.sum(sample_features, 1).squeeze(1)
            test_features = feature_encoder(test_images)

            # * calculate relations
            # * each query sample is paired with every support sample to
            # * form a 100x128 matrix for the relation network

            sample_features_ext = sample_features.unsqueeze(0).repeat(
                number_of_query_image * CLASS_NUM, 1, 1, 1, 1)
            test_features_ext = test_features.unsqueeze(0).repeat(
                CLASS_NUM, 1, 1, 1, 1)
            test_features_ext = torch.transpose(test_features_ext, 0, 1)

            relation_pairs = torch.cat(
                (sample_features_ext, test_features_ext),
                2).view(-1, FEATURE_DIM * 2, 19, 19)
            env_states_list.append(relation_pairs)
            env_labels_list.append(test_labels)

            test_env = a2cAgent.env(env_states_list, env_labels_list)
            rewards = agent.test(test_env)
            test_accuracy = rewards / len(test_labels)
            print(test_accuracy)
            total_accuracy.append(test_accuracy)

        mean_accuracy, conf_int = mean_confidence_interval(total_accuracy)
        print(f"Total accuracy : {mean_accuracy:.4f}")
        print(f"confidence interval : {conf_int:.4f}")
Example #6
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # * Step 1: init data folders
    print("init data folders")

    # * Init character folders for dataset construction
    metatrain_character_folders, metatest_character_folders = tg.mini_imagenet_folders()

    # * Step 2: init neural networks
    print("init neural networks")

    feature_encoder = models.CNNEncoder()
    actor = models.Actor(FEATURE_DIM, RELATION_DIM, CLASS_NUM)
    critic = models.Critic(FEATURE_DIM, RELATION_DIM)

    #feature_encoder = torch.nn.DataParallel(feature_encoder)
    #actor = torch.nn.DataParallel(actor)
    #critic = torch.nn.DataParallel(critic)

    feature_encoder.train()
    actor.train()
    critic.train()

    feature_encoder.apply(models.weights_init)
    actor.apply(models.weights_init)
    critic.apply(models.weights_init)

    feature_encoder.to(device)
    actor.to(device)
    critic.to(device)

    cross_entropy = nn.CrossEntropyLoss()

    feature_encoder_optim = torch.optim.Adam(feature_encoder.parameters(),
                                             lr=LEARNING_RATE)
    feature_encoder_scheduler = StepLR(feature_encoder_optim,
                                       step_size=10000,
                                       gamma=0.5)

    actor_optim = torch.optim.Adam(actor.parameters(), lr=2.5 * LEARNING_RATE)
    actor_scheduler = StepLR(actor_optim, step_size=10000, gamma=0.5)

    critic_optim = torch.optim.Adam(critic.parameters(),
                                    lr=2.5 * LEARNING_RATE * 10)
    critic_scheduler = StepLR(critic_optim, step_size=10000, gamma=0.5)

    agent = a2cAgent.A2CAgent(actor, critic, GAMMA, ENTROPY_WEIGHT, CLASS_NUM,
                              device)

    feature_encoder_path = f"./models/miniimagenet_feature_encoder_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl"
    if os.path.exists(feature_encoder_path):
        feature_encoder.load_state_dict(torch.load(feature_encoder_path))
        print("load feature encoder success")

    actor_path = f"./models/miniimagenet_actor_network_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl"
    if os.path.exists(actor_path):
        actor.load_state_dict(torch.load(actor_path))
        print("load actor network success")

    critic_path = f"./models/miniimagenet_critic_network_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl"
    if os.path.exists(critic_path):
        critic.load_state_dict(torch.load(critic_path))
        print("load critic network success")

    # * Step 3: build graph
    print("Training...")

    last_accuracy = 0.0
    mbal_loss_list = []
    mbcl_loss_list = []
    loss_list = []
    number_of_query_image = 15
    for episode in range(EPISODE):
        #print(f"EPISODE : {episode}")
        policy_losses = []
        value_losses = []

        for meta_batch in range(META_BATCH_RANGE):
            meta_env_states_list = []
            meta_env_labels_list = []
            for inner_batch in range(INNER_BATCH_RANGE):
                # * Generate environment
                env_states_list = []
                env_labels_list = []
                for env in range(ENV_LENGTH):
                    task = tg.MiniImagenetTask(metatrain_character_folders,
                                               CLASS_NUM, SAMPLE_NUM_PER_CLASS,
                                               number_of_query_image)
                    sample_dataloader = tg.get_mini_imagenet_data_loader(
                        task,
                        num_per_class=SAMPLE_NUM_PER_CLASS,
                        split="train",
                        shuffle=False)
                    batch_dataloader = tg.get_mini_imagenet_data_loader(
                        task, num_per_class=5, split="test", shuffle=True)

                    samples, sample_labels = next(iter(sample_dataloader))
                    samples, sample_labels = samples.to(
                        device), sample_labels.to(device)
                    for batches, batch_labels in batch_dataloader:
                        batches, batch_labels = batches.to(
                            device), batch_labels.to(device)

                        inner_sample_features = feature_encoder(samples)
                        inner_sample_features = inner_sample_features.view(
                            CLASS_NUM, SAMPLE_NUM_PER_CLASS, FEATURE_DIM, 19,
                            19)
                        inner_sample_features = torch.sum(
                            inner_sample_features, 1).squeeze(1)

                        inner_batch_features = feature_encoder(batches)
                        inner_sample_feature_ext = inner_sample_features.unsqueeze(
                            0).repeat(5 * CLASS_NUM, 1, 1, 1, 1)
                        inner_batch_features_ext = inner_batch_features.unsqueeze(
                            0).repeat(CLASS_NUM, 1, 1, 1, 1)
                        inner_batch_features_ext = torch.transpose(
                            inner_batch_features_ext, 0, 1)

                        inner_relation_pairs = torch.cat(
                            (inner_sample_feature_ext,
                             inner_batch_features_ext),
                            2).view(-1, FEATURE_DIM * 2, 19, 19)
                        env_states_list.append(inner_relation_pairs)
                        env_labels_list.append(batch_labels)

                inner_env = a2cAgent.env(env_states_list, env_labels_list)
                agent.train(inner_env, inner_update=True)

            for meta_env in range(META_ENV_LENGTH):
                task = tg.MiniImagenetTask(metatrain_character_folders,
                                           CLASS_NUM, SAMPLE_NUM_PER_CLASS,
                                           number_of_query_image)
                sample_dataloader = tg.get_mini_imagenet_data_loader(
                    task,
                    num_per_class=SAMPLE_NUM_PER_CLASS,
                    split="train",
                    shuffle=False)
                batch_dataloader = tg.get_mini_imagenet_data_loader(
                    task,
                    num_per_class=number_of_query_image,
                    split="test",
                    shuffle=True)
                # * num_per_class : number of query images

                # * sample datas
                samples, sample_labels = next(iter(sample_dataloader))
                samples, sample_labels = samples.to(device), sample_labels.to(
                    device)
                # * Generate env for meta update
                batches, batch_labels = next(iter(batch_dataloader))
                # * init dataset
                # * sample_dataloader provides the support samples for comparison
                # * batch_dataloader batches the query samples for training
                batches, batch_labels = batches.to(device), batch_labels.to(
                    device)

                # * calculates features
                #feature_encoder.weight = feature_fast_weights

                sample_features = feature_encoder(samples)
                sample_features = sample_features.view(CLASS_NUM,
                                                       SAMPLE_NUM_PER_CLASS,
                                                       FEATURE_DIM, 19, 19)
                sample_features = torch.sum(sample_features, 1).squeeze(1)
                batch_features = feature_encoder(batches)

                # * calculate relations
                # * each query sample is paired with every support sample to
                # * form a 100 * 128 matrix for the relation network
                sample_features_ext = sample_features.unsqueeze(0).repeat(
                    number_of_query_image * CLASS_NUM, 1, 1, 1, 1)
                batch_features_ext = batch_features.unsqueeze(0).repeat(
                    CLASS_NUM, 1, 1, 1, 1)
                batch_features_ext = torch.transpose(batch_features_ext, 0, 1)
                relation_pairs = torch.cat(
                    (sample_features_ext, batch_features_ext),
                    2).view(-1, FEATURE_DIM * 2, 19, 19)

                meta_env_states_list.append(relation_pairs)
                meta_env_labels_list.append(batch_labels)

            meta_env = a2cAgent.env(meta_env_states_list, meta_env_labels_list)
            agent.train(meta_env,
                        policy_loss_list=policy_losses,
                        value_loss_list=value_losses)

        meta_batch_actor_loss = torch.stack(policy_losses).mean()
        meta_batch_critic_loss = torch.stack(value_losses).mean()

        feature_encoder_optim.zero_grad()
        actor_optim.zero_grad()
        critic_optim.zero_grad()

        # retain_graph=True: both losses backpropagate through the shared feature encoder
        meta_batch_actor_loss.backward(retain_graph=True)
        meta_batch_critic_loss.backward()

        # clip only after backward() has populated the gradients
        torch.nn.utils.clip_grad_norm_(feature_encoder.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(actor.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(critic.parameters(), 0.5)

        feature_encoder_optim.step()
        actor_optim.step()
        critic_optim.step()

        feature_encoder_scheduler.step()
        actor_scheduler.step()
        critic_scheduler.step()

        if (episode + 1) % 100 == 0:
            mbal = meta_batch_actor_loss.cpu().detach().numpy()
            mbcl = meta_batch_critic_loss.cpu().detach().numpy()
            print(
                f"episode : {episode+1}, meta_batch_actor_loss : {mbal:.4f}, meta_batch_critic_loss : {mbcl:.4f}"
            )

            mbal_loss_list.append(mbal)
            mbcl_loss_list.append(mbcl)
            loss_list.append(mbal + mbcl)

        if (episode + 1) % 500 == 0:
            print("Testing...")
            total_reward = 0

            total_num_of_test_samples = 0
            for i in range(TEST_EPISODE):
                # * Generate env
                env_states_list = []
                env_labels_list = []

                number_of_query_image = 10
                task = tg.MiniImagenetTask(metatest_character_folders,
                                           CLASS_NUM, SAMPLE_NUM_PER_CLASS,
                                           number_of_query_image)
                sample_dataloader = tg.get_mini_imagenet_data_loader(
                    task,
                    num_per_class=SAMPLE_NUM_PER_CLASS,
                    split="train",
                    shuffle=False)
                test_dataloader = tg.get_mini_imagenet_data_loader(
                    task,
                    num_per_class=number_of_query_image,
                    split="test",
                    shuffle=True)
                sample_images, sample_labels = next(iter(sample_dataloader))
                sample_images, sample_labels = sample_images.to(
                    device), sample_labels.to(device)

                test_images, test_labels = next(iter(test_dataloader))
                total_num_of_test_samples += len(test_labels)
                test_images, test_labels = test_images.to(
                    device), test_labels.to(device)

                # * calculate features
                sample_features = feature_encoder(sample_images)
                sample_features = sample_features.view(CLASS_NUM,
                                                       SAMPLE_NUM_PER_CLASS,
                                                       FEATURE_DIM, 19, 19)
                sample_features = torch.sum(sample_features, 1).squeeze(1)
                test_features = feature_encoder(test_images)

                # * calculate relations
                # * each query sample is paired with every support sample to
                # * form a 100x128 matrix for the relation network

                sample_features_ext = sample_features.unsqueeze(0).repeat(
                    number_of_query_image * CLASS_NUM, 1, 1, 1, 1)
                test_features_ext = test_features.unsqueeze(0).repeat(
                    CLASS_NUM, 1, 1, 1, 1)
                test_features_ext = torch.transpose(test_features_ext, 0, 1)

                relation_pairs = torch.cat(
                    (sample_features_ext, test_features_ext),
                    2).view(-1, FEATURE_DIM * 2, 19, 19)
                env_states_list.append(relation_pairs)
                env_labels_list.append(test_labels)

                test_env = a2cAgent.env(env_states_list, env_labels_list)
                rewards = agent.test(test_env)
                total_reward += rewards

            test_accuracy = total_reward / (1.0 * total_num_of_test_samples)

            mean_loss = np.mean(loss_list)
            mean_actor_loss = np.mean(mbal_loss_list)
            mean_critic_loss = np.mean(mbcl_loss_list)

            print(f'mean loss : {mean_loss}')
            print("test accuracy : ", test_accuracy)

            writer.add_scalar('1.loss', mean_loss, episode + 1)
            writer.add_scalar('2.mean_actor_loss', mean_actor_loss,
                              episode + 1)
            writer.add_scalar('3.mean_critic_loss', mean_critic_loss,
                              episode + 1)
            writer.add_scalar('4.test accuracy', test_accuracy, episode + 1)

            loss_list = []
            mbal_loss_list = []
            mbcl_loss_list = []

            if test_accuracy > last_accuracy:
                # save networks
                torch.save(feature_encoder.state_dict(),
                           f"./models/miniimagenet_feature_encoder_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl")
                torch.save(actor.state_dict(),
                           f"./models/miniimagenet_actor_network_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl")
                torch.save(critic.state_dict(),
                           f"./models/miniimagenet_critic_network_{CLASS_NUM}way_{SAMPLE_NUM_PER_CLASS}shot.pkl")
                print("save networks for episode:", episode)
                last_accuracy = test_accuracy
Example #7
    def __init__(self,
                 config,
                 state_size,
                 action_size,
                 num_agents,
                 seed,
                 per=True):
        """Initialize an Agent object.
        
        Params
        ======
            config (config): instance of a config-class, which stores all the hyperparameters
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """

        self.config = config
        self.epsilon = self.config.EPSILON_START

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed

        # Initialize bins
        self.v_min = 0
        self.v_max = 5
        self.n_atoms = 51
        self.delta = (self.v_max - self.v_min) / float(self.n_atoms - 1)
        self.bin_centers = torch.from_numpy(
            np.array([
                self.v_min + i * self.delta for i in range(self.n_atoms)
            ]).reshape(-1, 1)).to(self.config.device)

        # Initialize the Actor and Critic Networks
        self.actor_local = models.Actor(state_size,
                                        action_size).to(self.config.device)
        self.actor_target = models.Actor(state_size,
                                         action_size).to(self.config.device)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                self.config.LR_actor)

        self.critic_local = models.Critic(state_size, action_size,
                                          self.n_atoms).to(self.config.device)
        self.critic_target = models.Critic(state_size, action_size,
                                           self.n_atoms).to(self.config.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            self.config.LR_critic,
            weight_decay=self.config.weight_decay)

        # Initialize the random-noise-process for action-noise
        self.is_training = True
        self.noise = OUNoise((self.num_agents, self.action_size), self.seed)

        # Hard update the target networks to have the same parameters as the local networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic_local.parameters()):
            target_param.data.copy_(param.data)

        # Initialize the replay buffer (note: `per` is accepted but not used in this snippet)
        self.memory = ReplayBuffer(self.config.BUFFER_SIZE,
                                   self.config.BATCH_SIZE, seed,
                                   self.config.device, self.config.N_BOOTSTRAP)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
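The `v_min`/`v_max`/`n_atoms` bins mark this as a distributional (C51/D4PG-style) critic; a sketch of how `bin_centers` typically turns predicted atom probabilities into a scalar Q-value (assumed, not shown in the source):

import torch

def expected_q(probs, bin_centers):
    # probs: (batch, n_atoms) softmax output of the distributional critic
    # bin_centers: (n_atoms, 1) support of the value distribution
    return torch.matmul(probs, bin_centers.float())  # (batch, 1) expected Q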
Example #8
    action_max = env.action_space.high[0]

    print("State dimension: {}".format(state_dimension))
    print("Action dimension: {}".format(action_dimension))
    print("Action max: {}".format(action_max))

    load_models = False

    # Create the actor and critic networks

    actor = models.Actor(state_dimension, action_dimension, action_max)
    target_actor = models.Actor(state_dimension, action_dimension, action_max)
    actor_optimizer = torch.optim.Adam(actor.parameters(),
                                       lr=ACTOR_LEARNING_RATE)

    critic = models.Critic(state_dimension, action_dimension)
    target_critic = models.Critic(state_dimension, action_dimension)
    critic_optimizer = torch.optim.Adam(critic.parameters(),
                                        lr=CRITIC_LEARNING_RATE)

    # Copy the weights into the target networks

    for target_param, param in zip(target_actor.parameters(),
                                   actor.parameters()):
        target_param.data.copy_(param.data)

    for target_param, param in zip(target_critic.parameters(),
                                   critic.parameters()):
        target_param.data.copy_(param.data)

    # Use the saved models
Example #9
load_models = args.load_models
lr = args.lr
model_dim = args.model_dim
n_epochs = args.n_epochs
n_critic = args.n_critic
seed = args.seed

output_dim = 784 # 784 = 28 * 28, number of pixels in an MNIST image

torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

generator = models.Generator(output_dim, latent_dim, model_dim)
critic = models.Critic(model_dim)
if load_models:
    generator.load_state_dict(torch.load('models/generator.pth.tar'))
    critic.load_state_dict(torch.load('models/critic.pth.tar'))
generator.to(device)
critic.to(device)

critic_optimizer = optim.Adam(critic.parameters(), lr=lr, betas=(0.5, 0.9))
generator_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.9))

# set distributions for later use
normal_dist = normal.Normal(0.0, 1.0)
uniform_dist = uniform.Uniform(0.0, 1.0)

# Create a latent variable that is used to visualize the progression of the generator
fixed_noise = normal_dist.sample((grid_size, latent_dim)).to(device)
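The `n_critic` count, the (0.5, 0.9) Adam betas, and a uniform distribution reserved "for later use" are all consistent with WGAN-GP training; a hedged sketch of the gradient-penalty term (an assumption about the rest of the loop, not code from the source):

import torch

def gradient_penalty(critic, real, fake, device):
    # Interpolate between real and fake samples with a per-sample epsilon
    eps = torch.rand(real.size(0), 1, device=device)
    interp = (eps * real + (1.0 - eps) * fake).requires_grad_(True)
    critic_out = critic(interp)
    grads = torch.autograd.grad(outputs=critic_out, inputs=interp,
                                grad_outputs=torch.ones_like(critic_out),
                                create_graph=True)[0]
    # Penalize deviation of the per-sample gradient norm from 1
    return ((grads.norm(2, dim=1) - 1.0) ** 2).mean()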