def __init__(self, env, policy_lr, value_lr, tau, gamma, buffer_size, max_time_step,
             observate_time, batch_size, path, soft_update_step, use_cuda):
    self.env = env
    self.policy_lr = policy_lr
    self.value_lr = value_lr
    self.use_cuda = bool(use_cuda)
    self.tau = tau
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.max_time_step = max_time_step
    self.observate_time = observate_time
    self.batch_size = batch_size
    self.global_time_step = 0
    self.path = path
    self.soft_update_step = soft_update_step
    print('IF USE CUDA: ' + str(self.use_cuda))

    num_inputs = self.env.observation_space.shape[0]
    self.num_actions = self.env.action_space.shape[0]
    # the scale of the action space
    self.action_scale = self.env.action_space.high[0]

    # build up the networks...
    # build the actor network first...
    self.actor_net = models.Policy(num_inputs, self.num_actions)
    self.actor_target_net = models.Policy(num_inputs, self.num_actions)
    # build the critic network...
    self.critic_net = models.Critic(num_inputs, self.num_actions)
    self.critic_target_net = models.Critic(num_inputs, self.num_actions)

    # move everything to CUDA if requested...
    if self.use_cuda:
        self.actor_net.cuda()
        self.actor_target_net.cuda()
        self.critic_net.cuda()
        self.critic_target_net.cuda()

    # initialise the target networks with the same parameters...
    self.actor_target_net.load_state_dict(self.actor_net.state_dict())
    self.critic_target_net.load_state_dict(self.critic_net.state_dict())

    # define the optimizers... add L2 regularization to the critic optimizer here...
    self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), lr=self.policy_lr)
    self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(), lr=self.value_lr,
                                             weight_decay=1e-2)

    # init the observation filter...
    self.running_state = ZFilter((num_inputs,), clip=5)
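# NOTE: hedged sketch, not part of the original file. The `tau` and `soft_update_step`
# values stored above usually drive a Polyak (soft) update of the target networks in
# DDPG-style agents; a minimal standalone helper, assuming plain torch modules, might be:
import torch

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)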
def __init__(self):
    self.replay_buffer = replaybuffer.ReplayBuffer(5000)
    self.env = PendulumEnv()
    observation = self.env.reset()
    self.device = torch.device("cuda")

    # INSTANTIATE MODELS
    state_size = 3
    action_size = 1
    self.state_dreamer = models.StateDreamer(state_size, action_size)
    self.reward_dreamer = models.RewardDreamer(state_size)
    self.actor = models.Actor(state_size, action_size)
    self.critic = models.Critic(state_size, action_size)

    # put models on the device
    self.state_dreamer.to(self.device)
    self.reward_dreamer.to(self.device)
    self.actor.to(self.device)
    self.critic.to(self.device)

    # create an optimiser for each model
    self.state_dreamer_optimizer = optim.SGD(self.state_dreamer.parameters(), lr=0.01, momentum=0.9)
    self.reward_dreamer_optimizer = optim.SGD(self.reward_dreamer.parameters(), lr=0.01, momentum=0.9)
    self.actor_optimizer = optim.SGD(self.actor.parameters(), lr=0.0001, momentum=0.9)
    self.critic_optimizer = optim.SGD(self.critic.parameters(), lr=0.001, momentum=0.9)
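# NOTE: hedged sketch, not part of the original file. Assuming the replay buffer's
# sample() returns (state, action, reward, next_state) tensors with matching shapes, and
# that StateDreamer maps (state, action) -> next state while RewardDreamer maps a state
# -> reward, one world-model update for the agent above could look roughly like this:
import torch
import torch.nn.functional as F

def dreamer_model_step(self, batch_size=64):
    # sample a batch of real transitions (assumed buffer API)
    state, action, reward, next_state = self.replay_buffer.sample(batch_size)
    state, action = state.to(self.device), action.to(self.device)
    reward, next_state = reward.to(self.device), next_state.to(self.device)

    # one-step predictions from the learned dynamics and reward models
    pred_next_state = self.state_dreamer(state, action)
    pred_reward = self.reward_dreamer(next_state)

    # regress both models toward the observed transition
    model_loss = F.mse_loss(pred_next_state, next_state) + F.mse_loss(pred_reward, reward)
    self.state_dreamer_optimizer.zero_grad()
    self.reward_dreamer_optimizer.zero_grad()
    model_loss.backward()
    self.state_dreamer_optimizer.step()
    self.reward_dreamer_optimizer.step()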
def __init__(self, config, out_dir):
    super().__init__(config)

    def env_make_fn():
        return gym.make(config['env'])

    self.env = env_make_fn()
    self.device = config['device']
    self.storage = StorageWrapper.remote(storage.ReplayBuffer, [config['replay_buffer_size']], {})

    critic_kwargs = {
        'num_inputs': self.env.observation_space.shape[0],
        'actions_dim': self.env.action_space.shape[0]
    }
    policy_kwargs = critic_kwargs

    self.critic = models.Critic(**critic_kwargs).to(self.device)
    self.policy = models.Policy(**policy_kwargs).to(self.device)
    self.target_policy = copy.deepcopy(self.policy)
    self.target_critic = copy.deepcopy(self.critic)

    self.params_server = ParamServer.remote(utils.get_cpu_state_dict(self.policy))
    self.evaluator = workers.Evaluator.as_remote(num_gpus=config['gpu_per_runner'],
                                                 num_cpus=config['cpu_per_runner'])
    self.evaluator = self.evaluator.remote(models.Policy, policy_kwargs, env_make_fn,
                                           self.params_server, self.config)
    self.runners = [
        workers.Runner.as_remote(num_gpus=config['gpu_per_runner'],
                                 num_cpus=config['cpu_per_runner']).remote(
            models.Policy, policy_kwargs, env_make_fn,
            self.params_server, self.storage, self.config)
        for _ in range(self.config['n_runners'])
    ]

    self.critic.train()
    self.policy.train()
    self.target_policy.eval()
    self.target_critic.eval()

    self.opt_policy = torch.optim.Adam([{'params': self.policy.parameters(),
                                         'lr': self.config['policy_lr']}])
    self.opt_critic = torch.optim.Adam([{'params': self.critic.parameters(),
                                         'lr': self.config['critic_lr']}])
    self.critic_loss = None
    self.policy_loss = None
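# NOTE: hedged sketch, not from the original repo. Given the local/target networks and
# optimizers created above, and assuming the critic takes (state, action) batches, a
# standard DDPG-style learner step on a sampled transition batch would typically be:
import torch
import torch.nn.functional as F

def ddpg_update(self, state, action, reward, next_state, done, gamma=0.99):
    # critic update toward the bootstrapped target produced by the target networks
    with torch.no_grad():
        next_action = self.target_policy(next_state)
        target_q = reward + gamma * (1.0 - done) * self.target_critic(next_state, next_action)
    critic_loss = F.mse_loss(self.critic(state, action), target_q)
    self.opt_critic.zero_grad()
    critic_loss.backward()
    self.opt_critic.step()

    # policy update: ascend the critic's value of the policy's own actions
    policy_loss = -self.critic(state, self.policy(state)).mean()
    self.opt_policy.zero_grad()
    policy_loss.backward()
    self.opt_policy.step()

    self.critic_loss, self.policy_loss = critic_loss.item(), policy_loss.item()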
def single_run(args, logger, env, eval_env, num_episodes=100, num_eval_episodes=25,
               max_steps=200, γ=1.0, lr_actor=0.01, lr_critic=0.05, pol_ent=1):
    """Main algo to train an RL agent"""
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    return_run = np.zeros(num_episodes)
    samples_run = np.zeros(num_episodes)

    actor = models.SigmoidPolicy(num_states, num_actions)
    actor_opt = optim.Adam(actor.parameters(), lr=lr_actor)
    critic = models.Critic(num_states, num_actions)
    critic_opt = optim.Adam(critic.parameters(), lr=lr_critic)

    actor_params_sizes = torch.tensor(
        np.cumsum([0] + [len(t.flatten()) for t in list(actor.parameters())]))
    gradient_network = models.GradientNetwork(num_states, actor_params_sizes[-1])
    gradient_network_opt = optim.Adam(gradient_network.parameters(), lr=lr_critic)

    evaluations = []
    # bar = pyprind.ProgBar(num_episodes)
    for episode in range(num_episodes):
        print("episode", episode)
        # bar.update()
        obs = env.reset()
        obs_hist = deque()
        log_prob_a_hist = deque()
        adv_hist = deque()
        q_sa_target_hist = deque()
        q_sa_hist = deque()
        return_all_eval_episodes = np.zeros(num_eval_episodes)
        total_vae_loss = 0
        scale_gamma = 1.0
        actor_params_list = []
        gradient_td_error_loss = deque()

        # Detached params and pointers
        actor_params = (torch.cat([t.flatten() for t in list(actor.parameters())])
                        .view(1, -1)).clone().detach().requires_grad_(True)
        actor_params_list = list(actor.parameters())

        # *** COLLECT DATA ***
        for step in range(max_steps):
            # Predict gradient
            grad_output_current_state = gradient_network(one_hot_ify(obs, num_states), actor_params)
            # Get actor and critic values
            prob_a = actor(one_hot_ify(obs, num_states))
            q_s = critic(one_hot_ify(obs, num_states))
            a_dist = torch.distributions.Categorical(probs=prob_a)
            action = int(a_dist.sample().numpy()[0])

            # Log: action prob, advantage, q values
            log_prob_a_hist.append((a_dist.log_prob(torch.tensor(action))).view(1, -1))
            adv_hist.append((q_s.data[0, action] -
                             (q_s.data[0, :] * prob_a.data[0, :]).sum()).view(1, -1))
            q_sa_hist.append((q_s[0, action]).view(1, -1))

            obs, rew, done, _ = env.step(action)
            obs_hist.append(obs)
            rew = rew + pol_ent * entropy(prob_a.data[0])  # added policy entropy to the reward function

            # Get log_prob with grad function, for the gradient network
            log_prob = a_dist.log_prob(torch.tensor(action))
            with torch.no_grad():
                # Next actor and critic values
                q_s_next = critic(one_hot_ify(obs, num_states))
                prob_a_next = actor(one_hot_ify(obs, num_states))
                v_next = (q_s_next * prob_a_next).sum()
                q_target = rew + γ * v_next
                q_sa_target_hist.append((q_target).view(1, -1))

            # Predict next gradient
            # TODO: experiment with conditioning on either logits or params
            # Also, since we are taking the params, we cannot do a max over
            # actions for the next grad state
            grad_output_next_state = gradient_network(one_hot_ify(obs, num_states), actor_params)

            # Compute next gradient target
            # gradient_reward = a_dist.log_prob(torch.tensor(action)) * (q_s.data[0, action] - (q_s.data[0, :] * prob_a.data[0, :]).sum())
            adv = (q_s.data[0, action] - (q_s.data[0, :] * prob_a.data[0, :]).sum())
            gradient_reward = torch.autograd.grad(log_prob, actor_params_list, retain_graph=True)
            gradient_reward = (torch.cat([t.flatten() for t in list(gradient_reward)]).view(1, -1))
            gradient_target = gradient_reward * adv + γ * grad_output_next_state
            gradient_td_error = nn.MSELoss()(gradient_target, grad_output_current_state)
            gradient_td_error_loss.append(gradient_td_error.view(1, -1))

            samples_run[episode] = step + 1
            if done:
                break

        # *** POLICY UPDATE ***
        critic_loss = nn.MSELoss()(torch.cat(list(q_sa_hist)), torch.cat(list(q_sa_target_hist)))
        critic_opt.zero_grad()
        critic_loss.backward()
        critic_opt.step()

        actor_opt.zero_grad()
        # Update with the pg_bell function
        if args.pg_bellman:
            gradient_network_opt.zero_grad()
            gradient_loss = torch.cat(list(gradient_td_error_loss)).sum()
            gradient_loss.backward()
            gradient_network_opt.step()
            # for t_index, t in enumerate(list(actor.parameters())):
            #     t.grad = - lamda_ent * (actor_params.grad[0, actor_params_sizes[t_index]:actor_params_sizes[t_index+1]]).view(t.shape)

            # Policy param update
            # TODO: Now that the gradient network is updated, we should loop
            # through the state/action history and update the policy params?
            # Loop over the state history and collect grads
            grads = torch.zeros_like(grad_output_current_state)
            for obs in obs_hist:
                grads += gradient_network(one_hot_ify(obs, num_states), actor_params)
            grads = grads / len(obs_hist)
            grads = grads.flatten()

            # Grab new param grads, reshape to the same size
            start = 0
            for p in actor_params_list:
                stop = start + p.nelement()
                g = grads[start:stop].view(p.size())
                p.grad = -g.clone()  # clone, otherwise the optimizer won't work
                start = stop
            actor_opt.step()
        else:
            # If using standard actor-critic policy gradient:
            actor_loss = -(torch.cat(list(log_prob_a_hist)) * torch.cat(list(adv_hist))).sum()
            actor_loss.backward()
            actor_opt.step()

        # *** EVALUATION ***
        for eval_episode in range(num_eval_episodes):
            eval_obs = eval_env.reset()
            return_eval_episode = 0
            scale = 1.0
            for eval_step in range(max_steps):
                with torch.no_grad():
                    eval_prob_a = actor(one_hot_ify(eval_obs, num_states))
                    eval_a = torch.distributions.Categorical(probs=eval_prob_a).sample().numpy()[0]
                eval_obs, eval_rew, eval_done, _ = eval_env.step(eval_a)
                return_eval_episode += scale * eval_rew
                scale *= γ
                if eval_done:
                    break
            return_all_eval_episodes[eval_episode] = return_eval_episode

        return_run[episode] = np.mean(return_all_eval_episodes)
        print("EvalRewards : ", episode, ":", np.mean(return_all_eval_episodes))
        evaluations.append(np.mean(return_all_eval_episodes))
        logger.record_reward(evaluations)
        logger.save()

    return return_run, samples_run, actor, critic
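# NOTE: hedged sketch of the `one_hot_ify` helper used above (assumed, not shown in
# this file): it turns a discrete state index into a 1 x num_states float tensor.
import torch

def one_hot_ify(state_index, num_states):
    one_hot = torch.zeros(1, num_states)
    one_hot[0, int(state_index)] = 1.0
    return one_hot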
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # * Step 1: init data folders
    print("init data folders")
    # * Init character folders for dataset construction
    metatrain_character_folders, metatest_character_folders = tg.mini_imagenet_folders()

    # * Step 2: init neural networks
    print("init neural networks")
    feature_encoder = models.CNNEncoder()
    actor = models.Actor(FEATURE_DIM, RELATION_DIM, CLASS_NUM)
    critic = models.Critic(FEATURE_DIM, RELATION_DIM)

    # feature_encoder = torch.nn.DataParallel(feature_encoder)
    # actor = torch.nn.DataParallel(actor)
    # critic = torch.nn.DataParallel(critic)

    feature_encoder.train()
    actor.train()
    critic.train()

    feature_encoder.apply(models.weights_init)
    actor.apply(models.weights_init)
    critic.apply(models.weights_init)

    feature_encoder.to(device)
    actor.to(device)
    critic.to(device)

    agent = a2cAgent.A2CAgent(actor, critic, GAMMA, ENTROPY_WEIGHT, FEATURE_DIM,
                              RELATION_DIM, CLASS_NUM, device)

    # feature_encoder.eval()
    # relation_network.eval()

    if os.path.exists(
            str("./models/miniimagenet_feature_encoder_" + str(CLASS_NUM) + "way_" +
                str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")):
        feature_encoder.load_state_dict(
            torch.load(
                str("./models/miniimagenet_feature_encoder_" + str(CLASS_NUM) + "way_" +
                    str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")))
        print("load feature encoder success")
    if os.path.exists(
            str("./models/miniimagenet_actor_network_" + str(CLASS_NUM) + "way_" +
                str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")):
        actor.load_state_dict(
            torch.load(
                str("./models/miniimagenet_actor_network_" + str(CLASS_NUM) + "way_" +
                    str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")))
        print("load actor network success")
    if os.path.exists(
            str("./models/miniimagenet_critic_network_" + str(CLASS_NUM) + "way_" +
                str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")):
        critic.load_state_dict(
            torch.load(
                str("./models/miniimagenet_critic_network_" + str(CLASS_NUM) + "way_" +
                    str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")))
        print("load critic network success")

    max_accuracy_list = []
    mean_accuracy_list = []
    for episode in range(1):
        total_accuracy = []
        for i in range(TEST_EPISODE):
            # * Generate env
            env_states_list = []
            env_labels_list = []
            number_of_query_image = 15
            task = tg.MiniImagenetTask(metatest_character_folders, CLASS_NUM,
                                       SAMPLE_NUM_PER_CLASS, number_of_query_image)
            sample_dataloader = tg.get_mini_imagenet_data_loader(
                task, num_per_class=SAMPLE_NUM_PER_CLASS, split="train", shuffle=False)
            test_dataloader = tg.get_mini_imagenet_data_loader(
                task, num_per_class=number_of_query_image, split="test", shuffle=True)
            sample_images, sample_labels = next(iter(sample_dataloader))
            test_images, test_labels = next(iter(test_dataloader))
            sample_images, sample_labels = sample_images.to(device), sample_labels.to(device)
            test_images, test_labels = test_images.to(device), test_labels.to(device)

            # * calculate features
            sample_features = feature_encoder(sample_images)
            sample_features = sample_features.view(CLASS_NUM, SAMPLE_NUM_PER_CLASS,
                                                   FEATURE_DIM, 19, 19)
            sample_features = torch.sum(sample_features, 1).squeeze(1)
            test_features = feature_encoder(test_images)

            # * calculate relations
            # * each query sample is paired with every class prototype
            # * to form the relation pairs fed to the relation network
            sample_features_ext = sample_features.unsqueeze(0).repeat(
                number_of_query_image * CLASS_NUM, 1, 1, 1, 1)
            test_features_ext = test_features.unsqueeze(0).repeat(CLASS_NUM, 1, 1, 1, 1)
            test_features_ext = torch.transpose(test_features_ext, 0, 1)
            relation_pairs = torch.cat((sample_features_ext, test_features_ext),
                                       2).view(-1, FEATURE_DIM * 2, 19, 19)

            env_states_list.append(relation_pairs)
            env_labels_list.append(test_labels)

            test_env = a2cAgent.env(env_states_list, env_labels_list)
            rewards = agent.test(test_env)
            test_accuracy = rewards / len(test_labels)
            print(test_accuracy)
            total_accuracy.append(test_accuracy)

        mean_accuracy, conf_int = mean_confidence_interval(total_accuracy)
        print(f"Total accuracy : {mean_accuracy:.4f}")
        print(f"confidence interval : {conf_int:.4f}")
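# NOTE: hedged sketch of the `mean_confidence_interval` helper used above (assumed, not
# shown in this file): a standard 95% confidence interval via the Student t-distribution.
import numpy as np
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    arr = 1.0 * np.array(data)
    n = len(arr)
    mean = np.mean(arr)
    half_width = scipy.stats.sem(arr) * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
    return mean, half_width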
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # * Step 1: init data folders
    print("init data folders")
    # * Init character folders for dataset construction
    metatrain_character_folders, metatest_character_folders = tg.mini_imagenet_folders()

    # * Step 2: init neural networks
    print("init neural networks")
    feature_encoder = models.CNNEncoder()
    actor = models.Actor(FEATURE_DIM, RELATION_DIM, CLASS_NUM)
    critic = models.Critic(FEATURE_DIM, RELATION_DIM)

    # feature_encoder = torch.nn.DataParallel(feature_encoder)
    # actor = torch.nn.DataParallel(actor)
    # critic = torch.nn.DataParallel(critic)

    feature_encoder.train()
    actor.train()
    critic.train()

    feature_encoder.apply(models.weights_init)
    actor.apply(models.weights_init)
    critic.apply(models.weights_init)

    feature_encoder.to(device)
    actor.to(device)
    critic.to(device)

    cross_entropy = nn.CrossEntropyLoss()

    feature_encoder_optim = torch.optim.Adam(feature_encoder.parameters(), lr=LEARNING_RATE)
    feature_encoder_scheduler = StepLR(feature_encoder_optim, step_size=10000, gamma=0.5)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=2.5 * LEARNING_RATE)
    actor_scheduler = StepLR(actor_optim, step_size=10000, gamma=0.5)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=2.5 * LEARNING_RATE * 10)
    critic_scheduler = StepLR(critic_optim, step_size=10000, gamma=0.5)

    agent = a2cAgent.A2CAgent(actor, critic, GAMMA, ENTROPY_WEIGHT, CLASS_NUM, device)

    if os.path.exists(
            str("./models/miniimagenet_feature_encoder_" + str(CLASS_NUM) + "way_" +
                str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")):
        feature_encoder.load_state_dict(
            torch.load(
                str("./models/miniimagenet_feature_encoder_" + str(CLASS_NUM) + "way_" +
                    str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")))
        print("load feature encoder success")
    if os.path.exists(
            str("./models/miniimagenet_actor_network_" + str(CLASS_NUM) + "way_" +
                str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")):
        actor.load_state_dict(
            torch.load(
                str("./models/miniimagenet_actor_network_" + str(CLASS_NUM) + "way_" +
                    str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")))
        print("load actor network success")
    if os.path.exists(
            str("./models/miniimagenet_critic_network_" + str(CLASS_NUM) + "way_" +
                str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")):
        critic.load_state_dict(
            torch.load(
                str("./models/miniimagenet_critic_network_" + str(CLASS_NUM) + "way_" +
                    str(SAMPLE_NUM_PER_CLASS) + "shot.pkl")))
        print("load critic network success")

    # * Step 3: build graph
    print("Training...")

    last_accuracy = 0.0
    mbal_loss_list = []
    mbcl_loss_list = []
    loss_list = []
    number_of_query_image = 15

    for episode in range(EPISODE):
        # print(f"EPISODE : {episode}")
        policy_losses = []
        value_losses = []

        for meta_batch in range(META_BATCH_RANGE):
            meta_env_states_list = []
            meta_env_labels_list = []

            for inner_batch in range(INNER_BATCH_RANGE):
                # * Generate environment
                env_states_list = []
                env_labels_list = []
                for env in range(ENV_LENGTH):
                    task = tg.MiniImagenetTask(metatrain_character_folders, CLASS_NUM,
                                               SAMPLE_NUM_PER_CLASS, number_of_query_image)
                    sample_dataloader = tg.get_mini_imagenet_data_loader(
                        task, num_per_class=SAMPLE_NUM_PER_CLASS, split="train", shuffle=False)
                    batch_dataloader = tg.get_mini_imagenet_data_loader(
                        task, num_per_class=5, split="test", shuffle=True)
                    samples, sample_labels = next(iter(sample_dataloader))
                    samples, sample_labels = samples.to(device), sample_labels.to(device)

                    for batches, batch_labels in batch_dataloader:
                        batches, batch_labels = batches.to(device), batch_labels.to(device)

                        inner_sample_features = feature_encoder(samples)
                        inner_sample_features = inner_sample_features.view(
                            CLASS_NUM, SAMPLE_NUM_PER_CLASS, FEATURE_DIM, 19, 19)
                        inner_sample_features = torch.sum(inner_sample_features, 1).squeeze(1)
                        inner_batch_features = feature_encoder(batches)

                        inner_sample_feature_ext = inner_sample_features.unsqueeze(0).repeat(
                            5 * CLASS_NUM, 1, 1, 1, 1)
                        inner_batch_features_ext = inner_batch_features.unsqueeze(0).repeat(
                            CLASS_NUM, 1, 1, 1, 1)
                        inner_batch_features_ext = torch.transpose(inner_batch_features_ext, 0, 1)
                        inner_relation_pairs = torch.cat(
                            (inner_sample_feature_ext, inner_batch_features_ext),
                            2).view(-1, FEATURE_DIM * 2, 19, 19)

                        env_states_list.append(inner_relation_pairs)
                        env_labels_list.append(batch_labels)

                inner_env = a2cAgent.env(env_states_list, env_labels_list)
                agent.train(inner_env, inner_update=True)

            for meta_env in range(META_ENV_LENGTH):
                task = tg.MiniImagenetTask(metatrain_character_folders, CLASS_NUM,
                                           SAMPLE_NUM_PER_CLASS, number_of_query_image)
                sample_dataloader = tg.get_mini_imagenet_data_loader(
                    task, num_per_class=SAMPLE_NUM_PER_CLASS, split="train", shuffle=False)
                batch_dataloader = tg.get_mini_imagenet_data_loader(
                    task, num_per_class=number_of_query_image, split="test", shuffle=True)
                # * num_per_class : number of query images

                # * sample data
                samples, sample_labels = next(iter(sample_dataloader))
                samples, sample_labels = samples.to(device), sample_labels.to(device)

                # * Generate env for the meta update
                # * sample_dataloader provides the support samples for comparison
                # * batch_dataloader provides the query samples for training
                batches, batch_labels = next(iter(batch_dataloader))
                batches, batch_labels = batches.to(device), batch_labels.to(device)

                # * calculate features
                # feature_encoder.weight = feature_fast_weights
                sample_features = feature_encoder(samples)
                sample_features = sample_features.view(CLASS_NUM, SAMPLE_NUM_PER_CLASS,
                                                       FEATURE_DIM, 19, 19)
                sample_features = torch.sum(sample_features, 1).squeeze(1)
                batch_features = feature_encoder(batches)

                # * calculate relations
                # * each query sample is paired with every class prototype
                # * to form the relation pairs fed to the relation network
                sample_features_ext = sample_features.unsqueeze(0).repeat(
                    number_of_query_image * CLASS_NUM, 1, 1, 1, 1)
                batch_features_ext = batch_features.unsqueeze(0).repeat(CLASS_NUM, 1, 1, 1, 1)
                batch_features_ext = torch.transpose(batch_features_ext, 0, 1)
                relation_pairs = torch.cat((sample_features_ext, batch_features_ext),
                                           2).view(-1, FEATURE_DIM * 2, 19, 19)

                meta_env_states_list.append(relation_pairs)
                meta_env_labels_list.append(batch_labels)

            meta_env = a2cAgent.env(meta_env_states_list, meta_env_labels_list)
            agent.train(meta_env, policy_loss_list=policy_losses, value_loss_list=value_losses)

        feature_encoder_optim.zero_grad()
        actor_optim.zero_grad()
        critic_optim.zero_grad()

        torch.nn.utils.clip_grad_norm_(feature_encoder.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(actor.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(critic.parameters(), 0.5)

        meta_batch_actor_loss = torch.stack(policy_losses).mean()
        meta_batch_critic_loss = torch.stack(value_losses).mean()
        meta_batch_actor_loss.backward(retain_graph=True)
        meta_batch_critic_loss.backward()

        feature_encoder_optim.step()
        actor_optim.step()
        critic_optim.step()

        feature_encoder_scheduler.step()
        actor_scheduler.step()
        critic_scheduler.step()

        if (episode + 1) % 100 == 0:
            mbal = meta_batch_actor_loss.cpu().detach().numpy()
            mbcl = meta_batch_critic_loss.cpu().detach().numpy()
            print(f"episode : {episode+1}, meta_batch_actor_loss : {mbal:.4f}, "
                  f"meta_batch_critic_loss : {mbcl:.4f}")
            mbal_loss_list.append(mbal)
            mbcl_loss_list.append(mbcl)
            loss_list.append(mbal + mbcl)

        if (episode + 1) % 500 == 0:
            print("Testing...")
            total_reward = 0
            total_num_of_test_samples = 0

            for i in range(TEST_EPISODE):
                # * Generate env
                env_states_list = []
                env_labels_list = []
                number_of_query_image = 10
                task = tg.MiniImagenetTask(metatest_character_folders, CLASS_NUM,
                                           SAMPLE_NUM_PER_CLASS, number_of_query_image)
                sample_dataloader = tg.get_mini_imagenet_data_loader(
                    task, num_per_class=SAMPLE_NUM_PER_CLASS, split="train", shuffle=False)
                test_dataloader = tg.get_mini_imagenet_data_loader(
                    task, num_per_class=number_of_query_image, split="test", shuffle=True)

                sample_images, sample_labels = next(iter(sample_dataloader))
                sample_images, sample_labels = sample_images.to(device), sample_labels.to(device)
                test_images, test_labels = next(iter(test_dataloader))
                total_num_of_test_samples += len(test_labels)
                test_images, test_labels = test_images.to(device), test_labels.to(device)

                # * calculate features
                sample_features = feature_encoder(sample_images)
                sample_features = sample_features.view(CLASS_NUM, SAMPLE_NUM_PER_CLASS,
                                                       FEATURE_DIM, 19, 19)
                sample_features = torch.sum(sample_features, 1).squeeze(1)
                test_features = feature_encoder(test_images)

                # * calculate relations
                # * each query sample is paired with every class prototype
                # * to form the relation pairs fed to the relation network
                sample_features_ext = sample_features.unsqueeze(0).repeat(
                    number_of_query_image * CLASS_NUM, 1, 1, 1, 1)
                test_features_ext = test_features.unsqueeze(0).repeat(CLASS_NUM, 1, 1, 1, 1)
                test_features_ext = torch.transpose(test_features_ext, 0, 1)
                relation_pairs = torch.cat((sample_features_ext, test_features_ext),
                                           2).view(-1, FEATURE_DIM * 2, 19, 19)

                env_states_list.append(relation_pairs)
                env_labels_list.append(test_labels)

                test_env = a2cAgent.env(env_states_list, env_labels_list)
                rewards = agent.test(test_env)
                total_reward += rewards

            test_accuracy = total_reward / (1.0 * total_num_of_test_samples)
            mean_loss = np.mean(loss_list)
            mean_actor_loss = np.mean(mbal_loss_list)
            mean_critic_loss = np.mean(mbcl_loss_list)
            print(f'mean loss : {mean_loss}')
            print("test accuracy : ", test_accuracy)

            writer.add_scalar('1.loss', mean_loss, episode + 1)
            writer.add_scalar('2.mean_actor_loss', mean_actor_loss, episode + 1)
            writer.add_scalar('3.mean_critic_loss', mean_critic_loss, episode + 1)
            writer.add_scalar('4.test accuracy', test_accuracy, episode + 1)

            loss_list = []
            mbal_loss_list = []
            mbcl_loss_list = []

            if test_accuracy > last_accuracy:
                # save networks
                torch.save(
                    feature_encoder.state_dict(),
                    str("./models/miniimagenet_feature_encoder_" + str(CLASS_NUM) + "way_" +
                        str(SAMPLE_NUM_PER_CLASS) + "shot.pkl"))
                torch.save(
                    actor.state_dict(),
                    str("./models/miniimagenet_actor_network_" + str(CLASS_NUM) + "way_" +
                        str(SAMPLE_NUM_PER_CLASS) + "shot.pkl"))
                torch.save(
                    critic.state_dict(),
                    str("./models/miniimagenet_critic_network_" + str(CLASS_NUM) + "way_" +
                        str(SAMPLE_NUM_PER_CLASS) + "shot.pkl"))
                print("save networks for episode:", episode)
                last_accuracy = test_accuracy
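# NOTE: hedged sketch. The `writer` used in the training loop above is not defined in
# this snippet; it is presumably a TensorBoard SummaryWriter created at module level:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()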
def __init__(self, config, state_size, action_size, num_agents, seed, per=True):
    """Initialize an Agent object.

    Params
    ======
        config (config): instance of a config class that stores all the hyperparameters
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents acting in parallel
        seed (int): random seed
        per (bool): whether to use prioritized experience replay
    """
    self.config = config
    self.epsilon = self.config.EPSILON_START
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = seed

    # Initialize the bins (atoms) of the categorical value distribution
    self.v_min = 0
    self.v_max = 5
    self.n_atoms = 51
    self.delta = (self.v_max - self.v_min) / float(self.n_atoms - 1)
    self.bin_centers = torch.from_numpy(
        np.array([self.v_min + i * self.delta for i in range(self.n_atoms)]).reshape(-1, 1)
    ).to(self.config.device)

    # Initialize the actor and critic networks
    self.actor_local = models.Actor(state_size, action_size).to(self.config.device)
    self.actor_target = models.Actor(state_size, action_size).to(self.config.device)
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), self.config.LR_actor)

    self.critic_local = models.Critic(state_size, action_size, self.n_atoms).to(self.config.device)
    self.critic_target = models.Critic(state_size, action_size, self.n_atoms).to(self.config.device)
    self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                             self.config.LR_critic,
                                             weight_decay=self.config.weight_decay)

    # Initialize the random-noise process for action noise
    self.is_training = True
    self.noise = OUNoise((self.num_agents, self.action_size), self.seed)

    # Hard-update the target networks so they start with the same parameters as the local networks
    for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
        target_param.data.copy_(param.data)

    # Initialize the replay buffer according to `per`
    self.memory = ReplayBuffer(self.config.BUFFER_SIZE, self.config.BATCH_SIZE, seed,
                               self.config.device, self.config.N_BOOTSTRAP)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
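# NOTE: hedged sketch, not part of the original file. Assuming the distributional critic
# above outputs per-atom probabilities of shape (batch, n_atoms), the scalar Q-value is
# the expectation of the fixed support stored in `self.bin_centers`:
def expected_q(self, states, actions):
    probs = self.critic_local(states, actions)      # (batch, n_atoms), assumed output
    return probs.matmul(self.bin_centers.float())   # (batch, 1) expected return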
action_max = env.action_space.high[0]
print("State dimension: {}".format(state_dimension))
print("Action dimension: {}".format(action_dimension))
print("Action max: {}".format(action_max))

load_models = False

# Create the actor and critic networks
actor = models.Actor(state_dimension, action_dimension, action_max)
target_actor = models.Actor(state_dimension, action_dimension, action_max)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=ACTOR_LEARNING_RATE)

critic = models.Critic(state_dimension, action_dimension)
target_critic = models.Critic(state_dimension, action_dimension)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=CRITIC_LEARNING_RATE)

# Copy the parameters into the target networks
for target_param, param in zip(target_actor.parameters(), actor.parameters()):
    target_param.data.copy_(param.data)
for target_param, param in zip(target_critic.parameters(), critic.parameters()):
    target_param.data.copy_(param.data)

# Use the previously saved models
load_models = args.load_models
lr = args.lr
model_dim = args.model_dim
n_epochs = args.n_epochs
n_critic = args.n_critic
seed = args.seed
output_dim = 784  # 784 = 28 * 28, the number of pixels in an MNIST image

torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

generator = models.Generator(output_dim, latent_dim, model_dim)
critic = models.Critic(model_dim)
if load_models:
    generator.load_state_dict(torch.load('models/generator.pth.tar'))
    critic.load_state_dict(torch.load('models/critic.pth.tar'))
generator.to(device)
critic.to(device)

critic_optimizer = optim.Adam(critic.parameters(), lr=lr, betas=(0.5, 0.9))
generator_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.9))

# set distributions for later use
normal_dist = normal.Normal(0.0, 1.0)
uniform_dist = uniform.Uniform(0.0, 1.0)

# Create a fixed latent variable used to visualize the progression of the generator
fixed_noise = normal_dist.sample((grid_size, latent_dim)).to(device)
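# NOTE: hedged sketch, not from the original file. The Adam betas of (0.5, 0.9) and the
# n_critic setting above are typical of WGAN-GP training; the standard gradient penalty
# on interpolates between real and generated batches looks like this (lambda_gp and the
# flat 784-dimensional sample shape are assumptions):
import torch

def gradient_penalty(critic, real_samples, fake_samples, device, lambda_gp=10.0):
    # random interpolation coefficient per sample, broadcast over the pixel dimension
    alpha = torch.rand(real_samples.size(0), 1, device=device)
    interpolates = (alpha * real_samples + (1.0 - alpha) * fake_samples).requires_grad_(True)
    critic_scores = critic(interpolates)
    grads = torch.autograd.grad(outputs=critic_scores, inputs=interpolates,
                                grad_outputs=torch.ones_like(critic_scores),
                                create_graph=True, retain_graph=True)[0]
    grads = grads.view(grads.size(0), -1)
    return lambda_gp * ((grads.norm(2, dim=1) - 1.0) ** 2).mean()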