def main(args):
    traj_txt = load_txt(args.traj_path)
    force_txt = load_txt(args.force_path)
    memory = ReplayMemory(500000)
    traj = sort_traj(traj_txt)
    force = sort_force(force_txt, traj)
    # x_1, sr_1 = librosa.load(args.audio_path)
    # memory.push(state, action, next_state, reward)
    a = 1
class GAIL:
    def __init__(self, vail_sample, reward_shift, reward_aug, gae_norm,
                 global_norm, actor_lr, critic_lr, disc_lr, actor_units,
                 critic_units, disc_units, disc_reduce_units, gamma, lambd,
                 clip, entropy, epochs, batch_size, update_rate, data_dir,
                 demo_list):
        # build networks
        self.actor = Actor(lr=actor_lr, hidden_units=actor_units)
        self.critic = Critic(lr=critic_lr, hidden_units=critic_units)
        self.discriminator = Discriminator(lr=disc_lr,
                                           hidden_units=disc_units,
                                           reduce_units=disc_reduce_units)
        self.encoder = VAE_Encoder(latent_num=64)
        # set hyperparameters
        self.vail_sample = vail_sample
        self.reward_shift = reward_shift
        self.reward_aug = reward_aug
        self.gae_norm = gae_norm
        self.gamma = gamma
        self.lambd = lambd
        self.gam_lam = gamma * lambd
        self.clip = clip
        self.entropy = entropy
        self.epochs = epochs
        self.batch_size = batch_size
        self.half_batch_size = batch_size // 2
        self.update_rate = update_rate
        self.grad_global_norm = global_norm
        self.beta = BETA_INIT
        # build memory
        self.memory = HorizonMemory(use_reward=reward_aug)
        self.replay = ReplayMemory()
        # build expert demonstration pipeline
        self.data_dir = data_dir
        self.demo_list = os.listdir(data_dir)
        self.demo_group_num = 500
        self.demo_rotate = 5
        assert len(demo_list) >= self.demo_group_num
        self.set_demo()
        # ready
        self.dummy_forward()
        self.actor_vars = self.actor.trainable_variables + self.encoder.trainable_variables
        self.critic_vars = self.critic.trainable_variables + self.encoder.trainable_variables
        self.disc_vars = self.discriminator.trainable_variables + self.encoder.trainable_variables

    def dummy_forward(self):
        # connect networks
        dummy_state = np.zeros([1] + STATE_SHAPE, dtype=np.float32)
        dummy_action = np.zeros([1] + ACTION_SHAPE, dtype=np.float32)
        self.encoder(dummy_state)
        self.actor(self.encoder, dummy_state)
        self.critic(self.encoder, dummy_state)
        self.discriminator(self.encoder, dummy_state, dummy_action)

    def set_demo(self):
        self.demo_list = os.listdir(self.data_dir)
        selected_demos = random.sample(self.demo_list, self.demo_group_num)
        expert_states = []
        expert_actions = []
        for demo_name in selected_demos:
            demo = np.load(self.data_dir + demo_name)
            states = demo['state']
            actions = demo['action']
            expert_states.append(states)
            expert_actions.append(actions)
        self.expert_states = np.concatenate(expert_states, axis=0)
        self.expert_actions = np.concatenate(expert_actions, axis=0)
        del demo

    def get_demonstration(self, sample_num):
        index = np.arange(len(self.expert_states))
        try:
            assert len(self.expert_states) >= sample_num
        except Exception:
            self.set_demo()
        np.random.shuffle(index)
        index = index[:sample_num]
        return self.expert_states[index], self.expert_actions[index]

    def memory_process(self, next_state, done):
        # [[(1,64,64,3)], [], ...], [[(1,2),(1,9),(1,3),(1,4)], [], ...], [[c_pi, d_pi, s_pi, a_pi], [], ...]
        if self.reward_aug:
            states, actions, log_old_pis, rewards = self.memory.rollout()
        else:
            states, actions, log_old_pis = self.memory.rollout()
        np_states = np.concatenate(states + [next_state], axis=0)
        np_actions = np.concatenate(actions, axis=0)
        np_rewards = self.get_reward(np_states[:-1], np_actions)  # (N, 1)
        if self.reward_aug:
            np_env_rewards = np.stack(rewards, axis=0).reshape(-1, 1)
            np_rewards = np_rewards + np_env_rewards
        gae, oracle = self.get_gae_oracle(np_states, np_rewards, done)  # (N, 1), (N, 1)
        self.replay.append(states, actions, log_old_pis, gae, oracle)
        self.memory.flush()
        if len(self.replay) >= self.update_rate:
            self.update()
            self.replay.flush()

    def get_action(self, state):
        policy = self.actor(self.encoder, state).numpy()[0]
        action = np.random.choice(ACTION_NUM, p=policy)
        # action = np.argmax(policy)
        action_one_hot = np.eye(ACTION_NUM, dtype=np.float32)[[action]]  # (1, 4)
        log_old_pi = [[np.log(policy[action] + 1e-8)]]  # (1, 1)
        return action, action_one_hot, log_old_pi, policy

    def get_reward(self, states, actions):
        d = self.discriminator(self.encoder, states, actions).numpy()  # (N, 1)
        # rewards = 0.5 - d          # linear reward
        # rewards = np.tan(0.5 - d)  # tan reward
        if self.reward_shift:
            rewards = -np.log(2.0 * d + 1e-8)  # log equil reward
        else:
            rewards = -np.log(d + 1e-8)        # log reward
        # rewards = 0.1 * np.where(rewards>1, 1, rewards)
        return rewards

    def get_gae_oracle(self, states, rewards, done):
        # states include next state
        values = self.critic(self.encoder, states).numpy()  # (N+1, 1)
        if done:
            values[-1] = np.float32([0])
        N = len(rewards)
        gae = 0
        gaes = np.zeros((N, 1), dtype=np.float32)
        oracles = np.zeros((N, 1), dtype=np.float32)
        for t in reversed(range(N)):
            oracles[t] = rewards[t] + self.gamma * values[t + 1]
            delta = oracles[t] - values[t]
            gae = delta + self.gam_lam * gae
            gaes[t][0] = gae
        # oracles = gaes + values[:-1]  # (N, 1)
        if self.gae_norm:
            gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + 1e-8)
        return gaes, oracles

    def update(self):
        # load & calculate data
        states, actions, log_old_pis, gaes, oracles = self.replay.rollout()
        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        log_old_pis = np.concatenate(log_old_pis, axis=0)
        gaes = np.concatenate(gaes, axis=0)
        oracles = np.concatenate(oracles, axis=0)
        N = len(states)

        # update discriminator
        # load expert demonstration
        s_e, a_e = self.get_demonstration(N)
        batch_num = N // self.half_batch_size
        index = np.arange(N)
        np.random.shuffle(index)
        for i in range(batch_num):
            idx = index[i * self.half_batch_size:(i + 1) * self.half_batch_size]
            s_concat = np.concatenate([states[idx], s_e[idx]], axis=0)
            a_concat = np.concatenate([actions[idx], a_e[idx]], axis=0)
            with tf.GradientTape(persistent=True) as tape:
                mu, std, sampled = self.discriminator.encode(
                    self.encoder, s_concat, a_concat)
                discs = self.discriminator.decode(
                    sampled if self.vail_sample else mu)
                kld_loss = tf.reduce_mean(tf_gaussian_KL(mu, 0, std, 1))
                agent_loss = -tf.reduce_mean(
                    tf.math.log(discs[:self.half_batch_size] + 1e-8))
                expert_loss = -tf.reduce_mean(
                    tf.math.log(1 + 1e-8 - discs[self.half_batch_size:]))
                disc_loss = agent_loss + expert_loss
                discriminator_loss = disc_loss + self.beta * kld_loss
            disc_grads = tape.gradient(discriminator_loss, self.disc_vars)
            if self.grad_global_norm > 0:
                disc_grads, _ = tf.clip_by_global_norm(disc_grads,
                                                       self.grad_global_norm)
            self.discriminator.opt.apply_gradients(
                zip(disc_grads, self.disc_vars))
            del tape

        # TODO: update posterior; L1 loss = logQ(code|s,prev_a,prev_code)
        # update actor & critic
        # batch_num = math.ceil(len(states) / self.batch_size)
        batch_num = len(gaes) // self.batch_size
        index = np.arange(len(gaes))
        for _ in range(self.epochs):
            np.random.shuffle(index)
            for i in range(batch_num):
                # if i == batch_num - 1:
                #     idx = index[i*self.batch_size : ]
                # else:
                idx = index[i * self.batch_size:(i + 1) * self.batch_size]
                state = states[idx]
                action = actions[idx]
                log_old_pi = log_old_pis[idx]
                gae = gaes[idx]
                oracle = oracles[idx]
                # update critic
                with tf.GradientTape(persistent=True) as tape:
                    values = self.critic(self.encoder, state)  # (N, 1)
                    critic_loss = tf.reduce_mean((oracle - values)**2)  # MSE loss
                critic_grads = tape.gradient(critic_loss, self.critic_vars)
                if self.grad_global_norm > 0:
                    critic_grads, _ = tf.clip_by_global_norm(
                        critic_grads, self.grad_global_norm)
                self.critic.opt.apply_gradients(
                    zip(critic_grads, self.critic_vars))
                del tape
                # update actor
                with tf.GradientTape(persistent=True) as tape:
                    pred_action = self.actor(self.encoder, state)
                    # RL (PPO) term
                    log_pi = tf.expand_dims(tf.math.log(
                        tf.reduce_sum(pred_action * action, axis=1) + 1e-8),
                                            axis=1)  # (N, 1)
                    ratio = tf.exp(log_pi - log_old_pi)
                    clip_ratio = tf.clip_by_value(ratio, 1 - self.clip,
                                                  1 + self.clip)
                    clip_loss = -tf.reduce_mean(
                        tf.minimum(ratio * gae, clip_ratio * gae))
                    entropy = tf.reduce_mean(tf.exp(log_pi) * log_pi)
                    actor_loss = clip_loss + self.entropy * entropy
                actor_grads = tape.gradient(actor_loss, self.actor_vars)  # NOTE: freeze posterior
                if self.grad_global_norm > 0:
                    actor_grads, _ = tf.clip_by_global_norm(
                        actor_grads, self.grad_global_norm)
                self.actor.opt.apply_gradients(
                    zip(actor_grads, self.actor_vars))
                del tape
        # print('%d samples trained... D loss: %.4f C loss: %.4f A loss: %.4f\t\t\t'
        #       % (len(gaes), disc_loss, critic_loss, actor_loss), end='\r')

    def save_model(self, dir, tag=''):
        self.actor.save_weights(dir + tag + 'actor.h5')
        self.critic.save_weights(dir + tag + 'critic.h5')
        self.discriminator.save_weights(dir + tag + 'discriminator.h5')
        self.encoder.save_weights(dir + tag + 'encoder.h5')

    def load_model(self, dir, tag=''):
        if os.path.exists(dir + tag + 'actor.h5'):
            self.actor.load_weights(dir + tag + 'actor.h5')
            print('Actor loaded... %s%sactor.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'critic.h5'):
            self.critic.load_weights(dir + tag + 'critic.h5')
            print('Critic loaded... %s%scritic.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'discriminator.h5'):
            self.discriminator.load_weights(dir + tag + 'discriminator.h5')
            print('Discriminator loaded... %s%sdiscriminator.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('Encoder loaded... %s%sencoder.h5' % (dir, tag))

    def load_encoder(self, dir, tag=''):
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('Encoder loaded... %s%sencoder.h5' % (dir, tag))
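# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original source): a minimal rollout loop
# for the GAIL agent above. The `env` object, its step() return convention, the
# hyperparameter values, and the HorizonMemory.append signature are assumptions;
# only get_action, memory, memory_process, and save_model come from the class.
# ---------------------------------------------------------------------------
def train_gail(env, data_dir, episodes=1000):
    agent = GAIL(vail_sample=True, reward_shift=False, reward_aug=False,
                 gae_norm=True, global_norm=0.5, actor_lr=3e-4, critic_lr=3e-4,
                 disc_lr=3e-4, actor_units=[256, 256], critic_units=[256, 256],
                 disc_units=[256, 256], disc_reduce_units=[64], gamma=0.99,
                 lambd=0.95, clip=0.2, entropy=0.01, epochs=3, batch_size=64,
                 update_rate=2048, data_dir=data_dir,
                 demo_list=os.listdir(data_dir))
    for _ in range(episodes):
        state, done = env.reset(), False          # state assumed to be (1, 64, 64, 3)
        while not done:
            action, action_one_hot, log_old_pi, _ = agent.get_action(state)
            next_state, reward, done = env.step(action)            # assumed env API
            agent.memory.append(state, action_one_hot, log_old_pi)  # assumed signature
            state = next_state
        agent.memory_process(next_state, done)  # roll out horizon, compute GAE, update when ready
    agent.save_model('./weights/')
    return agent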
device = torch.device(args.device)
eval_net = GraphNet(hidden_size=args.hidden_size,
                    n_head=args.nhead,
                    nlayers=args.nlayer,
                    duel_dqn=args.duel_dqn,
                    n_gat_head=args.n_gat_head).to(device)
target_net = GraphNet(hidden_size=args.hidden_size,
                      n_head=args.nhead,
                      nlayers=args.nlayer,
                      duel_dqn=args.duel_dqn,
                      n_gat_head=args.n_gat_head).to(device)
optimizer = torch.optim.Adam(eval_net.parameters(), lr=args.lr)
gamma = args.gamma
epsilon = args.epsilon
batch_size = args.batch_size
max_step = args.max_step
num_env = args.num_env
time_last = time.time()
loss_func = nn.MSELoss()
learn_step_counter = 0
q_network_iteration = 10
memory = ReplayMemory(4096)
performance = []
envs = Envs(10000, 100000, num_env)
state = envs.reset()
for _i in range(max_step):
    eval_net.eval()
    values = eval_net(*state)
    # print(values)
    # print(state)
    if random.random() > epsilon:
        prob_uniform = (values > -9999999).float()
        dist = Categorical(prob_uniform)
        action = dist.sample()
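# ---------------------------------------------------------------------------
# The snippet above is cut off after the exploration branch. A minimal,
# self-contained sketch (not the original code) of the full selection rule it
# appears to implement: `values` is assumed to be a (num_env, num_actions)
# tensor in which masked/invalid actions are filled with -9999999, and the
# omitted branch is assumed to take the greedy action.
# ---------------------------------------------------------------------------
import random
import torch
from torch.distributions import Categorical

def epsilon_greedy_action(values: torch.Tensor, epsilon: float) -> torch.Tensor:
    # With probability (1 - epsilon), sample uniformly over unmasked actions;
    # otherwise take the argmax per environment.
    if random.random() > epsilon:
        prob_uniform = (values > -9999999).float()
        return Categorical(prob_uniform).sample()
    return values.argmax(dim=-1)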
def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        help='Select mode',
        choices=['train', 'test', 'demo'],
        default='train',
    )
    args = parser.parse_args()
    config = yaml.safe_load(open("config.yml"))

    if config['LOAD_MODEL']:
        model = DQN(
            in_channels=config['IN_CHANNELS'],
            out_dim=config['OUT_DIM'],
        )
        model_name = config['LOAD_MODEL']
        model.load_model(model_name)
    else:
        model = DQN(
            in_channels=config['IN_CHANNELS'],
            out_dim=config['OUT_DIM'],
        )

    if args.mode == 'test':
        test(
            device=config['DEVICE'],
            n_games=config['TEST_GAMES'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    elif args.mode == 'demo':
        demo(
            device=config['DEVICE'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    else:
        memory = ReplayMemory(capacity=config['N'])
        optimizer_name = config['OPTIMIZER']
        if optimizer_name == 'adam':
            optimizer = torch.optim.Adam(lr=config['LEARNING_RATE'],
                                         betas=(0.9, 0.999),
                                         eps=1e-8,
                                         amsgrad=False,
                                         params=model.model.parameters())
        elif optimizer_name == 'sgd':
            optimizer = torch.optim.SGD(lr=config['LEARNING_RATE'],
                                        momentum=0.9,
                                        params=model.model.parameters())
        else:
            raise ValueError(f'Unknown optimizer name: {optimizer_name}')

        experiment = Experiment(
            api_key=os.environ['COMET_ML_API_KEY'],
            project_name=config['COMET_ML_PROJECT_NAME'],
            workspace=config['COMET_ML_WORKSPACE'],
        )
        experiment.set_name(config['COMET_ML_NAME'])
        experiment.add_tag(config['COMET_ML_TAG'])
        experiment.log_parameters({
            'n_games': config['M'],
            'minibatch_size': config['MINIBATCH_SIZE'],
            'eps': config['EPS'],
            'eps_n_frames': config['EPS_N_FRAMES'],
            'gamma': config['GAMMA'],
            'frame_skipping': config['FRAME_SKIPPING'],
            'save_model_every': config['SAVE_MODEL_EVERY'],
        })
        experiment.set_model_graph(str(model.model))

        train(
            device=config['DEVICE'],
            n_games=config['M'],
            memory=memory,
            optimizer=optimizer,
            model=model,
            experiment=experiment,
            minibatch_size=config['MINIBATCH_SIZE'],
            eps=config['EPS'],
            eps_n_frames=config['EPS_N_FRAMES'],
            gamma=config['GAMMA'],
            frame_skipping=config['FRAME_SKIPPING'],
            update_model_target_every=config['UPDATE_MODEL_TARGET_EVERY'],
            save_model_every=config['SAVE_MODEL_EVERY'],
            save_model_as=config['SAVE_MODEL_AS'],
            save_average_metrics_every=config['SAVE_AVERAGE_METRICS_EVERY'],
        )
class MDP:
    def __init__(self, args):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
        # 'open', 'close']
        self.P_START = 0.999
        self.P_END = 0.05
        self.P_DECAY = 500
        self.max_iter = args.max_iter
        self.gripping_force = args.grip_force
        self.breaking_threshold = args.break_thresh
        # Prepare the drawing figure
        fig, (ax1, ax2) = plt.subplots(1, 2)
        self.figure = (fig, ax1, ax2)

    # Select an action from our policy or a random one
    def select_action(self, state):
        sample = random.random()
        p_threshold = self.P_END + (self.P_START - self.P_END) * math.exp(
            -1. * self.steps_done / self.P_DECAY)
        self.steps_done += 1
        if sample > p_threshold:
            with torch.no_grad():
                # t.max(1) will return the largest column value of each row.
                # The second column of the max result is the index of where the max
                # element was found, so we pick the action with the larger expected reward.
                self.policy_net_1.eval()
                torch_state = torch.from_numpy(state).float().to(
                    self.args.device)
                action = self.policy_net_1(torch_state.unsqueeze(0)).max(1)[1]
                self.policy_net_1.train()
                return action.item()
        else:
            return random.randrange(self.args.outdim)

    def optimize_model(self):
        args = self.args
        if len(self.memory) < args.batch_size:
            return
        transitions = self.memory.sample(args.batch_size)
        state_batch, action_batch, reward_batch, nextstate_batch = [], [], [], []
        for transition in transitions:
            state_batch.append(transition.state)
            action_batch.append(transition.action)
            reward_batch.append(transition.reward)
            nextstate_batch.append(transition.next_state)
        state_batch = torch.from_numpy(np.array(state_batch)).float().to(
            args.device)
        action_batch = torch.from_numpy(np.array(action_batch)).to(
            args.device).unsqueeze(1)
        reward_batch = torch.from_numpy(np.array(reward_batch)).float().to(
            args.device).unsqueeze(1)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, nextstate_batch)),
                                      device=args.device,
                                      dtype=torch.bool).unsqueeze(1)
        non_final_next_states = torch.cat([
            torch.from_numpy(s).float().to(args.device).unsqueeze(0)
            for s in nextstate_batch if s is not None
        ])
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been
        # taken for each batch state according to policy_net
        state_action_values_1 = self.policy_net_1(state_batch).gather(
            1, action_batch)
        state_action_values_2 = self.policy_net_2(state_batch).gather(
            1, action_batch)
        state_action_values_3 = self.policy_net_3(state_batch).gather(
            1, action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed,
        # selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the
        # expected state value or 0 in case the state was final.
        next_state_values_1 = torch.zeros((args.batch_size, 1),
                                          device=args.device)
        next_state_values_2 = torch.zeros((args.batch_size, 1),
                                          device=args.device)
        next_state_values_3 = torch.zeros((args.batch_size, 1),
                                          device=args.device)
        next_state_values_1[non_final_mask] = self.policy_net_1(
            non_final_next_states).max(1)[0].detach()
        next_state_values_2[non_final_mask] = self.policy_net_2(
            non_final_next_states).max(1)[0].detach()
        next_state_values_3[non_final_mask] = self.policy_net_3(
            non_final_next_states).max(1)[0].detach()
        next_state_values = torch.min(
            torch.min(next_state_values_1, next_state_values_2),
            next_state_values_3)
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        args.gamma) + reward_batch
        # Compute Huber loss
        loss_1 = F.smooth_l1_loss(state_action_values_1,
                                  expected_state_action_values)
        loss_2 = F.smooth_l1_loss(state_action_values_2,
                                  expected_state_action_values)
        loss_3 = F.smooth_l1_loss(state_action_values_3,
                                  expected_state_action_values)
        # Optimize the model
        self.optimizer_1.zero_grad()
        self.optimizer_2.zero_grad()
        self.optimizer_3.zero_grad()
        loss_1.backward()
        loss_2.backward()
        loss_3.backward()
        for param in self.policy_net_1.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_3.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_1.step()
        self.optimizer_2.step()
        self.optimizer_3.step()
        return [loss_1, loss_2, loss_3]

    def train_MDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)
        # Create our policy nets and a target net
        self.policy_net_1 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DQN(args.indim, args.outdim).to(args.device)
        self.target_net = DQN(args.indim, args.outdim).to(args.device)
        self.target_net.load_state_dict(self.policy_net_1.state_dict())
        self.target_net.eval()
        # Set up the optimizers
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(), args.lr)
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(), args.lr)
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters(), args.lr)
        self.memory = ReplayMemory(500000)
        self.steps_done = 0
        # Set up the state normalizer
        normalizer = Normalizer(args.indim, device=args.device)
        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        # Load old checkpoint if provided
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['model_state_dict'])
                self.policy_net_2.load_state_dict(checkpoint['model_state_dict'])
                self.policy_net_3.load_state_dict(checkpoint['model_state_dict'])
                self.target_net.load_state_dict(checkpoint['model_state_dict'])
                start_episode = checkpoint['epoch']
                self.steps_done = start_episode
                self.optimizer_1.load_state_dict(checkpoint['optimizer_state_dict'])
                self.optimizer_2.load_state_dict(checkpoint['optimizer_state_dict'])
                self.optimizer_3.load_state_dict(checkpoint['optimizer_state_dict'])
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_geom_mdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']
        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)
        action_space = ActionSpace(dp=0.06, df=10)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            if args.sim:
                # Create robot, reset simulation and grasp handle
                model, model_params = init_model(args.model_path)
                sim = MjSim(model)
                sim.step()
                viewer = None
                if args.render:
                    viewer = MjViewer(sim)
                else:
                    viewer = None
                sim_param = SimParameter(sim)
                robot = RobotSim(sim, viewer, sim_param, args.render,
                                 self.breaking_threshold)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue
            # Get current state
            state_space = Observation(
                robot.get_gripper_jpos(),
                robot.get_shear_buffer(args.hap_sample),
                robot.get_all_touch_buffer(args.hap_sample))
            broken_so_far = 0
            for t in count():
                if not args.quiet and t % 20 == 0:
                    print("Running training episode: {}, iteration: {}".format(ii, t))
                # Select action
                state = state_space.get_state()
                if args.position:
                    state = state[6:]
                if args.shear:
                    indices = np.ones(len(state), dtype=bool)
                    indices[6:166] = False
                    state = state[indices]
                if args.force:
                    state = state[:166]
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = self.select_action(state)
                # Perform action
                delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
                target_position = np.add(state_space.get_current_position(),
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))
                if args.sim:
                    robot.move_joint(target_pose, True, self.gripping_force,
                                     hap_sample=args.hap_sample)
                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    reward = 0
                # # Add a movement reward
                # reward -= 0.1 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)
                # Observe new state
                state_space.update(
                    robot.get_gripper_jpos(),
                    robot.get_shear_buffer(args.hap_sample),
                    robot.get_all_touch_buffer(args.hap_sample))
                # Set max number of iterations
                if t >= self.max_iter:
                    done = True
                # Check if done
                if not done and not failure:
                    next_state = state_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None
                # Push new Transition into memory
                self.memory.push(state, action, next_state, reward)
                # Optimize the model
                loss = self.optimize_model()
                # if loss:
                #     print_variables['loss'].append(loss.item())
                # If we are done, reset the model
                if done or failure:
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables, 'Training MDP')
                    print("Model parameters: {}".format(model_params))
                    print("Epoch {} took {}s, total number broken: {}\n\n".format(
                        ii, time.time() - start_time, broken_so_far))
                    break
            # Update the target network, every x iterations
            if ii % 10 == 0:
                self.target_net.load_state_dict(self.policy_net_1.state_dict())
            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(args.output_dir,
                                         'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epoch': ii,
                        'model_state_dict': self.target_net.state_dict(),
                        'optimizer_state_dict': self.optimizer_1.state_dict(),
                    }, save_path)
                # Save normalizer state for inference
                normalizer.save_state(
                    os.path.join(args.output_dir, 'normalizer_state.pickle'))
                if args.savefig_path:
                    now = dt.datetime.now()
                    self.figure[0].savefig(
                        args.savefig_path + '{}_{}_{}'.format(now.month, now.day, now.hour),
                        format='png')

        print('Training done')
        plt.show()
        return print_variables
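# ---------------------------------------------------------------------------
# Hedged usage sketch (not in the original source): wiring up the args namespace
# that MDP reads. The field names (indim, outdim, device, lr, ...) are exactly
# those referenced above; the concrete values here are placeholders, not the
# authors' settings.
# ---------------------------------------------------------------------------
from argparse import Namespace

args = Namespace(
    indim=230, outdim=6, device='cuda', lr=1e-3, batch_size=64, gamma=0.99,
    max_iter=200, grip_force=300, break_thresh=0.06, epochs=1000, save_freq=50,
    output_dir='./out', checkpoint_file='', normalizer_file='', savefig_path='',
    model_path='./assets/model.xml', sim=True, render=False, quiet=False,
    hap_sample=30, position=False, shear=False, force=False)
mdp = MDP(args)
results = mdp.train_MDP()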
def run(self):
    BufferRecord = namedtuple(
        'BufferRecord',
        ['x', 's', 'm', 'f', 'a', 'old_logp_a', 'v_target', 'adv'])
    buffer = ReplayMemory(tuple_class=BufferRecord,
                          capacity=self.args.buffer_size)
    reward_history = []
    reward_averaged = []
    best_reward = -np.inf
    step = 0
    total_rec = 0

    for n_iter in range(self.args.iters):
        clip = self._ratio_clip_fn(n_iter)
        if self.args.clean_buffer:
            buffer.clean()
        ep_reward, n_rec = self._generate_rollout(buffer)
        reward_history.append(ep_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        total_rec += n_rec

        for batch in buffer.loop(self.args.record_size, self.args.epochs):
            if self.args.finetune_model and n_iter >= self.args.finetune_warmup:
                self.model.finetune(batch)
            _, summ_str = self.sess.run(
                [self.train_ops, self.merged_summary],
                feed_dict={
                    self.lr_a: self.args.lr_a,
                    self.lr_c: self.args.lr_c,
                    self.clip_range: clip,
                    self.state: batch['s'],
                    self.mask: batch['m'],
                    self.auxilary: batch['f'],
                    self.action: batch['a'],
                    self.old_logp_a: batch['old_logp_a'],
                    self.v_target: batch['v_target'],
                    self.adv: batch['adv'],
                    self.ep_reward:
                    np.mean(reward_history[-10:]) if reward_history else 0.0,
                })
            self.writer.add_summary(summ_str, step)
            step += 1

        if self.args.log_freq > 0 and (n_iter + 1) % self.args.log_freq == 0:
            logger.info(
                "[iteration:{}/step:{}], best:{}, avg:{:.2f}, clip:{:.2f}; {} transitions."
                .format(n_iter, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), clip, total_rec))
        if self.args.eval_freq > 0 and n_iter % self.args.eval_freq == 0:
            self.evaluate(folder=f'{n_iter}', load=False)
        if self.args.save_freq > 0 and (n_iter + 1) % self.args.save_freq == 0:
            self.save()
        if np.mean(reward_history[-10:]) > best_reward:
            best_reward = np.mean(reward_history[-10:])
            self.save('best')

    # FINISH
    self.save()
    logger.info(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))
    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_dict(f'{self.args.exp_dir}/learning_curve.png',
              data_dict,
              xlabel='episode')
class Agent:
    def __init__(self, action_spec: dm_env.specs.DiscreteArray,
                 observation_spec: dm_env.specs.Array, device: torch.device,
                 settings: dict) -> None:
        """
        Initializes the agent, constructs the qnet and the q_target,
        initializes the optimizer and ReplayMemory.

        Args:
            action_spec(dm_env.specs.DiscreteArray): description of the action space of the environment
            observation_spec(dm_env.specs.Array): description of observations from the environment
            device(str): "gpu" or "cpu"
            settings(dict): dictionary with settings
        """
        self.device = device
        action_size = action_spec.num_values
        state_size = np.prod(observation_spec.shape)
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']
        self.noisy_nets = settings['qnet_settings']['noisy_nets']

        self.qnet = Dqn(state_size, action_size, settings['qnet_settings']).to(device)
        self.q_target = Dqn(state_size, action_size, settings['qnet_settings']).to(device)
        self.q_target.load_state_dict(self.qnet.state_dict())
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=settings['lr'])

        self.epsilon = settings["epsilon_start"]
        self.decay = settings["epsilon_decay"]
        self.epsilon_min = settings["epsilon_min"]
        self.gamma = settings['gamma']

        self.start_optimization = settings["start_optimization"]
        self.update_qnet_every = settings["update_qnet_every"]
        self.update_target_every = settings["update_target_every"]
        self.number_steps = 0
        self.ddqn = settings["ddqn"]

        # Initialize replay memory
        self.prioritized_replay = settings["prioritized_buffer"]
        if self.prioritized_replay:
            self.memory = PrioritizedReplayMemory(
                device, settings["buffer_size"], self.gamma,
                settings["n_steps"], settings["alpha"], settings["beta0"],
                settings["beta_increment"])
        else:
            self.memory = ReplayMemory(device, settings["buffer_size"],
                                       self.gamma, settings["n_steps"])

        # Density estimator
        self.features = 'd'
        self.DE_type = 'KDE'
        if self.DE_type == 'flow':
            self.density_estimator = MAFMOGDensityEstimator(
                batch_size=50,
                n_components=3,
                n_blocks=5,
                lr=1e-4,
                use_log_density=True,
                use_density_scaling=True)
        elif self.DE_type == 'KDE':
            # self.density_estimator = FixedKernelDensityEstimator('gaussian', 0.1, use_log_density=True)
            self.density_estimator = CVKernelDensityEstimator(use_log_density=True)

        # Epistemic predictor
        self.enet = Epn((state_size + len(self.features)) - 1
                        if "x" in self.features else len(self.features),
                        action_size, settings['qnet_settings']).to(device)
        self.e_optimizer = optim.Adam(self.enet.parameters(), lr=settings['lr'])
        self.burn_in_density = 10000
        return

    def select_action(self, timestep: dm_env.TimeStep) -> int:
        """
        Returns an action following an epsilon-greedy policy.

        Args:
            timestep(dm_env.TimeStep): An observation from the environment

        Returns:
            int: The chosen action.
        """
        observation = np.array(timestep.observation).flatten()
        observation = torch.from_numpy(observation).float().to(self.device)
        self.number_steps += 1

        if not self.noisy_nets:
            self.update_epsilon()

        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            if self.number_steps <= self.burn_in_density:
                qvals = self.qnet.forward(observation)
            else:
                qvals = self.qnet.forward(observation) + \
                    0.1 * self._epistemic_uncertainty(observation.unsqueeze(0))
            return int(torch.argmax(qvals, dim=-1).cpu().detach().numpy())

    def update_epsilon(self) -> None:
        """
        Decays epsilon until self.epsilon_min

        Returns:
            None
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.decay

    @staticmethod
    def calc_loss(q_observed: torch.Tensor, q_target: torch.Tensor,
                  weights: torch.Tensor) -> typing.Tuple[torch.Tensor, np.float64]:
        """
        Returns the mean weighted MSE loss and the loss for each sample

        Args:
            q_observed(torch.Tensor): calculated q_value
            q_target(torch.Tensor): target q-value
            weights: weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.float64): mean squared error loss,
            loss for each individual sample
        """
        # print('q_observed is cuda', q_observed.is_cuda)
        # print('q_target is cuda', q_target.is_cuda)
        losses = functional.mse_loss(q_observed, q_target, reduction='none')
        loss = (weights * losses).sum() / weights.sum()
        return loss, losses.cpu().detach().numpy() + 1e-8

    def update(self, step: dm_env.TimeStep, action: int,
               next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization_step
        and updates the q_target neural network.

        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment

        Returns:
            None
        """
        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward,
                         next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = \
                self.memory.sample_batch(self.batch_size)
            self.optimization_step(s0, a0, n_step_reward, discount, s1,
                                   indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return

    def optimization_step(self, s0: torch.Tensor, a0: torch.Tensor,
                          n_step_reward: torch.Tensor, discount: torch.Tensor,
                          s1: torch.Tensor,
                          indices: typing.Optional[torch.Tensor],
                          weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet.

        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """
        with torch.no_grad():
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()
            # Calculating the target values
            next_q_vals = self.q_target(s1)
            if self.ddqn:
                a1 = torch.argmax(self.qnet(s1), dim=1).unsqueeze(-1)
                next_q_val = next_q_vals.gather(1, a1).squeeze()
            else:
                next_q_val = torch.max(next_q_vals, dim=1).values
            q_target = n_step_reward.squeeze() + \
                self.gamma * discount.squeeze() * next_q_val

        # Getting the observed q-values
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0).gather(1, a0.long()).squeeze()

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size)
        critic_loss, batch_loss = self.calc_loss(q_observed, q_target,
                                                 weights.to(self.device))

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update density estimator
        if self.number_steps % self.burn_in_density == 0:
            s0_for_d, a0_for_d, _, _, _, _, _, _, _ = self.memory.sample_batch(
                self.burn_in_density)
            self.density_estimator.fit(s0_for_d.cpu())
            if hasattr(self.density_estimator, 'kde'):
                print('steps: {}, DE fitted: {}, bandwidth: {}'.format(
                    self.number_steps, self.density_estimator.kde,
                    self.density_estimator.kde.bandwidth))

        # Update Enet
        if self.memory.number_samples() > self.burn_in_density:
            e_observed = self._epistemic_uncertainty(s0).gather(
                1, a0.long()).squeeze()
            e_loss, e_batch_loss = self.calc_loss(
                e_observed,
                torch.tensor(batch_loss).to(self.device),
                weights.to(self.device))
            if self.number_steps % self.burn_in_density == 0:
                # print("steps, Top k samples from Qnet: batch_loss:", self.number_steps, torch.topk(torch.tensor(batch_loss), 10))
                # print("Top k samples from Enet, e_observed:", torch.topk(e_observed, 10))
                print('steps, e_loss', self.number_steps, e_loss)
                # print('density', self.density_estimator.score_samples(s0.cpu()).to(self.device))
            self.e_optimizer.zero_grad()
            e_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.enet.parameters(), 5)
            self.e_optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return

    def _epistemic_uncertainty(self, x):
        """
        Computes uncertainty for input sample and
        returns epistemic uncertainty estimate.
        """
        u_in = []
        if 'x' in self.features:
            u_in.append(x)
        if 'd' in self.features:
            density_feature = self.density_estimator.score_samples(
                x.cpu()).to(self.device)
            u_in.append(density_feature)
        u_in = torch.cat(u_in, dim=1)
        return self.enet.forward(u_in)

    def pretrain_density_estimator(self, x):
        """
        Trains density estimator on input samples
        """
        self.density_estimator.fit(x.cpu())
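# ---------------------------------------------------------------------------
# Hedged usage sketch (not in the original source): a generic dm_env interaction
# loop for the Agent above. Only select_action and update come from the class;
# the environment construction is left to the caller.
# ---------------------------------------------------------------------------
import dm_env

def run_episode(env: dm_env.Environment, agent: "Agent") -> float:
    episode_return = 0.0
    timestep = env.reset()
    while not timestep.last():
        action = agent.select_action(timestep)
        next_timestep = env.step(action)
        agent.update(timestep, action, next_timestep)  # stores experience and optimizes
        episode_return += next_timestep.reward or 0.0
        timestep = next_timestep
    return episode_return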
def __init__(self, algo, env, num_actions, memory_size=10000):
    self.algo = algo
    self.env = env
    self.policy = CNN(3, 96, 96, 1)
    self.memory = ReplayMemory(memory_size)
class Agent():
    def __init__(self, algo, optimizer, env, num_actions, memory_size=10000):
        self.algo = algo  # currently not used; future work to make learning algorithms modular
        self.env = env
        self.policy = Feedforward(env.observation_space.shape[0],
                                  env.action_space.n)
        # The ReplayMemory class (written for Q-learning) is reused here, which may be
        # confusing, so to clarify: policy gradient is an on-policy method, so experiences
        # generated by a different policy cannot be used. The memory is flushed at the
        # beginning of every epoch (see the train function), so the policy is updated
        # only on trajectories from the current policy.
        self.memory = ReplayMemory(memory_size)
        self.n_actions = env.action_space.n
        if optimizer == 'Adam':
            self.optim = optim.Adam(self.policy.parameters())
        else:
            raise NotImplementedError

    def update_policy(self, gamma):
        memory = self.memory.get_memory()
        episode_length = len(memory)
        memory = list(zip(*memory))
        discounted_rewards = []
        s = memory[0]
        a = memory[1]
        ns = memory[2]
        r = memory[3]
        a_one_hot = torch.nn.functional.one_hot(torch.tensor(a),
                                                num_classes=2).float()
        for t in range(episode_length):
            r_forward = r[t:]
            G = 0
            # Compute a discounted cumulative reward from time step t
            for i, reward in enumerate(r_forward):
                G += gamma**i * reward
            discounted_rewards.append(G)
        rewards_t = torch.tensor(discounted_rewards).detach()
        s_t = torch.tensor(s).float()
        selected_action_probs = self.policy(s_t)
        prob = torch.sum(selected_action_probs * a_one_hot, axis=1)
        # A small hack to prevent inf from log(0)
        clipped = torch.clamp(prob, min=1e-10, max=1.0)
        J = -torch.log(clipped) * rewards_t
        grad = torch.sum(J)
        self.optim.zero_grad()
        grad.backward()
        self.optim.step()

    def train(self, num_epochs, gamma):
        # initialization
        for e in range(num_epochs):
            self.memory.flush()
            state = self.env.reset()
            done = False
            total_reward = 0
            self.env.render()
            t = 0
            while not done:
                action_prob = self.policy(torch.from_numpy(state).float())
                # print(action_prob.detach().numpy())
                # action = torch.argmax(action_prob).item()
                action = np.random.choice(range(self.n_actions),
                                          p=action_prob.detach().numpy())
                next_state, reward, done, _ = self.env.step(action)
                self.memory.push(state, action, next_state, reward)
                self.env.render()
                total_reward += reward
                state = next_state
                t += 1
            print("Episode: ", e)
            print("Reward: ", total_reward)
            writer.add_scalar('./runs/rewards', total_reward, e)
            self.update_policy(gamma)
        self.env.close()
        writer.close()
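# ---------------------------------------------------------------------------
# Hedged usage sketch (not in the original source): the Agent above follows the
# classic gym API (observation_space, action_space, reset/step/render), so a
# CartPole run would look roughly like this. The hyperparameter values are
# placeholders. Note that update_policy hard-codes num_classes=2, which fits
# CartPole's two-action space.
# ---------------------------------------------------------------------------
import gym

env = gym.make('CartPole-v1')
agent = Agent(algo='reinforce', optimizer='Adam', env=env,
              num_actions=env.action_space.n)
agent.train(num_epochs=500, gamma=0.99)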
class MADDPG:
    def __init__(self, n_agents, dim_obs, dim_act, batch_size, capacity,
                 episodes_before_train, use_approx=False):
        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        self.critics = [
            Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)
        ]
        self.actors_target = deepcopy(self.actors)
        self.critics_target = deepcopy(self.critics)
        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.use_cuda = torch.cuda.is_available()
        self.episodes_before_train = episodes_before_train
        self.GAMMA = 0.95
        self.tau = 0.01
        self.use_approx = use_approx
        # for gaussian noise
        self.var = [1.0 for i in range(n_agents)]
        self.critic_optimizer = [
            Adam(x.parameters(), lr=0.001) for x in self.critics
        ]
        self.actor_optimizer = [
            Adam(x.parameters(), lr=0.0001) for x in self.actors
        ]
        if self.use_approx:
            self.approx_policies = [[
                ApproxPolicy(dim_obs, dim_act) if i != j else None
                for i in range(self.n_agents)
            ] for j in range(self.n_agents)]
            self.approx_targets = deepcopy(self.approx_policies)
            self.approx_optimizer = [[
                Adam(x.parameters(), lr=0.001) if x is not None else None
                for x in approx_actor
            ] for approx_actor in self.approx_policies]
        if self.use_cuda:
            for x in self.actors:
                x.cuda()
            for x in self.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()
            if self.use_approx:
                for i in range(self.n_agents):
                    for j in range(self.n_agents):
                        if self.approx_policies[i][j] is not None:
                            self.approx_policies[i][j].cuda()
                            self.approx_targets[i][j].cuda()
        self.steps_done = 0
        self.episode_done = 0

    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        BoolTensor = torch.cuda.BoolTensor if self.use_cuda else torch.BoolTensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor

        c_loss = []
        a_loss = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = BoolTensor(
                list(map(lambda s: s is not None, batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = torch.stack(batch.states).type(FloatTensor)
            action_batch = torch.stack(batch.actions).type(FloatTensor)
            reward_batch = torch.stack(batch.rewards).type(FloatTensor)
            # non_final_next_states: (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = torch.stack([
                s for s in batch.next_states if s is not None
            ]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)

            # calculating current_Q : Q(x, a1, ..., an)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            # calculating target_Q : y = r + Q(x', a'1, ..., a'n) for all non-final states
            non_final_next_actions = None
            if self.use_approx:
                self.update_approx_policy(agent)
                param_list = [
                    self.approx_targets[agent][i](non_final_next_states[:, i, :])
                    if i != agent else None for i in range(self.n_agents)
                ]
                param_list = [
                    list(torch.chunk(param, 2 * self.n_actions))
                    if param is not None else None for param in param_list
                ]
                param_list = [
                    [torch.split(x, self.n_actions, dim=1) for x in param]
                    if param is not None else None for param in param_list
                ]
                act_pd_n = [[Normal(loc=x[0], scale=x[1]) for x in param]
                            if param is not None else None
                            for param in param_list]
                non_final_next_actions = [
                    torch.cat([x.sample() for x in act_pd])
                    if act_pd is not None else None for act_pd in act_pd_n
                ]
                non_final_next_actions[agent] = self.actors_target[agent](
                    non_final_next_states[:, agent, :])
            else:
                non_final_next_actions = [
                    self.actors_target[i](non_final_next_states[:, i, :])
                    for i in range(self.n_agents)
                ]
            non_final_next_actions = torch.stack(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0, 1).contiguous())

            target_Q = torch.zeros(self.batch_size).type(FloatTensor)
            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(
                    -1, self.n_agents * self.n_actions)).squeeze()
            target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
                reward_batch[:, agent].unsqueeze(1))

            # calculating critic loss from current_Q and target_Q
            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            # calculating actor loss
            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()

            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)
        return c_loss, a_loss

    def select_action(self, state_batch):
        # state_batch dimension: n_agents x state_dim
        # Define type of tensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        # create actions tensor
        actions = torch.zeros(self.n_agents, self.n_actions)
        # iterate over all agents
        for i in range(self.n_agents):
            # get the agent's observation
            sb = state_batch[i, :].detach()
            # forward pass
            act = self.actors[i](sb.unsqueeze(0)).squeeze()
            # add gaussian noise
            act += torch.from_numpy(
                np.random.randn(self.n_actions) * self.var[i]).type(FloatTensor)
            # decay gaussian noise
            if self.episode_done > self.episodes_before_train and self.var[i] > 0.05:
                self.var[i] *= 0.999998
            act = torch.clamp(act, -1.0, 1.0)
            actions[i, :] = act
        self.steps_done += 1
        return actions

    def update_approx_policy(self, agent_idx):
        # Define type of tensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        # infer the other agents' policies from the latest sample
        latest_sample = self.memory.latest_sample()
        experience = Experience(*zip(*latest_sample))
        latest_state = torch.stack(experience.states).type(FloatTensor).squeeze()
        latest_action = torch.stack(experience.actions).type(FloatTensor).squeeze()
        # update each approximate policy
        for i in range(self.n_agents):
            if i == agent_idx:
                continue
            # run the neural network to get the distribution parameters
            self.approx_optimizer[agent_idx][i].zero_grad()
            param = self.approx_policies[agent_idx][i](latest_state[i, :])
            param = param.unsqueeze(0)
            # create a normal distribution from the parameters
            param = torch.split(param, self.n_actions, dim=1)
            act_pd = Normal(loc=param[0], scale=param[1])
            # get a sampled action
            act_sample = act_pd.sample()
            # entropy regularization loss
            p_reg = -torch.mean(act_pd.entropy())
            # log-probability loss
            act_target = latest_action
            pg_loss = -torch.mean(act_pd.log_prob(act_target))
            loss = pg_loss + p_reg * 1e-3
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                self.approx_policies[agent_idx][i].parameters(), 1)
            self.approx_optimizer[agent_idx][i].step()
            # target network
            soft_update(self.approx_targets[agent_idx][i],
                        self.approx_policies[agent_idx][i], self.tau)
        # TODO: compute the KL divergence. This is not possible right now because the
        # approximation and target networks have different output types: the approx
        # network outputs distribution parameters, while the target network outputs
        # actions directly.

    def save(self, time, episode):
        # check path exists
        cwd = os.getcwd()
        save_dir = os.path.join(cwd, 'checkpoint')
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        # create filename
        time = time.replace(' ', '_')
        filename = 'Time_{}_NAgent_{}_Episode_{}.pth'.format(
            time, self.n_agents, episode)
        save_dir = os.path.join(save_dir, filename)
        # create saving dictionary
        checkpoint = dict()
        # saving model
        for i in range(self.n_agents):
            checkpoint['actor_{}'.format(i)] = self.actors[i].state_dict()
            checkpoint['critic_{}'.format(i)] = self.critics[i].state_dict()
            checkpoint['actor_target_{}'.format(i)] = self.actors_target[i].state_dict()
            checkpoint['critic_target_{}'.format(i)] = self.critics_target[i].state_dict()
            checkpoint['actor_optimizer_{}'.format(i)] = self.actor_optimizer[i].state_dict()
            checkpoint['critic_optimizer_{}'.format(i)] = self.critic_optimizer[i].state_dict()
            checkpoint['var_{}'.format(i)] = self.var[i]
            if self.use_approx:
                for j in range(self.n_agents):
                    if i != j:
                        checkpoint['approx_policy_{}_{}'.format(i, j)] = \
                            self.approx_policies[i][j].state_dict()
                        checkpoint['approx_target_{}_{}'.format(i, j)] = \
                            self.approx_targets[i][j].state_dict()
                        checkpoint['approx_optimizer_{}_{}'.format(i, j)] = \
                            self.approx_optimizer[i][j].state_dict()
        # saving model info
        checkpoint['n_agents'] = self.n_agents
        checkpoint['episode'] = episode
        checkpoint['time'] = str(datetime.now())
        # save
        torch.save(checkpoint, save_dir)

    def load(self, path, map_location):
        checkpoint = torch.load(path, map_location=map_location)
        # loading model
        for i in range(self.n_agents):
            self.actors[i].load_state_dict(checkpoint['actor_{}'.format(i)])
            self.critics[i].load_state_dict(checkpoint['critic_{}'.format(i)])
            self.actors_target[i].load_state_dict(
                checkpoint['actor_target_{}'.format(i)])
            self.critics_target[i].load_state_dict(
                checkpoint['critic_target_{}'.format(i)])
            self.actor_optimizer[i].load_state_dict(
                checkpoint['actor_optimizer_{}'.format(i)])
            self.critic_optimizer[i].load_state_dict(
                checkpoint['critic_optimizer_{}'.format(i)])
            self.var[i] = checkpoint['var_{}'.format(i)]
            if self.use_approx:
                for j in range(self.n_agents):
                    if i != j:
                        self.approx_policies[i][j].load_state_dict(
                            checkpoint['approx_policy_{}_{}'.format(i, j)])
                        self.approx_targets[i][j].load_state_dict(
                            checkpoint['approx_target_{}_{}'.format(i, j)])
                        self.approx_optimizer[i][j].load_state_dict(
                            checkpoint['approx_optimizer_{}_{}'.format(i, j)])

    def load_all_agent(self, path, model_number, map_location):
        '''Strictly for testing; do not use to resume training, because the critic
        networks' sizes differ.'''
        checkpoint = torch.load(path, map_location=map_location)
        # loading from one agent
        if model_number >= self.n_agents or model_number < 0:
            model_number = 0
        for i in range(self.n_agents):
            self.actors[i].load_state_dict(
                checkpoint['actor_{}'.format(model_number)])
            self.actors_target[i].load_state_dict(
                checkpoint['actor_target_{}'.format(model_number)])
            self.actor_optimizer[i].load_state_dict(
                checkpoint['actor_optimizer_{}'.format(model_number)])
            self.var[i] = checkpoint['var_{}'.format(model_number)]

    def load_agent(self, path, agent_number, model_number, map_location):
        '''Strictly for testing; do not use to resume training, because the critic
        networks' sizes differ.'''
        checkpoint = torch.load(path, map_location=map_location)
        # loading from one agent
        if agent_number >= self.n_agents or agent_number < 0:
            agent_number = 0
        if model_number >= self.n_agents or model_number < 0:
            model_number = 0
        self.actors[agent_number].load_state_dict(
            checkpoint['actor_{}'.format(model_number)])
        self.actors_target[agent_number].load_state_dict(
            checkpoint['actor_target_{}'.format(model_number)])
        self.actor_optimizer[agent_number].load_state_dict(
            checkpoint['actor_optimizer_{}'.format(model_number)])
        self.var[agent_number] = checkpoint['var_{}'.format(model_number)]
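# ---------------------------------------------------------------------------
# Hedged usage sketch (not in the original source): a minimal MADDPG training
# loop. The multi-agent `env`, the episode length, and the memory.push signature
# are assumptions; select_action, memory, update_policy, episode_done, and save
# come from the class above.
# ---------------------------------------------------------------------------
import torch
from datetime import datetime

def train_maddpg(env, n_agents=3, dim_obs=18, dim_act=2, episodes=10000,
                 max_steps=25):
    maddpg = MADDPG(n_agents, dim_obs, dim_act, batch_size=128,
                    capacity=100000, episodes_before_train=100)
    for episode in range(episodes):
        obs = torch.from_numpy(env.reset()).float()   # assumed shape: (n_agents, dim_obs)
        for step in range(max_steps):
            actions = maddpg.select_action(obs)       # (n_agents, dim_act), with Gaussian noise
            next_obs, rewards, done, _ = env.step(actions.numpy())
            next_obs = None if done else torch.from_numpy(next_obs).float()
            maddpg.memory.push(obs, actions, next_obs,
                               torch.tensor(rewards).float())  # assumed push signature
            maddpg.update_policy()
            if done:
                break
            obs = next_obs
        maddpg.episode_done += 1
        if episode % 1000 == 0:
            maddpg.save(str(datetime.now()), episode)
    return maddpg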