import argparse
import signal

import numpy as np
# Project classes assumed in scope: BasketballVelocityEnv, ContinuousSpace,
# PoWERDistribution, ReplayBuffer.


def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render", action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval", type=int, default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts", type=int, default=1000,
                        help="Number of max rollouts")
    parser.add_argument("--logfile", type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument("--load_params", type=str,
                        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params", type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    parser.add_argument("--gamma", type=float, default=0.99,
                        help="Discount factor")
    parser.add_argument("--test", action="store_true",
                        help="Test the params")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(
        fps=60.0, timeInterval=0.1, goal=[0, 5, 0],
        initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]),
        initialAngles=np.array([0, 45, -20, -20, 0, -20, 0]))

    # create the state and action spaces
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionSpace = ContinuousSpace(ranges=env.action_range())

    # create the model and policy functions
    modelFn = PoWERDistribution(stateSpace.n, actionSpace.n,
                                sigma=5.0 if not args.test else 0.0)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    replayBuffer = ReplayBuffer(1024)
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done and steps < 5:
            if stopsig:
                break
            action, eps = modelFn.predict(
                state, replayBuffer.sample(gamma=args.gamma))
            if steps == 4:
                action[-1] = 1.0
            nextState, reward, done, info = env.step(action)
            replayBuffer.append(state, action, reward,
                                nextState=nextState, info={"eps": eps})
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        # no importance sampling; implement it once we have small datasets
        replayBuffer.reset()
        dataset = replayBuffer.sample(gamma=args.gamma)
        modelFn.fit(dataset)

        avgR = np.sum(dataset["rewards"]) / float(len(dataset["rewards"]))
        avgQ = np.sum(dataset["values"]) / float(len(dataset["values"]))
        print("Rollouts:", rollout,
              "Error:", modelFn.score(),
              "Average Q:", avgQ,
              "Average R:", avgR)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " +
                      str(avgQ) + ", " + str(avgR) + "]\n")
        rollout += 1

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
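# Both main() training scripts in this file reference a module-level `stopsig`
# flag and a `stopsigCallback` SIGINT handler that are not defined anywhere in
# the excerpt. A minimal sketch of what they presumably look like (an
# assumption, not the original source):
stopsig = False


def stopsigCallback(signum, frame):
    # Flip the flag so the training loops can exit cleanly on Ctrl-C instead
    # of being killed mid-rollout.
    global stopsig
    stopsig = True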
import numpy as np
import torch
from torch.optim import Adam
# Project helpers assumed in scope: Actor, Critic, ReplayBuffer,
# OrnsteinUhlenbeckProcess, to_tensor, to_numpy, hard_update, soft_update,
# criterion (e.g. an MSE loss), and a global torch `device`.


class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        net_config = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
        }

        # Actor and critic initialization
        self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # Start the target networks as exact copies of the online networks
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        # Replay buffer and exploration noise
        self.memory = ReplayBuffer(args.memory_size)
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions),
                                              sigma=0.2 * np.ones(nb_actions))
        self.last_state = None
        self.last_action = None

        # Hyperparameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # CUDA
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def reset(self, obs):
        self.last_state = obs
        self.noise.reset()

    def observe(self, reward, state, done):
        self.memory.append([self.last_state, self.last_action, reward, state, done])
        self.last_state = state

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def select_action(self, state, apply_noise=False):
        self.eval()
        action = to_numpy(
            self.actor(to_tensor(np.array([state]), device=device))).squeeze(0)
        self.train()
        if apply_noise:
            action = action + self.noise.sample()
        action = np.clip(action, -1., 1.)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def update_policy(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_batch(self.batch_size)
        state = to_tensor(np.array(state_batch), device=device)
        action = to_tensor(np.array(action_batch), device=device)
        next_state = to_tensor(np.array(next_state_batch), device=device)

        # Compute the target Q value
        next_q_value = self.critic_target([next_state, self.actor_target(next_state)])
        target_q_value = to_tensor(reward_batch, device=device) \
            + self.discount * to_tensor(1 - terminal_batch.astype(np.float32),
                                        device=device) * next_q_value

        # Critic update
        self.critic.zero_grad()
        with torch.set_grad_enabled(True):
            q_values = self.critic([state, action])
            critic_loss = criterion(q_values, target_q_value.detach())
            critic_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        with torch.set_grad_enabled(True):
            policy_loss = -self.critic([state.detach(), self.actor(state)]).mean()
            policy_loss.backward()
        self.actor_optim.step()

        # Soft target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean())

    def save_model(self, output, num=1):
        if self.use_cuda:
            self.actor.to(torch.device("cpu"))
            self.critic.to(torch.device("cpu"))
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.to(device)
            self.critic.to(device)

    def load_model(self, output, num=1):
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        if self.use_cuda:
            self.cuda()
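# A minimal usage sketch for the DDPG agent above, assuming a Gym-style
# environment and an `args` namespace carrying the fields read in __init__
# (hidden1, hidden2, actor_lr, critic_lr, memory_size, batch_size, tau,
# discount, discrete, cuda). Warm-up steps use random actions before the
# learned policy takes over; this loop is illustrative, not the original
# training script.
def run_ddpg(env, agent, num_episodes=100, warmup=1000):
    total_steps = 0
    for episode in range(num_episodes):
        obs = env.reset()
        agent.reset(obs)
        done = False
        while not done:
            # Explore randomly until the replay buffer has enough samples.
            if total_steps < warmup:
                action = agent.random_action()
            else:
                action = agent.select_action(obs, apply_noise=True)
            obs, reward, done, _ = env.step(action)
            agent.observe(reward, obs, done)
            if total_steps >= warmup:
                agent.update_policy()
            total_steps += 1
    agent.save_model('.', num=1)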
# Project classes assumed in scope for this script: BasketballVelocityEnv,
# ContinuousSpace, DiscreteSpace, JointProcessor, MxFullyConnected,
# EpsilonGreedyPolicy, ReplayBuffer, createAction.


def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render", action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval", type=int, default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts", type=int, default=-1,
                        help="Number of max rollouts")
    parser.add_argument("--logfile", type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument("--load_params", type=str,
                        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params", type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(
        fps=60.0, timeInterval=0.1, goal=[0, 5, 0],
        initialLengths=np.array([0, 0, 1, 1, 0, 0, 0]),
        initialAngles=np.array([0, 45, 0, 0, 0, 0, 0]))

    # create the state space and a discretized action space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionRange = env.action_range()
    actionSpace = DiscreteSpace(
        intervals=[15 for i in range(2)] + [1],
        ranges=[actionRange[1], actionRange[2], actionRange[7]])
    processor = JointProcessor(actionSpace)

    # create the model and policy functions
    modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 64, 32, 1],
                               alpha=0.001, use_gpu=True)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    # subtract the max before exponentiating for numerical stability
    softmax = lambda s: np.exp(s - np.max(s)) / np.sum(np.exp(s - np.max(s)))
    policyFn = EpsilonGreedyPolicy(
        epsilon=0.5,
        getActionsFn=lambda state: actionSpace.sample(1024),
        distributionFn=lambda qstate: softmax(modelFn(qstate)))

    dataset = ReplayBuffer()
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done:
            if stopsig:
                break
            action = policyFn(state)
            nextState, reward, done, info = env.step(
                createAction(processor.process_env_action(action)))
            dataset.append(state, action, reward, nextState)
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        dataset.reset()  # push the trajectory into the dataset buffer
        modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10)

        print("Reward:", reward if reward >= 1e-5 else 0,
              "with Error:", modelFn.score(),
              "with steps:", steps)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(reward) + ", " +
                      str(modelFn.score()) + "]\n")

        rollout += 1
        if rollout % 100 == 0:
            policyFn.epsilon *= 0.95  # decay exploration every 100 rollouts
            print("Epsilon is now:", policyFn.epsilon)

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
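# The Q-learning script above relies on an EpsilonGreedyPolicy whose
# constructor takes `epsilon`, `getActionsFn`, and `distributionFn`, and whose
# instances are called with a state. A minimal sketch of that contract,
# reconstructed from the call site; the class name, the (state, action)
# concatenation into `qstate`, and the numpy array shapes are assumptions, not
# the project's actual implementation.
class EpsilonGreedyPolicySketch(object):
    def __init__(self, epsilon, getActionsFn, distributionFn):
        self.epsilon = epsilon
        self.getActionsFn = getActionsFn
        self.distributionFn = distributionFn

    def __call__(self, state):
        actions = self.getActionsFn(state)
        if np.random.random() < self.epsilon:
            # Explore: pick one of the candidate actions uniformly at random.
            return actions[np.random.randint(len(actions))]
        # Exploit: score every (state, action) pair with the Q-model and take
        # the action with the highest probability mass.
        qstate = np.concatenate(
            [np.tile(state, (len(actions), 1)), actions], axis=1)
        return actions[np.argmax(self.distributionFn(qstate))]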
import os
import shutil
import time
from collections import namedtuple

import gym
import numpy as np
import toml
import torch
import tqdm
# Project helpers assumed in scope: SAC, ReplayBuffer, VIME, lineplot,
# multiple_lineplot.


def train(config_file_path: str, save_dir: str, use_vime: bool,
          random_policy: bool, device: str, visualize_interval: int):
    conf_d = toml.load(open(config_file_path))
    conf = namedtuple('Config', conf_d.keys())(*conf_d.values())

    # Check whether the saving directory is valid
    if "test" in save_dir and os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    if os.path.exists(save_dir):
        raise ValueError("Directory {} already exists.".format(save_dir))
    # Create save dir
    os.makedirs(save_dir)
    ckpt_dir = os.path.join(save_dir, 'checkpoints')
    os.makedirs(ckpt_dir)
    log_dir = os.path.join(save_dir, 'logs')
    os.makedirs(log_dir)
    # Save config file
    shutil.copyfile(config_file_path,
                    os.path.join(save_dir, os.path.basename(config_file_path)))

    # Seed the random number generators
    np.random.seed(int(time.time()))
    torch.manual_seed(int(time.time()))
    device = torch.device(device)
    if device.type == 'cuda':
        torch.cuda.manual_seed(int(time.time()))

    # Set up log metrics
    metrics = {
        'episode': [],
        'collected_samples': [],
        'reward': [],            # cumulative reward
        'curiosity_reward': [],  # cumulative reward including information gain
        'likelihood': [],        # likelihood under the learned dynamics model
        'D_KL_median': [], 'D_KL_mean': [],
        'q1_loss': [], 'policy_loss': [], 'alpha_loss': [], 'alpha': [],
        'ELBO': [],
        'step': [], 'step_reward': [],
        'test_episode': [], 'test_reward': [],
    }

    # Set up environment
    print("----------------------------------------\n"
          "Train in {}\n"
          "----------------------------------------".format(conf.environment))
    env = gym.make(conf.environment)
    if use_vime:
        print("Use VIME")
    if random_policy:
        print("Keep using random policy.")

    # Training set-up
    agent = SAC(env.observation_space, env.action_space, device, **conf.agent)
    memory = ReplayBuffer(conf.replay_buffer_capacity,
                          env.observation_space.shape, env.action_space.shape)
    vime = VIME(env.observation_space.shape[0], env.action_space.shape[0],
                device, **conf.vime) if use_vime else None

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        if use_vime:
            vime.load_state_dict(ckpt['vime'])

    def save_checkpoint():
        # Save a full checkpoint
        ckpt = {'metrics': metrics, 'agent': agent.state_dict(),
                'memory': memory.state_dict()}
        if use_vime:
            ckpt['vime'] = vime.state_dict()
        path = os.path.join(ckpt_dir, 'checkpoint.pth')
        torch.save(ckpt, path)
        # Save the agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(ckpt_dir, 'model.pth')
        torch.save(model_ckpt, model_path)
        # Save the metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(ckpt_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    moving_avg_coef = 0.1
    agent_update_count = 0
    total_steps = 0

    for episode in pbar:
        o = env.reset()
        rewards, curiosity_rewards = [], []
        info_gains = []
        log_likelihoods = []
        q1_losses, q2_losses, policy_losses, alpha_losses, alphas = [], [], [], [], []

        for t in range(conf.horizon):
            if len(memory) < conf.random_sample_num or random_policy:
                a = env.action_space.sample()
            else:
                a = agent.select_action(o, eval=False)
            o_next, r, done, _ = env.step(a)
            total_steps += 1
            metrics['step'].append(total_steps)
            metrics['step_reward'].append(r)
            # done should be False if the episode was terminated forcefully
            done = False if t == env._max_episode_steps - 1 else bool(done)
            rewards.append(r)

            if use_vime and len(memory) >= conf.random_sample_num:
                # Calculate the VIME curiosity reward
                info_gain, log_likelihood = vime.calc_info_gain(o, a, o_next)
                assert not np.isnan(info_gain).any() and not np.isinf(info_gain).any(), \
                    "invalid information gain, {}".format(info_gain)
                info_gains.append(info_gain)
                log_likelihoods.append(log_likelihood)
                vime.memorize_episodic_info_gains(info_gain)
                r = vime.calc_curiosity_reward(r, info_gain)
            curiosity_rewards.append(r)

            memory.append(o, a, r, o_next, done)
            o = o_next

            # Update agent
            if len(memory) >= conf.random_sample_num and not random_policy:
                for _ in range(conf.agent_update_per_step):
                    batch_data = memory.sample(conf.agent_update_batch_size)
                    q1_loss, q2_loss, policy_loss, alpha_loss, alpha = \
                        agent.update_parameters(batch_data, agent_update_count)
                    q1_losses.append(q1_loss)
                    q2_losses.append(q2_loss)
                    policy_losses.append(policy_loss)
                    alpha_losses.append(alpha_loss)
                    alphas.append(alpha)
                    agent_update_count += 1

            if done:
                break

        if len(log_likelihoods) == 0:
            log_likelihoods.append(-np.inf)

        # Display performance
        episodic_reward = np.sum(rewards)
        reward_moving_avg = (episodic_reward if reward_moving_avg is None
                             else ((1 - moving_avg_coef) * reward_moving_avg
                                   + moving_avg_coef * episodic_reward))
        if use_vime:
            pbar.set_description(
                "EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- "
                "Steps {}, Curiosity {:.1f}, Rwd {:.1f} (m.avg {:.1f}), Likelihood {:.2E}".format(
                    episode, memory.step, len(memory), len(rewards),
                    np.sum(curiosity_rewards), episodic_reward, reward_moving_avg,
                    np.mean(np.exp(log_likelihoods))))
        else:
            pbar.set_description(
                "EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Rwd {:.1f} (mov avg {:.1f})".format(
                    episode, memory.step, len(memory), len(rewards),
                    episodic_reward, reward_moving_avg))

        # Save episodic metrics
        metrics['episode'].append(episode)
        metrics['collected_samples'].append(total_steps)
        metrics['reward'].append(episodic_reward)
        metrics['curiosity_reward'].append(np.sum(curiosity_rewards))
        metrics['likelihood'].append(np.mean(np.exp(log_likelihoods)))
        if episode % visualize_interval == 0:
            lineplot(metrics['step'][-len(metrics['step_reward']):],
                     metrics['step_reward'], 'stepwise_reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['reward']):],
                     metrics['reward'], 'reward', log_dir)
            lineplot(metrics['collected_samples'][-len(metrics['reward']):],
                     metrics['reward'], 'sample-reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['curiosity_reward']):],
                     metrics['curiosity_reward'], 'curiosity_reward', log_dir)
            lineplot(metrics['episode'][-len(metrics['likelihood']):],
                     metrics['likelihood'], 'likelihood', log_dir)

        # Agent-update related metrics
        if len(policy_losses) > 0 and not random_policy:
            metrics['q1_loss'].append(np.mean(q1_losses))
            metrics['policy_loss'].append(np.mean(policy_losses))
            metrics['alpha_loss'].append(np.mean(alpha_losses))
            metrics['alpha'].append(np.mean(alphas))
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q1_loss']):],
                         metrics['q1_loss'], 'q1_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):],
                         metrics['policy_loss'], 'policy_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):],
                         metrics['alpha_loss'], 'alpha_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):],
                         metrics['alpha'], 'alpha', log_dir)

        # Update VIME
        if use_vime and len(memory) >= conf.random_sample_num:
            for _ in range(conf.vime_update_per_episode):
                batch_s, batch_a, _, batch_s_next, _ = \
                    memory.sample(conf.vime_update_batch_size)
                elbo = vime.update_posterior(batch_s, batch_a, batch_s_next)
            metrics['ELBO'].append(elbo)
            lineplot(metrics['episode'][-len(metrics['ELBO']):],
                     metrics['ELBO'], 'ELBO', log_dir)
            if len(info_gains) > 0:
                metrics['D_KL_median'].append(np.median(info_gains))
                metrics['D_KL_mean'].append(np.mean(info_gains))
                multiple_lineplot(metrics['episode'][-len(metrics['D_KL_median']):],
                                  np.array([metrics['D_KL_median'],
                                            metrics['D_KL_mean']]).T,
                                  'D_KL', ['median', 'mean'], log_dir)

        # Test the current policy
        if episode % conf.test_interval == 0:
            rewards = []
            for _ in range(conf.test_times):
                o = env.reset()
                done = False
                episode_reward = 0
                while not done:
                    a = agent.select_action(o, eval=True)
                    o_next, r, done, _ = env.step(a)
                    episode_reward += r
                    o = o_next
                rewards.append(episode_reward)
            mean, std = np.mean(rewards), np.std(rewards)
            print("\nTEST AT EPISODE {} ({} episodes) --- Avg. Reward {:.2f} (+- {:.2f})".format(
                episode, conf.test_times, mean, std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):],
                     metrics['test_reward'], 'test_reward', log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    save_checkpoint()
    # Save the final model
    torch.save({'agent': agent.state_dict()},
               os.path.join(ckpt_dir, 'final_model.pth'))
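# A minimal command-line entry point for the SAC/VIME train() function above,
# mapping one flag to each of its parameters. The flag names and defaults are
# illustrative assumptions, not the project's documented interface.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True,
                        help='Path to a TOML config file')
    parser.add_argument('--save-dir', type=str, required=True,
                        help='Directory for checkpoints and logs (must not already exist)')
    parser.add_argument('--vime', action='store_true',
                        help='Add VIME curiosity rewards')
    parser.add_argument('--random-policy', action='store_true',
                        help='Collect data with a random policy only')
    parser.add_argument('--device', type=str, default='cpu',
                        help="torch device string, e.g. 'cpu' or 'cuda:0'")
    parser.add_argument('--visualize-interval', type=int, default=10,
                        help='Plot metrics every N episodes')
    a = parser.parse_args()
    train(a.config, a.save_dir, a.vime, a.random_policy, a.device,
          a.visualize_interval)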