def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render", action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval", type=int, default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts", type=int, default=-1,
                        help="Number of max rollouts")
    parser.add_argument("--logfile", type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument("--load_params", type=str,
                        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params", type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0, timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 0, 0, 0]),
                                initialAngles=np.array([0, 45, 0, 0, 0, 0, 0]))

    # create the state and action spaces
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionRange = env.action_range()
    actionSpace = DiscreteSpace(
        intervals=[15 for i in range(2)] + [1],
        ranges=[actionRange[1], actionRange[2], actionRange[7]])
    processor = JointProcessor(actionSpace)

    # create the model and policy functions
    modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 64, 32, 1],
                               alpha=0.001, use_gpu=True)
    if args.load_params:
        print("loading params...")
        modelFn.load_params(args.load_params)

    softmax = lambda s: np.exp(s) / np.sum(np.exp(s))
    policyFn = EpsilonGreedyPolicy(
        epsilon=0.5,
        getActionsFn=lambda state: actionSpace.sample(1024),
        distributionFn=lambda qstate: softmax(modelFn(qstate)))
    dataset = ReplayBuffer()

    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done:
            if stopsig:
                break
            action = policyFn(state)
            nextState, reward, done, info = env.step(
                createAction(processor.process_env_action(action)))
            dataset.append(state, action, reward, nextState)
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        dataset.reset()  # push trajectory into the dataset buffer
        modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10)
        print("Reward:", reward if (reward >= 0.00001) else 0,
              "with Error:", modelFn.score(),
              "with steps:", steps)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(reward) + ", " +
                      str(modelFn.score()) + "]\n")

        rollout += 1
        if rollout % 100 == 0:
            policyFn.epsilon *= 0.95
            print("Epsilon is now:", policyFn.epsilon)

    if args.logfile:
        log.close()
    if args.save_params:
        print("saving params...")
        modelFn.save_params(args.save_params)
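# Both driver scripts register signal.signal(signal.SIGINT, stopsigCallback) and poll a
# module-level `stopsig` flag, but neither the flag nor the handler appears in this listing.
# The sketch below shows what they are assumed to look like (names taken from the calls above,
# implementation assumed), together with the usual script entry point:

import signal

stopsig = False

def stopsigCallback(signum, frame):
    # Flip the flag so the rollout loops can exit cleanly on Ctrl-C.
    global stopsig
    stopsig = True

if __name__ == "__main__":
    main()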
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render", action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval", type=int, default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts", type=int, default=1000,
                        help="Number of max rollouts")
    parser.add_argument("--logfile", type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument("--load_params", type=str,
                        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params", type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    parser.add_argument("--gamma", type=float, default=0.99,
                        help="Discount factor")
    parser.add_argument("--test", action="store_true",
                        help="Test the params")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0, timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]),
                                initialAngles=np.array([0, 45, -20, -20, 0, -20, 0]))

    # create the state and action spaces
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionSpace = ContinuousSpace(ranges=env.action_range())

    # create the model and policy functions
    modelFn = PoWERDistribution(stateSpace.n, actionSpace.n,
                                sigma=5.0 if not args.test else 0)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    replayBuffer = ReplayBuffer(1024)
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done and steps < 5:
            if stopsig:
                break
            action, eps = modelFn.predict(
                state, replayBuffer.sample(gamma=args.gamma))
            if steps == 4:
                action[-1] = 1.0
            nextState, reward, done, info = env.step(action)
            replayBuffer.append(state, action, reward,
                                nextState=nextState, info={"eps": eps})
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        # no importance sampling, implement it when we have small datasets
        replayBuffer.reset()
        dataset = replayBuffer.sample(gamma=args.gamma)
        modelFn.fit(dataset)

        avgR = np.sum(dataset["rewards"]) / float(len(dataset["rewards"]))
        avgQ = np.sum(dataset["values"]) / float(len(dataset["values"]))
        print("Rollouts:", rollout,
              "Error:", modelFn.score(),
              "Average Q", avgQ,
              "Average R", avgR)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " +
                      str(avgQ) + ", " + str(avgR) + "]\n")
        rollout += 1

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
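# Example invocation of the PoWER driver above. The script filename `power_basketball.py`
# and the output filenames are hypothetical; the flags match the argparse definitions in main():
#
#   python power_basketball.py --num_rollouts 500 --gamma 0.99 \
#       --logfile power_log.txt --save_params power_params.npz
#
# Passing --test together with --load_params evaluates learned parameters with sigma=0,
# i.e. without exploration noise.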
def train(config_filepath, save_dir, device, visualize_interval):
    conf = load_toml_config(config_filepath)
    data_dir, log_dir = create_save_dir(save_dir)
    # Save config file
    shutil.copyfile(config_filepath,
                    os.path.join(save_dir, os.path.basename(config_filepath)))
    device = torch.device(device)

    # Set up log metrics
    metrics = {
        'episode': [],
        'episodic_step': [],
        'collected_total_samples': [],
        'reward': [],
        'q_loss': [],
        'policy_loss': [],
        'alpha_loss': [],
        'alpha': [],
        'policy_switch_epoch': [],
        'policy_switch_sample': [],
        'test_episode': [],
        'test_reward': [],
    }
    policy_switch_samples = conf.policy_switch_samples if hasattr(
        conf, "policy_switch_samples") else None
    total_collected_samples = 0

    # Create environment
    env = make_env(conf.environment, render=False)

    # Instantiate modules
    memory = ReplayBuffer(int(conf.replay_buffer_capacity),
                          env.observation_space.shape, env.action_space.shape)
    agent = getattr(agents, conf.agent_type)(env.observation_space,
                                             env.action_space,
                                             device=device, **conf.agent)

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        policy_switch_samples = ckpt['policy_switch_samples']
        total_collected_samples = ckpt['total_collected_samples']

    def save_checkpoint():
        # Save full checkpoint
        ckpt = {
            'metrics': metrics,
            'agent': agent.state_dict(),
            'memory': memory.state_dict(),
            'policy_switch_samples': policy_switch_samples,
            'total_collected_samples': total_collected_samples
        }
        path = os.path.join(data_dir, 'checkpoint.pth')
        torch.save(ckpt, path)
        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(data_dir, 'model.pth')
        torch.save(model_ckpt, model_path)
        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(data_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    agent_update_count = 0

    for episode in pbar:
        episodic_reward = 0
        o = env.reset()
        q1_loss, q2_loss, policy_loss, alpha_loss, alpha = None, None, None, None, None

        for t in range(conf.horizon):
            if total_collected_samples <= conf.random_sample_num:
                # Select random actions at the beginning of training.
                h = env.action_space.sample()
            elif memory.step <= conf.random_sample_num:
                # Select actions from a random latent variable soon after inserting a new subpolicy.
                h = agent.select_action(o, random=True)
            else:
                h = agent.select_action(o)
            a = agent.post_process_action(o, h)  # Convert abstract action h to actual action a

            o_next, r, done, _ = env.step(a)
            total_collected_samples += 1
            episodic_reward += r
            memory.push(o, h, r, o_next, done)
            o = o_next

            if memory.step > conf.random_sample_num:
                # Update agent
                batch_data = memory.sample(conf.agent_update_batch_size)
                q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(
                    batch_data, agent_update_count)
                agent_update_count += 1

            if done:
                break

        # Describe and save episodic metrics
        reward_moving_avg = (
            (1. - MOVING_AVG_COEF) * reward_moving_avg + MOVING_AVG_COEF * episodic_reward
            if reward_moving_avg is not None else episodic_reward)
        pbar.set_description(
            "EPISODE {} (total samples {}, subpolicy samples {}) --- Step {}, Reward {:.1f} (avg {:.1f})"
            .format(episode, total_collected_samples, memory.step, t,
                    episodic_reward, reward_moving_avg))
        metrics['episode'].append(episode)
        metrics['reward'].append(episodic_reward)
        metrics['episodic_step'].append(t)
        metrics['collected_total_samples'].append(total_collected_samples)

        if episode % visualize_interval == 0:
            # Visualize metrics
            lineplot(metrics['episode'][-len(metrics['reward']):],
                     metrics['reward'], 'REWARD', log_dir)
            reward_avg = np.array(metrics['reward']) / np.array(metrics['episodic_step'])
            lineplot(metrics['episode'][-len(reward_avg):], reward_avg,
                     'AVG_REWARD', log_dir)
            lineplot(metrics['collected_total_samples'][-len(metrics['reward']):],
                     metrics['reward'], 'SAMPLE-REWARD', log_dir, xaxis='sample')

        # Save metrics for agent update
        if q1_loss is not None:
            metrics['q_loss'].append(np.mean([q1_loss, q2_loss]))
            metrics['policy_loss'].append(policy_loss)
            metrics['alpha_loss'].append(alpha_loss)
            metrics['alpha'].append(alpha)
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q_loss']):],
                         metrics['q_loss'], 'Q_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):],
                         metrics['policy_loss'], 'POLICY_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):],
                         metrics['alpha_loss'], 'ALPHA_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):],
                         metrics['alpha'], 'ALPHA', log_dir)

        # Insert a new subpolicy layer and reset memory once a specified number of samples is collected
        if policy_switch_samples and len(policy_switch_samples) > 0 \
                and total_collected_samples >= policy_switch_samples[0]:
            print("----------------------\nInsert new policy\n----------------------")
            agent.insert_subpolicy()
            memory.reset()
            metrics['policy_switch_epoch'].append(episode)
            metrics['policy_switch_sample'].append(total_collected_samples)
            policy_switch_samples = policy_switch_samples[1:]

        # Test the policy
        if episode % conf.test_interval == 0:
            test_rewards = []
            for _ in range(conf.test_times):
                episodic_reward = 0
                obs = env.reset()
                for t in range(conf.horizon):
                    h = agent.select_action(obs, eval=True)
                    a = agent.post_process_action(obs, h)
                    obs_next, r, done, _ = env.step(a)
                    episodic_reward += r
                    obs = obs_next
                    if done:
                        break
                test_rewards.append(episodic_reward)
            test_reward_avg, test_reward_std = np.mean(test_rewards), np.std(test_rewards)
            print(" TEST --- ({} episodes) Reward {:.1f} (pm {:.1f})".format(
                conf.test_times, test_reward_avg, test_reward_std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(test_rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):],
                     metrics['test_reward'], "TEST_REWARD", log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    # Save the final model
    torch.save({'agent': agent.state_dict()},
               os.path.join(data_dir, 'final_model.pth'))
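# train() above expects a TOML config path, a save directory, a torch device string, and a
# visualization interval. A minimal command-line wrapper is sketched below; the argument names
# and defaults are assumptions and not part of the original training script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("config_filepath", type=str,
                        help="Path to the TOML config file")
    parser.add_argument("--save-dir", type=str, default="results",
                        help="Directory for checkpoints and plots")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Torch device to train on")
    parser.add_argument("--visualize-interval", type=int, default=10,
                        help="Episodes between metric plots")
    args = parser.parse_args()

    train(args.config_filepath, args.save_dir, args.device, args.visualize_interval)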