def main():
    global_seed()
    map_name = "4x4"
    env = init_env(map_name)
    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=1e-3,
                  nsteps=3,
                  decay=0.99)
    runner = Runner(env=env,
                    model=model,
                    num_steps=3,
                    discount_rate=0.99,
                    save_summary_steps=1000,
                    performance_num_episodes=100)
    time_steps = 3000
    for _ in range(time_steps):
        runner.run()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="reinforce_lstm")
    args = parser.parse_args()

    global_seed(0)
    env = init_environment()
    model = Model(policy=LstmPolicy,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  batch_size=args.batch_size)
    runner = Runner(env=env,
                    model=model,
                    batch_size=args.batch_size,
                    discount_rate=args.discount_rate,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    # Each iteration consumes batch_size environment steps and performs one update.
    for _ in range(0, args.timesteps // args.batch_size + 1):
        observations, states, rewards, terminals, actions = runner.run()
        model.train(observations, states, rewards, terminals, actions)
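# Hedged sketch (not part of the original script): the REINFORCE runner above hands back
# per-step rewards and terminal flags together with a discount_rate argument, which
# suggests discounted Monte Carlo returns are formed before the policy update. A minimal,
# self-contained version of that computation could look like the following; the names
# `rewards`, `terminals`, and `discounted_returns` are illustrative assumptions, not the
# repository's actual API.
import numpy as np


def discounted_returns(rewards, terminals, discount_rate=0.99):
    """Accumulate G_t = r_t + gamma * G_{t+1}, resetting at episode boundaries."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0  # do not bootstrap returns across episode boundaries
        running = rewards[t] + discount_rate * running
        returns[t] = running
    return returns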
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--num_steps', type=int, default=5)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="a2c")
    args = parser.parse_args()

    global_seed(0)
    env = init_environment()
    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=args.learning_rate)
    runner = Runner(env=env,
                    model=model,
                    num_steps=args.num_steps,
                    discount_rate=args.discount_rate,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    # Each call to run() collects num_steps transitions and performs one update.
    for _ in range(0, (args.timesteps // args.num_steps) + 1):
        runner.run()
def run():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--num_steps', type=int, default=5)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="a2c")
    args = parser.parse_args()

    # Observations are the 32x32 feature screen; the minimap is reduced to a single pixel.
    dimensions = Dimensions(screen=(32, 32), minimap=(1, 1))
    interface_format = AgentInterfaceFormat(
        feature_dimensions=dimensions,
        use_feature_units=True,
    )

    global_seed(0)
    env = SC2Env(map_name="MoveToBeacon",
                 agent_interface_format=interface_format,
                 step_mul=8,
                 random_seed=1)
    env = EnvWrapper(env)
    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=args.learning_rate,
                  spatial_resolution=(5, 5))
    runner = Runner(env=env,
                    model=model,
                    batch_size=args.num_steps,
                    discount_rate=args.discount_rate,
                    summary_log_dir=args.summary_log_dir,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes)

    for _ in range(0, (args.timesteps // args.num_steps) + 1):
        runner.run()
def run():
    global_seed(0)
    env = init_env()
    model = Model(policy=PolicyFullyConnected,
                  observation_space=(80, 80),
                  action_space=3,
                  learning_rate=2e-4)
    summary_log_dir = "reinforce"
    runner = Runner(env=env,
                    model=model,
                    batch_size=128,
                    timesteps=int(1e6),
                    discount_rate=0.99,
                    summary_frequency=20000,
                    performance_num_episodes=100,
                    summary_log_dir=summary_log_dir)
    runner.run()
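# Hedged sketch (not part of the original script): an observation_space of (80, 80) with
# 3 actions is consistent with a Pong-style Atari setup, where raw frames are cropped and
# downsampled to 80x80 before reaching the policy. That preprocessing is an assumption
# about what init_env() wraps; the helper below is illustrative only.
import numpy as np


def preprocess_frame(frame):
    """Crop a 210x160x3 Atari frame, downsample by 2, and binarize to 80x80."""
    cropped = frame[35:195]                  # drop the score bar and bottom border
    downsampled = cropped[::2, ::2, 0].copy()  # every second pixel, single channel
    downsampled[downsampled == 144] = 0      # erase background colour 1
    downsampled[downsampled == 109] = 0      # erase background colour 2
    downsampled[downsampled != 0] = 1        # paddles and ball become 1
    return downsampled.astype(np.float32)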
def run():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--num_steps', type=int, default=128)
    parser.add_argument('--entropy_coefficient', type=float, default=0.01)
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--gae_gamma', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=0.95)
    parser.add_argument('--num_batches', type=int, default=4)
    parser.add_argument('--num_training_epochs', type=int, default=4)
    parser.add_argument('--clip_range', type=float, default=0.2)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="ppo_fc")
    args = parser.parse_args()
    assert args.num_steps % args.num_batches == 0

    dimensions = Dimensions(screen=(32, 32), minimap=(1, 1))
    interface_format = AgentInterfaceFormat(
        feature_dimensions=dimensions,
        use_feature_units=True,
    )

    global_seed(0)
    batch_size = args.num_steps // args.num_batches
    env = SC2Env(map_name="MoveToBeacon",
                 agent_interface_format=interface_format,
                 step_mul=8,
                 random_seed=1)
    env = EnvWrapper(env)
    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=args.learning_rate,
                  spatial_resolution=(5, 5),
                  clip_range=args.clip_range,
                  entropy_coefficient=args.entropy_coefficient)
    runner = Runner(env=env,
                    model=model,
                    num_steps=args.num_steps,
                    advantage_estimator_gamma=args.gae_gamma,
                    advantage_estimator_lambda=args.gae_lambda,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    for _ in range(0, (args.timesteps // args.num_steps) + 1):
        # Collect a rollout of num_steps transitions.
        step = runner.run()
        observations = np.asarray(step[0])
        actions = np.asarray(step[1])
        available_actions = np.asarray(step[2])
        actions_spatial = np.asarray(step[3])
        actions_spatial_mask = np.asarray(step[4])
        advantage_estimations = np.asarray(step[5])
        values = np.asarray(step[6])
        probs = np.asarray(step[7])
        probs_spatial = np.asarray(step[8])

        # Run several epochs of minibatch updates over the shuffled rollout.
        indexes = np.arange(args.num_steps)
        for _ in range(args.num_training_epochs):
            np.random.shuffle(indexes)
            for i in range(0, args.num_steps, batch_size):
                shuffled_indexes = indexes[i:i + batch_size]
                model.train(
                    observations=[observations[0][shuffled_indexes],
                                  observations[1][shuffled_indexes],
                                  observations[2][shuffled_indexes]],
                    actions=actions[shuffled_indexes],
                    available_actions_mask=available_actions[shuffled_indexes],
                    actions_spatial=actions_spatial[shuffled_indexes],
                    actions_spatial_mask=actions_spatial_mask[shuffled_indexes],
                    advantages=advantage_estimations[shuffled_indexes],
                    values=values[shuffled_indexes],
                    probs=probs[shuffled_indexes],
                    probs_spatial=probs_spatial[shuffled_indexes])
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--total_timesteps', type=int, default=int(1e6))
    parser.add_argument('--num_steps', type=int, default=128)
    parser.add_argument('--ent_coef', type=float, default=0.01)
    parser.add_argument('--learning_rate', type=float, default=3e-4)
    parser.add_argument('--vf_coef', type=float, default=0.5)
    parser.add_argument('--gae_gamma', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=0.95)
    parser.add_argument('--num_batches', type=int, default=4)
    parser.add_argument('--num_training_epochs', type=int, default=4)
    parser.add_argument('--clip_range', type=float, default=0.2)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="ppo_fc")
    args = parser.parse_args()
    assert args.num_steps % args.num_batches == 0

    global_seed(0)
    env = init_environment()
    batch_size = args.num_steps // args.num_batches
    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  batch_size=batch_size,
                  ent_coef=args.ent_coef,
                  vf_coef=args.vf_coef)
    runner = Runner(env=env,
                    model=model,
                    num_steps=args.num_steps,
                    advantage_estimator_gamma=args.gae_gamma,
                    advantage_estimator_lambda=args.gae_lambda,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    for _ in range(0, (args.total_timesteps // args.num_steps) + 1):
        # Collect a rollout of num_steps transitions.
        observations, advantages, masks, actions, values, log_probs = runner.run()

        # Run several epochs of minibatch updates over the shuffled rollout,
        # e.g. indexes [0, 1, ..., 127] split into slices of size batch_size.
        indexes = np.arange(args.num_steps)
        for _ in range(args.num_training_epochs):
            np.random.shuffle(indexes)
            for i in range(0, args.num_steps, batch_size):
                shuffled_indexes = indexes[i:i + batch_size]
                model.train(learning_rate=args.learning_rate,
                            clip_range=args.clip_range,
                            observations=observations[shuffled_indexes],
                            advantages=advantages[shuffled_indexes],
                            actions=actions[shuffled_indexes],
                            values=values[shuffled_indexes],
                            log_probs=log_probs[shuffled_indexes])
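# Hedged sketch (not part of the original scripts): both PPO entry points above pass
# gae_gamma and gae_lambda to their Runner, which suggests the runner computes
# Generalized Advantage Estimation over each collected rollout. A minimal, self-contained
# version of that computation could look like the following; the names `rewards`,
# `values`, `terminals`, and `last_value` are illustrative assumptions, not the
# repository's actual API.
import numpy as np


def estimate_advantages(rewards, values, terminals, last_value,
                        gamma=0.99, lam=0.95):
    """Compute GAE advantages and value targets for one rollout of length T."""
    num_steps = len(rewards)
    advantages = np.zeros(num_steps, dtype=np.float32)
    next_advantage = 0.0
    next_value = last_value
    for t in reversed(range(num_steps)):
        non_terminal = 1.0 - float(terminals[t])
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        # Discounted sum of residuals, cut off at episode boundaries
        next_advantage = delta + gamma * lam * non_terminal * next_advantage
        advantages[t] = next_advantage
        next_value = values[t]
    # Value-function targets are the advantages added back onto the baseline values.
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns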