def main():
  FLAGS(sys.argv)

  print("algorithm : %s" % FLAGS.algorithm)
  print("timesteps : %s" % FLAGS.timesteps)
  print("exploration_fraction : %s" % FLAGS.exploration_fraction)
  print("prioritized : %s" % FLAGS.prioritized)
  print("dueling : %s" % FLAGS.dueling)
  print("num_agents : %s" % FLAGS.num_agents)
  print("lr : %s" % FLAGS.lr)

  if FLAGS.lr == 0:
    FLAGS.lr = random.uniform(0.00001, 0.001)
  print("random lr : %s" % FLAGS.lr)

  lr_round = round(FLAGS.lr, 8)

  logdir = "tensorboard"
  if FLAGS.algorithm == "deepq-4way":
    logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
        FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  elif FLAGS.algorithm == "deepq":
    logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
        FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  elif FLAGS.algorithm == "a2c":
    logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps,
        FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
        FLAGS.nsteps, lr_round, start_time)

  if FLAGS.log == "tensorboard":
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
  elif FLAGS.log == "stdout":
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

  if FLAGS.algorithm == "deepq":
    with sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=step_mul,
        visualize=True,
        screen_size_px=(16, 16),
        minimap_size_px=(16, 16)) as env:

      model = deepq.models.cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

      act = deepq_mineral_shards.learn(
          env,
          q_func=model,
          num_actions=16,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_callback)
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "deepq-4way":
    with sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=step_mul,
        screen_size_px=(32, 32),
        minimap_size_px=(32, 32),
        visualize=True) as env:

      model = deepq.models.cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

      act = deepq_mineral_4way.learn(
          env,
          q_func=model,
          num_actions=4,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_4way_callback)
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "a2c":
    num_timesteps = int(40e6)
    num_timesteps //= 4
    seed = 0

    env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                        FLAGS.num_scripts, FLAGS.map)
    policy_fn = CnnPolicy
    a2c.learn(
        policy_fn,
        env,
        seed,
        total_timesteps=num_timesteps,
        nprocs=FLAGS.num_agents + FLAGS.num_scripts,
        nscripts=FLAGS.num_scripts,
        ent_coef=0.5,
        nsteps=FLAGS.nsteps,
        max_grad_norm=0.01,
        callback=a2c_callback)
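Every learn() call in these scripts reports progress through a callback (deepq_callback, deepq_4way_callback, a2c_callback), all defined outside this excerpt. Assuming they follow the baselines convention of callback(lcl, glb), where lcl is the dict of learn()'s local variables, a minimal logging callback could look like the sketch below; the "t" and "episode_rewards" keys are assumptions borrowed from baselines.deepq, not confirmed by this file.

def deepq_callback(lcl, glb):
  # Minimal sketch assuming the baselines-style contract callback(lcl, glb).
  # The "t" and "episode_rewards" keys are assumptions based on
  # baselines.deepq; check deepq_mineral_shards.learn for its actual locals.
  t = lcl.get("t", 0)
  rewards = lcl.get("episode_rewards", [])
  if rewards and t % 1000 == 0:
    mean_100 = sum(rewards[-100:]) / float(min(len(rewards), 100))
    print("step %d | episodes %d | mean reward (last 100): %.1f"
          % (t, len(rewards), mean_100))
  return False  # in baselines, returning True stops training early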
def main():
  FLAGS(sys.argv)

  print("algorithm : %s" % FLAGS.algorithm)
  print("timesteps : %s" % FLAGS.timesteps)
  print("exploration_fraction : %s" % FLAGS.exploration_fraction)
  print("prioritized : %s" % FLAGS.prioritized)
  print("dueling : %s" % FLAGS.dueling)
  print("num_agents : %s" % FLAGS.num_agents)
  print("lr : %s" % FLAGS.lr)

  if FLAGS.lr == 0:
    FLAGS.lr = random.uniform(0.00001, 0.001)
  print("random lr : %s" % FLAGS.lr)

  lr_round = round(FLAGS.lr, 8)

  # logdir = "tensorboard"
  # if FLAGS.algorithm == "deepq-4way":
  #   logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
  #       FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
  #       FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  # elif FLAGS.algorithm == "deepq":
  #   logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
  #       FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
  #       FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  # elif FLAGS.algorithm == "a2c":
  #   logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
  #       FLAGS.algorithm, FLAGS.timesteps,
  #       FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
  #       FLAGS.nsteps, lr_round, start_time)
  # logdir = os.path.join(PROJ_DIR, logdir)

  # if FLAGS.log == "tensorboard":
  #   Logger.DEFAULT = Logger.CURRENT = Logger(
  #       dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
  # elif FLAGS.log == "stdout":
  #   Logger.DEFAULT = Logger.CURRENT = Logger(
  #       dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

  if FLAGS.algorithm == "deepq":
    with sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=step_mul,
        visualize=True,
        screen_size_px=(16, 16),
        minimap_size_px=(16, 16)) as env:

      model = deepq.models.cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

      act = deepq_mineral_shards.learn(
          env,
          q_func=model,
          num_actions=16,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_callback)
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "deepq-4way":
    with sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=step_mul,
        screen_size_px=(32, 32),
        minimap_size_px=(32, 32),
        visualize=True) as env:

      model = deepq.models.cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

      act = deepq_mineral_4way.learn(
          env,
          q_func=model,
          num_actions=4,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_4way_callback)
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "a2c":
    num_timesteps = int(40e6)
    num_timesteps //= 4
    seed = 0

    env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                        FLAGS.num_scripts, FLAGS.map)
    policy_fn = CnnPolicy
    a2c.learn(
        policy_fn,
        env,
        seed,
        total_timesteps=num_timesteps,
        nprocs=FLAGS.num_agents + FLAGS.num_scripts,
        nscripts=FLAGS.num_scripts,
        ent_coef=FLAGS.ent_coef,
        vf_coef=FLAGS.vf_coef,
        nsteps=FLAGS.nsteps,
        max_grad_norm=FLAGS.max_grad_norm,
        callback=a2c_callback)
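Each variant starts with FLAGS(sys.argv), so the flag definitions must already exist at module level. They are not part of this excerpt; below is a sketch of what they would need to cover, inferred purely from the FLAGS.* accesses above (defaults and help strings are illustrative guesses, assuming absl-style flags).

from absl import flags

# Hypothetical flag definitions inferred from the accesses above; the real
# script defines these elsewhere, and the defaults here are guesses.
FLAGS = flags.FLAGS
flags.DEFINE_string("algorithm", "deepq", "deepq, deepq-4way, or a2c")
flags.DEFINE_integer("timesteps", 2000000, "total training timesteps")
flags.DEFINE_float("exploration_fraction", 0.5,
                   "fraction of training over which epsilon is annealed")
flags.DEFINE_boolean("prioritized", True, "use prioritized replay")
flags.DEFINE_boolean("dueling", True, "use a dueling Q-network head")
flags.DEFINE_float("lr", 0.0005, "learning rate; 0 draws a random value")
flags.DEFINE_integer("num_agents", 4, "number of learning agents (a2c)")
flags.DEFINE_integer("num_scripts", 4, "number of scripted agents (a2c)")
flags.DEFINE_integer("nsteps", 20, "rollout length per update (a2c)")
flags.DEFINE_float("ent_coef", 0.5, "entropy coefficient (a2c)")
flags.DEFINE_float("vf_coef", 0.5, "value-loss coefficient (a2c)")
flags.DEFINE_float("max_grad_norm", 0.01, "gradient clipping norm (a2c)")
flags.DEFINE_string("map", "CollectMineralShards", "SC2 minigame map name")
flags.DEFINE_string("log", "tensorboard", "tensorboard or stdout")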
def main():
  FLAGS(sys.argv)
  steps = 0  # test steps

  print("algorithm : %s" % FLAGS.algorithm)
  print("timesteps : %s" % FLAGS.timesteps)
  print("exploration_fraction : %s" % FLAGS.exploration_fraction)
  print("prioritized : %s" % FLAGS.prioritized)
  print("dueling : %s" % FLAGS.dueling)
  print("num_agents : %s" % FLAGS.num_agents)
  print("lr : %s" % FLAGS.lr)

  if FLAGS.lr == 0:
    FLAGS.lr = random.uniform(0.00001, 0.001)
  print("random lr : %s" % FLAGS.lr)

  lr_round = round(FLAGS.lr, 8)

  logdir = "tensorboard"
  if FLAGS.algorithm == "deepq-4way":
    logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
        FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  elif FLAGS.algorithm == "deepq":
    logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
        FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  elif FLAGS.algorithm == "a2c":
    logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps,
        FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
        FLAGS.nsteps, lr_round, start_time)

  if FLAGS.log == "tensorboard":
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
  elif FLAGS.log == "stdout":
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

  if FLAGS.algorithm == "deepq":
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
    # Temporary solution: sc2_env.Agent(sc2_env.Race.terran) might be too
    # restricting. This change is needed because SC2 now requires players
    # to be specified explicitly.
    with sc2_env.SC2Env(
        map_name="Simple64",
        players=[
            sc2_env.Agent(race=sc2_env.Race.terran),
            sc2_env.Agent(race=sc2_env.Race.terran)
        ],
        step_mul=step_mul,
        visualize=True,
        agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

      model = cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

      acts = deepq_nexus_wars.learn(
          env,
          q_func=model,
          num_actions=16,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_callback)

      agent = random_agent.RandomAgent()
      run_loop.run_loop([agent], env, steps)

      acts[0].save("mineral_shards_x.pkl")
      acts[1].save("mineral_shards_y.pkl")

  elif FLAGS.algorithm == "deepq-4way":
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
    with sc2_env.SC2Env(
        map_name="Simple64",
        players=[
            sc2_env.Agent(race=sc2_env.Race.terran),
            sc2_env.Agent(race=sc2_env.Race.terran)
        ],
        step_mul=step_mul,
        agent_interface_format=AGENT_INTERFACE_FORMAT,
        visualize=True) as env:

      model = cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

      act = deepq_mineral_4way.learn(
          env,
          q_func=model,
          num_actions=4,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_4way_callback)
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "a2c":
    num_timesteps = int(40e6)
    num_timesteps //= 4
    seed = 0

    env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                        FLAGS.num_scripts, FLAGS.map)
    policy_fn = CnnPolicy
    a2c.learn(
        policy_fn,
        env,
        seed,
        total_timesteps=num_timesteps,
        nprocs=FLAGS.num_agents + FLAGS.num_scripts,
        nscripts=FLAGS.num_scripts,
        ent_coef=0.5,
        nsteps=FLAGS.nsteps,
        max_grad_norm=0.01,
        callback=a2c_callback)
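In the a2c branch, SubprocVecEnv is the repo's own multiprocessing wrapper: it takes the total environment count, the number of scripted environments, and the map name, rather than a list of env-constructor functions as in baselines. Its internals are outside this excerpt; the sketch below shows the general subprocess vec-env pattern such a class implements, with all names illustrative rather than the repo's actual API.

import multiprocessing as mp

def _worker(remote, make_env):
  # Runs in a child process: owns one environment and serves pipe commands.
  env = make_env()
  while True:
    cmd, data = remote.recv()
    if cmd == "step":
      remote.send(env.step(data))
    elif cmd == "reset":
      remote.send(env.reset())
    elif cmd == "close":
      remote.close()
      break

class TinyVecEnv:
  # Illustrative stand-in for the pattern behind SubprocVecEnv.
  def __init__(self, make_env_fns):
    self.remotes, child_remotes = zip(*[mp.Pipe() for _ in make_env_fns])
    self.procs = [mp.Process(target=_worker, args=(child, fn), daemon=True)
                  for child, fn in zip(child_remotes, make_env_fns)]
    for p in self.procs:
      p.start()

  def reset(self):
    for remote in self.remotes:
      remote.send(("reset", None))
    return [remote.recv() for remote in self.remotes]

  def step(self, actions):
    # Send one action per environment, then gather all transitions.
    for remote, action in zip(self.remotes, actions):
      remote.send(("step", action))
    return [remote.recv() for remote in self.remotes]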
def main():
  FLAGS(sys.argv)

  print("algorithm : %s" % FLAGS.algorithm)
  print("timesteps : %s" % FLAGS.timesteps)
  print("exploration_fraction : %s" % FLAGS.exploration_fraction)
  print("prioritized : %s" % FLAGS.prioritized)
  print("dueling : %s" % FLAGS.dueling)
  print("num_agents : %s" % FLAGS.num_agents)
  print("lr : %s" % FLAGS.lr)

  if FLAGS.lr == 0:
    FLAGS.lr = random.uniform(0.00001, 0.001)
  print("random lr : %s" % FLAGS.lr)

  lr_round = round(FLAGS.lr, 8)

  logdir = "tensorboard"
  if FLAGS.algorithm == "deepq-4way":
    logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
        FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  elif FLAGS.algorithm == "deepq":
    logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
        FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
  elif FLAGS.algorithm == "a2c":
    logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
        FLAGS.algorithm, FLAGS.timesteps,
        FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
        FLAGS.nsteps, lr_round, start_time)

  if FLAGS.log == "tensorboard":
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
  elif FLAGS.log == "stdout":
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

  if FLAGS.algorithm == "deepq":
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        # interface.feature_layer.resolution and
        # interface.feature_layer.minimap_resolution
        feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))  # was 16, 16

    with sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=step_mul,  # game speed: roughly a human player's effective actions per second
        visualize=True,
        # screen_size_px=(16, 16),
        # minimap_size_px=(16, 16),
        agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

      # This model takes an observation as input and returns values for all
      # actions; note how it is consumed by deepq_mineral_shards.learn.
      model = deepq.models.cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)],  # (num filters, kernel size, stride)
          hiddens=[256],
          dueling=True)
      # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True

      # Train the model and save it. Alternative learners tried here:
      # act = deepq_ActSeparate.learn(
      # act = deepq_actSeparateWith4Directions.learn(
      # act = deepq_actionGroup_4way.learn(
      # act = deep_DiffActInSameTime.learn(
      act = deepq_mineral_shards.learn(
          env,
          q_func=model,
          num_actions=4,  # default 16; values tried: 256, 3, 4
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          # Matching callbacks: deepq_callback, deepq_ActSeperate_callback,
          # deepq_actSeparateWith4Directions_callback,
          # deep_DiffActInSameTime_callback
          callback=deepq_actSeparateWith4Directions_callback)

      # After all training steps, save the trained model to
      # mineral_shards.pkl for use by enjoy_mineral_shards.py.
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "deepq-4way":
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
    with sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=step_mul,
        # screen_size_px=(32, 32),
        # minimap_size_px=(32, 32),
        save_replay_episodes=2,
        replay_dir="D:/StarCraft II/StarCraft II/video",
        agent_interface_format=AGENT_INTERFACE_FORMAT,
        visualize=True) as env:

      model = deepq.models.cnn_to_mlp(
          convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
      # model = deepq.models.mlp(hiddens=[256, 128, 4])

      act = deepq_mineral_4way.learn(
          env,
          q_func=model,
          num_actions=4,
          lr=FLAGS.lr,
          max_timesteps=FLAGS.timesteps,
          buffer_size=10000,
          exploration_fraction=FLAGS.exploration_fraction,
          exploration_final_eps=0.01,
          train_freq=4,
          learning_starts=10000,
          target_network_update_freq=1000,
          gamma=0.99,
          prioritized_replay=True,
          callback=deepq_4way_callback)
      act.save("mineral_shards.pkl")

  elif FLAGS.algorithm == "a2c":
    num_timesteps = int(40e6)
    num_timesteps //= 4
    seed = 0

    env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                        FLAGS.num_scripts, FLAGS.map)
    policy_fn = CnnPolicy
    a2c.learn(
        policy_fn,
        env,
        seed,
        total_timesteps=num_timesteps,
        nprocs=FLAGS.num_agents + FLAGS.num_scripts,
        nscripts=FLAGS.num_scripts,
        ent_coef=0.5,
        nsteps=FLAGS.nsteps,
        max_grad_norm=0.01,
        callback=a2c_callback)
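One detail shared by every variant: the convs tuples passed to cnn_to_mlp are (num_filters, kernel_size, stride), as in baselines. Assuming SAME padding (the baselines default), the feature-map sizes for a 32x32 screen can be sanity-checked with a few lines:

import math

# (num_filters, kernel_size, stride), matching the cnn_to_mlp calls above.
convs = [(16, 8, 4), (32, 4, 2)]

# Under SAME padding the spatial size after each conv is ceil(size / stride).
# This assumes baselines' default padding, so treat it as a sanity check only.
size = 32
for num_filters, _kernel_size, stride in convs:
  size = math.ceil(size / stride)
  print("feature map: %dx%dx%d" % (size, size, num_filters))
# 32x32 input -> 8x8x16 -> 4x4x32, flattened into the hiddens=[256] MLP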