def config_log(FLAGS):
    logdir = "tensorboard/%s/hrl_a2c_svib/%s_lr%s_%s/%s_%s_%s" % (
        FLAGS.env, FLAGS.num_timesteps, '0.0007', FLAGS.policy,
        start_time, FLAGS.train_option, str(FLAGS.beta))
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=logdir, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=logdir, output_formats=[HumanOutputFormat(sys.stdout)])
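# A minimal sketch of invoking config_log; SimpleNamespace stands in for the
# absl FLAGS object these scripts parse (all values hypothetical, and the
# module-level start_time is assumed to be set already).
from types import SimpleNamespace

fake_flags = SimpleNamespace(env="CollectMineralShards", num_timesteps=2000000,
                             policy="cnn", train_option="full", beta=0.001,
                             log="stdout")
config_log(fake_flags)  # routes Logger.DEFAULT / Logger.CURRENT to stdout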
def main(): FLAGS(sys.argv) logdir = "tensorboard" if FLAGS.algorithm == "deepq": logdir = "tensorboard/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time) if FLAGS.log == "tensorboard": Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif FLAGS.log == "stdout": Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) print("env : %s" % FLAGS.env) print("algorithm : %s" % FLAGS.algorithm) print("timesteps : %s" % FLAGS.timesteps) print("exploration_fraction : %s" % FLAGS.exploration_fraction) print("prioritized : %s" % FLAGS.prioritized) print("dueling : %s" % FLAGS.dueling) print("lr : %s" % FLAGS.lr) # Choose which RL algorithm to train. if FLAGS.algorithm == "deepq": # Use DQN train_dqn(env_id=FLAGS.env, num_timesteps=FLAGS.timesteps)
def main(): FLAGS(sys.argv) logdir = "tensorboard" if(FLAGS.algorithm == "deepq"): logdir = "./tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time ) if(FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir='log.txt', output_formats=[TensorBoardOutputFormat(logdir)]) elif(FLAGS.log == "stdout"): os.mkdir(logdir) Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(logdir+"/log.txt")]) with sc2_env.SC2Env( map_name="DefeatZerglingsAndBanelings", minimap_size_px = (FLAGS.minimap_size_px, FLAGS.minimap_size_px), step_mul=FLAGS.step_mul, visualize=FLAGS.visualize, game_steps_per_episode= FLAGS.episode_steps) as env: model = deepq.models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1), (64, 3, 1), (32, 3, 1)], hiddens=[256], dueling=True ) act = dqfd.learn( env, q_func=model, num_actions=FLAGS.num_actions, lr=FLAGS.lr, print_freq= FLAGS.print_freq, max_timesteps=FLAGS.timesteps, buffer_size=FLAGS.buffer_size, exploration_fraction=FLAGS.exploration_fraction, exploration_final_eps=FLAGS.exploration_final_eps, train_freq=FLAGS.train_freq, learning_starts=FLAGS.learning_starts, target_network_update_freq=FLAGS.target_network_update_freq, gamma=FLAGS.gamma, prioritized_replay=FLAGS.prioritized, callback=deepq_callback ) act.save("defeat_zerglings.pkl")
def __init__(self, act_fun=tf.nn.leaky_relu):
    print("Local rank: ", hvd.local_rank(), hvd.size())
    self.logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if hvd.rank() == 0:
        if not osp.exists(self.logdir):
            os.makedirs(self.logdir)
        self.logger = TensorBoardOutputFormat(self.logdir)
    else:
        self.logger = None
    self.act_fun = act_fun
    self.target_vars, self.saver, self.sess, self.resume_itr = setup(self.act_fun)
def main(): FLAGS(sys.argv) logdir = "tensorboard" if (FLAGS.algorithm == "deepq"): logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time) elif (FLAGS.algorithm == "acktr"): logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr, start_time) if (FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif (FLAGS.log == "stdout"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) with sc2_env.SC2Env( map_name="DefeatZerglingsAndBanelings", step_mul=step_mul, visualize=True, agent_interface_format=sc2_env.AgentInterfaceFormat( feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)), game_steps_per_episode=steps * step_mul) as env: obs = env.reset() #print(obs[0].observation) model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) demo_replay = [] act = dqfd.learn(env, q_func=model, num_actions=3, lr=1e-4, max_timesteps=10000000, buffer_size=100000, exploration_fraction=0.5, exploration_final_eps=0.01, train_freq=2, learning_starts=100000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, callback=deepq_callback) act.save("defeat_zerglings.pkl")
def main(): FLAGS(sys.argv) logdir = "tensorboard" if (FLAGS.algorithm == "deepq"): logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time) elif (FLAGS.algorithm == "acktr"): logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr, start_time) if (FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif (FLAGS.log == "stdout"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) with sc2_env.SC2Env( map_name="DefeatZerglingsAndBanelings", step_mul=step_mul, visualize=True, agent_interface_format=sc2_env.AgentInterfaceFormat( feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)), game_steps_per_episode=steps * step_mul) as env: print(env.observation_spec()) screen_dim = env.observation_spec()[0]['feature_screen'][1:3] print(screen_dim)
def main(): FLAGS(sys.argv) print("algorithm : %s" % FLAGS.algorithm) print("timesteps : %s" % FLAGS.timesteps) print("exploration_fraction : %s" % FLAGS.exploration_fraction) print("prioritized : %s" % FLAGS.prioritized) print("dueling : %s" % FLAGS.dueling) print("num_agents : %s" % FLAGS.num_agents) print("lr : %s" % FLAGS.lr) if (FLAGS.lr == 0): FLAGS.lr = random.uniform(0.00001, 0.001) print("random lr : %s" % FLAGS.lr) lr_round = round(FLAGS.lr, 8) logdir = "tensorboard" if (FLAGS.algorithm == "deepq-4way"): logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, lr_round, start_time) elif (FLAGS.algorithm == "deepq"): logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, lr_round, start_time) elif (FLAGS.algorithm == "a2c"): logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, start_time) if (FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif (FLAGS.log == "stdout"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) if (FLAGS.algorithm == "deepq"): AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat( #interface.feature_layer.resolution 和 interface.feature_layer.minimap_resolution feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32) # 16 16 # feature_dimensions = sc2_env.Dimensions(screen=32, minimap=32) # 16 16 ) with sc2_env.SC2Env( map_name="CollectMineralShards", step_mul=step_mul, #推进的速度,通俗理解就是人类玩家的每秒的有效操作 visualize=True, # screen_size_px=(16, 16), # minimap_size_px=(16, 16)) as env: agent_interface_format=AGENT_INTERFACE_FORMAT) as env: model = deepq.models.cnn_to_mlp( #his model takes as input an observation and returns values of all actions.注意如何在deepq_mineral_shards.learn用到该model convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True) #卷积核数量,卷积核大小,步长 # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True) # 卷积核数量,卷积核大小,步长 act = deepq_mineral_shards.learn( #训练模型并保存 # act = deepq_ActSeparate.learn( #训练模型并保存 # act=deepq_actSeparateWith4Directions.learn( # act = deepq_actionGroup_4way.learn( # act = deep_DiffActInSameTime.learn( env, q_func=model, num_actions=4, #default 16 num_actions=256 3 4 lr=FLAGS.lr, max_timesteps=FLAGS.timesteps, buffer_size=10000, exploration_fraction=FLAGS.exploration_fraction, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, callback=deepq_actSeparateWith4Directions_callback ) #deepq_callback; deepq_ActSeperate_callback ; deepq_actSeparateWith4Directions_callback deep_DiffActInSameTime_callback act.save( "mineral_shards.pkl" ) #在所有训练步骤之后将训练过的模型保存到mineral_shards.pkl文件中, 用于enjoy_mineral_shards.py elif (FLAGS.algorithm == "deepq-4way"): AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat( feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)) with sc2_env.SC2Env( # map_name="CollectMineralShards", step_mul=step_mul, # screen_size_px=(32, 32), # minimap_size_px=(32, 32), save_replay_episodes=2, replay_dir="D:/StarCraft II/StarCraft II/video", agent_interface_format=AGENT_INTERFACE_FORMAT, visualize=True) as env: model = deepq.models.cnn_to_mlp(convs=[(16, 
8, 4), (32, 4, 2)], hiddens=[256], dueling=True) # model = deepq.models.mlp(hiddens=[256,128,4]) act = deepq_mineral_4way.learn( env, q_func=model, num_actions=4, lr=FLAGS.lr, max_timesteps=FLAGS.timesteps, buffer_size=10000, exploration_fraction=FLAGS.exploration_fraction, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, callback=deepq_4way_callback) act.save("mineral_shards.pkl") elif (FLAGS.algorithm == "a2c"): num_timesteps = int(40e6) num_timesteps //= 4 seed = 0 env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.map) policy_fn = CnnPolicy a2c.learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=FLAGS.num_agents + FLAGS.num_scripts, nscripts=FLAGS.num_scripts, ent_coef=0.5, nsteps=FLAGS.nsteps, max_grad_norm=0.01, callback=a2c_callback)
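# For readers unfamiliar with the baselines helper: each tuple in convs is
# (num_filters, kernel_size, stride). A rough TF1 equivalent of
# cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256]) without the
# dueling head (a sketch for intuition, not the baselines implementation):
import tensorflow as tf

def q_network(obs, num_actions):
    h = tf.layers.conv2d(obs, filters=16, kernel_size=8, strides=4,
                         activation=tf.nn.relu)
    h = tf.layers.conv2d(h, filters=32, kernel_size=4, strides=2,
                         activation=tf.nn.relu)
    h = tf.layers.flatten(h)
    h = tf.layers.dense(h, 256, activation=tf.nn.relu)
    return tf.layers.dense(h, num_actions)  # one Q-value per action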
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    # Alternative: sc2_env.Dimensions(screen=84, minimap=64); both feature
    # layers are resized to square matrices of this resolution.
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64))

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=True,
            game_steps_per_episode=steps * step_mul) as env:

        # Q-network; still needs to be changed to an LSTM form.
        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)

        BicNet_findAndDefeatZergling.learn(
            env,
            lr=FLAGS.lr,
            max_timesteps=FLAGS.timesteps,
            buffer_size=100000,
            train_freq=1,
            learning_starts=1000,  # 100000
            target_network_update_freq=1000,
            gamma=0.99,
            callback=BicNet_callback)
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_SELECT_UNIT = actions.FUNCTIONS.select_unit.id
_SELECT_POINT = actions.FUNCTIONS.select_point.id
_NOT_QUEUED = [0]
_SELECT_ALL = [0]

UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

# Record the output.
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
logdir = "./tensorboard/enjoy/%s" % start_time
Logger.DEFAULT = Logger.CURRENT = Logger(
    dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

FLAGS = flags.FLAGS
flags.DEFINE_string("map_name", "DefeatZerglingsAndBanelings",
                    "the map you want to see")
flags.DEFINE_string("trained_model",
                    "/home/tld/PycharmProjects/DeepQ_StarCraft2/models/deepq/zergling_45.6.pkl",
                    "the model you have trained")
flags.DEFINE_bool("visualize", True, "whether to render the game")
flags.DEFINE_integer("num_actions", 4, "number of actions")
flags.DEFINE_integer("step_mul", 5, "game steps per agent step")
flags.DEFINE_integer("episode_steps", 2800, "game steps per episode")


def main():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name=FLAGS.map_name,
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    # Alternative: sc2_env.Dimensions(screen=84, minimap=64); both feature
    # layers are resized to square matrices of this resolution.
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32),
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # ideally about 1/10 of the training steps
    batch_size = 32
    gamma = 0.99
    num_agents = 2  # 9
    vector_obs_len = 736  # 33; 4096 for a 64x64 screen, 1024 for 32x32
    output_len = 4  # 3
    hidden_vector_len = 128  # 1
    tau = 0.001

    sess = U.make_session()
    sess.__enter__()

    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps - buffer_size)

    with sc2_env.SC2Env(
            map_name="CollectMineralShards",
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,
            game_steps_per_episode=steps * step_mul) as env:

        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  # 1000
            gamma=gamma,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size)
def main(): FLAGS(sys.argv) print("algorithm : %s" % FLAGS.algorithm) print("timesteps : %s" % FLAGS.timesteps) print("exploration_fraction : %s" % FLAGS.exploration_fraction) print("prioritized : %s" % FLAGS.prioritized) print("dueling : %s" % FLAGS.dueling) print("num_agents : %s" % FLAGS.num_agents) print("lr : %s" % FLAGS.lr) if (FLAGS.lr == 0): FLAGS.lr = random.uniform(0.00001, 0.001) print("random lr : %s" % FLAGS.lr) lr_round = round(FLAGS.lr, 8) logdir = "tensorboard" if (FLAGS.algorithm == "deepq-4way"): logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, lr_round, start_time) elif (FLAGS.algorithm == "deepq"): logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, lr_round, start_time) elif (FLAGS.algorithm == "a2c"): logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, start_time) if (FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif (FLAGS.log == "stdout"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) if (FLAGS.algorithm == "deepq"): with sc2_env.SC2Env(map_name="CollectMineralGas", step_mul=step_mul, visualize=True, screen_size_px=(16, 16), minimap_size_px=(16, 16)) as env: model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True) act = deepq_mineral_shards.learn( env, q_func=model, num_actions=16, lr=FLAGS.lr, max_timesteps=FLAGS.timesteps, buffer_size=10000, exploration_fraction=FLAGS.exploration_fraction, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, callback=deepq_callback) act.save("mineral_shards.pkl") elif (FLAGS.algorithm == "deepq-4way"): with sc2_env.SC2Env(map_name="CollectMineralGas", step_mul=step_mul, screen_size_px=(32, 32), minimap_size_px=(32, 32), visualize=True) as env: model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True) act = deepq_mineral_4way.learn( env, q_func=model, num_actions=4, lr=FLAGS.lr, max_timesteps=FLAGS.timesteps, buffer_size=10000, exploration_fraction=FLAGS.exploration_fraction, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, callback=deepq_4way_callback) act.save("mineral_shards.pkl") elif (FLAGS.algorithm == "a2c"): num_timesteps = int(40e6) num_timesteps //= 4 seed = 0 env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map) policy_fn = CnnPolicy a2c.learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=FLAGS.num_agents + FLAGS.num_scripts, nscripts=FLAGS.num_scripts, ent_coef=0.5, nsteps=FLAGS.nsteps, max_grad_norm=0.01, callback=a2c_callback)
def main():
    FLAGS(sys.argv)
    steps = 0  # test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # Temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too
        # restricting. We need this change because sc2 now requires
        # specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[sc2_env.Agent(race=sc2_env.Race.terran),
                         sc2_env.Agent(race=sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256],
                               dueling=True)

            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)

            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)

            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[sc2_env.Agent(race=sc2_env.Race.terran),
                         sc2_env.Agent(race=sc2_env.Race.terran)],
                step_mul=step_mul,
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256],
                               dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(
            policy_fn,
            env,
            seed,
            total_timesteps=num_timesteps,
            nprocs=FLAGS.num_agents + FLAGS.num_scripts,
            nscripts=FLAGS.num_scripts,
            ent_coef=0.5,
            nsteps=FLAGS.nsteps,
            max_grad_norm=0.01,
            callback=a2c_callback)
def main():
    logdir = osp.join(FLAGS.logdir, FLAGS.exp)

    if not osp.exists(logdir):
        os.makedirs(logdir)

    logger = TensorBoardOutputFormat(logdir)
    datasource = FLAGS.datasource

    def make_env(rank):
        def _thunk():
            # Make the environments non-stoppable for now
            if datasource == "maze":
                env = Maze(end=[0.7, -0.8], start=[-0.85, -0.85],
                           random_starts=False)
            elif datasource == "point":
                env = Point(end=[0.5, 0.5], start=[0.0, 0.0],
                            random_starts=True)
            elif datasource == "reacher":
                env = Reacher(end=[0.7, 0.5], eps=0.01)
            env.seed(rank)
            env = Monitor(env, os.path.join("/tmp", str(rank)),
                          allow_early_resets=True)
            return env
        return _thunk

    env = SubprocVecEnv([make_env(i + FLAGS.seed)
                         for i in range(FLAGS.num_env)])

    if FLAGS.datasource in ('point', 'maze', 'reacher'):
        if FLAGS.ff_model:
            model = TrajFFDynamics(dim_input=FLAGS.latent_dim,
                                   dim_output=FLAGS.latent_dim)
        else:
            model = TrajNetLatentFC(dim_input=FLAGS.latent_dim)

        X_NOISE = tf.placeholder(shape=(None, FLAGS.total_frame,
                                        FLAGS.input_objects, FLAGS.latent_dim),
                                 dtype=tf.float32)
        X = tf.placeholder(shape=(None, FLAGS.total_frame,
                                  FLAGS.input_objects, FLAGS.latent_dim),
                           dtype=tf.float32)

        if FLAGS.cond:
            ACTION_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        else:
            ACTION_LABEL = None

        ACTION_NOISE_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        ACTION_PLAN = tf.placeholder(shape=(None, FLAGS.plan_steps + 1, 2),
                                     dtype=tf.float32)

        X_START = tf.placeholder(shape=(None, 1, FLAGS.input_objects,
                                        FLAGS.latent_dim), dtype=tf.float32)
        X_PLAN = tf.placeholder(shape=(None, FLAGS.plan_steps,
                                       FLAGS.input_objects, FLAGS.latent_dim),
                                dtype=tf.float32)

        if FLAGS.datasource == 'reacher':
            X_END = tf.placeholder(shape=(None, 1, FLAGS.input_objects, 2),
                                   dtype=tf.float32)
        else:
            X_END = tf.placeholder(shape=(None, 1, FLAGS.input_objects,
                                          FLAGS.latent_dim), dtype=tf.float32)
    else:
        raise AssertionError("Unsupported data source")

    weights = model.construct_weights(action_size=FLAGS.action_dim)
    optimizer = AdamOptimizer(1e-2, beta1=0.0, beta2=0.999)

    if FLAGS.ff_model:
        target_vars = construct_ff_model(model, weights, X_NOISE, X,
                                         ACTION_LABEL, ACTION_NOISE_LABEL,
                                         optimizer)
        target_vars = construct_ff_plan_model(model, weights, X_PLAN, X_START,
                                              X_END, ACTION_PLAN,
                                              target_vars=target_vars)
    else:
        target_vars = construct_model(model, weights, X_NOISE, X,
                                      ACTION_LABEL, ACTION_NOISE_LABEL,
                                      optimizer)
        target_vars = construct_plan_model(model, weights, X_PLAN, X_START,
                                           X_END, ACTION_PLAN,
                                           target_vars=target_vars)

    sess = tf.InteractiveSession()
    saver = loader = tf.train.Saver(max_to_keep=10,
                                    keep_checkpoint_every_n_hours=2)
    tf.global_variables_initializer().run()
    print("Initializing variables...")

    if FLAGS.resume_iter != -1 or not FLAGS.train:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        saver.restore(sess, model_file)

    train(target_vars, saver, sess, logger, FLAGS.resume_iter, env)
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"],) for i in range(num_adversaries)]
        action_space_n = [env_info["n_actions"] for i in range(num_adversaries)]
        buffer_size = arglist.buffer_size
        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        logdir = "./tensorboard/"
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()
        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        # The first loop gathers the initial state/observations/health values.
        for agent_id in range(num_agents):
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))

        episode_step = 0
        step = 0
        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif avail_actions[0] == 1:
                    # The action cannot be executed and the agent is dead,
                    # so substitute NO_OP for the chosen action.
                    action_set_execute.append(0)
                else:
                    # The action cannot be executed, so substitute STOP.
                    action_set_execute.append(1)
                # An agent whose only available action is NO_OP is dead.
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base

            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []
            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))

            for agent_id in range(num_agents):
                if agent_id in dead_unit:
                    reward = 0
                elif action_set_execute[agent_id] != action_set_actual[agent_id]:
                    # The chosen action could not be executed: the substitute
                    # action was run, but the chosen action is stored and
                    # given a negative reward.
                    reward = -2
                elif action_set_execute[agent_id] > 5:
                    target_id = action_set_execute[agent_id] - n_actions_no_attack
                    health_reduce_en = (reward_hl_en_old[target_id]
                                        - reward_hl_en_new[target_id])
                    if health_reduce_en > 0:
                        if rew_base > 0:
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id]
                              - reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done)
            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s"
                      % (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()

                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                # Gather the initial state/observations/health values again.
                for agent_id in range(num_agents):
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if step == arglist.buffer_size:
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = (arglist.save_dir + "/model_" + str(step)
                            + "steps/" + arglist.exp_name)
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".format(
                        step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:])))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}".format(
                              step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:])
                               for rew in agent_rewards]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
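# The per-agent shaped reward in the loop above can be read as a pure
# function; a sketch under the same conventions (indices 0..5 are
# no-op/stop/move, index - 6 is the attacked enemy; shaped_reward is a
# hypothetical helper, not part of the training script):
def shaped_reward(agent_id, chosen, executed, dead_unit, rew_base,
                  hl_en_old, hl_en_new, hl_own_old, hl_own_new,
                  n_actions_no_attack=6):
    if agent_id in dead_unit:
        return 0                                  # dead agents earn nothing
    if executed != chosen:
        return -2                                 # chosen action not executable
    if executed >= n_actions_no_attack:           # attack action
        target = executed - n_actions_no_attack
        if hl_en_old[target] - hl_en_new[target] > 0:
            return 2 + rew_base if rew_base > 0 else 2
        return 1
    # otherwise: scaled change in the agent's own health
    return (hl_own_new[agent_id] - hl_own_old[agent_id]) * 5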
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    lr = 0.002
    buffer_size = 80000  # ideally about 1/10 of the training steps
    batch_size = 32
    gamma = 0.99
    num_agents = 8
    vector_obs_len = 248  # local obs: 80; global state: 168
    output_len = 14
    hidden_vector_len = 256  # 128
    tau = 0.001
    num_exploring = buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000

    env = StarCraft2Env(map_name="8m", reward_only_positive=False,
                        reward_scale_rate=200)
    env_info = env.get_env_info()

    n_episodes = 4000  # 2000
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]
    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet", timesteps, 16, lr, start_time)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         vector_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, vector_obs_len, output_len,
                           hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)
    action_noise.reset()

    # To resume from a checkpoint:
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0

        obs = np.array(env.get_obs())
        state, min_health = env.get_state()
        # Concatenate each agent's local observation with the flattened
        # global state.
        screen_expand = np.zeros([obs.shape[0], obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i], state.flatten())

        while not terminated:
            t = t + 1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = np.clip(
                action + action_noise.get_noise(t - num_exploring),
                action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2

            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]

                # Renormalize the actor output over the available actions
                # and sample one of them.
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]
                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)

                # An agent whose only available action is NO_OP is dead.
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    dead_unit.append(agent_id)

            reward, terminated, _ = env.step(actions)

            # Reward shaping: living agents share the team reward, attacking
            # (action index > 5) earns a bonus, and focusing fire with a
            # teammate on the weakest enemy earns another.
            for i in range(n_agents):
                if i not in dead_unit:
                    rew_expand[i] += reward
                    if actions[i] > 5:
                        enemy_id = actions[i] - 6
                        rew_expand[i] += 1
                        for j in range(n_agents):
                            if actions[j] == actions[i] and i != j:
                                if state[4 * n_agents + 3 * enemy_id] == min_health:
                                    rew_expand[i] += 1

            new_obs = np.array(env.get_obs())
            new_state, min_health = env.get_state()
            new_screen_expand = np.zeros([new_obs.shape[0],
                                          new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i],
                                                 new_state.flatten())

            replay_buffer.add(screen_expand, act_with_noise, rew_expand,
                              terminated, new_screen_expand)
            episode_reward += reward
            screen_expand = new_screen_expand

            if t >= num_exploring:
                print("training starts")
                # Each sample: [group0: [batch_size, trace.dimension], ..., group8]
                s_batch, a_batch, r_batch, done_batch, s2_batch = \
                    replay_buffer.sample_batch(batch_size)
                target_q = r_batch + gamma * critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(target_q, (batch_size, num_agents, output_len)))
                a_outs = actor.predict(s_batch)  # identical to a_batch
                grads = critic.action_gradients(s_batch, a_outs)  # dQ/da
                actor.train(s_batch, grads)
                actor.update_target_network()
                critic.update_target_network()

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)
        logger.dump_tabular()

    env.close()
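# The action-selection trick shared by the training loop above and the
# evaluation script below (mask the actor's [-1, 1] outputs to the currently
# available actions, renormalize, then sample) can be isolated; a sketch
# using a hypothetical helper name:
import numpy as np

def sample_available_action(actor_out, avail_mask):
    probs01 = (actor_out + 1) / 2           # rescale [-1, 1] -> [0, 1]
    avail_idx = np.nonzero(avail_mask)[0]   # indices of executable actions
    weights = probs01[avail_idx]
    total = weights.sum()
    if total == 0:
        # all masses zero: fall back to uniform, as the loop above does
        weights = np.full(len(avail_idx), 1.0 / len(avail_idx))
    else:
        weights = weights / total
    return np.random.choice(avail_idx, p=weights)

# Example: 14 actor outputs, only actions 0, 1 and 6 currently available.
mask = np.zeros(14)
mask[[0, 1, 6]] = 1
print(sample_available_action(np.random.uniform(-1, 1, 14), mask))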
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    lr = 0.002
    batch_size = 32
    num_agents = 8
    vector_obs_len = 248  # local obs: 80; global state: 168
    output_len = 14
    hidden_vector_len = 256
    tau = 0.001

    env = StarCraft2Env(map_name="8m", reward_only_positive=False,
                        reward_scale_rate=200)
    env_info = env.get_env_info()

    n_episodes = 4000
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]
    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet", timesteps, 16, lr, start_time)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         vector_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, vector_obs_len, output_len,
                           hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    model_file_load = os.path.join(str(300000) + "_" + "model_segment_training/",
                                   "defeat_zerglings")
    U.load_state(model_file_load, sess)

    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0

        obs = np.array(env.get_obs())
        state, min_health = env.get_state()
        screen_expand = np.zeros([obs.shape[0], obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i], state.flatten())

        while not terminated:
            t = t + 1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = action  # no exploration noise at evaluation time
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]
                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)

            reward, terminated, _ = env.step(actions)

            new_obs = np.array(env.get_obs())
            new_state, min_health = env.get_state()
            new_screen_expand = np.zeros([new_obs.shape[0],
                                          new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i],
                                                 new_state.flatten())

            episode_reward += reward
            screen_expand = new_screen_expand

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)
        logger.dump_tabular()

    env.close()
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    # Alternative: sc2_env.Dimensions(screen=84, minimap=64); both feature
    # layers are resized to square matrices of this resolution.
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),
        use_feature_units=True)

    lr = FLAGS.lr
    batch_size = 32
    gamma = 0.99
    num_agents = 9
    vector_obs_len = 33  # 4096 for a 64x64 screen, 1024 for 32x32
    output_len = 3
    hidden_vector_len = 128  # 1
    tau = 0.001

    sess = U.make_session()
    sess.__enter__()

    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            save_replay_episodes=1,
            replay_dir="D:/StarCraft II/StarCraft II/Replays/video/0722",
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,
            game_steps_per_episode=steps * step_mul) as env:

        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            actor=actor,
            num_agents=num_agents)
def main(): FLAGS(sys.argv) print("algorithm : %s" % FLAGS.algorithm) print("timesteps : %s" % FLAGS.timesteps) print("exploration_fraction : %s" % FLAGS.exploration_fraction) print("prioritized : %s" % FLAGS.prioritized) print("dueling : %s" % FLAGS.dueling) print("num_cpu : %s" % FLAGS.num_cpu) print("lr : %s" % FLAGS.lr) logdir = "tensorboard" if (FLAGS.algorithm == "deepq"): logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time) elif (FLAGS.algorithm == "acktr"): logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr, start_time) if (FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif (FLAGS.log == "stdout"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) if (FLAGS.algorithm == "deepq"): with sc2_env.SC2Env("CollectMineralShards", step_mul=step_mul, visualize=True) as env: model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True) act = deepq_mineral_shards.learn(env, q_func=model, num_actions=64, lr=1e-3, max_timesteps=20000000, buffer_size=10000, exploration_fraction=0.5, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, callback=deepq_callback) act.save("mineral_shards.pkl") elif (FLAGS.algorithm == "acktr"): num_timesteps = int(40e6) num_timesteps //= 4 seed = 0 # def make_env(rank): # # env = sc2_env.SC2Env( # # "CollectMineralShards", # # step_mul=step_mul) # # return env # #env.seed(seed + rank) # def _thunk(): # env = sc2_env.SC2Env( # map_name=FLAGS.map, # step_mul=step_mul, # visualize=True) # #env.seed(seed + rank) # if logger.get_dir(): # env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank))) # return env # return _thunk # agents = [Agent() # for _ in range(num_cpu)] # # for agent in agents: # time.sleep(1) # agent.daemon = True # agent.start() # agent_controller = AgentController(agents) #set_global_seeds(seed) env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map) policy_fn = CnnPolicy acktr_disc.learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=FLAGS.num_cpu, ent_coef=0.1, callback=acktr_callback)
def main(): if FLAGS.dataset == "mnist": train_dataloader = DataLoader(MNIST("/root/data", train=True, download=True, transform=transforms.ToTensor()), batch_size=FLAGS.batch_size) test_dataloader = DataLoader(MNIST("/root/data", train=False, download=True, transform=transforms.ToTensor()), batch_size=FLAGS.batch_size) input_dim = 784 prob_dist = "discrete" else: train_dataloader = DataLoader(FreyFaces(train=True), batch_size=FLAGS.batch_size) test_dataloader = DataLoader(FreyFaces(train=False), batch_size=FLAGS.batch_size) input_dim = 560 prob_dist = "continuous" model = VAE(hidden_dim=FLAGS.latent_dim, input_dim=input_dim, nh=FLAGS.hidden_dim, prob_dist=prob_dist).train().cuda() logdir = osp.join(FLAGS.logdir, FLAGS.exp) optimizer = optim.Adam(model.parameters(), lr=1e-3) logger = TensorBoardOutputFormat(logdir) it = FLAGS.resume_iter if not osp.exists(logdir): os.makedirs(logdir) if FLAGS.resume_iter != 0: model_path = osp.join(logdir, "model_{}".format(FLAGS.resume_iter)) model.load_state_dict(torch.load(model_path)) if FLAGS.train: stop = False its = [] train_losses = [] test_losses = [] test_dataloader_iter = iter(test_dataloader) while it < FLAGS.num_iter: for dat, label in tqdm(train_dataloader): if FLAGS.dataset == "mnist": dat = dat.cuda().reshape((dat.size(0), 28 * 28)) else: dat = dat.float().cuda().reshape((dat.size(0), 28 * 20)) optimizer.zero_grad() outputs = model.forward(dat) if FLAGS.dataset == "mnist": loss = model.compute_loss(outputs, dat) else: loss = model.compute_loss(outputs, dat) loss.backward() optimizer.step() if it % (100 * FLAGS.batch_size) == 0: loss = loss.item() logger.writekvs({"loss": loss}) print(it, loss) if FLAGS.gen_plots: its.append(it) if FLAGS.estimate_prob: estimate_prob = model.estimate_prob(dat).item() print(estimate_prob) train_losses.append(estimate_prob) else: train_losses.append(-1 * loss) try: dat, label = test_dataloader_iter.next() except: test_dataloader_iter = iter(test_dataloader) dat, label = test_dataloader_iter.next() if FLAGS.dataset == "mnist": dat = dat.cuda().reshape((dat.size(0), 28 * 28)) else: dat = dat.float().cuda().reshape( (dat.size(0), 28 * 20)) outputs = model.forward(dat) if FLAGS.dataset == "mnist": loss = model.compute_loss(outputs, dat) else: loss = model.compute_loss(outputs, dat) if FLAGS.estimate_prob: estimate_prob = model.estimate_prob(dat).item() test_losses.append(estimate_prob) else: test_losses.append(-1 * loss) it += FLAGS.batch_size if it > FLAGS.num_iter: break if FLAGS.gen_plots: plt.semilogx(its, train_losses, "r") plt.semilogx(its, test_losses, "b") plt.ylabel("ELBO") if FLAGS.dataset == "frey": data_string = "Frey Faces" elif FLAGS.dataset == "mnist": data_string = "MNIST" plt.title("{}, $N_z = {}$".format(data_string, FLAGS.latent_dim)) time = str(datetime.datetime.now()) plt.savefig("plot_{}_{}_{}.png".format(FLAGS.dataset, time, FLAGS.latent_dim)) model_path = osp.join(logdir, "model_{}".format(it)) torch.save(model.state_dict(), model_path) if FLAGS.latent_traversal: intervals = np.linspace(-1, 1, 8) grid = np.meshgrid(intervals, intervals) value = np.stack([grid[0], grid[1]], axis=2) latent_input = torch.from_numpy(value.reshape((64, 2))).float().cuda() output = model.generate_sample(z=latent_input) else: output = model.generate_sample() output = output.cpu().detach().numpy() if FLAGS.dataset == "mnist": output = output.reshape((8, 8, 28, 28)).transpose( (0, 2, 1, 3)).reshape((8 * 28, 8 * 28)) elif FLAGS.dataset == "frey": output = output.reshape((8, 8, 28, 20)).transpose( (0, 2, 1, 
3)).reshape((8 * 28, 8 * 20)) time = str(datetime.datetime.now()) imsave("test_{}_{}_{}.png".format(FLAGS.dataset, time, FLAGS.latent_dim), output) print("Done")
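# The reshape/transpose pair above tiles 64 decoded images into a single
# 8x8 grid; a minimal self-contained demonstration of the same trick:
import numpy as np

imgs = np.zeros((64, 28, 28))  # 64 fake 28x28 images
grid = imgs.reshape((8, 8, 28, 28)).transpose(
    (0, 2, 1, 3)).reshape((8 * 28, 8 * 28))
assert grid.shape == (224, 224)  # row = image_row*28 + y, col = image_col*28 + x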
import os
from glob import glob

import numpy as np

from baselines.bench.monitor import load_results
from baselines.logger import TensorBoardOutputFormat

# Convert every monitor.csv under ../../result into TensorBoard event files,
# smoothing each statistic over a window of 100 episodes.
monitor_files = glob(os.path.join(os.path.dirname(__file__), '../../result',
                                  '**/monitor.csv'), recursive=True)

for monitor in monitor_files:
    dir = os.path.dirname(monitor)
    csv = load_results(dir)
    tb = TensorBoardOutputFormat(os.path.join(dir, 'tb2'))
    length = 100
    kv = {}
    for i in range(length, csv.r.size):
        t = csv.t.values[i]
        r = csv.r.values[i - length:i]
        l = csv.l.values[i - length:i]
        e = csv.best_exec.values[i - length:i] * 1000  # seconds to ms
        kv['EpExecMean'] = np.mean(e)
        kv['EpRewMean'] = np.mean(r)
        kv['EpLenMean'] = np.mean(l)
        tb.writekvs_wt(kv, t)
    tb.close()
def main(): print("Local rank: ", hvd.local_rank(), hvd.size()) FLAGS.exp = FLAGS.exp + '_' + FLAGS.divergence logdir = osp.join(FLAGS.logdir, FLAGS.exp) if hvd.rank() == 0: if not osp.exists(logdir): os.makedirs(logdir) logger = TensorBoardOutputFormat(logdir) else: logger = None print("Loading data...") dataset = Cifar10(augment=FLAGS.augment, rescale=FLAGS.rescale) test_dataset = Cifar10(train=False, rescale=FLAGS.rescale) channel_num = 3 X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32) X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32) LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32) LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32) if FLAGS.large_model: model = ResNet32Large( num_channels=channel_num, num_filters=128, train=True) model_dis = ResNet32Large( num_channels=channel_num, num_filters=128, train=True) elif FLAGS.larger_model: model = ResNet32Larger( num_channels=channel_num, num_filters=128) model_dis = ResNet32Larger( num_channels=channel_num, num_filters=128) elif FLAGS.wider_model: model = ResNet32Wider( num_channels=channel_num, num_filters=256) model_dis = ResNet32Wider( num_channels=channel_num, num_filters=256) else: model = ResNet32( num_channels=channel_num, num_filters=128) model_dis = ResNet32( num_channels=channel_num, num_filters=128) print("Done loading...") grad_exp, conjugate_grad_exp = get_divergence_funcs(FLAGS.divergence) data_loader = DataLoader( dataset, batch_size=FLAGS.batch_size, num_workers=FLAGS.data_workers, drop_last=True, shuffle=True) weights = [model.construct_weights('context_energy'), model_dis.construct_weights('context_dis')] Y = tf.placeholder(shape=(None), dtype=tf.int32) # Varibles to run in training X_SPLIT = tf.split(X, FLAGS.num_gpus) X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus) LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus) LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus) LABEL_SPLIT_INIT = list(LABEL_SPLIT) tower_grads = [] tower_grads_dis = [] tower_grads_l2 = [] tower_grads_dis_l2 = [] optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999) optimizer = hvd.DistributedOptimizer(optimizer) optimizer_dis = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999) optimizer_dis = hvd.DistributedOptimizer(optimizer_dis) for j in range(FLAGS.num_gpus): energy_pos = [ model.forward( X_SPLIT[j], weights[0], label=LABEL_POS_SPLIT[j], stop_at_grad=False)] energy_pos = tf.concat(energy_pos, axis=0) score_pos = [ model_dis.forward( X_SPLIT[j], weights[1], label=LABEL_POS_SPLIT[j], stop_at_grad=False)] score_pos = tf.concat(score_pos, axis=0) print("Building graph...") x_mod = x_orig = X_NOISE_SPLIT[j] x_grads = [] energy_negs = [] loss_energys = [] energy_negs.extend([model.forward(tf.stop_gradient( x_mod), weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)]) eps_begin = tf.zeros(1) steps = tf.constant(0) c = lambda i, x: tf.less(i, FLAGS.num_steps) def langevin_step(counter, x_mod): x_mod = x_mod + tf.random_normal(tf.shape(x_mod), mean=0.0, stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale) energy_noise = energy_start = tf.concat( [model.forward( x_mod, weights[0], label=LABEL_SPLIT[j], reuse=True, stop_at_grad=False, stop_batch=True)], axis=0) x_grad, label_grad = tf.gradients(energy_noise, [x_mod, LABEL_SPLIT[j]]) energy_noise_old = energy_noise lr = FLAGS.step_lr if FLAGS.proj_norm != 0.0: if FLAGS.proj_norm_type == 'l2': x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm) elif FLAGS.proj_norm_type == 'li': x_grad = tf.clip_by_value( x_grad, -FLAGS.proj_norm, 
FLAGS.proj_norm) else: print("Other types of projection are not supported!!!") assert False # Clip gradient norm for now if FLAGS.hmc: # Step size should be tuned to get around 65% acceptance def energy(x): return FLAGS.temperature * \ model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True) x_last = hmc(x_mod, 15., 10, energy) else: x_last = x_mod - (lr) * x_grad x_mod = x_last x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale) counter = counter + 1 return counter, x_mod steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod)) energy_eval = model.forward(x_mod, weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True) x_grad = tf.gradients(energy_eval, [x_mod])[0] x_grads.append(x_grad) energy_negs.append( model.forward( tf.stop_gradient(x_mod), weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)) score_neg = model_dis.forward( tf.stop_gradient(x_mod), weights[1], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True) test_x_mod = x_mod temp = FLAGS.temperature energy_neg = energy_negs[-1] x_off = tf.reduce_mean( tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j])) loss_energy = model.forward( x_mod, weights[0], reuse=True, label=LABEL, stop_grad=True) print("Finished processing loop construction ...") target_vars = {} if FLAGS.cclass or FLAGS.model_cclass: label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0) label_prob = label_sum / tf.reduce_sum(label_sum) label_ent = -tf.reduce_sum(label_prob * tf.math.log(label_prob + 1e-7)) else: label_ent = tf.zeros(1) target_vars['label_ent'] = label_ent if FLAGS.train: loss_dis = - (tf.reduce_mean(grad_exp(score_pos + energy_pos)) - tf.reduce_mean(conjugate_grad_exp(score_neg + energy_neg))) loss_dis = loss_dis + FLAGS.l2_coeff * (tf.reduce_mean(tf.square(score_pos)) + tf.reduce_mean(tf.square(score_neg))) l2_dis = FLAGS.l2_coeff * (tf.reduce_mean(tf.square(score_pos)) + tf.reduce_mean(tf.square(score_neg))) loss_model = tf.reduce_mean(grad_exp(score_pos + energy_pos)) + \ tf.reduce_mean(energy_neg * tf.stop_gradient(conjugate_grad_exp(score_neg + energy_neg))) - \ tf.reduce_mean(energy_neg) * tf.stop_gradient(tf.reduce_mean(conjugate_grad_exp(score_neg + energy_neg))) - \ tf.reduce_mean(conjugate_grad_exp(score_neg + energy_neg)) loss_model = loss_model + FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square(energy_neg))) l2_model = FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square(energy_neg))) print("Started gradient computation...") model_vars = [var for var in tf.trainable_variables() if 'context_energy' in var.name] print("model var number", len(model_vars)) dis_vars = [var for var in tf.trainable_variables() if 'context_dis' in var.name] print("discriminator var number", len(dis_vars)) gvs = optimizer.compute_gradients(loss_model, model_vars) gvs = [(k, v) for (k, v) in gvs if k is not None] tower_grads.append(gvs) gvs = optimizer.compute_gradients(l2_model, model_vars) gvs = [(k, v) for (k, v) in gvs if k is not None] tower_grads_l2.append(gvs) gvs_dis = optimizer_dis.compute_gradients(loss_dis, dis_vars) gvs_dis = [(k, v) for (k, v) in gvs_dis if k is not None] tower_grads_dis.append(gvs_dis) gvs_dis = optimizer_dis.compute_gradients(l2_dis, dis_vars) gvs_dis = [(k, v) for (k, v) in gvs_dis if k is not None] tower_grads_dis_l2.append(gvs_dis) print("Finished applying gradients.") target_vars['total_loss'] = loss_model target_vars['loss_energy'] = loss_energy target_vars['weights'] = weights target_vars['gvs'] = gvs target_vars['X'] = X 
target_vars['Y'] = Y target_vars['LABEL'] = LABEL target_vars['LABEL_POS'] = LABEL_POS target_vars['X_NOISE'] = X_NOISE target_vars['energy_pos'] = energy_pos target_vars['energy_start'] = energy_negs[0] if len(x_grads) >= 1: target_vars['x_grad'] = x_grads[-1] target_vars['x_grad_first'] = x_grads[0] else: target_vars['x_grad'] = tf.zeros(1) target_vars['x_grad_first'] = tf.zeros(1) target_vars['x_mod'] = x_mod target_vars['x_off'] = x_off target_vars['temp'] = temp target_vars['energy_neg'] = energy_neg target_vars['test_x_mod'] = test_x_mod target_vars['eps_begin'] = eps_begin target_vars['score_neg'] = score_neg target_vars['score_pos'] = score_pos if FLAGS.train: grads_model = average_gradients(tower_grads) train_op_model = optimizer.apply_gradients(grads_model) target_vars['train_op_model'] = train_op_model grads_model_l2 = average_gradients(tower_grads_l2) train_op_model_l2 = optimizer.apply_gradients(grads_model_l2) target_vars['train_op_model_l2'] = train_op_model_l2 grads_model_dis = average_gradients(tower_grads_dis) train_op_dis = optimizer_dis.apply_gradients(grads_model_dis) target_vars['train_op_dis'] = train_op_dis grads_model_dis_l2 = average_gradients(tower_grads_dis_l2) train_op_dis_l2 = optimizer_dis.apply_gradients(grads_model_dis_l2) target_vars['train_op_dis_l2'] = train_op_dis_l2 config = tf.ConfigProto() if hvd.size() > 1: config.gpu_options.visible_device_list = str(hvd.local_rank()) sess = tf.Session(config=config) saver = loader = tf.train.Saver(max_to_keep=500) total_parameters = 0 for variable in tf.trainable_variables(): # shape is an array of tf.Dimension shape = variable.get_shape() variable_parameters = 1 for dim in shape: variable_parameters *= dim.value total_parameters += variable_parameters print("Model has a total of {} parameters".format(total_parameters)) sess.run(tf.global_variables_initializer()) resume_itr = 0 if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0: model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter)) resume_itr = FLAGS.resume_iter saver.restore(sess, model_file) # optimistic_restore(sess, model_file) sess.run(hvd.broadcast_global_variables(0)) print("Initializing variables...") print("Start broadcast") print("End broadcast") if FLAGS.train: train(target_vars, saver, sess, logger, data_loader, resume_itr, logdir) test(target_vars, saver, sess, logger, data_loader)
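# ---------------------------------------------------------------------------
# Illustration only (not part of the training script above): a minimal NumPy
# sketch of the Langevin update that langevin_step builds symbolically. The
# quadratic toy energy and all constants here are hypothetical stand-ins for
# model.forward and the FLAGS values; only the structure (noise injection,
# energy-gradient descent, clipping to the data range) mirrors the TF loop.
import numpy as np

def toy_energy_grad(x):
    # Gradient of E(x) = 0.5 * ||x||^2, a made-up energy for the demo.
    return x

def langevin_sample(x, num_steps=60, step_lr=0.1, noise_std=0.005, rescale=1.0):
    for _ in range(num_steps):
        x = x + np.random.normal(0.0, noise_std * rescale, size=x.shape)
        x = x - step_lr * toy_energy_grad(x)
        x = np.clip(x, 0, rescale)  # keep samples inside the image value range
    return x

x0 = np.random.uniform(0, 1, size=(4, 32, 32, 3))
print(langevin_sample(x0).mean())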
def main():
    print("Local rank: ", hvd.local_rank(), hvd.size())

    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if hvd.rank() == 0:
        if not osp.exists(logdir):
            os.makedirs(logdir)
        logger = TensorBoardOutputFormat(logdir)
    else:
        logger = None

    LABEL = None
    print("Loading data...")
    if FLAGS.dataset == 'cifar10':
        dataset = Cifar10(augment=FLAGS.augment, rescale=FLAGS.rescale)
        test_dataset = Cifar10(train=False, rescale=FLAGS.rescale)
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        if FLAGS.large_model:
            model = ResNet32Large(num_channels=channel_num, num_filters=128, train=True)
        elif FLAGS.larger_model:
            model = ResNet32Larger(num_channels=channel_num, num_filters=128)
        elif FLAGS.wider_model:
            model = ResNet32Wider(num_channels=channel_num, num_filters=192)
        else:
            model = ResNet32(num_channels=channel_num, num_filters=128)

    elif FLAGS.dataset == 'imagenet':
        dataset = Imagenet(train=True)
        test_dataset = Imagenet(train=False)
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet32Wider(num_channels=channel_num, num_filters=256)

    elif FLAGS.dataset == 'imagenetfull':
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet128(num_channels=channel_num, num_filters=64)

    elif FLAGS.dataset == 'mnist':
        dataset = Mnist(rescale=FLAGS.rescale)
        test_dataset = dataset
        channel_num = 1

        X_NOISE = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        model = MnistNet(num_channels=channel_num, num_filters=FLAGS.num_filters)

    elif FLAGS.dataset == 'dsprites':
        dataset = DSprites(cond_shape=FLAGS.cond_shape, cond_size=FLAGS.cond_size,
                           cond_pos=FLAGS.cond_pos, cond_rot=FLAGS.cond_rot)
        test_dataset = dataset
        channel_num = 1

        X_NOISE = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)

        if FLAGS.dpos_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.dsize_only:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.drot_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_size:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.cond_shape:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)
        elif FLAGS.cond_pos:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_rot:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        else:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)

        model = DspritesNet(num_channels=channel_num, num_filters=FLAGS.num_filters,
                            cond_size=FLAGS.cond_size, cond_shape=FLAGS.cond_shape,
                            cond_pos=FLAGS.cond_pos, cond_rot=FLAGS.cond_rot)

    print("Done loading...")

    if FLAGS.dataset == "imagenetfull":
        # In the case of full ImageNet, use a custom TensorFlow dataloader
        data_loader = TFImagenetLoader('train', FLAGS.batch_size, hvd.rank(), hvd.size(),
                                       rescale=FLAGS.rescale)
    else:
        data_loader = DataLoader(dataset, batch_size=FLAGS.batch_size,
                                 num_workers=FLAGS.data_workers, drop_last=True, shuffle=True)

    batch_size = FLAGS.batch_size

    weights = [model.construct_weights('context_0')]

    Y = tf.placeholder(shape=(None), dtype=tf.int32)

    # Variables to run in training
    X_SPLIT = tf.split(X, FLAGS.num_gpus)
    X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
    LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
    LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
    LABEL_SPLIT_INIT = list(LABEL_SPLIT)
    tower_grads = []
    tower_gen_grads = []
    x_mod_list = []

    optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    optimizer = hvd.DistributedOptimizer(optimizer)

    for j in range(FLAGS.num_gpus):

        if FLAGS.model_cclass:
            ind_batch_size = FLAGS.batch_size // FLAGS.num_gpus
            label_tensor = tf.Variable(
                tf.convert_to_tensor(
                    np.reshape(np.tile(np.eye(10), (FLAGS.batch_size, 1, 1)),
                               (FLAGS.batch_size * 10, 10)),
                    dtype=tf.float32),
                trainable=False,
                dtype=tf.float32)
            x_split = tf.tile(tf.reshape(X_SPLIT[j], (ind_batch_size, 1, 32, 32, 3)),
                              (1, 10, 1, 1, 1))
            x_split = tf.reshape(x_split, (ind_batch_size * 10, 32, 32, 3))
            energy_pos = model.forward(x_split, weights[0], label=label_tensor,
                                       stop_at_grad=False)

            energy_pos_full = tf.reshape(energy_pos, (ind_batch_size, 10))
            energy_partition_est = tf.reduce_logsumexp(energy_pos_full, axis=1, keepdims=True)
            uniform = tf.random_uniform(tf.shape(energy_pos_full))
            # Gumbel-max sampling of a label from the energy-defined categorical.
            label_tensor = tf.argmax(-energy_pos_full - tf.log(-tf.log(uniform)) -
                                     energy_partition_est, axis=1)
            label = tf.one_hot(label_tensor, 10, dtype=tf.float32)
            label = tf.Print(label, [label_tensor, energy_pos_full])
            LABEL_SPLIT[j] = label
            energy_pos = tf.concat(energy_pos, axis=0)
        else:
            energy_pos = [model.forward(X_SPLIT[j], weights[0],
                                        label=LABEL_POS_SPLIT[j], stop_at_grad=False)]
            energy_pos = tf.concat(energy_pos, axis=0)

        print("Building graph...")
        x_mod = x_orig = X_NOISE_SPLIT[j]

        x_grads = []
        energy_negs = []
        loss_energys = []

        energy_negs.extend([model.forward(tf.stop_gradient(x_mod), weights[0],
                                          label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)])
        eps_begin = tf.zeros(1)

        steps = tf.constant(0)
        c = lambda i, x: tf.less(i, FLAGS.num_steps)

        def langevin_step(counter, x_mod):
            x_mod = x_mod + tf.random_normal(
                tf.shape(x_mod),
                mean=0.0,
                stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            energy_noise = energy_start = tf.concat(
                [model.forward(x_mod, weights[0], label=LABEL_SPLIT[j],
                               reuse=True, stop_at_grad=False, stop_batch=True)],
                axis=0)

            x_grad, label_grad = tf.gradients(FLAGS.temperature * energy_noise,
                                              [x_mod, LABEL_SPLIT[j]])
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm, FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    assert False

            # Clip gradient norm for now
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * model.forward(x, weights[0],
                                                             label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - lr * x_grad

            x_mod = x_last
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod

        steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod))

        energy_eval = model.forward(x_mod, weights[0], label=LABEL_SPLIT[j],
                                    stop_at_grad=False, reuse=True)
        x_grad = tf.gradients(FLAGS.temperature * energy_eval, [x_mod])[0]
        x_grads.append(x_grad)

        energy_negs.append(model.forward(tf.stop_gradient(x_mod), weights[0],
                                         label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True))

        test_x_mod = x_mod

        temp = FLAGS.temperature

        energy_neg = energy_negs[-1]
        x_off = tf.reduce_mean(tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

        loss_energy = model.forward(x_mod, weights[0], reuse=True, label=LABEL, stop_grad=True)

        print("Finished processing loop construction ...")

        target_vars = {}

        if FLAGS.cclass or FLAGS.model_cclass:
            label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
            label_prob = label_sum / tf.reduce_sum(label_sum)
            label_ent = -tf.reduce_sum(label_prob * tf.math.log(label_prob + 1e-7))
        else:
            label_ent = tf.zeros(1)

        target_vars['label_ent'] = label_ent

        if FLAGS.train:
            if FLAGS.objective == 'logsumexp':
                pos_term = temp * energy_pos
                energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'cd':
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = -tf.reduce_mean(temp * energy_neg)
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'softplus':
                loss_ml = FLAGS.ml_coeff * \
                    tf.nn.softplus(temp * (energy_pos - energy_neg))

            loss_total = tf.reduce_mean(loss_ml)

            if not FLAGS.zero_kl:
                loss_total = loss_total + tf.reduce_mean(loss_energy)

            loss_total = loss_total + \
                FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) +
                                  tf.reduce_mean(tf.square(energy_neg)))

            print("Started gradient computation...")
            gvs = optimizer.compute_gradients(loss_total)
            gvs = [(k, v) for (k, v) in gvs if k is not None]

            print("Applying gradients...")

            tower_grads.append(gvs)

            print("Finished applying gradients.")

            target_vars['loss_ml'] = loss_ml
            target_vars['total_loss'] = loss_total
            target_vars['loss_energy'] = loss_energy
            target_vars['weights'] = weights
            target_vars['gvs'] = gvs

        target_vars['X'] = X
        target_vars['Y'] = Y
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['X_NOISE'] = X_NOISE
        target_vars['energy_pos'] = energy_pos
        target_vars['energy_start'] = energy_negs[0]

        if len(x_grads) >= 1:
            target_vars['x_grad'] = x_grads[-1]
            target_vars['x_grad_first'] = x_grads[0]
        else:
            target_vars['x_grad'] = tf.zeros(1)
            target_vars['x_grad_first'] = tf.zeros(1)

        target_vars['x_mod'] = x_mod
        target_vars['x_off'] = x_off
        target_vars['temp'] = temp
        target_vars['energy_neg'] = energy_neg
        target_vars['test_x_mod'] = test_x_mod
        target_vars['eps_begin'] = eps_begin

    if FLAGS.train:
        grads = average_gradients(tower_grads)
        train_op = optimizer.apply_gradients(grads)
        target_vars['train_op'] = train_op

    config = tf.ConfigProto()

    if hvd.size() > 1:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=30, keep_checkpoint_every_n_hours=6)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    print("Initializing variables...")
    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    print("Start broadcast")
    sess.run(hvd.broadcast_global_variables(0))
    print("End broadcast")

    if FLAGS.train:
        print("Training phase")
        train(target_vars, saver, sess, logger, data_loader, resume_itr, logdir)
    print("Testing phase")
    test(target_vars, saver, sess, logger, data_loader)
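# ---------------------------------------------------------------------------
# Illustration only: the three FLAGS.objective variants above, written out in
# NumPy for a toy batch of energies. The function name and input values are
# hypothetical; the expressions mirror the TF graph (temperature scaling, the
# stop-gradient self-normalized weights of 'logsumexp', plain contrastive
# divergence, and a smooth softplus margin).
import numpy as np

def toy_losses(energy_pos, energy_neg, temp=1.0, ml_coeff=1.0):
    # 'cd': push positive (data) energies down, negative (sampled) energies up.
    loss_cd = ml_coeff * (np.mean(temp * energy_pos) - np.mean(temp * energy_neg))

    # 'logsumexp': weight negative samples by their Boltzmann weights instead
    # of uniformly (weights are treated as constants, hence no gradient in TF).
    e_red = energy_neg - energy_neg.min()
    coeff = np.exp(-temp * e_red)
    norm = coeff.sum() + 1e-4
    loss_lse = ml_coeff * (np.mean(temp * energy_pos) +
                           np.sum(coeff * (-temp * energy_neg) / norm))

    # 'softplus': a smooth margin between positive and negative energies.
    loss_sp = ml_coeff * np.mean(np.log1p(np.exp(temp * (energy_pos - energy_neg))))
    return loss_cd, loss_lse, loss_sp

print(toy_losses(np.array([0.1, -0.2]), np.array([0.5, 0.3])))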
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")

    env = StarCraft2Env(map_name="8m",
                        reward_only_positive=False,
                        reward_scale_rate=200,
                        state_last_action=True,
                        obs_last_action=True,
                        obs_timestep_number=True,
                        state_timestep_number=True)  # reward_defeat=-200
    env_info = env.get_env_info()

    n_episodes = 2500  # 4000 #2000
    timesteps = 500000
    n_agents = env_info["n_agents"]
    n_actions = env_info["n_actions"]
    output_len = n_actions
    lr = 0.002
    buffer_size = 70000  # int(timesteps * 0.1) # 80000 # reduce this; ideally ~1/10 of the training steps. 70000 test 200 80000 20000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    local_obs_len = 179  # local obs: 80; global state: 168
    global_state_len = 348  # 179 + 169

    hidden_vector_len = 256  # 128 # 1 256
    tau = 0.001
    num_exploring = buffer_size  # buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    critic_output_len = 1

    logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr, start_time)

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         local_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, global_state_len, critic_output_len,
                           hidden_vector_len, n_actions)

    sess.run(tf.global_variables_initializer())

    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)

    action_noise.reset()

    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    step_train = 0

    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        local_obs = env.get_obs()
        local_obs = np.array(local_obs)
        global_state = env.get_state()
        global_state_expand = np.zeros(
            [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]])
        reward_hl_own_old = []
        reward_hl_en_old = []
        episode_reward_agent = [0 for n in range(n_agents)]

        # Concatenate each agent's local observation with the global state.
        for i in range(local_obs.shape[0]):
            global_state_expand[i] = np.append(local_obs[i], global_state.flatten())
            reward_hl_own_old.append(env.get_agent_health(i))
            reward_hl_en_old.append(env.get_enemy_health(i))

        while not terminated:
            t = t + 1
            critic_input = np.expand_dims(global_state_expand, axis=0)
            actor_input = np.expand_dims(local_obs, axis=0)
            action = actor.predict(actor_input)[0]
            act_with_noise = action  # np.clip(action + action_noise.get_noise(step_train), action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

            # Renormalize each agent's scores over its available actions and sample.
            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]

                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act

                index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel())
                actions.append(index)

                # A unit whose only available action is no-op (0) is dead.
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    dead_unit.append(agent_id)

            reward_base, terminated, info = env.step(actions)

            new_local_obs = env.get_obs()
            new_local_obs = np.array(new_local_obs)
            new_global_state = env.get_state()
            new_global_state_expand = np.zeros([
                new_local_obs.shape[0],
                new_local_obs.shape[1] + new_global_state.shape[0]
            ])
            reward_hl_own_new = []
            reward_hl_en_new = []

            for i in range(new_local_obs.shape[0]):
                new_global_state_expand[i] = np.append(new_local_obs[i],
                                                       new_global_state.flatten())
                reward_hl_own_new.append(env.get_agent_health(i))
                reward_hl_en_new.append(env.get_enemy_health(i))

            # Per-agent shaped reward based on dealt damage and own health change.
            for i in range(n_agents):
                if i in dead_unit:
                    rew_expand[i] = 0
                else:
                    rew_expand[i] = -0.05
                    if actions[i] > 5:
                        target_id = actions[i] - 6
                        health_reduce_en = reward_hl_en_old[target_id] - reward_hl_en_new[target_id]
                        if health_reduce_en > 0:
                            rew_expand[i] += 2 + health_reduce_en * 5
                            # if (reward_base > 50):
                            #     rew_expand[i] += 20
                        else:
                            rew_expand[i] += 1
                    else:
                        rew_expand[i] += (reward_hl_own_new[i] - reward_hl_own_old[i]) * 5

                    # if (terminated):
                    if info["battle_won"] is False:
                        rew_expand[i] += -10
                    else:
                        rew_expand[i] += 10

                episode_reward_agent[i] += rew_expand[i]

            replay_buffer.add(local_obs, global_state_expand, act_with_noise, rew_expand,
                              terminated, new_local_obs, new_global_state_expand)

            episode_reward += reward_base
            local_obs = new_local_obs
            global_state_expand = new_global_state_expand

            if t == num_exploring:
                print("training starts")

            if t >= num_exploring:
                local_s_batch, global_s_batch, a_batch, r_batch, done_batch, \
                    local_s2_batch, global_s2_batch = replay_buffer.sample_batch(
                        batch_size)  # [group0: [batch_size, trace.dimension], group1, ..., group8]
                target_q = r_batch + gamma * critic.predict_target(
                    global_s2_batch, actor.predict_target(local_s2_batch))
                predicted_q_value, _ = critic.train(
                    global_s_batch, a_batch,
                    np.reshape(target_q, (batch_size, num_agents, critic_output_len)))
                a_outs = actor.predict(local_s_batch)  # a_outs and a_batch are identical
                grads = critic.action_gradients(global_s_batch, a_outs)  # gradient of Q w.r.t. the actions
                actor.train(local_s_batch, grads)
                step_train = step_train + 1

                actor.update_target_network()
                critic.update_target_network()

                if t % save_freq == 0:
                    model_file_save = os.path.join(
                        "model/" + str(step_train) + "_" + "training_steps_model/", "8m")
                    U.save_state(model_file_save)
                    print("Model has been saved after %s training steps" % step_train)
                    # replay_buffer.save()

        print("steps until now : %s, episode: %s, episode reward: %s" % (t, e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward_episode", episode_reward)
        for i in range(n_agents):
            logger.record_tabular("reward_agent_" + str(i), episode_reward_agent[i])

        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)

    env.close()
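# ---------------------------------------------------------------------------
# Illustration only: the available-action sampling used in the BicNet loop
# above, in isolation. The actor emits a score in [-1, 1] per action; scores
# are shifted to [0, 1], masked to the currently available actions, and
# renormalized into a categorical distribution. Inputs here are made up.
import numpy as np

def sample_masked_action(actor_out, avail_mask):
    probs = (actor_out + 1) / 2                 # [-1, 1] -> [0, 1]
    avail_ind = np.nonzero(avail_mask)[0]
    p = probs[avail_ind]
    if p.sum() == 0:
        p = np.ones_like(p) / len(p)            # all-zero scores: fall back to uniform
    else:
        p = p / p.sum()                         # renormalize over available actions
    return np.random.choice(avail_ind, p=p)

actor_out = np.array([0.2, -0.9, 0.5, -1.0, 0.7])
avail_mask = np.array([0, 1, 1, 0, 1])          # only actions 1, 2, 4 are legal
print(sample_masked_action(actor_out, avail_mask))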
def main():
    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    logger = TensorBoardOutputFormat(logdir)

    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    LABEL = None
    print("Loading data...")

    if FLAGS.dataset == 'cubes':
        dataset = Cubes(cond_idx=FLAGS.cond_idx)
        test_dataset = dataset

        if FLAGS.cond_idx == 0:
            label_size = 2
        elif FLAGS.cond_idx == 1:
            label_size = 1
        elif FLAGS.cond_idx == 2:
            label_size = 3
        elif FLAGS.cond_idx == 3:
            label_size = 20

        LABEL = tf.placeholder(shape=(None, label_size), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, label_size), dtype=tf.float32)
    elif FLAGS.dataset == 'color':
        dataset = CubesColor()
        test_dataset = dataset
        LABEL = tf.placeholder(shape=(None, 301), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 301), dtype=tf.float32)
        label_size = 301
    elif FLAGS.dataset == 'pos':
        dataset = CubesPos()
        test_dataset = dataset
        LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        label_size = 2
    elif FLAGS.dataset == "pairs":
        dataset = Pairs(cond_idx=0)
        test_dataset = dataset
        LABEL = tf.placeholder(shape=(None, 6), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 6), dtype=tf.float32)
        label_size = 6
    elif FLAGS.dataset == "continual":
        dataset = CubesContinual()
        test_dataset = dataset

        if FLAGS.prelearn_model_shape:
            LABEL = tf.placeholder(shape=(None, 20), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 20), dtype=tf.float32)
            label_size = 20
        else:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            label_size = 2
    elif FLAGS.dataset == "cross":
        dataset = CubesCrossProduct(FLAGS.ratio, cond_size=FLAGS.cond_size,
                                    cond_pos=FLAGS.cond_pos,
                                    joint_baseline=FLAGS.joint_baseline)
        test_dataset = dataset

        if FLAGS.cond_size:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            label_size = 1
        elif FLAGS.cond_pos:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            label_size = 2

        if FLAGS.joint_baseline:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            label_size = 3
    elif FLAGS.dataset == 'celeba':
        dataset = CelebA(cond_idx=FLAGS.celeba_cond_idx)
        test_dataset = dataset
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)

        model = ResNet128(num_channels=channel_num, num_filters=64, classes=2)

    if FLAGS.joint_baseline:
        # Other stuff for joint model
        optimizer = AdamOptimizer(FLAGS.lr, beta1=0.99, beta2=0.999)

        X = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        X_NOISE = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        ATTENTION_MASK = tf.placeholder(shape=(None, 64, 64, FLAGS.cond_func), dtype=tf.float32)
        NOISE = tf.placeholder(shape=(None, 128), dtype=tf.float32)
        HIER_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)

        channel_num = 3
        model = CubesNetGen(num_channels=channel_num, label_size=label_size)
        weights = model.construct_weights('context_0')
        output = model.forward(NOISE, weights, reuse=False, label=LABEL)
        print(output.get_shape())
        mse_loss = tf.reduce_mean(tf.square(output - X))
        gvs = optimizer.compute_gradients(mse_loss)
        train_op = optimizer.apply_gradients(gvs)
        gvs = [(k, v) for (k, v) in gvs if k is not None]

        target_vars = {}
        target_vars['train_op'] = train_op
        target_vars['X'] = X
        target_vars['X_NOISE'] = X_NOISE
        target_vars['ATTENTION_MASK'] = ATTENTION_MASK
        target_vars['eps_begin'] = tf.zeros(1)
        target_vars['gvs'] = gvs
        target_vars['energy_pos'] = tf.zeros(1)
        target_vars['energy_neg'] = tf.zeros(1)
        target_vars['loss_energy'] = tf.zeros(1)
        target_vars['loss_ml'] = tf.zeros(1)
        target_vars['total_loss'] = mse_loss
        target_vars['attention_mask'] = tf.zeros(1)
        target_vars['attention_grad'] = tf.zeros(1)
        target_vars['x_off'] = tf.reduce_mean(tf.abs(output - X))
        target_vars['x_mod'] = tf.zeros(1)
        target_vars['x_grad'] = tf.zeros(1)
        target_vars['NOISE'] = NOISE
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['HIER_LABEL'] = HIER_LABEL

        data_loader = DataLoader(dataset, batch_size=FLAGS.batch_size,
                                 num_workers=FLAGS.data_workers, drop_last=True, shuffle=True)
    else:
        print("label size here ", label_size)
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        HEIR_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        ATTENTION_MASK = tf.placeholder(shape=(None, 64, 64, FLAGS.cond_func), dtype=tf.float32)

        if FLAGS.dataset != "celeba":
            model = CubesNet(num_channels=channel_num, label_size=label_size)

        heir_model = HeirNet(num_channels=FLAGS.cond_func)

        models_pretrain = []

        # Load prelearned models; their checkpoints were saved under scope
        # 'context_0', so variable names are remapped before restoring.
        if FLAGS.prelearn_model:
            model_prelearn = CubesNet(num_channels=channel_num,
                                      label_size=FLAGS.prelearn_label)
            weights = model_prelearn.construct_weights('context_1')
            LABEL_PRELEARN = tf.placeholder(shape=(None, FLAGS.prelearn_label),
                                            dtype=tf.float32)
            models_pretrain.append((model_prelearn, weights, LABEL_PRELEARN))

            cubes_logdir = osp.join(FLAGS.logdir, FLAGS.prelearn_exp)
            if FLAGS.prelearn_iter != -1 or not FLAGS.train:
                model_file = osp.join(cubes_logdir, 'model_{}'.format(FLAGS.prelearn_iter))
                resume_itr = FLAGS.resume_iter
                # saver.restore(sess, model_file)

                v_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='context_{}'.format(1))
                v_map = {(v.name.replace('context_{}'.format(1), 'context_0')[:-2]): v
                         for v in v_list}
                saver = tf.train.Saver(v_map)
                saver.restore(sess, model_file)

        if FLAGS.prelearn_model_shape:
            model_prelearn = CubesNet(num_channels=channel_num,
                                      label_size=FLAGS.prelearn_label_shape)
            weights = model_prelearn.construct_weights('context_2')
            LABEL_PRELEARN = tf.placeholder(shape=(None, FLAGS.prelearn_label_shape),
                                            dtype=tf.float32)
            models_pretrain.append((model_prelearn, weights, LABEL_PRELEARN))

            cubes_logdir = osp.join(FLAGS.logdir, FLAGS.prelearn_exp_shape)
            if FLAGS.prelearn_iter_shape != -1 or not FLAGS.train:
                model_file = osp.join(cubes_logdir,
                                      'model_{}'.format(FLAGS.prelearn_iter_shape))
                resume_itr = FLAGS.resume_iter
                # saver.restore(sess, model_file)

                v_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='context_{}'.format(2))
                v_map = {(v.name.replace('context_{}'.format(2), 'context_0')[:-2]): v
                         for v in v_list}
                saver = tf.train.Saver(v_map)
                saver.restore(sess, model_file)

        print("Done loading...")

        data_loader = DataLoader(dataset, batch_size=FLAGS.batch_size,
                                 num_workers=FLAGS.data_workers, drop_last=True, shuffle=True)
        batch_size = FLAGS.batch_size

        weights = model.construct_weights('context_0')

        if FLAGS.heir_mask:
            weights = heir_model.construct_weights('heir_0', weights=weights)

        Y = tf.placeholder(shape=(None), dtype=tf.int32)

        # Variables to run in training
        X_SPLIT = tf.split(X, FLAGS.num_gpus)
        X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
        LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
        LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
        LABEL_SPLIT_INIT = list(LABEL_SPLIT)

        attention_mask = ATTENTION_MASK
        tower_grads = []
        tower_gen_grads = []
        x_mod_list = []

        optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.99)

        for j in range(FLAGS.num_gpus):
            x_mod = X_SPLIT[j]
            if FLAGS.comb_mask:
                steps = tf.constant(0)
                c = lambda i, x: tf.less(i, FLAGS.num_steps)

                # Langevin dynamics over the attention mask only.
                def langevin_attention_step(counter, attention_mask):
                    attention_mask = attention_mask + tf.random_normal(
                        tf.shape(attention_mask), mean=0.0, stddev=0.01)
                    energy_noise = energy_start = model.forward(
                        x_mod,
                        weights,
                        attention_mask,
                        label=LABEL_SPLIT[j],
                        reuse=True,
                        stop_at_grad=False,
                        stop_batch=True)

                    if FLAGS.heir_mask:
                        energy_heir = 1.00 * heir_model.forward(attention_mask, weights,
                                                                label=HEIR_LABEL)
                        energy_noise = energy_noise + energy_heir

                    attention_grad = tf.gradients(
                        FLAGS.temperature * energy_noise, [attention_mask])[0]
                    energy_noise_old = energy_noise

                    # Clip gradient norm for now
                    attention_mask = attention_mask - (FLAGS.attention_lr) * attention_grad
                    attention_mask = tf.layers.average_pooling2d(attention_mask, (3, 3), 1,
                                                                 padding='SAME')
                    attention_mask = tf.stop_gradient(attention_mask)

                    counter = counter + 1

                    return counter, attention_mask

                steps, attention_mask = tf.while_loop(c, langevin_attention_step,
                                                      (steps, attention_mask))

                # attention_mask = tf.Print(attention_mask, [attention_mask])

                energy_pos = model.forward(
                    X_SPLIT[j],
                    weights,
                    tf.stop_gradient(attention_mask),
                    label=LABEL_POS_SPLIT[j],
                    stop_at_grad=False)

                if FLAGS.heir_mask:
                    energy_heir = 1.00 * heir_model.forward(attention_mask, weights,
                                                            label=HEIR_LABEL)
                    energy_pos = energy_heir + energy_pos
            else:
                energy_pos = model.forward(
                    X_SPLIT[j],
                    weights,
                    attention_mask,
                    label=LABEL_POS_SPLIT[j],
                    stop_at_grad=False)

                if FLAGS.heir_mask:
                    energy_heir = 1.00 * heir_model.forward(attention_mask, weights,
                                                            label=HEIR_LABEL)
                    energy_pos = energy_heir + energy_pos

            print("Building graph...")
            x_mod = x_orig = X_NOISE_SPLIT[j]

            x_grads = []
            loss_energys = []

            eps_begin = tf.zeros(1)

            steps = tf.constant(0)
            c_cond = lambda i, x, y: tf.less(i, FLAGS.num_steps)

            # Joint Langevin dynamics over the sample and the attention mask.
            def langevin_step(counter, x_mod, attention_mask):
                lr = FLAGS.step_lr

                x_mod = x_mod + tf.random_normal(
                    tf.shape(x_mod),
                    mean=0.0,
                    stddev=0.001 * FLAGS.rescale * FLAGS.noise_scale)
                attention_mask = attention_mask + tf.random_normal(
                    tf.shape(attention_mask), mean=0.0, stddev=0.01)

                energy_noise = model.forward(
                    x_mod,
                    weights,
                    attention_mask,
                    label=LABEL_SPLIT[j],
                    reuse=True,
                    stop_at_grad=False,
                    stop_batch=True)

                if FLAGS.prelearn_model:
                    # Compose energies of the prelearned models with the current one.
                    for m_i, w_i, l_i in models_pretrain:
                        energy_noise = energy_noise + m_i.forward(
                            x_mod,
                            w_i,
                            attention_mask,
                            label=l_i,
                            reuse=True,
                            stop_at_grad=False,
                            stop_batch=True)

                if FLAGS.heir_mask:
                    energy_heir = 1.00 * heir_model.forward(attention_mask, weights,
                                                            label=HEIR_LABEL)
                    energy_noise = energy_heir + energy_noise

                x_grad, attention_grad = tf.gradients(
                    FLAGS.temperature * energy_noise, [x_mod, attention_mask])

                if not FLAGS.comb_mask:
                    attention_grad = tf.zeros(1)

                energy_noise_old = energy_noise

                if FLAGS.proj_norm != 0.0:
                    if FLAGS.proj_norm_type == 'l2':
                        x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                    elif FLAGS.proj_norm_type == 'li':
                        x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm, FLAGS.proj_norm)
                    else:
                        print("Other types of projection are not supported!!!")
                        assert False

                # Clip gradient norm for now
                x_last = x_mod - lr * x_grad

                if FLAGS.comb_mask:
                    attention_mask = attention_mask - FLAGS.attention_lr * attention_grad
                    attention_mask = tf.layers.average_pooling2d(attention_mask, (3, 3), 1,
                                                                 padding='SAME')
                    attention_mask = tf.stop_gradient(attention_mask)

                x_mod = x_last
                x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

                counter = counter + 1

                return counter, x_mod, attention_mask

            steps, x_mod, attention_mask = tf.while_loop(c_cond, langevin_step,
                                                         (steps, x_mod, attention_mask))

            attention_mask = tf.stop_gradient(attention_mask)
            # attention_mask = tf.Print(attention_mask, [attention_mask])

            energy_eval = model.forward(x_mod, weights, attention_mask,
                                        label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)
            x_grad, attention_grad = tf.gradients(FLAGS.temperature * energy_eval,
                                                  [x_mod, attention_mask])
            x_grads.append(x_grad)

            energy_neg = model.forward(
                tf.stop_gradient(x_mod),
                weights,
                tf.stop_gradient(attention_mask),
                label=LABEL_SPLIT[j],
                stop_at_grad=False,
                reuse=True)

            if FLAGS.heir_mask:
                energy_heir = 1.00 * heir_model.forward(attention_mask, weights,
                                                        label=HEIR_LABEL)
                energy_neg = energy_heir + energy_neg

            temp = FLAGS.temperature

            x_off = tf.reduce_mean(tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

            loss_energy = model.forward(x_mod, weights, attention_mask, reuse=True,
                                        label=LABEL, stop_grad=True)

            print("Finished processing loop construction ...")

            target_vars = {}

            if FLAGS.antialias:
                antialias = tf.tile(stride_3, (1, 1, tf.shape(x_mod)[3], tf.shape(x_mod)[3]))
                inp = tf.nn.conv2d(x_mod, antialias, [1, 2, 2, 1], padding='SAME')

            test_x_mod = x_mod

            if FLAGS.cclass or FLAGS.model_cclass:
                label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
                label_prob = label_sum / tf.reduce_sum(label_sum)
                label_ent = -tf.reduce_sum(label_prob * tf.math.log(label_prob + 1e-7))
            else:
                label_ent = tf.zeros(1)

            target_vars['label_ent'] = label_ent

            if FLAGS.train:
                if FLAGS.objective == 'logsumexp':
                    pos_term = temp * energy_pos
                    energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                    coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                    norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                    pos_loss = tf.reduce_mean(temp * energy_pos)
                    neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                    loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
                elif FLAGS.objective == 'cd':
                    pos_loss = tf.reduce_mean(temp * energy_pos)
                    neg_loss = -tf.reduce_mean(temp * energy_neg)
                    loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
                elif FLAGS.objective == 'softplus':
                    loss_ml = FLAGS.ml_coeff * \
                        tf.nn.softplus(temp * (energy_pos - energy_neg))

                loss_total = tf.reduce_mean(loss_ml)

                if not FLAGS.zero_kl:
                    loss_total = loss_total + tf.reduce_mean(loss_energy)

                loss_total = loss_total + \
                    FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) +
                                      tf.reduce_mean(tf.square(energy_neg)))

                print("Started gradient computation...")
                gvs = optimizer.compute_gradients(loss_total)
                gvs = [(k, v) for (k, v) in gvs if k is not None]

                print("Applying gradients...")

                tower_grads.append(gvs)

                print("Finished applying gradients.")

                target_vars['loss_ml'] = loss_ml
                target_vars['total_loss'] = loss_total
                target_vars['loss_energy'] = loss_energy
                target_vars['weights'] = weights
                target_vars['gvs'] = gvs

            target_vars['X'] = X
            target_vars['Y'] = Y
            target_vars['LABEL'] = LABEL
            target_vars['HIER_LABEL'] = HEIR_LABEL
            target_vars['LABEL_POS'] = LABEL_POS
            target_vars['X_NOISE'] = X_NOISE
            target_vars['energy_pos'] = energy_pos
            target_vars['attention_grad'] = attention_grad

            if len(x_grads) >= 1:
                target_vars['x_grad'] = x_grads[-1]
                target_vars['x_grad_first'] = x_grads[0]
            else:
                target_vars['x_grad'] = tf.zeros(1)
                target_vars['x_grad_first'] = tf.zeros(1)

            target_vars['x_mod'] = x_mod
            target_vars['x_off'] = x_off
            target_vars['temp'] = temp
            target_vars['energy_neg'] = energy_neg
            target_vars['test_x_mod'] = test_x_mod
            target_vars['eps_begin'] = eps_begin
            target_vars['ATTENTION_MASK'] = ATTENTION_MASK
            target_vars['models_pretrain'] = models_pretrain

            if FLAGS.comb_mask:
                target_vars['attention_mask'] = tf.nn.softmax(attention_mask)
            else:
                target_vars['attention_mask'] = tf.zeros(1)

        if FLAGS.train:
            grads = average_gradients(tower_grads)
            train_op = optimizer.apply_gradients(grads)
            target_vars['train_op'] = train_op

    # sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=30, keep_checkpoint_every_n_hours=6)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    print("Initializing variables...")
    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    if FLAGS.resume_iter != -1 or not FLAGS.train:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    if FLAGS.train:
        train(target_vars, saver, sess, logger, data_loader, resume_itr, logdir)

    test(target_vars, saver, sess, logger, data_loader)
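# ---------------------------------------------------------------------------
# Illustration only: the scope-remapping restore pattern used above for the
# prelearned models, pulled out into a helper. A checkpoint whose variables
# were saved under one scope is loaded into live variables under another
# scope by giving tf.train.Saver an explicit name map. The scope names and
# checkpoint path below are hypothetical.
import tensorflow as tf

def restore_with_scope_remap(sess, model_file, live_scope='context_1',
                             ckpt_scope='context_0'):
    v_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=live_scope)
    # Map checkpoint name -> live variable; [:-2] strips the ':0' suffix
    # from each variable's name.
    v_map = {v.name.replace(live_scope, ckpt_scope)[:-2]: v for v in v_list}
    saver = tf.train.Saver(v_map)
    saver.restore(sess, model_file)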
import os
from glob import glob

import tensorflow as tf
from baselines.logger import TensorBoardOutputFormat

# Collect every TensorBoard event file under the result directory.
tb_files = glob(os.path.join(os.path.dirname(__file__), '../../result',
                             '**/tb/*tfevents*'), recursive=True)

for file in tb_files:
    run_dir = os.path.join(os.path.dirname(file), os.path.pardir)
    tb = TensorBoardOutputFormat(os.path.join(run_dir, 'tb1'))
    for e in tf.train.summary_iterator(file):
        for v in e.summary.value:
            if v.tag == 'EpExecMean':
                v.simple_value = v.simple_value * 1000  # seconds to ms
        # Write the (possibly rescaled) event into the new 'tb1' log.
        tb.writekvs_ev(e)
    tb.close()
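# ---------------------------------------------------------------------------
# Optional sanity check (illustration only, assuming the 'tb1' directories
# written above exist): read the rewritten event files back and confirm the
# tag now carries millisecond values. Reuses the imports from the script above.
check_files = glob(os.path.join(os.path.dirname(__file__), '../../result',
                                '**/tb1/*tfevents*'), recursive=True)
for f in check_files:
    for e in tf.train.summary_iterator(f):
        for v in e.summary.value:
            if v.tag == 'EpExecMean':
                print(f, e.step, v.simple_value)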