def config_log(FLAGS):
    logdir = "tensorboard/%s/hrl_a2c_svib/%s_lr%s_%s/%s_%s_%s" % (
        FLAGS.env, FLAGS.num_timesteps, '0.0007', FLAGS.policy, start_time, FLAGS.train_option, str(FLAGS.beta))
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(dir=logdir, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(dir=logdir, output_formats=[HumanOutputFormat(sys.stdout)])
Example 2
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])
    print("env : %s" % FLAGS.env)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("lr : %s" % FLAGS.lr)
    # Choose which RL algorithm to train.
    if FLAGS.algorithm == "deepq":  # Use DQN
        train_dqn(env_id=FLAGS.env, num_timesteps=FLAGS.timesteps)
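
The two functions above assemble TensorBoard run directories with %-formatting. As a side note, the same layout can be built with os.path.join and f-strings; the sketch below uses a hypothetical helper name and assumed flag values, and is not part of the original scripts.

import datetime
import os


def build_logdir(algorithm, timesteps, exploration_fraction, prioritized, dueling, lr):
    # Hypothetical helper: reproduces the directory layout of the %-formatted
    # strings above ("tensorboard/<algo>/<run settings>/<start time>").
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    run_name = f"{timesteps}_{exploration_fraction}_prio{prioritized}_duel{dueling}_lr{lr}"
    return os.path.join("tensorboard", algorithm, run_name, start_time)


# build_logdir("deepq", 2000000, 0.5, True, True, 0.0005)
# -> "tensorboard/deepq/2000000_0.5_prioTrue_duelTrue_lr0.0005/<start time>"
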
Example 3
def main():
  FLAGS(sys.argv)

  logdir = "tensorboard"
  if(FLAGS.algorithm == "deepq"):
    logdir = "./tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.exploration_fraction,
      FLAGS.prioritized,
      FLAGS.dueling,
      FLAGS.lr,
      start_time
    )

  if (FLAGS.log == "tensorboard"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[TensorBoardOutputFormat(logdir)])

  elif (FLAGS.log == "stdout"):
    os.makedirs(logdir, exist_ok=True)  # os.mkdir would fail if the directory already exists
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[HumanOutputFormat(logdir + "/log.txt")])

  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",
      minimap_size_px=(FLAGS.minimap_size_px, FLAGS.minimap_size_px),
      step_mul=FLAGS.step_mul,
      visualize=FLAGS.visualize,
      game_steps_per_episode=FLAGS.episode_steps) as env:

    model = deepq.models.cnn_to_mlp(
      convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1), (64, 3, 1), (32, 3, 1)],
      hiddens=[256],
      dueling=True
    )

    act = dqfd.learn(
      env,
      q_func=model,
      num_actions=FLAGS.num_actions,
      lr=FLAGS.lr,
      print_freq=FLAGS.print_freq,
      max_timesteps=FLAGS.timesteps,
      buffer_size=FLAGS.buffer_size,
      exploration_fraction=FLAGS.exploration_fraction,
      exploration_final_eps=FLAGS.exploration_final_eps,
      train_freq=FLAGS.train_freq,
      learning_starts=FLAGS.learning_starts,
      target_network_update_freq=FLAGS.target_network_update_freq,
      gamma=FLAGS.gamma,
      prioritized_replay=FLAGS.prioritized,
      callback=deepq_callback
    )
    act.save("defeat_zerglings.pkl")
def __init__(self, act_fun=tf.nn.leaky_relu):
    print("Local rank: ", hvd.local_rank(), hvd.size())
    self.logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if hvd.rank() == 0:
        if not osp.exists(self.logdir):
            os.makedirs(self.logdir)
        self.logger = TensorBoardOutputFormat(self.logdir)
    else:
        self.logger = None
    self.act_fun = act_fun
    self.target_vars, self.saver, self.sess, self.resume_itr = setup(self.act_fun)
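
This constructor only builds a TensorBoardOutputFormat on Horovod rank 0, so a multi-worker run writes a single event directory. A minimal sketch of the same guard as a standalone helper (hypothetical name, assumes hvd.init() has already been called):

import os
import os.path as osp

import horovod.tensorflow as hvd
from baselines.logger import TensorBoardOutputFormat


def make_rank0_logger(logdir):
    # Non-zero ranks get None and should simply skip their logging calls.
    if hvd.rank() != 0:
        return None
    if not osp.exists(logdir):
        os.makedirs(logdir)
    return TensorBoardOutputFormat(logdir)
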
Example 5
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:
        obs = env.reset()
        #print(obs[0].observation)
        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[256],
                                        dueling=True)
        demo_replay = []
        act = dqfd.learn(env,
                         q_func=model,
                         num_actions=3,
                         lr=1e-4,
                         max_timesteps=10000000,
                         buffer_size=100000,
                         exploration_fraction=0.5,
                         exploration_final_eps=0.01,
                         train_freq=2,
                         learning_starts=100000,
                         target_network_update_freq=1000,
                         gamma=0.99,
                         prioritized_replay=True,
                         callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
Example 6
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:

        print(env.observation_spec())
        screen_dim = env.observation_spec()[0]['feature_screen'][1:3]
        print(screen_dim)
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(  # interface.feature_layer.resolution and interface.feature_layer.minimap_resolution
            feature_dimensions=sc2_env.Dimensions(screen=32,
                                                  minimap=32)  # 16 16
            # feature_dimensions = sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # step advance rate; informally, a human player's effective actions per second
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = deepq.models.cnn_to_mlp(  # this model takes an observation as input and returns values for all actions; note how it is used in deepq_mineral_shards.learn
                convs=[(16, 8, 4), (32, 4, 2)],
                hiddens=[256],
                dueling=True)  # each conv tuple is (number of filters, kernel size, stride)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)  # (number of filters, kernel size, stride)
            act = deepq_mineral_shards.learn(  # train the model and save it
                # act = deepq_ActSeparate.learn(  # train the model and save it
                # act=deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  #default 16  num_actions=256   3  4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback
            )  #deepq_callback; deepq_ActSeperate_callback  ;   deepq_actSeparateWith4Directions_callback  deep_DiffActInSameTime_callback
            act.save(
                "mineral_shards.pkl"
            )  # after all training steps, save the trained model to mineral_shards.pkl, used by enjoy_mineral_shards.py

    elif (FLAGS.algorithm == "deepq-4way"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(  #
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            # model = deepq.models.mlp(hiddens=[256,128,4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
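
When FLAGS.lr is 0 this script samples a learning rate uniformly from [1e-5, 1e-3]. A log-uniform draw is a common alternative that spreads samples evenly across orders of magnitude; the sketch below only illustrates that alternative, it is not what the script does.

import math
import random


def sample_lr_log_uniform(low=1e-5, high=1e-3):
    # Uniform in log-space: 1e-5..1e-4 is sampled as often as 1e-4..1e-3.
    return math.exp(random.uniform(math.log(low), math.log(high)))


# lr = round(sample_lr_log_uniform(), 8)
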
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=64, minimap=64
        )  # feature_dimensions=sc2_env.Dimensions(screen=84, minimap=64); process both into 32*32 matrices
    )
    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",  #DefeatZerglingsAndBanelings
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=True,  #True
            game_steps_per_episode=steps * step_mul) as env:

        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[256],
                                        dueling=True)
        # model: needs to be changed to an LSTM form.
        demo_replay = []
        # act = dqfd.learn(
        #   env,
        #   q_func=model,
        #   num_actions=3,
        #   lr=1e-4,
        #   max_timesteps=10000000,
        #   buffer_size=100000,
        #   exploration_fraction=0.5,
        #   exploration_final_eps=0.01,
        #   train_freq=2,
        #   learning_starts=100000,
        #   target_network_update_freq=1000,
        #   gamma=0.99,
        #   prioritized_replay=True,
        #   callback=deepq_callback
        # )
        # act.save("defeat_zerglings.pkl")
        BicNet_findAndDefeatZergling.learn(
            env,
            lr=FLAGS.lr,
            max_timesteps=FLAGS.timesteps,
            buffer_size=100000,
            train_freq=1,
            learning_starts=1000,  #100000,
            target_network_update_freq=1000,
            gamma=0.99,
            callback=BicNet_callback)
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_SELECT_UNIT = actions.FUNCTIONS.select_unit.id
_SELECT_POINT = actions.FUNCTIONS.select_point.id

_NOT_QUEUED = [0]
_SELECT_ALL = [0]

UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

#to record the output
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
logdir = "./tensorboard/enjoy/%s" % start_time
Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[TensorBoardOutputFormat(logdir)])

FLAGS = flags.FLAGS
flags.DEFINE_string("map_name", "DefeatZerglingsAndBanelings", "the map you want to see.")
flags.DEFINE_string("trained_model", "/home/tld/PycharmProjects/DeepQ_StarCraft2/models/deepq/zergling_45.6.pkl",
                    "the model you has trained.")
flags.DEFINE_bool("visualize", True, "if you want to see the game")
flags.DEFINE_integer("num_actions", 4, "numbers of your action")
flags.DEFINE_integer("step_mul", 5, "the time of every step spends")
flags.DEFINE_integer("episode_steps", 2800, "the steps of every episode spends")


def main():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name=FLAGS.map_name,
Example 10
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=32, minimap=32
        ),  # feature_dimensions=sc2_env.Dimensions(screen=84, minimap=64); process both into 32*32 matrices
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # 50000; reduce a bit, ideally about 1/10 of the training steps; 70000  test 200  70000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 2  #9
    vector_obs_len = 736  #33   #4096  # 32*32  1024
    output_len = 4  #3

    hidden_vector_len = 128  #128   #1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=float(stddev) * np.ones(1))
    action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps -
                                     buffer_size)

    # while(steps_left > 0):
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",  #DefeatZerglingsAndBanelings
            # step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,  #True
            game_steps_per_episode=steps * step_mul) as env:

        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  #1000
            gamma=gamma,
            # callback=BicNet_callback,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size  #buffer_size
        )
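
The exploration noise here comes from a project-specific noise_OU.OU_noise object. For reference, a minimal numpy sketch of a standard Ornstein-Uhlenbeck process (assumed theta/sigma values, not the project's implementation):

import numpy as np


class OUNoise:
    # Temporally correlated noise, commonly added to deterministic continuous actions.
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state


# noise = OUNoise(size=(num_agents, output_len))
# act = np.clip(actor_output + noise.sample(), -1, 1)
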
Example 11
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "deepq-4way"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    FLAGS(sys.argv)

    steps = 0  #Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too restricting
        # We need this change because sc2 now requires specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                #players=[sc2_env.Agent(sc2_env.Race.terran),sc2_env.Agent(sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)

            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)

            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="Simple64",
                            players=[
                                sc2_env.Agent(race=sc2_env.Race.terran),
                                sc2_env.Agent(race=sc2_env.Race.terran)
                            ],
                            step_mul=step_mul,
                            agent_interface_format=AGENT_INTERFACE_FORMAT,
                            visualize=True) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
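
This example drives the environment both with a learned policy and with pysc2's RandomAgent via run_loop. For comparison, the smallest possible scripted agent against the pysc2 BaseAgent interface (a sketch, just issuing no-ops) would be:

from pysc2.agents import base_agent
from pysc2.lib import actions


class NoOpAgent(base_agent.BaseAgent):
    # Returns the no-op function call on every step.
    def step(self, obs):
        super(NoOpAgent, self).step(obs)
        return actions.FUNCTIONS.no_op()


# run_loop.run_loop([NoOpAgent()], env, steps) would drive it, as in the example above.
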
Example 13
def main():
    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if not osp.exists(logdir):
        os.makedirs(logdir)
    logger = TensorBoardOutputFormat(logdir)

    datasource = FLAGS.datasource

    def make_env(rank):
        def _thunk():
            # Make the environments non stoppable for now
            if datasource == "maze":
                env = Maze(end=[0.7, -0.8],
                           start=[-0.85, -0.85],
                           random_starts=False)
            elif datasource == "point":
                env = Point(end=[0.5, 0.5],
                            start=[0.0, 0.0],
                            random_starts=True)
            elif datasource == "reacher":
                env = Reacher(end=[0.7, 0.5], eps=0.01)
            env.seed(rank)
            env = Monitor(env,
                          os.path.join("/tmp", str(rank)),
                          allow_early_resets=True)
            return env

        return _thunk

    env = SubprocVecEnv(
        [make_env(i + FLAGS.seed) for i in range(FLAGS.num_env)])

    if FLAGS.datasource == 'point' or FLAGS.datasource == 'maze' or FLAGS.datasource == 'reacher':
        if FLAGS.ff_model:
            model = TrajFFDynamics(dim_input=FLAGS.latent_dim,
                                   dim_output=FLAGS.latent_dim)
        else:
            model = TrajNetLatentFC(dim_input=FLAGS.latent_dim)

        X_NOISE = tf.placeholder(shape=(None, FLAGS.total_frame,
                                        FLAGS.input_objects, FLAGS.latent_dim),
                                 dtype=tf.float32)
        X = tf.placeholder(shape=(None, FLAGS.total_frame, FLAGS.input_objects,
                                  FLAGS.latent_dim),
                           dtype=tf.float32)

        if FLAGS.cond:
            ACTION_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        else:
            ACTION_LABEL = None

        ACTION_NOISE_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        ACTION_PLAN = tf.placeholder(shape=(None, FLAGS.plan_steps + 1, 2),
                                     dtype=tf.float32)

        X_START = tf.placeholder(shape=(None, 1, FLAGS.input_objects,
                                        FLAGS.latent_dim),
                                 dtype=tf.float32)
        X_PLAN = tf.placeholder(shape=(None, FLAGS.plan_steps,
                                       FLAGS.input_objects, FLAGS.latent_dim),
                                dtype=tf.float32)

        if FLAGS.datasource == 'reacher':
            X_END = tf.placeholder(shape=(None, 1, FLAGS.input_objects, 2),
                                   dtype=tf.float32)
        else:
            X_END = tf.placeholder(shape=(None, 1, FLAGS.input_objects,
                                          FLAGS.latent_dim),
                                   dtype=tf.float32)
    else:
        raise AssertionError("Unsupported data source")

    weights = model.construct_weights(action_size=FLAGS.action_dim)
    optimizer = AdamOptimizer(1e-2, beta1=0.0, beta2=0.999)

    if FLAGS.ff_model:
        target_vars = construct_ff_model(model, weights, X_NOISE, X,
                                         ACTION_LABEL, ACTION_NOISE_LABEL,
                                         optimizer)
        target_vars = construct_ff_plan_model(model,
                                              weights,
                                              X_PLAN,
                                              X_START,
                                              X_END,
                                              ACTION_PLAN,
                                              target_vars=target_vars)
    else:
        target_vars = construct_model(model, weights, X_NOISE, X, ACTION_LABEL,
                                      ACTION_NOISE_LABEL, optimizer)
        target_vars = construct_plan_model(model,
                                           weights,
                                           X_PLAN,
                                           X_START,
                                           X_END,
                                           ACTION_PLAN,
                                           target_vars=target_vars)

    sess = tf.InteractiveSession()
    saver = loader = tf.train.Saver(max_to_keep=10,
                                    keep_checkpoint_every_n_hours=2)

    tf.global_variables_initializer().run()
    print("Initializing variables...")

    if FLAGS.resume_iter != -1 or not FLAGS.train:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        saver.restore(sess, model_file)

    train(target_vars, saver, sess, logger, FLAGS.resume_iter, env)
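
The planning script above declares many tf.placeholder tensors (X, X_NOISE, X_START, X_PLAN, ...) whose values are bound only at session run time. A minimal sketch of that TF1-style placeholder / feed_dict pattern in isolation (assumes TF 1.x, or tf.compat.v1 with v2 behavior disabled):

import numpy as np
import tensorflow as tf


x = tf.placeholder(shape=(None, 4), dtype=tf.float32)
y = tf.reduce_sum(tf.square(x), axis=1)

with tf.Session() as sess:
    # Values are supplied for the placeholders only when the graph is executed.
    out = sess.run(y, feed_dict={x: np.random.randn(3, 4)})
    print(out.shape)  # (3,)
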
Example 14
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"], )
                       for i in range(num_adversaries)]
        action_space_n = [
            env_info["n_actions"] for i in range(num_adversaries)
        ]
        buffer_size = arglist.buffer_size

        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        logdir = "./tensorboard/"

        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None,
                     output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()

        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        for agent_id in range(num_agents):  # this first loop collects the initial state / observations / health values
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))

        episode_step = 0
        step = 0

        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif (avail_actions[0] == 1):
                    action_set_execute.append(
                        0)  # if the action cannot be executed and the agent is already dead, substitute NO_OP
                else:
                    action_set_execute.append(1)  # if the action cannot be executed, substitute STOP

                if (len(avail_actions_ind) == 1
                        and avail_actions_ind[0] == 0):  # check whether this agent is already dead
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base
            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []

            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))

            for agent_id in range(num_agents):
                if (agent_id in dead_unit):
                    reward = 0
                elif (action_set_execute[agent_id] !=
                      action_set_actual[agent_id]
                      ):  # when the output action cannot be executed, run the substitute action but keep the original output and give a negative reward
                    reward = -2

                elif (action_set_execute[agent_id] > 5):
                    target_id = action_set_execute[
                        agent_id] - n_actions_no_attack
                    health_reduce_en = reward_hl_en_old[
                        target_id] - reward_hl_en_new[target_id]
                    if (health_reduce_en > 0):
                        if (rew_base > 0):
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] -
                              reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done)

            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s" %
                      (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()

                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                for agent_id in range(num_agents):  # this first loop collects the initial state / observations / health values
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if (step == arglist.buffer_size):
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = arglist.save_dir + "/model_" + str(
                    step) + "steps/" + arglist.exp_name
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".
                          format(step, len(episode_rewards),
                                 np.mean(
                                     episode_rewards[-arglist.save_rate:])))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}"
                        .format(step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
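
The per-agent reward shaping inside the loop above can be read as a small pure function; the sketch below restates that logic under a hypothetical name, with the same branches as the code.

def shape_agent_reward(agent_id, dead_unit, action_exec, action_actual,
                       enemy_hp_old, enemy_hp_new, own_hp_old, own_hp_new,
                       rew_base, n_actions_no_attack=6):
    # Dead agents get 0; substituted actions are penalized; attack actions that
    # reduced the target's health are rewarded; otherwise the reward tracks the
    # agent's own health change, scaled by 5, exactly as in the loop above.
    if agent_id in dead_unit:
        return 0
    if action_exec[agent_id] != action_actual[agent_id]:
        return -2
    if action_exec[agent_id] > 5:
        target_id = action_exec[agent_id] - n_actions_no_attack
        if enemy_hp_old[target_id] - enemy_hp_new[target_id] > 0:
            return 2 + rew_base if rew_base > 0 else 2
        return 1
    return (own_hp_new[agent_id] - own_hp_old[agent_id]) * 5
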
Example 15
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    lr = 0.002
    buffer_size = 80000  # 80000; reduce a bit, ideally about 1/10 of the training steps; 70000  test 200  80000 20000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    vector_obs_len = 248  # local obs:80 ; global state:168;
    output_len = 14
    hidden_vector_len = 256 #128  # 1  256
    tau = 0.001
    num_exploring = buffer_size #buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    # min_life = 45

    env = StarCraft2Env(map_name="8m",reward_only_positive=False, reward_scale_rate=200)  #8m  DefeatZerglingsAndBanelings  reward_scale_rate=200
    env_info = env.get_env_info()

    n_episodes = 4000 #4000    #2000
    # n_actions = env_info["n_actions"]
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]

    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet",
        timesteps,
        16,
        lr,
        start_time
    )

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()
    # state_dim = (n_agents, vector_obs_len)
    # action_dim = (n_agents, output_len)

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
    # actor = ActorNetwork(sess, state_dim, action_dim, lr, tau, batch_size)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents, vector_obs_len,
                              output_len, hidden_vector_len)
    # critic = CriticNetwork(sess, state_dim, action_dim, lr, tau, gamma, actor.get_num_trainable_vars())
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)

    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()


    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        obs = env.get_obs()
        obs = np.array(obs)
        # state, target_attack = env.get_state()
        state, min = env.get_state()
        screen_expand = np.zeros([obs.shape[0],obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i],state.flatten())
        # screen_expand = state_transform(obs)
        # screen_expand = state_expand(state, n_agents)
        while not terminated:
            t = t+1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = np.clip(action + action_noise.get_noise(t - num_exploring), action_low, action_high)
            act_mat_norm = (act_with_noise+1)/2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))
            # punish = []
            # health_agent = []
            # health_enemy = []

            agent_group = []
            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                # print('act_unit_norm',act_unit_norm)
                # act_prob = act_unit_norm / np.sum(act_unit_norm, axis=0)
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act

                # index = np.random.choice(np.arange(0,14), p=act_prob.ravel())
                # print("act_prob",act_prob)
                index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel())
                # if (index in avail_actions_ind):
                #     punish.append(False)
                # else:
                #     punish.append(True)
                #     if (0 in avail_actions_ind):
                #         actions.append(0)
                #     else:
                #         actions.append(1)
                actions.append(index)
                # health_agent.append(state[4*agent_id])
                # health_enemy.append(state[4*n_agents + 3*agent_id])

                # if(index > 5):
                #     target_id = index - 6

                if(len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    dead_unit.append(agent_id)
            # health_agent = np.array(health_agent)
            # for i in range(len(health_enemy)):
            #     if (health_enemy[i] < min_life):
            #         min_life = health_enemy[i]
            # health_enemy = np.array(health_enemy)
            reward, terminated, _ = env.step(actions)
            # rew_expand = np.ones((n_agents, 1))*reward
            # health_enemy_new = []


            for i in range(n_agents):
                if (i not in dead_unit):
                    rew_expand[i] += reward
                    if (actions[i] > 5):
                        enemy_id = actions[i] - 6
                        rew_expand[i] += 1
                        # if(actions[i]-6 == target_attack):
                        for j in range(n_agents):
                            if (actions[j] == actions[i] and i!=j):
                                if (state[4 * n_agents + 3 * enemy_id] == min):
                                    rew_expand[i] += 1
            new_obs = env.get_obs()
            new_obs = np.array(new_obs)
            # new_state, target_attack = env.get_state()
            new_state, min = env.get_state()
            new_screen_expand = np.zeros([new_obs.shape[0], new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i], new_state.flatten())
            # health_agent_new = []
            # for i in range(n_agents):
            #     health_agent_new.append(new_state[4 * i])
            #     # health_enemy_new.append(new_state[4 * n_agents + 3 * i])
            # health_agent_new = np.array(health_agent_new)
            # health_enemy_new = np.array(health_enemy_new)
            # life_reduce_agent = health_agent - health_agent_new
            # life_reduce_agent_all = life_reduce_agent.sum(axis=0)
            # life_reduce_enemy = health_enemy - health_enemy_new
            # life_reduce_enemy_all = life_reduce_enemy.sum(axis=0)
            # reward_base = life_reduce_enemy_all - life_reduce_agent_all
            # for i in range(n_agents):
            #     rew_expand[i] += reward_base+life_reduce_agent[i]

            # for i in range(n_agents):
            #     if (punish[i]):
            #         rew_expand[i] += -2
            #     elif (i in dead_unit):
            #         rew_expand[i] += 0
            #     elif (actions[i] > 5):
            #         rew_expand[i] = 1
            #         if(health_enemy[actions[i] - 6] == min_life):
            #             rew_expand[i] = 1
            #     rew_expand[i] += life_reduce_agent[i]

            replay_buffer.add(screen_expand, act_with_noise, rew_expand, terminated, new_screen_expand)

            episode_reward += reward
            screen_expand = new_screen_expand
            # state = new_state
            # target_attack = target_attack_new

            if(t>=num_exploring):
                print("training starts")
                s_batch, a_batch, r_batch, done_batch, s2_batch = replay_buffer.sample_batch(batch_size)  # [group0:[batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(target_q, (batch_size, num_agents, output_len)))
                a_outs = actor.predict(s_batch)  # a_outs and a_batch are exactly the same
                grads = critic.action_gradients(s_batch, a_outs)  # gradient of Q with respect to the actions a
                actor.train(s_batch, grads)

                actor.update_target_network()
                critic.update_target_network()

                # if(t % save_freq == 0):
                    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
                    # U.save_state(model_file_save)
                    # replay_buffer.save()

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)

        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)

    env.close()
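
The critic update in this training loop computes target_q = r + gamma * Q'(s', mu'(s')), and the actor/critic classes maintain tau-weighted target copies. A numpy sketch of those two pieces of arithmetic, detached from the TensorFlow classes (the example itself does not mask terminal transitions; the done mask below is an addition):

import numpy as np


def td_target(rewards, gamma, q_target_next, done):
    # r + gamma * Q'(s', mu'(s')), with bootstrapping cut off at terminal states.
    return rewards + gamma * (1.0 - done) * q_target_next


def soft_update(target_params, online_params, tau=0.001):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online_params, target_params)]
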
Example 16
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    lr = 0.002
    batch_size = 32  # 32
    num_agents = 8
    vector_obs_len = 248  # local obs:80 ; global state:168;
    output_len = 14
    hidden_vector_len = 256 #128  # 1  256
    tau = 0.001



    env = StarCraft2Env(map_name="8m",reward_only_positive=False, reward_scale_rate=200)  #8m  DefeatZerglingsAndBanelings  reward_scale_rate=200
    env_info = env.get_env_info()

    n_episodes = 4000 #4000    #2000
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]

    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet",
        timesteps,
        16,
        lr,
        start_time
    )

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents, vector_obs_len,
                              output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    model_file_load = os.path.join(str(300000) + "_" + "model_segment_training/", "defeat_zerglings")
    U.load_state(model_file_load, sess)

    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        obs = env.get_obs()
        obs = np.array(obs)
        state, min = env.get_state()
        screen_expand = np.zeros([obs.shape[0],obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i],state.flatten())

        while not terminated:
            t = t+1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = action
            act_mat_norm = (act_with_noise+1)/2
            actions = []

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel())

                actions.append(index)
            reward, terminated, _ = env.step(actions)

            new_obs = env.get_obs()
            new_obs = np.array(new_obs)
            new_state, min = env.get_state()
            new_screen_expand = np.zeros([new_obs.shape[0], new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i], new_state.flatten())

            episode_reward += reward
            screen_expand = new_screen_expand

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)

        logger.dump_tabular()

    env.close()
Example 17
def main():

  # tf.reset_default_graph()
  # config = tf.ConfigProto()
  # config.gpu_options.allow_growth = True

  FLAGS(sys.argv)
  # steps_left = FLAGS.timesteps

  logdir = "tensorboard"
  if(FLAGS.algorithm == "deepq"):
    logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.exploration_fraction,
      FLAGS.prioritized,
      FLAGS.dueling,
      FLAGS.lr,
      start_time
    )
  elif(FLAGS.algorithm == "acktr"):
    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.num_cpu,
      FLAGS.lr,
      start_time
    )
  elif(FLAGS.algorithm == "BicNet"):
    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.num_cpu,
      FLAGS.lr,
      start_time
    )

  if(FLAGS.log == "tensorboard"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[TensorBoardOutputFormat(logdir)])

  elif(FLAGS.log == "stdout"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[HumanOutputFormat(sys.stdout)])

  AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
    feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),  # alternative: Dimensions(screen=84, minimap=64); both are later processed into 32*32 matrices
    use_feature_units=True
  )

  lr = FLAGS.lr
  batch_size = 32  # 32
  gamma = 0.99
  num_agents = 9
  vector_obs_len = 33   #4096  # 32*32  1024
  output_len = 3
  hidden_vector_len = 128   #1
  tau = 0.001
  # stddev = 0.1


  sess = U.make_session()
  sess.__enter__()
  actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
  sess.run(tf.global_variables_initializer())

  # while(steps_left > 0):
  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",  #DefeatZerglingsAndBanelings
      step_mul=step_mul,
      save_replay_episodes=1,
      replay_dir="D:/StarCraft II/StarCraft II/Replays/video/0722",
      agent_interface_format=AGENT_INTERFACE_FORMAT,
      visualize=False, #True
      game_steps_per_episode=steps * step_mul) as env:

    learn(
      env,
      sess=sess,
      max_timesteps=FLAGS.timesteps,
      # callback=BicNet_callback,
      actor=actor,
      num_agents=num_agents
    )
Example 18
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:
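            # In baselines' cnn_to_mlp, each convs entry is (num_filters, kernel_size, stride); the CNN
            # feeds a single 256-unit hidden layer with dueling value/advantage heads.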

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "acktr"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        # def make_env(rank):
        #   # env = sc2_env.SC2Env(
        #   #   "CollectMineralShards",
        #   #   step_mul=step_mul)
        #   # return env
        #   #env.seed(seed + rank)
        #   def _thunk():
        #     env = sc2_env.SC2Env(
        #         map_name=FLAGS.map,
        #         step_mul=step_mul,
        #         visualize=True)
        #     #env.seed(seed + rank)
        #     if logger.get_dir():
        #      env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
        #     return env
        #   return _thunk

        # agents = [Agent()
        #           for _ in range(num_cpu)]
        #
        # for agent in agents:
        #   time.sleep(1)
        #   agent.daemon = True
        #   agent.start()

        # agent_controller = AgentController(agents)

        #set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)

        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
Example 19
def main():

    if FLAGS.dataset == "mnist":
        train_dataloader = DataLoader(MNIST("/root/data",
                                            train=True,
                                            download=True,
                                            transform=transforms.ToTensor()),
                                      batch_size=FLAGS.batch_size)
        test_dataloader = DataLoader(MNIST("/root/data",
                                           train=False,
                                           download=True,
                                           transform=transforms.ToTensor()),
                                     batch_size=FLAGS.batch_size)
        input_dim = 784
        prob_dist = "discrete"
    else:
        train_dataloader = DataLoader(FreyFaces(train=True),
                                      batch_size=FLAGS.batch_size)
        test_dataloader = DataLoader(FreyFaces(train=False),
                                     batch_size=FLAGS.batch_size)
        input_dim = 560
        prob_dist = "continuous"

    model = VAE(hidden_dim=FLAGS.latent_dim,
                input_dim=input_dim,
                nh=FLAGS.hidden_dim,
                prob_dist=prob_dist).train().cuda()
    logdir = osp.join(FLAGS.logdir, FLAGS.exp)

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    logger = TensorBoardOutputFormat(logdir)

    it = FLAGS.resume_iter

    if not osp.exists(logdir):
        os.makedirs(logdir)

    if FLAGS.resume_iter != 0:
        model_path = osp.join(logdir, "model_{}".format(FLAGS.resume_iter))
        model.load_state_dict(torch.load(model_path))

    if FLAGS.train:

        stop = False
        its = []
        train_losses = []
        test_losses = []
        test_dataloader_iter = iter(test_dataloader)

        while it < FLAGS.num_iter:
            for dat, label in tqdm(train_dataloader):
                if FLAGS.dataset == "mnist":
                    dat = dat.cuda().reshape((dat.size(0), 28 * 28))
                else:
                    dat = dat.float().cuda().reshape((dat.size(0), 28 * 20))

                optimizer.zero_grad()
                outputs = model.forward(dat)

                if FLAGS.dataset == "mnist":
                    loss = model.compute_loss(outputs, dat)
                else:
                    loss = model.compute_loss(outputs, dat)

                loss.backward()
                optimizer.step()
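                # `it` counts examples (it advances by batch_size per step), so this logs roughly every 100 batches.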

                if it % (100 * FLAGS.batch_size) == 0:
                    loss = loss.item()
                    logger.writekvs({"loss": loss})
                    print(it, loss)

                    if FLAGS.gen_plots:
                        its.append(it)

                        if FLAGS.estimate_prob:
                            estimate_prob = model.estimate_prob(dat).item()
                            print(estimate_prob)
                            train_losses.append(estimate_prob)
                        else:
                            train_losses.append(-1 * loss)

                        try:
                            dat, label = next(test_dataloader_iter)
                        except StopIteration:
                            test_dataloader_iter = iter(test_dataloader)
                            dat, label = next(test_dataloader_iter)

                        if FLAGS.dataset == "mnist":
                            dat = dat.cuda().reshape((dat.size(0), 28 * 28))
                        else:
                            dat = dat.float().cuda().reshape(
                                (dat.size(0), 28 * 20))

                        outputs = model.forward(dat)

                        if FLAGS.dataset == "mnist":
                            loss = model.compute_loss(outputs, dat)
                        else:
                            loss = model.compute_loss(outputs, dat)

                        if FLAGS.estimate_prob:
                            estimate_prob = model.estimate_prob(dat).item()
                            test_losses.append(estimate_prob)
                        else:
                            test_losses.append(-1 * loss)

                it += FLAGS.batch_size

                if it > FLAGS.num_iter:
                    break

        if FLAGS.gen_plots:
            plt.semilogx(its, train_losses, "r")
            plt.semilogx(its, test_losses, "b")
            plt.ylabel("ELBO")

            if FLAGS.dataset == "frey":
                data_string = "Frey Faces"
            elif FLAGS.dataset == "mnist":
                data_string = "MNIST"
            plt.title("{}, $N_z = {}$".format(data_string, FLAGS.latent_dim))
            time = str(datetime.datetime.now())
            plt.savefig("plot_{}_{}_{}.png".format(FLAGS.dataset, time,
                                                   FLAGS.latent_dim))

        model_path = osp.join(logdir, "model_{}".format(it))
        torch.save(model.state_dict(), model_path)
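    # Latent traversal: decode an 8x8 grid of points spanning [-1, 1]^2 (assumes a 2-D latent space);
    # otherwise draw samples from the prior.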

    if FLAGS.latent_traversal:
        intervals = np.linspace(-1, 1, 8)
        grid = np.meshgrid(intervals, intervals)
        value = np.stack([grid[0], grid[1]], axis=2)
        latent_input = torch.from_numpy(value.reshape((64, 2))).float().cuda()
        output = model.generate_sample(z=latent_input)
    else:
        output = model.generate_sample()
    output = output.cpu().detach().numpy()

    if FLAGS.dataset == "mnist":
        output = output.reshape((8, 8, 28, 28)).transpose(
            (0, 2, 1, 3)).reshape((8 * 28, 8 * 28))
    elif FLAGS.dataset == "frey":
        output = output.reshape((8, 8, 28, 20)).transpose(
            (0, 2, 1, 3)).reshape((8 * 28, 8 * 20))

    time = str(datetime.datetime.now())
    imsave("test_{}_{}_{}.png".format(FLAGS.dataset, time, FLAGS.latent_dim),
           output)

    print("Done")
Example 20
import os
from glob import glob
from baselines.bench.monitor import load_results
from baselines.logger import TensorBoardOutputFormat
from collections import deque
import numpy as np

monitor_files = glob(os.path.join(os.path.dirname(__file__), '../../result',
                                  '**/monitor.csv'),
                     recursive=True)

for monitor in monitor_files:

    dir = os.path.dirname(monitor)
    csv = load_results(dir)
    tb = TensorBoardOutputFormat(os.path.join(dir, 'tb2'))

    length = 100
    kv = {}
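    # Re-log the monitor CSV to TensorBoard using a rolling window of the last `length` episodes:
    # mean execution time (converted to ms), mean reward, and mean episode length, keyed by wall-clock time.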

    for i in range(length, csv.r.size):
        t = csv.t.values[i]
        r = csv.r.values[i - length:i]
        l = csv.l.values[i - length:i]
        e = csv.best_exec.values[i - length:i] * 1000  # seconds to ms
        kv['EpExecMean'] = np.mean(e)
        kv['EpRewMean'] = np.mean(r)
        kv['EpLenMean'] = np.mean(l)
        tb.writekvs_wt(kv, t)

    tb.close()
Example 21
def main():
    print("Local rank: ", hvd.local_rank(), hvd.size())
    FLAGS.exp = FLAGS.exp + '_' + FLAGS.divergence

    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if hvd.rank() == 0:
        if not osp.exists(logdir):
            os.makedirs(logdir)
        logger = TensorBoardOutputFormat(logdir)
    else:
        logger = None

    print("Loading data...")
    dataset = Cifar10(augment=FLAGS.augment, rescale=FLAGS.rescale)
    test_dataset = Cifar10(train=False, rescale=FLAGS.rescale)
    channel_num = 3

    X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
    X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
    LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
    LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

    if FLAGS.large_model:
        model = ResNet32Large(
            num_channels=channel_num,
            num_filters=128,
            train=True)
        model_dis = ResNet32Large(
            num_channels=channel_num,
            num_filters=128,
            train=True)
    elif FLAGS.larger_model:
        model = ResNet32Larger(
            num_channels=channel_num,
            num_filters=128)
        model_dis = ResNet32Larger(
            num_channels=channel_num,
            num_filters=128)
    elif FLAGS.wider_model:
        model = ResNet32Wider(
            num_channels=channel_num,
            num_filters=256)
        model_dis = ResNet32Wider(
            num_channels=channel_num,
            num_filters=256)
    else:
        model = ResNet32(
            num_channels=channel_num,
            num_filters=128)
        model_dis = ResNet32(
            num_channels=channel_num,
            num_filters=128)

    print("Done loading...")

    grad_exp, conjugate_grad_exp = get_divergence_funcs(FLAGS.divergence)

    data_loader = DataLoader(
        dataset,
        batch_size=FLAGS.batch_size,
        num_workers=FLAGS.data_workers,
        drop_last=True,
        shuffle=True)

    weights = [model.construct_weights('context_energy'), model_dis.construct_weights('context_dis')]

    Y = tf.placeholder(shape=(None), dtype=tf.int32)

    # Variables to run in training
    X_SPLIT = tf.split(X, FLAGS.num_gpus)
    X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
    LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
    LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
    LABEL_SPLIT_INIT = list(LABEL_SPLIT)
    tower_grads = []
    tower_grads_dis = []
    tower_grads_l2 = []
    tower_grads_dis_l2 = []

    optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    optimizer = hvd.DistributedOptimizer(optimizer)

    optimizer_dis = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    optimizer_dis = hvd.DistributedOptimizer(optimizer_dis)

    for j in range(FLAGS.num_gpus):

        energy_pos = [
            model.forward(
                X_SPLIT[j],
                weights[0],
                label=LABEL_POS_SPLIT[j],
                stop_at_grad=False)]
        energy_pos = tf.concat(energy_pos, axis=0)

        score_pos = [
            model_dis.forward(
                X_SPLIT[j],
                weights[1],
                label=LABEL_POS_SPLIT[j],
                stop_at_grad=False)]
        score_pos = tf.concat(score_pos, axis=0)

        print("Building graph...")
        x_mod = x_orig = X_NOISE_SPLIT[j]

        x_grads = []

        energy_negs = []
        loss_energys = []

        energy_negs.extend([model.forward(tf.stop_gradient(
            x_mod), weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)])
        eps_begin = tf.zeros(1)

        steps = tf.constant(0)
        c = lambda i, x: tf.less(i, FLAGS.num_steps)
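        # Langevin sampler: the while_loop below runs num_steps iterations, each adding Gaussian noise to
        # x_mod and then taking a gradient step on the energy (or an HMC move when FLAGS.hmc is set),
        # with the result clipped to [0, FLAGS.rescale].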

        def langevin_step(counter, x_mod):
            x_mod = x_mod + tf.random_normal(tf.shape(x_mod),
                                             mean=0.0,
                                             stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            energy_noise = energy_start = tf.concat(
                [model.forward(
                        x_mod,
                        weights[0],
                        label=LABEL_SPLIT[j],
                        reuse=True,
                        stop_at_grad=False,
                        stop_batch=True)],
                axis=0)

            x_grad, label_grad = tf.gradients(energy_noise, [x_mod, LABEL_SPLIT[j]])
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(
                        x_grad, -FLAGS.proj_norm, FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    assert False

            # Clip gradient norm for now
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * \
                        model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - (lr) * x_grad

            x_mod = x_last
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod

        steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod))

        energy_eval = model.forward(x_mod, weights[0], label=LABEL_SPLIT[j],
                                    stop_at_grad=False, reuse=True)
        x_grad = tf.gradients(energy_eval, [x_mod])[0]
        x_grads.append(x_grad)

        energy_negs.append(
            model.forward(
                tf.stop_gradient(x_mod),
                weights[0],
                label=LABEL_SPLIT[j],
                stop_at_grad=False,
                reuse=True))

        score_neg = model_dis.forward(
                tf.stop_gradient(x_mod),
                weights[1],
                label=LABEL_SPLIT[j],
                stop_at_grad=False,
                reuse=True)

        test_x_mod = x_mod

        temp = FLAGS.temperature

        energy_neg = energy_negs[-1]
        x_off = tf.reduce_mean(
            tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

        loss_energy = model.forward(
            x_mod,
            weights[0],
            reuse=True,
            label=LABEL,
            stop_grad=True)

        print("Finished processing loop construction ...")

        target_vars = {}

        if FLAGS.cclass or FLAGS.model_cclass:
            label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
            label_prob = label_sum / tf.reduce_sum(label_sum)
            label_ent = -tf.reduce_sum(label_prob *
                                       tf.math.log(label_prob + 1e-7))
        else:
            label_ent = tf.zeros(1)

        target_vars['label_ent'] = label_ent

        if FLAGS.train:
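            # f-divergence-style objectives: the score network ("discriminator") maximizes the variational
            # bound E_pos[grad_exp] - E_neg[conjugate_grad_exp] (minimized below as its negative), while the
            # energy model is trained against the stop-gradient'ed conjugate term; both losses carry an L2
            # penalty on their raw outputs.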

            loss_dis = - (tf.reduce_mean(grad_exp(score_pos + energy_pos)) - tf.reduce_mean(conjugate_grad_exp(score_neg + energy_neg)))
            loss_dis = loss_dis + FLAGS.l2_coeff * (tf.reduce_mean(tf.square(score_pos)) + tf.reduce_mean(tf.square(score_neg)))
            l2_dis = FLAGS.l2_coeff * (tf.reduce_mean(tf.square(score_pos)) + tf.reduce_mean(tf.square(score_neg)))

            loss_model = tf.reduce_mean(grad_exp(score_pos + energy_pos)) + \
                         tf.reduce_mean(energy_neg * tf.stop_gradient(conjugate_grad_exp(score_neg + energy_neg))) - \
                         tf.reduce_mean(energy_neg) * tf.stop_gradient(tf.reduce_mean(conjugate_grad_exp(score_neg + energy_neg))) - \
                         tf.reduce_mean(conjugate_grad_exp(score_neg + energy_neg))
            loss_model = loss_model + FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square(energy_neg)))
            l2_model = FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square(energy_neg)))

            print("Started gradient computation...")
            model_vars = [var for var in tf.trainable_variables() if 'context_energy' in var.name]
            print("model var number", len(model_vars))
            dis_vars = [var for var in tf.trainable_variables() if 'context_dis' in var.name]
            print("discriminator var number", len(dis_vars))

            gvs = optimizer.compute_gradients(loss_model, model_vars)
            gvs = [(k, v) for (k, v) in gvs if k is not None]
            tower_grads.append(gvs)

            gvs = optimizer.compute_gradients(l2_model, model_vars)
            gvs = [(k, v) for (k, v) in gvs if k is not None]
            tower_grads_l2.append(gvs)

            gvs_dis = optimizer_dis.compute_gradients(loss_dis, dis_vars)
            gvs_dis = [(k, v) for (k, v) in gvs_dis if k is not None]
            tower_grads_dis.append(gvs_dis)

            gvs_dis = optimizer_dis.compute_gradients(l2_dis, dis_vars)
            gvs_dis = [(k, v) for (k, v) in gvs_dis if k is not None]
            tower_grads_dis_l2.append(gvs_dis)

            print("Finished applying gradients.")

            target_vars['total_loss'] = loss_model
            target_vars['loss_energy'] = loss_energy
            target_vars['weights'] = weights
            target_vars['gvs'] = gvs

        target_vars['X'] = X
        target_vars['Y'] = Y
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['X_NOISE'] = X_NOISE
        target_vars['energy_pos'] = energy_pos
        target_vars['energy_start'] = energy_negs[0]

        if len(x_grads) >= 1:
            target_vars['x_grad'] = x_grads[-1]
            target_vars['x_grad_first'] = x_grads[0]
        else:
            target_vars['x_grad'] = tf.zeros(1)
            target_vars['x_grad_first'] = tf.zeros(1)

        target_vars['x_mod'] = x_mod
        target_vars['x_off'] = x_off
        target_vars['temp'] = temp
        target_vars['energy_neg'] = energy_neg
        target_vars['test_x_mod'] = test_x_mod
        target_vars['eps_begin'] = eps_begin
        target_vars['score_neg'] = score_neg
        target_vars['score_pos'] = score_pos

    if FLAGS.train:
        grads_model = average_gradients(tower_grads)
        train_op_model = optimizer.apply_gradients(grads_model)
        target_vars['train_op_model'] = train_op_model

        grads_model_l2 = average_gradients(tower_grads_l2)
        train_op_model_l2 = optimizer.apply_gradients(grads_model_l2)
        target_vars['train_op_model_l2'] = train_op_model_l2

        grads_model_dis = average_gradients(tower_grads_dis)
        train_op_dis = optimizer_dis.apply_gradients(grads_model_dis)
        target_vars['train_op_dis'] = train_op_dis

        grads_model_dis_l2 = average_gradients(tower_grads_dis_l2)
        train_op_dis_l2 = optimizer_dis.apply_gradients(grads_model_dis_l2)
        target_vars['train_op_dis_l2'] = train_op_dis_l2

    config = tf.ConfigProto()

    if hvd.size() > 1:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=500)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        saver.restore(sess, model_file)
        # optimistic_restore(sess, model_file)

    sess.run(hvd.broadcast_global_variables(0))
    print("Initializing variables...")

    print("Start broadcast")
    print("End broadcast")

    if FLAGS.train:
        train(target_vars, saver, sess,
              logger, data_loader, resume_itr,
              logdir)

    test(target_vars, saver, sess, logger, data_loader)
Example 22
def main():
    print("Local rank: ", hvd.local_rank(), hvd.size())

    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if hvd.rank() == 0:
        if not osp.exists(logdir):
            os.makedirs(logdir)
        logger = TensorBoardOutputFormat(logdir)
    else:
        logger = None

    LABEL = None
    print("Loading data...")
    if FLAGS.dataset == 'cifar10':
        dataset = Cifar10(augment=FLAGS.augment, rescale=FLAGS.rescale)
        test_dataset = Cifar10(train=False, rescale=FLAGS.rescale)
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        if FLAGS.large_model:
            model = ResNet32Large(num_channels=channel_num,
                                  num_filters=128,
                                  train=True)
        elif FLAGS.larger_model:
            model = ResNet32Larger(num_channels=channel_num, num_filters=128)
        elif FLAGS.wider_model:
            model = ResNet32Wider(num_channels=channel_num, num_filters=192)
        else:
            model = ResNet32(num_channels=channel_num, num_filters=128)

    elif FLAGS.dataset == 'imagenet':
        dataset = Imagenet(train=True)
        test_dataset = Imagenet(train=False)
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet32Wider(num_channels=channel_num, num_filters=256)

    elif FLAGS.dataset == 'imagenetfull':
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet128(num_channels=channel_num, num_filters=64)

    elif FLAGS.dataset == 'mnist':
        dataset = Mnist(rescale=FLAGS.rescale)
        test_dataset = dataset
        channel_num = 1
        X_NOISE = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        model = MnistNet(num_channels=channel_num,
                         num_filters=FLAGS.num_filters)

    elif FLAGS.dataset == 'dsprites':
        dataset = DSprites(cond_shape=FLAGS.cond_shape,
                           cond_size=FLAGS.cond_size,
                           cond_pos=FLAGS.cond_pos,
                           cond_rot=FLAGS.cond_rot)
        test_dataset = dataset
        channel_num = 1

        X_NOISE = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)

        if FLAGS.dpos_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.dsize_only:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.drot_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_size:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.cond_shape:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)
        elif FLAGS.cond_pos:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_rot:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        else:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)

        model = DspritesNet(num_channels=channel_num,
                            num_filters=FLAGS.num_filters,
                            cond_size=FLAGS.cond_size,
                            cond_shape=FLAGS.cond_shape,
                            cond_pos=FLAGS.cond_pos,
                            cond_rot=FLAGS.cond_rot)

    print("Done loading...")

    if FLAGS.dataset == "imagenetfull":
        # In the case of full ImageNet, use the custom TensorFlow data loader
        data_loader = TFImagenetLoader('train',
                                       FLAGS.batch_size,
                                       hvd.rank(),
                                       hvd.size(),
                                       rescale=FLAGS.rescale)
    else:
        data_loader = DataLoader(dataset,
                                 batch_size=FLAGS.batch_size,
                                 num_workers=FLAGS.data_workers,
                                 drop_last=True,
                                 shuffle=True)

    batch_size = FLAGS.batch_size

    weights = [model.construct_weights('context_0')]

    Y = tf.placeholder(shape=(None), dtype=tf.int32)

    # Variables to run in training
    X_SPLIT = tf.split(X, FLAGS.num_gpus)
    X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
    LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
    LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
    LABEL_SPLIT_INIT = list(LABEL_SPLIT)
    tower_grads = []
    tower_gen_grads = []
    x_mod_list = []

    optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    optimizer = hvd.DistributedOptimizer(optimizer)

    for j in range(FLAGS.num_gpus):

        if FLAGS.model_cclass:
            ind_batch_size = FLAGS.batch_size // FLAGS.num_gpus
            label_tensor = tf.Variable(tf.convert_to_tensor(np.reshape(
                np.tile(np.eye(10), (FLAGS.batch_size, 1, 1)),
                (FLAGS.batch_size * 10, 10)),
                                                            dtype=tf.float32),
                                       trainable=False,
                                       dtype=tf.float32)
            x_split = tf.tile(
                tf.reshape(X_SPLIT[j], (ind_batch_size, 1, 32, 32, 3)),
                (1, 10, 1, 1, 1))
            x_split = tf.reshape(x_split, (ind_batch_size * 10, 32, 32, 3))
            energy_pos = model.forward(x_split,
                                       weights[0],
                                       label=label_tensor,
                                       stop_at_grad=False)

            energy_pos_full = tf.reshape(energy_pos, (ind_batch_size, 10))
            energy_partition_est = tf.reduce_logsumexp(energy_pos_full,
                                                       axis=1,
                                                       keepdims=True)
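            # Sample a label per example from the Boltzmann distribution implied by the energies,
            # using the Gumbel-max trick (argmax of logits plus -log(-log(U)) noise).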
            uniform = tf.random_uniform(tf.shape(energy_pos_full))
            label_tensor = tf.argmax(-energy_pos_full -
                                     tf.log(-tf.log(uniform)) -
                                     energy_partition_est,
                                     axis=1)
            label = tf.one_hot(label_tensor, 10, dtype=tf.float32)
            label = tf.Print(label, [label_tensor, energy_pos_full])
            LABEL_SPLIT[j] = label
            energy_pos = tf.concat(energy_pos, axis=0)
        else:
            energy_pos = [
                model.forward(X_SPLIT[j],
                              weights[0],
                              label=LABEL_POS_SPLIT[j],
                              stop_at_grad=False)
            ]
            energy_pos = tf.concat(energy_pos, axis=0)

        print("Building graph...")
        x_mod = x_orig = X_NOISE_SPLIT[j]

        x_grads = []

        energy_negs = []
        loss_energys = []

        energy_negs.extend([
            model.forward(tf.stop_gradient(x_mod),
                          weights[0],
                          label=LABEL_SPLIT[j],
                          stop_at_grad=False,
                          reuse=True)
        ])
        eps_begin = tf.zeros(1)

        steps = tf.constant(0)
        c = lambda i, x: tf.less(i, FLAGS.num_steps)
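        # Langevin sampler: each of the num_steps iterations adds Gaussian noise to x_mod and takes a
        # gradient step on the temperature-scaled energy (or an HMC move when FLAGS.hmc is set),
        # clipping the result to [0, FLAGS.rescale].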

        def langevin_step(counter, x_mod):
            x_mod = x_mod + tf.random_normal(
                tf.shape(x_mod),
                mean=0.0,
                stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            energy_noise = energy_start = tf.concat([
                model.forward(x_mod,
                              weights[0],
                              label=LABEL_SPLIT[j],
                              reuse=True,
                              stop_at_grad=False,
                              stop_batch=True)
            ],
                                                    axis=0)

            x_grad, label_grad = tf.gradients(FLAGS.temperature * energy_noise,
                                              [x_mod, LABEL_SPLIT[j]])
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm,
                                              FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    assert False

            # Clip gradient norm for now
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * \
                        model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - (lr) * x_grad

            x_mod = x_last
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod

        steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod))

        energy_eval = model.forward(x_mod,
                                    weights[0],
                                    label=LABEL_SPLIT[j],
                                    stop_at_grad=False,
                                    reuse=True)
        x_grad = tf.gradients(FLAGS.temperature * energy_eval, [x_mod])[0]
        x_grads.append(x_grad)

        energy_negs.append(
            model.forward(tf.stop_gradient(x_mod),
                          weights[0],
                          label=LABEL_SPLIT[j],
                          stop_at_grad=False,
                          reuse=True))

        test_x_mod = x_mod

        temp = FLAGS.temperature

        energy_neg = energy_negs[-1]
        x_off = tf.reduce_mean(
            tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

        loss_energy = model.forward(x_mod,
                                    weights[0],
                                    reuse=True,
                                    label=LABEL,
                                    stop_grad=True)

        print("Finished processing loop construction ...")

        target_vars = {}

        if FLAGS.cclass or FLAGS.model_cclass:
            label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
            label_prob = label_sum / tf.reduce_sum(label_sum)
            label_ent = -tf.reduce_sum(
                label_prob * tf.math.log(label_prob + 1e-7))
        else:
            label_ent = tf.zeros(1)

        target_vars['label_ent'] = label_ent

        if FLAGS.train:
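            # Three maximum-likelihood surrogates: 'logsumexp' reweights the negative-sample energies with a
            # self-normalized softmax, 'cd' is the plain contrastive-divergence difference of mean energies,
            # and 'softplus' puts a softplus margin between positive and negative energies.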

            if FLAGS.objective == 'logsumexp':
                pos_term = temp * energy_pos
                energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'cd':
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = -tf.reduce_mean(temp * energy_neg)
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'softplus':
                loss_ml = FLAGS.ml_coeff * \
                    tf.nn.softplus(temp * (energy_pos - energy_neg))

            loss_total = tf.reduce_mean(loss_ml)

            if not FLAGS.zero_kl:
                loss_total = loss_total + tf.reduce_mean(loss_energy)

            loss_total = loss_total + \
                FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square((energy_neg))))

            print("Started gradient computation...")
            gvs = optimizer.compute_gradients(loss_total)
            gvs = [(k, v) for (k, v) in gvs if k is not None]

            print("Applying gradients...")

            tower_grads.append(gvs)

            print("Finished applying gradients.")

            target_vars['loss_ml'] = loss_ml
            target_vars['total_loss'] = loss_total
            target_vars['loss_energy'] = loss_energy
            target_vars['weights'] = weights
            target_vars['gvs'] = gvs

        target_vars['X'] = X
        target_vars['Y'] = Y
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['X_NOISE'] = X_NOISE
        target_vars['energy_pos'] = energy_pos
        target_vars['energy_start'] = energy_negs[0]

        if len(x_grads) >= 1:
            target_vars['x_grad'] = x_grads[-1]
            target_vars['x_grad_first'] = x_grads[0]
        else:
            target_vars['x_grad'] = tf.zeros(1)
            target_vars['x_grad_first'] = tf.zeros(1)

        target_vars['x_mod'] = x_mod
        target_vars['x_off'] = x_off
        target_vars['temp'] = temp
        target_vars['energy_neg'] = energy_neg
        target_vars['test_x_mod'] = test_x_mod
        target_vars['eps_begin'] = eps_begin

    if FLAGS.train:
        grads = average_gradients(tower_grads)
        train_op = optimizer.apply_gradients(grads)
        target_vars['train_op'] = train_op

    config = tf.ConfigProto()

    if hvd.size() > 1:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=30,
                                    keep_checkpoint_every_n_hours=6)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    sess.run(hvd.broadcast_global_variables(0))
    print("Initializing variables...")

    print("Start broadcast")
    print("End broadcast")

    if FLAGS.train:
        print("Training phase")
        train(target_vars, saver, sess, logger, data_loader, resume_itr,
              logdir)
    print("Testing phase")
    test(target_vars, saver, sess, logger, data_loader)
Example 23
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")

    env = StarCraft2Env(map_name="8m",
                        reward_only_positive=False,
                        reward_scale_rate=200,
                        state_last_action=True,
                        obs_last_action=True,
                        obs_timestep_number=True,
                        state_timestep_number=True)  #reward_defeat=-200
    env_info = env.get_env_info()

    n_episodes = 2500  #4000    #2000
    timesteps = 500000
    n_agents = env_info["n_agents"]
    n_actions = env_info["n_actions"]
    output_len = n_actions
    lr = 0.002
    buffer_size = 70000  # int(timesteps * 0.1)  # reduce a bit; ideally about 1/10 of the training steps (tried: 80000, 70000, 20000, 200 for testing)
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    local_obs_len = 179  # local obs:80 ; global state:168;
    global_state_len = 348  # 179+169

    hidden_vector_len = 256  # 128  # 1  256
    tau = 0.001
    num_exploring = buffer_size  # buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    critic_output_len = 1

    logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr,
                                            start_time)

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, local_obs_len,
                         output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, global_state_len, critic_output_len,
                           hidden_vector_len, n_actions)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)

    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    step_train = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        local_obs = env.get_obs()
        local_obs = np.array(local_obs)
        global_state = env.get_state()
        global_state_expand = np.zeros(
            [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]])
        reward_hl_own_old = []
        reward_hl_en_old = []
        episode_reward_agent = [0 for n in range(n_agents)]
        for i in range(local_obs.shape[0]):
            global_state_expand[i] = np.append(local_obs[i],
                                               global_state.flatten())
            reward_hl_own_old.append(env.get_agent_health(i))
            reward_hl_en_old.append(env.get_enemy_health(i))

        while not terminated:
            t = t + 1
            critic_input = np.expand_dims(global_state_expand, axis=0)
            actor_input = np.expand_dims(local_obs, axis=0)
            action = actor.predict(actor_input)[0]
            act_with_noise = action  #np.clip(action + action_noise.get_noise(step_train), action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]

                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act

                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)

                if (len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    dead_unit.append(agent_id)

            reward_base, terminated, info = env.step(actions)

            new_local_obs = env.get_obs()
            new_local_obs = np.array(new_local_obs)
            new_global_state = env.get_state()
            new_global_state_expand = np.zeros([
                new_local_obs.shape[0],
                new_local_obs.shape[1] + new_global_state.shape[0]
            ])
            reward_hl_own_new = []
            reward_hl_en_new = []
            for i in range(new_local_obs.shape[0]):
                new_global_state_expand[i] = np.append(
                    new_local_obs[i], new_global_state.flatten())
                reward_hl_own_new.append(env.get_agent_health(i))
                reward_hl_en_new.append(env.get_enemy_health(i))
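            # Per-agent reward shaping: dead agents get 0; living agents pay a small step cost, earn a bonus
            # scaled by the damage dealt when attacking (actions > 5 target an enemy), otherwise gain/lose
            # 5x their own health change, plus +/-10 at episode end depending on the battle outcome.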

            for i in range(n_agents):
                if (i in dead_unit):
                    rew_expand[i] = 0
                else:
                    rew_expand[i] = -0.05
                    if (actions[i] > 5):
                        target_id = actions[i] - 6
                        health_reduce_en = reward_hl_en_old[
                            target_id] - reward_hl_en_new[target_id]
                        if (health_reduce_en > 0):
                            rew_expand[i] += 2 + health_reduce_en * 5
                            # if (reward_base > 50):
                            #     rew_expand[i] += 20
                        else:
                            rew_expand[i] += 1
                    else:
                        rew_expand[i] += (reward_hl_own_new[i] -
                                          reward_hl_own_old[i]) * 5
                #
                if (terminated):
                    if (info["battle_won"] is False):
                        rew_expand[i] += -10
                    else:
                        rew_expand[i] += 10

                episode_reward_agent[i] += rew_expand[i]

            replay_buffer.add(local_obs, global_state_expand, act_with_noise,
                              rew_expand, terminated, new_local_obs,
                              new_global_state_expand)

            episode_reward += reward_base
            local_obs = new_local_obs
            global_state_expand = new_global_state_expand
            if (t == num_exploring):
                print("training starts")
            if (t >= num_exploring):
                local_s_batch, global_s_batch, a_batch, r_batch, done_batch, local_s2_batch, global_s2_batch = replay_buffer.sample_batch(
                    batch_size
                )  # [group0:[batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(
                    global_s2_batch, actor.predict_target(local_s2_batch))
                predicted_q_value, _ = critic.train(
                    global_s_batch, a_batch,
                    np.reshape(target_q,
                               (batch_size, num_agents, critic_output_len)))
                a_outs = actor.predict(local_s_batch)  # a_outs is identical to a_batch
                grads = critic.action_gradients(global_s_batch,
                                                a_outs)  # gradient of Q with respect to the actions
                actor.train(local_s_batch, grads)
                step_train = step_train + 1

                actor.update_target_network()
                critic.update_target_network()

                if (t % save_freq == 0):
                    model_file_save = os.path.join(
                        "model/" + str(step_train) + "_" +
                        "training_steps_model/", "8m")
                    U.save_state(model_file_save)
                    print("Model have been trained for %s times" %
                          (step_train))
                    # replay_buffer.save()

        print("steps until now : %s, episode: %s, episode reward: %s" %
              (t, e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward_episode", episode_reward)
        for i in range(n_agents):
            logger.record_tabular("reward_agent_" + str(i),
                                  episode_reward_agent[i])

        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)

    env.close()
Example 24
def main():

    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    logger = TensorBoardOutputFormat(logdir)

    config = tf.ConfigProto()

    sess = tf.Session(config=config)
    LABEL = None
    print("Loading data...")
    if FLAGS.dataset == 'cubes':
        dataset = Cubes(cond_idx=FLAGS.cond_idx)
        test_dataset = dataset

        if FLAGS.cond_idx == 0:
            label_size = 2
        elif FLAGS.cond_idx == 1:
            label_size = 1
        elif FLAGS.cond_idx == 2:
            label_size = 3
        elif FLAGS.cond_idx == 3:
            label_size = 20

        LABEL = tf.placeholder(shape=(None, label_size), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, label_size), dtype=tf.float32)
    elif FLAGS.dataset == 'color':
        dataset = CubesColor()
        test_dataset = dataset
        LABEL = tf.placeholder(shape=(None, 301), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 301), dtype=tf.float32)
        label_size = 301
    elif FLAGS.dataset == 'pos':
        dataset = CubesPos()
        test_dataset = dataset
        LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        label_size = 2
    elif FLAGS.dataset == "pairs":
        dataset = Pairs(cond_idx=0)
        test_dataset = dataset
        LABEL = tf.placeholder(shape=(None, 6), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 6), dtype=tf.float32)
        label_size = 6
    elif FLAGS.dataset == "continual":
        dataset = CubesContinual()
        test_dataset = dataset

        if FLAGS.prelearn_model_shape:
            LABEL = tf.placeholder(shape=(None, 20), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 20), dtype=tf.float32)
            label_size = 20
        else:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            label_size = 2

    elif FLAGS.dataset == "cross":
        dataset = CubesCrossProduct(FLAGS.ratio, cond_size=FLAGS.cond_size, cond_pos=FLAGS.cond_pos, joint_baseline=FLAGS.joint_baseline)
        test_dataset = dataset

        if FLAGS.cond_size:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            label_size = 1
        elif FLAGS.cond_pos:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            label_size = 2

        if FLAGS.joint_baseline:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            label_size = 3

    elif FLAGS.dataset == 'celeba':
        dataset = CelebA(cond_idx=FLAGS.celeba_cond_idx)
        test_dataset = dataset
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)

        model = ResNet128(
            num_channels=channel_num,
            num_filters=64,
            classes=2)

    if FLAGS.joint_baseline:
        # Other stuff for joint model
        optimizer = AdamOptimizer(FLAGS.lr, beta1=0.99, beta2=0.999)

        X = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        X_NOISE = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        ATTENTION_MASK = tf.placeholder(shape=(None, 64, 64, FLAGS.cond_func), dtype=tf.float32)
        NOISE = tf.placeholder(shape=(None, 128), dtype=tf.float32)
        HIER_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)

        channel_num = 3

        model = CubesNetGen(num_channels=channel_num, label_size=label_size)
        weights = model.construct_weights('context_0')
        output = model.forward(NOISE, weights, reuse=False, label=LABEL)
        print(output.get_shape())
        mse_loss = tf.reduce_mean(tf.square(output - X))
        gvs = optimizer.compute_gradients(mse_loss)
        train_op = optimizer.apply_gradients(gvs)
        gvs = [(k, v) for (k, v) in gvs if k is not None]

        target_vars = {}
        target_vars['train_op'] = train_op
        target_vars['X'] = X
        target_vars['X_NOISE'] = X_NOISE
        target_vars['ATTENTION_MASK'] = ATTENTION_MASK
        target_vars['eps_begin'] = tf.zeros(1)
        target_vars['gvs'] = gvs
        target_vars['energy_pos'] = tf.zeros(1)
        target_vars['energy_neg'] = tf.zeros(1)
        target_vars['loss_energy'] = tf.zeros(1)
        target_vars['loss_ml'] = tf.zeros(1)
        target_vars['total_loss'] = mse_loss
        target_vars['attention_mask'] = tf.zeros(1)
        target_vars['attention_grad'] = tf.zeros(1)
        target_vars['x_off'] = tf.reduce_mean(tf.abs(output - X))
        target_vars['x_mod'] = tf.zeros(1)
        target_vars['x_grad'] = tf.zeros(1)
        target_vars['NOISE'] = NOISE
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['HIER_LABEL'] = HIER_LABEL

        data_loader = DataLoader(
            dataset,
            batch_size=FLAGS.batch_size,
            num_workers=FLAGS.data_workers,
            drop_last=True,
            shuffle=True)
    else:
        print("label size here ", label_size)
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 64, 64, 3), dtype=tf.float32)
        HEIR_LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        ATTENTION_MASK = tf.placeholder(shape=(None, 64, 64, FLAGS.cond_func), dtype=tf.float32)

        if FLAGS.dataset != "celeba":
            model = CubesNet(num_channels=channel_num, label_size=label_size)

        heir_model = HeirNet(num_channels=FLAGS.cond_func)

        models_pretrain = []
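        # Optionally load pretrained conditional EBMs (position and/or shape).
        # Their variables are restored into separate scopes and their energies
        # are added to the main model's energy during Langevin sampling.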
        if FLAGS.prelearn_model:
            model_prelearn = CubesNet(num_channels=channel_num, label_size=FLAGS.prelearn_label)
            weights = model_prelearn.construct_weights('context_1')
            LABEL_PRELEARN = tf.placeholder(shape=(None, FLAGS.prelearn_label), dtype=tf.float32)
            models_pretrain.append((model_prelearn, weights, LABEL_PRELEARN))

            cubes_logdir = osp.join(FLAGS.logdir, FLAGS.prelearn_exp)
            if (FLAGS.prelearn_iter != -1 or not FLAGS.train):
                model_file = osp.join(cubes_logdir, 'model_{}'.format(FLAGS.prelearn_iter))
                resume_itr = FLAGS.resume_iter
                # saver.restore(sess, model_file)

                v_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='context_{}'.format(1))
                v_map = {(v.name.replace('context_{}'.format(1), 'context_0')[:-2]): v for v in v_list}
                saver = tf.train.Saver(v_map)
                saver.restore(sess, model_file)

        if FLAGS.prelearn_model_shape:
            model_prelearn = CubesNet(num_channels=channel_num, label_size=FLAGS.prelearn_label_shape)
            weights = model_prelearn.construct_weights('context_2')
            LABEL_PRELEARN = tf.placeholder(shape=(None, FLAGS.prelearn_label_shape), dtype=tf.float32)
            models_pretrain.append((model_prelearn, weights, LABEL_PRELEARN))

            cubes_logdir = osp.join(FLAGS.logdir, FLAGS.prelearn_exp_shape)
            if (FLAGS.prelearn_iter_shape != -1 or not FLAGS.train):
                model_file = osp.join(cubes_logdir, 'model_{}'.format(FLAGS.prelearn_iter_shape))
                resume_itr = FLAGS.resume_iter
                # saver.restore(sess, model_file)

                v_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='context_{}'.format(2))
                v_map = {(v.name.replace('context_{}'.format(2), 'context_0')[:-2]): v for v in v_list}
                saver = tf.train.Saver(v_map)
                saver.restore(sess, model_file)

        print("Done loading...")

        data_loader = DataLoader(
            dataset,
            batch_size=FLAGS.batch_size,
            num_workers=FLAGS.data_workers,
            drop_last=True,
            shuffle=True)

        batch_size = FLAGS.batch_size

        weights = model.construct_weights('context_0')

        if FLAGS.heir_mask:
            weights = heir_model.construct_weights('heir_0', weights=weights)

        Y = tf.placeholder(shape=(None,), dtype=tf.int32)

        # Variables to run in training

        X_SPLIT = tf.split(X, FLAGS.num_gpus)
        X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
        LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
        LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
        LABEL_SPLIT_INIT = list(LABEL_SPLIT)
        attention_mask = ATTENTION_MASK
        tower_grads = []
        tower_gen_grads = []
        x_mod_list = []

        optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.99)

        for j in range(FLAGS.num_gpus):
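            # Build one replica of the EBM graph per GPU; the per-tower
            # gradients collected in tower_grads are averaged further below.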

            x_mod = X_SPLIT[j]
            if FLAGS.comb_mask:
                steps = tf.constant(0)
                c = lambda i, x: tf.less(i, FLAGS.num_steps)

                def langevin_attention_step(counter, attention_mask):
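                    # One Langevin update of the attention mask: inject Gaussian
                    # noise, take a gradient step that lowers the energy, then
                    # smooth the mask with 3x3 average pooling.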
                    attention_mask = attention_mask + tf.random_normal(tf.shape(attention_mask), mean=0.0, stddev=0.01)
                    energy_noise = energy_start = model.forward(
                                x_mod,
                                weights,
                                attention_mask,
                                label=LABEL_SPLIT[j],
                                reuse=True,
                                stop_at_grad=False,
                                stop_batch=True)

                    if FLAGS.heir_mask:
                        energy_heir = 1.00 * heir_model.forward(attention_mask, weights, label=HEIR_LABEL)
                        energy_noise = energy_noise + energy_heir

                    attention_grad = tf.gradients(
                        FLAGS.temperature * energy_noise, [attention_mask])[0]
                    energy_noise_old = energy_noise

                    # Gradient step on the attention mask, then smooth it spatially
                    attention_mask = attention_mask - (FLAGS.attention_lr) * attention_grad
                    attention_mask = tf.layers.average_pooling2d(attention_mask, (3, 3), 1, padding='SAME')
                    attention_mask = tf.stop_gradient(attention_mask)

                    counter = counter + 1

                    return counter, attention_mask

                steps, attention_mask = tf.while_loop(c, langevin_attention_step, (steps, attention_mask))
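                # The refined attention mask is used to score the real data
                # (positive phase of the maximum-likelihood objective).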

                # attention_mask = tf.Print(attention_mask, [attention_mask])

                energy_pos = model.forward(
                        X_SPLIT[j],
                        weights,
                        tf.stop_gradient(attention_mask),
                        label=LABEL_POS_SPLIT[j],
                        stop_at_grad=False)

                if FLAGS.heir_mask:
                    energy_heir = 1.00 * heir_model.forward(attention_mask, weights, label=HEIR_LABEL)
                    energy_pos = energy_heir + energy_pos

            else:
                energy_pos = model.forward(
                        X_SPLIT[j],
                        weights,
                        attention_mask,
                        label=LABEL_POS_SPLIT[j],
                        stop_at_grad=False)

                if FLAGS.heir_mask:
                    energy_heir = 1.00 * heir_model.forward(attention_mask, weights, label=HEIR_LABEL)
                    energy_pos = energy_heir + energy_pos

            print("Building graph...")
            x_mod = x_orig = X_NOISE_SPLIT[j]

            x_grads = []

            loss_energys = []

            eps_begin = tf.zeros(1)

            steps = tf.constant(0)
            c_cond = lambda i, x, y: tf.less(i, FLAGS.num_steps)

            def langevin_step(counter, x_mod, attention_mask):
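                # One Langevin update of the negative samples: add Gaussian noise,
                # descend the gradient of the (temperature-scaled) energy, optionally
                # project the gradient, and clip samples back to [0, rescale].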

                lr = FLAGS.step_lr

                x_mod = x_mod + tf.random_normal(tf.shape(x_mod),
                                                 mean=0.0,
                                                 stddev=0.001 * FLAGS.rescale * FLAGS.noise_scale)
                attention_mask = attention_mask + tf.random_normal(tf.shape(attention_mask), mean=0.0, stddev=0.01)

                energy_noise = model.forward(
                            x_mod,
                            weights,
                            attention_mask,
                            label=LABEL_SPLIT[j],
                            reuse=True,
                            stop_at_grad=False,
                            stop_batch=True)

                if FLAGS.prelearn_model:
                    for m_i, w_i, l_i in models_pretrain:
                        energy_noise = energy_noise + m_i.forward(
                                    x_mod,
                                    w_i,
                                    attention_mask,
                                    label=l_i,
                                    reuse=True,
                                    stop_at_grad=False,
                                    stop_batch=True)


                if FLAGS.heir_mask:
                    energy_heir = 1.00 * heir_model.forward(attention_mask, weights, label=HEIR_LABEL)
                    energy_noise = energy_heir + energy_noise

                x_grad, attention_grad = tf.gradients(
                    FLAGS.temperature * energy_noise, [x_mod, attention_mask])

                if not FLAGS.comb_mask:
                    attention_grad = tf.zeros(1)
                energy_noise_old = energy_noise

                if FLAGS.proj_norm != 0.0:
                    if FLAGS.proj_norm_type == 'l2':
                        x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                    elif FLAGS.proj_norm_type == 'li':
                        x_grad = tf.clip_by_value(
                            x_grad, -FLAGS.proj_norm, FLAGS.proj_norm)
                    else:
                        raise ValueError(
                            "Unsupported projection type: {}".format(FLAGS.proj_norm_type))

                # Gradient descent step on the samples
                x_last = x_mod - (lr) * x_grad

                if FLAGS.comb_mask:
                    attention_mask = attention_mask - FLAGS.attention_lr * attention_grad
                    attention_mask = tf.layers.average_pooling2d(attention_mask, (3, 3), 1, padding='SAME')
                    attention_mask = tf.stop_gradient(attention_mask)

                x_mod = x_last
                x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

                counter = counter + 1

                return counter, x_mod, attention_mask


            steps, x_mod, attention_mask = tf.while_loop(c_cond, langevin_step, (steps, x_mod, attention_mask))
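            # The while_loop above unrolls FLAGS.num_steps Langevin updates inside
            # the graph, carrying the step counter, samples and attention mask.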

            attention_mask = tf.stop_gradient(attention_mask)
            # attention_mask = tf.Print(attention_mask, [attention_mask])

            energy_eval = model.forward(x_mod, weights, attention_mask, label=LABEL_SPLIT[j],
                                        stop_at_grad=False, reuse=True)
            x_grad, attention_grad = tf.gradients(FLAGS.temperature * energy_eval, [x_mod, attention_mask])
            x_grads.append(x_grad)
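            # Score the Langevin samples (negative phase); gradients through the
            # samples and mask are stopped so only the energy parameters train.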

            energy_neg = model.forward(
                    tf.stop_gradient(x_mod),
                    weights,
                    tf.stop_gradient(attention_mask),
                    label=LABEL_SPLIT[j],
                    stop_at_grad=False,
                    reuse=True)

            if FLAGS.heir_mask:
                energy_heir = 1.00 * heir_model.forward(attention_mask, weights, label=HEIR_LABEL)
                energy_neg = energy_heir + energy_neg


            temp = FLAGS.temperature

            x_off = tf.reduce_mean(
                tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

            loss_energy = model.forward(
                x_mod,
                weights,
                attention_mask,
                reuse=True,
                label=LABEL,
                stop_grad=True)

            print("Finished processing loop construction ...")

            target_vars = {}

            if FLAGS.antialias:
                # stride_3 is a blur kernel assumed to be defined earlier in the full script
                antialias = tf.tile(stride_3, (1, 1, tf.shape(x_mod)[3], tf.shape(x_mod)[3]))
                inp = tf.nn.conv2d(x_mod, antialias, [1, 2, 2, 1], padding='SAME')

            test_x_mod = x_mod

            if FLAGS.cclass or FLAGS.model_cclass:
                label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
                label_prob = label_sum / tf.reduce_sum(label_sum)
                label_ent = -tf.reduce_sum(label_prob *
                                           tf.math.log(label_prob + 1e-7))
            else:
                label_ent = tf.zeros(1)

            target_vars['label_ent'] = label_ent

            if FLAGS.train:
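                # Maximum-likelihood style objectives: 'logsumexp' importance-weights
                # the negative samples, 'cd' is the plain contrastive difference of
                # mean energies, and 'softplus' is a margin-style variant. An L2
                # penalty on both energies keeps their magnitudes bounded.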
                if FLAGS.objective == 'logsumexp':
                    pos_term = temp * energy_pos
                    energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                    coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                    norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                    pos_loss = tf.reduce_mean(temp * energy_pos)
                    neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                    loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
                elif FLAGS.objective == 'cd':
                    pos_loss = tf.reduce_mean(temp * energy_pos)
                    neg_loss = -tf.reduce_mean(temp * energy_neg)
                    loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
                elif FLAGS.objective == 'softplus':
                    loss_ml = FLAGS.ml_coeff * \
                        tf.nn.softplus(temp * (energy_pos - energy_neg))

                loss_total = tf.reduce_mean(loss_ml)

                if not FLAGS.zero_kl:
                    loss_total = loss_total + tf.reduce_mean(loss_energy)

                loss_total = loss_total + \
                    FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square((energy_neg))))

                print("Started gradient computation...")
                gvs = optimizer.compute_gradients(loss_total)
                gvs = [(k, v) for (k, v) in gvs if k is not None]

                print("Applying gradients...")

                tower_grads.append(gvs)

                print("Finished applying gradients.")

                target_vars['loss_ml'] = loss_ml
                target_vars['total_loss'] = loss_total
                target_vars['loss_energy'] = loss_energy
                target_vars['weights'] = weights
                target_vars['gvs'] = gvs

            target_vars['X'] = X
            target_vars['Y'] = Y
            target_vars['LABEL'] = LABEL
            target_vars['HIER_LABEL'] = HEIR_LABEL
            target_vars['LABEL_POS'] = LABEL_POS
            target_vars['X_NOISE'] = X_NOISE
            target_vars['energy_pos'] = energy_pos
            target_vars['attention_grad'] = attention_grad

            if len(x_grads) >= 1:
                target_vars['x_grad'] = x_grads[-1]
                target_vars['x_grad_first'] = x_grads[0]
            else:
                target_vars['x_grad'] = tf.zeros(1)
                target_vars['x_grad_first'] = tf.zeros(1)

            target_vars['x_mod'] = x_mod
            target_vars['x_off'] = x_off
            target_vars['temp'] = temp
            target_vars['energy_neg'] = energy_neg
            target_vars['test_x_mod'] = test_x_mod
            target_vars['eps_begin'] = eps_begin
            target_vars['ATTENTION_MASK'] = ATTENTION_MASK
            target_vars['models_pretrain'] = models_pretrain
            if FLAGS.comb_mask:
                target_vars['attention_mask'] = tf.nn.softmax(attention_mask)
            else:
                target_vars['attention_mask'] = tf.zeros(1)

        if FLAGS.train:
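            # Average the per-tower gradients and apply them in a single op
            # (average_gradients is presumably defined earlier in the full script).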
            grads = average_gradients(tower_grads)
            train_op = optimizer.apply_gradients(grads)
            target_vars['train_op'] = train_op

    # sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(
        max_to_keep=30, keep_checkpoint_every_n_hours=6)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    if (FLAGS.resume_iter != -1 or not FLAGS.train):
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    print("Initializing variables...")

    print("Start broadcast")
    print("End broadcast")

    if FLAGS.train:
        train(target_vars, saver, sess,
              logger, data_loader, resume_itr,
              logdir)

    test(target_vars, saver, sess, logger, data_loader)
Esempio n. 25
0
import os
from glob import glob
from baselines.logger import TensorBoardOutputFormat
import tensorflow as tf
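
# Rewrite existing TensorBoard event files: every 'EpExecMean' scalar is scaled
# from seconds to milliseconds and re-written to a sibling 'tb1' directory.
# writekvs_ev appears to be a project-specific extension of baselines'
# TensorBoardOutputFormat that writes a raw event protobuf.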

tb_files = glob(os.path.join(os.path.dirname(__file__), '../../result',
                             '**/tb/*tfevents*'),
                recursive=True)

for file in tb_files:

    run_dir = os.path.join(os.path.dirname(file), os.path.pardir)
    tb = TensorBoardOutputFormat(os.path.join(run_dir, 'tb1'))

    for e in tf.train.summary_iterator(file):
        for v in e.summary.value:
            if v.tag == 'EpExecMean':
                v.simple_value = v.simple_value * 1000  # seconds to ms
        tb.writekvs_ev(e)

    tb.close()