Example #1
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, max_global_time_step, device):
        self.learn_rate = 0
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)


        self.apply_gradients = tf.train.RMSPropOptimizer(
            self.learning_rate_input).apply_gradients(
                zip(self.gradients, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = Game()

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.prev_local_t = 0
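
A minimal driving sketch (not from the original repo; apply_one_update and
batch_feed are illustrative names): it shows how the ops built above are
typically run under a TF1 session, syncing the local network from the global
one and feeding the annealed learning rate together with the rollout batch
when applying gradients.

def apply_one_update(sess, trainer, batch_feed, global_t):
    # copy the shared/global weights into this worker's local network
    sess.run(trainer.sync)
    # linearly annealed learning rate, clamped at zero near max_global_time_step
    # (the same schedule as _anneal_learning_rate further below)
    lr = trainer.initial_learning_rate * max(
        0.0,
        (trainer.max_global_time_step - global_t) / trainer.max_global_time_step)
    # compute local gradients and apply them to the global network in one step
    feed = dict(batch_feed)
    feed[trainer.learning_rate_input] = lr
    sess.run(trainer.apply_gradients, feed_dict=feed)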
Example #2
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, action_size, gamma, local_t_max, entropy_beta,
                 agent_type, performance_log_interval, log_level, random_seed):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.action_size = action_size
        self.gamma = gamma
        self.local_t_max = local_t_max
        self.agent_type = agent_type
        self.performance_log_interval = performance_log_interval
        self.log_level = log_level

        if self.agent_type == 'LSTM':
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        self.local_network.prepare_loss(entropy_beta)

        with tf.device(device):
            var_refs = []
            variables = self.local_network.get_vars()
            for v in variables:
                var_refs.append(v)

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        np.random.seed(random_seed)
        self.game_state = GameState(random_seed * thread_index,
                                    self.action_size)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate
        self.learn_rate = self.initial_learning_rate

        self.reset_counters()

        self.episode = 0

        # variable controlling log output
        self.prev_local_t = 0
Example #3
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,task_index=""):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    if(global_network):
      self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients )
      self.sync = self.local_network.sync_from(global_network)
      self.mode="threading";
    else:
      self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(),
        self.gradients )
      self.mode="dist_tensor";
    if not (task_index): 
      self.game_state = GameState(113 * thread_index)
    else:
      self.game_state = GameState(113 * task_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #4
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v.ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

    tempdir = os.path.join(os.getcwd(), "results")
    self.res_file = os.path.join(tempdir, RESULTS_FILE)
    with open(self.res_file, 'w') as res_file:
      res_file.write('itr,mean_score,max,min,std,runs,test_steps\n')
Example #5
    def __init__(self, thread_index, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_episode,
                 device, arrived_jobs, condition):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_episode = max_global_time_episode

        # use thread_index (the machine id) to fetch all operations processed on this machine
        self.operations = get_data_by_machine(thread_index)
        self.condition = condition
        self.is_terminal_counted = False
        self.last_episode_reward = 0

        if USE_LSTM:
            # the first argument is the action size; here it is the number of operations to be processed on this machine
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            # the first argument is the action size; here it is the number of operations to be processed on this machine
            self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index,
                                                 device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(), self.gradients)

        # self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        # create the environment for these operations
        self.env = JspEnv(self.operations, thread_index, arrived_jobs)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
Example #6
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if NETWORK_TYPE == 'LSTM':
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        elif NETWORK_TYPE == 'DILATED':
            self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
        elif NETWORK_TYPE == 'CONV':
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize( self.local_network.total_loss,
                                       self.local_network.get_vars() )

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
          global_network.get_vars(),
          self.trainer.get_accum_grad_list() )

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
Example #7
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )
      
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #8
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients( # watch out: update global_network
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #9
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # STATE_SIZE = 6 - 3 Landmarks + 5 (comm-size)
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.epSteps = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
Example #10
def run_a3c(args):
    """Run A3C experiment."""
    GYM_ENV_NAME = args.gym_env.replace('-', '_')
    GAME_NAME = args.gym_env.replace('NoFrameskip-v4','')

    # setup folder name and path to folder
    folder = pathlib.Path(setup_folder(args, GYM_ENV_NAME))

    # setup GPU (if applicable)
    import tensorflow as tf
    gpu_options = setup_gpu(tf, args.use_gpu, args.gpu_fraction)

    ######################################################
    # setup default device
    device = "/cpu:0"

    global_t = 0
    rewards = {'train': {}, 'eval': {}}
    best_model_reward = -(sys.maxsize)
    if args.load_pretrained_model:
        class_rewards = {'class_eval': {}}

    # setup logging info for analysis, see Section 4.2 of the paper
    sil_dict = {
                # count number of SIL updates
                "sil_ctr":{},
                # total number of buffer D sampled during SIL
                "sil_a3c_sampled":{},
                # total number of buffer D samples (i.e., generated by A3C workers) used during SIL (i.e., passed max op)
                "sil_a3c_used":{},
                # the return of used samples for buffer D
                "sil_a3c_used_return":{},
                # total number of buffer R sampled during SIL
                "sil_rollout_sampled":{},
                # total number of buffer R samples (i.e., generated by refresher worker) used during SIL (i.e., passed max op)
                "sil_rollout_used":{},
                # the return of used samples for buffer R
                "sil_rollout_used_return":{},
                # number of old samples still used (even after refreshing)
                "sil_old_used":{}
                }
    sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return = 0, 0, 0, 0
    sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return = 0, 0, 0
    sil_old_used = 0


    rollout_dict = {
                    # total number of rollout performed
                    "rollout_ctr": {},
                    # total number of successful rollout (i.e., Gnew > G)
                    "rollout_added_ctr":{},
                    # the return of Gnew
                    "rollout_new_return":{},
                    # the return of G
                    "rollout_old_return":{}
                    }
    rollout_ctr, rollout_added_ctr = 0, 0
    rollout_new_return = 0 # this records the total, avg = this / rollout_added_ctr
    rollout_old_return = 0 # this records the total, avg = this / rollout_added_ctr

    # setup file names
    reward_fname = folder / '{}-a3c-rewards.pkl'.format(GYM_ENV_NAME)
    sil_fname = folder / '{}-a3c-dict-sil.pkl'.format(GYM_ENV_NAME)
    rollout_fname = folder / '{}-a3c-dict-rollout.pkl'.format(GYM_ENV_NAME)
    if args.load_pretrained_model:
        class_reward_fname = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME)

    sharedmem_fname = folder / '{}-sharedmem.pkl'.format(GYM_ENV_NAME)
    sharedmem_params_fname = folder / '{}-sharedmem-params.pkl'.format(GYM_ENV_NAME)
    sharedmem_trees_fname = folder / '{}-sharedmem-trees.pkl'.format(GYM_ENV_NAME)

    rolloutmem_fname = folder / '{}-rolloutmem.pkl'.format(GYM_ENV_NAME)
    rolloutmem_params_fname = folder / '{}-rolloutmem-params.pkl'.format(GYM_ENV_NAME)
    rolloutmem_trees_fname = folder / '{}-rolloutmem-trees.pkl'.format(GYM_ENV_NAME)

    # for removing older ckpt, save mem space
    prev_ckpt_t = -1

    stop_req = False

    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n
    game_state.close()
    del game_state.env
    del game_state

    input_shape = (args.input_shape, args.input_shape, 4)
    #######################################################
    # setup global A3C
    GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
    global_network = GameACFFNetwork(
        action_size, -1, device, padding=args.padding,
        in_shape=input_shape)
    logger.info('A3C Initial Learning Rate={}'.format(args.initial_learn_rate))

    # setup pretrained model
    global_pretrained_model = None
    local_pretrained_model = None
    pretrain_graph = None

    # if use pretrained model to refresh
    # then must load pretrained model
    # otherwise, don't load model
    if args.use_lider and args.nstep_bc > 0:
        assert args.load_pretrained_model, "refreshing with other policies, must load a pre-trained model (TA or BC)"
    else:
        assert not args.load_pretrained_model, "refreshing with the current policy, don't load pre-trained models"

    if args.load_pretrained_model:
        pretrain_graph, global_pretrained_model = setup_pretrained_model(tf,
            args, action_size, input_shape,
            device="/gpu:0" if args.use_gpu else device)
        assert global_pretrained_model is not None
        assert pretrain_graph is not None

    time.sleep(2.0)

    # setup experience memory
    shared_memory = None # => this is BufferD
    rollout_buffer = None # => this is BufferR
    if args.use_sil:
        shared_memory = SILReplayMemory(
            action_size, max_len=args.memory_length, gamma=args.gamma,
            clip=False if args.unclipped_reward else True,
            height=input_shape[0], width=input_shape[1],
            phi_length=input_shape[2], priority=args.priority_memory,
            reward_constant=args.reward_constant)

        if args.use_lider and not args.onebuffer:
            rollout_buffer = SILReplayMemory(
                action_size, max_len=args.memory_length, gamma=args.gamma,
                clip=False if args.unclipped_reward else True,
                height=input_shape[0], width=input_shape[1],
                phi_length=input_shape[2], priority=args.priority_memory,
                reward_constant=args.reward_constant)

        # log memory information
        shared_memory.log()
        if args.use_lider and not args.onebuffer:
            rollout_buffer.log()

    ############## Setup Thread Workers BEGIN ################
    # all experiments use 17 worker threads in total
    assert args.parallel_size == 17, "use 17 workers for all experiments"

    startIndex = 0
    all_workers = []

    # a3c and sil learning rate and optimizer
    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")
    grad_applier = tf.train.RMSPropOptimizer(
        learning_rate=learning_rate_input,
        decay=args.rmsp_alpha,
        epsilon=args.rmsp_epsilon)

    setup_common_worker(CommonWorker, args, action_size)

    # setup SIL worker
    sil_worker = None
    if args.use_sil:
        _device = "/gpu:0" if args.use_gpu else device

        sil_network = GameACFFNetwork(
            action_size, startIndex, device=_device,
            padding=args.padding, in_shape=input_shape)

        sil_worker = SILTrainingThread(startIndex, global_network, sil_network,
            args.initial_learn_rate,
            learning_rate_input,
            grad_applier, device=_device,
            batch_size=args.batch_size,
            use_rollout=args.use_lider,
            one_buffer=args.onebuffer,
            sampleR=args.sampleR)

        all_workers.append(sil_worker)
        startIndex += 1

    # setup refresh worker
    refresh_worker = None
    if args.use_lider:
        _device = "/gpu:0" if args.use_gpu else device

        refresh_network = GameACFFNetwork(
            action_size, startIndex, device=_device,
            padding=args.padding, in_shape=input_shape)

        refresh_local_pretrained_model = None
        # if refreshing with other policies
        if args.nstep_bc > 0:
            refresh_local_pretrained_model = PretrainedModelNetwork(
                pretrain_graph, action_size, startIndex,
                padding=args.padding,
                in_shape=input_shape, sae=False,
                tied_weights=False,
                use_denoising=False,
                noise_factor=0.3,
                loss_function='mse',
                use_slv=False, device=_device)

        refresh_worker = RefreshThread(
            thread_index=startIndex, action_size=action_size, env_id=args.gym_env,
            global_a3c=global_network, local_a3c=refresh_network,
            update_in_rollout=args.update_in_rollout, nstep_bc=args.nstep_bc,
            global_pretrained_model=global_pretrained_model,
            local_pretrained_model=refresh_local_pretrained_model,
            transformed_bellman=args.transformed_bellman,
            device=_device,
            entropy_beta=args.entropy_beta, clip_norm=args.grad_norm_clip,
            grad_applier=grad_applier,
            initial_learn_rate=args.initial_learn_rate,
            learning_rate_input=learning_rate_input)

        all_workers.append(refresh_worker)
        startIndex += 1

    # setup a3c workers
    setup_a3c_worker(A3CTrainingThread, args, startIndex)
    for i in range(startIndex, args.parallel_size):
        local_network = GameACFFNetwork(
            action_size, i, device="/cpu:0",
            padding=args.padding,
            in_shape=input_shape)

        a3c_worker = A3CTrainingThread(
            i, global_network, local_network,
            args.initial_learn_rate, learning_rate_input, grad_applier,
            device="/cpu:0", no_op_max=30)

        all_workers.append(a3c_worker)
    ############## Setup Thread Workers END ################

    # setup config for tensorflow
    config = tf.ConfigProto(
        gpu_options=gpu_options,
        log_device_placement=False,
        allow_soft_placement=True)

    # prepare sessions
    sess = tf.Session(config=config)
    pretrain_sess = None
    if global_pretrained_model:
        pretrain_sess = tf.Session(config=config, graph=pretrain_graph)

    # initial pretrained model
    if pretrain_sess:
        assert args.pretrained_model_folder is not None
        global_pretrained_model.load(
            pretrain_sess,
            args.pretrained_model_folder)

    sess.run(tf.global_variables_initializer())
    if global_pretrained_model:
        initialize_uninitialized(tf, pretrain_sess,
                                 global_pretrained_model)
    if local_pretrained_model:
        initialize_uninitialized(tf, pretrain_sess,
                                 local_pretrained_model)

    # summary writer for tensorboard
    summ_file = args.save_to+'log/a3c/{}/'.format(GYM_ENV_NAME) + str(folder)[58:] # str(folder)[12:]
    summary_writer = tf.summary.FileWriter(summ_file, sess.graph)

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    saver = tf.train.Saver(max_to_keep=1)
    best_saver = tf.train.Saver(max_to_keep=1)

    checkpoint = tf.train.get_checkpoint_state(str(folder)+'/model_checkpoints')
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))

        tmp_t = (global_t // args.eval_freq) * args.eval_freq
        logger.info(">>> tmp_t: {}".format(tmp_t))

        # set wall time
        wall_t = 0.

        # set up reward files
        best_reward_file = folder / 'model_best/best_model_reward'
        with best_reward_file.open('r') as f:
            best_model_reward = float(f.read())

        # restore rewards
        rewards = restore_dict(reward_fname, global_t)
        logger.info(">>> restored: rewards")

        # restore loggings
        sil_dict = restore_dict(sil_fname, global_t)
        sil_ctr = sil_dict['sil_ctr'][tmp_t]
        sil_a3c_sampled = sil_dict['sil_a3c_sampled'][tmp_t]
        sil_a3c_used = sil_dict['sil_a3c_used'][tmp_t]
        sil_a3c_used_return = sil_dict['sil_a3c_used_return'][tmp_t]
        sil_rollout_sampled = sil_dict['sil_rollout_sampled'][tmp_t]
        sil_rollout_used = sil_dict['sil_rollout_used'][tmp_t]
        sil_rollout_used_return = sil_dict['sil_rollout_used_return'][tmp_t]
        sil_old_used = sil_dict['sil_old_used'][tmp_t]
        logger.info(">>> restored: sil_dict")

        rollout_dict = restore_dict(rollout_fname, global_t)
        rollout_ctr = rollout_dict['rollout_ctr'][tmp_t]
        rollout_added_ctr = rollout_dict['rollout_added_ctr'][tmp_t]
        rollout_new_return = rollout_dict['rollout_new_return'][tmp_t]
        rollout_old_return = rollout_dict['rollout_old_return'][tmp_t]
        logger.info(">>> restored: rollout_dict")

        if args.load_pretrained_model:
            class_reward_file = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME)
            class_rewards = restore_dict(class_reward_file, global_t)

        # restore replay buffers (if saved)
        if args.checkpoint_buffer:
            # restore buffer D
            if args.use_sil and args.priority_memory:
                shared_memory = restore_buffer(sharedmem_fname, shared_memory, global_t)
                shared_memory = restore_buffer_trees(sharedmem_trees_fname, shared_memory, global_t)
                shared_memory = restore_buffer_params(sharedmem_params_fname, shared_memory, global_t)
                logger.info(">>> restored: shared_memory (Buffer D)")
                shared_memory.log()
                # restore buffer R
                if args.use_lider and not args.onebuffer:
                    rollout_buffer = restore_buffer(rolloutmem_fname, rollout_buffer, global_t)
                    rollout_buffer = restore_buffer_trees(rolloutmem_trees_fname, rollout_buffer, global_t)
                    rollout_buffer = restore_buffer_params(rolloutmem_params_fname, rollout_buffer, global_t)
                    logger.info(">>> restored: rollout_buffer (Buffer R)")
                    rollout_buffer.log()

        # if all restores okay, remove old ckpt to save storage space
        prev_ckpt_t = global_t

    else:
        logger.warning("Could not find old checkpoint")
        wall_t = 0.0
        prepare_dir(folder, empty=True)
        prepare_dir(folder / 'model_checkpoints', empty=True)
        prepare_dir(folder / 'model_best', empty=True)
        prepare_dir(folder / 'frames', empty=True)

    lock = threading.Lock()

    # next saving global_t
    def next_t(current_t, freq):
        return np.ceil((current_t + 0.00001) / freq) * freq
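    # e.g. next_t(1_230_000, 1_000_000) returns 2_000_000.0, and the small
    # epsilon makes next_t(2_000_000, 1_000_000) advance to 3_000_000.0 instead
    # of staying put (illustrative numbers, not values taken from args)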

    next_global_t = next_t(global_t, args.eval_freq)
    next_save_t = next_t(
        global_t, args.eval_freq*args.checkpoint_freq)

    step_t = 0

    def train_function(parallel_idx, th_ctr, ep_queue, net_updates):
        nonlocal global_t, step_t, rewards, class_rewards, lock, \
                 next_save_t, next_global_t, prev_ckpt_t
        nonlocal shared_memory, rollout_buffer
        nonlocal sil_dict, sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return, \
                 sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return, \
                 sil_old_used
        nonlocal rollout_dict, rollout_ctr, rollout_added_ctr, \
                 rollout_new_return, rollout_old_return

        parallel_worker = all_workers[parallel_idx]
        parallel_worker.set_summary_writer(summary_writer)

        with lock:
            # Evaluate model before training
            if not stop_req and global_t == 0 and step_t == 0:
                rewards['eval'][step_t] = parallel_worker.testing(
                    sess, args.eval_max_steps, global_t, folder,
                    worker=all_workers[-1])

                # testing pretrained TA or BC in game
                if args.load_pretrained_model:
                    assert pretrain_sess is not None
                    assert global_pretrained_model is not None
                    class_rewards['class_eval'][step_t] = \
                        parallel_worker.test_loaded_classifier(global_t=global_t,
                                                    max_eps=50, # testing 50 episodes
                                                    sess=pretrain_sess,
                                                    worker=all_workers[-1],
                                                    model=global_pretrained_model)
                    # log pretrained model performance
                    class_eval_file = pathlib.Path(args.pretrained_model_folder[:21]+\
                        str(GAME_NAME)+"/"+str(GAME_NAME)+'-model-eval.txt')
                    class_std = np.std(class_rewards['class_eval'][step_t][-1])
                    class_mean = np.mean(class_rewards['class_eval'][step_t][-1])
                    with class_eval_file.open('w') as f:
                        f.write("class_mean: \n" + str(class_mean) + "\n")
                        f.write("class_std: \n" + str(class_std) + "\n")
                        f.write("class_rewards: \n" + str(class_rewards['class_eval'][step_t][-1]) + "\n")

                checkpt_file = folder / 'model_checkpoints'
                checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME)
                saver.save(sess, str(checkpt_file), global_step=global_t)
                save_best_model(rewards['eval'][global_t][0])

                # saving worker info to dicts for analysis
                sil_dict['sil_ctr'][step_t] = sil_ctr
                sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled
                sil_dict['sil_a3c_used'][step_t] = sil_a3c_used
                sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return
                sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled
                sil_dict['sil_rollout_used'][step_t] = sil_rollout_used
                sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return
                sil_dict['sil_old_used'][step_t] = sil_old_used

                rollout_dict['rollout_ctr'][step_t] = rollout_ctr
                rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr
                rollout_dict['rollout_new_return'][step_t] = rollout_new_return
                rollout_dict['rollout_old_return'][step_t] = rollout_old_return

                # dump pickle
                dump_pickle([rewards, sil_dict, rollout_dict],
                            [reward_fname, sil_fname, rollout_fname],
                            global_t)
                if args.load_pretrained_model:
                    dump_pickle([class_rewards], [class_reward_fname], global_t)

                logger.info('Dump pickle at step {}'.format(global_t))

                # save replay buffer (only works under priority mem)
                if args.checkpoint_buffer:
                    if shared_memory is not None and args.priority_memory:
                        params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority]
                        trees = [shared_memory.buff._it_sum._value,
                                 shared_memory.buff._it_min._value]
                        dump_pickle([shared_memory.buff._storage, params, trees],
                                    [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname],
                                    global_t)
                        logger.info('Saving shared_memory')

                    if rollout_buffer is not None and args.priority_memory:
                        params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority]
                        trees = [rollout_buffer.buff._it_sum._value,
                                 rollout_buffer.buff._it_min._value]
                        dump_pickle([rollout_buffer.buff._storage, params, trees],
                                    [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname],
                                    global_t)
                        logger.info('Saving rollout_buffer')

                prev_ckpt_t = global_t

                step_t = 1

        # set start_time
        start_time = time.time() - wall_t
        parallel_worker.set_start_time(start_time)

        if parallel_worker.is_sil_thread:
            sil_interval = 0  # bigger number => slower SIL updates
            m_repeat = 4
            min_mem = args.batch_size * m_repeat
            sil_train_flag = len(shared_memory) >= min_mem

        while True:
            if stop_req:
                return

            if global_t >= (args.max_time_step * args.max_time_step_fraction):
                return

            if parallel_worker.is_sil_thread:
                # before sil starts, init local count
                local_sil_ctr = 0
                local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0
                local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0
                local_sil_old_used = 0

                if net_updates.qsize() >= sil_interval \
                   and len(shared_memory) >= min_mem:
                    sil_train_flag = True

                if sil_train_flag:
                    sil_train_flag = False

                    th_ctr.get()

                    train_out = parallel_worker.sil_train(
                        sess, global_t, shared_memory, m_repeat,
                        rollout_buffer=rollout_buffer)

                    local_sil_ctr, local_sil_a3c_sampled, local_sil_a3c_used, \
                       local_sil_a3c_used_return, \
                       local_sil_rollout_sampled, local_sil_rollout_used, \
                       local_sil_rollout_used_return, \
                       local_sil_old_used = train_out

                    th_ctr.put(1)

                    with net_updates.mutex:
                        net_updates.queue.clear()

                    if args.use_lider:
                        parallel_worker.record_sil(sil_ctr=sil_ctr,
                                              total_used=(sil_a3c_used + sil_rollout_used),
                                              num_a3c_used=sil_a3c_used,
                                              a3c_used_return=sil_a3c_used_return/(sil_a3c_used+1),  # add one to avoid division by zero
                                              rollout_used=sil_rollout_used,
                                              rollout_used_return=sil_rollout_used_return/(sil_rollout_used+1),
                                              old_used=sil_old_used,
                                              global_t=global_t)

                        if sil_ctr % 200 == 0 and sil_ctr > 0:
                            rollout_buffsize = 0
                            if not args.onebuffer:
                                rollout_buffsize = len(rollout_buffer)
                            log_data = (sil_ctr, len(shared_memory),
                                        rollout_buffsize,
                                        sil_a3c_used+sil_rollout_used,
                                        args.batch_size*sil_ctr,
                                        sil_a3c_used,
                                        sil_a3c_used_return/(sil_a3c_used+1),
                                        sil_rollout_used,
                                        sil_rollout_used_return/(sil_rollout_used+1),
                                        sil_old_used)
                            logger.info("SIL: sil_ctr={0:}"
                                        " sil_memory_size={1:}"
                                        " rollout_buffer_size={2:}"
                                        " total_sample_used={3:}/{4:}"
                                        " a3c_used={5:}"
                                        " a3c_used_return_avg={6:.2f}"
                                        " rollout_used={7:}"
                                        " rollout_used_return_avg={8:.2f}"
                                        " old_used={9:}".format(*log_data))
                    else:
                        parallel_worker.record_sil(sil_ctr=sil_ctr,
                                                   total_used=(sil_a3c_used + sil_rollout_used),
                                                   num_a3c_used=sil_a3c_used,
                                                   rollout_used=sil_rollout_used,
                                                   global_t=global_t)
                        if sil_ctr % 200 == 0 and sil_ctr > 0:
                            log_data = (sil_ctr, sil_a3c_used+sil_rollout_used,
                                        args.batch_size*sil_ctr,
                                        sil_a3c_used,
                                        len(shared_memory))
                            logger.info("SIL: sil_ctr={0:}"
                                        " total_sample_used={1:}/{2:}"
                                        " a3c_used={3:}"
                                        " sil_memory_size={4:}".format(*log_data))

                # Adding episodes to SIL memory is centralized to ensure that
                # sampling and updating of priorities does not become a problem:
                # new episodes are added to SIL all at once, and SIL memory is
                # guaranteed to be untouched during SIL training.
                max_adds = args.parallel_size
                while not ep_queue.empty():
                    data = ep_queue.get()
                    parallel_worker.episode.set_data(*data)
                    shared_memory.extend(parallel_worker.episode)
                    parallel_worker.episode.reset()
                    max_adds -= 1
                    if max_adds <= 0:  # this ensures that SIL has a chance to train
                        break

                diff_global_t = 0

                # centralized rollout counting
                local_rollout_ctr, local_rollout_added_ctr = 0, 0
                local_rollout_new_return, local_rollout_old_return = 0, 0

            elif parallel_worker.is_refresh_thread:
                # before refresh starts, init local count
                diff_global_t = 0
                local_rollout_ctr, local_rollout_added_ctr = 0, 0
                local_rollout_new_return, local_rollout_old_return = 0, 0

                if len(shared_memory) >= 1:
                    th_ctr.get()
                    # randomly sample a state from buffer D
                    sample = shared_memory.sample_one_random()
                    # after sampling, flip refreshed to True
                    # TODO: fix this so that only *successful* refreshes are flipped to True;
                    # currently *all* refreshes are counted as True
                    assert sample[-1] == True

                    train_out = parallel_worker.rollout(sess, folder, pretrain_sess,
                                                        global_t, sample,
                                                        args.addall,
                                                        args.max_ep_step,
                                                        args.nstep_bc,
                                                        args.update_in_rollout)

                    diff_global_t, episode_end, part_end, local_rollout_ctr, \
                        local_rollout_added_ctr, add, local_rollout_new_return, \
                        local_rollout_old_return = train_out

                    th_ctr.put(1)

                    if rollout_ctr % 20 == 0 and rollout_ctr > 0:
                        log_msg = "ROLLOUT: rollout_ctr={} added_rollout_ct={} worker={}".format(
                        rollout_ctr, rollout_added_ctr, parallel_worker.thread_idx)
                        logger.info(log_msg)
                        logger.info("ROLLOUT Gnew: {}, G: {}".format(local_rollout_new_return,
                                                                     local_rollout_old_return))

                    # should always part_end, i.e., end of episode
                    # and only add if new return is better (if not LiDER-AddAll)
                    if part_end and add:
                        if not args.onebuffer:
                            # directly put into Buffer R
                            rollout_buffer.extend(parallel_worker.episode)
                        else:
                            # Buffer D add sample is centralized when OneBuffer
                            ep_queue.put(parallel_worker.episode.get_data())

                    parallel_worker.episode.reset()

                # centralized SIL counting
                local_sil_ctr = 0
                local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0
                local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0
                local_sil_old_used = 0

            # a3c training thread worker
            else:
                th_ctr.get()

                train_out = parallel_worker.train(sess, global_t, rewards)
                diff_global_t, episode_end, part_end = train_out

                th_ctr.put(1)

                if args.use_sil:
                    net_updates.put(1)
                    if part_end:
                        ep_queue.put(parallel_worker.episode.get_data())
                        parallel_worker.episode.reset()

                # centralized SIL counting
                local_sil_ctr = 0
                local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0
                local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0
                local_sil_old_used = 0
                # centralized rollout counting
                local_rollout_ctr, local_rollout_added_ctr = 0, 0
                local_rollout_new_return, local_rollout_old_return = 0, 0

            # ensure only one thread is updating global_t at a time
            with lock:
                global_t += diff_global_t

                # centralize increasing count for SIL and Rollout
                sil_ctr += local_sil_ctr
                sil_a3c_sampled += local_sil_a3c_sampled
                sil_a3c_used += local_sil_a3c_used
                sil_a3c_used_return += local_sil_a3c_used_return
                sil_rollout_sampled += local_sil_rollout_sampled
                sil_rollout_used += local_sil_rollout_used
                sil_rollout_used_return += local_sil_rollout_used_return
                sil_old_used += local_sil_old_used

                rollout_ctr += local_rollout_ctr
                rollout_added_ctr += local_rollout_added_ctr
                rollout_new_return += local_rollout_new_return
                rollout_old_return += local_rollout_old_return

                # if, during a thread's update, global_t has reached an evaluation interval
                if global_t > next_global_t:
                    next_global_t = next_t(global_t, args.eval_freq)
                    step_t = int(next_global_t - args.eval_freq)

                    # wait for all threads to be done before testing
                    while not stop_req and th_ctr.qsize() < len(all_workers):
                        time.sleep(0.001)

                    step_t = int(next_global_t - args.eval_freq)

                    # Evaluate for 125,000 steps
                    rewards['eval'][step_t] = parallel_worker.testing(
                        sess, args.eval_max_steps, step_t, folder,
                        worker=all_workers[-1])
                    save_best_model(rewards['eval'][step_t][0])
                    last_reward = rewards['eval'][step_t][0]

                    # saving worker info to dicts
                    # SIL
                    sil_dict['sil_ctr'][step_t] = sil_ctr
                    sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled
                    sil_dict['sil_a3c_used'][step_t] = sil_a3c_used
                    sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return
                    sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled
                    sil_dict['sil_rollout_used'][step_t] = sil_rollout_used
                    sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return
                    sil_dict['sil_old_used'][step_t] = sil_old_used
                    # ROLLOUT
                    rollout_dict['rollout_ctr'][step_t] = rollout_ctr
                    rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr
                    rollout_dict['rollout_new_return'][step_t] = rollout_new_return
                    rollout_dict['rollout_old_return'][step_t] = rollout_old_return

                    # save ckpt after done with eval
                    if global_t > next_save_t:
                        next_save_t = next_t(global_t, args.eval_freq*args.checkpoint_freq)

                        # dump pickle
                        dump_pickle([rewards, sil_dict, rollout_dict],
                                    [reward_fname, sil_fname, rollout_fname],
                                    global_t)
                        if args.load_pretrained_model:
                            dump_pickle([class_rewards], [class_reward_fname], global_t)
                        logger.info('Dump pickle at step {}'.format(global_t))

                        # save replay buffer (only works for priority mem for now)
                        if args.checkpoint_buffer:
                            if shared_memory is not None and args.priority_memory:
                                params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority]
                                trees = [shared_memory.buff._it_sum._value,
                                         shared_memory.buff._it_min._value]
                                dump_pickle([shared_memory.buff._storage, params, trees],
                                            [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname],
                                            global_t)
                                logger.info('Saved shared_memory')

                            if rollout_buffer is not None and args.priority_memory:
                                params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority]
                                trees = [rollout_buffer.buff._it_sum._value,
                                         rollout_buffer.buff._it_min._value]
                                dump_pickle([rollout_buffer.buff._storage, params, trees],
                                            [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname],
                                            global_t)
                                logger.info('Saved rollout_buffer')

                        # save a3c after saving buffer -- in case saving buffer OOM
                        # so that at least we can revert back to the previous ckpt
                        checkpt_file = folder / 'model_checkpoints'
                        checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME)
                        saver.save(sess, str(checkpt_file), global_step=global_t,
                                   write_meta_graph=False)
                        logger.info('Saved model ckpt')

                        # if everything saves okay, clean up previous ckpt to save space
                        remove_pickle([reward_fname, sil_fname, rollout_fname],
                                      prev_ckpt_t)
                        if args.load_pretrained_model:
                            remove_pickle([class_reward_fname], prev_ckpt_t)

                        remove_pickle([sharedmem_fname, sharedmem_params_fname,
                                       sharedmem_trees_fname],
                                      prev_ckpt_t)
                        if rollout_buffer is not None and args.priority_memory:
                            remove_pickle([rolloutmem_fname, rolloutmem_params_fname,
                                           rolloutmem_trees_fname],
                                          prev_ckpt_t)

                        logger.info('Removed ckpt from step {}'.format(prev_ckpt_t))

                        prev_ckpt_t = global_t


    def signal_handler(signal, frame):
        nonlocal stop_req
        logger.info('You pressed Ctrl+C!')
        stop_req = True

        if stop_req and global_t == 0:
            sys.exit(1)

    def save_best_model(test_reward):
        nonlocal best_model_reward
        if test_reward > best_model_reward:
            best_model_reward = test_reward
            best_reward_file = folder / 'model_best/best_model_reward'

            with best_reward_file.open('w') as f:
                f.write(str(best_model_reward))

            best_checkpt_file = folder / 'model_best'
            best_checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME)
            best_saver.save(sess, str(best_checkpt_file))


    train_threads = []
    th_ctr = Queue()
    for i in range(args.parallel_size):
        th_ctr.put(1)

    episodes_queue = None
    net_updates = None
    if args.use_sil:
        episodes_queue = Queue()
        net_updates = Queue()

    for i in range(args.parallel_size):
        worker_thread = Thread(
            target=train_function,
            args=(i, th_ctr, episodes_queue, net_updates,))
        train_threads.append(worker_thread)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # set start time
    start_time = time.time() - wall_t

    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')

    for t in train_threads:
        t.join()

    logger.info('Now saving data. Please wait')

    # write wall time
    wall_t = time.time() - start_time
    wall_t_fname = folder / 'wall_t.{}'.format(global_t)
    with wall_t_fname.open('w') as f:
        f.write(str(wall_t))

    # save final model
    checkpoint_file = str(folder / '{}_checkpoint_a3c'.format(GYM_ENV_NAME))
    root_saver.save(sess, checkpoint_file, global_step=global_t)

    dump_final_pickle([rewards, sil_dict, rollout_dict],
                      [reward_fname, sil_fname, rollout_fname])

    logger.info('Data saved!')

    # if everything saves okay & is done training (not because of pressed Ctrl+C),
    # clean up previous ckpt to save space
    if global_t >= (args.max_time_step * args.max_time_step_fraction):
        remove_pickle([reward_fname, sil_fname, rollout_fname],
                      prev_ckpt_t)
        if args.load_pretrained_model:
            remove_pickle([class_reward_fname], prev_ckpt_t)

        remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname],
                      prev_ckpt_t)
        if rollout_buffer is not None and args.priority_memory:
            remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname],
                          prev_ckpt_t)

        logger.info('Done training, removed ckpt from step {}'.format(prev_ckpt_t))


    sess.close()
    if pretrain_sess:
        pretrain_sess.close()
Example #11
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0


  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate
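  # Illustrative numbers (not this repo's settings): with initial_learning_rate
  # = 7e-4 and max_global_time_step = 1e8, the annealed rate is 3.5e-4 halfway
  # through training and is clamped to 0.0 once global_time_step passes the max.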

  def choose_action(self, pi_values):
    # sample an action index in proportion to the policy probabilities
    values = []
    total = 0.0
    for rate in pi_values:
      total += rate
      values.append(total)

    r = random.random() * total
    for i in range(len(values)):
      if values[i] >= r:
        return i
    # fail safe
    return len(values) - 1
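  # Roughly equivalent sampling with numpy (assuming pi_values sums to ~1, as a
  # softmax policy output should): np.random.choice(len(pi_values), p=pi_values)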

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

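    # Bootstrap the return from the critic's estimate of the last state unless the episode terminated.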
    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)] } )
    else:
      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R} )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
import matplotlib.pyplot as plt

from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier

import options
options = options.options

# use CPU for weight visualize tool
device = "/cpu:0"

if options.use_lstm:
  global_network = GameACLSTMNetwork(options.action_size, -1, device)
else:
  global_network = GameACFFNetwork(options.action_size, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = options.rmsp_alpha,
                              momentum = 0.0,
                              epsilon = options.rmsp_epsilon,
                              clip_norm = options.grad_norm_clip,
                              device = device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
    def __init__(self,
                 thread_index,
                 global_network,
                 pinitial_learning_rate,
                 plearning_rate_input,
                 pgrad_applier,
                 vinitial_learning_rate,
                 vlearning_rate_input,
                 vgrad_applier,
                 max_global_time_step,
                 device,
                 task_index=""):

        self.thread_index = thread_index
        self.plearning_rate_input = plearning_rate_input
        self.vlearning_rate_input = vlearning_rate_input
        self.max_global_time_step = max_global_time_step
        self.game_state = GameState()
        state = self.game_state.reset()
        self.game_state.reset_gs(state)
        self.action_size = self.game_state.action_size
        self.state_size = self.game_state.state_size
        self.local_max_iter = self.game_state.local_max_iter

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   self.state_size,
                                                   self.game_state.action_low,
                                                   self.game_state.action_high,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 self.state_size,
                                                 self.game_state.action_low,
                                                 self.game_state.action_high,
                                                 thread_index, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
            self.policy_gradients = tf.gradients(
                self.local_network.policy_loss,
                pvar_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)
            vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
            self.value_gradients = tf.gradients(
                self.local_network.value_loss,
                vvar_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        self.apply_policy_gradients = pgrad_applier.apply_gradients(
            self.local_network.get_pvars(), self.policy_gradients)
        self.apply_value_gradients = vgrad_applier.apply_gradients(
            self.local_network.get_vvars(), self.value_gradients)

        self.local_t = 0

        self.pinitial_learning_rate = pinitial_learning_rate
        self.vinitial_learning_rate = vinitial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
Example #14
0
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index,
                                                 device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
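        # Draw one action index with probability given by the policy output pi_values.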
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        self.episode_reward = 0
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            # print(self.episode_reward)

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            # self._record_score(sess, summary_writer, summary_op, score_input,
            #                    self.episode_reward, global_t)

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                # self._record_score(sess, summary_writer, summary_op, score_input,
                #                    self.episode_reward, global_t)

                # self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break
        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_a)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.learning_rate_input: cur_learning_rate
                     })

        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
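The reversed loop above folds each clipped reward into a discounted n-step return R and a TD advantage before the batch is fed to the optimizer. Below is a minimal standalone sketch of that bookkeeping in plain NumPy; the function name and the dummy numbers are illustrative, not taken from any repository above.

# Sketch only: backward computation of discounted n-step returns and TD advantages,
# as done in the reversed for-loop of process() above.
import numpy as np

def n_step_returns(rewards, values, bootstrap_value, gamma=0.99):
    R = bootstrap_value                      # 0.0 if the episode terminated, else V(s_T)
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R                    # fold the next reward into the return
        returns.append(R)
        advantages.append(R - v)             # advantage = n-step return minus the critic's value
    returns.reverse()
    advantages.reverse()
    return np.array(returns), np.array(advantages)

# dummy rollout of three steps
rets, advs = n_step_returns([0.0, 1.0, 0.0], [0.5, 0.6, 0.4], bootstrap_value=0.3)
print(rets, advs)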
Example #15
0
def display(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
            agent_type, action_size, rand_seed, checkpoint_dir,
            display_time_sleep, display_episodes, display_log_level,
            display_save_log, show_max):

    # use CPU for display tool
    device = "/cpu:0"

    LOG_FILE = 'log_{}-{}.txt'.format(experiment_name, agent_type)

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    episode = 0
    terminal = False

    episode_rewards = []
    episode_steps = []
    episode_passed_obsts = []
    print(' ')
    print('DISPLAYING {} EPISODES'.format(display_episodes))
    print('--------------------------------------------------- ')

    while episode != display_episodes:
        episode_reward = 0
        episode_passed_obst = 0

        game_state = GameState(rand_seed, action_size, show_score=True)

        if display_log_level == 'FULL':
            print('EPISODE {}'.format(episode))

        full_frame = None
        while True:
            pi_values, value = global_network.run_policy_and_value(
                sess, game_state.s_t)
            action = choose_action(pi_values)
            game_state.process(action)
            terminal = game_state.terminal
            episode_step = game_state.steps
            reward = game_state.reward
            passed_obst = game_state.passed_obst
            if len(episode_passed_obsts) == 0 and show_max:
                if passed_obst > 0:
                    full_frame = game_state.full_frame
            elif episode_passed_obst > np.max(
                    episode_passed_obsts) and show_max:
                full_frame = game_state.full_frame

            episode_reward += reward
            episode_passed_obst = passed_obst

            if display_log_level == 'FULL':
                print('step  /  pi_values: {}  /  value: {}  /  action: {}  /  reward: {}  /  passed_obst: {}'.format(
                    pi_values, value, action, reward, passed_obst))

            time.sleep(display_time_sleep)

            if not terminal:
                game_state.update()
            else:
                break

        episode_rewards.append(episode_reward)
        episode_steps.append(episode_step)
        episode_passed_obsts.append(episode_passed_obst)

        reward_steps = format(
            float(episode_reward) / float(episode_step), '.4f')

        if display_log_level != 'NONE':
            print("EPISODE: {}  /  STEPS: {}  /  PASSED OBST: {}  /  REWARD: {}  /  REWARD/STEP: {}".format(
                episode, episode_step, passed_obst, episode_reward,
                reward_steps))

        if display_save_log:
            with open(LOG_FILE, "a") as text_file:
                text_file.write('{},{},{},{},{}\n'.format(
                    episode, episode_step, passed_obst, episode_reward,
                    reward_steps))

        episode += 1

    print('--------------------------------------------------- ')
    print('DISPLAY SESSION FINISHED')
    print('TOTAL EPISODES: {}'.format(display_episodes))
    print(' ')
    print('MIN')
    print('REWARD: {}  /  STEPS: {}  /  PASSED OBST: {}'.format(
        np.min(episode_rewards), np.min(episode_steps),
        np.min(episode_passed_obsts)))
    print(' ')
    print('AVERAGE')
    print('REWARD: {}  /  STEPS: {}  /  PASSED OBST: {}'.format(
        np.average(episode_rewards), np.average(episode_steps),
        np.average(episode_passed_obsts)))
    print(' ')
    print('MAX')
    print('REWARD: {}  /   STEPS: {}  /   PASSED OBST: {}'.format(
        np.max(episode_rewards), np.max(episode_steps),
        np.max(episode_passed_obsts)))

    if show_max and full_frame is not None:
        plt.imshow(full_frame, origin='lower')
        plt.show()
Example #16
0
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, action_size, gamma, local_t_max, entropy_beta,
                 agent_type, performance_log_interval, log_level, random_seed):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input  # different for each worker
        self.max_global_time_step = max_global_time_step  # 40 million steps

        self.action_size = action_size  # 2
        self.gamma = gamma  # 0.99
        self.local_t_max = local_t_max  # 256
        self.agent_type = agent_type  # FF
        self.performance_log_interval = performance_log_interval
        self.log_level = log_level

        # initialize this worker's local network
        if self.agent_type == 'LSTM':
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)
        # set up the loss-related variables
        self.local_network.prepare_loss(entropy_beta)

        with tf.device(device):
            # get the worker network's parameters
            #[self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2,self.W_fc1, self.b_fc1,self.W_fc2, self.b_fc2,self.W_fc3, self.b_fc3]
            var_refs = []
            variables = self.local_network.get_vars()
            for v in variables:
                var_refs.append(v)
            # compute the gradients
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)
        # update the global network
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        # pull the global network's parameters
        self.sync = self.local_network.sync_from(global_network)

        # initialize the game environment
        np.random.seed(random_seed)
        self.game_state = GameState(random_seed * thread_index,
                                    self.action_size)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate
        self.learn_rate = self.initial_learning_rate

        # reset some counters
        self.reset_counters()

        self.episode = 0

        # variable controlling log output
        self.prev_local_t = 0
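Every example anneals its learning rate linearly to zero over the training budget in _anneal_learning_rate. A tiny standalone sketch of that schedule follows; the function name, the 7e-4 initial rate, and the 40M-step budget are illustrative assumptions, not values from the code above.

# Sketch only: the linear learning-rate schedule used by _anneal_learning_rate.
def anneal_learning_rate(initial_lr, global_step, max_global_steps):
    lr = initial_lr * (max_global_steps - global_step) / max_global_steps
    return max(lr, 0.0)  # clamp at zero if training runs past the budget

for step in (0, 20_000_000, 40_000_000, 45_000_000):
    print(step, anneal_learning_rate(7e-4, step, 40_000_000))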
Example #17
0
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values, global_t):
    # Add greediness for broader exploration
    r = random.random()
    if r < self.greediness:
      action =  int(r * len(pi_values))
    elif r < self.repeat_action_ratio:
      action = self.prev_action
    else:
      # Increase randomness of choice if no reward term is too long
      if self.no_reward_steps > self.options.no_reward_steps:
        randomness = (self.no_reward_steps - self.options.no_reward_steps) * self.options.randomness
        pi_values += randomness
        pi_values /= sum(pi_values)
        if self.local_t % self.options.randomness_log_interval == 0:
          elapsed_time = time.time() - self.start_time
          print("t={:6.0f},s={:9d},th={}:{}randomness={:.8f}".format(
                elapsed_time, global_t, self.thread_index, self.indent, randomness))

      pi_values -= np.finfo(np.float32).epsneg
      action_samples = np.random.multinomial(self.options.num_experiments, pi_values)
      action = action_samples.argmax(0)

    self.prev_action = action
    return action

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  #@profile
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []
    liveses = [self.game_state.lives]
    if self.tes > 0:
      if self.episode_liveses == []:
        self.episode_liveses.append(self.game_state.lives)

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if self.options.use_lstm:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(self.options.local_t_max):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_, global_t)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)
      liveses.append(self.game_state.lives)

      if (self.thread_index == 0) and (self.local_t % self.options.log_interval == 0):
        print("pi={} (thread{})".format(pi_, self.thread_index))
        print(" V={} (thread{})".format(value_, self.thread_index))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward
      if reward > 0 and \
         (self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0"):
        elapsed_time = time.time() - self.start_time
        print("t={:6.0f},s={:4.0f},th={}:{}r={:3.0f}RM{:02d}| NEW-SCORE".format(
              elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward,
              self.game_state.room_no))

      # pseudo-count reward
      if self.options.psc_use:
        reward += self.game_state.psc_reward

      # add basic income after some no reward steps
      if self.no_reward_steps > self.options.no_reward_steps:
        reward += self.options.basic_income

      # clip reward
      if self.options.reward_clip > 0.0:
        reward = np.clip(reward, -self.options.reward_clip, self.options.reward_clip)
      rewards.append( reward )

      # collect episode log
      if self.tes > 0:
        self.episode_states.append(self.game_state.s_t)
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)
        self.episode_values.append(value_)
        self.episode_liveses.append(self.game_state.lives)
        if len(self.episode_states) > self.max_history * 2:
          self.episode_states = self.episode_states[-self.max_history:]
          self.episode_actions = self.episode_actions[-self.max_history:]
          self.episode_rewards = self.episode_rewards[-self.max_history:]
          self.episode_values = self.episode_values[-self.max_history:]
          self.episode_liveses = self.episode_liveses[-self.max_history-1:]
        # requirement for OpenAI Gym: --clear-history-on-death=False
        if self.options.clear_history_on_death and (liveses[-2] > liveses[-1]):
          self.episode_states = []
          self.episode_actions = []
          self.episode_rewards = []
          self.episode_values = []
          self.episode_liveses = self.episode_liveses[-2:]
 
      self.local_t += 1

      if self.options.record_new_record_dir is not None \
         or self.options.record_new_room_dir is not None:
        screen = self.game_state.uncropped_screen
        if self.options.compress_frame:
          screen = lzma.compress(screen.tobytes(), preset=0)
        self.episode_screens.append(screen)

      # terminate if the play time is too long
      self.steps += 1
      if self.steps > self.options.max_play_steps:
        terminal = True

      # requirement for OpenAI Gym: --terminate-on-lives-lost=False
      # terminate if lives lost
      if self.terminate_on_lives_lost and (liveses[-2] > liveses[-1]):
        terminal = True

      # count no reward steps
      if self.game_state.reward == 0.0:
        self.no_reward_steps += 1
      else:
        self.no_reward_steps = 0

      # s_t1 -> s_t
      self.game_state.update()
      
      if self.local_t % self.options.score_log_interval == 0:
        elapsed_time = time.time() - self.start_time
        print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
              elapsed_time, global_t, self.thread_index, self.indent,
              self.episode_reward, self.game_state.room_no,
              self.game_state.lives, value_, self.game_state.psc_reward))

      # if self.game_state.room_no != self.game_state.prev_room_no:
      #   elapsed_time = time.time() - self.start_time
      #   print("t={:6.0f},s={:9d},th={}:{}RM{:02d}>RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
      #         elapsed_time, global_t, self.thread_index, self.indent, 
      #         self.game_state.prev_room_no, self.game_state.room_no,
      #         self.game_state.lives, value_, self.game_state.psc_reward))

      if self.tes > 0:
        if self.game_state.lives < self.episode_liveses[-2]:
          elapsed_time = time.time() - self.start_time
          print("t={:6.0f},s={:9d},th={}:{}l={:.0f}>{:.0f}RM{:02d}|".format(
                elapsed_time, global_t, self.thread_index, self.indent, 
                self.episode_liveses[-2], self.game_state.lives, self.game_state.room_no))

      # separate steps after getting reward
      if self.game_state.reward > 0:
        if not terminal:
          break

      if terminal:
        terminal_end = True
        elapsed_time = time.time() - self.start_time
        end_mark = "end" if self.terminate_on_lives_lost else "END"
        print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}@{}|".format(
              elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, end_mark))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        if self.tes > 0:
          if self.options.record_new_room_dir is not None \
             and self.game_state.new_room >= 0:
            dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t,  self.thread_index,\
                       self.episode_reward, self.game_state.new_room)
            dirname = os.path.join(self.options.record_new_room_dir, dirname)
            os.makedirs(dirname)
            for index, screen in enumerate(self.episode_screens):
              filename = "{:06d}.png".format(index)
              filename = os.path.join(dirname, filename)
              screen_image = screen
              if self.options.compress_frame:
                screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160))
              cv2.imwrite(filename, screen_image)
            print("@@@ New Room record screens saved to {}".format(dirname))

          if self.episode_reward > self.max_episode_reward:
            if self.options.record_new_record_dir is not None:
              dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t,  self.thread_index,\
                         self.episode_reward, self.game_state.room_no)
              dirname = os.path.join(self.options.record_new_record_dir, dirname)
              os.makedirs(dirname)
              for index, screen in enumerate(self.episode_screens):
                filename = "{:06d}.png".format(index)
                filename = os.path.join(dirname, filename)
                screen_image = screen
                if self.options.compress_frame:
                  screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160))
                cv2.imwrite(filename, screen_image)
              print("@@@ New Record screens saved to {}".format(dirname))
            self.max_episode_reward = self.episode_reward
            if self.options.record_all_non0_record:
              self.max_episode_reward = 0

          self.max_reward = 0.0
          self.episode_states = []
          self.episode_actions = []
          self.episode_rewards = []
          self.episode_values = []
          self.episode_liveses = []
          self.episode_scores.add(self.episode_reward, global_t, self.thread_index)
          if self.options.record_new_record_dir is not None \
             or self.options.record_new_room_dir is not None:
            self.episode_screens= []

        self.episode_reward = 0
        self.steps = 0
        self.no_reward_steps = 0
        self.game_state.reset()
        if self.options.use_lstm:
          self.local_network.reset_state()
        break

    if self.thread_index == 0 and self.local_t % self.options.performance_log_interval < self.options.local_t_max:
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
            global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    if self.options.gym_eval:
      diff_local_t = self.local_t - start_local_t
      return diff_local_t, terminal_end

    # don't train if following condition
    # requirement for OpenAI Gym: --terminate-on-lives-lost=False
    if self.options.terminate_on_lives_lost and (self.thread_index == 0) and (not self.options.train_in_eval):
      return 0, terminal_end
    else:
      if self.tes > 0:
        _ = self.episode_scores.is_highscore(self.episode_reward)
        if self.episode_reward > self.max_reward:
          self.max_reward = self.episode_reward
          if True:
            tes = self.tes
            # requirement for OpenAI Gym: --test-extend=False
            if self.options.tes_extend and self.initial_lives != 0:
              tes *= self.options.tes_extend_ratio * (self.game_state.lives / self.initial_lives)
              if self.game_state.lives == self.initial_lives:
                tes *= 2
              tes = int(tes)
            tes = min(tes, len(self.episode_states))
            print("[OHL]SCORE={:3.0f},s={:9d},th={},lives={},steps={},tes={},RM{:02d}".format(self.episode_reward,  global_t, self.thread_index, self.game_state.lives, self.steps, tes, self.game_state.room_no))
            if tes == 0:
              states = []
              actions = []
              rewards = []
              values = []
              liveses = self.episode_liveses[-1:]
            else:
              states = self.episode_states[-tes:]
              actions = self.episode_actions[-tes:]
              rewards = self.episode_rewards[-tes:]
              values = self.episode_values[-tes:]
              liveses = self.episode_liveses[-tes-1:]
            if self.options.clear_history_after_ohl:
              self.episode_states = []
              self.episode_actions = []
              self.episode_rewards = []
              self.episode_values = []
              self.episode_liveses = self.episode_liveses[-2:]

      if len(states) > 0:
        R = 0.0
        if not terminal_end:
          R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        lives = liveses.pop()
        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
          # Consider the number of lives
          if (not self.options.use_gym) and self.initial_lives != 0.0 and not self.terminate_on_lives_lost:
            prev_lives = liveses.pop()
            if prev_lives > lives:
              weight = self.options.lives_lost_weight
              rratio = self.options.lives_lost_rratio
              R *= rratio * ( (1.0 - weight) + weight * (lives / prev_lives) )
              ri = self.options.lives_lost_reward
              lives = prev_lives

          R = ri + self.options.gamma * R
          td = R - Vi
          a = np.zeros([self.options.action_size])
          a[ai] = 1

          batch_si.append(si)
          batch_a.append(a)
          batch_td.append(td)
          batch_R.append(R)

        if self.options.use_lstm:
          batch_si.reverse()
          batch_a.reverse()
          batch_td.reverse()
          batch_R.reverse()

          sess.run( self.accum_gradients,
                    feed_dict = {
                      self.local_network.s: batch_si,
                      self.local_network.a: batch_a,
                      self.local_network.td: batch_td,
                      self.local_network.r: batch_R,
                      self.local_network.initial_lstm_state: start_lstm_state,
                      self.local_network.step_size : [len(batch_a)] } )
        else:
          sess.run( self.accum_gradients,
                    feed_dict = {
                      self.local_network.s: batch_si,
                      self.local_network.a: batch_a,
                      self.local_network.td: batch_td,
                      self.local_network.r: batch_R} )
          
        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run( self.apply_gradients,
                  feed_dict = { self.learning_rate_input: cur_learning_rate } )

      # return advanced local step size
      diff_local_t = self.local_t - start_local_t
      return diff_local_t, terminal_end
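The examples draw actions from the softmax policy in several equivalent ways: inverting the cumulative distribution, np.random.choice with p=pi, a one-trial multinomial, and (in the next example) the Gumbel-max trick over logits. A short standalone sketch comparing them is below; all names and the probability vector are illustrative.

# Sketch only: four equivalent ways to sample one action from a policy distribution pi.
import numpy as np

rng = np.random.default_rng(0)
pi = np.array([0.1, 0.6, 0.3])
logits = np.log(pi)

a_cdf = int(np.searchsorted(np.cumsum(pi), rng.random()))  # invert the CDF
a_choice = int(rng.choice(len(pi), p=pi))                  # np.random.choice style
a_multi = int(rng.multinomial(1, pi).argmax())             # one-trial multinomial
gumbel = -np.log(-np.log(rng.random(len(pi))))             # Gumbel noise for the logits
a_gumbel = int(np.argmax(logits + gumbel))                 # Gumbel-max sample
print(a_cdf, a_choice, a_multi, a_gumbel)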
Example #18
0
class A3CTrainingThread(object):
    log_interval = 100
    performance_log_interval = 1000
    local_t_max = 20
    demo_t_max = 20
    use_lstm = False
    action_size = -1
    entropy_beta = 0.01
    demo_entropy_beta = 0.01
    gamma = 0.99
    use_mnih_2015 = False
    env_id = None
    reward_type = 'CLIP'  # CLIP | LOG | RAW
    finetune_upper_layers_only = False
    shaping_reward = 0.001
    shaping_factor = 1.
    shaping_gamma = 0.85
    advice_confidence = 0.8
    shaping_actions = -1  # -1 all actions, 0 exclude noop
    transformed_bellman = False
    clip_norm = 0.5
    use_grad_cam = False

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device=None,
                 pretrained_model=None,
                 pretrained_model_sess=None,
                 advice=False,
                 reward_shaping=False):
        assert self.action_size != -1

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.use_pretrained_model_as_advice = advice
        self.use_pretrained_model_as_reward_shaping = reward_shaping

        logger.info("thread_index: {}".format(self.thread_index))
        logger.info("local_t_max: {}".format(self.local_t_max))
        logger.info("use_lstm: {}".format(
            colored(self.use_lstm, "green" if self.use_lstm else "red")))
        logger.info("action_size: {}".format(self.action_size))
        logger.info("entropy_beta: {}".format(self.entropy_beta))
        logger.info("gamma: {}".format(self.gamma))
        logger.info("reward_type: {}".format(self.reward_type))
        logger.info("finetune_upper_layers_only: {}".format(
            colored(self.finetune_upper_layers_only,
                    "green" if self.finetune_upper_layers_only else "red")))
        logger.info("use_pretrained_model_as_advice: {}".format(
            colored(
                self.use_pretrained_model_as_advice,
                "green" if self.use_pretrained_model_as_advice else "red")))
        logger.info("use_pretrained_model_as_reward_shaping: {}".format(
            colored(
                self.use_pretrained_model_as_reward_shaping, "green"
                if self.use_pretrained_model_as_reward_shaping else "red")))
        logger.info("transformed_bellman: {}".format(
            colored(self.transformed_bellman,
                    "green" if self.transformed_bellman else "red")))
        logger.info("clip_norm: {}".format(self.clip_norm))
        logger.info("use_grad_cam: {}".format(
            colored(self.use_grad_cam,
                    "green" if self.use_grad_cam else "red")))

        if self.use_lstm:
            GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        with tf.device(device):
            self.local_network.prepare_loss(entropy_beta=self.entropy_beta,
                                            critic_lr=0.5)
            local_vars = self.local_network.get_vars
            if self.finetune_upper_layers_only:
                local_vars = self.local_network.get_vars_upper
            var_refs = [v._ref() for v in local_vars()]

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs)

        global_vars = global_network.get_vars
        if self.finetune_upper_layers_only:
            global_vars = global_network.get_vars_upper

        with tf.device(device):
            if self.clip_norm is not None:
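                # Rescale all gradients together so that their global L2 norm does not exceed clip_norm.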
                self.gradients, grad_norm = tf.clip_by_global_norm(
                    self.gradients, self.clip_norm)
            self.gradients = list(zip(self.gradients, global_vars()))
            self.apply_gradients = grad_applier.apply_gradients(self.gradients)

            #self.apply_gradients = grad_applier.apply_gradients(
            #    global_vars(),
            #    self.gradients)

        self.sync = self.local_network.sync_from(
            global_network, upper_layers_only=self.finetune_upper_layers_only)

        self.game_state = GameState(env_id=self.env_id,
                                    display=False,
                                    no_op_max=30,
                                    human_demo=False,
                                    episode_life=True)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_steps = 0

        # variable controlling log output
        self.prev_local_t = 0

        self.is_demo_thread = False

        with tf.device(device):
            if self.use_grad_cam:
                self.action_meaning = self.game_state.env.unwrapped.get_action_meanings(
                )
                self.local_network.build_grad_cam_grads()

        self.pretrained_model = pretrained_model
        self.pretrained_model_sess = pretrained_model_sess
        self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0
        self.advice_ctr = 0
        self.shaping_ctr = 0
        self.last_rho = 0.

        if self.use_pretrained_model_as_advice or self.use_pretrained_model_as_reward_shaping:
            assert self.pretrained_model is not None

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, logits):
        """sample() in https://github.com/ppyht2/tf-a2c/blob/master/src/policy.py"""
        noise = np.random.uniform(0, 1, np.shape(logits))
        return np.argmax(logits - np.log(-np.log(noise)))

    def choose_action_with_high_confidence(self, pi_values, exclude_noop=True):
        actions_confidence = []
        # exclude NOOP action
        for action in range(1 if exclude_noop else 0, self.action_size):
            actions_confidence.append(pi_values[action][0][0])
        max_confidence_action = np.argmax(actions_confidence)
        confidence = actions_confidence[max_confidence_action]
        return (max_confidence_action + (1 if exclude_noop else 0)), confidence

    def set_summary_writer(self, writer):
        self.writer = writer

    def record_summary(self,
                       score=0,
                       steps=0,
                       episodes=None,
                       global_t=0,
                       mode='Test'):
        summary = tf.Summary()
        summary.value.add(tag='{}/score'.format(mode),
                          simple_value=float(score))
        summary.value.add(tag='{}/steps'.format(mode),
                          simple_value=float(steps))
        if episodes is not None:
            summary.value.add(tag='{}/episodes'.format(mode),
                              simple_value=float(episodes))
        self.writer.add_summary(summary, global_t)
        self.writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def generate_cam(self, sess, test_cam_si, global_t):
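        # For each demo frame, take the greedy action and render its Grad-CAM overlay next to the frame.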
        cam_side_img = []
        for i in range(len(test_cam_si)):
            # get max action per demo state
            readout_t = self.local_network.run_policy(sess, test_cam_si[i])
            action = np.argmax(readout_t)

            # convert action to one-hot vector
            action_onehot = [0.] * self.game_state.env.action_space.n
            action_onehot[action] = 1.

            # compute grad cam for conv layer 3
            activations, gradients = self.local_network.evaluate_grad_cam(
                sess, test_cam_si[i], action_onehot)
            cam = grad_cam(activations, gradients)
            cam_img = visualize_cam(cam)

            side_by_side = generate_image_for_cam_video(
                test_cam_si[i], cam_img, global_t, i,
                self.action_meaning[action])

            cam_side_img.append(side_by_side)

        return cam_side_img

    def generate_cam_video(self,
                           sess,
                           time_per_step,
                           global_t,
                           folder,
                           demo_memory_cam,
                           demo_cam_human=False):
        # use one demonstration data to record cam
        # only need to make movie for demo data once
        cam_side_img = self.generate_cam(sess, demo_memory_cam, global_t)

        path = '/frames/demo-cam_side_img'
        if demo_cam_human:
            path += '_human'

        make_movie(cam_side_img,
                   folder + '{}{ep:010d}'.format(path, ep=(global_t)),
                   duration=len(cam_side_img) * time_per_step,
                   true_image=True,
                   salience=False)
        del cam_side_img

    def testing_model(self,
                      sess,
                      max_steps,
                      global_t,
                      folder,
                      demo_memory_cam=None,
                      demo_cam_human=False):
        logger.info("Testing model at global_t={}...".format(global_t))
        # copy weights from shared to local
        sess.run(self.sync)

        if demo_memory_cam is not None:
            self.generate_cam_video(sess, 0.03, global_t, folder,
                                    demo_memory_cam, demo_cam_human)
            return
        else:
            self.game_state.reset(hard_reset=True)
            max_steps += 4
            test_memory = ReplayMemory(
                84,
                84,
                np.random.RandomState(),
                max_steps=max_steps,
                phi_length=4,
                num_actions=self.game_state.env.action_space.n,
                wrap_memory=False,
                full_state_size=self.game_state.clone_full_state().shape[0])
            for _ in range(4):
                test_memory.add(self.game_state.x_t,
                                0,
                                self.game_state.reward,
                                self.game_state.terminal,
                                self.game_state.lives,
                                fullstate=self.game_state.full_state)

        episode_buffer = []
        test_memory_cam = []

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        terminal = False
        while True:
            #pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            test_memory_cam.append(self.game_state.s_t)
            episode_buffer.append(self.game_state.get_screen_rgb())
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            #action = self.choose_action(logits_)
            action = np.argmax(pi_)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal
            memory_full = episode_steps == max_steps - 5
            terminal_ = terminal or memory_full

            # store the transition to replay memory
            test_memory.add(self.game_state.x_t1,
                            action,
                            self.game_state.reward,
                            terminal_,
                            self.game_state.lives,
                            fullstate=self.game_state.full_state1)

            # update the old values
            episode_reward += self.game_state.reward
            episode_steps += 1

            # s_t = s_t1
            self.game_state.update()

            if terminal_:
                if get_wrapper_by_name(
                        self.game_state.env,
                        'EpisodicLifeEnv').was_real_done or memory_full:
                    time_per_step = 0.03
                    images = np.array(episode_buffer)
                    make_movie(images,
                               folder +
                               '/frames/image{ep:010d}'.format(ep=global_t),
                               duration=len(images) * time_per_step,
                               true_image=True,
                               salience=False)
                    break

                self.game_state.reset(hard_reset=False)
                if self.use_lstm:
                    self.local_network.reset_state()

        total_reward = episode_reward
        total_steps = episode_steps
        log_data = (global_t, self.thread_index, total_reward, total_steps)
        logger.info(
            "test: global_t={} worker={} final score={} final steps={}".format(
                *log_data))

        self.generate_cam_video(sess, 0.03, global_t, folder,
                                np.array(test_memory_cam))
        test_memory.save(name='test_cam', folder=folder, resize=True)

        if self.use_lstm:
            self.local_network.reset_state()

        return

    def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None):
        logger.info("Evaluate policy at global_t={}...".format(global_t))
        # copy weights from shared to local
        sess.run(self.sync)

        if demo_memory_cam is not None and global_t % 5000000 == 0:
            self.generate_cam_video(sess, 0.03, global_t, folder,
                                    demo_memory_cam)

        episode_buffer = []
        self.game_state.reset(hard_reset=True)
        episode_buffer.append(self.game_state.get_screen_rgb())

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        while max_steps > 0:
            #pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            if False:
                action = np.random.choice(range(self.action_size), p=pi_)
            else:
                action = self.choose_action(logits_)

            if self.use_pretrained_model_as_advice:
                psi = self.psi if self.psi > 0.001 else 0.0
                if psi > np.random.rand():
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    model_action, confidence = self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                    if model_action > self.shaping_actions and confidence >= self.advice_confidence:
                        action = model_action

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal

            if n_episodes == 0 and global_t % 5000000 == 0:
                episode_buffer.append(self.game_state.get_screen_rgb())

            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    if n_episodes == 0 and global_t % 5000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        make_movie(
                            images,
                            folder +
                            '/frames/image{ep:010d}'.format(ep=global_t),
                            duration=len(images) * time_per_step,
                            true_image=True,
                            salience=False)
                        episode_buffer = []
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (global_t, self.thread_index, n_episodes,
                                score_str, steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} worker={} trial={} {} {} total_steps={}"
                        .format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                self.game_state.reset(hard_reset=False)
                if self.use_lstm:
                    self.local_network.reset_state()

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # (timestep, total sum of rewards, total # of steps before terminating)
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (global_t, self.thread_index, total_reward, total_steps,
                    n_episodes)
        logger.info(
            "test: global_t={} worker={} final score={} final steps={} # trials={}"
            .format(*log_data))

        self.record_summary(score=total_reward,
                            steps=total_steps,
                            episodes=n_episodes,
                            global_t=global_t,
                            mode='Test')

        # reset variables used in training
        self.episode_reward = 0
        self.episode_steps = 0
        self.game_state.reset(hard_reset=True)
        self.last_rho = 0.
        if self.is_demo_thread:
            self.replay_mem_reset()

        if self.use_lstm:
            self.local_network.reset_state()
        return total_reward, total_steps, n_episodes

    def pretrain_init(self, demo_memory):
        self.demo_memory_size = len(demo_memory)
        self.demo_memory = demo_memory
        self.replay_mem_reset()

    def replay_mem_reset(self, demo_memory_idx=None):
        if demo_memory_idx is not None:
            self.demo_memory_idx = demo_memory_idx
        else:
            # new random episode
            self.demo_memory_idx = np.random.randint(0, self.demo_memory_size)
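        # pick a random start offset inside the chosen demo episode, leaving
        # room for at least one full local_t_max rollout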
        self.demo_memory_count = np.random.randint(
            0,
            len(self.demo_memory[self.demo_memory_idx]) - self.local_t_max)
        # if self.demo_memory_count+self.local_t_max < len(self.demo_memory[self.demo_memory_idx]):
        #           self.demo_memory_max_count = np.random.randint(self.demo_memory_count+self.local_t_max, len(self.demo_memory[self.demo_memory_idx]))
        # else:
        #           self.demo_memory_max_count = len(self.demo_memory[self.demo_memory_idx])
        logger.debug(
            "worker={} mem_reset demo_memory_idx={} demo_memory_start={}".
            format(self.thread_index, self.demo_memory_idx,
                   self.demo_memory_count))
        s_t, action, reward, terminal = self.demo_memory[self.demo_memory_idx][
            self.demo_memory_count]
        self.demo_memory_action = action
        self.demo_memory_reward = reward
        self.demo_memory_terminal = terminal
        if not self.demo_memory[self.demo_memory_idx].imgs_normalized:
            self.demo_memory_s_t = s_t * (1.0 / 255.0)
        else:
            self.demo_memory_s_t = s_t

    def replay_mem_process(self):
        self.demo_memory_count += 1
        s_t, action, reward, terminal = self.demo_memory[self.demo_memory_idx][
            self.demo_memory_count]
        self.demo_memory_next_action = action
        self.demo_memory_reward = reward
        self.demo_memory_terminal = terminal
        if not self.demo_memory[self.demo_memory_idx].imgs_normalized:
            self.demo_memory_s_t1 = s_t * (1.0 / 255.0)
        else:
            self.demo_memory_s_t1 = s_t

    def replay_mem_update(self):
        self.demo_memory_action = self.demo_memory_next_action
        self.demo_memory_s_t = self.demo_memory_s_t1

    def demo_process(self, sess, global_t, demo_memory_idx=None):
        states = []
        actions = []
        rewards = []
        values = []

        demo_ended = False
        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if self.use_lstm:
            reset_lstm_state = False
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(self.demo_t_max):
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.demo_memory_s_t)
            action = self.demo_memory_action
            time.sleep(0.0025)

            states.append(self.demo_memory_s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % self.log_interval
                                             == 0):
                log_msg = "lg={}".format(
                    np.array_str(logits_, precision=4, suppress_small=True))
                log_msg += " pi={}".format(
                    np.array_str(pi_, precision=4, suppress_small=True))
                log_msg += " V={:.4f}".format(value_)
                logger.debug(log_msg)

            # process replay memory
            self.replay_mem_process()

            # receive replay memory result
            reward = self.demo_memory_reward
            terminal = self.demo_memory_terminal

            self.episode_reward += reward

            if self.reward_type == 'LOG':
                reward = np.sign(reward) * np.log(1 + np.abs(reward))
            elif self.reward_type == 'CLIP':
                # clip reward
                reward = np.sign(reward)

            rewards.append(reward)

            self.local_t += 1
            self.episode_steps += 1

            # demo_memory_s_t1 -> demo_memory_s_t
            self.replay_mem_update()
            s_t = self.demo_memory_s_t

            if terminal or self.demo_memory_count == len(
                    self.demo_memory[self.demo_memory_idx]):
                logger.debug("worker={} score={}".format(
                    self.thread_index, self.episode_reward))
                demo_ended = True
                if terminal:
                    terminal_end = True
                    if self.use_lstm:
                        self.local_network.reset_state()

                else:
                    # some demo episodes don't reach a terminal state
                    if self.use_lstm:
                        reset_lstm_state = True

                self.episode_reward = 0
                self.episode_steps = 0
                self.replay_mem_reset(demo_memory_idx=demo_memory_idx)
                break

        cumulative_reward = 0.0
        if not terminal_end:
            cumulative_reward = self.local_network.run_value(sess, s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_state = []
        batch_action = []
        batch_adv = []
        batch_cumulative_reward = []

        # compute and accumulate gradients
        for (ai, ri, si, vi) in zip(actions, rewards, states, values):
            cumulative_reward = ri + self.gamma * cumulative_reward
            advantage = cumulative_reward - vi

            # convert action to one-hot vector
            a = np.zeros([self.action_size])
            a[ai] = 1

            batch_state.append(si)
            batch_action.append(a)
            batch_adv.append(advantage)
            batch_cumulative_reward.append(cumulative_reward)

        cur_learning_rate = self._anneal_learning_rate(global_t)  #* 0.005

        if self.use_lstm:
            batch_state.reverse()
            batch_action.reverse()
            batch_adv.reverse()
            batch_cumulative_reward.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_state,
                         self.local_network.a: batch_action,
                         self.local_network.advantage: batch_adv,
                         self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_action)],
                         self.learning_rate_input: cur_learning_rate
                     })

            # some demo episodes don't reach a terminal state
            if reset_lstm_state:
                self.local_network.reset_state()
                reset_lstm_state = False
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_state,
                         self.local_network.a: batch_action,
                         self.local_network.advantage: batch_adv,
                         self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                         self.learning_rate_input: cur_learning_rate
                     })

        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         self.performance_log_interval):
            self.prev_local_t += self.performance_log_interval

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, demo_ended

    def process(self, sess, global_t, train_rewards):
        states = []
        actions = []
        rewards = []
        values = []
        rho = []

        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if self.use_lstm:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(self.local_t_max):
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(logits_)

            model_pi = None
            confidence = 0.
            if self.use_pretrained_model_as_advice:
                self.psi = 0.9999 * (
                    0.9999**
                    global_t) if self.psi > 0.001 else 0.0  # 0.99995 works
                if self.psi > np.random.rand():
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    model_action, confidence = self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                    if (model_action > self.shaping_actions
                            and confidence >= self.advice_confidence):
                        action = model_action
                        self.advice_ctr += 1
            if self.use_pretrained_model_as_reward_shaping:
                #if action > 0:
                if model_pi is None:
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    confidence = model_pi[action][0][0]
                if (action > self.shaping_actions
                        and confidence >= self.advice_confidence):
                    #rho.append(round(confidence, 5))
                    rho.append(self.shaping_reward)
                    self.shaping_ctr += 1
                else:
                    rho.append(0.)
                #self.shaping_ctr += 1

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if self.thread_index == 0 and self.local_t % self.log_interval == 0:
                log_msg1 = "lg={}".format(
                    np.array_str(logits_, precision=4, suppress_small=True))
                log_msg2 = "pi={}".format(
                    np.array_str(pi_, precision=4, suppress_small=True))
                log_msg3 = "V={:.4f}".format(value_)
                if self.use_pretrained_model_as_advice:
                    log_msg3 += " psi={:.4f}".format(self.psi)
                logger.debug(log_msg1)
                logger.debug(log_msg2)
                logger.debug(log_msg3)

            # process game
            self.game_state.step(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal
            if self.use_pretrained_model_as_reward_shaping:
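                # when a real (nonzero) environment reward arrives, clear the
                # shaping bonus for this step and the few steps leading up to
                # it, presumably so the potential-based bonus does not
                # double-count the reward event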
                if reward < 0 or reward > 0:
                    rho[i] = 0.
                    j = i - 1
                    while j > i - 5:
                        if rewards[j] != 0:
                            break
                        rho[j] = 0.
                        j -= 1
            #     if self.game_state.loss_life:
            #     if self.game_state.gain_life or reward > 0:
            #         rho[i] = 0.
            #         j = i-1
            #         k = 1
            #         while j >= 0:
            #             if rewards[j] != 0:
            #                 rho[j] = self.shaping_reward * (self.gamma ** -1)
            #                 break
            #             rho[j] = self.shaping_reward / k
            #             j -= 1
            #             k += 1

            self.episode_reward += reward

            if self.reward_type == 'LOG':
                reward = np.sign(reward) * np.log(1 + np.abs(reward))
            elif self.reward_type == 'CLIP':
                # clip reward
                reward = np.sign(reward)

            rewards.append(reward)

            self.local_t += 1
            self.episode_steps += 1
            global_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    log_msg = "train: worker={} global_t={}".format(
                        self.thread_index, global_t)
                    if self.use_pretrained_model_as_advice:
                        log_msg += " advice_ctr={}".format(self.advice_ctr)
                    if self.use_pretrained_model_as_reward_shaping:
                        log_msg += " shaping_ctr={}".format(self.shaping_ctr)
                    score_str = colored("score={}".format(self.episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(self.episode_steps),
                                        "blue")
                    log_msg += " {} {}".format(score_str, steps_str)
                    logger.debug(log_msg)
                    train_rewards['train'][global_t] = (self.episode_reward,
                                                        self.episode_steps)
                    self.record_summary(score=self.episode_reward,
                                        steps=self.episode_steps,
                                        episodes=None,
                                        global_t=global_t,
                                        mode='Train')
                    self.episode_reward = 0
                    self.episode_steps = 0
                    terminal_end = True

                self.last_rho = 0.
                if self.use_lstm:
                    self.local_network.reset_state()
                self.game_state.reset(hard_reset=False)
                break

        cumulative_reward = 0.0
        if not terminal:
            cumulative_reward = self.local_network.run_value(
                sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_state = []
        batch_action = []
        batch_adv = []
        batch_cumulative_reward = []

        if self.use_pretrained_model_as_reward_shaping:
            rho.reverse()
            rho.append(self.last_rho)
            self.last_rho = rho[0]
            i = 0
            # compute and accumulate gradients
            for (ai, ri, si, vi) in zip(actions, rewards, states, values):
                # Wiewiora et al.(2003) Principled Methods for Advising RL agents
                # Look-Back Advice
                #F = rho[i] - (self.shaping_gamma**-1) * rho[i+1]
                #F = rho[i] - self.shaping_gamma * rho[i+1]
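                # with the rollout lists reversed, rho[i] is the shaping
                # potential of the current step and rho[i+1] that of the
                # preceding step, so f is a look-back potential difference:
                # f_t = rho_t / shaping_gamma - rho_{t-1}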
                f = (self.shaping_gamma**-1) * rho[i] - rho[i + 1]
                if (i == 0 and terminal) or (f != 0 and (ri > 0 or ri < 0)):
                    #logger.warn("averted additional F in absorbing state")
                    f = 0.
                # if (F < 0. and ri > 0) or (F > 0. and ri < 0):
                #     logger.warn("Negative reward shaping F={} ri={} rho[s]={} rhos[s-1]={}".format(F, ri, rho[i], rho[i+1]))
                #     F = 0.
                cumulative_reward = (ri + f * self.shaping_factor
                                     ) + self.gamma * cumulative_reward
                advantage = cumulative_reward - vi

                a = np.zeros([self.action_size])
                a[ai] = 1

                batch_state.append(si)
                batch_action.append(a)
                batch_adv.append(advantage)
                batch_cumulative_reward.append(cumulative_reward)
                i += 1
        else:

            def h(z, eps=10**-2):
                return (np.sign(z) *
                        (np.sqrt(np.abs(z) + 1.) - 1.)) + (eps * z)

            def h_inv(z, eps=10**-2):
                return np.sign(z) * (np.square(
                    (np.sqrt(1 + 4 * eps *
                             (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1)

            def h_log(z, eps=.6):
                return (np.sign(z) * np.log(1. + np.abs(z)) * eps)

            def h_inv_log(z, eps=.6):
                return np.sign(z) * (np.exp(np.abs(z) / eps) - 1)
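            # h/h_inv implement the value transform of the transformed
            # Bellman operator (cf. Pohlen et al., 2018):
            #     h(z) = sign(z) * (sqrt(|z| + 1) - 1) + eps * z
            # with h_inv its algebraic inverse; h_log/h_inv_log are a
            # logarithmic variant. Using h(r + gamma * h_inv(R)) below
            # compresses large returns while leaving small ones almost
            # unchanged.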

            # compute and accumulate gradients
            for (ai, ri, si, vi) in zip(actions, rewards, states, values):
                if self.transformed_bellman:
                    cumulative_reward = h(ri + self.gamma *
                                          h_inv(cumulative_reward))
                else:
                    cumulative_reward = ri + self.gamma * cumulative_reward
                advantage = cumulative_reward - vi

                # convert action to one-hot vector
                a = np.zeros([self.action_size])
                a[ai] = 1

                batch_state.append(si)
                batch_action.append(a)
                batch_adv.append(advantage)
                batch_cumulative_reward.append(cumulative_reward)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if self.use_lstm:
            batch_state.reverse()
            batch_action.reverse()
            batch_adv.reverse()
            batch_cumulative_reward.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_state,
                         self.local_network.a: batch_action,
                         self.local_network.advantage: batch_adv,
                         self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_action)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_state,
                         self.local_network.a: batch_action,
                         self.local_network.advantage: batch_adv,
                         self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                         self.learning_rate_input: cur_learning_rate
                     })

        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         self.performance_log_interval):
            self.prev_local_t += self.performance_log_interval
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            logger.info(
                "Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, terminal_end
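
    # Usage sketch (hypothetical driver loop; `trainer`, `sess`,
    # `train_rewards` and `stop_requested` are assumed names, not part of
    # this example): each worker thread repeatedly syncs, rolls out and
    # applies gradients via process() until the global step budget is spent:
    #
    #     while global_t < max_global_time_step and not stop_requested:
    #         diff_t, _ = trainer.process(sess, global_t, train_rewards)
    #         global_t += diff_t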
Example #19
0
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device=None,
                 pretrained_model=None,
                 pretrained_model_sess=None,
                 advice=False,
                 reward_shaping=False):
        assert self.action_size != -1

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.use_pretrained_model_as_advice = advice
        self.use_pretrained_model_as_reward_shaping = reward_shaping

        logger.info("thread_index: {}".format(self.thread_index))
        logger.info("local_t_max: {}".format(self.local_t_max))
        logger.info("use_lstm: {}".format(
            colored(self.use_lstm, "green" if self.use_lstm else "red")))
        logger.info("action_size: {}".format(self.action_size))
        logger.info("entropy_beta: {}".format(self.entropy_beta))
        logger.info("gamma: {}".format(self.gamma))
        logger.info("reward_type: {}".format(self.reward_type))
        logger.info("finetune_upper_layers_only: {}".format(
            colored(self.finetune_upper_layers_only,
                    "green" if self.finetune_upper_layers_only else "red")))
        logger.info("use_pretrained_model_as_advice: {}".format(
            colored(
                self.use_pretrained_model_as_advice,
                "green" if self.use_pretrained_model_as_advice else "red")))
        logger.info("use_pretrained_model_as_reward_shaping: {}".format(
            colored(
                self.use_pretrained_model_as_reward_shaping, "green"
                if self.use_pretrained_model_as_reward_shaping else "red")))
        logger.info("transformed_bellman: {}".format(
            colored(self.transformed_bellman,
                    "green" if self.transformed_bellman else "red")))
        logger.info("clip_norm: {}".format(self.clip_norm))
        logger.info("use_grad_cam: {}".format(
            colored(self.use_grad_cam,
                    "green" if self.use_grad_cam else "red")))

        if self.use_lstm:
            GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        with tf.device(device):
            self.local_network.prepare_loss(entropy_beta=self.entropy_beta,
                                            critic_lr=0.5)
            local_vars = self.local_network.get_vars
            if self.finetune_upper_layers_only:
                local_vars = self.local_network.get_vars_upper
            var_refs = [v._ref() for v in local_vars()]

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs)

        global_vars = global_network.get_vars
        if self.finetune_upper_layers_only:
            global_vars = global_network.get_vars_upper

        with tf.device(device):
            if self.clip_norm is not None:
                self.gradients, grad_norm = tf.clip_by_global_norm(
                    self.gradients, self.clip_norm)
            self.gradients = list(zip(self.gradients, global_vars()))
            self.apply_gradients = grad_applier.apply_gradients(self.gradients)

            #self.apply_gradients = grad_applier.apply_gradients(
            #    global_vars(),
            #    self.gradients)

        self.sync = self.local_network.sync_from(
            global_network, upper_layers_only=self.finetune_upper_layers_only)

        self.game_state = GameState(env_id=self.env_id,
                                    display=False,
                                    no_op_max=30,
                                    human_demo=False,
                                    episode_life=True)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_steps = 0

        # variable controlling log output
        self.prev_local_t = 0

        self.is_demo_thread = False

        with tf.device(device):
            if self.use_grad_cam:
                self.action_meaning = self.game_state.env.unwrapped.get_action_meanings(
                )
                self.local_network.build_grad_cam_grads()

        self.pretrained_model = pretrained_model
        self.pretrained_model_sess = pretrained_model_sess
        self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0
        self.advice_ctr = 0
        self.shaping_ctr = 0
        self.last_rho = 0.

        if self.use_pretrained_model_as_advice or self.use_pretrained_model_as_reward_shaping:
            assert self.pretrained_model is not None
Example #20
0
def make_network():
    if USE_LSTM:
        return GameACLSTMNetwork(ACTION_SIZE, -1, device)
    else:
        return GameACFFNetwork(ACTION_SIZE, device)
Example #21
0
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, action_size, gamma, local_t_max, entropy_beta,
                 agent_type, performance_log_interval, log_level, random_seed):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.action_size = action_size
        self.gamma = gamma
        self.local_t_max = local_t_max
        self.agent_type = agent_type
        self.performance_log_interval = performance_log_interval
        self.log_level = log_level

        if self.agent_type == 'LSTM':
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        self.local_network.prepare_loss(entropy_beta)

        with tf.device(device):
            var_refs = []
            variables = self.local_network.get_vars()
            for v in variables:
                var_refs.append(v)

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        np.random.seed(random_seed)
        self.game_state = GameState(random_seed * thread_index,
                                    self.action_size)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate
        self.learn_rate = self.initial_learning_rate

        self.reset_counters()

        self.episode = 0

        # variable controlling log output
        self.prev_local_t = 0

    def reset_counters(self):
        self.total_q_max = 0
        self.episode_reward = 0
        self.episode_actions = []
        self.passed_obst = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
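        # linear decay to zero: e.g. with initial_learning_rate = 7e-4 and
        # max_global_time_step = 1e8 (hypothetical values), this yields
        # 3.5e-4 halfway through training and 0.0 at the end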
        if learning_rate < 0.0:
            learning_rate = 0.0
        self.learn_rate = learning_rate

        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, statistics):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if self.agent_type == 'LSTM':
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(self.local_t_max):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # process game
            try:  # Bitblt may raise error, but we can safely ignore it, otherwise thread will die
                self.game_state.process(action)
            except Exception as e:
                print(e)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal
            steps = self.game_state.steps
            passed = self.game_state.passed_obst

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            self.total_q_max += np.max(pi_)
            self.episode_actions.append(action)
            self.passed_obst = self.game_state.passed_obst

            if terminal:
                terminal_end = True
                self.episode += 1

                if self.log_level == 'FULL':
                    reward_steps = format(
                        float(self.episode_reward) / float(steps), '.4f')
                    print(
                        "THREAD: {}  /  EPISODE: {}  /  TOTAL STEPS: {}  /  STEPS: {}  /  PASSED OBST: {}  /  REWARD: {}  /  REWARD/STEP: {}"
                        .format(self.thread_index, self.episode, global_t,
                                steps, self.passed_obst, self.episode_reward,
                                reward_steps))

                statistics.update(global_t, self.episode_reward,
                                  self.total_q_max, steps,
                                  self.episode_actions, self.learn_rate,
                                  self.passed_obst)

                self.reset_counters()

                self.game_state.reset()
                if self.agent_type == 'LSTM':
                    self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            td = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if self.agent_type == 'LSTM':
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_a)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.learning_rate_input: cur_learning_rate
                     })

        if (self.thread_index
                == 0) and (self.local_t - self.prev_local_t >=
                           self.performance_log_interval) and (self.log_level
                                                               == 'FULL'):
            self.prev_local_t += self.performance_log_interval
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example #22
0
if USE_GPU:
    device = "/gpu:0"

initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)

global_t = 0

stop_requested = False

global_game = DoomGameState(scenario_path="scenarios/cig.cfg")
if USE_LSTM:
    global_network = GameACLSTMNetwork(global_game.get_action_size(), -1,
                                       device)
else:
    global_network = GameACFFNetwork(global_game.get_action_size(), -1, device)
del global_game

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

for i in range(PARALLEL_SIZE):
    game = DoomGameState(scenario_path="scenarios/cig.cfg")
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        # roulette-wheel (inverse-CDF) sampling: pick the first index whose
        # cumulative probability exceeds a uniform draw
        values = []
        total = 0.0
        for rate in pi_values:
            total = total + rate
            values.append(total)

        r = random.random() * total
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print("pi=", pi_)
                print(" V=", value_)

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score=", self.episode_reward)

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]
                     })
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example #24
0
class A3CTrainingThread(object):
    def __init__(self, thread_index, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_episode,
                 device, arrived_jobs, condition):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_episode = max_global_time_episode

        # use thread_index (the machine id) to fetch all operations processed on this machine
        self.operations = get_data_by_machine(thread_index)
        self.condition = condition
        self.is_terminal_counted = False
        self.last_episode_reward = 0

        if USE_LSTM:
            # the first argument is the action size: the number of operations to be processed on this machine
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            # the first argument is the action size: the number of operations to be processed on this machine
            self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index,
                                                 device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(), self.gradients)

        # self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        # create the job-shop environment for these operations
        self.env = JspEnv(self.operations, thread_index, arrived_jobs)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        # return self.initial_learning_rate
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_episode -
            global_time_step) / self.max_global_time_episode
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values, use_max_choice):
        # if len(self.env.action_space) != 1:
        #   print('\n------------------------------------------------'
        #         'machine = {}'.format(self.thread_index))
        #   print('action space = {}'.format(self.env.action_space))
        #   print('pi = {}'.format(pi_values))
        #
        # for i in range(len(pi_values)):
        #   if i not in self.env.action_space:
        #     pi_values[i] = 0
        # sum = np.sum(pi_values)
        # if sum == 0:
        #   return np.random.choice(self.env.action_space)
        # else:
        #   for i in range(len(pi_values)):
        #     pi_values[i] = pi_values[i] / sum
        #   if use_max_choice:
        #     if len(self.env.action_space) != 1:
        #       pi_values[self.env.machine_size] = 0
        #     return np.argmax(pi_values)
        #   else:
        #     return np.random.choice(range(len(pi_values)), p=pi_values)
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input,
                use_max_choice):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # copy weights from shared to local
        # sess.run( self.sync )

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        # for i in range(LOCAL_T_MAX):
        while True:
            # pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.local_state)
            action = self.choose_action(pi_, use_max_choice)

            # states.append(self.game_state.s_t)
            states.append(self.env.local_state)
            actions.append(action)
            values.append(value_)

            # if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
            # if (self.thread_index == 0):
            #   print('machine index: ' + str(self.thread_index))
            #   print('arrived jobs:{}'.format(self.env.arrived_jobs[self.thread_index]))
            #   print('actions:{}'.format(action))
            #   print('clock:{}'.format(self.env.clock))
            #   print("action space = {}".format(self.env.action_space))
            #
            #   print("pi={}".format(pi_))
            #   print(" V={}".format(value_))
            # process game
            # self.game_state.process(action)

            # receive game result
            # reward = self.game_state.reward
            # terminal = self.game_state.terminal

            new_state, reward, terminal, info = self.env.step(action)

            self.episode_reward += reward

            # clip reward
            # rewards.append( np.clip(reward, -1, 1) )
            rewards.append(reward)

            self.local_t += 1

            # s_t1 -> s_t
            # self.game_state.update()

            if terminal:
                terminal_end = True
                # print("score={}".format(self.episode_reward))
                # print("complete time={}".format(self.env.clock))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                # print('\n----------------------------------------------------')
                # print('machine index: ' + str(self.thread_index))
                # print('arrived jobs:{}'.format(self.env.arrived_jobs[self.thread_index]))
                # print('actions:{}'.format(action))
                # print('clock:{}'.format(self.env.clock))
                # print("jobs size = {}".format(len(self.env.init_operations)))
                # print("action space = {}".format(self.env.action_space))
                # print("pi={}".format(pi_))
                # print(" V={}".format(value_))
                # print('----------------------------------------------------\n')

                self.complete_time = self.env.clock
                self.last_episode_reward = self.episode_reward
                self.episode_reward = 0
                # self.game_state.reset()
                self.env.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            # R = self.local_network.run_value(sess, self.game_state.s_t)
            R = self.local_network.run_value(sess, self.env.local_state)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            # a = np.zeros([ACTION_SIZE])
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_a)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.learning_rate_input: cur_learning_rate
                     })

        # if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
        #   self.prev_local_t += PERFORMANCE_LOG_INTERVAL
        #   elapsed_time = time.time() - self.start_time
        #   steps_per_sec = global_t / elapsed_time
        #   print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        #     global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, self.complete_time, self.last_episode_reward
Example #25
0
if not settings.mode == 'display' and not settings.mode == 'visualize':
    device = "/cpu:0"
    if settings.use_gpu:
        device = "/gpu:0"

    initial_learning_rates = log_uniform(settings.initial_alpha_low,
                                         settings.initial_alpha_high,
                                         settings.parallel_agent_size)
    global_t = 0

    stop_requested = False

    if settings.agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(settings.action_size, -1, device)
    else:
        global_network = GameACFFNetwork(settings.action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=settings.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=settings.rmsp_epsilon,
                                  clip_norm=settings.grad_norm_clip,
                                  device=device)

    for i in range(settings.parallel_agent_size):
        training_thread = A3CTrainingThread(
            i, global_network, initial_learning_rates[i], learning_rate_input,
Example #26
0
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, max_global_time_step, device):
        self.learn_rate = 0
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)


# #
        self.apply_gradients = tf.train.RMSPropOptimizer(
            self.learning_rate_input).apply_gradients(
                zip(self.gradients, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = Game()

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def process(self, sess, global_t):
        states = []
        actions = []
        rewards = []
        values = []
        temp_reward = 0
        terminal_end = False
        sess.run(self.sync)

        start_local_t = self.local_t

        for i in range(0, LOCAL_T_MAX):
            #     while True:
            #       sleep(100)
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            #       print(pi_)
            action = self.choose_action(pi_)
            #       print(action)
            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            temp_action = [0, 0, 0]
            temp_action[action] = 1
            self.game_state.process(temp_action)
            # receive game result
            reward = self.game_state.reward
            #       print(self.game_state.terminal)
            terminal = self.game_state.terminal

            self.episode_reward += reward
            temp_reward = self.episode_reward

            # clip reward
            rewards.append(reward)

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))
                print("process:", self.thread_index, " learn_rate:",
                      self.learn_rate)
                self.episode_reward = 0
                self.game_state.reset()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)
        self.learn_rate = cur_learning_rate

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.learning_rate_input: cur_learning_rate
                 })

        diff_local_t = self.local_t - start_local_t
        return diff_local_t, temp_reward
Example #27
0
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )
      
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    summary_writer.flush()
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
        print("pi={}".format(pi_))
        print(" V={}".format(value_))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print("score={}".format(self.episode_reward))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    cur_learning_rate = self._anneal_learning_rate(global_t)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)],
                  self.learning_rate_input: cur_learning_rate } )
    else:
      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.learning_rate_input: cur_learning_rate} )
      
    if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
      self.prev_local_t += PERFORMANCE_LOG_INTERVAL
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
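
_anneal_learning_rate above decays the learning rate linearly from its initial value to zero over max_global_time_step steps and clamps it at zero afterwards. A small self-contained sketch of that schedule (the constants are made up for illustration):

def anneal(initial_lr, global_step, max_steps):
    """Linear decay to zero, clamped at zero once max_steps is exceeded."""
    lr = initial_lr * (max_steps - global_step) / max_steps
    return max(lr, 0.0)

# e.g. with initial_lr = 7e-4 and max_steps = 1e8:
print(anneal(7e-4, 0, 1e8))      # ~7e-4 at the start of training
print(anneal(7e-4, 5e7, 1e8))    # ~3.5e-4 halfway through
print(anneal(7e-4, 2e8, 1e8))    # 0.0 once max_steps is exceeded (clamped)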
Example #29
0
def run_a3c_test(args):
    """Run A3C testing."""
    GYM_ENV_NAME = args.gym_env.replace('-', '_')

    if args.use_gpu:
        assert args.cuda_devices != ''
        os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    import tensorflow as tf

    if not os.path.exists('results/a3c'):
        os.makedirs('results/a3c')

    if args.folder is not None:
        folder = args.folder
    else:
        folder = 'results/a3c/{}'.format(GYM_ENV_NAME)
        end_str = ''

        if args.use_mnih_2015:
            end_str += '_mnih2015'
        if args.use_lstm:
            end_str += '_lstm'
        if args.unclipped_reward:
            end_str += '_rawreward'
        elif args.log_scale_reward:
            end_str += '_logreward'
        if args.transformed_bellman:
            end_str += '_transformedbell'

        if args.use_transfer:
            end_str += '_transfer'
            if args.not_transfer_conv2:
                end_str += '_noconv2'
            elif args.not_transfer_conv3 and args.use_mnih_2015:
                end_str += '_noconv3'
            elif args.not_transfer_fc1:
                end_str += '_nofc1'
            elif args.not_transfer_fc2:
                end_str += '_nofc2'
        if args.finetune_upper_layers_only:
            end_str += '_tune_upperlayers'
        if args.train_with_demo_num_steps > 0 \
           or args.train_with_demo_num_epochs > 0:
            end_str += '_pretrain_ina3c'
        if args.use_demo_threads:
            end_str += '_demothreads'

        if args.load_pretrained_model:
            if args.use_pretrained_model_as_advice:
                end_str += '_modelasadvice'
            if args.use_pretrained_model_as_reward_shaping:
                end_str += '_modelasshaping'

        if args.padding == 'SAME':
            end_str += '_same'

        folder += end_str

    folder = pathlib.Path(folder)

    demo_memory_cam = None
    demo_cam_human = False
    if args.load_demo_cam:
        if args.demo_memory_folder is not None:
            demo_memory_folder = args.demo_memory_folder
        else:
            demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME)

        demo_memory_folder = pathlib.Path(demo_memory_folder)

        if args.demo_cam_id is not None:
            demo_cam_human = True
            demo_cam, _, total_rewards_cam, _ = load_memory(
                name=None,
                demo_memory_folder=demo_memory_folder,
                demo_ids=args.demo_cam_id,
                imgs_normalized=False)

            demo_cam = demo_cam[int(args.demo_cam_id)]
            logger.info("loaded demo {} for testing CAM".format(
                args.demo_cam_id))

        else:
            demo_cam_folder = pathlib.Path(args.demo_cam_folder)
            demo_cam = ReplayMemory()
            demo_cam.load(name='test_cam', folder=demo_cam_folder)
            logger.info("loaded demo {} for testing CAM".format(
                str(demo_cam_folder / 'test_cam')))

        demo_memory_cam = np.zeros(
            (len(demo_cam),
             demo_cam.height,
             demo_cam.width,
             demo_cam.phi_length),
            dtype=np.float32)

        for i in range(len(demo_cam)):
            s0, _, _, _, _, _, t1, _ = demo_cam[i]
            demo_memory_cam[i] = np.copy(s0)

        del demo_cam

    device = "/cpu:0"
    gpu_options = None
    if args.use_gpu:
        device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"]
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_fraction)

    initial_learning_rate = args.initial_learn_rate
    logger.info('Initial Learning Rate={}'.format(initial_learning_rate))
    time.sleep(2)

    global_t = 0
    stop_requested = False

    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n

    config = tf.ConfigProto(
        gpu_options=gpu_options,
        log_device_placement=False,
        allow_soft_placement=True)

    input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4)
    if args.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACFFNetwork(
            action_size, -1, device, padding=args.padding,
            in_shape=input_shape)

    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")

    grad_applier = tf.train.RMSPropOptimizer(
        learning_rate=learning_rate_input,
        decay=args.rmsp_alpha,
        epsilon=args.rmsp_epsilon)

    A3CTrainingThread.log_interval = args.log_interval
    A3CTrainingThread.performance_log_interval = args.performance_log_interval
    A3CTrainingThread.local_t_max = args.local_t_max
    A3CTrainingThread.demo_t_max = args.demo_t_max
    A3CTrainingThread.use_lstm = args.use_lstm
    A3CTrainingThread.action_size = action_size
    A3CTrainingThread.entropy_beta = args.entropy_beta
    A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta
    A3CTrainingThread.gamma = args.gamma
    A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015
    A3CTrainingThread.env_id = args.gym_env
    A3CTrainingThread.finetune_upper_layers_only = \
        args.finetune_upper_layers_only
    A3CTrainingThread.transformed_bellman = args.transformed_bellman
    A3CTrainingThread.clip_norm = args.grad_norm_clip
    A3CTrainingThread.use_grad_cam = args.use_grad_cam

    if args.unclipped_reward:
        A3CTrainingThread.reward_type = "RAW"
    elif args.log_scale_reward:
        A3CTrainingThread.reward_type = "LOG"
    else:
        A3CTrainingThread.reward_type = "CLIP"

    if args.use_lstm:
        local_network = GameACLSTMNetwork(action_size, 0, device)
    else:
        local_network = GameACFFNetwork(
            action_size, 0, device, padding=args.padding,
            in_shape=input_shape)

    testing_thread = A3CTrainingThread(
        0, global_network, local_network, initial_learning_rate,
        learning_rate_input,
        grad_applier, 0,
        device=device)

    # prepare session
    sess = tf.Session(config=config)

    if args.use_transfer:
        if args.transfer_folder is not None:
            transfer_folder = args.transfer_folder
        else:
            transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME)
            end_str = ''

            if args.use_mnih_2015:
                end_str += '_mnih2015'
            end_str += '_l2beta1E-04_batchprop'  # TODO: make this an argument
            transfer_folder += end_str

        transfer_folder = pathlib.Path(transfer_folder)
        transfer_folder /= 'transfer_model'

        if args.not_transfer_conv2:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                ]

        elif (args.not_transfer_conv3 and args.use_mnih_2015):
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                ]

        elif args.not_transfer_fc1:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                    ]

        elif args.not_transfer_fc2:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                global_network.W_fc1,
                global_network.b_fc1,
                ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                    ]

        else:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                global_network.W_fc1,
                global_network.b_fc1,
                global_network.W_fc2,
                global_network.b_fc2,
                ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                    ]

        global_network.load_transfer_model(
            sess, folder=transfer_folder,
            not_transfer_fc2=args.not_transfer_fc2,
            not_transfer_fc1=args.not_transfer_fc1,
            not_transfer_conv3=(args.not_transfer_conv3
                                and args.use_mnih_2015),
            not_transfer_conv2=args.not_transfer_conv2,
            var_list=transfer_var_list,
            )

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f]

        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    if args.use_transfer:
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(str(folder))
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
    else:
        logger.warning("Could not find old checkpoint")

    def test_function():
        nonlocal global_t

        if args.use_transfer:
            from_folder = str(transfer_folder).split('/')[-2]
        else:
            from_folder = str(folder).split('/')[-1]

        from_folder = pathlib.Path(from_folder)
        save_folder = 'results/test_model/a3c' / from_folder
        prepare_dir(str(save_folder), empty=False)
        prepare_dir(str(save_folder / 'frames'), empty=False)

        # Evaluate model before training
        if not stop_requested:
            testing_thread.testing_model(
                sess, args.eval_max_steps, global_t, save_folder,
                demo_memory_cam=demo_memory_cam, demo_cam_human=demo_cam_human)

    def signal_handler(signal, frame):
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True

        if stop_requested and global_t == 0:
            sys.exit(1)

    test_thread = threading.Thread(target=test_function, args=())

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    test_thread.start()

    print('Press Ctrl+C to stop')

    test_thread.join()

    sess.close()
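
run_a3c_test runs the evaluation in a background thread and installs SIGINT/SIGTERM handlers that flip a stop_requested flag checked by the worker. A stripped-down sketch of that shutdown pattern (the worker body is only a placeholder for testing_model):

import signal
import threading
import time

stop_requested = False

def worker():
    # placeholder loop standing in for testing_thread.testing_model(...)
    while not stop_requested:
        time.sleep(0.1)

def signal_handler(signum, frame):
    global stop_requested
    print('You pressed Ctrl+C!')
    stop_requested = True

if __name__ == '__main__':
    test_thread = threading.Thread(target=worker)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    test_thread.start()
    print('Press Ctrl+C to stop')
    test_thread.join()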
class A3CTrainingThread(object):

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
             self.max_global_time_step
        assert learning_rate > 0, 'Learning rate {} is not >0'.format(
            learning_rate)
        return learning_rate

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        start_local_t = self.local_t
        terminal_end = False
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # Debug output for progress
            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print(('local_t = {:10}  pi = ' + '{:7.5f} ' * len(pi_) +
                       ' V = {:8.4f} (thread {})').format(
                           self.local_t, *pi_, value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            # TODO: Does this make sense?
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print ("score=", self.episode_reward)

                self._record_score(
                    sess, summary_writer, summary_op, score_input,
                    self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Compute and accumulate gradients

        R = 0.0 if terminal_end else self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # What is the meaning of these values?
        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print ("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
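
This variant splits the update into three ops: reset_gradients zeroes the buffers, accum_gradients adds the rollout's gradients into them, and apply_gradients pushes the accumulated result to the shared network. A framework-free sketch of that reset/accumulate/apply cycle (plain SGD stands in for the RMSProp applier used above):

import numpy as np

class GradientAccumulator:
    """Mimics the reset -> accumulate -> apply cycle used above."""
    def __init__(self, shapes):
        self.buffers = [np.zeros(s) for s in shapes]

    def reset(self):                          # reset_gradients
        for buf in self.buffers:
            buf.fill(0.0)

    def accumulate(self, grads):              # accum_gradients
        for buf, g in zip(self.buffers, grads):
            buf += g

    def apply(self, params, learning_rate):   # apply_gradients (plain SGD here)
        for p, buf in zip(params, self.buffers):
            p -= learning_rate * buf

# toy usage with one weight vector and two fake per-step gradients
params = [np.ones(3)]
acc = GradientAccumulator([p.shape for p in params])
acc.reset()
for step_grad in ([0.1, 0.0, -0.1], [0.2, 0.2, 0.2]):
    acc.accumulate([np.array(step_grad)])
acc.apply(params, learning_rate=0.01)
print(params[0])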
Example #31
0
def run_a3c(args):
    """
    python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015

    python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<>

    python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping
    """
    from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
    from a3c_training_thread import A3CTrainingThread
    if args.use_gpu:
        assert args.cuda_devices != ''
        os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    import tensorflow as tf

    def log_uniform(lo, hi, rate):
        log_lo = math.log(lo)
        log_hi = math.log(hi)
        v = log_lo * (1 - rate) + log_hi * rate
        return math.exp(v)
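    # e.g. log_uniform(1e-4, 1e-2, 0.5) returns 1e-3: the interpolation is
    # linear in log-space, so candidate learning rates are spread evenly
    # across orders of magnitude instead of clustering near the upper end.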

    if not os.path.exists('results/a3c'):
        os.makedirs('results/a3c')

    if args.folder is not None:
        folder = 'results/a3c/{}_{}'.format(args.gym_env.replace('-', '_'),
                                            args.folder)
    else:
        folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_'))
        end_str = ''

        if args.use_mnih_2015:
            end_str += '_mnih2015'
        if args.use_lstm:
            end_str += '_lstm'
        if args.unclipped_reward:
            end_str += '_rawreward'
        elif args.log_scale_reward:
            end_str += '_logreward'
        if args.transformed_bellman:
            end_str += '_transformedbell'

        if args.use_transfer:
            end_str += '_transfer'
            if args.not_transfer_conv2:
                end_str += '_noconv2'
            elif args.not_transfer_conv3 and args.use_mnih_2015:
                end_str += '_noconv3'
            elif args.not_transfer_fc1:
                end_str += '_nofc1'
            elif args.not_transfer_fc2:
                end_str += '_nofc2'
        if args.finetune_upper_layers_only:
            end_str += '_tune_upperlayers'
        if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0:
            end_str += '_pretrain_ina3c'
        if args.use_demo_threads:
            end_str += '_demothreads'

        if args.load_pretrained_model:
            if args.use_pretrained_model_as_advice:
                end_str += '_modelasadvice'
            if args.use_pretrained_model_as_reward_shaping:
                end_str += '_modelasshaping'
        folder += end_str

    if args.append_experiment_num is not None:
        folder += '_' + args.append_experiment_num

    if False:
        from common.util import LogFormatter
        fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w')
        fh.setLevel(logging.DEBUG)
        formatter = LogFormatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    demo_memory = None
    num_demos = 0
    max_reward = 0.
    if args.load_memory or args.load_demo_cam:
        if args.demo_memory_folder is not None:
            demo_memory_folder = args.demo_memory_folder
        else:
            demo_memory_folder = 'collected_demo/{}'.format(
                args.gym_env.replace('-', '_'))

    if args.load_memory:
        # FIXME: use new load_memory function
        demo_memory, actions_ctr, max_reward = load_memory(
            args.gym_env, demo_memory_folder,
            imgs_normalized=True)  #, create_symmetry=True)
        action_freq = [
            actions_ctr[a] for a in range(demo_memory[0].num_actions)
        ]
        num_demos = len(demo_memory)

    demo_memory_cam = None
    if args.load_demo_cam:
        demo_cam, _, total_rewards_cam, _ = load_memory(
            name=None,
            demo_memory_folder=demo_memory_folder,
            demo_ids=args.demo_cam_id,
            imgs_normalized=False)

        demo_cam = demo_cam[int(args.demo_cam_id)]
        demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height,
                                    demo_cam.width, demo_cam.phi_length),
                                   dtype=np.float32)
        for i in range(len(demo_cam)):
            s0 = (demo_cam[i])[0]
            demo_memory_cam[i] = np.copy(s0)
        del demo_cam
        logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id))

    device = "/cpu:0"
    gpu_options = None
    if args.use_gpu:
        device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"]
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_fraction)

    initial_learning_rate = args.initial_learn_rate
    logger.info('Initial Learning Rate={}'.format(initial_learning_rate))
    time.sleep(2)

    global_t = 0
    pretrain_global_t = 0
    pretrain_epoch = 0
    rewards = {'train': {}, 'eval': {}}
    best_model_reward = -(sys.maxsize)

    stop_requested = False

    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n
    game_state.close()
    del game_state.env
    del game_state

    config = tf.ConfigProto(gpu_options=gpu_options,
                            log_device_placement=False,
                            allow_soft_placement=True)

    pretrained_model = None
    pretrained_model_sess = None
    if args.load_pretrained_model:
        if args.onevsall_mtl:
            from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork
        elif args.onevsall_mtl_linear:
            from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork
        else:
            from game_class_network import MultiClassNetwork as PretrainedModelNetwork
            logger.error("Not supported yet!")
            assert False

        if args.pretrained_model_folder is not None:
            pretrained_model_folder = args.pretrained_model_folder
        else:
            pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format(
                args.gym_env.replace('-', '_'))
        PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015
        pretrained_model = PretrainedModelNetwork(action_size, -1, device)
        pretrained_model_sess = tf.Session(config=config,
                                           graph=pretrained_model.graph)
        pretrained_model.load(
            pretrained_model_sess,
            '{}/{}_checkpoint'.format(pretrained_model_folder,
                                      args.gym_env.replace('-', '_')))

    if args.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")

    grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input,
                                             decay=args.rmsp_alpha,
                                             epsilon=args.rmsp_epsilon)

    A3CTrainingThread.log_interval = args.log_interval
    A3CTrainingThread.performance_log_interval = args.performance_log_interval
    A3CTrainingThread.local_t_max = args.local_t_max
    A3CTrainingThread.demo_t_max = args.demo_t_max
    A3CTrainingThread.use_lstm = args.use_lstm
    A3CTrainingThread.action_size = action_size
    A3CTrainingThread.entropy_beta = args.entropy_beta
    A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta
    A3CTrainingThread.gamma = args.gamma
    A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015
    A3CTrainingThread.env_id = args.gym_env
    A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only
    A3CTrainingThread.transformed_bellman = args.transformed_bellman
    A3CTrainingThread.clip_norm = args.grad_norm_clip
    A3CTrainingThread.use_grad_cam = args.use_grad_cam

    if args.unclipped_reward:
        A3CTrainingThread.reward_type = "RAW"
    elif args.log_scale_reward:
        A3CTrainingThread.reward_type = "LOG"
    else:
        A3CTrainingThread.reward_type = "CLIP"

    n_shapers = args.parallel_size  #int(args.parallel_size * .25)
    mod = args.parallel_size // n_shapers
    for i in range(args.parallel_size):
        is_reward_shape = False
        is_advice = False
        if i % mod == 0:
            is_reward_shape = args.use_pretrained_model_as_reward_shaping
            is_advice = args.use_pretrained_model_as_advice
        training_thread = A3CTrainingThread(
            i,
            global_network,
            initial_learning_rate,
            learning_rate_input,
            grad_applier,
            args.max_time_step,
            device=device,
            pretrained_model=pretrained_model,
            pretrained_model_sess=pretrained_model_sess,
            advice=is_advice,
            reward_shaping=is_reward_shape)
        training_threads.append(training_thread)

    # prepare session
    sess = tf.Session(config=config)

    if args.use_transfer:
        if args.transfer_folder is not None:
            transfer_folder = args.transfer_folder
        else:
            transfer_folder = 'results/pretrain_models/{}'.format(
                args.gym_env.replace('-', '_'))
            end_str = ''
            if args.use_mnih_2015:
                end_str += '_mnih2015'
            end_str += '_l2beta1E-04_batchprop'  #TODO: make this an argument
            transfer_folder += end_str

        transfer_folder += '/transfer_model'

        if args.not_transfer_conv2:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1
            ]
        elif (args.not_transfer_conv3 and args.use_mnih_2015):
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2
            ]
        elif args.not_transfer_fc1:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]
        elif args.not_transfer_fc2:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
                global_network.W_fc1, global_network.b_fc1
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]
        else:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
                global_network.W_fc1, global_network.b_fc1,
                global_network.W_fc2, global_network.b_fc2
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]

        global_network.load_transfer_model(
            sess,
            folder=transfer_folder,
            not_transfer_fc2=args.not_transfer_fc2,
            not_transfer_fc1=args.not_transfer_fc1,
            not_transfer_conv3=(args.not_transfer_conv3
                                and args.use_mnih_2015),
            not_transfer_conv2=args.not_transfer_conv2,
            var_list=transfer_var_list)

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]

        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    if args.use_transfer:
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # summary writer for tensorboard
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(
        'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_')) +
        folder[12:], sess.graph)

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    saver = tf.train.Saver(max_to_keep=6)
    best_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(folder)
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
        # set wall time
        wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
        with open(wall_t_fname, 'r') as f:
            wall_t = float(f.read())
        with open(folder + '/pretrain_global_t', 'r') as f:
            pretrain_global_t = int(f.read())
        with open(folder + '/model_best/best_model_reward',
                  'r') as f_best_model_reward:
            best_model_reward = float(f_best_model_reward.read())
        rewards = pickle.load(
            open(
                folder + '/' + args.gym_env.replace('-', '_') +
                '-a3c-rewards.pkl', 'rb'))
    else:
        logger.warning("Could not find old checkpoint")
        # set wall time
        wall_t = 0.0
        prepare_dir(folder, empty=True)
        prepare_dir(folder + '/model_checkpoints', empty=True)
        prepare_dir(folder + '/model_best', empty=True)
        prepare_dir(folder + '/frames', empty=True)

    lock = threading.Lock()
    test_lock = False
    if global_t == 0:
        test_lock = True

    last_temp_global_t = global_t
    ispretrain_markers = [False] * args.parallel_size
    num_demo_thread = 0
    ctr_demo_thread = 0

    def train_function(parallel_index):
        nonlocal global_t, pretrain_global_t, pretrain_epoch, \
            rewards, test_lock, lock, \
            last_temp_global_t, ispretrain_markers, num_demo_thread, \
            ctr_demo_thread
        training_thread = training_threads[parallel_index]

        training_thread.set_summary_writer(summary_writer)

        # set all threads as demo threads
        training_thread.is_demo_thread = args.load_memory and args.use_demo_threads
        if training_thread.is_demo_thread or args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs:
            training_thread.pretrain_init(demo_memory)

        if global_t == 0 and (
                args.train_with_demo_num_steps > 0
                or args.train_with_demo_num_epochs > 0) and parallel_index < 2:
            ispretrain_markers[parallel_index] = True
            training_thread.replay_mem_reset()

            # Pretraining with demo memory
            logger.info("t_idx={} pretrain starting".format(parallel_index))
            while ispretrain_markers[parallel_index]:
                if stop_requested:
                    return
                if pretrain_global_t > args.train_with_demo_num_steps and pretrain_epoch > args.train_with_demo_num_epochs:
                    # At end of pretraining, reset state
                    training_thread.replay_mem_reset()
                    training_thread.episode_reward = 0
                    training_thread.local_t = 0
                    if args.use_lstm:
                        training_thread.local_network.reset_state()
                    ispretrain_markers[parallel_index] = False
                    logger.info(
                        "t_idx={} pretrain ended".format(parallel_index))
                    break

                diff_pretrain_global_t, _ = training_thread.demo_process(
                    sess, pretrain_global_t)
                for _ in range(diff_pretrain_global_t):
                    pretrain_global_t += 1
                    if pretrain_global_t % 10000 == 0:
                        logger.debug(
                            "pretrain_global_t={}".format(pretrain_global_t))

                pretrain_epoch += 1
                if pretrain_epoch % 1000 == 0:
                    logger.debug("pretrain_epoch={}".format(pretrain_epoch))

            # Waits for all threads to finish pretraining
            while not stop_requested and any(ispretrain_markers):
                time.sleep(0.01)

        # Evaluate model before training
        if not stop_requested and global_t == 0:
            with lock:
                if parallel_index == 0:
                    test_reward, test_steps, test_episodes = training_threads[
                        0].testing(sess,
                                   args.eval_max_steps,
                                   global_t,
                                   folder,
                                   demo_memory_cam=demo_memory_cam)
                    rewards['eval'][global_t] = (test_reward, test_steps,
                                                 test_episodes)
                    saver.save(
                        sess,
                        folder + '/model_checkpoints/' +
                        '{}_checkpoint'.format(args.gym_env.replace('-', '_')),
                        global_step=global_t)
                    save_best_model(test_reward)
                    test_lock = False
            # all threads wait until evaluation finishes
            while not stop_requested and test_lock:
                time.sleep(0.01)

        # set start_time
        start_time = time.time() - wall_t
        training_thread.set_start_time(start_time)
        episode_end = True
        use_demo_thread = False
        while True:
            if stop_requested:
                return
            if global_t >= (args.max_time_step * args.max_time_step_fraction):
                return

            if args.use_demo_threads and global_t < args.max_steps_threads_as_demo and episode_end and num_demo_thread < 16:
                #if num_demo_thread < 2:
                demo_rate = 1.0 * (args.max_steps_threads_as_demo -
                                   global_t) / args.max_steps_threads_as_demo
                if demo_rate < 0.0333:
                    demo_rate = 0.0333

                if np.random.random() <= demo_rate and num_demo_thread < 16:
                    ctr_demo_thread += 1
                    training_thread.replay_mem_reset(D_idx=ctr_demo_thread %
                                                     num_demos)
                    num_demo_thread += 1
                    logger.info(
                        "idx={} as demo thread started ({}/16) rate={}".format(
                            parallel_index, num_demo_thread, demo_rate))
                    use_demo_thread = True

            if use_demo_thread:
                diff_global_t, episode_end = training_thread.demo_process(
                    sess, global_t)
                if episode_end:
                    num_demo_thread -= 1
                    use_demo_thread = False
                    logger.info("idx={} demo thread concluded ({}/16)".format(
                        parallel_index, num_demo_thread))
            else:
                diff_global_t, episode_end = training_thread.process(
                    sess, global_t, rewards)

            for _ in range(diff_global_t):
                global_t += 1
                if global_t % args.eval_freq == 0:
                    temp_global_t = global_t
                    lock.acquire()
                    try:
                        # catch multiple threads getting in at the same time
                        if last_temp_global_t == temp_global_t:
                            logger.info("Threading race problem averted!")
                            continue
                        test_lock = True
                        test_reward, test_steps, n_episodes = training_thread.testing(
                            sess,
                            args.eval_max_steps,
                            temp_global_t,
                            folder,
                            demo_memory_cam=demo_memory_cam)
                        rewards['eval'][temp_global_t] = (test_reward,
                                                          test_steps,
                                                          n_episodes)
                        if temp_global_t % (
                            (args.max_time_step * args.max_time_step_fraction)
                                // 5) == 0:
                            saver.save(sess,
                                       folder + '/model_checkpoints/' +
                                       '{}_checkpoint'.format(
                                           args.gym_env.replace('-', '_')),
                                       global_step=temp_global_t,
                                       write_meta_graph=False)
                        if test_reward > best_model_reward:
                            save_best_model(test_reward)
                        test_lock = False
                        last_temp_global_t = temp_global_t
                    finally:
                        lock.release()
                if global_t % (
                    (args.max_time_step * args.max_time_step_fraction) //
                        5) == 0:
                    saver.save(
                        sess,
                        folder + '/model_checkpoints/' +
                        '{}_checkpoint'.format(args.gym_env.replace('-', '_')),
                        global_step=global_t,
                        write_meta_graph=False)
                # all threads wait until evaluation finishes
                while not stop_requested and test_lock:
                    time.sleep(0.01)

    def signal_handler(signal, frame):
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True

        if stop_requested and global_t == 0:
            sys.exit(1)

    def save_best_model(test_reward):
        nonlocal best_model_reward
        best_model_reward = test_reward
        with open(folder + '/model_best/best_model_reward',
                  'w') as f_best_model_reward:
            f_best_model_reward.write(str(best_model_reward))
        best_saver.save(
            sess, folder + '/model_best/' +
            '{}_checkpoint'.format(args.gym_env.replace('-', '_')))

    train_threads = []
    for i in range(args.parallel_size):
        train_threads.append(
            threading.Thread(target=train_function, args=(i, )))

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # set start time
    start_time = time.time() - wall_t

    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')

    for t in train_threads:
        t.join()

    logger.info('Now saving data. Please wait')

    # write wall time
    wall_t = time.time() - start_time
    wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
    with open(wall_t_fname, 'w') as f:
        f.write(str(wall_t))
    with open(folder + '/pretrain_global_t', 'w') as f:
        f.write(str(pretrain_global_t))

    root_saver.save(
        sess,
        folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')),
        global_step=global_t)

    pickle.dump(
        rewards,
        open(
            folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl',
            'wb'), pickle.HIGHEST_PROTOCOL)
    logger.info('Data saved!')

    sess.close()
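
run_a3c also keeps the wall clock consistent across restarts: the elapsed wall_t is written next to the checkpoint, and on resume start_time is rewound by that amount so the steps-per-second log keeps counting from where it left off. A minimal sketch of that bookkeeping (the file name below is illustrative; the code above suffixes it with the global step):

import os
import time

WALL_T_FILE = 'wall_t.txt'   # illustrative; the code above uses 'wall_t.<global_t>'

def resume_start_time():
    """Rewind start_time by any previously recorded wall time."""
    wall_t = 0.0
    if os.path.exists(WALL_T_FILE):
        with open(WALL_T_FILE) as f:
            wall_t = float(f.read())
    return time.time() - wall_t

def save_wall_time(start_time):
    """Persist elapsed wall time so the next run can continue the clock."""
    with open(WALL_T_FILE, 'w') as f:
        f.write(str(time.time() - start_time))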
def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
              agent_type, action_size, rand_seed, checkpoint_dir):

    # use CPU for weight visualize tool
    device = "/cpu:0"

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    game = GameState(rand_seed, action_size)
    game.process(0)
    x_t = game.x_t

    plt.imshow(x_t, interpolation="nearest", cmap=plt.cm.gray)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    W_conv1 = sess.run(global_network.W_conv1)

    # show graph of W_conv1
    fig, axes = plt.subplots(4,
                             16,
                             figsize=(12, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(4 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    W_conv2 = sess.run(global_network.W_conv2)

    # show graph of W_conv2
    fig, axes = plt.subplots(2,
                             32,
                             figsize=(27, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(2 * 32)):
        inch = i // 32
        outch = i % 32
        img = W_conv2[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    arr = sess.run(global_network.get_vars())

    s = tf.placeholder("float", [None, 84, 84, 4])

    b_conv1 = sess.run(global_network.b_conv1)
    b_conv2 = sess.run(global_network.b_conv2)

    inp_1 = tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID")
    h_conv1 = tf.nn.relu(inp_1 + b_conv1)

    inp_2 = tf.nn.conv2d(h_conv1,
                         W_conv2,
                         strides=[1, 2, 2, 1],
                         padding="VALID")
    h_conv2 = tf.nn.relu(inp_2 + b_conv2)

    s_t = game.s_t

    getActivations(sess, s, h_conv1, s_t, 16)
    getActivations(sess, s, h_conv2, s_t, 32)
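
getActivations itself is not shown in this snippet; a plausible sketch, assuming it runs one stacked state through the given layer and tiles the resulting feature maps (the parameter names are guesses based on the call sites above, and plt is matplotlib.pyplot as already used in visualize):

def getActivations(sess, s, layer, stimuli, n_filters):
    """Run one input through `layer` and show its feature maps in a grid."""
    units = sess.run(layer, feed_dict={s: [stimuli]})
    cols = 8
    rows = (n_filters + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(2 * cols, 2 * rows),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.1, wspace=0.1)
    for i, ax in enumerate(axes.flat[:n_filters]):
        ax.imshow(units[0, :, :, i], cmap=plt.cm.gray, interpolation='nearest')
    plt.show()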
Example #33
0
from constants import PARALLEL_SIZE
from constants import MAX_TIME_STEP
from constants import CHECKPOINT_DIR
from constants import RMSP_EPSILON
from constants import RMSP_ALPHA
from constants import GRAD_NORM_CLIP
from constants import USE_GPU
from constants import USE_LSTM

# use CPU for weight visualize tool
device = "/cpu:0"

if USE_LSTM:
  global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)
else:
  global_network = GameACFFNetwork(ACTION_SIZE, -1, device)

training_threads = []

learning_rate_input = tf.placeholder(PRECISION)

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = RMSP_ALPHA,
                              momentum = 0.0,
                              epsilon = RMSP_EPSILON,
                              clip_norm = GRAD_NORM_CLIP,
                              device = device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 pinitial_learning_rate,
                 plearning_rate_input,
                 pgrad_applier,
                 vinitial_learning_rate,
                 vlearning_rate_input,
                 vgrad_applier,
                 max_global_time_step,
                 device,
                 task_index=""):

        self.thread_index = thread_index
        self.plearning_rate_input = plearning_rate_input
        self.vlearning_rate_input = vlearning_rate_input
        self.max_global_time_step = max_global_time_step
        self.game_state = GameState()
        state = self.game_state.reset()
        self.game_state.reset_gs(state)
        self.action_size = self.game_state.action_size
        self.state_size = self.game_state.state_size
        self.local_max_iter = self.game_state.local_max_iter

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   self.state_size,
                                                   self.game_state.action_low,
                                                   self.game_state.action_high,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 self.state_size,
                                                 self.game_state.action_low,
                                                 self.game_state.action_high,
                                                 thread_index, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
            self.policy_gradients = tf.gradients(
                self.local_network.policy_loss,
                pvar_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)
            vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
            self.value_gradients = tf.gradients(
                self.local_network.value_loss,
                vvar_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        self.apply_policy_gradients = pgrad_applier.apply_gradients(
            self.local_network.get_pvars(), self.policy_gradients)
        self.apply_value_gradients = vgrad_applier.apply_gradients(
            self.local_network.get_vvars(), self.value_gradients)

        self.local_t = 0

        self.pinitial_learning_rate = pinitial_learning_rate
        self.vinitial_learning_rate = vinitial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _panneal_learning_rate(self, global_time_step):
        learning_rate = self.pinitial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def _vanneal_learning_rate(self, global_time_step):
        learning_rate = self.vinitial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

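    # Sample a discrete action index from the policy's output probabilities.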
    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self,
                sess,
                global_t,
                summary_writer,
                summary_op,
                score_input,
                score_ph="",
                score_ops=""):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_local_t = self.local_t

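        # Snapshot the LSTM states at the start of the rollout so the updates
        # below can be replayed from the same hidden state.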
        if USE_LSTM:
            pstart_lstm_state = self.local_network.plstm_state_out
            vstart_lstm_state = self.local_network.vlstm_state_out

        # roll out up to local_max_iter steps
        for i in range(self.local_max_iter):
            action, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # reward clipping (disabled here; the raw reward is stored)
            #rewards.append( np.clip(reward,-1,1) )
            rewards.append(reward)

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()
            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward /
                                        self.game_state.r_sc))
                score = self.episode_reward / self.game_state.r_sc
                if summary_writer:
                    self._record_score(
                        sess, summary_writer, summary_op, score_input,
                        self.episode_reward / self.game_state.r_sc, global_t)
                else:
                    sess.run(
                        score_ops,
                        {score_ph: self.episode_reward / self.game_state.r_sc})

                self.episode_reward = 0
                state = self.game_state.reset()
                self.game_state.reset_gs(state)
                if USE_LSTM:
                    self.local_network.reset_state()
                break

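        # Bootstrap the return from the value estimate of the last state
        # unless the episode terminated.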
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)
            score = self.episode_reward / self.game_state.r_sc

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute the discounted n-step returns and TD errors used as training targets
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi

            batch_si.append(si)
            batch_a.append(ai)
            batch_R.append(R)
            batch_td.append(td)

        pcur_learning_rate = self._panneal_learning_rate(global_t)
        vcur_learning_rate = self._vanneal_learning_rate(global_t)

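        # The batches were accumulated in reverse time order; the LSTM branch
        # restores temporal order before feeding (order does not matter for
        # the feed-forward network).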
        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.apply_policy_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.pinitial_lstm_state:
                         pstart_lstm_state,
                         self.local_network.pstep_size: [len(batch_a)],
                         self.local_network.vinitial_lstm_state:
                         vstart_lstm_state,
                         self.local_network.vstep_size: [len(batch_a)],
                         self.plearning_rate_input: pcur_learning_rate
                     })
            sess.run(self.apply_value_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.pinitial_lstm_state:
                         pstart_lstm_state,
                         self.local_network.pstep_size: [len(batch_a)],
                         self.local_network.vinitial_lstm_state:
                         vstart_lstm_state,
                         self.local_network.vstep_size: [len(batch_a)],
                         self.vlearning_rate_input: vcur_learning_rate
                     })
        else:
            sess.run(self.apply_policy_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.r: batch_R,
                         self.local_network.td: batch_td,
                         self.plearning_rate_input: pcur_learning_rate
                     })
            sess.run(self.apply_value_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.r: batch_R,
                         self.local_network.td: batch_td,
                         self.vlearning_rate_input: vcur_learning_rate
                     })

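        # Periodic throughput logging, emitted by the first worker thread only.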
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            #print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
            #  global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
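
A quick standalone check of the return/TD computation in process() can be useful. The sketch below mirrors the reverse-order accumulation with plain Python; the helper name nstep_targets, the discount value, and the example numbers are illustrative and are not part of the code above.

def nstep_targets(rewards, values, bootstrap_R, gamma=0.99):
    """Compute n-step discounted returns and TD errors for one rollout.

    R_t = r_t + gamma * R_{t+1}, where R after the last step equals
    `bootstrap_R` (0.0 if the episode terminated, V(s_T) otherwise),
    matching the accumulation loop in A3CTrainingThread.process().
    """
    batch_R, batch_td = [], []
    R = bootstrap_R
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        batch_R.append(R)
        batch_td.append(R - v)
    # Restore temporal order, as the LSTM branch does before feeding.
    return batch_R[::-1], batch_td[::-1]


# Example: a 3-step rollout that did not terminate, bootstrapped with V(s_3) = 0.5
returns, tds = nstep_targets(rewards=[1.0, 0.0, 1.0],
                             values=[0.2, 0.4, 0.6],
                             bootstrap_R=0.5,
                             gamma=0.9)
print(returns)  # [2.1745, 1.305, 1.45]
print(tds)      # [1.9745, 0.905, 0.85]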