Example no. 1
def main(_):
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        # if config.env_type == 'simple':
        #   env = SimpleGymEnvironment(config)
        # else:
        #   env = GymEnvironment(config)

        env1 = ThreeDMountainCarEnv()
        env2 = MountainCarEnv()

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent1 = Agent(config, env1, sess)
        agent2 = Agent(config, env2, sess)

        if FLAGS.is_train:
            agent1.train()
        else:
            agent1.play()
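Note: every main(_) in these examples relies on TF1-style command-line flags and tf.app.run(), which the snippets omit. A minimal sketch of that boilerplate, with flag names taken from the examples and defaults assumed:

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('env_name', 'Breakout-v0', 'name of the gym environment (assumed default)')
flags.DEFINE_boolean('use_gpu', True, 'run the CNN in NCHW format on a GPU')
flags.DEFINE_boolean('is_train', True, 'train (True) or play (False)')
flags.DEFINE_string('gpu_fraction', '1/1', "idx/num string describing how much GPU memory to take")
FLAGS = flags.FLAGS

if __name__ == '__main__':
    tf.app.run()  # parses the flags and calls main(_)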
Example no. 2
    def __init__(self, cfg):
        self.num_states = cfg.MODEL.SIZE_STATE
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.num_episodes = cfg.SOLVER.NUM_EPISODES

        self.tetris = Tetris(cfg)
        self.agent = Agent(cfg,self.tetris)
Example no. 3
def main(_):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        K.set_session(sess)
        config = get_config(FLAGS) or FLAGS

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        # Create a single instance of Agent to be multi-threaded
        agent = Agent(config, env, sess, threading.Lock())

        if FLAGS.is_train:
            init_threads(agent, config)
        else:
            agent.play(env)
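Note: Example no. 3 gives the single Agent a shared lock and defers training to an init_threads helper that is not shown. A purely hypothetical sketch of such a helper, assuming the config exposes an n_threads count and that Agent.train serializes its TF updates with the lock it received:

import threading

def init_threads(agent, config):
    # hypothetical: run several training workers against one shared Agent
    n_threads = getattr(config, 'n_threads', 4)  # assumed attribute
    workers = [threading.Thread(target=agent.train) for _ in range(n_threads)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()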
Example no. 4
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if FLAGS.cpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.save_weight:
      agent.save_weight_to_pkl()
    if FLAGS.load_weight:
      agent.load_weight_from_pkl(cpu_mode=FLAGS.cpu)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
Example no. 5
 def __init__(self, state_shape, n_action, net, model_path='model/dqn'):
     self.state_shape = state_shape
     self.n_action = n_action
     self.lr = 1e-4
     self.gamma = 0.9
     self.sampling_size = 20000
     self.agent = Agent(self.state_shape, self.n_action, self.lr, 0.9, net)
     self.sampling_pool = Sampling_Pool(self.sampling_size)
     self.cum_r = []
     self.model_path = model_path
Example no. 6
def main(_):
    with tf.Session() as sess:
        config = get_config(FLAGS) or FLAGS
        env = MyEnvironment(config)
        agent = Agent(config, env, sess)

        if FLAGS.is_train:
            # agent.train()
            print('To be released.')
        else:
            agent.play()
Example no. 7
def main(_):
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        ACPconfig = ACPConfig(env)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        # Because of tight coupling in the code, these steps must run in this exact order!
        acpAgent = acp.acp(sess, ACPconfig)
        agentDQN = Agent(config, env, acpAgent, sess)
        acpAgent.setdir(agentDQN.model_dir)

        sess.run(tf.initializers.global_variables())
        # Load both models if exist any checkpoint
        acpAgent.load()
        agentDQN.load()
        if FLAGS.is_train:
            agentDQN.train()
        else:
            raise Exception('agentDQN.play() is Not Implemented')
            agentDQN.play()
Example no. 8
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.mode == "train":
      agent.train()
    elif FLAGS.mode == "test":
      agent.play()
    elif FLAGS.mode == "ale":
      agent.play2()
Example no. 9
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    env = OurEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    agent = Agent(config, env, sess)

    agent.train()
Example no. 10
def main():
    sess = tf.Session()
    config = Config()
    env = Environment(config)
    agent = Agent(sess, config, env)
    if config.test:
        agent.play(test=True)
    elif config.train:
        agent.train()
    else:
        agent.play()
Example no. 11
def main(_):
  if FLAGS.gpu_fraction == "1/1":
    FLAGS.gpu_fraction = "0.999/1.0"
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
  # Set the GPU usage ratio explicitly, otherwise TensorFlow may report an error

  #config = tf.ConfigProto()
  #config.gpu_options.allow_growth = True
  #with tf.Session(config=config) as sess:

    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if FLAGS.poison:
      config.poison_line = input("input the number of poison line:")

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      if FLAGS.poison:
        agent.train_poison()
      else:
        agent.train()
    else:
      if FLAGS.poison:
        agent.play_poison()
      else:
        agent.play()
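Note: most of these examples pass FLAGS.gpu_fraction through a calc_gpu_fraction helper that is never shown; the "1/1" -> "0.999/1.0" rewrite in Example no. 11 suggests the flag is an "idx/num" string. A minimal sketch under that assumption:

def calc_gpu_fraction(fraction_string):
    # parse an "idx/num" string such as "1/3" into a per-process memory fraction
    idx, num = fraction_string.split('/')
    fraction = float(idx) / float(num)
    print(" [*] GPU : %.4f" % fraction)
    return fraction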
Example no. 12
File: main.py Project: guoyijie/ERL
def main(_):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=123)
    parser.add_argument('--test', action="store_true")
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--mode", help="Bonus mode", default="pixelcnn")
    args = parser.parse_args()

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        config = get_config(args)

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        if not tf.test.is_gpu_available() and args.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if args.mode == "pixelcnn":
            from dqn.agent import Agent
            agent = Agent(config, env, sess)
        elif args.mode == "autoencoder":
            from dqn.agent_model import Agent
            agent = Agent(config, env, sess)
        elif args.mode == "top-pixelcnn":
            from dqn.agent_top import Agent
            agent = Agent(config, env, sess)
        else:
            raise ValueError("No such mode")

        print("CNN format", config.cnn_format)
        if not args.test:
            print("training ...")
            agent.train()
        else:
            print("testing ...")
            agent.play()
Example no. 13
def main(_):
    with tf.Session() as sess:
        config = get_config(FLAGS)
        env = MyEnvironment(config)
        agent = Agent(config, env, sess)

        if FLAGS.is_train:
            agent.train()
        else:
            if FLAGS.dataset == 'mine':
                agent.play_mine()
            else:
                agent.play()
Example no. 14
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if FLAGS.use_gpu:
      config.cnn_format = 'NCHW'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
Example no. 15
def main(_):
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS  # load the configuration options via get_config() in config.py

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            # If use_gpu was requested but no GPU is detected, raise an error.
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'  # input format: [batch, in_height, in_width, in_channels]
        # The alternative input format NCHW is [batch, in_channels, in_height, in_width].

        agent = Agent(config, env, sess)  # create the DQN agent

        if FLAGS.is_train:
            agent.train()
        else:
            agent.play()  # no training, demo only
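Note: the cnn_format switch in these examples only changes the tensor layout that the convolutions expect. A small sketch of the conversion between the two layouts, assuming the usual 84x84x4 Atari history frames:

import numpy as np

batch_nhwc = np.zeros((32, 84, 84, 4), dtype=np.float32)   # [batch, height, width, channels]
batch_nchw = np.transpose(batch_nhwc, (0, 3, 1, 2))        # [batch, channels, height, width]
assert batch_nchw.shape == (32, 4, 84, 84)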
Example no. 16
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    #if config.env_type == 'simple':
    #  env = SimpleGymEnvironment(config)
    #else:
    #  env = GymEnvironment(config)


    env = retro.make(game='SonicAndKnuckles3-Genesis', state='MushroomHillZone.Act1')
    env = SonicDiscretizer(env)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
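Note: Example no. 16 wraps the gym-retro environment in a SonicDiscretizer that is not shown. A sketch of the commonly used wrapper of that name, which maps a small Discrete action set onto the 12-button Genesis pad (the exact button combinations are assumptions):

import gym
import numpy as np

class SonicDiscretizer(gym.ActionWrapper):
    def __init__(self, env):
        super(SonicDiscretizer, self).__init__(env)
        buttons = ['B', 'A', 'MODE', 'START', 'UP', 'DOWN', 'LEFT', 'RIGHT', 'C', 'Y', 'X', 'Z']
        combos = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'],
                  ['DOWN'], ['DOWN', 'B'], ['B']]
        self._actions = []
        for combo in combos:
            arr = np.array([False] * len(buttons))
            for button in combo:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a):
        # translate the discrete action index into a button array
        return self._actions[a].copy()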
Example no. 17
def main(_):
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    #sess = tf.Session(config=config)

    with tf.Session(config=config) as sess:
        config = get_config(FLAGS) or FLAGS

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        if FLAGS.is_train:
            agent.train()
        else:
            agent.play()
Example no. 18
def main(args):
    with tf.Session() as sess:
        config = Config()
        player = Agent(config, sess)

        if config.isTrain:
            player.train()
        else:
            player.play()
Example no. 19
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
Example no. 20
def main(_):

    with tf.Session() as sess:
        config = get_config(FLAGS) or FLAGS

        env = GymEnvironment(config)

        agent = Agent(config, env, sess)

        if config.is_train:
            agent.train()
        else:
            agent.play()
Example no. 21
def main(_):
    with tf.Session() as sess:
        config = get_config(FLAGS)
        env = MyEnvironment(config)
        agent = Agent(config, env, sess)

        if FLAGS.is_train:
            agent.train()
        else:
            if FLAGS.dataset == 'image':
                img_dirs = glob.glob(os.path.join(FLAGS.fold, "*.*"))
                
                for img_dir in img_dirs:
                    img = cv2.imread(img_dir)
                    out_img = agent.play_image(img)
                    cv2.imwrite(os.path.join("results/images", os.path.basename(img_dir)), out_img)
                    print("processing {}".format(img_dir))
            else:
                agent.play()
Example no. 22
def main(_):
  # set the fraction of GPU memory allocated to each process
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
Example no. 23
def main(_):
    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list='0')

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)

        if FLAGS.is_train:
            agent.train()
        else:
            agent.play()
Example no. 24
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=True)) as sess:
  # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'
    
    with tf.device('/gpu:2'):
        agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
Example no. 25
def main(_):
    # Trying to request all the GPU memory will fail, since the system
    # always allocates a little memory on each GPU for itself. Only set
    # up a GPU configuration if fractional amount of memory is requested.
    tf_config = None
    gpu_fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    if gpu_fraction < 1:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
        tf_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=tf_config) as sess:
        config = get_config(FLAGS) or FLAGS
        env = GymEnvironment(config)

        # Change data format for running on a CPU.
        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)

        if FLAGS.train:
            agent.train()
        else:
            agent.play()
Example no. 26
def main(_):

  with tf.Session() as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    roms = 'roms/Pong2PlayerVS.bin'
    ale = ALEInterface(roms.encode('utf-8'))
    width = ale.ale_getScreenWidth()
    height = ale.ale_getScreenHeight()
    game_screen = GameScreen()
    ale.ale_resetGame()
    (display_width, display_height) = (width * 2, height * 2)

    pygame.init()
    screen_ale = pygame.display.set_mode((display_width, display_height))
    pygame.display.set_caption("Arcade Learning Environment Random Agent Display")
    pygame.display.flip()

    game_surface = pygame.Surface((width, height), depth=8)
    clock = pygame.time.Clock()

    # Clear screen
    screen_ale.fill((0, 0, 0))

    agent = Agent(config, env, sess, 'A')
    agent2 = Agent2(config, env, sess, 'B')

    if FLAGS.is_train:
      start_epoch = agent.epoch_op.eval()
      start_step = agent.step_op.eval()
      start_time = time.time()

      # Loop for epochs
      for agent.epoch in range(start_epoch, agent.max_epoch):
        agent2.epoch = agent.epoch

        # Initialize information of gameplay
        num_game, agent.update_count, agent2.update_count, ep_rewardA, ep_rewardB = 0, 0, 0, 0., 0.
        total_rewardA, total_rewardB, agent.total_loss, agent2.total_loss, agent.total_q, agent2.total_q = 0., 0., 0., 0., 0., 0.
        max_avg_ep_rewardA, max_avg_ep_rewardB = 0, 0
        ep_rewardsA, ep_rewardsB, actionsA, actionsB = [], [], [], []

        # Get first frame of gameplay
        numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
        rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
        del numpy_surface        
        game_screen.paint(rgb)
        pooled_screen = game_screen.grab()
        scaled_pooled_screen = scale_image(pooled_screen)

        # Add first frame of gameplay into both agents' replay history
        for _ in range(agent.history_length):
          agent.history.add(scaled_pooled_screen)
          agent2.history.add(scaled_pooled_screen)

        # Loop for training iterations
        for agent.step in tqdm(range(start_step, agent.max_step), ncols=70, initial=start_step):
          agent2.step = agent.step

          # End of burn in period, start to learn from frames
          if agent.step == agent.learn_start:
            num_game, agent.update_count, agent2.update_count, ep_rewardA, ep_rewardB = 0, 0, 0, 0., 0.
            total_rewardA, total_rewardB, agent.total_loss, agent2.total_loss, agent.total_q, agent2.total_q = 0., 0., 0., 0., 0., 0.
            max_avg_ep_rewardA, max_avg_ep_rewardB = 0, 0
            ep_rewardsA, ep_rewardsB, actionsA, actionsB = [], [], [], []
          
          # 1. predict
          action1 = agent.predict(agent.history.get())
          action2 = agent2.predict(agent2.history.get())

          # 2. act
          ale.ale_act2(action1, action2)
          terminal = ale.ale_isGameOver()
          # End of epoch: finish up training so that game statistics can be collected without corrupting the training data
          if agent.step == agent.max_step - 1:
            terminal = True
          rewardA = ale.ale_getRewardA()
          rewardB = ale.ale_getRewardB()
          
          # Fill buffer of game screen with current frame
          numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
          rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
          del numpy_surface        
          game_screen.paint(rgb)
          pooled_screen = game_screen.grab()
          scaled_pooled_screen = scale_image(pooled_screen)
          agent.observe(scaled_pooled_screen, rewardA, action1, terminal)
          agent2.observe(scaled_pooled_screen, rewardB, action2, terminal)

          # Print frame onto display screen
          screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))

          # Update the display screen
          pygame.display.flip()

          # Check if current episode ended
          if terminal:
            ale.ale_resetGame()
            terminal = ale.ale_isGameOver()
            rewardA = ale.ale_getRewardA()
            rewardB = ale.ale_getRewardB()
            numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)

            rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
            del numpy_surface        
            game_screen.paint(rgb)
            pooled_screen = game_screen.grab()
            scaled_pooled_screen = scale_image(pooled_screen)

            # End of an episode
            num_game += 1
            ep_rewardsA.append(ep_rewardA)
            ep_rewardsB.append(ep_rewardB)
            ep_rewardA = 0.
            ep_rewardB = 0.
          else:
            ep_rewardA += rewardA
            ep_rewardB += rewardB

          actionsA.append(action1)
          actionsB.append(action2)
          total_rewardA += rewardA
          total_rewardB += rewardB

          # Do a test to get statistics so far
          if agent.step >= agent.learn_start:
            if agent.step % agent.test_step == agent.test_step - 1:
              avg_rewardA = total_rewardA / agent.test_step
              avg_rewardB = total_rewardB / agent2.test_step
              avg_lossA = agent.total_loss / agent.update_count
              avg_lossB = agent2.total_loss / agent2.update_count
              avg_qA = agent.total_q / agent.update_count
              avg_qB = agent2.total_q / agent2.update_count

              try:
                max_ep_rewardA = np.max(ep_rewardsA)
                min_ep_rewardA = np.min(ep_rewardsA)
                avg_ep_rewardA = np.mean(ep_rewardsA)
                max_ep_rewardB = np.max(ep_rewardsB)
                min_ep_rewardB = np.min(ep_rewardsB)
                avg_ep_rewardB = np.mean(ep_rewardsB)
              except:
                max_ep_rewardA, min_ep_rewardA, avg_ep_rewardA, max_ep_rewardB, min_ep_rewardB, avg_ep_rewardB = 0, 0, 0, 0, 0, 0

              print('\nFor Agent A at Epoch %d: avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                  % (agent.epoch, avg_rewardA, avg_lossA, avg_qA, avg_ep_rewardA, max_ep_rewardA, min_ep_rewardA, num_game))
              print('\nFor Agent B at Epoch %d: avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                  % (agent2.epoch, avg_rewardB, avg_lossB, avg_qB, avg_ep_rewardB, max_ep_rewardB, min_ep_rewardB, num_game))

              if max_avg_ep_rewardA * 0.9 <= avg_ep_rewardA:
                agent.step_assign_op.eval({agent.step_input: agent.step + 1})
                agent.save_model(agent.step + 1)

                max_avg_ep_rewardA = max(max_avg_ep_rewardA, avg_ep_rewardA)

              if max_avg_ep_rewardB * 0.9 <= avg_ep_rewardB:
                agent2.step_assign_op.eval({agent2.step_input: agent2.step + 1})
                agent2.save_model(agent2.step + 1)

                max_avg_ep_rewardB = max(max_avg_ep_rewardB, avg_ep_rewardB)

              if agent.step > 180:
                agent.inject_summary({
                    'average.reward': avg_rewardA,
                    'average.loss': avg_lossA,
                    'average.q': avg_qA,
                    'episode.max reward': max_ep_rewardA,
                    'episode.min reward': min_ep_rewardA,
                    'episode.avg reward': avg_ep_rewardA,
                    'episode.num of game': num_game,
                    'episode.rewards': ep_rewardsA,
                    'episode.actions': actionsA,
                    'training.learning_rate': agent.learning_rate_op.eval({agent.learning_rate_step: agent.step}),
                  }, agent.step)

              if agent2.step > 180:
                agent2.inject_summary({
                    'average.reward': avg_rewardB,
                    'average.loss': avg_lossB,
                    'average.q': avg_qB,
                    'episode.max reward': max_ep_rewardB,
                    'episode.min reward': min_ep_rewardB,
                    'episode.avg reward': avg_ep_rewardB,
                    'episode.num of game': num_game,
                    'episode.rewards': ep_rewardsB,
                    'episode.actions': actionsB,
                    'training.learning_rate': agent2.learning_rate_op.eval({agent2.learning_rate_step: agent2.step}),
                  }, agent2.step)

              # Reset statistics
              num_game = 0
              total_rewardA, total_rewardB = 0., 0.
              agent.total_loss, agent2.total_loss = 0., 0.
              agent.total_q, agent2.total_q = 0., 0.
              agent.update_count, agent2.update_count = 0, 0
              ep_rewardA, ep_rewardB = 0., 0.
              ep_rewardsA, ep_rewardsB = [], []
              actionsA, actionsB = [], []

        # Play 10 games at the end of epoch to get game statistics
        total_points, paddle_bounce, wall_bounce, serving_time = [], [], [], []
        for _ in range(10):
          cur_total_points, cur_paddle_bounce, cur_wall_bounce, cur_serving_time = 0, 0, 0, 0

          # Restart game
          ale.ale_resetGame()

          # Get first frame of gameplay
          numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
          rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
          del numpy_surface        
          game_screen.paint(rgb)
          pooled_screen = game_screen.grab()
          scaled_pooled_screen = scale_image(pooled_screen)

          # Create history for testing purposes
          test_history = History(config)

          # Fill first 4 images with initial screen
          for _ in range(agent.history_length):
            test_history.add(scaled_pooled_screen)

          while not ale.ale_isGameOver():
            # 1. predict
            action1 = agent.predict(agent.history.get())
            action2 = agent2.predict(agent2.history.get())

            # 2. act
            ale.ale_act2(action1, action2)
            terminal = ale.ale_isGameOver()
            rewardA = ale.ale_getRewardA()
            rewardB = ale.ale_getRewardB()

            # Record game statistics of current episode
            cur_total_points = ale.ale_getPoints()
            cur_paddle_bounce = ale.ale_getSideBouncing()
            if ale.ale_getWallBouncing():
              cur_wall_bounce += 1
            if ale.ale_getServing():
              cur_serving_time += 1

            # Fill buffer of game screen with current frame
            numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
            rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
            del numpy_surface        
            game_screen.paint(rgb)
            pooled_screen = game_screen.grab()
            scaled_pooled_screen = scale_image(pooled_screen)
            agent.observe(scaled_pooled_screen, rewardA, action1, terminal)
            agent2.observe(scaled_pooled_screen, rewardB, action2, terminal)

            # Print frame onto display screen
            screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))

            # Update the display screen
            pygame.display.flip()

          # Append current episode's statistics into list
          total_points.append(cur_total_points)
          paddle_bounce.append(cur_paddle_bounce / cur_total_points)
          if cur_paddle_bounce == 0:
            wall_bounce.append(cur_wall_bounce / (cur_paddle_bounce + 1))
          else:
            wall_bounce.append(cur_wall_bounce / cur_paddle_bounce)
          serving_time.append(cur_serving_time / cur_total_points)

        # Save results of test after current epoch
        cur_paddle_op = agent.paddle_op.eval()
        cur_paddle_op[agent.epoch] = sum(paddle_bounce) / len(paddle_bounce)
        agent.paddle_assign_op.eval({agent.paddle_input: cur_paddle_op})

        cur_wall_op = agent.wall_op.eval()
        cur_wall_op[agent.epoch] = sum(wall_bounce) / len(wall_bounce)
        agent.wall_assign_op.eval({agent.wall_input: cur_wall_op})

        cur_serving_op = agent.serving_op.eval()
        cur_serving_op[agent.epoch] = sum(serving_time) / len(serving_time)
        agent.serving_assign_op.eval({agent.serving_input: cur_serving_op})

        agent.save_model(agent.step + 1)
    else:
      agent.play()
      agent2.play()
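Note: scale_image and GameScreen are not shown in Example no. 26. A hypothetical sketch of what scale_image might do, assuming the standard DQN preprocessing of downscaling the pooled screen to an 84x84 grayscale frame:

import cv2
import numpy as np

def scale_image(screen, size=(84, 84)):
    # hypothetical helper: grayscale + resize the pooled RGB screen
    gray = cv2.cvtColor(screen.astype(np.uint8), cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, size, interpolation=cv2.INTER_AREA)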
Example no. 27
def train(sess, config):

    env = GymEnvironment(config)

    log_dir = './log/{}_lookahead_{}_gats_{}/'.format(config.env_name,
                                                      config.lookahead,
                                                      config.gats)
    checkpoint_dir = os.path.join(log_dir, 'checkpoints/')
    image_dir = os.path.join(log_dir, 'rollout/')
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)
        print(' [*] Removed log dir: ' + log_dir)

    with tf.variable_scope('step'):
        step_op = tf.Variable(0, trainable=False, name='step')
        step_input = tf.placeholder('int32', None, name='step_input')
        step_assign_op = step_op.assign(step_input)

    with tf.variable_scope('summary'):
        scalar_summary_tags = [
            'average.reward', 'average.loss', 'average.q value',
            'episode.max reward', 'episode.min reward', 'episode.avg reward',
            'episode.num of game', 'training.learning_rate', 'rp.rp_accuracy',
            'rp.rp_plus_accuracy', 'rp.rp_minus_accuracy',
            'rp.nonzero_rp_accuracy'
        ]

        summary_placeholders = {}
        summary_ops = {}

        for tag in scalar_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32',
                                                       None,
                                                       name=tag.replace(
                                                           ' ', '_'))
            summary_ops[tag] = tf.summary.scalar(
                "%s-%s/%s" % (config.env_name, config.env_type, tag),
                summary_placeholders[tag])

        histogram_summary_tags = ['episode.rewards', 'episode.actions']

        for tag in histogram_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32',
                                                       None,
                                                       name=tag.replace(
                                                           ' ', '_'))
            summary_ops[tag] = tf.summary.histogram(tag,
                                                    summary_placeholders[tag])

    config.num_actions = env.action_size
    # config.num_actions = 3

    exploration = LinearSchedule(config.epsilon_end_t, config.epsilon_end)

    agent = Agent(sess, config, num_actions=config.num_actions)

    if config.gats:
        lookahead = config.lookahead
        rp_train_frequency = 4
        gdm_train_frequency = 4
        gdm = GDM(sess, config, num_actions=config.num_actions)
        rp = RP(sess, config, num_actions=config.num_actions)
        leaves_size = config.num_actions**config.lookahead
        if config.dyna:
            gan_memory = GANReplayMemory(config)
        else:
            gan_memory = None

        def base_generator():
            tree_base = np.zeros((leaves_size, lookahead)).astype('uint8')
            for i in range(leaves_size):
                n = i
                j = 0
                while n:
                    n, r = divmod(n, config.num_actions)
                    tree_base[i, lookahead - 1 - j] = r
                    j = j + 1
            return tree_base

        tree_base = base_generator()

    # memory = ReplayMemory(config)
    memory = ReplayMemory(config, log_dir)
    history = History(config)

    tf.global_variables_initializer().run()
    saver = tf.train.Saver(max_to_keep=30)

    # Load the model if a checkpoint exists.
    load_model(sess, saver, checkpoint_dir)

    agent.updated_target_q_network()

    writer = tf.summary.FileWriter(log_dir, sess.graph)

    num_game, update_count, ep_reward = 0, 0, 0.
    total_reward, total_loss, total_q_value = 0., 0., 0.
    max_avg_ep_reward = -100
    ep_rewards, actions = [], []

    rp_accuracy = []
    rp_plus_accuracy = []
    rp_minus_accuracy = []
    nonzero_rp_accuracy = []

    screen, reward, action, terminal = env.new_random_game()

    # init state
    for _ in range(config.history_length):
        history.add(screen)

    start_step = step_op.eval()

    # main
    for step in tqdm(range(start_step, config.max_step),
                     ncols=70,
                     initial=start_step):

        if step == config.learn_start:
            num_game, update_count, ep_reward = 0, 0, 0.
            total_reward, total_loss, total_q_value = 0., 0., 0.
            ep_rewards, actions = [], []

        if step == config.gan_dqn_learn_start:
            rp_accuracy = []
            rp_plus_accuracy = []
            rp_minus_accuracy = []
            nonzero_rp_accuracy = []

        # ε-greedy
        MCTS_FLAG = False
        epsilon = exploration.value(step)
        if random.random() < epsilon:
            action = random.randrange(config.num_actions)
        else:
            current_state = norm_frame(np.expand_dims(history.get(), axis=0))
            if config.gats and (step >= config.gan_dqn_learn_start):
                action, predicted_reward = MCTS_planning(
                    gdm, rp, agent, current_state, leaves_size, tree_base,
                    config, exploration, step, gan_memory)
                MCTS_FLAG = True
            else:
                action = agent.get_action(
                    norm_frame_Q(unnorm_frame(current_state)))

        # used for GATS?
        apply_action = action
        # if int(apply_action != 0):
        #     apply_action += 1

        # Observe
        screen, reward, terminal = env.act(apply_action, is_training=True)
        reward = max(config.min_reward, min(config.max_reward, reward))
        history.add(screen)
        memory.add(screen, reward, action, terminal)

        if MCTS_FLAG:
            rp_accuracy.append(int(predicted_reward == reward))
            if reward != 0:
                nonzero_rp_accuracy.append(int(predicted_reward == reward))
                if reward == 1:
                    rp_plus_accuracy.append(int(predicted_reward == reward))
                elif reward == -1:
                    rp_minus_accuracy.append(int(predicted_reward == reward))

        # Train
        if step > config.gan_learn_start and config.gats:
            if step % rp_train_frequency == 0 and memory.can_sample(
                    config.rp_batch_size):
                obs, act, rew = memory.reward_sample(config.rp_batch_size)
                # obs, act, rew = memory.reward_sample2(
                #     config.rp_batch_size, config.lookahead)
                reward_obs, reward_act, reward_rew = memory.reward_sample(
                    config.nonzero_batch_size, nonzero=True)
                # reward_obs, reward_act, reward_rew = memory.nonzero_reward_sample(
                #     config.rp_batch_size, config.lookahead)
                obs_batch = norm_frame(
                    np.concatenate((obs, reward_obs), axis=0))
                act_batch = np.concatenate((act, reward_act), axis=0)
                rew_batch = np.concatenate((rew, reward_rew), axis=0)
                reward_label = rew_batch + 1

                trajectories = gdm.get_state(obs_batch, act_batch[:, :-1])

                rp_summary = rp.train(trajectories, act_batch, reward_label)
                writer.add_summary(rp_summary, step)

            if step % gdm_train_frequency == 0 and memory.can_sample(
                    config.gan_batch_size):
                state_batch, action_batch, next_state_batch = memory.GAN_sample(
                )
                # state_batch, act_batch, next_state_batch = memory.GAN_sample2(
                #     config.gan_batch_size, config.lookahead)

                # gdm.summary, disc_summary, merged_summary = gdm.train(
                #     norm_frame(state_batch), act_batch, norm_frame(next_state_batch), warmup_bool)
                gdm.summary, disc_summary = gdm.train(
                    norm_frame(state_batch), action_batch,
                    norm_frame(next_state_batch))

        if step > config.learn_start:
            # if step % config.train_frequency == 0 and memory.can_sample(config.batch_size):
            if step % config.train_frequency == 0:
                # s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                #     config.batch_size, config.lookahead)
                s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                )
                s_t, s_t_plus_1 = norm_frame(s_t), norm_frame(s_t_plus_1)
                if config.gats and config.dyna:
                    if step > config.gan_dqn_learn_start and gan_memory.can_sample(
                            config.batch_size):
                        gan_obs_batch, gan_act_batch, gan_rew_batch, gan_terminal_batch = gan_memory.sample(
                        )
                        # gan_obs_batch, gan_act_batch, gan_rew_batch = gan_memory.sample(
                        #     config.batch_size)
                        gan_obs_batch = norm_frame(gan_obs_batch)
                        trajectories = gdm.get_state(
                            gan_obs_batch, np.expand_dims(gan_act_batch,
                                                          axis=1))
                        gan_next_obs_batch = trajectories[:, -config.
                                                          history_length:, ...]

                        # gan_obs_batch, gan_next_obs_batch = \
                        #     norm_frame(gan_obs_batch), norm_frame(gan_next_obs_batch)

                        s_t = np.concatenate([s_t, gan_obs_batch], axis=0)
                        act_batch = np.concatenate([act_batch, gan_act_batch],
                                                   axis=0)
                        rew_batch = np.concatenate([rew_batch, gan_rew_batch],
                                                   axis=0)
                        s_t_plus_1 = np.concatenate(
                            [s_t_plus_1, gan_next_obs_batch], axis=0)
                        terminal_batch = np.concatenate(
                            [terminal_batch, gan_terminal_batch], axis=0)

                s_t, s_t_plus_1 = norm_frame_Q(
                    unnorm_frame(s_t)), norm_frame_Q(unnorm_frame(s_t_plus_1))

                q_t, loss, dqn_summary = agent.train(s_t, act_batch, rew_batch,
                                                     s_t_plus_1,
                                                     terminal_batch, step)

                writer.add_summary(dqn_summary, step)
                total_loss += loss
                total_q_value += q_t.mean()
                update_count += 1

            if step % config.target_q_update_step == config.target_q_update_step - 1:
                agent.updated_target_q_network()

        # reinit
        if terminal:
            screen, reward, action, terminal = env.new_random_game()

            num_game += 1
            ep_rewards.append(ep_reward)
            ep_reward = 0.
        else:
            ep_reward += reward

        total_reward += reward

        # change the training frequency
        if config.gats:
            if step == 10000 - 1:
                rp_train_frequency = 8
                gdm_train_frequency = 8
            if step == 50000 - 1:
                rp_train_frequency = 16
                gdm_train_frequency = 16
            if step == 100000 - 1:
                rp_train_frequency = 24
                gdm_train_frequency = 24

        # perform a rollout and save the images
        if config.gats and step % config._test_step == config._test_step - 1:
            rollout_image(config, image_dir, gdm, memory, step + 1, 16)

        # calculate and log statistics
        if step >= config.learn_start:
            if step % config._test_step == config._test_step - 1:

                # plot
                if config.gats:
                    writer.add_summary(gdm.summary, step)
                    writer.add_summary(disc_summary, step)

                avg_reward = total_reward / config._test_step
                avg_loss = total_loss / update_count
                avg_q = total_q_value / update_count

                try:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                except:
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                print(
                    '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d'
                    % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                       max_ep_reward, min_ep_reward, num_game))

                # requires the target q network
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    step_assign_op.eval({step_input: step + 1})
                    save_model(sess, saver, checkpoint_dir, step + 1)

                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                if step >= config.gan_dqn_learn_start:
                    if len(rp_accuracy) > 0:
                        rp_accuracy = np.mean(rp_accuracy)
                        rp_plus_accuracy = np.mean(rp_plus_accuracy)
                        rp_minus_accuracy = np.mean(rp_minus_accuracy)
                        nonzero_rp_accuracy = np.mean(nonzero_rp_accuracy)
                    else:
                        rp_accuracy = 0
                        rp_plus_accuracy = 0
                        rp_minus_accuracy = 0
                        nonzero_rp_accuracy = 0
                else:
                    rp_accuracy = 0
                    rp_plus_accuracy = 0
                    rp_minus_accuracy = 0
                    nonzero_rp_accuracy = 0

                # summary
                if step > 180:
                    inject_summary(
                        sess, writer, summary_ops, summary_placeholders, {
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q value': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of game': num_game,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'rp.rp_accuracy': rp_accuracy,
                            'rp.rp_plus_accuracy': rp_plus_accuracy,
                            'rp.rp_minus_accuracy': rp_minus_accuracy,
                            'rp.nonzero_rp_accuracy': nonzero_rp_accuracy
                        }, step)

                num_game = 0
                total_reward = 0.
                total_loss = 0.
                total_q_value = 0.
                update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []

                rp_accuracy = []
                rp_plus_accuracy = []
                rp_minus_accuracy = []
                nonzero_rp_accuracy = []
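Note: the ε-greedy block in Example no. 27 only needs a LinearSchedule object with a value(step) method. A minimal sketch consistent with the call LinearSchedule(config.epsilon_end_t, config.epsilon_end), assuming a linear anneal from 1.0 down to epsilon_end over epsilon_end_t steps:

class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # linearly interpolate, then hold at final_p
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)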
Example no. 28
def main(_):
    # By default TensorFlow grabs all available GPU memory (keeping about 200 MB for the system), so specify the per-process allocation fraction explicitly:
    if FLAGS.gpu_fraction == '':
        raise ValueError("--gpu_fraction should be defined")
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))
    # Monitor from a terminal with: watch -n 10 nvidia-smi

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        config = DQNConfig(FLAGS) or FLAGS
        print("\n [*] Current Configuration")
        pp(config.list_all_member())

        # Note before running: call remoteApi.start(19999) in V-REP first,
        # otherwise it may cause unpredictable problems.

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if config.is_train:
            env = DQNEnvironment(config)
            agent = Agent(config, env, sess)
            agent.train()
        else:
            if config.is_sim:
                env = DQNEnvironment(config)
                agent = Agent(config, env, sess)
                agent.play()
                agent.randomplay()
            else:
                from experiment.environment import REALEnvironment
                env = REALEnvironment(config)
                agent = Agent(config, env, sess)
                agent.exp_play()

        env.close()
Example no. 29
state_size = num_nodes * num_features_per_node
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window, collisions_window, done_window = [
    deque(maxlen=200) for _ in range(4)
]
agent_obs = [None] * flags.num_agents
agent_obs_buffer = [None] * flags.num_agents
agent_action_buffer = [2] * flags.num_agents
max_steps = 8 * (flags.grid_width + flags.grid_height)
start_time = time.time()

# Load an RL agent and initialize it from checkpoint if necessary
if flags.agent_type == "dqn":
    agent = DQN_Agent(state_size, action_size, flags.num_agents)
elif flags.agent_type == "ppo":
    agent = PPO_Agent(state_size, action_size, flags.num_agents)

if flags.load_model:
    start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
else:
    start, eps = 0, 1.0

if not flags.train:
    eps = 0.0

# We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop
# through the generators to get all the old networks out of the way
if start > 0: print(f"Skipping {start} railways")
for _ in range(0, start):
Example no. 30
def demo():
    agents = (Agent(), Agent())

    train_agents_and_save(agents)
Example no. 31
def dqn_argo(param_set: Parameter_Set, max_reward):
    # create the agent
    netWork = Network(action_dim=2)
    target_network = Network(action_dim=2)
    agent = Agent(network=netWork,
                  target_network=target_network,
                  eps_start=param_set.eps_init,
                  eps_anneal=param_set.eps_anneal,
                  eps_min=param_set.eps_min,
                  lr=param_set.lr,
                  gamma=param_set.gamma)

    # create the environment
    env = gym.make('CartPole-v0')

    replay_buffer = Replay_Buffer(param_set.cap)

    save_reward_list = []
    reward_list = []
    for i in range(REWARD_SAVE_EVALUATION_SIZE):
        save_reward_list.append(0)
    for i in range(REWARD_EVALUATION_SIZE):
        reward_list.append(0)

    # collect data (how many games to play)
    for i in range(EPISODE_NUM):

        # get the initial state from the env
        state = env.reset()
        done = False

        # initialize the episode reward
        episode_reward = 0

        # play one game to completion (the env signals when it is done)
        while not done:

            if i > INIT_EXPLORATION:
                # choose the action ε-greedily
                action = agent.get_action(state)
            else:
                action = env.action_space.sample()

            # step the env with the action to obtain next state, reward and done
            next_state, reward, done, info = env.step(action)

            # accumulate the episode reward
            episode_reward += reward

            # add the transition to the replay buffer
            replay_buffer.add(state, action, next_state, reward, done)

            # update the state (S_t <- S_t+1)
            state = next_state
        loss = tf.constant(0)

        if i > INIT_EXPLORATION:
            # train the neural network
            sample = replay_buffer.sample(BATCH_SIZE)
            if sample:
                loss = agent.update(replay_buffer.sample(BATCH_SIZE))

            if i % param_set.q_update == 0:
                agent.network_synchronize()

            reward_list[i % REWARD_EVALUATION_SIZE] = episode_reward

            save_reward_list[i % REWARD_SAVE_EVALUATION_SIZE] = episode_reward

            if sum(save_reward_list) / len(save_reward_list) >= max_reward:
                print("New best record!")
                agent.save(SAVE_DIRECTORY + SAVE_FILE)
                max_reward = sum(save_reward_list) / len(save_reward_list)
    return sum(reward_list) / len(reward_list), max_reward
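Note: Example no. 31 truth-tests the result of replay_buffer.sample(BATCH_SIZE), so the buffer is expected to return None until it holds enough transitions. A minimal sketch of a Replay_Buffer matching the add/sample calls above (implementation details are assumptions):

import random
from collections import deque

class Replay_Buffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, next_state, reward, done):
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self, batch_size):
        # return None (falsy) until enough transitions are stored
        if len(self.buffer) < batch_size:
            return None
        return random.sample(list(self.buffer), batch_size)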
Example no. 32
class Train:

    def __init__(self, cfg):
        self.num_states = cfg.MODEL.SIZE_STATE
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.num_episodes = cfg.SOLVER.NUM_EPISODES

        self.tetris = Tetris(cfg)
        self.agent = Agent(cfg,self.tetris)

    def run(self):
        episode_10_list = np.zeros(10)
        episode_final = False
        reward_per_epoch = []
        lifetime_per_epoch = []

        for episode in range(self.num_episodes):

            self.tetris.init()
            brd, mino = self.tetris.get_state()
            observation = torch.tensor(np.append(brd.flatten(), mino))
            state = observation
            state = state.type(torch.FloatTensor)
            state = torch.unsqueeze(state, 0)

            # frames = [self.env.getScreenRGB()]

            cum_reward = 0
            t = 0
            step = 0

            if episode % 15 == 0:
                self.agent.update_target_model()

            while not self.tetris.check_dead():
                step += 1

                action = self.agent.get_action(state,mino, episode)
                self.tetris.update_state(action.squeeze())
                rew = self.tetris.score
                print(rew)
                t += 1
                brd, mino = self.tetris.get_state()
                observation_next = torch.tensor(np.append(brd.flatten(), mino))
                done = self.tetris.check_dead()

                # frames.append(self.env.getScreenRGB())

                # assign the reward, check episode termination, and set state_next
                if done:  # done becomes True when the Tetris board reaches a dead state
                    state_next = None  # there is no next state, so store None

                    # append this episode's step count to the last-10-episodes list
                    episode_10_list = np.hstack(
                        (episode_10_list[1:], step + 1))

                    # apply a penalty
                    reward = torch.FloatTensor([-1.0])

                else:
                    if rew > 0:
                        reward = torch.FloatTensor([1.0])
                    else:
                        reward = torch.FloatTensor([0.0])

                    state_next = observation_next.type(torch.FloatTensor)
                    # state_next = torch.from_numpy(state_next).type(
                    #     torch.FloatTensor)
                    state_next = torch.unsqueeze(
                        state_next, 0)

                cum_reward += rew

                self.agent.memorize(state, action, state_next, reward)

                self.agent.update_q_network()

                state = state_next

                # end-of-episode handling
                if done:
                    print('%d Episode: Finished after %d steps: mean steps over last 10 episodes = %.1lf' % (
                        episode, step + 1, episode_10_list.mean()))
                    reward_per_epoch.append(cum_reward)
                    lifetime_per_epoch.append(step + 1)
                    break

            if episode_final is True:
                # save and render the animation
                display_frames_as_gif(frames)
                break

            # print a log every 50 episodes
            if episode % PRINT_EVERY_EPISODE == 0:
                print("Episode %d finished after %f time steps" % (episode, t))
                print("cumulated reward: %f" % cum_reward)

            # create an animation every 100 episodes
            if episode % SHOW_GIF_EVERY_EPISODE == 0:
                print("len frames:", len(frames))
                display_frames_as_gif(frames)
                continue

            # create an animation for episodes that lasted more than 2000 time steps
            if step > 2000:
                print("len frames:", len(frames))
                display_frames_as_gif(frames)



    # save the model

    def save_model(self):
        torch.save(agent.brain.model.state_dict(), 'weight.pth')