def main(_):
    """Build two mountain-car agents in one TF session and run the first.

    The session is created with the GPU memory fraction derived from
    ``FLAGS.gpu_fraction``.  ``agent1`` (3D mountain car) is trained when
    ``FLAGS.is_train`` is set and played otherwise.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        # The gym environments selected via config.env_type are replaced by
        # the two mountain-car environments below.
        env1 = ThreeDMountainCarEnv()
        env2 = MountainCarEnv()

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent1 = Agent(config, env1, sess)
        # agent2 is not driven in this function; it is still constructed so
        # that whatever Agent() sets up in the session stays as before.
        agent2 = Agent(config, env2, sess)

        if FLAGS.is_train:
            agent1.train()
        else:
            # Was `agent.play()`: `agent` is not defined here (NameError).
            agent1.play()
def __init__(self, cfg):
    """Copy the size settings out of ``cfg`` and create the game and agent.

    Args:
        cfg: configuration object exposing ``MODEL.SIZE_STATE``,
            ``MODEL.SIZE_ACTION`` and ``SOLVER.NUM_EPISODES``; it is also
            handed to ``Tetris`` and ``Agent``.
    """
    model_cfg = cfg.MODEL
    solver_cfg = cfg.SOLVER

    self.num_states = model_cfg.SIZE_STATE
    self.num_actions = model_cfg.SIZE_ACTION
    self.num_episodes = solver_cfg.NUM_EPISODES

    # The agent is bound to the same Tetris instance stored on self.
    self.tetris = Tetris(cfg)
    self.agent = Agent(cfg, self.tetris)
def main(_):
    """Run a single shared Agent, either multi-threaded training or play.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    session_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=session_config) as sess:
        K.set_session(sess)

        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        # One Agent instance is shared by all threads; it receives a lock.
        shared_lock = threading.Lock()
        agent = Agent(config, env, sess, shared_lock)

        if not FLAGS.is_train:
            agent.play(env)
        else:
            init_threads(agent, config)
def main(_):
    """Create the agent, optionally save/load pickled weights, then run it.

    ``FLAGS.save_weight`` and ``FLAGS.load_weight`` are handled (in that
    order) before the train/play decision made from ``FLAGS.is_train``.
    """
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=fraction))

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if FLAGS.cpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)

        if FLAGS.save_weight:
            agent.save_weight_to_pkl()
        if FLAGS.load_weight:
            agent.load_weight_from_pkl(cpu_mode=FLAGS.cpu)

        run = agent.train if FLAGS.is_train else agent.play
        run()
def __init__(self, state_shape, n_action, net, model_path='model/dqn'):
    """Store the hyper-parameters and create the agent and sampling pool.

    Args:
        state_shape: stored and forwarded to ``Agent``.
        n_action: stored and forwarded to ``Agent``.
        net: forwarded to ``Agent`` as its last argument.
        model_path: stored on the instance; defaults to ``'model/dqn'``.
    """
    # Caller-supplied settings.
    self.state_shape, self.n_action = state_shape, n_action

    # Fixed hyper-parameters.
    self.lr, self.gamma = 1e-4, 0.9
    self.sampling_size = 20000

    self.agent = Agent(self.state_shape, self.n_action, self.lr, 0.9, net)
    self.sampling_pool = Sampling_Pool(self.sampling_size)

    self.cum_r = []
    self.model_path = model_path
def main(_):
    """Build the agent and play; the training path only prints a notice."""
    with tf.Session() as sess:
        config = get_config(FLAGS) or FLAGS
        environment = MyEnvironment(config)
        agent = Agent(config, environment, sess)

        if not FLAGS.is_train:
            agent.play()
            return

        # Training is disabled in this version; only a notice is printed.
        print('To be released.')
def main(_):
    """Build the ACP model and the DQN agent, restore checkpoints, and train.

    Only training is supported; running with ``FLAGS.is_train`` unset raises
    after both models have been constructed and loaded.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
        NotImplementedError: if ``FLAGS.is_train`` is false (play mode is not
            implemented).  This is a subclass of ``Exception``, which is what
            was raised before.
    """
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        ACPconfig = ACPConfig(env)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        # Order matters here: the DQN agent takes the ACP model as a
        # constructor argument, and the ACP model's directory is then set
        # from the DQN agent's model_dir.  Variables are initialised only
        # after both are built, and checkpoints are loaded after that.
        acpAgent = acp.acp(sess, ACPconfig)
        agentDQN = Agent(config, env, acpAgent, sess)
        acpAgent.setdir(agentDQN.model_dir)
        sess.run(tf.initializers.global_variables())

        # Load both models if any checkpoint exists.
        acpAgent.load()
        agentDQN.load()

        if FLAGS.is_train:
            agentDQN.train()
        else:
            # The agentDQN.play() call that used to follow this raise could
            # never execute and has been removed.
            raise NotImplementedError('agentDQN.play() is Not Implemented')
def main(_):
    """Run the agent in the mode selected by ``FLAGS.mode``.

    Supported modes: ``"train"`` -> ``agent.train()``, ``"test"`` ->
    ``agent.play()``, ``"ale"`` -> ``agent.play2()``.

    Raises:
        ValueError: if ``FLAGS.mode`` is not one of the supported modes.
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    # Validate the mode before any session/environment/agent is built.
    # Previously an unknown mode constructed everything and then silently
    # did nothing.
    runner_names = {"train": "train", "test": "play", "ale": "play2"}
    if FLAGS.mode not in runner_names:
        raise ValueError(
            "Unknown mode %r; expected one of %s"
            % (FLAGS.mode, sorted(runner_names)))

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)

        if FLAGS.mode == "train":
            agent.train()
        elif FLAGS.mode == "test":
            agent.play()
        else:  # "ale" (validated above)
            agent.play2()
def main(_):
    """Train an Agent on ``OurEnvironment`` inside a GPU-limited session.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=fraction)
    session_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS
        env = OurEnvironment(config)

        gpu_missing = not tf.test.is_gpu_available()
        if gpu_missing and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        # This entry point only trains.
        Agent(config, env, sess).train()
def main():
    """Create the environment and agent, then test, train or play.

    ``config.test`` takes precedence over ``config.train``; if neither is
    set the agent plays without the ``test`` flag.
    """
    # Use the session as a context manager so its resources are released
    # when the run finishes or raises.  The session was previously created
    # with a bare tf.Session() and never closed.
    with tf.Session() as sess:
        config = Config()
        env = Environment(config)
        agent = Agent(sess, config, env)

        if config.test:
            agent.play(test=True)
        elif config.train:
            agent.train()
        else:
            agent.play()
def main(_):
    """Run the agent in normal or "poison" mode, training or playing.

    When ``FLAGS.poison`` is set, the poison line is read from standard input
    and the ``*_poison`` variants of train/play are used.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    # A "1/1" fraction is replaced with a value just below 1 before use.
    if FLAGS.gpu_fraction == "1/1":
        FLAGS.gpu_fraction = "0.999/1.0"

    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=fraction))

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if FLAGS.poison:
            config.poison_line = input("input the number of poison line:")

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)

        # Pick the entry point from the (is_train, poison) combination.
        if FLAGS.is_train:
            run = agent.train_poison if FLAGS.poison else agent.train
        else:
            run = agent.play_poison if FLAGS.poison else agent.play
        run()
def main(_):
    """Parse command-line options, pick an Agent implementation and run it.

    ``--mode`` selects which module ``Agent`` is imported from
    (``pixelcnn``, ``autoencoder`` or ``top-pixelcnn``); ``--test`` switches
    from training to playing.

    Raises:
        Exception: if ``--use-gpu`` is given but TensorFlow reports no GPU.
        ValueError: if ``--mode`` is not one of the known modes.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=123)
    parser.add_argument('--test', action="store_true")
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--mode", help="Bonus mode", default="pixelcnn")
    args = parser.parse_args()

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        config = get_config(args)

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if not tf.test.is_gpu_available() and args.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        # Each mode ships its own Agent class; import only the one needed.
        mode = args.mode
        if mode == "pixelcnn":
            from dqn.agent import Agent
        elif mode == "autoencoder":
            from dqn.agent_model import Agent
        elif mode == "top-pixelcnn":
            from dqn.agent_top import Agent
        else:
            raise ValueError("No such mode")
        agent = Agent(config, env, sess)

        print("CNN format", config.cnn_format)

        if args.test:
            print("testing ...")
            agent.play()
        else:
            print("training ...")
            agent.train()
def main(_):
    """Train the agent, or play it (``play_mine`` for the 'mine' dataset)."""
    with tf.Session() as sess:
        config = get_config(FLAGS)
        agent = Agent(config, MyEnvironment(config), sess)

        if FLAGS.is_train:
            agent.train()
        elif FLAGS.dataset == 'mine':
            agent.play_mine()
        else:
            agent.play()
def main(_):
    """Create the environment and agent, then train or play.

    ``config.cnn_format`` is set to ``'NCHW'`` when ``FLAGS.use_gpu`` is set.
    """
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=fraction)
    session_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if FLAGS.use_gpu:
            config.cnn_format = 'NCHW'

        agent = Agent(config, env, sess)
        run = agent.train if FLAGS.is_train else agent.play
        run()
def main(_):
    """Create the environment and the DQN agent, then train or demo it.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but no GPU is detected.
    """
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=fraction))

    with tf.Session(config=session_config) as sess:
        # Load the configuration options through get_config (config.py).
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        # Requesting the GPU when none is detected is an error.
        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            # Input layout: [batch, in_height, in_width, in_channels].
            # The alternative layout, NCHW, is
            # [batch, in_channels, in_height, in_width].
            config.cnn_format = 'NHWC'

        # Create the DQN agent.
        agent = Agent(config, env, sess)

        if not FLAGS.is_train:
            # No training, demonstration only.
            agent.play()
        else:
            agent.train()
def main(_):
    """Train or play an Agent on a discretized Sonic retro environment.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=fraction)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        # The gym environments keyed on config.env_type are not used here;
        # a retro game wrapped in SonicDiscretizer takes their place.
        raw_env = retro.make(game='SonicAndKnuckles3-Genesis',
                             state='MushroomHillZone.Act1')
        env = SonicDiscretizer(raw_env)

        gpu_missing = not tf.test.is_gpu_available()
        if gpu_missing and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        if not FLAGS.is_train:
            agent.play()
        else:
            agent.train()
def main(_):
    """Train or play the agent in a session that grows GPU memory on demand.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    # Session settings: soft placement on, device-placement logging off,
    # GPU memory growth on.
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        run = agent.train if FLAGS.is_train else agent.play
        run()
def main(args):
    """Build an Agent from a fresh ``Config`` and train or play it.

    Args:
        args: unused.
    """
    with tf.Session() as sess:
        config = Config()
        player = Agent(config, sess)

        action = player.train if config.isTrain else player.play
        action()
def main(_):
    """Create the gym environment and agent, then train or play.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=fraction))

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        gpu_missing = not tf.test.is_gpu_available()
        if gpu_missing and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        if not FLAGS.is_train:
            agent.play()
        else:
            agent.train()
def main(_):
    """Train or play an Agent on ``GymEnvironment`` per ``config.is_train``."""
    with tf.Session() as sess:
        config = get_config(FLAGS) or FLAGS
        agent = Agent(config, GymEnvironment(config), sess)

        run = agent.train if config.is_train else agent.play
        run()
def main(_):
    """Train the agent, process a folder of images, or play.

    * ``FLAGS.is_train`` set: ``agent.train()``.
    * otherwise, ``FLAGS.dataset == 'image'``: every file matching ``*.*`` in
      ``FLAGS.fold`` is read with OpenCV, passed through
      ``agent.play_image`` and written to ``results/images`` under the same
      base name.
    * otherwise: ``agent.play()``.
    """
    with tf.Session() as sess:
        config = get_config(FLAGS)
        env = MyEnvironment(config)
        agent = Agent(config, env, sess)

        if FLAGS.is_train:
            agent.train()
        elif FLAGS.dataset == 'image':
            out_dir = "results/images"
            # cv2.imwrite does not create missing directories; it just
            # reports failure, so make sure the target exists first.
            if not os.path.isdir(out_dir):
                os.makedirs(out_dir)

            img_paths = glob.glob(os.path.join(FLAGS.fold, "*.*"))
            for img_path in img_paths:
                img = cv2.imread(img_path)
                if img is None:
                    # "*.*" also matches non-image files; imread returns
                    # None for anything it cannot decode.
                    print("skipping unreadable image {}".format(img_path))
                    continue

                out_img = agent.play_image(img)
                cv2.imwrite(
                    os.path.join(out_dir, os.path.basename(img_path)),
                    out_img)
                # Report the current file (the whole list was printed
                # before, once per image).
                print("processing {}".format(img_path))
        else:
            agent.play()
def main(_):
    """Create the gym environment and agent, then train or play."""
    # Set the fraction of GPU memory that each process may use.
    fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=fraction)
    session_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        run = agent.train if FLAGS.is_train else agent.play
        run()
def main(_):
    """Train or play the agent in a session restricted to GPU device '0'.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    # Grow GPU memory on demand and expose only device '0' to the session.
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True, visible_device_list='0'))

    with tf.Session(config=session_config) as sess:
        config = get_config(FLAGS) or FLAGS

        env_cls = (SimpleGymEnvironment
                   if config.env_type == 'simple' else GymEnvironment)
        env = env_cls(config)

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        if not FLAGS.is_train:
            agent.play()
        else:
            agent.train()
def main(_):
    """Create the agent under the '/gpu:2' device scope, then train or play.

    The session is configured with the GPU memory fraction from
    ``FLAGS.gpu_fraction``, soft placement enabled and device-placement
    logging enabled.
    """
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=True)) as sess:
        # Previous session setup, without soft placement / placement logging:
        # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS
        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)
        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'
        # NOTE(review): the device index is hard-coded to '/gpu:2'.
        # NOTE(review): the original indentation was lost; only the Agent
        # construction is assumed to sit inside the device scope -- confirm
        # whether train()/play() were meant to run inside it as well.
        with tf.device('/gpu:2'):
            agent = Agent(config, env, sess)
        if FLAGS.is_train:
            agent.train()
        else:
            agent.play()
def main(_):
    """Train or play an Agent on ``GymEnvironment``.

    A GPU session configuration is only created when a fractional amount of
    GPU memory is requested; otherwise the session gets ``config=None``.
    """
    # Asking for all of the GPU memory fails because the system always keeps
    # a little memory on each GPU for itself, so the memory fraction is only
    # configured when it is below 1.
    gpu_fraction = calc_gpu_fraction(FLAGS.gpu_fraction)
    if gpu_fraction < 1:
        tf_config = tf.ConfigProto(
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=gpu_fraction))
    else:
        tf_config = None

    with tf.Session(config=tf_config) as sess:
        config = get_config(FLAGS) or FLAGS
        env = GymEnvironment(config)

        # Change the data format for running on a CPU.
        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent = Agent(config, env, sess)
        run = agent.train if FLAGS.train else agent.play
        run()
def main(_):
    """Train two agents against each other on a two-player ALE ROM, or play.

    Sets up the ALE interface for ``roms/Pong2PlayerVS.bin`` and a pygame
    window, builds ``Agent`` ('A') and ``Agent2`` ('B'), and, when
    ``FLAGS.is_train`` is set, runs the epoch/step training loop below with
    periodic statistics, checkpoints and a 10-game evaluation at the end of
    each epoch.  Otherwise both agents' ``play()`` methods are called.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is reconstructed from the control flow -- confirm.

    Raises:
        Exception: if ``FLAGS.use_gpu`` is set but TensorFlow reports no GPU.
    """
    with tf.Session() as sess:
        config = get_config(FLAGS) or FLAGS
        if config.env_type == 'simple':
            env = SimpleGymEnvironment(config)
        else:
            env = GymEnvironment(config)
        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")
        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'
        # ALE emulator for the two-player ROM; the display is twice the
        # emulator's screen size.
        roms = 'roms/Pong2PlayerVS.bin'
        ale = ALEInterface(roms.encode('utf-8'))
        width = ale.ale_getScreenWidth()
        height = ale.ale_getScreenHeight()
        game_screen = GameScreen()
        ale.ale_resetGame()
        (display_width, display_height) = (width * 2, height * 2)
        pygame.init()
        screen_ale = pygame.display.set_mode((display_width, display_height))
        pygame.display.set_caption("Arcade Learning Environment Random Agent Display")
        pygame.display.flip()
        game_surface = pygame.Surface((width, height), depth=8)
        # NOTE(review): `clock` is not used anywhere in this function.
        clock = pygame.time.Clock()
        # Clear screen
        screen_ale.fill((0, 0, 0))
        agent = Agent(config, env, sess, 'A')
        agent2 = Agent2(config, env, sess, 'B')
        if FLAGS.is_train:
            start_epoch = agent.epoch_op.eval()
            start_step = agent.step_op.eval()
            # NOTE(review): `start_time` is not used anywhere in this function.
            start_time = time.time()
            # Loop for epochs.  The loop target is the attribute agent.epoch,
            # so the agent's epoch is updated on every iteration.
            for agent.epoch in range(start_epoch, agent.max_epoch):
                agent2.epoch = agent.epoch
                # Initialize information of gameplay
                num_game, agent.update_count, agent2.update_count, ep_rewardA, ep_rewardB = 0, 0, 0, 0., 0.
                total_rewardA, total_rewardB, agent.total_loss, agent2.total_loss, agent.total_q, agent2.total_q = 0., 0., 0., 0., 0., 0.
                max_avg_ep_rewardA, max_avg_ep_rewardB = 0, 0
                ep_rewardsA, ep_rewardsB, actionsA, actionsB = [], [], [], []
                # Get first frame of gameplay
                numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
                rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
                del numpy_surface
                game_screen.paint(rgb)
                pooled_screen = game_screen.grab()
                scaled_pooled_screen = scale_image(pooled_screen)
                # Add first frame of gameplay into both agents' replay history
                for _ in range(agent.history_length):
                    agent.history.add(scaled_pooled_screen)
                    agent2.history.add(scaled_pooled_screen)
                # Loop for training iterations.  As with the epoch loop, the
                # loop target is the attribute agent.step.
                for agent.step in tqdm(range(start_step, agent.max_step), ncols=70, initial=start_step):
                    agent2.step = agent.step
                    # End of burn in period, start to learn from frames
                    if agent.step == agent.learn_start:
                        num_game, agent.update_count, agent2.update_count, ep_rewardA, ep_rewardB = 0, 0, 0, 0., 0.
                        total_rewardA, total_rewardB, agent.total_loss, agent2.total_loss, agent.total_q, agent2.total_q = 0., 0., 0., 0., 0., 0.
                        max_avg_ep_rewardA, max_avg_ep_rewardB = 0, 0
                        ep_rewardsA, ep_rewardsB, actionsA, actionsB = [], [], [], []
                    # 1. predict
                    action1 = agent.predict(agent.history.get())
                    action2 = agent2.predict(agent2.history.get())
                    # 2. act
                    ale.ale_act2(action1, action2)
                    terminal = ale.ale_isGameOver()
                    # Last step of the epoch: force the episode to end so that
                    # game statistics can be collected without the training
                    # data being messed up.
                    if agent.step == agent.max_step - 1:
                        terminal = True
                    rewardA = ale.ale_getRewardA()
                    rewardB = ale.ale_getRewardB()
                    # Fill buffer of game screen with current frame
                    numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
                    rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
                    del numpy_surface
                    game_screen.paint(rgb)
                    pooled_screen = game_screen.grab()
                    scaled_pooled_screen = scale_image(pooled_screen)
                    # Both agents observe the same frame with their own
                    # reward and action.
                    agent.observe(scaled_pooled_screen, rewardA, action1, terminal)
                    agent2.observe(scaled_pooled_screen, rewardB, action2, terminal)
                    # Print frame onto display screen
                    screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))
                    # Update the display screen
                    pygame.display.flip()
                    # Check if current episode ended
                    if terminal:
                        ale.ale_resetGame()
                        terminal = ale.ale_isGameOver()
                        # NOTE(review): the rewards are re-read after the
                        # reset, so the totals accumulated below use the
                        # post-reset values for this step -- confirm intended.
                        rewardA = ale.ale_getRewardA()
                        rewardB = ale.ale_getRewardB()
                        numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
                        rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
                        del numpy_surface
                        game_screen.paint(rgb)
                        pooled_screen = game_screen.grab()
                        scaled_pooled_screen = scale_image(pooled_screen)
                        # End of an episode
                        num_game += 1
                        ep_rewardsA.append(ep_rewardA)
                        ep_rewardsB.append(ep_rewardB)
                        ep_rewardA = 0.
                        ep_rewardB = 0.
                    else:
                        ep_rewardA += rewardA
                        ep_rewardB += rewardB
                    actionsA.append(action1)
                    actionsB.append(action2)
                    total_rewardA += rewardA
                    total_rewardB += rewardB
                    # Do a test to get statistics so far
                    if agent.step >= agent.learn_start:
                        if agent.step % agent.test_step == agent.test_step - 1:
                            avg_rewardA = total_rewardA / agent.test_step
                            avg_rewardB = total_rewardB / agent2.test_step
                            # NOTE(review): these divisions raise
                            # ZeroDivisionError if an update_count is still 0.
                            avg_lossA = agent.total_loss / agent.update_count
                            avg_lossB = agent2.total_loss / agent2.update_count
                            avg_qA = agent.total_q / agent.update_count
                            avg_qB = agent2.total_q / agent2.update_count
                            # Falls back to zeros when the statistics cannot
                            # be computed (e.g. no finished episode yet).
                            # NOTE(review): bare except also hides unrelated
                            # errors.
                            try:
                                max_ep_rewardA = np.max(ep_rewardsA)
                                min_ep_rewardA = np.min(ep_rewardsA)
                                avg_ep_rewardA = np.mean(ep_rewardsA)
                                max_ep_rewardB = np.max(ep_rewardsB)
                                min_ep_rewardB = np.min(ep_rewardsB)
                                avg_ep_rewardB = np.mean(ep_rewardsB)
                            except:
                                max_ep_rewardA, min_ep_rewardA, avg_ep_rewardA, max_ep_rewardB, min_ep_rewardB, avg_ep_rewardB = 0, 0, 0, 0, 0, 0
                            print('\nFor Agent A at Epoch %d: avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                                % (agent.epoch, avg_rewardA, avg_lossA, avg_qA, avg_ep_rewardA, max_ep_rewardA, min_ep_rewardA, num_game))
                            print('\nFor Agent B at Epoch %d: avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                                % (agent2.epoch, avg_rewardB, avg_lossB, avg_qB, avg_ep_rewardB, max_ep_rewardB, min_ep_rewardB, num_game))
                            # Checkpoint an agent when its average episode
                            # reward is at least 90% of its best so far.
                            if max_avg_ep_rewardA * 0.9 <= avg_ep_rewardA:
                                agent.step_assign_op.eval({agent.step_input: agent.step + 1})
                                agent.save_model(agent.step + 1)
                                max_avg_ep_rewardA = max(max_avg_ep_rewardA, avg_ep_rewardA)
                            if max_avg_ep_rewardB * 0.9 <= avg_ep_rewardB:
                                agent2.step_assign_op.eval({agent2.step_input: agent2.step + 1})
                                agent2.save_model(agent2.step + 1)
                                max_avg_ep_rewardB = max(max_avg_ep_rewardB, avg_ep_rewardB)
                            # Summaries are only written after step 180.
                            if agent.step > 180:
                                agent.inject_summary({
                                    'average.reward': avg_rewardA,
                                    'average.loss': avg_lossA,
                                    'average.q': avg_qA,
                                    'episode.max reward': max_ep_rewardA,
                                    'episode.min reward': min_ep_rewardA,
                                    'episode.avg reward': avg_ep_rewardA,
                                    'episode.num of game': num_game,
                                    'episode.rewards': ep_rewardsA,
                                    'episode.actions': actionsA,
                                    'training.learning_rate': agent.learning_rate_op.eval({agent.learning_rate_step: agent.step}),
                                }, agent.step)
                            if agent2.step > 180:
                                agent2.inject_summary({
                                    'average.reward': avg_rewardB,
                                    'average.loss': avg_lossB,
                                    'average.q': avg_qB,
                                    'episode.max reward': max_ep_rewardB,
                                    'episode.min reward': min_ep_rewardB,
                                    'episode.avg reward': avg_ep_rewardB,
                                    'episode.num of game': num_game,
                                    'episode.rewards': ep_rewardsB,
                                    'episode.actions': actionsB,
                                    'training.learning_rate': agent2.learning_rate_op.eval({agent2.learning_rate_step: agent2.step}),
                                }, agent2.step)
                            # Reset statistics
                            num_game = 0
                            total_rewardA, total_rewardB = 0., 0.
                            agent.total_loss, agent2.total_loss = 0., 0.
                            agent.total_q, agent2.total_q = 0., 0.
                            agent.update_count, agent2.update_count = 0, 0
                            ep_rewardA, ep_rewardB = 0., 0.
                            ep_rewardsA, ep_rewardsB = [], []
                            actionsA, actionsB = [], []
                # Play 10 games at the end of epoch to get game statistics
                total_points, paddle_bounce, wall_bounce, serving_time = [], [], [], []
                for _ in range(10):
                    cur_total_points, cur_paddle_bounce, cur_wall_bounce, cur_serving_time = 0, 0, 0, 0
                    # Restart game
                    ale.ale_resetGame()
                    # Get first frame of gameplay
                    numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
                    rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
                    del numpy_surface
                    game_screen.paint(rgb)
                    pooled_screen = game_screen.grab()
                    scaled_pooled_screen = scale_image(pooled_screen)
                    # Create history for testing purposes
                    test_history = History(config)
                    # Fill the history with copies of the initial screen
                    # (agent.history_length entries).
                    for _ in range(agent.history_length):
                        test_history.add(scaled_pooled_screen)
                    # NOTE(review): test_history is filled above but never
                    # read; predictions below use agent.history /
                    # agent2.history, and observe() is still called during
                    # these evaluation games -- confirm this is intended.
                    while not ale.ale_isGameOver():
                        # 1. predict
                        action1 = agent.predict(agent.history.get())
                        action2 = agent2.predict(agent2.history.get())
                        # 2. act
                        ale.ale_act2(action1, action2)
                        terminal = ale.ale_isGameOver()
                        rewardA = ale.ale_getRewardA()
                        rewardB = ale.ale_getRewardB()
                        # Record game statistics of current episode
                        cur_total_points = ale.ale_getPoints()
                        cur_paddle_bounce = ale.ale_getSideBouncing()
                        if ale.ale_getWallBouncing():
                            cur_wall_bounce += 1
                        if ale.ale_getServing():
                            cur_serving_time += 1
                        # Fill buffer of game screen with current frame
                        numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
                        rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
                        del numpy_surface
                        game_screen.paint(rgb)
                        pooled_screen = game_screen.grab()
                        scaled_pooled_screen = scale_image(pooled_screen)
                        agent.observe(scaled_pooled_screen, rewardA, action1, terminal)
                        agent2.observe(scaled_pooled_screen, rewardB, action2, terminal)
                        # Print frame onto display screen
                        screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))
                        # Update the display screen
                        pygame.display.flip()
                    # Append current episode's statistics into list
                    total_points.append(cur_total_points)
                    # NOTE(review): the divisions by cur_total_points raise
                    # ZeroDivisionError if a game ends with 0 points.
                    paddle_bounce.append(cur_paddle_bounce / cur_total_points)
                    # Avoid dividing by zero when there were no paddle bounces.
                    if cur_paddle_bounce == 0:
                        wall_bounce.append(cur_wall_bounce / (cur_paddle_bounce + 1))
                    else:
                        wall_bounce.append(cur_wall_bounce / cur_paddle_bounce)
                    serving_time.append(cur_serving_time / cur_total_points)
                # Save results of test after current epoch: the mean of each
                # statistic over the 10 games is written at index agent.epoch
                # and assigned back through the matching assign op.
                cur_paddle_op = agent.paddle_op.eval()
                cur_paddle_op[agent.epoch] = sum(paddle_bounce) / len(paddle_bounce)
                agent.paddle_assign_op.eval({agent.paddle_input: cur_paddle_op})
                cur_wall_op = agent.wall_op.eval()
                cur_wall_op[agent.epoch] = sum(wall_bounce) / len(wall_bounce)
                agent.wall_assign_op.eval({agent.wall_input: cur_wall_op})
                cur_serving_op = agent.serving_op.eval()
                cur_serving_op[agent.epoch] = sum(serving_time) / len(serving_time)
                agent.serving_assign_op.eval({agent.serving_input: cur_serving_op})
                agent.save_model(agent.step + 1)
        else:
            agent.play()
            agent2.play()
def train(sess, config):
    """Run the training loop for the agent, optionally with GATS components.

    Builds the environment, summary ops, the agent and (when ``config.gats``
    is set) the ``GDM`` and ``RP`` models, then iterates from the stored step
    to ``config.max_step``: choose an action (epsilon-greedy, with
    ``MCTS_planning`` once ``config.gan_dqn_learn_start`` is reached), act,
    store the transition, train the models on their schedules, and every
    ``config._test_step`` steps print/save statistics and checkpoints.

    Args:
        sess: TensorFlow session passed to the models and used for
            saving/loading.
        config: configuration object; ``config.num_actions`` is overwritten
            with ``env.action_size``.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is reconstructed from the control flow -- confirm.
    """
    env = GymEnvironment(config)
    log_dir = './log/{}_lookahead_{}_gats_{}/'.format(config.env_name, config.lookahead, config.gats)
    checkpoint_dir = os.path.join(log_dir, 'checkpoints/')
    image_dir = os.path.join(log_dir, 'rollout/')
    # NOTE(review): checkpoint_dir lives under log_dir, so removing an
    # existing log_dir here also removes any checkpoint that load_model()
    # below could have restored -- confirm this is intended.
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)
        print(' [*] Removed log dir: ' + log_dir)
    # Step counter variable plus a placeholder/assign op to update it.
    with tf.variable_scope('step'):
        step_op = tf.Variable(0, trainable=False, name='step')
        step_input = tf.placeholder('int32', None, name='step_input')
        step_assign_op = step_op.assign(step_input)
    # One placeholder and one summary op per tag.
    with tf.variable_scope('summary'):
        scalar_summary_tags = [
            'average.reward', 'average.loss', 'average.q value',
            'episode.max reward', 'episode.min reward', 'episode.avg reward',
            'episode.num of game', 'training.learning_rate',
            'rp.rp_accuracy', 'rp.rp_plus_accuracy', 'rp.rp_minus_accuracy',
            'rp.nonzero_rp_accuracy'
        ]
        summary_placeholders = {}
        summary_ops = {}
        for tag in scalar_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            summary_ops[tag] = tf.summary.scalar(
                "%s-%s/%s" % (config.env_name, config.env_type, tag),
                summary_placeholders[tag])
        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            summary_ops[tag] = tf.summary.histogram(tag, summary_placeholders[tag])
    config.num_actions = env.action_size
    # config.num_actions = 3
    exploration = LinearSchedule(config.epsilon_end_t, config.epsilon_end)
    agent = Agent(sess, config, num_actions=config.num_actions)
    if config.gats:
        lookahead = config.lookahead
        rp_train_frequency = 4
        gdm_train_frequency = 4
        gdm = GDM(sess, config, num_actions=config.num_actions)
        rp = RP(sess, config, num_actions=config.num_actions)
        # One leaf per action sequence of length `lookahead`.
        leaves_size = config.num_actions**config.lookahead
        if config.dyna:
            gan_memory = GANReplayMemory(config)
        else:
            gan_memory = None

        def base_generator():
            # Row i holds the digits of i written in base config.num_actions,
            # right-aligned and zero-padded to `lookahead` columns.
            tree_base = np.zeros((leaves_size, lookahead)).astype('uint8')
            for i in range(leaves_size):
                n = i
                j = 0
                while n:
                    n, r = divmod(n, config.num_actions)
                    tree_base[i, lookahead - 1 - j] = r
                    j = j + 1
            return tree_base

        tree_base = base_generator()
    # memory = ReplayMemory(config)
    memory = ReplayMemory(config, log_dir)
    history = History(config)
    tf.global_variables_initializer().run()
    saver = tf.train.Saver(max_to_keep=30)
    # Load the model if a checkpoint exists.
    load_model(sess, saver, checkpoint_dir)
    agent.updated_target_q_network()
    writer = tf.summary.FileWriter(log_dir, sess.graph)
    num_game, update_count, ep_reward = 0, 0, 0.
    total_reward, total_loss, total_q_value = 0., 0., 0.
    max_avg_ep_reward = -100
    ep_rewards, actions = [], []
    rp_accuracy = []
    rp_plus_accuracy = []
    rp_minus_accuracy = []
    nonzero_rp_accuracy = []
    screen, reward, action, terminal = env.new_random_game()
    # Initial state: fill the history with the first screen.
    for _ in range(config.history_length):
        history.add(screen)
    start_step = step_op.eval()
    # Main loop
    for step in tqdm(range(start_step, config.max_step), ncols=70, initial=start_step):
        # Restart the running statistics when learning starts.
        if step == config.learn_start:
            num_game, update_count, ep_reward = 0, 0, 0.
            total_reward, total_loss, total_q_value = 0., 0., 0.
            ep_rewards, actions = [], []
        if step == config.gan_dqn_learn_start:
            rp_accuracy = []
            rp_plus_accuracy = []
            rp_minus_accuracy = []
            nonzero_rp_accuracy = []
        # Epsilon-greedy action selection.
        MCTS_FLAG = False
        epsilon = exploration.value(step)
        if random.random() < epsilon:
            action = random.randrange(config.num_actions)
        else:
            current_state = norm_frame(np.expand_dims(history.get(), axis=0))
            if config.gats and (step >= config.gan_dqn_learn_start):
                action, predicted_reward = MCTS_planning(
                    gdm, rp, agent, current_state, leaves_size, tree_base,
                    config, exploration, step, gan_memory)
                MCTS_FLAG = True
            else:
                action = agent.get_action(
                    norm_frame_Q(unnorm_frame(current_state)))
        # For GATS?
        apply_action = action
        # if int(apply_action != 0):
        #     apply_action += 1
        # Observe
        screen, reward, terminal = env.act(apply_action, is_training=True)
        # Clip the reward to [config.min_reward, config.max_reward].
        reward = max(config.min_reward, min(config.max_reward, reward))
        history.add(screen)
        memory.add(screen, reward, action, terminal)
        # Track how often the planner's predicted reward matched the real one.
        if MCTS_FLAG:
            rp_accuracy.append(int(predicted_reward == reward))
            if reward != 0:
                nonzero_rp_accuracy.append(int(predicted_reward == reward))
                if reward == 1:
                    rp_plus_accuracy.append(int(predicted_reward == reward))
                elif reward == -1:
                    rp_minus_accuracy.append(int(predicted_reward == reward))
        # Train
        if step > config.gan_learn_start and config.gats:
            if step % rp_train_frequency == 0 and memory.can_sample(
                    config.rp_batch_size):
                obs, act, rew = memory.reward_sample(config.rp_batch_size)
                # obs, act, rew = memory.reward_sample2(
                #     config.rp_batch_size, config.lookahead)
                reward_obs, reward_act, reward_rew = memory.reward_sample(
                    config.nonzero_batch_size, nonzero=True)
                # reward_obs, reward_act, reward_rew = memory.nonzero_reward_sample(
                #     config.rp_batch_size, config.lookahead)
                # Regular and nonzero-reward samples are trained on together.
                obs_batch = norm_frame(
                    np.concatenate((obs, reward_obs), axis=0))
                act_batch = np.concatenate((act, reward_act), axis=0)
                rew_batch = np.concatenate((rew, reward_rew), axis=0)
                # presumably shifts rewards in {-1, 0, 1} to labels
                # {0, 1, 2} -- TODO confirm
                reward_label = rew_batch + 1
                trajectories = gdm.get_state(obs_batch, act_batch[:, :-1])
                rp_summary = rp.train(trajectories, act_batch, reward_label)
                writer.add_summary(rp_summary, step)
            if step % gdm_train_frequency == 0 and memory.can_sample(
                    config.gan_batch_size):
                state_batch, action_batch, next_state_batch = memory.GAN_sample(
                )
                # state_batch, act_batch, next_state_batch = memory.GAN_sample2(
                #     config.gan_batch_size, config.lookahead)
                # gdm.summary, disc_summary, merged_summary = gdm.train(
                #     norm_frame(state_batch), act_batch, norm_frame(next_state_batch), warmup_bool)
                gdm.summary, disc_summary = gdm.train(
                    norm_frame(state_batch), action_batch,
                    norm_frame(next_state_batch))
        if step > config.learn_start:
            # if step % config.train_frequency == 0 and memory.can_sample(config.batch_size):
            if step % config.train_frequency == 0:
                # s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                #     config.batch_size, config.lookahead)
                s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                )
                s_t, s_t_plus_1 = norm_frame(s_t), norm_frame(s_t_plus_1)
                # With dyna enabled, transitions whose next state comes from
                # gdm.get_state are appended to the real batch.
                if config.gats and config.dyna:
                    if step > config.gan_dqn_learn_start and gan_memory.can_sample(
                            config.batch_size):
                        gan_obs_batch, gan_act_batch, gan_rew_batch, gan_terminal_batch = gan_memory.sample(
                        )
                        # gan_obs_batch, gan_act_batch, gan_rew_batch = gan_memory.sample(
                        #     config.batch_size)
                        gan_obs_batch = norm_frame(gan_obs_batch)
                        trajectories = gdm.get_state(
                            gan_obs_batch,
                            np.expand_dims(gan_act_batch, axis=1))
                        gan_next_obs_batch = trajectories[:, -config.history_length:, ...]
                        # gan_obs_batch, gan_next_obs_batch = \
                        #     norm_frame(gan_obs_batch), norm_frame(gan_next_obs_batch)
                        s_t = np.concatenate([s_t, gan_obs_batch], axis=0)
                        act_batch = np.concatenate([act_batch, gan_act_batch], axis=0)
                        rew_batch = np.concatenate([rew_batch, gan_rew_batch], axis=0)
                        s_t_plus_1 = np.concatenate(
                            [s_t_plus_1, gan_next_obs_batch], axis=0)
                        terminal_batch = np.concatenate(
                            [terminal_batch, gan_terminal_batch], axis=0)
                # Convert the frames to the Q-network's normalisation.
                s_t, s_t_plus_1 = norm_frame_Q(
                    unnorm_frame(s_t)), norm_frame_Q(unnorm_frame(s_t_plus_1))
                q_t, loss, dqn_summary = agent.train(s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch, step)
                writer.add_summary(dqn_summary, step)
                total_loss += loss
                total_q_value += q_t.mean()
                update_count += 1
            if step % config.target_q_update_step == config.target_q_update_step - 1:
                agent.updated_target_q_network()
        # Re-initialise the game at the end of an episode.
        if terminal:
            screen, reward, action, terminal = env.new_random_game()
            num_game += 1
            ep_rewards.append(ep_reward)
            ep_reward = 0.
        else:
            ep_reward += reward
        total_reward += reward
        # Change the training frequency: RP and GDM are trained less often
        # as training progresses.
        if config.gats:
            if step == 10000 - 1:
                rp_train_frequency = 8
                gdm_train_frequency = 8
            if step == 50000 - 1:
                rp_train_frequency = 16
                gdm_train_frequency = 16
            if step == 100000 - 1:
                rp_train_frequency = 24
                gdm_train_frequency = 24
        # Run a rollout and save the images.
        if config.gats and step % config._test_step == config._test_step - 1:
            rollout_image(config, image_dir, gdm, memory, step + 1, 16)
        # Calculate the statistics.
        if step >= config.learn_start:
            if step % config._test_step == config._test_step - 1:
                # plot
                # NOTE(review): disc_summary is only assigned in the GDM
                # training branch above; if that branch has not run yet this
                # raises NameError -- confirm it cannot happen.
                if config.gats:
                    writer.add_summary(gdm.summary, step)
                    writer.add_summary(disc_summary, step)
                avg_reward = total_reward / config._test_step
                # NOTE(review): raises ZeroDivisionError if update_count is 0.
                avg_loss = total_loss / update_count
                avg_q = total_q_value / update_count
                # Falls back to zeros when the statistics cannot be computed
                # (e.g. no finished episode).  NOTE(review): bare except also
                # hides unrelated errors.
                try:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                except:
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                print(
                    '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d'
                    % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                       max_ep_reward, min_ep_reward, num_game))
                # Checkpoint when the average episode reward is at least 90%
                # of the best so far (requires the target Q network).
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    step_assign_op.eval({step_input: step + 1})
                    save_model(sess, saver, checkpoint_dir, step + 1)
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)
                # Collapse the per-step accuracy lists into scalar means for
                # the summary; they are reset to lists again below.
                if step >= config.gan_dqn_learn_start:
                    if len(rp_accuracy) > 0:
                        rp_accuracy = np.mean(rp_accuracy)
                        rp_plus_accuracy = np.mean(rp_plus_accuracy)
                        rp_minus_accuracy = np.mean(rp_minus_accuracy)
                        nonzero_rp_accuracy = np.mean(nonzero_rp_accuracy)
                    else:
                        rp_accuracy = 0
                        rp_plus_accuracy = 0
                        rp_minus_accuracy = 0
                        nonzero_rp_accuracy = 0
                else:
                    rp_accuracy = 0
                    rp_plus_accuracy = 0
                    rp_minus_accuracy = 0
                    nonzero_rp_accuracy = 0
                # Summaries are only written after step 180.
                if step > 180:
                    inject_summary(
                        sess, writer, summary_ops, summary_placeholders, {
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q value': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of game': num_game,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'rp.rp_accuracy': rp_accuracy,
                            'rp.rp_plus_accuracy': rp_plus_accuracy,
                            'rp.rp_minus_accuracy': rp_minus_accuracy,
                            'rp.nonzero_rp_accuracy': nonzero_rp_accuracy
                        }, step)
                # Reset the statistics for the next reporting window.
                num_game = 0
                total_reward = 0.
                total_loss = 0.
                total_q_value = 0.
                update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []
                rp_accuracy = []
                rp_plus_accuracy = []
                rp_minus_accuracy = []
                nonzero_rp_accuracy = []
def main(_):
    """Configure the GPU, build the environment and agent, then train or play.

    The mode is chosen from the config:
      * ``config.is_train``: train the agent on a ``DQNEnvironment``.
      * ``config.is_sim`` (not training): run ``play()`` followed by
        ``randomplay()`` on a ``DQNEnvironment``.
      * otherwise: run ``exp_play()`` on a ``REALEnvironment``.

    The environment is closed even if the agent raises.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--gpu_fraction`` is an empty string.
        Exception: If ``--use_gpu`` is set but TensorFlow reports no GPU.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source; in particular, env.close() is assumed to apply to every mode.

    # By default TensorFlow claims (almost) all GPU memory, so the share this
    # process may use is capped explicitly.
    if FLAGS.gpu_fraction == '':
        raise ValueError("--gpu_fraction should be defined")
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    # To monitor GPU usage from a terminal: watch -n 10 nvidia-smi
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = DQNConfig(FLAGS) or FLAGS
        print("\n [*] Current Configuration")
        pp(config.list_all_member())

        # Notice before running: remoteApi.start(19999) must be set up in
        # V-REP, otherwise it may cause unpredictable problems.
        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if config.is_train or config.is_sim:
            env = DQNEnvironment(config)
        else:
            # Imported lazily so the real-hardware module is only loaded when
            # it is actually needed.
            from experiment.environment import REALEnvironment
            env = REALEnvironment(config)

        try:
            agent = Agent(config, env, sess)
            if config.is_train:
                agent.train()
            elif config.is_sim:
                agent.play()
                agent.randomplay()
            else:
                agent.exp_play()
        finally:
            # Always release the environment, including on failure.
            env.close()
state_size = num_nodes * num_features_per_node action_size = 5 # Add some variables to keep track of the progress scores_window, steps_window, collisions_window, done_window = [ deque(maxlen=200) for _ in range(4) ] agent_obs = [None] * flags.num_agents agent_obs_buffer = [None] * flags.num_agents agent_action_buffer = [2] * flags.num_agents max_steps = 8 * (flags.grid_width + flags.grid_height) start_time = time.time() # Load an RL agent and initialize it from checkpoint if necessary if flags.agent_type == "dqn": agent = DQN_Agent(state_size, action_size, flags.num_agents) elif flags.agent_type == "ppo": agent = PPO_Agent(state_size, action_size, flags.num_agents) if flags.load_model: start, eps = agent.load(project_root / 'checkpoints', 0, 1.0) else: start, eps = 0, 1.0 if not flags.train: eps = 0.0 # We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop # through the generators to get all the old networks out of the way if start > 0: print(f"Skipping {start} railways") for _ in range(0, start):
def demo():
    """Create a pair of default-constructed agents and train/save them."""
    first_agent = Agent()
    second_agent = Agent()
    train_agents_and_save((first_agent, second_agent))
def dqn_argo(param_set: Parameter_Set, max_reward):
    """Train a DQN agent on ``CartPole-v0`` and report its recent average reward.

    Args:
        param_set: Hyper-parameters. This function reads ``eps_init``,
            ``eps_anneal``, ``eps_min``, ``lr``, ``gamma``, ``cap`` (replay
            buffer capacity) and ``q_update`` (target-sync period, in episodes).
        max_reward: Best rolling-average reward seen so far. Whenever the
            rolling average over the last ``REWARD_SAVE_EVALUATION_SIZE``
            episodes reaches or beats it, the agent is saved and the value
            is raised.

    Returns:
        A tuple ``(average_reward, max_reward)`` where ``average_reward`` is
        the mean of the ``REWARD_EVALUATION_SIZE``-slot reward window and
        ``max_reward`` is the (possibly updated) best rolling average.
    """
    # NOTE(review): loop nesting was reconstructed from a flattened source.
    # Training and target synchronisation are assumed to run once per
    # environment step -- confirm against the original layout.

    # Build the agent (online network + target network).
    network = Network(action_dim=2)
    target_network = Network(action_dim=2)
    agent = Agent(network=network,
                  target_network=target_network,
                  eps_start=param_set.eps_init,
                  eps_anneal=param_set.eps_anneal,
                  eps_min=param_set.eps_min,
                  lr=param_set.lr,
                  gamma=param_set.gamma)

    # Build the environment and the replay buffer.
    env = gym.make('CartPole-v0')
    replay_buffer = Replay_Buffer(param_set.cap)

    # Fixed-size circular windows of episode rewards, pre-filled with zeros.
    save_reward_list = [0] * REWARD_SAVE_EVALUATION_SIZE
    reward_list = [0] * REWARD_EVALUATION_SIZE

    try:
        # Data collection: one iteration per episode.
        for i in range(EPISODE_NUM):
            state = env.reset()
            done = False
            episode_reward = 0

            # Play one episode until the environment signals termination.
            while not done:
                if i > INIT_EXPLORATION:
                    # Let the agent choose (epsilon-greedy per the original
                    # author's comment).
                    action = agent.get_action(state)
                else:
                    # Initial exploration phase: act uniformly at random.
                    action = env.action_space.sample()

                next_state, reward, done, _ = env.step(action)
                episode_reward += reward
                replay_buffer.add(state, action, next_state, reward, done)
                state = next_state

                if i > INIT_EXPLORATION:
                    # Train on exactly the batch that was checked; the
                    # original drew a second, unchecked batch for the update.
                    sample = replay_buffer.sample(BATCH_SIZE)
                    if sample:
                        agent.update(sample)
                    if i % param_set.q_update == 0:
                        agent.network_synchronize()

            reward_list[i % REWARD_EVALUATION_SIZE] = episode_reward
            save_reward_list[i % REWARD_SAVE_EVALUATION_SIZE] = episode_reward

            rolling_average = sum(save_reward_list) / len(save_reward_list)
            if rolling_average >= max_reward:
                print("最高記録更新!!!")
                agent.save(SAVE_DIRECTORY + SAVE_FILE)
                max_reward = rolling_average
    finally:
        # Release the environment even if training fails part-way.
        env.close()

    return sum(reward_list) / len(reward_list), max_reward
class Train:
    """Runs training episodes of an ``Agent`` on a ``Tetris`` environment."""

    def __init__(self, cfg):
        """Read sizes and the episode count from ``cfg``; build env and agent.

        Args:
            cfg: Configuration object providing ``MODEL.SIZE_STATE``,
                ``MODEL.SIZE_ACTION`` and ``SOLVER.NUM_EPISODES``; it is also
                passed through to ``Tetris`` and ``Agent``.
        """
        self.num_states = cfg.MODEL.SIZE_STATE
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.num_episodes = cfg.SOLVER.NUM_EPISODES
        self.tetris = Tetris(cfg)
        self.agent = Agent(cfg, self.tetris)

    @staticmethod
    def _encode_state(brd, mino):
        """Flatten the board, append the mino, and return a float tensor with a leading batch axis."""
        observation = torch.tensor(np.append(brd.flatten(), mino))
        return torch.unsqueeze(observation.type(torch.FloatTensor), 0)

    @staticmethod
    def _show_frames(frames):
        """Render the recorded frames as a GIF; do nothing when none were recorded."""
        if not frames:
            return
        print("len frames:", len(frames))
        display_frames_as_gif(frames)

    def run(self):
        """Play ``self.num_episodes`` episodes, training the agent after every step.

        Each step stores a transition via ``agent.memorize`` and calls
        ``agent.update_q_network``. The stored reward is -1 on the step the
        game ends, +1 when the game score is positive, and 0 otherwise. The
        target model is refreshed every 15 episodes.
        """
        # NOTE(review): nesting was reconstructed from a flattened source; in
        # particular `cum_reward += rew` is assumed to run on every step.

        # Step counts of the 10 most recent episodes (oldest first).
        episode_10_list = np.zeros(10)
        # Per-episode statistics; collected but not returned.
        reward_per_epoch = []
        lifetime_per_epoch = []

        for episode in range(self.num_episodes):
            self.tetris.init()
            brd, mino = self.tetris.get_state()
            state = self._encode_state(brd, mino)

            # Frame capture is currently disabled, so this stays empty. It is
            # initialised here because the original referenced `frames`
            # without ever defining it, which raised NameError.
            frames = []
            cum_reward = 0
            step = 0

            if episode % 15 == 0:
                self.agent.update_target_model()

            while not self.tetris.check_dead():
                step += 1
                action = self.agent.get_action(state, mino, episode)
                self.tetris.update_state(action.squeeze())
                rew = self.tetris.score
                print(rew)

                brd, mino = self.tetris.get_state()
                done = self.tetris.check_dead()

                if done:
                    # Terminal step: there is no next state.
                    state_next = None
                    # Slide the 10-episode window and record this episode.
                    episode_10_list = np.hstack(
                        (episode_10_list[1:], step + 1))
                    # Penalise the move that ended the game.
                    reward = torch.FloatTensor([-1.0])
                else:
                    if rew > 0:
                        reward = torch.FloatTensor([1.0])
                    else:
                        reward = torch.FloatTensor([0.0])
                    state_next = self._encode_state(brd, mino)

                cum_reward += rew
                self.agent.memorize(state, action, state_next, reward)
                self.agent.update_q_network()
                state = state_next

                # End-of-episode bookkeeping.
                if done:
                    print('%d Episode: Finished after %d steps:10試行の平均step数 = %.1lf' % (
                        episode, step + 1, episode_10_list.mean()))
                    reward_per_epoch.append(cum_reward)
                    lifetime_per_epoch.append(step + 1)
                    break

            # Periodic progress log.
            if episode % PRINT_EVERY_EPISODE == 0:
                print("Episode %d finished after %f time steps" % (episode, step))
                print("cumulated reward: %f" % cum_reward)

            # Show an animation periodically, or for any episode that lasted
            # more than 2000 steps (at most once per episode).
            if episode % SHOW_GIF_EVERY_EPISODE == 0 or step > 2000:
                self._show_frames(frames)

    def save_model(self):
        """Save the agent's model weights to ``weight.pth``."""
        # The original took no `self` and read an undefined bare `agent`.
        torch.save(self.agent.brain.model.state_dict(), 'weight.pth')