def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--seed', default=10703, type=int,
                        help='Random seed')
    parser.add_argument('--input_shape', default=SIZE_OF_STATE,
                        help='Input shape')
    parser.add_argument('--gamma', default=0.99, type=float,
                        help='Discount factor')
    # TODO experiment with this value.
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='Final exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, type=float,
                        help='Training learning rate.')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='Batch size of the training part')
    parser.add_argument('--question', type=int, default=7,
                        help='Which hw question to run.')
    parser.add_argument('--evaluate', action='store_true',
                        help='Only affects worker. Run evaluation instead of training.')
    parser.add_argument('--worker_epsilon', type=float,
                        help='Only affects worker. Override epsilon to use (instead of one in file).')
    parser.add_argument('--skip_model_restore', action='store_true',
                        help='Only affects worker. Use a newly initialized model instead of restoring one.')
    parser.add_argument('--generate_fixed_samples', action='store_true',
                        help=('Special case execution. Generate fixed samples and close. ' +
                              'This is necessary to run whenever the network or action space changes.'))
    parser.add_argument('--ai_input_dir', default='gcloud/inputs/',
                        help='Input directory with initialization files.')
    parser.add_argument('--ai_output_dir', default='gcloud/outputs/',
                        help='Output directory for gameplay files.')
    parser.add_argument('--is_worker', dest='is_manager', action='store_false',
                        help='Whether this is a worker (no training).')
    parser.add_argument('--is_manager', dest='is_manager', action='store_true',
                        help='Whether this is a manager (trains).')
    parser.set_defaults(is_manager=True)
    parser.add_argument('--psc', action='store_true',
                        help=('Only affects manager. Whether on PSC, ' +
                              'and should for example reduce disk usage.'))

    # Copied from original phillip code (run.py).
    for opt in CPU.full_opts():
        opt.update_parser(parser)
    parser.add_argument('--dolphin', action='store_true', default=None,
                        help='run dolphin')
    for opt in DolphinRunner.full_opts():
        opt.update_parser(parser)

    args = parser.parse_args()
    # run.sh might pass these in via environment variable, so the user
    # directory might not already be expanded.
    args.ai_input_dir = os.path.expanduser(args.ai_input_dir)
    args.ai_output_dir = os.path.expanduser(args.ai_output_dir)

    if args.is_manager:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.set_random_seed(args.seed)

    do_evaluation = args.evaluate or random.random() < WORKER_EVALUATION_PROBABILITY
    if do_evaluation or args.generate_fixed_samples:
        args.cpu = EVAL_CPU_LEVEL
        print('OVERRIDING cpu level to: ' + str(EVAL_CPU_LEVEL))

    if args.generate_fixed_samples and args.is_manager:
        raise Exception('Cannot generate fixed samples as manager. Must use ' +
                        '--is_worker and all other necessary flags (e.g. --iso ISO_PATH)')

    env = SmashEnv()
    if not args.is_manager:
        env.make(args)  # Opens Dolphin.
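    # From here the two roles diverge: the manager never opens Dolphin and only
    # trains on gameplay that workers ship back through ai_output_dir, while
    # each worker plays in Dolphin and writes its replay memory (or an
    # evaluation result) into that directory.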
    question_settings = get_question_settings(args.question, args.batch_size)

    online_model, online_params = create_model(
        input_shape=args.input_shape, num_actions=env.action_space.n,
        model_name='online_model',
        create_network_fn=question_settings['create_network_fn'],
        learning_rate=args.learning_rate)

    target_model = online_model
    update_target_params_ops = []
    if (question_settings['target_update_freq'] is not None or
            question_settings['is_double_network']):
        target_model, target_params = create_model(
            input_shape=args.input_shape, num_actions=env.action_space.n,
            model_name='target_model',
            create_network_fn=question_settings['create_network_fn'],
            learning_rate=args.learning_rate)
        update_target_params_ops = [
            t.assign(s) for s, t in zip(online_params, target_params)]

    replay_memory = ReplayMemory(
        max_size=question_settings['replay_memory_size'],
        error_if_full=(not args.is_manager))

    saver = tf.train.Saver(max_to_keep=None)
    agent = DQNAgent(online_model=online_model,
                     target_model=target_model,
                     memory=replay_memory,
                     gamma=args.gamma,
                     target_update_freq=question_settings['target_update_freq'],
                     update_target_params_ops=update_target_params_ops,
                     batch_size=args.batch_size,
                     is_double_network=question_settings['is_double_network'],
                     is_double_dqn=question_settings['is_double_dqn'])

    sess = tf.Session()
    with sess.as_default():
        if args.generate_fixed_samples:
            print('Generating ' + str(NUM_FIXED_SAMPLES) +
                  ' fixed samples and saving to ./' + FIXED_SAMPLES_FILENAME)
            print('This file is only ever used on the manager.')
            agent.compile(sess)
            fix_samples = agent.prepare_fixed_samples(
                env, sess, UniformRandomPolicy(env.action_space.n),
                NUM_FIXED_SAMPLES, MAX_EPISODE_LENGTH)
            env.terminate()
            with open(FIXED_SAMPLES_FILENAME, 'wb') as f:
                pickle.dump(fix_samples, f)
            return

        if args.is_manager or args.skip_model_restore:
            agent.compile(sess)
        else:
            saver.restore(sess, os.path.join(args.ai_input_dir,
                                             WORKER_INPUT_MODEL_FILENAME))

        print('_________________')
        print('number_actions: ' + str(env.action_space.n))

        # Worker code.
        if not args.is_manager:
            print('ai_input_dir: ' + args.ai_input_dir)
            print('ai_output_dir: ' + args.ai_output_dir)

            if do_evaluation:
                evaluation = agent.evaluate(env, sess, GreedyPolicy(),
                                            EVAL_EPISODES, MAX_EPISODE_LENGTH)
                print('Evaluation: ' + str(evaluation))
                with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
                    fix_samples = pickle.load(fixed_samples_f)
                mean_max_Q = calculate_mean_max_Q(sess, online_model, fix_samples)

                evaluation = evaluation + (mean_max_Q,)
                with open(os.path.join(args.ai_output_dir,
                                       WORKER_OUTPUT_EVALUATE_FILENAME), 'wb') as f:
                    pickle.dump(evaluation, f)
                env.terminate()
                return

            worker_epsilon = args.worker_epsilon
            if worker_epsilon is None:
                with open(os.path.join(args.ai_input_dir,
                                       WORKER_INPUT_EPSILON_FILENAME)) as f:
                    lines = f.readlines()
                    # TODO handle unexpected lines better than just ignoring?
                    worker_epsilon = float(lines[0])
            print('Worker epsilon: ' + str(worker_epsilon))
            train_policy = GreedyEpsilonPolicy(worker_epsilon)

            agent.play(env, sess, train_policy,
                       total_seconds=PLAY_TOTAL_SECONDS,
                       max_episode_length=MAX_EPISODE_LENGTH)
            replay_memory.save_to_file(os.path.join(
                args.ai_output_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME))
            env.terminate()
            return

        # Manager code.
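        # The manager polls ai_output_dir: each finished worker job appears as
        # a new directory holding either an evaluation pickle
        # (WORKER_OUTPUT_EVALUATE_FILENAME), which is just logged, or a
        # gameplay pickle (WORKER_OUTPUT_GAMEPLAY_FILENAME), whose transitions
        # are appended to the replay memory before further fits.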
        mprint('Loading fix samples')
        with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
            fix_samples = pickle.load(fixed_samples_f)

        evaluation_dirs = set()
        play_dirs = set()
        save_model(saver, sess, args.ai_input_dir, epsilon=1.0)
        epsilon_generator = LinearDecayGreedyEpsilonPolicy(
            1.0, args.epsilon, TOTAL_WORKER_JOBS / 5.0)
        fits_so_far = 0
        mprint('Begin to train (now safe to run gcloud)')
        mprint('Initial mean_max_q: ' +
               str(calculate_mean_max_Q(sess, online_model, fix_samples)))

        while len(play_dirs) < TOTAL_WORKER_JOBS:
            output_dirs = os.listdir(args.ai_output_dir)
            output_dirs = [os.path.join(args.ai_output_dir, x) for x in output_dirs]
            output_dirs = set(x for x in output_dirs if os.path.isdir(x))
            new_dirs = sorted(output_dirs - evaluation_dirs - play_dirs)

            if len(new_dirs) == 0:
                time.sleep(0.1)
                continue

            new_dir = new_dirs[-1]  # Most recent gameplay.
            evaluation_path = os.path.join(new_dir, WORKER_OUTPUT_EVALUATE_FILENAME)

            if os.path.isfile(evaluation_path):
                evaluation_dirs.add(new_dir)
                with open(evaluation_path, 'rb') as evaluation_file:
                    rewards, game_lengths, mean_max_Q = pickle.load(evaluation_file)
                evaluation = [np.mean(rewards), np.std(rewards),
                              np.mean(game_lengths), np.std(game_lengths),
                              mean_max_Q]
                mprint('Evaluation: ' + '\t'.join(str(x) for x in evaluation))
                continue

            memory_path = os.path.join(new_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME)
            try:
                if os.path.getsize(memory_path) == 0:
                    # TODO Figure out why this happens despite temporary directory work.
                    # Also sometimes the file doesn't exist? Hence the try/except.
                    mprint('Output not ready somehow: ' + memory_path)
                    time.sleep(0.1)
                    continue

                with open(memory_path, 'rb') as memory_file:
                    worker_memories = pickle.load(memory_file)
            except Exception as exception:
                print('Error reading ' + memory_path + ': ' + str(exception.args))
                time.sleep(0.1)
                continue

            for worker_memory in worker_memories:
                replay_memory.append(*worker_memory)
            if args.psc:
                os.remove(memory_path)

            play_dirs.add(new_dir)
            if len(play_dirs) <= NUM_BURN_IN_JOBS:
                mprint('Skip training because still burn in.')
                mprint('len(worker_memories): ' + str(len(worker_memories)))
                continue

            for _ in range(int(len(worker_memories) * FITS_PER_SINGLE_MEMORY)):
                agent.fit(sess, fits_so_far)
                fits_so_far += 1

            # Partial evaluation to give frequent insight into agent progress.
            # Last time checked, this took ~0.1 seconds to complete.
            mprint('mean_max_q, len(worker_memories): ' +
                   str(calculate_mean_max_Q(sess, online_model, fix_samples)) +
                   ', ' + str(len(worker_memories)))

            # Always decrement epsilon (e.g. not just when saving model).
            model_epsilon = epsilon_generator.get_epsilon(decay_epsilon=True)
            if len(play_dirs) % SAVE_MODEL_EVERY == 0:
                save_model(saver, sess, args.ai_input_dir, model_epsilon)
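
# Hedged sketch (not part of either script) of the Double-DQN target that the
# is_double_dqn / 'double-DQN' settings select; the actual computation lives
# inside the respective DQNAgent implementations:
#
#   a_star = argmax_a Q_online(s_next, a)          # online net picks the action
#   y      = r + gamma * Q_target(s_next, a_star)  # target net scores it
#
# versus the vanilla DQN target y = r + gamma * max_a Q_target(s_next, a).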
def main(args):
    # gpu id
    # gpu_id = args.gpu
    # os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % gpu_id

    # make env
    env = gym.make(args.env)
    if args.mode == 'test' and args.submit:
        monitor_log = os.path.join(args.output, 'monitor.log')
        env = wrappers.Monitor(env, monitor_log, force=True)

    # build model
    # actions 0-5: 0 do nothing, 1 fire, 2 right, 3 left, 4 right+fire, 5 left+fire
    num_actions = env.action_space.n
    mem_size = 1000000
    window = 4
    input_shape = (84, 84)
    if args.type in ['DQN', 'double-DQN']:
        model = create_model(window, input_shape, num_actions, args.init)
        target = create_model(window, input_shape, num_actions, args.init)
    elif args.type in ['linear', 'linear-simple', 'double-Q']:
        model = create_model_linear(window, input_shape, num_actions, args.init)
        target = create_model_linear(window, input_shape, num_actions, args.init)
    elif args.type == 'duel':
        model = create_model_duel(window, input_shape, num_actions, args.init)
        target = create_model_duel(window, input_shape, num_actions, args.init)

    # memory = ReplayMemory(1000000, 100)  # window length is arbitrary
    target_update_freq = 10000
    num_burn_in = 50000
    train_freq = 4
    batch_size = 32
    gamma = 0.99
    epsilon = 0.05
    updates_per_epoch = 50000
    num_iterations = 50000000
    eval_episodes = 100
    max_episode_length = 10000

    # simple: no experience replay and no target fixing
    # if args.type == 'linear-simple':
    #     mem_size = 5
    #     target_update_freq = 1
    #     num_burn_in = 0
    #     batch_size = 1
    if args.type == 'linear-simple':
        num_burn_in = 0
    memory = ReplayMemoryEfficient(mem_size, window, input_shape)

    # with tf.device('/gpu:%d' % gpu_id):
    config = tf.ConfigProto(intra_op_parallelism_threads=8)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # preprocessor
    preprocessor = PreprocessorSequence()

    # policy
    policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000)
    policy_eval = GreedyEpsilonPolicy(epsilon)

    # build agent
    dqn_agent = DQNAgent(sess, env, args.type, model, target, preprocessor,
                         memory, policy, policy_eval, gamma,
                         target_update_freq, num_burn_in, train_freq,
                         batch_size, num_actions, updates_per_epoch,
                         args.output)
    if args.mode == 'train':
        # compile net and train with fit
        # rmsprop = RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
        # dqn_agent.compile_networks(rmsprop, mean_huber_loss)
        # adam = Adam(lr=0.00025, beta_1=0.95, beta_2=0.95, epsilon=0.1)
        adam = Adam(lr=0.0001)
        dqn_agent.compile_networks(adam, mean_huber_loss)
        if args.type == 'linear-simple':
            dqn_agent.fit_simple(num_iterations, max_episode_length)
        else:
            dqn_agent.fit(num_iterations, max_episode_length)
    elif args.mode == 'test':
        # load net and evaluate
        model_path = os.path.join(args.output, 'model_epoch%03d' % args.epoch)
        dqn_agent.load_networks(model_path)
        if args.submit:
            eval_episodes = 1
        dqn_agent.play(eval_episodes, max_episode_length)
        # if args.submit:
        #     gym.upload(monitor_log, api_key='sk_wa5MgeDTnOQ209qBCP7jQ')
        # else:
        #     log_file = open(os.path.join(args.output, 'evaluation.txt'), 'a+')
        #     log_file.write('%d %f %f %f %f\n' % (args.epoch,
        #                                          np.mean(lengths),
        #                                          np.std(lengths),
        #                                          np.mean(rewards),
        #                                          np.std(rewards)))
        #     log_file.close()
    env.close()
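
# Hedged sketch of a command-line parser consistent with the attribute accesses
# above (args.env, args.mode, args.type, args.init, args.output, args.epoch,
# args.submit, args.gpu). Flag names match the code; all defaults are
# assumptions, and `import argparse` at the top of the file is assumed.
def build_parser():
    parser = argparse.ArgumentParser(description='Run DQN on an Atari gym environment')
    parser.add_argument('--env', default='SpaceInvaders-v0',
                        help='Gym environment name (assumed default)')
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--type', default='DQN',
                        choices=['DQN', 'double-DQN', 'linear',
                                 'linear-simple', 'double-Q', 'duel'])
    parser.add_argument('--init', default='uniform', help='Weight initializer')
    parser.add_argument('--output', default='output/', help='Output directory')
    parser.add_argument('--epoch', type=int, default=0,
                        help='Which saved epoch to load in test mode')
    parser.add_argument('--submit', action='store_true',
                        help='Wrap env in a gym Monitor when testing')
    parser.add_argument('--gpu', type=int, default=0, help='GPU id')
    return parser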