def main():
    if len(sys.argv) != 5:
        print("usage:{} <env> <model_json> <weights> <directory>".format(sys.argv[0]))
        sys.exit(1)

    env = gym.make(sys.argv[1])
    env.frameskip = 1
    with open(sys.argv[2]) as json_file:
        model = model_from_json(json.load(json_file), {"Eq9": Eq9})
    model.load_weights(sys.argv[3])

    epsilon = 0.01
    input_shape = (84, 84)
    history_size = 4
    eval_size = 1
    directory = sys.argv[4]

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])  # from left to right

    policy = GreedyEpsilonPolicy(epsilon)
    agent = DQNAgent(model, preprocessors, None, policy, 0.99,
                     None, None, None, None)

    env = gym.wrappers.Monitor(env, directory, force=True)
    reward_arr, length_arr = agent.evaluate_detailed(env, eval_size,
                                                     render=False, verbose=True)
def evaluate(self, env, num_episodes):
    """Test your agent with a provided environment.

    You shouldn't update your network parameters here. Also if you
    have any layers that vary in behavior between train/test time
    (such as dropout or batch norm), you should set them to test.

    Basically run your policy on the environment and collect stats
    like cumulative reward, average episode length, etc.

    You can also call the render function here if you want to
    visually inspect your policy.
    """
    eval_policy = GreedyEpsilonPolicy(self.config.epsilon)
    cumu_reward = 0
    epscnt = 0
    while epscnt < num_episodes:
        isterminal = False
        _screen_raw = self.process_env_reset(env)  # Save to history
        while not isterminal:
            current_state = self.historyPre.get_current_state()
            action = self.select_action_test(current_state, eval_policy)  # Get action
            _screen_next_raw, reward, isterminal, _ = env.step(action)  # Take action, observe new state
            cumu_reward += reward
            _screen_raw = self.process_one_screen(
                _screen_raw, action, reward, _screen_next_raw,
                isterminal, True)  # Save to history and memory
        epscnt += 1
    return cumu_reward / num_episodes
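# The evaluation loops in these snippets all delegate action selection to a
# GreedyEpsilonPolicy, which is never defined here and whose constructor varies between
# codebases (sometimes it also takes the number of actions). A minimal sketch of the
# interface they assume follows: select_action(q_values) returns a random action with
# probability epsilon and the argmax otherwise.
import numpy as np


class GreedyEpsilonPolicy:
    """Pick a uniformly random action with probability epsilon, else the argmax of Q."""

    def __init__(self, epsilon):
        self.epsilon = epsilon

    def select_action(self, q_values):
        # q_values: 1-D array with one Q-value per action.
        if np.random.random() < self.epsilon:
            return np.random.randint(len(q_values))
        return int(np.argmax(q_values))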
def model_evaluate(model, env, IMAGE_SIZE, HISTORY_LENGTH):
    # Initialize everything.
    observation = env.reset()
    atari_processor = AtariProcessor(IMAGE_SIZE)
    history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE)
    greedy_selector = GreedyEpsilonPolicy(0.1)
    reward_cum = 0  # Cumulative total reward.
    done = False
    cnt_interaction = 0

    # Run and accumulate reward until reaching a terminal state.
    while not done:
        state = atari_processor.state_for_nn(observation)
        history_store.add_history(state)
        nn_input = history_store.get_history_for_nn()
        q_values = model_predict(model, nn_input)
        action = greedy_selector.select_action(q_values)
        observation, reward, done, info = env.step(action)
        reward_cum += reward
        cnt_interaction += 1
    # print("Total reward is", reward_cum)
    return reward_cum, cnt_interaction
def select_action(self, state, is_training, net_namescope, iteration_number=None):
    """Select the action based on the current state.

    You will probably want to vary your behavior here based on which
    stage of training you're in. For example, if you're still
    collecting random samples you might want to use a
    UniformRandomPolicy.

    If you're testing, you might want to use a GreedyEpsilonPolicy
    with a low epsilon.

    If you're training, you might want to use the
    LinearDecayGreedyEpsilonPolicy.

    This would also be a good place to call process_state_for_network
    in your preprocessor.

    Returns
    -------
    selected action
    """
    q_vals = self.calc_q_values(state, net_namescope)
    if is_training:
        # Use a linear decay greedy epsilon policy to choose actions for training.
        action_index = self.policy.select_action(q_vals, is_training, iteration_number)
    else:
        # For evaluation, use a greedy epsilon policy with a low epsilon.
        epsilon = 0.05
        pol = GreedyEpsilonPolicy(epsilon, self.num_actions)
        action_index = pol.select_action(q_vals)
    return action_index
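# Training in several snippets relies on a LinearDecayGreedyEpsilonPolicy, constructed in
# different ways across these codebases (positional (start, end, num_steps) here, keyword
# arguments elsewhere). A minimal sketch of the common idea, epsilon annealed linearly
# from start_value to end_value over num_steps decay calls, follows; the exact method
# signatures are assumptions.
import numpy as np


class LinearDecayGreedyEpsilonPolicy:
    """Epsilon-greedy policy whose epsilon decays linearly over num_steps calls."""

    def __init__(self, start_value, end_value, num_steps):
        self.start_value = start_value
        self.end_value = end_value
        self.num_steps = num_steps
        self.step = 0

    def get_epsilon(self, decay_epsilon=True):
        # Linear interpolation, clamped at end_value once num_steps is reached.
        fraction = min(self.step / float(self.num_steps), 1.0)
        epsilon = self.start_value + fraction * (self.end_value - self.start_value)
        if decay_epsilon:
            self.step += 1
        return epsilon

    def select_action(self, q_values, decay_epsilon=True):
        epsilon = self.get_epsilon(decay_epsilon)
        if np.random.random() < epsilon:
            return np.random.randint(len(q_values))
        return int(np.argmax(q_values))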
def main():
    if len(sys.argv) != 6:
        print("usage:{} <env> <model_json> <weights> <render> <random>".format(sys.argv[0]))
        sys.exit(1)

    env = gym.make(sys.argv[1])
    env.frameskip = 1
    with open(sys.argv[2]) as json_file:
        model = model_from_json(json.load(json_file), {"Eq9": Eq9})
    model.load_weights(sys.argv[3])

    epsilon = 0.01
    input_shape = (84, 84)
    history_size = 4
    eval_size = 100
    render = (sys.argv[4] == "y")

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])  # from left to right

    if sys.argv[5] == "y":
        print("using random policy")
        policy = UniformRandomPolicy(env.action_space.n)
    else:
        print("using greedy policy")
        policy = GreedyEpsilonPolicy(epsilon)

    agent = DQNAgent(model, preprocessors, None, policy, 0.99,
                     None, None, None, None)
    agent.add_keras_custom_layers({"Eq9": Eq9})

    reward_arr, length_arr = agent.evaluate_detailed(env, eval_size,
                                                     render=render, verbose=True)
    print("\rPlayed {} games, reward:M={}, SD={} length:M={}, SD={}".format(
        eval_size, np.mean(reward_arr), np.std(reward_arr),
        np.mean(length_arr), np.std(length_arr)))
    print("max:{} min:{}".format(np.max(reward_arr), np.min(reward_arr)))

    plt.hist(reward_arr)
    plt.show()
def main():  # noqa: D103
    # Example envs: SpaceInvaders-v0, Enduro-v0
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    #parser.add_argument('--env', default='PendulumSai-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    args = parser.parse_args()
    #args.input_shape = tuple(args.input_shape)
    #args.output = get_output_folder(args.output, args.env)

    # Here is where you should start up a session, create your DQN agent,
    # create your model, etc. Then you can run your fit method.
    model_name = 'linear'
    env = gym.make(args.env)

    num_iter = 2000000
    max_epi_iter = 1000
    epsilon = 0.4
    window = 4
    gamma = 0.99
    target_update_freq = 5000
    train_freq = 1
    batch_size = 32
    num_burn_in = 5000
    num_actions = 3  # env.action_space.n
    state_size = (84, 84, 1)
    new_size = state_size
    max_size = 1000000

    lr = 0.00020
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon2 = 1e-08
    decay = 0.0

    u_policy = UniformRandomPolicy(num_actions)
    ge_policy = GreedyEpsilonPolicy(epsilon)
    g_policy = GreedyPolicy()
    policy = {
        'u_policy': u_policy,
        'ge_policy': ge_policy,
        'g_policy': g_policy
    }

    #preprocessor = PreprocessorSequence([AtariPreprocessor(new_size), HistoryPreprocessor(window)])
    preprocessor = AtariPreprocessor(new_size)
    memory = SequentialMemory(max_size=max_size, window_length=window)

    model = create_model(window, state_size, num_actions)
    print(model.summary())

    dqnA = DQNAgent(q_network=model,
                    preprocessor=preprocessor,
                    memory=memory,
                    policy=policy,
                    gamma=gamma,
                    target_update_freq=target_update_freq,
                    num_burn_in=num_burn_in,
                    train_freq=train_freq,
                    batch_size=batch_size,
                    model_name=model_name)

    # testing
    #selected_action = dqnA.select_action(np.random.rand(1, 210, 160, 12), train=1, warmup_phase=0)

    h_loss = huber_loss
    optimizer = Adam(lr=lr, beta_1=beta_1, beta_2=beta_2,
                     epsilon=epsilon2, decay=decay)
    dqnA.compile(optimizer, h_loss)
    #callback1 = ProgbarLogger(count_mode='samples')
    dqnA.fit(env, num_iterations=num_iter, max_episode_length=max_epi_iter)
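# The agent above is compiled with huber_loss, and a later snippet compiles with
# mean_huber_loss; neither function is defined in these excerpts. A minimal sketch of the
# usual mean Huber loss used for DQN training follows (delta = 1.0 is an assumption).
import tensorflow as tf


def mean_huber_loss(y_true, y_pred, delta=1.0):
    # Quadratic for |error| <= delta, linear beyond that, averaged over the batch.
    error = y_true - y_pred
    quadratic = tf.minimum(tf.abs(error), delta)
    linear = tf.abs(error) - quadratic
    return tf.reduce_mean(0.5 * tf.square(quadratic) + delta * linear)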
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=SIZE_OF_STATE, help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    # TODO experiment with this value.
    parser.add_argument('--epsilon', default=0.1,
                        help='Final exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025,
                        help='Training learning rate.')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='Batch size of the training part')
    parser.add_argument('--question', type=int, default=7,
                        help='Which hw question to run.')
    parser.add_argument('--evaluate', action='store_true',
                        help='Only affects worker. Run evaluation instead of training.')
    parser.add_argument('--worker_epsilon', type=float,
                        help='Only affects worker. Override epsilon to use (instead of one in file).')
    parser.add_argument('--skip_model_restore', action='store_true',
                        help='Only affects worker. Use a newly initialized model instead of restoring one.')
    parser.add_argument('--generate_fixed_samples', action='store_true',
                        help=('Special case execution. Generate fixed samples and close. ' +
                              'This is necessary to run whenever the network or action space changes.'))
    parser.add_argument('--ai_input_dir', default='gcloud/inputs/',
                        help='Input directory with initialization files.')
    parser.add_argument('--ai_output_dir', default='gcloud/outputs/',
                        help='Output directory for gameplay files.')
    parser.add_argument('--is_worker', dest='is_manager', action='store_false',
                        help='Whether this is a worker (no training).')
    parser.add_argument('--is_manager', dest='is_manager', action='store_true',
                        help='Whether this is a manager (trains).')
    parser.set_defaults(is_manager=True)
    parser.add_argument('--psc', action='store_true',
                        help=('Only affects manager. Whether on PSC, ' +
                              'and should for example reduce disk usage.'))

    # Copied from original phillip code (run.py).
    for opt in CPU.full_opts():
        opt.update_parser(parser)
    parser.add_argument("--dolphin", action="store_true", default=None,
                        help="run dolphin")
    for opt in DolphinRunner.full_opts():
        opt.update_parser(parser)

    args = parser.parse_args()
    # run.sh might pass these in via environment variable, so user directory
    # might not already be expanded.
    args.ai_input_dir = os.path.expanduser(args.ai_input_dir)
    args.ai_output_dir = os.path.expanduser(args.ai_output_dir)
    if args.is_manager:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.set_random_seed(args.seed)

    do_evaluation = args.evaluate or random.random() < WORKER_EVALUATION_PROBABILITY
    if do_evaluation or args.generate_fixed_samples:
        args.cpu = EVAL_CPU_LEVEL
        print('OVERRIDING cpu level to: ' + str(EVAL_CPU_LEVEL))

    if args.generate_fixed_samples and args.is_manager:
        raise Exception('Can not generate fixed samples as manager. Must use ' +
                        '--is_worker and all other necessary flags (e.g. --iso ISO_PATH)')

    env = SmashEnv()
    if not args.is_manager:
        env.make(args)  # Opens Dolphin.
    question_settings = get_question_settings(args.question, args.batch_size)

    online_model, online_params = create_model(
        input_shape=args.input_shape,
        num_actions=env.action_space.n,
        model_name='online_model',
        create_network_fn=question_settings['create_network_fn'],
        learning_rate=args.learning_rate)

    target_model = online_model
    update_target_params_ops = []
    if (question_settings['target_update_freq'] is not None or
            question_settings['is_double_network']):
        target_model, target_params = create_model(
            input_shape=args.input_shape,
            num_actions=env.action_space.n,
            model_name='target_model',
            create_network_fn=question_settings['create_network_fn'],
            learning_rate=args.learning_rate)
        update_target_params_ops = [t.assign(s)
                                    for s, t in zip(online_params, target_params)]

    replay_memory = ReplayMemory(
        max_size=question_settings['replay_memory_size'],
        error_if_full=(not args.is_manager))

    saver = tf.train.Saver(max_to_keep=None)
    agent = DQNAgent(online_model=online_model,
                     target_model=target_model,
                     memory=replay_memory,
                     gamma=args.gamma,
                     target_update_freq=question_settings['target_update_freq'],
                     update_target_params_ops=update_target_params_ops,
                     batch_size=args.batch_size,
                     is_double_network=question_settings['is_double_network'],
                     is_double_dqn=question_settings['is_double_dqn'])

    sess = tf.Session()
    with sess.as_default():
        if args.generate_fixed_samples:
            print('Generating ' + str(NUM_FIXED_SAMPLES) +
                  ' fixed samples and saving to ./' + FIXED_SAMPLES_FILENAME)
            print('This file is only ever used on the manager.')
            agent.compile(sess)
            fix_samples = agent.prepare_fixed_samples(
                env, sess, UniformRandomPolicy(env.action_space.n),
                NUM_FIXED_SAMPLES, MAX_EPISODE_LENGTH)
            env.terminate()
            with open(FIXED_SAMPLES_FILENAME, 'wb') as f:
                pickle.dump(fix_samples, f)
            return

        if args.is_manager or args.skip_model_restore:
            agent.compile(sess)
        else:
            saver.restore(sess, os.path.join(args.ai_input_dir,
                                             WORKER_INPUT_MODEL_FILENAME))

        print('_________________')
        print('number_actions: ' + str(env.action_space.n))

        # Worker code.
        if not args.is_manager:
            print('ai_input_dir: ' + args.ai_input_dir)
            print('ai_output_dir: ' + args.ai_output_dir)

            if do_evaluation:
                evaluation = agent.evaluate(env, sess, GreedyPolicy(),
                                            EVAL_EPISODES, MAX_EPISODE_LENGTH)
                print('Evaluation: ' + str(evaluation))
                with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
                    fix_samples = pickle.load(fixed_samples_f)
                mean_max_Q = calculate_mean_max_Q(sess, online_model, fix_samples)

                evaluation = evaluation + (mean_max_Q,)
                with open(os.path.join(args.ai_output_dir,
                                       WORKER_OUTPUT_EVALUATE_FILENAME), 'wb') as f:
                    pickle.dump(evaluation, f)
                env.terminate()
                return

            worker_epsilon = args.worker_epsilon
            if worker_epsilon is None:
                with open(os.path.join(args.ai_input_dir,
                                       WORKER_INPUT_EPSILON_FILENAME)) as f:
                    lines = f.readlines()
                    # TODO handle unexpected lines better than just ignoring?
                    worker_epsilon = float(lines[0])
            print('Worker epsilon: ' + str(worker_epsilon))
            train_policy = GreedyEpsilonPolicy(worker_epsilon)

            agent.play(env, sess, train_policy,
                       total_seconds=PLAY_TOTAL_SECONDS,
                       max_episode_length=MAX_EPISODE_LENGTH)
            replay_memory.save_to_file(os.path.join(
                args.ai_output_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME))
            env.terminate()
            return

        # Manager code.
        mprint('Loading fix samples')
        with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
            fix_samples = pickle.load(fixed_samples_f)

        evaluation_dirs = set()
        play_dirs = set()
        save_model(saver, sess, args.ai_input_dir, epsilon=1.0)
        epsilon_generator = LinearDecayGreedyEpsilonPolicy(
            1.0, args.epsilon, TOTAL_WORKER_JOBS / 5.0)
        fits_so_far = 0
        mprint('Begin to train (now safe to run gcloud)')
        mprint('Initial mean_max_q: ' +
               str(calculate_mean_max_Q(sess, online_model, fix_samples)))

        while len(play_dirs) < TOTAL_WORKER_JOBS:
            output_dirs = os.listdir(args.ai_output_dir)
            output_dirs = [os.path.join(args.ai_output_dir, x) for x in output_dirs]
            output_dirs = set(x for x in output_dirs if os.path.isdir(x))
            new_dirs = sorted(output_dirs - evaluation_dirs - play_dirs)

            if len(new_dirs) == 0:
                time.sleep(0.1)
                continue

            new_dir = new_dirs[-1]  # Most recent gameplay.
            evaluation_path = os.path.join(new_dir, WORKER_OUTPUT_EVALUATE_FILENAME)

            if os.path.isfile(evaluation_path):
                evaluation_dirs.add(new_dir)
                with open(evaluation_path, 'rb') as evaluation_file:
                    rewards, game_lengths, mean_max_Q = pickle.load(evaluation_file)
                evaluation = [np.mean(rewards), np.std(rewards),
                              np.mean(game_lengths), np.std(game_lengths),
                              mean_max_Q]
                mprint('Evaluation: ' + '\t'.join(str(x) for x in evaluation))
                continue

            memory_path = os.path.join(new_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME)
            try:
                if os.path.getsize(memory_path) == 0:
                    # TODO Figure out why this happens despite temporary directory work.
                    # Also sometimes the file doesn't exist? Hence the try/except.
                    mprint('Output not ready somehow: ' + memory_path)
                    time.sleep(0.1)
                    continue

                with open(memory_path, 'rb') as memory_file:
                    worker_memories = pickle.load(memory_file)
            except Exception as exception:
                print('Error reading ' + memory_path + ': ' + str(exception.args))
                time.sleep(0.1)
                continue

            for worker_memory in worker_memories:
                replay_memory.append(*worker_memory)
            if args.psc:
                os.remove(memory_path)

            play_dirs.add(new_dir)
            if len(play_dirs) <= NUM_BURN_IN_JOBS:
                mprint('Skip training because still burn in.')
                mprint('len(worker_memories): ' + str(len(worker_memories)))
                continue

            for _ in range(int(len(worker_memories) * FITS_PER_SINGLE_MEMORY)):
                agent.fit(sess, fits_so_far)
                fits_so_far += 1

            # Partial evaluation to give frequent insight into agent progress.
            # Last time checked, this took ~0.1 seconds to complete.
            mprint('mean_max_q, len(worker_memories): ' +
                   str(calculate_mean_max_Q(sess, online_model, fix_samples)) +
                   ', ' + str(len(worker_memories)))

            # Always decrement epsilon (e.g. not just when saving model).
            model_epsilon = epsilon_generator.get_epsilon(decay_epsilon=True)
            if len(play_dirs) % SAVE_MODEL_EVERY == 0:
                save_model(saver, sess, args.ai_input_dir, model_epsilon)
def main(args):
    # gpu id
    # gpu_id = args.gpu
    # os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % gpu_id

    # make env
    env = gym.make(args.env)
    if args.mode == 'test' and args.submit:
        monitor_log = os.path.join(args.output, 'monitor.log')
        env = wrappers.Monitor(env, monitor_log, force=True)

    # build model
    # actions 0-5: 0 do nothing, 1 fire, 2 right, 3 left, 4 right+fire, 5 left+fire
    num_actions = env.action_space.n
    mem_size = 1000000
    window = 4
    input_shape = (84, 84)
    if args.type in ['DQN', 'double-DQN']:
        model = create_model(window, input_shape, num_actions, args.init)
        target = create_model(window, input_shape, num_actions, args.init)
    elif args.type in ['linear', 'linear-simple', 'double-Q']:
        model = create_model_linear(window, input_shape, num_actions, args.init)
        target = create_model_linear(window, input_shape, num_actions, args.init)
    elif args.type == 'duel':
        model = create_model_duel(window, input_shape, num_actions, args.init)
        target = create_model_duel(window, input_shape, num_actions, args.init)

    # memory = ReplayMemory(1000000, 100)  # window length is arbitrary
    # target_update_freq = 10000
    # num_burn_in = 50000
    target_update_freq = 10000
    num_burn_in = 50000
    train_freq = 4
    batch_size = 32
    gamma = 0.99
    epsilon = 0.05
    updates_per_epoch = 50000
    num_iterations = 50000000
    eval_episodes = 100
    max_episode_length = 10000

    # simple: no experience replay and no target fixing
    # if args.type == 'linear-simple':
    #     mem_size = 5
    #     target_update_freq = 1
    #     num_burn_in = 0
    #     batch_size = 1
    if args.type == 'linear-simple':
        num_burn_in = 0

    memory = ReplayMemoryEfficient(mem_size, window, input_shape)

    # with tf.device('/gpu:%d' % gpu_id):
    config = tf.ConfigProto(intra_op_parallelism_threads=8)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # preprocessor
    preprocessor = PreprocessorSequence()

    # policy
    policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000)
    policy_eval = GreedyEpsilonPolicy(epsilon)

    # build agent
    dqn_agent = DQNAgent(sess, env, args.type, model, target, preprocessor,
                         memory, policy, policy_eval, gamma,
                         target_update_freq, num_burn_in, train_freq,
                         batch_size, num_actions, updates_per_epoch,
                         args.output)

    if args.mode == 'train':
        # compile net and train with fit
        # rmsprop = RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
        # dqn_agent.compile_networks(rmsprop, mean_huber_loss)
        # adam = Adam(lr=0.00025, beta_1=0.95, beta_2=0.95, epsilon=0.1)
        adam = Adam(lr=0.0001)
        dqn_agent.compile_networks(adam, mean_huber_loss)
        if args.type == 'linear-simple':
            dqn_agent.fit_simple(num_iterations, max_episode_length)
        else:
            dqn_agent.fit(num_iterations, max_episode_length)
    elif args.mode == 'test':
        # load net and evaluate
        model_path = os.path.join(args.output, 'model_epoch%03d' % args.epoch)
        dqn_agent.load_networks(model_path)
        if args.submit:
            eval_episodes = 1
        dqn_agent.play(eval_episodes, max_episode_length)
        # if args.submit:
        #     gym.upload(monitor_log, api_key='sk_wa5MgeDTnOQ209qBCP7jQ')
        # else:
        #     log_file = open(os.path.join(args.output, 'evaluation.txt'), 'a+')
        #     log_file.write('%d %f %f %f %f\n' % (args.epoch,
        #                                          np.mean(lengths),
        #                                          np.std(lengths),
        #                                          np.mean(rewards),
        #                                          np.std(rewards)))
        #     log_file.close()
    env.close()
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network', choices=['deep', 'linear'], default='deep')
    parser.add_argument('--method', choices=['dqn', 'double', 'dueling'], default='dqn')
    parser.add_argument('--monitor', type=bool, default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy', choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')
    args = parser.parse_args()

    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32
    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4
    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000
    args.output = get_output_folder(args.output, args.env)
    args.suffix = args.method + '_' + args.network

    if args.method == 'dqn':
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif args.method == 'double':
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif args.method == 'dueling':
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        print('Attention! Method wrong!')

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)
    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # We create our preprocessors: the AtariPreprocessor only processes the
    # current frame the agent is seeing, and the sequence preprocessor
    # constructs the state by concatenating the 3 previous frames from the
    # HistoryPreprocessor with the current processed frame.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # construct 84x84x4

    # We create our memory for saving all experience collected during training
    # with window length 4.
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # We use a linear decay greedy epsilon policy, annealing epsilon from 1 to
    # 0.1 over the first 1,000,000 iterations and keeping it at 0.1 afterwards.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    # We construct our agent with a discount factor of 0.99 and a batch size of
    # 32. We update the model every 4 iterations, but during the first
    # 50000 iterations we only collect data into the memory and do not update
    # the model.
    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)
    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env, args.suffix)
        checkpoint_weights_filename = ('dqn_' + args.env + '_weights_' +
                                       args.suffix + '_{step}.h5f')
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval)
        ]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True)
        ]

        # Start training. We don't apply action repetition explicitly since the
        # game itself randomly skips frames.
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)

        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env, num_episodes=10, visualize=True,
                     num_burn_in=5, action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env, args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env, num_episodes=250, visualize=True,
                     num_burn_in=5, action_repetition=1)

    # We upload our result to OpenAI Gym.
    if args.monitor:
        env.close()
        gym.upload(args.monitor_path, api_key='sk_J62obX9PQg2ExrM6H9rvzQ')
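# Several of the agents above expose an enable_double_dqn / is_double_dqn switch without
# showing the corresponding target computation. The sketch below (not taken from any of
# these codebases) contrasts the vanilla DQN target with the double-DQN target for a
# sampled batch of transitions.
import numpy as np


def dqn_targets(rewards, next_q_online, next_q_target, terminals, gamma=0.99,
                double_dqn=False):
    """One-step TD targets for a batch.

    rewards, terminals: arrays of shape (batch,).
    next_q_online, next_q_target: arrays of shape (batch, num_actions) holding
    Q(s', .) from the online and target networks respectively.
    """
    if double_dqn:
        # Double DQN: the online network selects the action, the target network evaluates it.
        best_actions = np.argmax(next_q_online, axis=1)
        next_values = next_q_target[np.arange(len(rewards)), best_actions]
    else:
        # Vanilla DQN: the target network both selects and evaluates.
        next_values = np.max(next_q_target, axis=1)
    return rewards + gamma * (1.0 - terminals.astype(np.float32)) * next_values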
epsilon = 0.05
C = 10000
env = gym.make("Breakout-v0")
# number of actions, used to construct a policy selector
num_actions = env.action_space.n

# create helpers
# observation processor
atari_processor = AtariProcessor(IMAGE_SIZE)
history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE)

# policy selectors; for testing, use the uniform random policy selector, which
# only needs the number of actions passed to its constructor
random_selector = UniformRandomPolicy(num_actions)
greedy_selector = GreedyPolicy()
greedy_epsilon_selector = GreedyEpsilonPolicy(epsilon)
greedy_epsilon_linear_decay_selector = LinearDecayGreedyEpsilonPolicy(
    1, 0.05, int(round(MAX_INTERACTION / 5, 0)))

# Initialize neural networks.
# Online network, which changes during training but is not used to calculate Q*.
model_online = NN_cnn((IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH), num_actions)
# Fixed network, which is not changed during training but is used to calculate Q*.
model_fixed = NN_cnn((IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH), num_actions)
model_fixed.model.set_weights(model_online.model.get_weights())
#model_fixed.model = Model.from_config(model_online.model.get_config())

# Initialize memory.
mem = NNMemStore(MEM_SIZE, (IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH))
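# The HistoryStore used above (and the HistoryPreprocessor in other snippets) is only
# referenced, never defined. A minimal sketch of a frame-history buffer that stacks the
# last HISTORY_LENGTH processed frames into a single network input follows; 2-D uint8
# frames of shape IMAGE_SIZE are an assumption.
from collections import deque

import numpy as np


class HistoryStore:
    """Keep the last history_length frames and stack them for the network."""

    def __init__(self, history_length, image_size):
        self.history_length = history_length
        self.image_size = image_size
        self.frames = deque(maxlen=history_length)
        self.reset()

    def reset(self):
        # Start every episode from an all-zero history.
        self.frames.clear()
        for _ in range(self.history_length):
            self.frames.append(np.zeros(self.image_size, dtype=np.uint8))

    def add_history(self, frame):
        self.frames.append(frame)

    def get_history_for_nn(self):
        # Shape (1, height, width, history_length), ready for a conv net.
        return np.stack(self.frames, axis=-1)[np.newaxis, ...]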
            env.render()
            epi_reward += reward1
        print("episode reward", epi_reward)
        tot_rewards.append(epi_reward)
    return tot_rewards, np.sum(tot_rewards) / (num_epi + 0.00001)


model_name = 'linear_naive'
env = gym.make('SpaceInvaders-v0')
epsilon = 0.05
window = 4
state_size = (84, 84, 1)
num_actions = 3
action_rep = 4
epsilon = 0.4
ge_policy = GreedyEpsilonPolicy(epsilon)
preprocessor = AtariPreprocessor(state_size)

model = create_model_linear_naive(window, state_size, num_actions, model_name)
print(model.summary())
model = load_weights(filepath='/home/sai/parameters/linear_naive-weights-440000.h5',
                     model=model)

# TODO get weights from files and plot the rewards based on the iterations
# TODO find rewards for the final model, averaged over 100 episodes
num_epi = 3
rewards, avg_reward = evaluate(env, ge_policy, preprocessor, model, num_epi,
                               window_length=window, action_rep=action_rep)
print("average reward", avg_reward)
def fit(self, env, num_iterations, max_episode_length=None):
    """Fit your model to the provided environment.

    It's a good idea to print out things like loss, average reward,
    Q-values, etc to see if your agent is actually improving.

    You should probably also periodically save your network weights and
    any other useful info.

    This is where you should sample actions from your network, collect
    experience samples and add them to your replay memory, and update
    your network parameters.

    Parameters
    ----------
    env: gym.Env
      This is your Atari environment. You should wrap the environment
      using the wrap_atari_env function in utils.py.
    num_iterations: int
      How many samples/updates to perform.
    max_episode_length: int
      How long a single episode should last before the agent resets.
      Can help exploration.
    """
    # Copy the current network to the target network.
    self._target_network = clone_keras_model(self._network,
                                             self._keras_custom_layers)
    eval_policy = GreedyEpsilonPolicy(0.01)
    self._action_size = env.action_space.n
    start_fitting_time = time.time()
    total_step_num = 0

    # Number of iterations (episodes).
    for num_i in range(0, num_iterations):
        # Restart variables and settings.
        self._preprocessors.reset()  # restart preprocessors
        curr_frame = env.reset()  # restart the environment and get the start state
        is_terminal = False
        cumulative_reward = 0
        cumulative_loss = 0
        step_num = 0  # step number for the current episode
        start_time = time.time()

        # For storing values carried across steps.
        curr_action = 0
        curr_reward = 0
        next_maxed_frame = None
        last_frame = np.zeros(np.shape(curr_frame), dtype=np.uint8)
        mixed_frame = np.maximum(last_frame, curr_frame)
        processed_curr_state = self._preprocessors.process_state_for_memory(
            mixed_frame)

        # Loop until the end of the episode or until the maximum episode length is hit.
        while not is_terminal and (max_episode_length is None or
                                   step_num < max_episode_length):
            # Use the policy to select the action based on the state.
            curr_action = self.select_action(processed_curr_state)

            # Execute one step.
            curr_frame, curr_reward, is_terminal, debug_info = env.step(curr_action)
            processed_next_state = self._preprocessors.process_state_for_memory(
                curr_frame)

            # Insert into memory.
            self._replay_memory.append(
                processed_curr_state, processed_next_state, curr_action,
                self._preprocessors.process_reward(curr_reward), is_terminal)

            # Update the policy.
            training_loss = self.update_policy(total_step_num)
            cumulative_loss += training_loss

            # Check if we should run an evaluation step and save the rewards.
            if total_step_num != 0 and total_step_num % self._eval_freq == 0:
                print("\nstart performance evaluation for step:{:09d}".format(
                    total_step_num))
                # Switch to the evaluation policy.
                curr_policy = self._policy
                self._policy = eval_policy
                # Evaluate.
                avg_reward, avg_length = self.evaluate(env, self._eval_times,
                                                       verbose=True)
                # Restore the training policy.
                self._policy = curr_policy
                # Save the performance.
                self._performance_recorder.append(
                    (total_step_num, avg_reward, avg_length))

            # Check if we should do a checkpoint save.
            if (total_step_num % self._checkin_freq == 0 or
                    (time.time() - start_fitting_time) > self._total_duration):
                self.save_check_point(total_step_num)

            # Update progress values.
            step_num += 1
            total_step_num += 1
            cumulative_reward += curr_reward
            processed_curr_state = processed_next_state

            # For tracking purposes.
            sys.stdout.write(
                "\r{:09d} ep:{:04d}, len:{:04d}, reward:{:.4f}, loss:{:.5f}, time_per_step:{:.5f}"
                .format(total_step_num, num_i, step_num, cumulative_reward,
                        cumulative_loss / step_num,
                        (time.time() - start_time) / step_num))
            sys.stdout.flush()

        # Save this generic per-episode information (useful for debugging).
        self._episodic_recorder.append(
            (num_i, step_num, cumulative_reward, cumulative_loss))
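# process_state_for_memory is only shown by its interface above. As a rough sketch (the
# resize details are assumptions, not taken from these repositories), an Atari frame
# preprocessor that converts an RGB frame into the usual 84x84 grayscale uint8 image,
# after the pixel-wise max of consecutive frames already computed in fit, could be:
import numpy as np
from PIL import Image


def process_frame_for_memory(frame, output_size=(84, 84)):
    """Convert an RGB Atari frame (H, W, 3) uint8 into an 84x84 grayscale uint8 image."""
    image = Image.fromarray(frame)
    image = image.convert('L')  # luminance / grayscale
    image = image.resize(output_size, Image.BILINEAR)
    return np.asarray(image, dtype=np.uint8)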