import argparse
import os

import gym
import numpy as np
import tensorflow as tf
from gym import wrappers
from keras import backend as K
from keras.optimizers import Adam

# Project-local helpers (DQNAgent, ReplayMemory, AtariPreprocessor,
# HistoryPreprocessor, PreprocessorSequence, the policy classes,
# create_model, get_output_folder, get_session, and the callbacks) are
# assumed to be imported from the accompanying modules.


def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network', choices=['deep', 'linear'], default='deep')
    parser.add_argument('--method', choices=['dqn', 'double', 'dueling'],
                        default='dqn')
    # argparse's type=bool is a trap (bool('False') is True), so parse the
    # string explicitly.
    parser.add_argument('--monitor',
                        type=lambda s: str(s).lower() in ('true', '1'),
                        default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy',
                        choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')
    args = parser.parse_args()

    # Note: this deliberately overrides the --seed argument with a random one.
    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Hyperparameters.
    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32
    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4
    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000
    args.output = get_output_folder(args.output, args.env)
    args.suffix = args.method + '_' + args.network

    if args.method == 'dqn':
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif args.method == 'double':
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif args.method == 'dueling':
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        raise ValueError('Unknown method: {}'.format(args.method))

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)

    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # Build the preprocessors: AtariPreprocessor handles only the current
    # frame the agent is seeing, while PreprocessorSequence constructs each
    # state by concatenating the three previous frames from
    # HistoryPreprocessor with the current processed frame, yielding an
    # 84x84x4 input.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)

    # Replay memory stores all experience collected during training, with a
    # window length of 4.
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # Linear-decay greedy-epsilon policy: anneal epsilon from 1.0 to 0.1 over
    # the first 1,000,000 iterations, then keep it fixed at 0.1 for the rest
    # of training.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    # Build the agent: discount factor 0.99, batch size 32, one model update
    # every 4 iterations. During the first 50000 iterations we only collect
    # experience into memory and do not update the model.
    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)
    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env,
                                                          args.suffix)
        checkpoint_weights_filename = ('dqn_' + args.env + '_weights_' +
                                       args.suffix + '_{step}.h5f')
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval)
        ]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True)
        ]

        # Start training. We do not apply action repetition explicitly, since
        # the environment already skips frames at random.
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)
        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env, num_episodes=10, visualize=True, num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env,
                                                          args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env, num_episodes=250, visualize=True, num_burn_in=5,
                     action_repetition=1)

    # Upload the results to the OpenAI Gym scoreboard.
    if args.monitor:
        env.close()
        gym.upload(args.monitor_path, api_key='sk_J62obX9PQg2ExrM6H9rvzQ')
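
# The LinearDecayGreedyEpsilonPolicy above anneals epsilon linearly from 1.0
# to 0.1 over the first 1,000,000 steps and then holds it fixed. Below is a
# minimal standalone sketch of that schedule for reference; the function and
# parameter names are illustrative assumptions, not the repo's actual API.
import random


def linear_decay_epsilon(step, start_value=1.0, end_value=0.1,
                         num_steps=1000000):
    """Exploration rate for a given training step under linear decay."""
    if step >= num_steps:
        return end_value
    return start_value + (step / float(num_steps)) * (end_value - start_value)


def epsilon_greedy_action(q_values, step):
    """Pick a random action with probability epsilon, else the greedy one."""
    if random.random() < linear_decay_epsilon(step):
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])
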
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    # parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('--env', default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--output', default='results',
                        help='Directory to save data to')
    parser.add_argument('-l', '--isLinear', default=0, type=int,
                        choices=range(0, 2),
                        help='1: use linear model; 0: use deep model')
    parser.add_argument('-m', '--modelType', default='q',
                        choices=['q', 'double', 'dueling'],
                        help='q: q learning; double: double q learning; '
                             'dueling: dueling q learning')
    parser.add_argument('-s', '--simple', default=0, type=int,
                        choices=range(0, 2),
                        help='1: without replay or target fixing; '
                             '0: use replay and target fixing')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()
    # args.input_shape = tuple(args.input_shape)

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    model_name = (('linear_' if args.isLinear else 'deep_') + args.modelType +
                  ('_simple' if args.simple else ''))
    args.output = get_output_folder(args.output + '/' + model_name, args.env)

    env = gym.make(args.env)
    # env = gym.wrappers.Monitor(env, args.output)
    env.seed(args.seed)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    # tf.initialize_all_variables() is deprecated; use its replacement.
    K.get_session().run(tf.global_variables_initializer())

    is_linear = args.isLinear
    agent = DQNAgent(
        q_network=create_model(4, (84, 84), env.action_space.n, is_linear,
                               args.modelType),
        q_network2=create_model(4, (84, 84), env.action_space.n, is_linear,
                                args.modelType),
        preprocessor=AtariPreprocessor((84, 84)),
        memory=ReplayMemory(1000000, 4),
        gamma=0.99,
        target_update_freq=10000,
        num_burn_in=50000,
        train_freq=4,
        batch_size=32,
        is_linear=is_linear,
        model_type=args.modelType,
        use_replay_and_target_fixing=(not args.simple),
        epsilon=0,  # 0.05
        action_interval=4,
        output_path=args.output,
        save_freq=100000)
    agent.compile(lr=0.0001)
    agent.fit(env, 5000000)
    agent.load_weights()
    agent.evaluate(env, 100, video_path_suffix='final')
    env.close()
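
# The 'double' modelType above trains with two networks (q_network and
# q_network2). As a hedged sketch of the double-DQN target computation that
# setup implies: the online network selects the argmax action and the second
# network evaluates it. Names and array shapes below are illustrative
# assumptions, not the repo's actual API.
import numpy as np


def double_dqn_targets(rewards, dones, q_online_next, q_target_next,
                       gamma=0.99):
    """rewards, dones: shape (batch,); q_*_next: shape (batch, num_actions)."""
    best_actions = np.argmax(q_online_next, axis=1)     # select with online net
    batch_idx = np.arange(len(rewards))
    evaluated = q_target_next[batch_idx, best_actions]  # evaluate with 2nd net
    # Terminal transitions get no bootstrap term.
    return rewards + gamma * (1.0 - dones) * evaluated
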