def main(): print("Creating model...") model = create_model() model.summary() print("Creating environment...") environment = gym.make("CartPole-v0") environment._max_episode_steps = 500 print("Creating agent...") if agent_type == "dqn": agent = DQNAgent(name="cartpole-dqn", model=model, environment=environment, observation_frames=1, observation_transformation=observation_transformation, reward_transformation=reward_transformation, gamma=0.95, final_epsilon=0.01, initial_epsilon=1.0, number_of_iterations=1000000, replay_memory_size=2000, minibatch_size=32) elif agent_type == "ddqn": agent = DDQNAgent( name="cartpole-ddqn", model=model, environment=environment, observation_frames=1, observation_transformation=observation_transformation, reward_transformation=reward_transformation, gamma=0.95, final_epsilon=0.01, initial_epsilon=1.0, number_of_iterations=1000000, replay_memory_size=2000, minibatch_size=32, model_copy_interval=100) agent.enable_rewards_tracking(rewards_running_means_length=10000) agent.enable_episodes_tracking(episodes_running_means_length=10000) agent.enable_maxq_tracking(maxq_running_means_length=10000) agent.enable_model_saving(model_save_frequency=100000) agent.enable_tensorboard_for_tracking() print("Training ...") agent.fit(verbose=True, headless="render" not in sys.argv)
def main(): print("Creating environment...") environment = gym_tetris.make('Tetris-v0') print("Creating model...") model = modelutils.create_model(number_of_actions) model.summary() print("Creating agent...") if agent_type == "dqn": agent = DQNAgent( name="tetris-dqn", environment=environment, model=model, observation_transformation=utils.resize_and_bgr2gray, observation_frames=4, number_of_iterations=1000000, gamma=0.95, final_epsilon=0.01, initial_epsilon=1.0, replay_memory_size=2000, minibatch_size=32 ) elif agent_type == "ddqn": agent = DDQNAgent( name="tetris-ddqn", environment=environment, model=model, observation_transformation=utils.resize_and_bgr2gray, observation_frames=4, number_of_iterations=1000000, gamma=0.95, final_epsilon=0.01, initial_epsilon=1.0, replay_memory_size=2000, minibatch_size=32, model_copy_interval=100 ) agent.enable_rewards_tracking(rewards_running_means_length=10000) agent.enable_episodes_tracking(episodes_running_means_length=100) agent.enable_maxq_tracking(maxq_running_means_length=10000) agent.enable_model_saving(model_save_frequency=10000) agent.enable_plots_saving(plots_save_frequency=10000) print("Training ...") agent.fit(verbose=True, headless="headless" in sys.argv, render_states=True)
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description="Run DQN on iLOCuS")
    parser.add_argument("--network_name", default="deep_q_network", type=str,
                        help="Type of model to use")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size")
    parser.add_argument("--map_shape", default=(15, 15), type=tuple, help="Map size")
    parser.add_argument("--num_actions", default=4, type=int, help="Level of pricing")
    parser.add_argument("--gamma", default=0.8, type=float, help="Discount factor")
    parser.add_argument("--alpha", default=0.0001, type=float, help="Learning rate")
    parser.add_argument("--epsilon", default=0.5, type=float,
                        help="Exploration probability for epsilon-greedy")
    parser.add_argument("--target_update_freq", default=10000, type=int,
                        help="Frequency for copying weights to the target network")
    parser.add_argument("--num_iterations", default=5000000, type=int,
                        help="Number of overall interactions with the environment")
    parser.add_argument("--max_episode_length", default=200000, type=int,
                        help="Terminate an episode early after this many steps")
    parser.add_argument("--train_freq", default=4, type=int, help="Frequency for training")
    parser.add_argument("--num-burn-in", default=10000, type=int,
                        help="Number of transitions in memory before training starts")
    parser.add_argument("-o", "--output", default="ilocus-v0", type=str,
                        help="Directory to save data to")
    parser.add_argument("--seed", default=0, type=int, help="Random seed")
    parser.add_argument("--train", default=True, type=bool,
                        help="Train/Evaluate; set True to train the model")
    parser.add_argument("--model_path", default="atari-v0", type=str,
                        help="Model path for evaluation")
    parser.add_argument("--max_grad", default=1.0, type=float, help="Parameter for Huber loss")
    parser.add_argument("--log_dir", default="log", type=str,
                        help="Log folder in which to save evaluation results")
    parser.add_argument("--flip_coin", default=False, type=str,
                        help="Whether or not to use double Q-learning")
    parser.add_argument("--eval_num", default=100, type=int,
                        help="Number of evaluations to run")
    parser.add_argument("--save_freq", default=100000, type=int, help="Model save frequency")
    # memory related args
    parser.add_argument("--buffer_size", default=100000, type=int,
                        help="Replay memory buffer size")
    parser.add_argument("--look_back_steps", default=4, type=int,
                        help="How many previous pricing tables are fed into the RL agent")
    args = parser.parse_args()

    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Initialize the policy used for both training and evaluation.
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0.1, 1000000, args.num_actions)

    if not args.train:
        # Evaluate the model.
        # Check the model path.
        if args.model_path == "":
            print("Model path must be set when evaluating")
            exit(1)

        # Specific log file in which to save the result.
        log_file = os.path.join(args.log_dir, args.network_name, str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name, str(args.model_num))

        with tf.Session() as sess:
            # load model
            # with open(model_dir + ".json", 'r') as json_file:
            #     loaded_model_json = json_file.read()
            #     q_network_online = model_from_json(loaded_model_json)
            #     q_network_target = model_from_json(loaded_model_json)
            #
            # sess.run(tf.global_variables_initializer())
            #
            # # load weights into model
            # q_network_online.load_weights(model_dir + ".h5")
            # q_network_target.load_weights(model_dir + ".h5")

            driver_sim = DriverSim()
            env = Environment(driver_sim=driver_sim)

            memory = ReplayMemory(args.buffer_size, args.look_back_steps)
            q_network = create_model(args.look_back_steps, args.map_shape, args.num_actions)
            dqn_agent = DQNAgent(q_network=q_network,
                                 memory=memory,
                                 policy=policy,
                                 gamma=args.gamma,
                                 target_update_freq=args.target_update_freq,
                                 num_burn_in=args.num_burn_in,
                                 train_freq=args.train_freq,
                                 batch_size=args.batch_size)
        exit(0)

    # Train the model.
    with tf.Session() as sess:
        # with tf.device('/cpu:0'):
        print("created model")
        driver_sim = DriverSim()
        env = Environment(driver_sim=driver_sim)
        print("set up environment")

        # # create output dir; meant to raise an error when the dir exists to avoid overwriting
        # os.mkdir(args.output + "/" + args.network_name)

        memory = ReplayMemory(args.buffer_size, args.look_back_steps)
        q_network = create_model(args.look_back_steps, args.map_shape, args.num_actions)
        dqn_agent = DQNAgent(q_network=q_network,
                             memory=memory,
                             policy=policy,
                             gamma=args.gamma,
                             target_update_freq=args.target_update_freq,
                             num_burn_in=args.num_burn_in,
                             train_freq=args.train_freq,
                             batch_size=args.batch_size)
        print("defined dqn agent")

        optimizer = Adam(learning_rate=args.alpha)
        q_network.compile(optimizer, mean_huber_loss)
        sess.run(tf.global_variables_initializer())

        print("initializing environment")
        env.reset()

        print("in fit")
        if os.path.exists(args.output):
            shutil.rmtree(args.output)
        os.mkdir(args.output)

        dqn_agent.fit(env=env,
                      num_iterations=args.num_iterations,
                      output_dir=os.path.join(args.output),
                      max_episode_length=args.max_episode_length)
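# The iLOCuS script compiles its Q-network with `mean_huber_loss`, which is
# imported from elsewhere in the project. A minimal TF1-style sketch is given
# below, assuming the usual Huber formulation with the clipping threshold set
# to the script's --max_grad default of 1.0; the real implementation may differ.
import tensorflow as tf


def mean_huber_loss(y_true, y_pred, max_grad=1.0):
    # Quadratic for small errors, linear for large ones, averaged over the batch.
    error = tf.abs(y_true - y_pred)
    quadratic = 0.5 * tf.square(error)
    linear = max_grad * (error - 0.5 * max_grad)
    return tf.reduce_mean(tf.where(error <= max_grad, quadratic, linear))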
                     load_weights_file=args.weights)
memory = ReplayMemory(maxlen=1000000)
processor = AtariProcessor()

if args.test:
    policy = MaxQPolicy()
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   model=model,
                   discount_rate=0.99,
                   processor=processor)
    dqn.play()
else:
    policy = EpsilonPolicy(epsilon_max=1.0, epsilon_min=0.1, decay_steps=1250000)
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   batch_size=32,
                   model=model,
                   discount_rate=0.99,
                   processor=processor,
                   weights_filename='./pacman.h5')
    dqn.fit(num_steps=4000000,
            start_train=1000,
            learn_every=4,
            update_target_model=5000,
            save_every=1000)
def main():
    parser = argparse.ArgumentParser(description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(80, 100), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon', default=0.1,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00001, help='Learning rate')
    parser.add_argument('--window_size', default=4, type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time', default=4, type=int, help='Number of steps in RNN')
    parser.add_argument('--num_actions', default=7, type=int, help='Number of actions')
    parser.add_argument('--batch_size', default=64, type=int,
                        help='Batch size for the training part')
    parser.add_argument('--num_iteration', default=500000, type=int,
                        help='Number of iterations to train')
    parser.add_argument('--eval_every', default=0.01, type=float,
                        help='What fraction of num_iteration to run between evaluations')
    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size, args.input_shape)

    online_model, online_params = create_model(
        args.window_size, args.input_shape, args.num_actions,
        'online_model', create_duel_q_network, trainable=True)
    target_model, target_params = create_model(
        args.window_size, args.input_shape, args.num_actions,
        'target_model', create_duel_q_network, trainable=False)
    update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    agent = DQNAgent(online_model, target_model, replay_memory, args.num_actions,
                     args.gamma, TARGET_UPDATE_FREQENCY, update_target_params_ops,
                     args.batch_size, args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # saving and loading networks
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # initialize replay buffer
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # start training:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
                sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" % (i, mean_max_Q1, mean_max_Q2,
                                                  reward_mean, reward_var,
                                                  reward_max, reward_min))
            # train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        # final evaluation:
        reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
            sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" % (i, mean_max_Q1, mean_max_Q2,
                                              reward_mean, reward_var,
                                              reward_max, reward_min))
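# The Gazebo script builds its networks with `create_duel_q_network`, defined
# elsewhere. The dueling architecture that name refers to splits the network
# into a state-value stream V(s) and an advantage stream A(s, a), combined as
# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). The sketch below shows only that
# combination step in TF1 style; the layer shapes and variable scoping are
# assumptions, not the repository's code.
import tensorflow as tf


def duel_head(features, num_actions, trainable=True):
    # features: a [batch, d] tensor produced by the convolutional/recurrent trunk.
    value = tf.layers.dense(features, 1, trainable=trainable)                 # V(s)
    advantage = tf.layers.dense(features, num_actions, trainable=trainable)   # A(s, a)
    # Subtracting the mean advantage keeps the V and A streams identifiable.
    q_values = value + advantage - tf.reduce_mean(advantage, axis=1, keepdims=True)
    return q_values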
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(84, 84), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon', default=0.1,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, help='Training learning rate')
    parser.add_argument('--window_size', default=4, type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='Batch size for the training part')
    parser.add_argument('--num_process', default=3, type=int,
                        help='Number of parallel environments')
    parser.add_argument('--num_iteration', default=20000000, type=int,
                        help='Number of iterations to train')
    parser.add_argument('--eval_every', default=0.001, type=float,
                        help='What fraction of num_iteration to run between evaluations')
    parser.add_argument('--is_duel', default=1, type=int,
                        help='Whether to use dueling DQN, 0 means no, 1 means yes')
    parser.add_argument('--is_double', default=1, type=int,
                        help='Whether to use double DQN, 0 means no, 1 means yes')
    parser.add_argument('--is_per', default=1, type=int,
                        help='Whether to use prioritized experience replay, 0 means no, 1 means yes')
    parser.add_argument('--is_distributional', default=1, type=int,
                        help='Whether to use distributional DQN, 0 means no, 1 means yes')
    parser.add_argument('--num_step', default=1, type=int,
                        help='Number of steps for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy', default=1, type=int,
                        help='Whether to use NoisyNet, 0 means no, 1 means yes')
    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)

    print('Environment: %s.' % (args.env,))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions,))
    env.close()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process, args.window_size,
                                         args.input_shape, NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE, args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size, args.input_shape)

    create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network
    create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model
    noisy = args.is_noisy == 1

    online_model, online_params = create_model_fn(args.window_size, args.input_shape,
                                                  num_actions, 'online_model',
                                                  create_network_fn, trainable=True,
                                                  noisy=noisy)
    target_model, target_params = create_model_fn(args.window_size, args.input_shape,
                                                  num_actions, 'target_model',
                                                  create_network_fn, trainable=False,
                                                  noisy=noisy)
    update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions, args.gamma,
                     UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY, update_target_params_ops,
                     args.batch_size, args.is_double, args.is_per, args.is_distributional,
                     args.num_step, args.is_noisy, args.learning_rate, RMSP_DECAY,
                     RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions, NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
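# The --num_step flag above enables multi-step DQN, where the bootstrap target
# uses an n-step discounted return instead of a single reward. The small helper
# below illustrates that accumulation; it is independent of the agent code above
# and the function name is made up for this example.
def n_step_return(rewards, bootstrap_value, gamma=0.99, done=False):
    """Sum of gamma^k * r_k, plus gamma^n * bootstrap_value if the episode continues."""
    ret = 0.0
    for k, r in enumerate(rewards):
        ret += (gamma ** k) * r
    if not done:
        ret += (gamma ** len(rewards)) * bootstrap_value
    return ret


# Example: a 3-step target with rewards [1, 0, 1] and a bootstrap max-Q of 2.5.
print(n_step_return([1.0, 0.0, 1.0], bootstrap_value=2.5, gamma=0.99))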
env.seed(123)
nb_actions = env.action_space.n

model = DQNModel(nb_actions=nb_actions).model
policy = EpsGreedyPolicy(eps_min=0.1, eps_max=1, eps_test=0.05, nb_steps=1000000)
memory = Memory(max_len=1000000)
processor = AtariProcessor()

dqn = DQNAgent(env, model, policy, memory, processor,
               gamma=0.99,
               batch_size=32,
               target_model_update_steps=10000,
               nb_episodes_warmup=500)

dqn.fit(nb_episodes=20000,
        action_repetition=1,
        save_weights=True,
        save_weights_step=1000,
        weights_folder='./',
        visualize=True)

# file = './weights.h5f'
# dqn.load_weights(file)

dqn.test(nb_episodes=10, visualize=True)
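# `AtariProcessor` above presumably performs the standard Atari preprocessing:
# grayscale conversion, downsampling to 84x84, and reward clipping. The class
# below is a hedged sketch of that idea using Pillow and NumPy, not the
# project's actual implementation.
import numpy as np
from PIL import Image


class SimpleAtariProcessor:
    def process_observation(self, observation):
        # RGB frame -> 84x84 grayscale uint8.
        img = Image.fromarray(observation).resize((84, 84)).convert('L')
        return np.asarray(img, dtype=np.uint8)

    def process_reward(self, reward):
        # Clip rewards to [-1, 1] as in the original DQN paper.
        return float(np.clip(reward, -1.0, 1.0))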
memory = ReplayMemory(maxlen=1000, game_over_bias=5)
processor = VoidProcessor()

if args.test:
    policy = MaxQPolicy()
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   model=model,
                   discount_rate=0.99,
                   processor=processor)
    dqn.play()
else:
    policy = EpsilonPolicy(epsilon_max=1.0, epsilon_min=0.05, decay_steps=10000)
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   batch_size=64,
                   model=model,
                   discount_rate=0.99,
                   processor=processor,
                   weights_filename='./cartpole.h5')
    dqn.fit(num_steps=20000,
            start_train=1000,
            learn_every=1,
            update_target_model=100,
            save_every=1000,
            max_episode_score=500)