def main(config, max_num_of_steps, max_num_of_episodes, load_model, save_model,
         load_memory, save_memory, log_path):
    """Run the DQN training loop with epsilon-greedy exploration.

    Optionally restores a model checkpoint and/or replay memory, then
    interacts with the (module-level) environment, appending transitions to
    the agent's replay memory and optimizing the Q-network once the memory
    holds at least ``config['replay_start_size']`` frames.  Loss/Q/reward
    summaries are written for TensorBoard every 1000 steps; the model is
    checkpointed every 10000 steps and the replay memory every 1e6 steps.

    Args:
        config: dict of agent hyperparameters (must contain
            'replay_start_size'; also consumed by get_epsilon).
        max_num_of_steps: stop training after this many optimization steps.
        max_num_of_episodes: stop training after this many episodes.
        load_model: checkpoint path to restore, or falsy to start fresh.
        save_model: path prefix for periodic/final model checkpoints.
        load_memory: replay-memory path to restore, or falsy.
        save_memory: path prefix for periodic/final replay-memory dumps.
        log_path: directory under which a timestamped TF summary dir is made.
    """
    agent = DQNAgent(config)
    with agent.graph.as_default():
        if load_model:
            step = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))
            screen_log.info("Start from step {}".format(step))
        else:
            step = 0
        if load_memory:
            agent.load_memory(load_memory)
            n_frames = len(agent.memory)
            screen_log.info("Load memory: {}".format(load_memory))
            screen_log.info("Memory size: {}".format(n_frames))
        # Log dir name: month/day/hour/minute/second, zero-padded.
        log_name = ('{:02}{:02}{:02}{:02}{:02}'.format(*time.localtime()[1:6]))
        summary_writer = tf.summary.FileWriter(
            logdir=os.path.join(log_path, '{}'.format(log_name)),
            graph=agent.graph)
        episode = 0
        rewards_per_episode = []
        sum_Qs = .0
        sum_losses = .0
        try:
            while (step < max_num_of_steps
                   and episode < max_num_of_episodes):
                episode += 1
                episode_done = False
                next_observation = reset_random_env()
                next_observation = preprocess_observation(next_observation)
                rewards_per_episode.append(0)
                while not episode_done:
                    observation = next_observation
                    if len(agent.memory) < config['replay_start_size']:
                        # Burn-in phase: fill the replay memory with random
                        # actions before any training happens.
                        action = env.action_space.sample()
                        next_observation, reward, episode_done, info = \
                            env.step(action)
                        next_observation = preprocess_observation(
                            next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward,
                                       episode_done, info))
                        continue
                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]
                    # Epsilon-greedy action selection.
                    epsilon = get_epsilon(config, step)
                    # BUG FIX: the original called
                    # np.random.RandomState().rand(), which builds a fresh,
                    # OS-seeded generator on every step — ignoring any global
                    # np.random.seed() and defeating reproducibility.  Use
                    # the module-level generator instead.
                    if np.random.rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)
                    next_observation, reward, episode_done, info = env.step(
                        action)
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward,
                                   episode_done, info))
                    step += 1
                    rewards_per_episode[-1] += reward
                    sum_Qs += Qs[action]
                    # One optimization step on a sampled minibatch.
                    loss, loss_summary_str = agent.optimize_Q()
                    summary_writer.add_summary(loss_summary_str, step)
                    sum_losses += loss
                    if step % 1000 == 0:
                        # NOTE: these are running averages over ALL steps so
                        # far (divided by `step`), not per-interval averages.
                        ave_loss = sum_losses / step
                        ave_reward = np.mean(rewards_per_episode)
                        ave_Q = sum_Qs / step
                        [Q_summary_str, reward_summary_str
                         ] = agent.evaluate(ave_reward, ave_Q)
                        summary_writer.add_summary(Q_summary_str, step)
                        summary_writer.add_summary(reward_summary_str, step)
                        screen_log.info(
                            'step: {}, ave. loss: {:g}, '
                            'ave. reward: {:g}, ave. Q: {:g}'.format(
                                step, ave_loss, ave_reward, ave_Q,
                            ))
                    if step % 10000 == 0:
                        agent.save_model(save_model, step)
                    if step % 1000000 == 0:
                        agent.save_memory(save_memory, step)
        except KeyboardInterrupt:
            print("\nUser interrupted training...")
        finally:
            # Always flush summaries and persist the latest state, even on
            # Ctrl-C or an unexpected error.
            summary_writer.close()
            agent.save_model(save_model, step)
            agent.save_memory(save_memory, step)
            screen_log.info(
                'Finished: the number of steps {}, the number of episodes {}.'.
                format(step, episode))
def main():
    """Parse CLI args and train a (Rainbow-style) DQN on an Atari env.

    Builds online/target networks according to the --is_duel /
    --is_distributional / --is_noisy flags, chooses plain or prioritized
    replay via --is_per, then alternates evaluation and training in chunks
    of ``num_iteration * eval_every`` steps.
    """
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env', default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    # NOTE(review): passing --input_shape on the command line yields a string,
    # and tuple(<str>) below would produce a tuple of characters; only the
    # default works as intended — confirm before exposing this flag.
    parser.add_argument('--input_shape', default=(84, 84), help='Input shape')
    # BUG FIX: --gamma/--epsilon/--learning_rate lacked type=float, so any
    # command-line value arrived as a str and broke arithmetic downstream.
    # Defaults were already floats, so adding type=float is backward-compatible.
    parser.add_argument('--gamma', default=0.99, type=float,
                        help='Discount factor')
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, type=float,
                        help='Training learning rate.')
    parser.add_argument('--window_size', default=4, type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_process', default=3, type=int,
                        help='Number of parallel environment')
    parser.add_argument('--num_iteration', default=20000000, type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every', default=0.001, type=float,
        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel', default=1, type=int,
                        help='Whether use duel DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_double', default=1, type=int,
        help='Whether use double DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_per', default=1, type=int,
        help='Whether use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_distributional', default=1, type=int,
        help='Whether use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step', default=1, type=int,
                        help='Num Step for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy', default=1, type=int,
                        help='Whether use NoisyNet, 0 means no, 1 means yes.')
    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)

    print('Environment: %s.' % (args.env, ))
    # Probe the env once just to read the action count, then close it; the
    # actual rollouts run inside BatchEnvironment.
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    # Seed every RNG source we use for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process,
                                         args.window_size, args.input_shape,
                                         NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE,
                                                 args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                     args.input_shape)

    create_network_fn = (create_deep_q_network if args.is_duel == 0
                         else create_duel_q_network)
    create_model_fn = (create_model if args.is_distributional == 0
                       else create_distributional_model)
    noisy = True if args.is_noisy == 1 else False
    online_model, online_params = create_model_fn(
        args.window_size, args.input_shape, num_actions, 'online_model',
        create_network_fn, trainable=True, noisy=noisy)
    target_model, target_params = create_model_fn(
        args.window_size, args.input_shape, num_actions, 'target_model',
        create_network_fn, trainable=False, noisy=noisy)
    # Ops that copy each online parameter into the matching target parameter.
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]
    agent = DQNAgent(online_model, target_model, replay_memory, num_actions,
                     args.gamma, UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size, args.is_double,
                     args.is_per, args.is_distributional, args.num_step,
                     args.is_noisy, args.learning_rate, RMSP_DECAY,
                     RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # Make target_model equal to online_model before training starts.
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions,
                                          NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train, interleaving evaluation every `fit_iteration` steps.
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
def main():
    """Parse CLI args and train a duel double-DQN in a Gazebo simulation.

    Restores weights from ./saved_networks when a checkpoint exists,
    burns in the replay memory, then alternates evaluation and training in
    chunks of ``num_iteration * eval_every`` steps, saving trainable
    variables after each chunk.
    """
    parser = argparse.ArgumentParser(
        description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(80, 100),
                        help='Input shape')
    # BUG FIX: --gamma/--epsilon/--learning_rate lacked type=float, so any
    # command-line value arrived as a str and broke arithmetic downstream.
    # Defaults were already floats, so adding type=float is backward-compatible.
    parser.add_argument('--gamma', default=0.99, type=float,
                        help='Discount factor')
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00001, type=float,
                        help='learning rate')
    parser.add_argument('--window_size', default=4, type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time', default=4, type=int,
                        help='Number of steps in RNN')
    parser.add_argument('--num_actions', default=7, type=int,
                        help='Number of actions')
    parser.add_argument('--batch_size', default=64, type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_iteration', default=500000, type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every', default=0.01, type=float,
        help='What fraction of num_iteration to run between evaluations')
    args = parser.parse_args()

    # Seed every RNG source we use for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                 args.input_shape)
    online_model, online_params = create_model(
        args.window_size, args.input_shape, args.num_actions, 'online_model',
        create_duel_q_network, trainable=True)
    target_model, target_params = create_model(
        args.window_size, args.input_shape, args.num_actions, 'target_model',
        create_duel_q_network, trainable=False)
    # Ops that copy each online parameter into the matching target parameter.
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]
    agent = DQNAgent(online_model, target_model, replay_memory,
                     args.num_actions, args.gamma, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size,
                     args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # Saving and loading networks: only trainable variables are persisted.
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

        # Make target_model equal to online_model before training starts.
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # Initialize replay buffer without training.
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Start training, interleaving evaluation every `fit_iteration` steps.
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = \
                agent.evaluate(sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(
                sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" %
                  (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
                   reward_max, reward_min))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        # Final evaluation after the last training chunk.
        reward_mean, reward_var, reward_max, reward_min, reward = \
            agent.evaluate(sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" %
              (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
               reward_max, reward_min))
# NOTE(review): this span is the tail of an enclosing script/function whose
# header (and the original construction of `env`) is not visible here —
# `env` already exists when it is re-wrapped below.  Code left unchanged;
# comments only.

# Wrap the environment: penalize death (factor 2) and reshape observations.
env = RewardNegativeDeath(env, death_factor=2)
env = ObservationReshape(env)
# Create the agent: a small network sized to the (flat) observation vector
# and the discrete action count.
model = CartpoleNetwork(learning_rate=LEARNING_RATE,
                        discount_factor=DISCOUNT_FACTOR,
                        input_shape=(env.observation_space.shape[0], ),
                        output_shape=env.action_space.n)
agent = DQNAgent(actions=env.action_space.n,
                 expl_max=EXPLORATION_MAX,
                 expl_min=EXPLORATION_MIN,
                 expl_decay=EXPLORATION_DECAY,
                 model=model,
                 memory_size=MEMORY_SIZE,
                 batch_size=BATCH_SIZE)
# Get and parse user args.
args = Parser.parseargs(defaultTrainIterations=10000, defaultEvalIterations=10)
# Each flag below is independent: load, then train, then evaluate, then save,
# then optionally render — any subset may run in a single invocation.
if args.load:
    agent.load(env, args.loadversion)
if args.train != 0:
    #agent.init_fill_memory(env, 50000)
    agent.train(env, args.train, train_s=1, save_i=MODEL_SAVE_EVERY)
if args.eval != 0:
    print("Evaluation results (higher scores are better):")
    agent.evaluate(env, args.eval)
if args.save:
    agent.save(env, args.saveversion)
if args.render:
    agent.render_episode(env, random_action=args.renderrandom)
# Close env.
env.close()