def accumulate_experience(teacher, exp_replay: Supervised_ExperienceReplay, config=student_config):
    """
    The teacher feeds the experience replay with new experiences.
    :param teacher: teacher net, knows how to solve the problem
    :param exp_replay: the experience replay where the teacher saves its experiences
    :param config: holds custom variables such as OBSERVE
    :return: an experience replay filled with new experiences
    """
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    steps = 0
    while True:
        state = env.reset()
        state = np.asarray(state)
        done = False
        while not done:
            steps += 1
            teacher_q_value = teacher.get_q(
                state=np.reshape(state, (1, state.shape[0], state.shape[1], state.shape[2])))
            action = teacher.select_action(teacher_q_value)
            # Pong uses actions 1 (up), 2 (stay) and 3 (down); the net outputs 0-2, so offset by 1.
            next_state, reward, done, _ = env.step(action + 1)
            next_state = np.asarray(next_state)
            exp_replay.add_memory(state, teacher_q_value, action)  # feed the experience replay
            state = next_state
            if steps > config.OBSERVE:  # we now have OBSERVE experiences in exp_replay
                env.close()
                return exp_replay
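# Hedged usage sketch (not part of the original file): let a pre-trained teacher fill the
# supervised replay buffer before distilling into a student. The TeacherAgent class, its
# checkpoint path and the Supervised_ExperienceReplay constructor arguments are assumptions.
teacher = TeacherAgent(model_path="checkpoints/teacher_pong")              # hypothetical teacher wrapper
exp_replay = Supervised_ExperienceReplay(size=student_config.memory_size)  # assumed signature
accumulate_experience(teacher, exp_replay, config=student_config)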
def evaluate(agent: DQNAgent, n_epoch=10, render=False):
    """
    Evaluate the agent.
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: set True to visualize the evaluation
    :return: mean score over the evaluated episodes
    """
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    final_score = []
    for e in range(n_epoch):
        state = env.reset()
        state = np.asarray(state)
        done = False
        epoch_reward = 0.0
        while not done:
            if render:
                env.render()
            q_values = agent.get_q(
                state=np.reshape(state, (1, state.shape[0], state.shape[1], state.shape[2])))
            action = agent.select_action(qValues=q_values, explore=False)
            # 1 for up, 2 for stay, 3 for down; the action is in 0-2, so we need an offset.
            next_state, reward, done, _ = env.step(action + 1)
            next_state = np.asarray(next_state)
            state = next_state
            epoch_reward += reward
        print("Episode {} / {} finished with reward {}".format(e, n_epoch, epoch_reward))
        final_score.append(epoch_reward)
    final_score = np.mean(final_score)
    env.close()
    return final_score
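# Hedged usage sketch: evaluate a trained agent over a few episodes and report the mean score.
# The DQNAgent constructor arguments shown here are assumptions, not the repo's actual API.
student = DQNAgent(model_path="checkpoints/student_pong")  # hypothetical construction
mean_reward = evaluate(student, n_epoch=10, render=False)
print("Mean reward over 10 evaluation episodes: {}".format(mean_reward))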
from timeit import default_timer as timer

from agents.DQN_NSTEP import Model
from utils.hyperparameters import Config
from utils.plot import plot, save_plot
from utils.wrappers import wrap_pytorch, make_atari, wrap_deepmind

# Agent name
agent_name = "DQN"
# Load the configuration
config = Config()
# Record the start time
start = timer()
# Use Pong as the environment
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env, frame_stack=False)
env = wrap_pytorch(env)
# Build the model
model = Model(env=env, config=config)
# Reward accumulated over the current episode
episode_reward = 0
# Initial observation of the episode
observation = env.reset()
# Maximum number of frames
# max_frames = int(config.MAX_FRAMES / 50)
max_frames = config.MAX_FRAMES
# Progress-report frequency: report once this many frames have been processed
# process_count = int(max_frames / 40)
process_count = int(max_frames / 2000)
# Time of the last progress report
process_time = 0
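# Hedged sketch (not the original loop) of how the bookkeeping variables above are typically
# used: run for max_frames frames and, every process_count frames, print progress and the time
# spent since the previous report. A random placeholder policy is used here; the real script
# would query `model` for actions and perform a learning step instead.
for frame_idx in range(1, max_frames + 1):
    action = env.action_space.sample()  # placeholder policy, assumption for the sketch
    observation, reward, done, _ = env.step(action)
    episode_reward += reward
    if done:
        observation = env.reset()
        episode_reward = 0
    if frame_idx % process_count == 0:
        elapsed = timer() - start
        print("{}: frame {}/{} ({:.1f}%), {:.1f}s since last report".format(
            agent_name, frame_idx, max_frames, 100.0 * frame_idx / max_frames,
            elapsed - process_time))
        process_time = elapsed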
def fit(logger, agent, target_agent, n_epoch, update=True):
    logger.info("Start : training agent")
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    if USE_PER:
        exp_replay = MultiStepPrioritizedExperienceReplay(
            size=dense_config.memory_size,
            gamma=agent.gamma,
            alpha=dense_config.ALPHA_PER)
    else:
        exp_replay = ExperienceReplayMultistep(
            size=dense_config.memory_size, gamma=agent.gamma)
    degradation = dense_config.steps_per_train / dense_config.EXPLORE
    agent.set_degradation(degradation)
    last_100_epochs_reward = np.zeros(100)
    total_steps = 0
    best_reward = -21.0
    i = 0
    for e in range(n_epoch):
        state = env.reset()
        state = np.asarray(state)
        print("agent epsilon : {}".format(agent.epsilon))
        done = False
        epoch_reward = 0.0
        while not done:
            total_steps += 1
            q_values = agent.get_q(
                state=np.reshape(state, (1, state.shape[0], state.shape[1], state.shape[2])))
            action = agent.select_action(qValues=q_values)
            # 1 for up, 2 for stay, 3 for down; the action is in 0-2, so we need an offset.
            next_state, reward, done, _ = env.step(action + 1)
            next_state = np.asarray(next_state)
            # Transitions are flushed into memory every steps_per_train steps.
            exp_replay.add_memory(
                state, action, reward, next_state, done,
                total_steps % dense_config.steps_per_train == 0)
            state = next_state
            epoch_reward += reward
            if total_steps % dense_config.steps_per_train == 0:
                agent.lower_epsilon()
            if total_steps < dense_config.OBSERVE:
                continue
            if total_steps % dense_config.steps_per_train == 0:
                train_on_batch(agent, target_agent, exp_replay, e)
            if update and total_steps % dense_config.UPDTATE_FREQ == 0:
                agent.save_model()
                target_agent.sync(agent_path=agent.model_path)
                print("Update target DQN")
        last_100_epochs_reward[e % 100] = epoch_reward
        if e < 100:
            if best_reward < epoch_reward:
                logger.info("Best Reward : episode {} / {}, reward {}".format(e, n_epoch, epoch_reward))
                best_reward = epoch_reward
            print("Episode {} / {} finished with reward {}".format(e, n_epoch, epoch_reward))
        else:
            mean_100_reward = sum(last_100_epochs_reward) / 100
            if best_reward < mean_100_reward:
                print("Best Reward : episodes {} to {}, with average reward of {}".format(
                    e - 100, e, mean_100_reward))
                best_reward = mean_100_reward
            print("Episode {} / {} finished with reward of {}; last 100 average reward is {}".format(
                e, n_epoch, epoch_reward, mean_100_reward))
            logger.info(
                "Episode {} / {} finished with reward of {}; last 100 average reward is {}".format(
                    e, n_epoch, epoch_reward, mean_100_reward))
            if mean_100_reward > 20.0:
                agent.save_model()
                logger.info("Goal achieved! Episodes {} to {}, with average reward of {}".format(
                    e - 100, e, mean_100_reward))
                i += 1
                if i % 5 == 0:
                    break
            else:
                i = 0
    env.close()
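# Hedged sketch of what train_on_batch (called above, defined elsewhere in the repo) is
# expected to do: sample a minibatch from the replay, bootstrap targets from the target
# network, and fit the online network. exp_replay.sample, agent.fit_on_batch, and get_q
# returning a NumPy array of shape (batch, n_actions) are assumptions; with the multi-step
# buffer the bootstrap term would be discounted by gamma ** n rather than plain gamma.
def train_on_batch_sketch(agent, target_agent, exp_replay, episode, batch_size=32):
    states, actions, rewards, next_states, dones = exp_replay.sample(batch_size)  # assumed API
    q_targets = agent.get_q(state=states)           # online estimates, patched per action below
    next_q = target_agent.get_q(state=next_states)  # bootstrap values from the target network
    for j in range(batch_size):
        q_targets[j, actions[j]] = rewards[j] if dones[j] else \
            rewards[j] + agent.gamma * np.max(next_q[j])
    agent.fit_on_batch(states, q_targets)           # assumed single gradient step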
def main(argv):
    env = gym.make(config.game_name)
    env = wrap_deepmind(env, config.episode_life, config.preprocess, config.max_and_skip,
                        config.clip_rewards, config.no_op_reset, config.scale)
    num_actions = env.action_space.n

    sess = tf.Session()
    agent = DQNAgent(sess=sess, num_actions=num_actions)
    sess.run(tf.global_variables_initializer())

    rewards = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')
    saver = tf.train.Saver()
    tf.summary.scalar('avg.reward/ep', tf.reduce_mean(rewards))
    tf.summary.scalar('max.reward/ep', tf.reduce_max(rewards))
    writer = tf.summary.FileWriter('logs_12_v4_allwrap_constant_lr', sess.graph)
    summary_merged = tf.summary.merge_all()

    episode_rewards = []
    batch_loss = []
    replay_buffer = ReplayBuffer()
    time_step = 0
    episode = 0
    total_reward_list = []

    # Schedulers for the exploration rate and the learning rate.
    e = e_scheduler()
    lr = lr_scheduler()

    while time_step < config.MAX_TIME_STEPS:
        done = False
        total_reward = 0
        '''
        frame --> 84 x 84 x 1
        state --> 84 x 84 x 4
        '''
        frame = env.reset()
        frame_scale = np.array(frame).astype(np.float32) / 255.0
        # For the very first frame there are no past frames yet, so initialize them with
        # (84 x 84) zero matrices.
        past_frames = np.zeros((config.height, config.width, agent.history_length - 1),
                               dtype=np.uint8)  # stored (uint8) version
        past_frames_scale = np.zeros((config.height, config.width, agent.history_length - 1),
                                     dtype=np.float32)  # scaled version used for training
        state = agent.process_state_into_stacked_frames(frame, past_frames, past_state=None)
        state_scale = np.array(state).astype(np.float32) / 255.0

        while not done:
            # Epsilon-greedy action selection; act randomly until the replay buffer warms up.
            if np.random.rand() < e.get() or time_step < config.REPLAY_START_SIZE:
                action = env.action_space.sample()
            else:
                action = agent.predict_action(state_scale)
            time_step += 1

            frame_after, reward, done, info = env.step(action)
            frame_after_scale = np.array(frame_after).astype(np.float32) / 255.0
            replay_buffer.add_experience(state, action, reward, done)

            if not done:  # the episode ends at +21 or -21
                # Append the newly received frame to the past state.
                state_after = agent.process_state_into_stacked_frames(
                    frame_after, past_frames, past_state=state)
                state_after_scale = np.array(state_after).astype(np.float32) / 255.0
                past_frames = np.concatenate((past_frames, frame_after), axis=2)
                past_frames = past_frames[:, :, 1:]
                past_frames_scale = np.array(past_frames).astype(np.float32) / 255.0
                # print(past_frames.shape)
                state = state_after
                state_scale = state_after_scale

            total_reward += reward

            # Training step.
            if time_step > config.REPLAY_START_SIZE and time_step % config.LEARNING_FREQ == 0:
                e.update(time_step)
                lr.update(time_step)
                b_state, b_action, b_reward, b_state_after, b_done = replay_buffer.sample_batch(
                    config.BATCH_SIZE)
                Q_of_state_after = agent.sess.run(
                    agent.target_Q, feed_dict={agent.target_state: b_state_after})
                # One-step TD targets: r for terminal transitions, r + gamma * max_a' Q_target otherwise.
                target_Q_p = []
                for i in range(config.BATCH_SIZE):
                    if b_done[i]:
                        target_Q_p.append(b_reward[i])
                    else:
                        target_Q_p.append(b_reward[i] +
                                          config.DISCOUNT_FACTOR * np.max(Q_of_state_after[i]))
                agent.sess.run(
                    [agent.train_step, agent.Q, agent.loss], {
                        agent.target_Q_p: target_Q_p,
                        agent.action: b_action,
                        agent.state: b_state,
                        agent.lr: lr.get()
                    })

            if time_step % config.target_UPDATE_FREQ == 0:
                agent.sess.run(agent.update_fn)

            if time_step % config.REWARD_RECORD_FREQ == 0 and len(total_reward_list) != 0:
                summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if time_step % config.MODEL_RECORD_FREQ == 0:
                saver.save(sess, 'model_12_v4_allwrap_constant_lr/dqn.ckpt',
                           global_step=time_step)

        # Per-episode bookkeeping, not related to training.
        episode += 1
        # For debugging
        if episode % 100 == 0:
            print('episode : %d score: %d' % (episode, total_reward))
        total_reward_list.append(total_reward)
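# Hedged sketch (hypothetical, not the repo's actual e_scheduler / lr_scheduler classes) of the
# scheduler interface used above: a value that linearly anneals from `start` to `end` over
# `anneal_steps` time steps via update(time_step) and is read back with get().
class LinearScheduler:
    def __init__(self, start, end, anneal_steps):
        self.start = start
        self.end = end
        self.anneal_steps = float(anneal_steps)
        self.value = start

    def update(self, time_step):
        # Clamp progress to [0, 1] so the value holds at `end` once annealing finishes.
        frac = min(max(time_step / self.anneal_steps, 0.0), 1.0)
        self.value = self.start + frac * (self.end - self.start)

    def get(self):
        return self.value

# Example (assumed settings): anneal epsilon 1.0 -> 0.1 over the first million steps,
# keep the learning rate constant.
# e = LinearScheduler(1.0, 0.1, 1000000)
# lr = LinearScheduler(0.00025, 0.00025, 1)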