env = PrepareAtariEnv(env_id, log_dir)

# Agent
agent = DQNAgent(config, env, log_dir, static_policy=False)

# Begin Interaction & Learning
episode_reward = 0
observation = env.reset()
for frame_idx in tqdm(range(1, config.MAX_FRAMES + 1)):
    # Prepare to explore
    eps = agent.epsilon_by_frame(frame_idx)

    # Explore or Exploit
    action = agent.get_action(observation, eps)
    agent.save_action(action, frame_idx)

    # Execute
    prev_observation = observation
    observation, reward, done, info = env.step(action)
    if done:
        observation = None

    # Learn
    agent.update(prev_observation, action, reward, observation, frame_idx)
    episode_reward += reward

    # Episode End: start a new episode and reset the per-episode return
    if done:
        observation = env.reset()
        episode_reward = 0
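The decay schedule behind agent.epsilon_by_frame is not shown in the snippet above. A common choice is an exponential decay toward a floor value; the sketch below illustrates what such a helper might look like, with the parameter names eps_start, eps_final, and eps_decay being illustrative assumptions rather than fields of the actual config:

import math

def epsilon_by_frame(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=30000):
    # Decay exploration exponentially from eps_start toward eps_final.
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)

With eps_decay = 30000, the exploration rate sits halfway between the two endpoints after roughly 21k frames (ln 2 * 30000).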
import random

import numpy as np
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

# SummaryWriter may instead come from tensorboardX; the project-specific
# NqubitEnvDiscrete and DQNAgent are assumed to be defined elsewhere.


def DQN_Exploration(args, log_dir, device, initial_state):
    env = NqubitEnvDiscrete(args.nbit, initial_state)  # env.get_easy_T() remains to be done
    agent = DQNAgent(args, env, log_dir, device)
    writer = SummaryWriter(log_dir)
    Temp = args.Temp
    totalstep = 0
    epsilon = 1.0

    obs = env.reset()
    print('initial_reward: {0}'.format(env.get_current_threshold(obs)))

    for episode in tqdm(range(args.num_episodes)):
        # Anneal the temperature geometrically once per episode
        Temp = Temp * 10.0**(-0.1)
        obs = env.reset()
        for step in tqdm(range(args.episode_length)):
            # Choose a large-stepsize action (action is an int)
            action = agent.get_action(obs, epsilon)
            # Execute the large stepsize if it satisfies the strong constraint
            next_obs, reward, done, info = env.step(obs, action, args.action_delta)

            # Judge the effect of the large stepsize; if ep = 0 the large stepsize is useless
            ep, action_delta = agent.prob(obs, next_obs, action)
            accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
            u = random.random()
            if u <= accept_probability:
                # Accept: take a small stepsize
                next_obs, reward, done, info = env.step(obs, action, action_delta)
            else:
                # Reject: no operation, the transition becomes (obs, 0, reward, obs)
                action = 0
                next_obs, reward, done, info = env.step(obs, action, action_delta)

            # Record the reward and the transition
            writer.add_scalar('threshold_rew', reward, totalstep)
            agent.buffer.push((obs, action, reward, next_obs))

            if (totalstep > args.learn_start_steps) and (totalstep % args.update_freq == 0):
                loss = agent.update()
                writer.add_scalar('loss', loss, totalstep)
                epsilon = agent.epsilon_by_step(totalstep)
                if epsilon < args.epsilon_min:
                    epsilon = args.epsilon_min

            obs = next_obs
            totalstep += 1
            if reward >= -1.0:
                return reward, obs

            # Test the DQN agent periodically with a greedy policy (epsilon = 0)
            if totalstep % args.test_freq == 0:
                test_epsilon = 0.0
                test_obs = env.reset()
                reward_recorder = -2.0
                obs_recorder = test_obs
                for test_step in range(args.test_step):
                    test_action = agent.get_action(test_obs, test_epsilon)
                    # Execute the large stepsize
                    test_next_obs, reward, done, info = env.step(
                        test_obs, test_action, args.action_delta)
                    # Judge the effect of the large stepsize
                    ep, action_delta = agent.prob(test_obs, test_next_obs, test_action)
                    accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
                    u = random.random()
                    if u <= accept_probability:
                        # Accept: take a small stepsize
                        test_next_obs, reward, done, info = env.step(
                            test_obs, test_action, action_delta)
                    else:
                        # Reject: keep the current observation
                        test_action = 0
                        test_next_obs = test_obs
                        reward = env.get_current_threshold(test_obs)

                    if reward > reward_recorder:
                        reward_recorder = reward
                        obs_recorder = test_next_obs
                    if reward >= -1.0:
                        return reward, test_obs

                    agent.buffer.push((test_obs, test_action, reward, test_next_obs))
                    test_obs = test_next_obs

                writer.add_scalar('test_max_reward', reward_recorder, totalstep)
                # Log the six components of the best observation found during the test rollout
                writer.add_scalars(
                    'solution', {
                        's0': obs_recorder[0],
                        's1': obs_recorder[1],
                        's2': obs_recorder[2],
                        's3': obs_recorder[3],
                        's4': obs_recorder[4],
                        's5': obs_recorder[5]
                    }, totalstep)
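The accept/reject logic in both loops above is a Metropolis-style criterion: a candidate move with a positive score difference ep is always taken, while a non-improving move is taken with probability exp(ep / Temp). The standalone sketch below restates that rule in isolation; the helper name metropolis_accept is illustrative and not part of the function above.

import math
import random

def metropolis_accept(ep, temp):
    # Improving moves (ep > 0) are always accepted.
    if ep > 0:
        return True
    # Non-improving moves are accepted with probability exp(ep / temp);
    # ep <= 0 here, so the probability lies in (0, 1] and shrinks as temp decreases.
    return random.random() <= math.exp(ep / temp)

Because Temp is multiplied by 10**(-0.1) (about 0.79) every episode, the acceptance probability for non-improving moves falls over training, so late episodes behave almost greedily.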