Example #1
    env = PrepareAtariEnv(env_id, log_dir)

    # Agent
    agent = DQNAgent(config, env, log_dir, static_policy=False)

    # Begin Interaction & Learning
    
    episode_reward = 0
    observation = env.reset()

    for frame_idx in tqdm(range(1, config.MAX_FRAMES+1)):
        # Prepare to explore
        eps = agent.epsilon_by_frame(frame_idx)

        # Explore or Exploit
        action = agent.get_action(observation, eps)
        agent.save_action(action, frame_idx)

        # Execute
        prev_observation = observation
        observation, reward, done, info = env.step(action)

        if done:
            # Mark the terminal transition with a None next-state
            observation = None

        # Learn
        agent.update(prev_observation, action, reward, observation, frame_idx)
        episode_reward += reward

        # Episode End
        if done:
            observation = env.reset()
            episode_reward = 0
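
A minimal sketch of the exploration schedule behind agent.epsilon_by_frame(frame_idx): the original implementation is not shown above, so the exponential decay below and its parameter names and defaults (eps_start, eps_final, eps_decay) are assumptions, not the agent's actual config fields.

import math

def epsilon_by_frame(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=30000):
    # Anneal epsilon exponentially from eps_start towards eps_final,
    # so exploration shrinks as more frames are collected.
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)
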
Example #2
import random

import numpy as np
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter  # assumption: tensorboardX.SummaryWriter would also work

# DQNAgent and NqubitEnvDiscrete are project-specific classes; their imports are omitted here.


def DQN_Exploration(args, log_dir, device, initial_state):
    env = NqubitEnvDiscrete(args.nbit,
                            initial_state)  # TODO: env.get_easy_T()
    agent = DQNAgent(args, env, log_dir, device)
    writer = SummaryWriter(log_dir)

    Temp = args.Temp
    totalstep = 0
    epsilon = 1.0
    obs = env.reset()
    print('initial_reward: {0}'.format(env.get_current_threshold(obs)))

    for episode in tqdm(range(args.num_episodes)):
        # Geometric cooling: shrink the temperature by a factor of 10**(-0.1) each episode
        Temp = Temp * 10.0**(-0.1)
        obs = env.reset()

        for step in tqdm(range(args.episode_length)):

            # Choose an action index for the large stepsize
            action = agent.get_action(obs, epsilon)
            # action is a plain int

            # Execute the action with the large stepsize if it satisfies the strong constraint
            next_obs, reward, done, info = env.step(obs, action,
                                                    args.action_delta)
            #agent.buffer.push((obs, action, reward, next_obs))

            # Judge the effect of the large stepsize:
            # if ep is not positive, the large step brought no improvement
            ep, action_delta = agent.prob(obs, next_obs, action)

            # Metropolis-style acceptance: always accept an improvement,
            # otherwise accept with probability exp(ep / Temp)
            accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
            u = random.random()

            if u <= accept_probability:  # take a small stepsize
                #agent.buffer.push((obs, action, reward, next_obs))

                next_obs, reward, done, info = env.step(
                    obs, action, action_delta)
            else:  # No operation, the transition will be (obs, 0, reward, obs)
                action = 0
                next_obs, reward, done, info = env.step(
                    obs, action, action_delta)

            # record
            writer.add_scalar('threshold_rew', reward, totalstep)

            agent.buffer.push((obs, action, reward, next_obs))

            if (totalstep > args.learn_start_steps) and (
                    totalstep % args.update_freq == 0):
                loss = agent.update()
                writer.add_scalar('loss', loss, totalstep)
                epsilon = agent.epsilon_by_step(totalstep)
                if epsilon < args.epsilon_min:
                    epsilon = args.epsilon_min

            obs = next_obs
            totalstep += 1
            # Early exit once the reward reaches the -1.0 target threshold
            if (reward >= -1.0):
                return reward, obs

            # Periodically evaluate the agent with a greedy (epsilon = 0) policy
            if (totalstep % args.test_freq == 0):
                test_epsilon = 0.0
                test_obs = env.reset()
                #T = env.get_easy_T(args.nbits)
                reward_recorder = -2.0
                obs_recorder = test_obs

                for step in range(args.test_step):
                    test_action = agent.get_action(test_obs, test_epsilon)

                    # Tentatively execute the action with the large stepsize
                    test_next_obs, reward, done, info = env.step(
                        test_obs, test_action, args.action_delta)

                    # Judge the effect of the large stepsize
                    ep, action_delta = agent.prob(test_obs, test_next_obs,
                                                  test_action)

                    # Metropolis-style acceptance, as in the training loop
                    accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
                    u = random.random()

                    if u <= accept_probability:  # take a small stepsize
                        action = test_action  # record the action that is actually executed
                        test_next_obs, reward, done, info = env.step(
                            test_obs, test_action, action_delta)
                    else:
                        action = 0
                        test_next_obs = test_obs
                        reward = env.get_current_threshold(test_obs)

                    if reward > reward_recorder:
                        reward_recorder = reward
                        obs_recorder = test_next_obs
                    if (reward >= -1.0):
                        return reward, test_next_obs  # return the state that achieved this reward

                    agent.buffer.push(
                        (test_obs, action, reward, test_next_obs))
                    test_obs = test_next_obs

                writer.add_scalar('test_max_reward', reward_recorder,
                                  totalstep)
                writer.add_scalars(
                    'solution', {
                        's0': obs_recorder[0],
                        's1': obs_recorder[1],
                        's2': obs_recorder[2],
                        's3': obs_recorder[3],
                        's4': obs_recorder[4],
                        's5': obs_recorder[5]
                    }, totalstep)
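
Both loops above accept the large-stepsize move with a Metropolis-style rule: an improvement (ep > 0) is always accepted, otherwise the move is accepted with probability exp(ep / Temp). Below is a minimal standalone sketch of that rule, assuming delta is the score difference returned by agent.prob and temp is the current (positive) temperature; the helper name metropolis_accept is hypothetical.

import math
import random

def metropolis_accept(delta, temp):
    # Always accept an improving move; otherwise accept with
    # probability exp(delta / temp), which shrinks as temp cools.
    if delta > 0:
        return True
    return random.random() <= math.exp(delta / temp)

With the geometric cooling Temp = Temp * 10.0**(-0.1) per episode, worsening moves become increasingly unlikely to be accepted as training progresses.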