Code example #1
def main_DQN():
    env = AgarioEnv(render=RENDER,
                    speed_scale=SPEED_SCALE,
                    display_text=DISPLAY_TEXT,
                    grid_resolution=GRID_RESOLUTION)
    agent = DQNAgent(height=GRID_RESOLUTION,
                     width=GRID_RESOLUTION,
                     input_channels=2,
                     num_actions=ACTION_DISCRETIZATION,
                     loadpath='')
    # env.seed(41)
    # agent.seed(41)
    for episode in range(NUM_EPISODES):
        state = env.reset()
        done = False
        new_state = None
        reward = 0
        num_steps = 0
        while not done:
            raw_action = agent.get_action(state)
            action = agent.action_to_angle(raw_action)
            for _ in range(NUM_SKIP_FRAMES):
                if RENDER:
                    env.render()
                new_state, reward, done, _ = env.step(action)
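            # note: only the final skipped frame's state and reward are kept;
            # rewards from intermediate frames are discarded rather than summed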
            num_steps += 1
            # print(f'step = {num_steps}')
            if done or num_steps > MAX_STEPS:
                new_state = None
                done = True
            agent.memory.push(state, raw_action, new_state, reward)
            agent.optimize()
            if done:
                print(f'Episode {episode} done, max_mass = {state.mass}')
                agent.max_masses.append(state.mass)
                agent.print_final_stats()
            if num_steps % agent.TARGET_UPDATE == 0:
                # print(f'UPDATING TARGET')
                agent.target_net.load_state_dict(agent.policy_net.state_dict())
            state = new_state
        if episode % WEIGHTS_SAVE_EPISODE_STEP == 0:
            torch.save(
                agent.policy_net.state_dict(),
                f'DQN_weights/model_{episode}_{str(datetime.now()).replace(" ", "_")}_episodes.model'
            )
            np.savetxt(
                f'DQN_weights/model_{episode}_{str(datetime.now()).replace(" ", "_")}_episodes.performance',
                np.array(agent.max_masses))
    print('Complete')
    torch.save(
        agent.policy_net.state_dict(),
        f'model_{NUM_EPISODES}_{str(datetime.now()).replace(" ", "_")}_episodes.model'
    )
    np.savetxt(
        f'DQN_weights/model_{NUM_EPISODES}_{str(datetime.now()).replace(" ", "_")}_episodes.performance',
        np.array(agent.max_masses))
    agent.print_final_stats()
    env.close()
Code example #2
File: main.py Project: logar16/LunarLander
def setup(config: str, load_file: str) -> DQNAgent:
    if config:
        loader = AgentLoader(config, num_actions=num_actions, num_inputs=num_inputs)
        agent = loader.load()
    else:
        agent = DQNAgent(num_actions=num_actions, num_inputs=num_inputs)
    if load_file:
        print(f'Loading "{load_file}"...')
        agent.load(load_file)
    return agent
Code example #3
File: main.py Project: PeterParser/RLAcrobot
def start_training_dqn(is_prioritized):
    if is_prioritized:
        prio = "with_priority"
    else:
        prio = "no_priority"

    env = gym.make(hyperparams['environment'])
    state_spec = len(env.observation_space.sample())
    action_spec = env.action_space.n
    log_name = 'final_build' + prio
    log_dir = 'logs/acrobot/' + log_name

    log_writer = tf.summary.create_file_writer(log_dir)

    epsilon = hyperparams['epsilon']
    buffer = PrioritizedReplay(
        hyperparams['max_experiences']) if is_prioritized else UniformReplay(
            hyperparams['max_experiences'])

    agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec, action_spec,
                     buffer, hyperparams['learning_rate_dqn'], is_prioritized)

    total_rewards = np.empty(hyperparams['episodes'])
    for episode in range(hyperparams['episodes']):
        episode_reward = 0
        epsilon = max(hyperparams['min_epsilon'],
                      epsilon * hyperparams['decay'])
        done = False
        state = env.reset()
        while not done:

            action = agent.play_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            buffer.add((state, action, reward, next_state, done))
            state = next_state

            if len(buffer.experiences) > hyperparams['min_experiences']:
                agent.train(hyperparams['gamma'], hyperparams['batch_size'])

        total_rewards[episode] = episode_reward
        avg_rewards = total_rewards[max(0, episode - 20):(episode + 1)].mean()
        env.reset()

        with log_writer.as_default():
            tf.summary.scalar('episode reward', episode_reward, step=episode)
            tf.summary.scalar('avg for 20 episodes', avg_rewards, step=episode)
    agent.network.save_weights('dqn_{}_network.h5'.format(prio))
    env.close()
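
A side note on the replay classes used above: PrioritizedReplay and UniformReplay are project-specific, but the calls buffer.add((state, action, reward, next_state, done)) and len(buffer.experiences) pin down a small interface. A minimal uniform buffer matching that interface might look like the following sketch (class and method names are inferred from the calls, not taken from the project):

import random
from collections import deque


class UniformReplay:
    """Minimal uniform experience replay buffer (sketch)."""

    def __init__(self, max_experiences):
        # a deque evicts the oldest experience once the buffer is full
        self.experiences = deque(maxlen=max_experiences)

    def add(self, experience):
        # experience is a (state, action, reward, next_state, done) tuple
        self.experiences.append(experience)

    def sample(self, batch_size):
        # uniform sampling; assumes len(self.experiences) >= batch_size
        return random.sample(self.experiences, batch_size)
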
Code example #4
File: main.py Project: logar16/LunarLander
def print_progress(agent: DQNAgent, data: dict):
    percent = data['percent']
    progress = '=' * int(percent)
    progress += '>'
    left = ' ' * (100 - percent)
    progress = f'{percent}% [{progress + left}]'

    reward, steps = data['stats']
    mean = round(reward.mean(), 1)
    std = round(reward.std(), 1)
    positive = reward[reward > 0].size
    total = reward.size
    steps = steps.sum()
    losses = data['losses']

    if total > 50:
        graph(reward, verbose=True)
        plt.savefig(f'figures/{run_id}_training.png')
        if len(losses) > 10:
            graph(losses.detach().numpy(), xlabel='Replays', ylabel='Loss', window=5)
            plt.savefig(f'figures/{run_id}_losses.png')
    # print(progress + f'  μ: {mean}, σ: {std}; +{positive}/{total}, steps: {steps}', end='\r')
    # if percent % 5 != 0:
    #     return
    last100 = reward[-100:]
    last_mean = round(last100.mean(), 2)
    last_std = round(last100.std(), 1)
    verbose = data['verbose']

    if percent % 2 == 0 and last_mean > 200:
        print(' ' * 100, end='\r')
        if verbose:
            print('Last 100 episodes average over 200! ', end='')
        agent.save(f'{run_id}_{percent}p', str(round(last_mean, 0)))

    # rar = f'rar: {round(data["rar"], 5)}' if verbose else ''
    # Spaces at the end are to clean up the progress bar
    print(f'Total mean: {mean}, std: {std};  '
          f'Last 100 mean: {last_mean}, std: {last_std};  '
          f'Positive: {positive}/{total}  '
          f'Steps: {steps}  ',
          # rar,
          " " * 20)
    if verbose:
        if len(losses) > 1:
            mean = round(losses.mean().item(), 3)
            std = round(torch.std(losses).item(), 3)
            print(f'Recent Losses: {losses[-5:]}, mean: {mean}, std: {std}')
    print(progress, end='\r')
Code example #5
def main(num_episodes, render=False):
    # initialize gym environment and the agent
    # env = gym.make('SpaceInvaders-v0')
    env = gym.make('Breakout-v0')
    state = env.reset()
    state_shape = list(state.shape)
    state_shape[-1] = state_shape[-1] * 5
    agent = DQNAgent(state_shape, env.action_space.n)

    states = deque(maxlen=5)

    max_train_time = 800

    # Iterate the game
    for e in range(num_episodes):
        # reset the state at the beginning of each game
        state = env.reset()
        for i in range(5):
            states.appendleft(state)
        # time_t represents each frame of the game
        num_random = 0
        total_reward = 0.
        for time_t in range(max_train_time):
            # turn this on if you want to render
            if render:
                env.render()
            # Decide action
            action = agent.act(states)
            if agent.acted_randomly:
                num_random += 1
            # Advance the game to the next frame based on the action.
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            # Remember the previous state, action, reward, and done
            agent.remember(states.copy(), action, reward, next_state, done)
            # make next_state the new current state for the next frame.
            states.appendleft(next_state)
            # done becomes True when the game ends
            if done:
                # print the score and break out of the loop
                rand_perc = num_random / float(
                    time_t + 1) * 100.  # Percentage of random actions.
                print(
                    "episode: {}/{}, training_time: {}, summed_reward: {}, random_actions: {}%, eps: {}"
                    .format(e, num_episodes, time_t, total_reward, rand_perc,
                            agent.epsilon))
                # train the agent with the experience of the episode
                agent.replay(min(100, time_t))
                break
        # print("epsilon {}".format(agent.epsilon))
        if e % 1000 == 0:
            agent.save("./deep_q_model.h5")
            print("saved model")
Code example #6
File: player.py Project: katnoria/dqn
    def __init__(self):
        """Player implementation of dqn and random agents"""
        self.env = UnityEnvironment(
            file_name="../env/Banana_Linux_NoVis/Banana.x86_64")
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        # reset the environment
        env_info = self.env.reset(train_mode=False)[self.brain_name]
        # number of actions
        self.action_size = brain.vector_action_space_size
        # examine the state space
        state = env_info.vector_observations[0]
        state_size = len(state)

        self.agent = DQNAgent(state_size, self.action_size, seed=0)
        self.agent.local_network.load_state_dict(
            torch.load('../saved_models/dqn_banana_best.pth'))
Code example #7
File: test_dqn_agent.py Project: dohnala/GridWorld
def define_agent(self, width, height, num_actions):
    return DQNAgent(config=Config(num_actions=num_actions,
                                  encoder=OneHotEncoder(width, height),
                                  optimizer=AdamOptimizer(0.01),
                                  network=MLP(),
                                  policy=EpsilonGreedyPolicy(1, 0.01, 500),
                                  discount=0.95,
                                  capacity=100,
                                  batch_size=16))
Code example #8
def run_exp(cfg=None):
    logger = Logger(cfg)
    agent = DQNAgent(cfg)
    env = Env(cfg)
    trainer = Trainer(env, agent, cfg)

    cfg = cfg.exp
    n_training_steps = cfg.n_episodes // cfg.train_after
    global_step = 0
    state = env.reset()
    joint_angles = np.empty(cfg.n_episodes)
    for step in range(cfg.n_episodes):
        state = trainer.single_step(state)
        # agent training
        if global_step % cfg.train_after == (cfg.train_after - 1):
            print(f"step: {step}")
            print("Training agents")
            # fw model warmup phase of 2000 steps
            metrics_dict = agent.train(
                cfg.train_iv, cfg.train_fw,
                cfg.train_policy if global_step >= 2000 else False)
            logger.log_metrics(metrics_dict, global_step)
            logger.log_all_network_weights(agent.joint_agents[0], step)
            agent.decrease_eps(n_training_steps)

        # video logging
        if global_step % cfg.video_after == 0:
            print("logging video")
            vis, debug0, debug1 = trainer.record_frames(debug_cams=True)
            logger.log_vid_debug_cams(vis, debug0, debug1, global_step)

        # distractor toggling
        if global_step % cfg.toggle_table_after == cfg.toggle_table_after - 1:
            env.toggle_table()

        global_step += 1
        pos = env.get_joint_positions()[0]
        joint_angles[step] = pos

    joint_angles = np.degrees(-joint_angles)
    plt.hist(joint_angles, bins=20, range=(0, 170))
    plt.savefig(os.path.join("plots", "explored_angles.png"))
Code example #9
File: auto.py Project: logar16/LunarLander
    def on_progress(self, agent: DQNAgent, data):
        """
        After 1% of the total iterations is complete, the agent will call this function
        This is an opportunity to decide if it is time to quit early.
        """
        percent: int = data['percent']
        reward, steps = data['stats']
        rar = data['rar']

        if len(reward) >= 100:
            last100 = reward[-100:]
            mean = np.round(last100.mean())
            if mean >= 200:
                print("Successfully completed goal")
                self.success = True
                self.exit_early = True
                agent.end_training_early()
            elif mean >= 50 and percent % 5 == 0:
                print("\nGood performance found, saving checkpoint")
                epoch = int(self.episodes * percent / 100)
                agent.save(f'{self.id}', f'{epoch}_{mean}')

        if self.verbose and percent % 10 == 0:
            # TODO: Print additional info
            print(f"\n{percent}% "
                  f"\tTotal reward={round(reward.mean(), 3)}  "
                  f"steps={steps.sum()}  "
                  f"rar={round(rar, 3)}")
            # look at the last several episodes
            reward = reward[-self.percent_size:]
            print(f"\t\tRecent reward={round(reward.mean(), 3)},  "
                  f"max={round(reward.max(), 3)}")

        if self.verbose:
            print(f'{percent}% ... ', end="")
        else:
            progress = '=' * int(percent)
            progress += '>'
            left = ' ' * (100 - percent)
            print(f'{percent}% [{progress + left}]', end='\r')
Code example #10
def define_agent(self, width, height, num_actions):
    return DQNAgent(
        config=Config(
            num_actions=num_actions,
            encoder=LayerEncoder(width, height, treasure_position=True),
            optimizer=AdamOptimizer(0.001),
            network=CNN(hidden_units=[128]),
            policy=EpsilonGreedyPolicy(1, 0.01, 50000),
            discount=0.95,
            capacity=10000,
            batch_size=8,
            target_sync=100,
            double_q=True))
Code example #11
File: main.py Project: PeterParser/RLAcrobot
def test_model(model, is_ac):
    env = gym.make(hyperparams['environment'])
    state_spec = len(env.observation_space.sample())
    action_spec = env.action_space.n
    buffer = None
    is_prioritized = False
    if is_ac:
        agent = ActorCriticAgent(hyperparams['hidden_layer_actor'],
                                 hyperparams['hidden_layer_critic'],
                                 state_spec, action_spec,
                                 hyperparams['learning_rate_actor'],
                                 hyperparams['learning_rate_critic'])
        agent.actor_network.load_weights(model)

    else:
        agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec,
                         action_spec, buffer, hyperparams['learning_rate_dqn'],
                         is_prioritized)

        agent.network.load_weights(model)
    obs = env.reset()
    env.render()
    # Play 20 episodes
    for i in range(20):
        rewards = []
        while True:
            if is_ac:
                action = agent.play_action(obs)
            else:
                action = agent.play_action(obs, hyperparams['min_epsilon'])

            obs, reward, done, _ = env.step(action)
            env.render()
            rewards.append(reward)
            if done:
                print("Gathered {} reward".format(np.sum(rewards)))
                env.reset()
                break

    env.close()
Code example #12
            batch_size=64,
            num_batches=20,
            starts_learning=5000,
            discount=0.99,
            target_freq=10,
            verbose=True,
            print_every=10)
        '''

        agent = DQNAgent(action_set=[0, 1, 2],
                         reward_function=mountain_car_reward_function,
                         feature_extractor=MountainCarIdentityFeature(),
                         hidden_dims=[50, 50],
                         learning_rate=5e-4,
                         buffer_size=50000,
                         batch_size=64,
                         num_batches=100,
                         starts_learning=5000,
                         final_epsilon=0.02,
                         discount=0.99,
                         target_freq=10,
                         verbose=True,
                         print_every=10)

        _, _, rewards = live(agent=agent,
                             environment=env,
                             num_episodes=episodes,
                             max_timesteps=200,
                             verbose=True,
                             print_every=50)

        np.save(os.path.join(reward_path, file_name), rewards)
Code example #13
File: test.py Project: vub-ai-lab/MAL-seminar
import gym
import numpy as np
import tflearn as nn  # assumed: `nn` below matches the tflearn layers API


class TestAgent(object):
    def __init__(self, shape, n_actions):
        self.n_actions = n_actions
        self.db = ReplayDB(shape, 100)

    def select_action(self, obs):
        return np.random.choice(self.n_actions)

    def update(self, s, a, r, t):
        self.db.insert(s, a, r, t)


def create_mlp(inputs, n_out):
    net = nn.input_data(placeholder=inputs)
    net = nn.fully_connected(net, 25, activation='relu')
    net = nn.dropout(net, 0.4)
    net = nn.fully_connected(net, 25)
    net = nn.dropout(net, 0.4)
    net = nn.fully_connected(net, n_out, activation='linear')
    return net

if __name__ == "__main__":
    env = gym.make('MountainCar-v0')
    n_actions = env.action_space.n
    agent = DQNAgent(create_mlp, n_actions, env.observation_space.shape,
                     min_replay_size=10000, batch_size=64)
    exp = Experiment(agent, env)
    exp.run_epoch(1000000)
    print(agent.db.num_samples())
    print(agent.db.sample(10))
Code example #14
def main_DQN_plus_greedy():
    GREEDY_TOTAL_NUM_EPISODES = 1000
    GREEDY_NUM_EPISODES = GREEDY_TOTAL_NUM_EPISODES // 3
    env = AgarioEnv(render=RENDER,
                    speed_scale=SPEED_SCALE,
                    display_text=DISPLAY_TEXT,
                    grid_resolution=GRID_RESOLUTION)
    agent = DQNAgent(height=GRID_RESOLUTION,
                     width=GRID_RESOLUTION,
                     input_channels=2,
                     num_actions=ACTION_DISCRETIZATION,
                     loadpath='')
    greedy = Greedy()
    env.seed(41)
    agent.seed(41)
    for episode in range(GREEDY_TOTAL_NUM_EPISODES):
        state = env.reset()
        done = False
        new_state = None
        raw_action, action = None, None
        reward = 0
        num_steps = 0
        is_greedy_episode = episode < GREEDY_NUM_EPISODES
        while not done:
            if is_greedy_episode:
                action = greedy.get_action(state)
                raw_action = agent.angle_to_action(action)
                # print(f'angle: {action}, raw_action: {raw_action}')
            else:
                raw_action = agent.get_action(state)
                action = agent.action_to_angle(raw_action)
            for _ in range(NUM_SKIP_FRAMES):
                if RENDER:
                    env.render()
                new_state, reward, done, _ = env.step(action)
            num_steps += 1
            # print(f'step = {num_steps}')
            if done or num_steps > MAX_STEPS:
                new_state = None
                done = True
            agent.memory.push(state, raw_action, new_state, reward)
            agent.optimize()
            if done:
                print(
                    f'{"Greedy" if is_greedy_episode else "DQN"} episode done, max_mass: {state.mass}'
                )
                if not is_greedy_episode:
                    agent.max_masses.append(state.mass)
            if num_steps % agent.TARGET_UPDATE == 0:
                # print(f'UPDATING TARGET')
                agent.target_net.load_state_dict(agent.policy_net.state_dict())
            state = new_state
    print('Complete')
    torch.save(
        agent.policy_net.state_dict(),
        f'model_GREEDY_DQN_{GREEDY_TOTAL_NUM_EPISODES}_{str(datetime.now()).replace(" ", "_")}_episodes.model'
    )
    agent.print_final_stats()
    env.close()
Code example #15
def create_agent(self, config):
    from agents import DQNAgent
    agent = DQNAgent(num_actions=self.num_actions, num_inputs=self.num_inputs, config=config, **config)
    self.agent = agent
    self.current_config = config
    return agent
Code example #16
				action = np.random.randint(4)

			for i in range(self.action_repeat):
				reward = self.environment.act(action)
				total_score += reward
				self.environment.update_screen()


		return total_score


sess = tf.InteractiveSession()
counter = Counter(7000000)

replay_memory = ReplayMemory(1000000)
dqn_agent = DQNAgent((84,84,4), NATURE, 4, replay_memory, counter, tf_session=sess)
agent = EpsilonAgent(dqn_agent, 4, counter)
agi = AtariGameInterface('Breakout.bin', agent, replay_memory, counter)

# Create a Tensorboard monitor and populate with the desired summaries
tensorboard_monitor = TensorboardMonitor('./log', sess, counter)
tensorboard_monitor.add_scalar_summary('score', 'per_game_summary')
tensorboard_monitor.add_scalar_summary('training_loss', 'training_summary')
for i in range(4):
	tensorboard_monitor.add_histogram_summary('Q%d_training' % i, 'training_summary')

checkpoint_monitor = CheckpointRecorder(dqn_agent.dqn, replay_memory, counter, './checkpoints', sess)
agi.add_listener(checkpoint_monitor)
agi.add_listener(tensorboard_monitor)
dqn_agent.add_listener(tensorboard_monitor)
Code example #17
def train(classifier):
    lg = global_logger["lg"]

    if opt.agent == 'policy':
        agent = PolicyAgent()
    elif opt.agent == 'dqn':
        agent = DQNAgent()
    elif opt.agent == 'dqn_target':
        agent = DQNTargetAgent()
    elif opt.agent == 'actor_critic':
        agent = ActorCriticAgent()
    elif opt.agent == 'random':
        agent = RandomAgent()
    else:
        agent = DQNAgent()

    start_episode = 0

    # load old model
    file_name = opt.load_model_name
    if file_name != "":
        old_model = load_external_model(file_name)
        start_episode = int(file_name.split('/')[1])
        agent.load_policynetwork(old_model)

    game = Game()
    model = classifier()
    for episode in range(start_episode, opt.episodes):
        model.reset()
        game.reboot(model)
        print('##>>>>>>> Episode {} of {} <<<<<<<<<##'.format(episode, opt.episodes))
        terminal = False
        num_of_zero = 0

        state = game.get_state(model)
        first_log = True
        cum_reward = 0
        while not terminal:
            action = agent.get_action(state)
            reward, next_state, terminal = game.feedback(action, model)
            if not terminal:
                agent.update(state, action, reward, next_state, terminal)

            cum_reward += reward
            if (action == 1):
                print("> State {:2} Action {:2} - reward {:.4f} - performance {:.4f}".format(game.current_state, action, reward, game.performance))
                # print(state)
                step = 0 if first_log else game.queried_times
                timer(lg.scalar_summary, ("last_episode_performance", game.performance, step))
                first_log = False
            else:
                num_of_zero += 1

            del state
            state = next_state
            if terminal:
                agent.finish_episode(episode)
                break

        # Reset model
        model.reset()
        timer(model.train_model, (data["active"], opt.full_epochs))
        metrics = timer(model.performance_validate, (data["dev"],))

        lg.dict_scalar_summary('episode-validation', metrics, episode)
        lg.scalar_summary('episode-cum-reward', cum_reward, episode)
        lg.scalar_summary('performance', game.performance, episode)
        lg.scalar_summary('number-of-0-actions', num_of_zero, episode)
Code example #18
import gym
import numpy as np

from agents import QAgent, Agent, RandomAgent, DQNAgent

env = gym.make('LunarLander-v2')

num_episodes = 5000
print_every = 100  # assumed logging interval; the original extract used `print_evry` without defining it

agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

average_reward = []
for episode in range(num_episodes):
    rewards = []
    state = env.reset()

    while True:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        rewards.append(reward)
        agent.step(state, action, reward, next_state, done)
        state = next_state

        if done:
            average_reward.append(np.sum(rewards))
            break

    # monitor progress
    if episode % print_every == 0:
        reward_last_100 = int(np.mean(average_reward[-100:]))
        learning_rate = agent.scheduler.get_lr().squeeze()
Code example #19
def main():
    parser = argparse.ArgumentParser(description="-----[Agent tester]-----")
    parser.add_argument(
        '--agent',
        default='dqn',
        help='Type of reinforcement agent. (dqn | dqn_target | policy | actor_critic | random)')
    parser.add_argument('--env',
                        default='CartPole-v0',
                        help='Type of reinforcement env.')
    params = parser.parse_args()

    env = gym.make(params.env)
    env = env.unwrapped

    opt.actions = env.action_space.n
    opt.state_size = env.observation_space.shape[0]
    opt.hidden_size = 8
    opt.batch_size_rl = 32
    opt.cuda = False
    opt.reward_clip = True
    opt.gamma = 0.99
    opt.data_sizes = [opt.state_size]
    opt.learning_rate_rl = 0.01

    from agents import DQNAgent, DQNTargetAgent, PolicyAgent, ActorCriticAgent, RandomAgent
    if params.agent == 'policy':
        agent = PolicyAgent()
    elif params.agent == 'dqn':
        agent = DQNAgent()
    elif params.agent == 'dqn_target':
        agent = DQNTargetAgent()
    elif params.agent == 'actor_critic':
        agent = ActorCriticAgent()
    elif params.agent == 'random':
        agent = RandomAgent()
    else:
        agent = DQNAgent()

    print('\nCollecting experience...')
    for i_episode in range(4000):
        state = env.reset()
        state = torch.FloatTensor(state).view(1, -1)
        score = 0
        done = False
        while not done:
            env.render()
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            next_state = torch.FloatTensor(next_state).view(1, -1)

            if not done:
                agent.update(state, action, r, next_state, done)

            score += 1
            state = next_state

            if done:
                agent.finish_episode(i_episode)
                print('Ep: ', i_episode, '| Ep_r: ', round(score, 2))
                break
Code example #20
            state = next_state  # roll over the state to next time s

            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 or i_episode == n_episodes:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), filename)
            break
    return agent, scores


agent = DQNAgent(QNetworkDuellingCNN,
                 state_size,
                 action_size,
                 seed=0,
                 ddqn=True)
agent, scores = dqn(agent, 'duellingCNN.pth')

agent2 = DQNAgent(QNetworkCNN, state_size, action_size, seed=0, ddqn=True)
agent2, scores = dqn(agent2, 'CNN.pth')
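
QNetworkDuellingCNN above is project-specific, but dueling architectures share one idea: the head splits into a state-value stream and an advantage stream, recombined with the advantage mean subtracted so the two streams stay identifiable. A minimal PyTorch sketch of such a head (layer shapes are illustrative assumptions, not the project's code):

import torch.nn as nn


class DuelingHead(nn.Module):
    """Dueling decomposition: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)."""

    def __init__(self, feature_dim, action_size):
        super().__init__()
        self.value = nn.Linear(feature_dim, 1)
        self.advantage = nn.Linear(feature_dim, action_size)

    def forward(self, features):
        v = self.value(features)      # (batch, 1)
        a = self.advantage(features)  # (batch, action_size)
        # subtracting the mean advantage keeps V and A identifiable
        return v + a - a.mean(dim=1, keepdim=True)
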
Code example #21
File: test.py Project: maofeiyu/fx-drqn
#     filename = './data/' + cur + '_lag_' + str(lag) + '.csv'
#     df= pd.read_csv(filename).reset_index(drop = True)

if __name__ == '__main__':
    dqn_model_path = './AUDUSD/agents/20190526-174236/dqn|0.pt'

    np.random.seed(321)
    torch.manual_seed(123)

    env = ForexEnv(mode='eval')
    eps = 23
    rewards = []

    agent = DQNAgent(action_set=[0, 1, 2],
                     reward_function=functools.partial(Forex_reward_function),
                     feature_extractor=ForexIdentityFeature(),
                     hidden_dims=[10, 10],
                     test_model_path=dqn_model_path)

    for e in range(eps):
        observation_history, action_history = test(agent=agent,
                                                   environment=env,
                                                   max_timesteps=3600,
                                                   n=e)
        r = torch.sum(
            agent.get_episode_reward(observation_history, action_history))
        print('reward %.5f' % r)
        rewards.append(r)
        # print(action_history)
        if e == eps - 1:
            print(agent.get_episode_reward(observation_history, action_history))
Code example #22
    print("Double DQN {}, Duelling Architecture {}".format(
        args.double_dqn, args.duelling))

    # instantiate appropriate agent
    if (args.double_dqn is True) & (args.duelling is True):
        agent = DDQNAgent(state_size=37,
                          action_size=4,
                          model=DuelingQNetwork,
                          seed=0)
        agent_name = 'duel_ddqn'

    elif (args.double_dqn is True) & (args.duelling is False):
        agent = DDQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0)
        agent_name = 'ddqn'

    elif (args.double_dqn is False) & (args.duelling is True):
        agent = DQNAgent(state_size=37,
                         action_size=4,
                         model=DuelingQNetwork,
                         seed=0)
        agent_name = 'duel_dqn'

    else:
        agent = DQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0)
        agent_name = 'dqn'

    # Run simulation with specified agent
    print('Running simulation with {} agent'.format(agent_name))
    run(agent, agent_name)
    env.close()
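
The only difference between the DDQN and DQN agents above is how the bootstrap target is formed: the online network selects the next action and the target network evaluates it. A minimal PyTorch sketch of that target computation (function and tensor names are illustrative, not taken from the project):

import torch


def double_dqn_targets(online_net, target_net, rewards, next_states, dones, gamma):
    """Target: r + gamma * Q_target(s', argmax_a Q_online(s', a))."""
    with torch.no_grad():
        # online network chooses the action, target network evaluates it
        best_actions = online_net(next_states).argmax(dim=1, keepdim=True)
        next_q = target_net(next_states).gather(1, best_actions).squeeze(1)
    # zero the bootstrap term for terminal transitions
    return rewards + gamma * next_q * (1.0 - dones)
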
Code example #23
# Create a Keras pseudo-Huber loss function (a smooth approximation of the Huber loss)
def hubert_loss(y_true, y_pred):
    err = y_pred - y_true
    return K.mean(K.sqrt(1 + K.square(err)) - 1, axis=-1)
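# Strictly speaking this is the pseudo-Huber loss: sqrt(1 + err**2) - 1 behaves
# like 0.5 * err**2 for small errors and like |err| - 1 for large ones, so
# gradients stay bounded (as with the true Huber loss) while remaining smooth.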


## Instantiate the improved DQN agent
ragent = DQNAgent(
    name='FullDQNAgent-1',
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
    epsdecay=0.975,
    buffersize=500000,
    samplesize=32,
    minsamples=1000,
    gamma=0.99,
    update_target_freq=600,
    nnparams={  # Improved DQN setting
        'hidden_layers': [(50, 'relu'), (40, 'relu')],
        'loss': hubert_loss,
        'optimizer': Adam(lr=0.0005),
        'target_network': True
    })

# Create an experiment with the LunarLander env and improved DQN agent for 500 train/test episodes
exp = Experiment(env, ragent, logdir="../log", verbose=True, num_episodes=500)

# Training trials
exp.run(testmode=False)

# Test trials
Code example #24
ckpt_dir = Path(
    "C:/Users/kevin/OneDrive/Dokumente/Coding/reinforcement_learning/models")
log_dir = Path(
    "C:/Users/kevin/OneDrive/Dokumente/Coding/reinforcement_learning/logs")

# Tensorboard summary writer for logging
writer = tensorboard.SummaryWriter(log_dir=log_dir)

# Create DQN Agent
agent = DQNAgent(gamma=0.99,
                 epsilon=1,
                 lr=0.0001,
                 input_dims=(env.observation_space.shape),
                 n_actions=env.action_space.n,
                 mem_size=50000,
                 eps_min=0.1,
                 batch_size=32,
                 replace=1000,
                 eps_dec=0.99999,
                 chkpt_dir=ckpt_dir,
                 algo='DQNAgent',
                 env_name='PongNoFrameskip-v4')

# load models if already saved
if load_checkpoint:
    agent.load_models()

n_steps = 0
scores, eps_history, steps_array = [], [], []

# Play games
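
The eps_dec=0.99999 above decays epsilon very slowly. Assuming the agent applies it multiplicatively once per step (one common convention; a subtractive schedule would hit eps_min almost immediately with this value), a quick back-of-the-envelope check, not part of the original script, shows how long the decay takes:

import math

# solve 1.0 * 0.99999 ** n == 0.1 for n
steps_to_min = math.log(0.1) / math.log(0.99999)
print(round(steps_to_min))  # ~230257 steps before epsilon bottoms out at eps_min
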
Code example #25
# create environment
env = UnityEnvironment(file_name='./Banana.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# create DQN agent
osize = len(env_info.vector_observations[0])
asize = brain.vector_action_space_size
seed = 0
agent = DQNAgent(osize, asize, seed, BUFFERSIZE, GAMMA, EPSILON, DECAY, EPMIN,
                 MINIBATCHSIZE, LEARNRATE, TAU)

# log scores
reward_log = []
avg_log = []
avg_window = collections.deque(maxlen=AVG_WINDOW)

# verbosity
VERBOSE = True

# Train the agent
for ep_count in range(1, MAX_EPISODES):

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
Code example #26
import gym
from puckworld import PuckWorldEnv
from agents import DQNAgent
from utils import learning_curve

env = PuckWorldEnv()
agent = DQNAgent(env)
data = agent.learning(gamma=0.99,
                      epsilon=1,
                      decaying_epsilon=True,
                      alpha=1e-3,
                      max_episode_num=100,
                      display=False)
learning_curve(data,
               2,
               1,
               title="DQNAgent performance on PuckWorld",
               x_name="episodes",
               y_name="rewards of episode")
Code example #27
env = gym.make(environment)
action_space = env.action_space.n
observation_space = env.observation_space.shape

# create our own TF session to share across all the Keras/TensorFlow models we use
sess = tf.Session()

# Our models to solve the mountaincar problem.
agent = DQNAgent(sess,
                 action_space,
                 observation_space,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 replay_memory_size=replay_memory_size,
                 minimum_replay_memory=min_replay_memory,
                 epsilon_start=epsilon_start,
                 epsilon_end=min_epsilon,
                 discount=discount,
                 activation=activation,
                 optimizer=optimizer,
                 loss_function=loss_function,
                 dense_1=dense_1,
                 dense_2=dense_2)

# replay experience
replay_memory = agent.memory

# dynamic epsilon
if (dynamic_epsilon):
    average_last_hundred_rewards = np.full(100, -200, dtype=float)
    reward_array_index = 0
Code example #28
File: nav_sim.py Project: aribiswas/drlnd-navigation
# create environment
env = UnityEnvironment(file_name='./Banana.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# create DQN agent
osize = len(env_info.vector_observations[0])
asize = brain.vector_action_space_size
seed = 0
agent = DQNAgent(osize, asize, seed)

# load the weights from file
agent.Q.load_state_dict(torch.load('checkpoint.pth'))

# simulate smart agent
for i in range(NUM_SIMS):

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]

    for t in range(1, MAX_STEPS_PER_EPISODE):

        # get action from policy
        action = agent.get_action(state)
Code example #29
    "host": host,
    "body_style": "donkey",
    "body_rgb": (128, 128, 128),
    "car_name": "42AI Potato Qarnot",
    "font_size": 100,
    "racer_name": "DDQN",
    "country": "FR",
    "bio": "Learning to drive w DDQN RL",
    "guid": str(uuid.uuid4()),
    "max_cte": 10,
}

if __name__ == "__main__":
    env = gym.make(env_name, conf=config_Simulator)

    S3 = S3(config.config_NeuralPlayer.config_Datasets.S3_bucket_name)
    agent = DQNAgent(config=config_Agent, S3=S3)
    agent.config.epsilon = 0.1
    preprocessor = PreprocessingVannilla(
        config.config_NeuralPlayer.config_Preprocessing)

    env.reset()
    i = 0
    state, reward, done, infos = env.step([0, 0.1])
    while (i < 1000):
        processed_state = preprocessor.process(state)
        action = agent.get_action(processed_state)
        state, reward, done, infos = env.step(action)
        print(action, done, infos)
        i += 1
Code example #30
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=N_anneal)

# dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
#                processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
#                train_interval=4, delta_clip=1.)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               enable_double_dqn=True,
               processor=None,
               nb_steps_warmup=5 * episode_len,
               gamma=.90,
               target_model_update=100,
               train_interval=1,
               delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# dqn.fit(enviro, callbacks=None, nb_steps=1750000, log_interval=10000)
weights_filename = 'dqn_{}_weights.h5f'.format('PSF')
checkpoint_weights_filename = 'dqn_' + 'PSF' + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format('PSF')
callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
]
callbacks += [FileLogger(log_filename, interval=100)]
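
The extract ends right after assembling the callbacks. Under the keras-rl API that DQNAgent, FileLogger, and ModelIntervalCheckpoint come from, a plausible continuation (nb_steps mirrors the commented-out fit call above; the test settings are placeholders) would be:

dqn.fit(enviro, callbacks=callbacks, nb_steps=1750000, log_interval=10000)
dqn.save_weights(weights_filename, overwrite=True)
dqn.test(enviro, nb_episodes=10, visualize=False)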