Code Example #1
File: test.py Project: zoskia/RL-MsPacman
def test_model(model_path, max_steps):
    dqn = DQN()
    env = gym.make("MsPacman-v0")

    X_state = tf.placeholder(
        tf.float32, shape=[None, input_height, input_width, input_channels])
    online_q_values, online_vars = dqn.create_model(X_state, "qnetwork_online")
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, model_path)

        obs = env.reset()

        for step in range(max_steps):
            state = preprocess_observation(obs)

            # evaluate Q-values for the current state and act greedily
            q_values = online_q_values.eval(feed_dict={X_state: [state]})
            action = np.argmax(q_values)

            # step the environment with the chosen action
            obs, reward, done, info = env.step(action)
            env.render()
            time.sleep(0.05)
            if done:
                break
    env.close()
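
Example #1 calls preprocess_observation(), which is defined elsewhere in the project. A minimal sketch of what such an Atari frame preprocessor usually does, assuming input_height=88, input_width=80 and input_channels=1 (this sketch is an assumption, not the project's actual code):

import numpy as np

def preprocess_observation(obs):
    # hypothetical preprocessor: obs is the raw 210x160x3 MsPacman frame
    img = obs[1:176:2, ::2]        # crop the score area and downsample by 2 -> 88x80x3
    img = img.mean(axis=2)         # average the colour channels -> greyscale
    img = (img - 128) / 128.0      # scale pixel values roughly into [-1, 1]
    return img.reshape(88, 80, 1)  # add a channel axis to match input_channels=1
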
Code Example #2
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('Start evaluation!\n')
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
    env = gym.make('CartPole-v0').unwrapped  # unwrapped exposes the raw env; usually not needed here
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states,
                n_actions=n_actions,
                device="cpu",
                gamma=cfg.gamma,
                epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end,
                epsilon_decay=cfg.epsilon_decay,
                policy_lr=cfg.policy_lr,
                memory_capacity=cfg.memory_capacity,
                batch_size=cfg.batch_size)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps + 1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps + 1):
            action = agent.choose_action(state,
                                         train=False)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the smoothed (moving-average) reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    '''Save rewards and related results'''
    save_results(rewards,
                 moving_average_rewards,
                 ep_steps,
                 tag='eval',
                 result_path=RESULT_PATH)
    print('Evaluation complete!')
Code Example #3
def run_dqn(experiment_name):
    current_dir = pathlib.Path().absolute()
    directories = Save_paths(data_dir=f'{current_dir}/data', experiment_name=experiment_name)

    game = Winter_is_coming(setup=PARAMS['setup'])
    environment = wrappers.SinglePrecisionWrapper(game)
    spec = specs.make_environment_spec(environment)

    # Build the network.
    def _make_network(spec) -> snt.Module:
        network = snt.Sequential([
            snt.Flatten(),
            snt.nets.MLP([50, 50, spec.actions.num_values]),
        ])
        tf2_utils.create_variables(network, [spec.observations])
        return network

    network = _make_network(spec)

    # Setup the logger
    if neptune_enabled:
        agent_logger = NeptuneLogger(label='DQN agent', time_delta=0.1)
        loop_logger = NeptuneLogger(label='Environment loop', time_delta=0.1)
        PARAMS['network'] = f'{network}'
        neptune.init('cvasquez/sandbox')
        neptune.create_experiment(name=experiment_name, params=PARAMS)
    else:
        agent_logger = loggers.TerminalLogger('DQN agent', time_delta=1.)
        loop_logger = loggers.TerminalLogger('Environment loop', time_delta=1.)

    # Build the agent
    agent = DQN(
        environment_spec=spec,
        network=network,
        params=PARAMS,
        checkpoint=True,
        paths=directories,
        logger=agent_logger
    )
    # Try running the environment loop. We have no assertions here because all
    # we care about is that the agent runs without raising any errors.
    loop = acme.EnvironmentLoop(environment, agent, logger=loop_logger)
    loop.run(num_episodes=PARAMS['num_episodes'])

    last_checkpoint_path = agent.save()

    # Upload last checkpoint
    if neptune_upload_checkpoint and last_checkpoint_path:
        files = os.listdir(last_checkpoint_path)
        for f in files:
            neptune.log_artifact(os.path.join(last_checkpoint_path, f))

    if neptune_enabled:
        neptune.stop()

    do_example_run(game, agent)
Code Example #4
File: play.py Project: rlagent45/openaigym-practice
def display(env):
    if env == "MountainCar-v0":
        agent = DQN(2, 3, eps_max=0, load_path="models/" + env + "_model.h5")
        play(env, agent, train=False, render=True, episodes=1)
    else:
        agent = DQN(state_dim=(210, 160, 3),
                    n_actions=14,
                    eps_max=0,
                    load_path="models/" + env + "_model.h5")
        play(env, agent, train=False, render=True, episodes=1)
Code Example #5
def play(game):
    agent = DQN(game, use_saved=True)
    for i in tqdm(range(PLAY_GAMES)):
        game.new_episode()
        done = False
        while not done:
            state = game.get_state()
            img = state.screen_buffer
            action = agent.act(img)
            print(action)
            game.make_action(action)
            done = game.is_episode_finished()
Code Example #6
File: main.py Project: zxcayumi/leedeeprl-notes
def train(cfg):
    print('Start training!\n')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
    env = gym.make('CartPole-v0')
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma,
                epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end,
                epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr,
                memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps+1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.train_steps+1):
            action = agent.choose_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)  # store the transition in replay memory
            state = next_state  # move to the next state
            agent.update()  # update the network at every step
            if done:
                break
        # update the target network: copy all weights and biases from the policy network
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:', i_episode, ' Reward: %i' %
              int(ep_reward), 'n_steps:', i_step, 'done: ', done,' Explore: %.2f' % agent.epsilon)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the smoothed (moving-average) reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode',
                          ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    '''Save the model'''
    save_model(agent, model_path=SAVED_MODEL_PATH)
    '''Save rewards and related results'''
    save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path=RESULT_PATH)
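
The train() above ends with calls to save_model and save_results, which are defined elsewhere in the project. A minimal sketch of what those helpers could look like, assuming the agent exposes a policy_net and the results are plain Python lists (the implementations below are assumptions):

import os
import numpy as np
import torch

def save_model(agent, model_path):
    # assumed helper: persist the policy-network weights as checkpoint.pth
    os.makedirs(model_path, exist_ok=True)
    torch.save(agent.policy_net.state_dict(), os.path.join(model_path, 'checkpoint.pth'))

def save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path='./results/'):
    # assumed helper: dump the training curves as .npy files
    os.makedirs(result_path, exist_ok=True)
    np.save(os.path.join(result_path, tag + '_rewards.npy'), rewards)
    np.save(os.path.join(result_path, tag + '_moving_average_rewards.npy'), moving_average_rewards)
    np.save(os.path.join(result_path, tag + '_steps.npy'), ep_steps)

Example #2 loads the same checkpoint via agent.load_model(saved_model_path + 'checkpoint.pth'), which is why the file name above follows that convention.
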
Code Example #7
def train_dqn(episodes, env, render_frequency=0):
    now = datetime.datetime.now()
    id = f'{now.hour}{now.minute}'
    episode_rewards = []
    agent = DQN(env, params)
    best_score = 0
    for episode in range(episodes):
        rendering = render_frequency and episode % render_frequency == 0 and isinstance(
            env, HeadlessSnake)

        state = env.reset()  # Reset the environment before each episode to start fresh

        if rendering:
            renderer = Renderer(env, episode + 1)

        env.update_episode(episode + 1)
        # state = np.reshape(state, (1, env.state_space))
        total_reward = 0
        max_steps = 10000
        for step in range(max_steps):
            # 1. Find the next action using the epsilon-greedy exploration strategy
            action = agent.get_action(state)

            # 2. Perform the action in the environment
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            # next_state = np.reshape(next_state, (1, env.state_space))

            if rendering:
                renderer.update()

            # 3. Update the Q-function (train model)
            agent.remember(state, action, reward, next_state, done)
            agent.train_with_experience_replay()

            # 4. Adjust the exploration vs. exploitation probability
            agent.update_exploration_strategy(episode)
            state = next_state

            if done:
                print(
                    f'episode: {episode+1}/{episodes}, score: {total_reward}, steps: {step}, '
                    f'epsilon: {agent.epsilon}, highscore: {env.maximum}')
                save_model(id, agent, best_score, total_reward)
                break

        if rendering:
            renderer.bye()

        save_model(id, agent, best_score, total_reward)
        episode_rewards.append(total_reward)
    return episode_rewards
Code Example #8
def run(ep, train=False):
    pygame.init()
    loss = []
    agent = DQN(3, 5)
    env = pongGame()
    weights_filepath = 'PongGame.h5'
    if not train:
        agent.model.load_weights(weights_filepath)
        print("weights loaded")
    for e in range(ep):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                quit()
        state = env.reset()
        state = np.reshape(state, (1, 5))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = env.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, 5))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if train:
                agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, ep, score))
                break
        loss.append(score)
    if train:
        agent.model.save_weights("PongGame.h5")
    return loss
Code Example #9
def main(args):
    # load env
    env = gym.make('CartPole-v0')
    # load agent
    agent = DQN(env)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # training loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # sample actions
            action = agent.sample_action(state, policy='greedy')
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s  Reward: %s ' % (ep + 1, total_rewards))
Code Example #10
def train(game):
    agent = DQN(game)

    for i in tqdm(range(TRAIN_GAMES)):
        game.new_episode()
        previous_variables = None
        previous_img = None
        done = False
        local_history = []
        total_reward = 0
        while not done:
            state = game.get_state()

            img = state.screen_buffer
            variables = state.game_variables
            if previous_variables is None:
                previous_variables = variables
            if previous_img is None:
                previous_img = img

            action = agent.act(img)
            reward = game.make_action(action)
            done = game.is_episode_finished()
            reward = (reward + calculate_additional_reward(previous_variables, variables)) / 100
            total_reward += reward
            local_history.append([previous_img, img, reward, action, done])
            previous_variables = variables
            previous_img = img

        if total_reward >= 0:
            for previous_state, state, reward, action, done in local_history:
                agent.remember(previous_state, state, reward, action, done)
            agent.train()
Code Example #11
def main():
    get_env_version()
    cfg = DQNConfig(env="CartPole-v0", train_eps=200)
    # cfg = DQNConfig(env="MountainCar-v0", train_eps=500)
    get_env_information(env_name=cfg.env)
    env = gym.make(cfg.env)
    env.seed(0)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = DQN(state_dim, action_dim, cfg)
    rewards, smooth_rewards = train(cfg, env, agent)
    os.makedirs(cfg.result_path, exist_ok=True)  # avoid crashing if the directory already exists
    agent.save(path=cfg.result_path)
    save_results(rewards, smooth_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards,
                 smooth_rewards,
                 tag='train',
                 env=cfg.env,
                 algo=cfg.algo,
                 path=cfg.result_path)
Code Example #12
def test_dqn(env):
    agent = DQN(env, params)

    agent.load_model(sys.argv[1], sys.argv[2])

    state = env.reset()  # Reset the environment before each episode to start fresh
    state = np.reshape(state, (1, env.state_space))
    max_steps = 10000
    total_reward = 0

    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        state = np.reshape(next_state, (1, env.state_space))
        total_reward += reward
        time.sleep(0.1)
        if done:
            print(f'Score: {total_reward}, steps: {step}')
            break
    return
Code Example #13
File: evaluation.py Project: Funitus/reinforce_py
def main(args):
    # load env
    env = gym.make('CartPole-v0')
    # load agent
    agent = DQN(env)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # training loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # sample actions
            action = agent.sample_action(state, policy='greedy')
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s  Reward: %s ' % (ep+1, total_rewards))
Code Example #14
File: play.py Project: rlagent45/openaigym-practice
def train(env):
    if env == "MountainCar-v0":
        agent = DQN(2, 3)
        play(env, agent, until=195, ckpt=True)
    else:
        agent = DQN(state_dim=(210, 160, 3), n_actions=14)
        play(env, agent)
    agent.save_model(env)
Code Example #15
def main():
    sess = tf.Session(config=cf.tf_config)

    dqn = DQN(cf, sess)
    sess.run(tf.global_variables_initializer())

    if bool(args.e):
        dqn.evaluate(load_model=True)
    else:
        dqn.learn()

    sess.close()
Code Example #16
    def _build_layer(self):
        # ALL NEURONS ARE BINARY EXCEPT OUTPUT
        num_outputs = self.args['num_outputs'] if self.ID == self.args[
            'num_layers'] else 2

        for ID in range(self.out_shape):
            if self.args['neuron_type'] == 'PG':
                neuron = PG(args=self.args,
                            in_shape=self.in_shape,
                            ID=ID,
                            num_outputs=num_outputs)
            elif self.args['neuron_type'] == 'DQN':
                neuron = DQN(args=self.args,
                             in_shape=self.in_shape,
                             ID=ID,
                             num_outputs=num_outputs)
            else:
                neuron = Random(args=self.args,
                                in_shape=self.in_shape,
                                ID=ID,
                                num_outputs=num_outputs)

            self.neurons.append(neuron)
Code Example #17
File: training.py Project: zoskia/RL-MsPacman
def train_model():
    iteration = 0
    loss_val = np.infty
    game_length = 0
    total_max_q = 0
    mean_max_q = 0.0
    done = True
    state = []

    dqn = DQN()
    env = gym.make("MsPacman-v0")

    X_state = tf.placeholder(
        tf.float32, shape=[None, input_height, input_width, input_channels])

    online_q_values, online_vars = dqn.create_model(X_state, "qnetwork_online")
    target_q_values, target_vars = dqn.create_model(X_state, "qnetwork_target")

    copy_ops = [
        target_var.assign(online_vars[var_name])
        for var_name, target_var in target_vars.items()
    ]
    copy_online_to_target = tf.group(*copy_ops)

    X_action, global_step, loss, training_op, y = define_train_variables(
        online_q_values)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:

        restore_session(copy_online_to_target, init, saver, sess)

        while True:
            step = global_step.eval()
            if step >= n_steps:
                break

            iteration += 1
            print(
                "\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f}   "
                .format(iteration, step, n_steps, step * 100 / n_steps,
                        loss_val, mean_max_q),
                end="")

            state = skip_some_steps(done, env, state)

            done, q_values, next_state = evaluate_and_play_online_dqn(
                X_state, env, online_q_values, state, step)
            state = next_state

            mean_max_q = compute_statistics(done, game_length, mean_max_q,
                                            q_values, total_max_q)

            if iteration < training_start or iteration % training_interval != 0:
                continue

            loss_val = train_online_dqn(X_action, X_state, loss, sess,
                                        target_q_values, training_op, y)

            # Copy the online DQN to the target DQN
            if step % copy_steps == 0:
                copy_online_to_target.run()

            # Save model
            if step % save_steps == 0:
                saver.save(sess, checkpoint_path)
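
Example #17 depends on define_train_variables(online_q_values), which is not shown. A plausible sketch of the TF1-style training graph such a helper typically builds for a DQN (the learning rate and the exact loss are assumptions; only the return signature is taken from the call site above):

import tensorflow as tf

learning_rate = 0.001  # assumed hyperparameter

def define_train_variables(online_q_values):
    # placeholders for the actions that were played and the TD targets
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    n_outputs = int(online_q_values.shape[-1])

    # Q-value of the action that was actually taken in each transition
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keepdims=True)

    # mean squared TD error and the training operation
    loss = tf.reduce_mean(tf.square(y - q_value))
    global_step = tf.Variable(0, trainable=False, name="global_step")
    training_op = tf.train.AdamOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    return X_action, global_step, loss, training_op, y
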
Code Example #18
def run(
    agent_type="dqn",
    hidden_layer_size=32,
    gamma=1.0,
    min_epsilon=0.001,
    learning_rate=2.5e-4,
    env_name="CartPole-v0",
    num_episodes=3000,
    log_interval=100,
    replay_buffer_capacity=10**5,
    use_prioritized_experience_buffer=False,
    max_steps_per_episode=10000,
    batch_size=32,
    use_soft_update=False,
    online_update_period=1,
    target_update_tau=1,
    target_sync_period=100,
):
    env = gym.make(env_name)

    cfg = {
        "type": agent_type,
        "network": {
            "type": "dense",
            "hidden_layers": (hidden_layer_size, hidden_layer_size),
        },
        "gamma": gamma,
        "min_epsilon": min_epsilon
    }
    agent = DQN(
        cfg, 
        env.observation_space.shape, 
        env.action_space.n,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss_function=tf.keras.losses.MeanSquaredError(),
    )

    if use_prioritized_experience_buffer:
        buffer = PrioritizedReplayBuffer(
            size=replay_buffer_capacity, 
            alpha=0.6, 
            anneal_alpha_rate=1e-5, 
            anneal_beta_rate=1e-5
        )
    else:
        buffer = UniformReplayBuffer(size=replay_buffer_capacity)

    observer = [
        AverageObserver(log_interval), 
        MaximumObserver(log_interval)
    ]

    train(
        env, agent, buffer,
        num_episodes=num_episodes, 
        max_steps_per_episode=max_steps_per_episode,
        batch_size=batch_size,
        online_update_period=online_update_period,
        target_sync_period=target_sync_period,
        log_interval=log_interval,
        use_soft_update=use_soft_update,
        target_update_tau=target_update_tau,
        observer=observer
    )
Code Example #19
File: main.py Project: PhantomDot1/pr_project
def main():
    env = retro.make(game='Frogger-Genesis', use_restricted_actions=retro.Actions.DISCRETE)
    gamma = 0.99
    copy_step = 25
    num_actions = env.action_space.n
    num_states = len(env.observation_space.sample())
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    # For stable weights, use one net to train, and copy their weights over to the TargetNet every copy_steps
    TrainNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                   min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                   hidden_units=hidden_units, num_states=num_states)
    TargetNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                    min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                    hidden_units=hidden_units, num_states=num_states)

    # Loading check
    while True:
        if os.path.exists(save_dir):
            if input("\n\nWould you like to load the previous network weights? (y/n) ") == 'y':
                # load weights and copy to train net
                TargetNet.load_model(save_path)
                TrainNet.copy_weights(TargetNet)
                print("Loaded model weights...")
                break

            elif input("\nWould you like to delete the old checkpoints and start again? (y/n)") == 'y':
                shutil.rmtree(save_dir)
                print("Removed old checkpoint...")
                break
        else:
            break

    N = 50000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1

    # play N games
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()

        with summary_writer.as_default():
            tf.summary.scalar("episode reward", total_reward, step=n)
            tf.summary.scalar("running avg reward(100)", avg_rewards, step=n)

        if n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward,
                  "eps:", epsilon, "avg reward (last 100):", avg_rewards)

            # save the model weights
            TargetNet.save_model(save_path)

    print("avg reward for last 100 episodes:", avg_rewards)

    if create_video:
        make_video(env, TrainNet)

    env.close()
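
Example #19 relies on play_game(env, TrainNet, TargetNet, epsilon, copy_step), which is not included in the excerpt. A rough sketch of what one such episode loop typically does in this TF2 setup; the get_action, add_experience and train methods are assumptions, while copy_weights, save_model and load_model do appear above:

def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    # hypothetical helper: run one episode, training the online net as it plays
    rewards = 0
    done = False
    observations = env.reset()
    step = 0
    while not done:
        action = TrainNet.get_action(observations, epsilon)   # assumed API
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        TrainNet.add_experience({'s': prev_observations, 'a': action,
                                 'r': reward, 's2': observations, 'done': done})  # assumed API
        TrainNet.train(TargetNet)                             # assumed API
        step += 1
        if step % copy_step == 0:
            TargetNet.copy_weights(TrainNet)                  # method shown in the excerpt
    return rewards
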
Code Example #20
import matplotlib.pyplot as plt
from collections import deque

import gym
from gym.wrappers import Monitor
from agent import DQN, preprocess
import numpy as np

import gym_ple

if __name__ == '__main__':

    N_EP = 10000
    N_SAVE = 500
    env = gym.make('FlappyBird-v0')
    agent = DQN(env)
    scores = deque(maxlen=100)
    for i in range(N_EP):
        score = 0
        ob = env.reset()

        # Stack observations
        pre_ob = preprocess(ob)
        pre_ob = pre_ob.reshape(1, 100, 100)
        ob_stack = np.stack((pre_ob, ) * 4, -1)
        pre_ob = ob_stack

        while True:
            action = agent.act(pre_ob, step=i)

            ob, reward, done, _ = env.step(action)
Code Example #21
File: train_DQN.py Project: Funitus/reinforce_py
def main(args):
    set_random_seed(args.seed)
    env = gym.make('CartPole-v0')
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load a pretrained model or initialize a new one.
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # Execute the action.
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # Learn and Update net parameters
            agent.learn(state, action, reward, next_state, done)

            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(
                rewards_history[-1] * 0.9 + ep_rewards * 0.1)
        # Decay epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # Evaluate during training
        if ep % args.log_every == args.log_every-1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save the model if the current model outperforms the old one
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    # plot training rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
Code Example #22
def main(args):
    # Hyperparameters
    MAX_EPISODE = 10000  # training episode
    INITIAL_EPSILON = 0.5  # starting value of epsilon
    FINAL_EPSILON = 0.01  # final value of epsilon
    TEST_EPISODE = 100

    env = gym.make('CartPole-v0')
    agent = DQN(env, double_q=args.double)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=2)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # Training
    for ep in range(MAX_EPISODE):
        state = env.reset()

        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # Execute the action.
            next_state, reward, done, debug = env.step(action)
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # Learn and Update net parameters
            agent.learn(state, action, reward, next_state, done)

            state = next_state
            if done:
                break

        # Update epsilon
        if agent.epsilon > FINAL_EPSILON:
            agent.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / MAX_EPISODE

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(TEST_EPISODE):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            mean_rewards = total_reward / float(TEST_EPISODE)
            print('Episode:', ep + 1, ' Average Reward:', mean_rewards)
            print('Global steps:', agent.global_step)

            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base+ep+1)
            saver.save(agent.sess, save_name)
Code Example #23
            step += 1

        # end of game
        # print("game over")
        # print(observation, '\n')
        result.append(np.max(observation))


if __name__ == "__main__":
    GAME = chessboard("GAME")
    result = []

    RL = DQN(
        GAME.n_actions,
        GAME.n_features,
        learning_rate=0.01,
        reward_decay=0.93,
        e_greedy=0.93,
        replace_target_iter=100,
        memory_size=20000,
        # output_graph=True
    )

    run_game()
    RL.show_cost()
    import matplotlib.pyplot as plt
    plt.plot(np.arange(len(result)), result)
    plt.ylabel('max')
    plt.xlabel('games')
    plt.show()
Code Example #24
logger.addHandler(fh)


# Check whether cuda is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialise the game
env = gym.make('ChromeDino-v0')
# env = gym.make('ChromeDinoNoBrowser-v0')
env = make_dino(env, timer=True, frame_stack=True)

# Get the number of actions and the dimension of input
n_actions = env.action_space.n

# ----------- Nature DQN ---------------
dqn = DQN(n_actions, device)
dqn.train(env, logger)
# dqn.load("./trained/dqn.pkl")
# dqn.test(env)

# ----------- Prioritized DQN ---------------
# dqn_p = DQNPrioritized(n_actions, device)
# dqn_p.train(env, logger)
# dqn_p.load("./trained/dqn_p.pkl")
# dqn_p.test(env)


# ----------- Double DQN ----------------
# double_dqn = DoubleDQN(n_actions, device)
# double_dqn.train(env, logger)
# double_dqn.load("./trained/double-dqn.pkl")
Code Example #25
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen(env).to(device)
_, screen_ch, screen_height, screen_width = init_screen.shape
n_action = env.action_space.n

policy_net = DQN(screen_ch * args.num_frames, screen_height, screen_width, n_action).to(device)
target_net = DQN(screen_ch * args.num_frames, screen_height, screen_width, n_action).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)


steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
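
Example #25 is cut off in the middle of computing eps_threshold. It appears to follow the standard PyTorch epsilon-greedy pattern, which would continue roughly as below; the exponential decay term and the greedy/random branches are reconstructed assumptions, reusing policy_net, n_action and device from the snippet above:

import math
import random
import torch

def select_action(state):
    global steps_done
    sample = random.random()
    # exploration rate decays exponentially with the number of steps taken
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1.0 * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # exploit: pick the action with the highest predicted Q-value
            return policy_net(state).max(1)[1].view(1, 1)
    # explore: pick a uniformly random action
    return torch.tensor([[random.randrange(n_action)]], device=device, dtype=torch.long)
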
Code Example #26
}

# MountainCar hyper parameters
MountainCar_HYPER_PARAMETERS = {
    'MEM_REPLAY_SIZE': 150000,
    'BATCH_SIZE': 512,
    'GAMMA': 0.999,
    'EPS_START': 1,
    'EPS_END': 0.1,
    'EPS_DECAY': 1000,
    'EVALUATE_FREQUENCY': 1,
    'ALTER_TARGET_UPDATE_RATE': 0.999,
    'MAX_EPISODES': 1000
}

# Acrobot hyper parameters
Acrobot_HYPER_PARAMETERS = {
    'MEM_REPLAY_SIZE': 150000,
    'BATCH_SIZE': 128,
    'GAMMA': 0.999,
    'EPS_START': 1,
    'EPS_END': 0.1,
    'EPS_DECAY': 1000,
    'EVALUATE_FREQUENCY': 20,
    'ALTER_TARGET_UPDATE_RATE': 0.995,
    'MAX_EPISODES': 1000
}

DQN.train_model(MountainCar_HYPER_PARAMETERS, envs[2])
DDQN.train_model(MountainCar_HYPER_PARAMETERS, envs[2])
Code Example #27
SAVED_MODEL_PATH = 'D:/unity2017/water/ai/saved_model/'

# create the socket, bind the address and port, and set the listen backlog
listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
listener.bind(('127.0.0.1', 50213))
listener.listen(5)
print('Waiting for connect...')

while True:
    client_executor, addr = listener.accept()
    if addr != None:
        break

print('Accept new connection from %s:%s...' % addr)

agent = DQN(pretrained=True)
state = torch.zeros((150, 6), device=device, dtype=torch.float)
state[0][5] = 0.26
state[0][1] = 4.75
state = state.unsqueeze(0)
reward = 0
for i in range(6005):

    if i == 0:
        action = 50
        #action = torch.zeros((1),device=device,dtype=torch.float,requires_grad=False)
    else:
        action = agent.choose_action(state)
    msg = client_executor.recv(16384).decode('utf-8')
    client_executor.send(bytes(str(action / 10 - 5).encode('utf-8')))
Code Example #28
# main body

import gym
from agent import DQN, train
from wrapper import AtariWrapper
from replay import UniformReplayBuffer, PrioritizedReplayBuffer
from observer import AverageObserver, MaximumObserver

env = gym.make(config["env"]["name"])
if config["env"]["is_atari"]:
    env = AtariWrapper(env, **config["env"]["wrapper"])

agent = DQN(
    config["agent"],
    env.observation_space.shape,
    env.action_space.n,
)

if config["buffer"]["use_per"]:
    buffer = PrioritizedReplayBuffer(
        size = config["buffer"]["size"],
        alpha = config["buffer"]["alpha"],
        beta = config["buffer"]["beta"],
        anneal_alpha_rate = config["buffer"]["anneal_alpha_rate"],
        anneal_beta_rate = config["buffer"]["anneal_beta_rate"]
    )
else:
    buffer = UniformReplayBuffer(config["buffer"]["size"])

observer = []
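
Example #28 stops right after creating the empty observer list. A plausible continuation, mirroring the train(...) call in Example #18; the config["train"] keys below are hypothetical and only illustrate how the remaining parameters might be wired up:

# hypothetical continuation; the config["train"] keys are assumptions
observer.append(AverageObserver(config["train"]["log_interval"]))
observer.append(MaximumObserver(config["train"]["log_interval"]))

train(
    env, agent, buffer,
    num_episodes=config["train"]["num_episodes"],
    max_steps_per_episode=config["train"]["max_steps_per_episode"],
    batch_size=config["train"]["batch_size"],
    online_update_period=config["train"]["online_update_period"],
    target_sync_period=config["train"]["target_sync_period"],
    log_interval=config["train"]["log_interval"],
    use_soft_update=config["train"]["use_soft_update"],
    target_update_tau=config["train"]["target_update_tau"],
    observer=observer,
)
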
Code Example #29
def main(args):
    env = gym.make("CartPole-v0")

    if args.seed >= 0:
        random_seed(args.seed)
        env.seed(args.seed)

    agent = DQN(env, args)
    model = get_model(out_dim=env.action_space.n, lr=args.lr)
    agent.set_model(model)

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            # sample action
            action = agent.sample_action(state, policy="egreedy")
            # apply action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # train
            agent.train(state, action, reward, next_state, done)

            state = next_state
            if done:
                break

        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(rewards_history[-1] * 0.9 +
                                   ep_rewards * 0.1)

        # Decay epsilon
        if agent.epsilon > args.final_epsilon:
            decay = (args.init_epsilon - args.final_epsilon) / args.max_ep
            agent.epsilon -= decay

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    if args.render:
                        env.render()
                    action = agent.sample_action(state, policy="greedy")
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print("Episode: %d Average Reward: %.2f" %
                  (ep + 1, current_mean_rewards))

    # plot training rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel("steps")
    plt.ylabel("running avg rewards")
    plt.show()
Code Example #30
        _process_data = my_process_data

    x = input('''To train model: train,
        To test a trained model: test,
        To train on different dataset: d: ''')
    if x == 'd':
        dataset = input('Enter name of dataset as "example_dataset.csv": ')
        try:
            raw = preprocess(dataset)
        except Exception:
            print('Invalid dataset')
            raise
        actions = 2
        states = 7
        env = MyStocksEnv(raw, window_size=1, frame_bound=(1, 300))
        agent = DQN(actions, states, 100)
        all_rewards = agent.train(env, 1000)
    elif x == 'test':
        raw = preprocess()
        env = MyStocksEnv(raw, window_size=1, frame_bound=(1, 300))
        all_rewards = trained_test('dqn_model.h5', env)
    else:
        raw = preprocess()
        actions = 2
        states = 7
        env = MyStocksEnv(raw, window_size=1, frame_bound=(1, 300))
        agent = DQN(actions, states, 100)
        all_rewards = agent.train(env, 1000)

    if all_rewards != 0:
        print(all_rewards)
Code Example #31
def main(args):
    set_random_seed(args.seed)

    env = gym.make("CartPole-v0")
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load pre-trained models or init new a model.
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execution action.
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # learn and Update net parameters
            agent.learn(state, action, reward, next_state, done)

            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(rewards_history[-1] * 0.9 +
                                   ep_rewards * 0.1)

        # decay epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon -
                              args.final_epsilon) / args.max_ep

        # evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.max_episode_steps):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save model if current model outperform the old one
            if best_mean_rewards is None or (current_mean_rewards >=
                                             best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
Code Example #32
print('Number of agents:', len(env_info.agents))
# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)
# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)


# Train Agent ##################################################################

from agent import DQN, Double_DQN
from training import train_agent
agent = DQN(state_size=state_size, action_size=action_size, seed=0)


def train(n_episodes=100, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
"""
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores