Example #1
def main(args):
    # load env
    env = gym.make('CartPole-v0')
    # load agent
    agent = DQN(env)
    agent.construct_model(args.gpu)

    # load a saved model or initialize a new one
    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # evaluation loop (greedy policy, no learning)
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # sample actions
            action = agent.sample_action(state, policy='greedy')
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s Reward: %s' % (ep + 1, total_rewards))
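The DQN class, the module-level imports (gym, tensorflow as tf, os, matplotlib.pyplot as plt) and the argument parser are defined elsewhere in the original scripts; only the main functions are shown on this page. As a rough sketch, and purely as an assumption about how such a script is wired up (the flag names below are inferred from the args attributes used above, the defaults are invented), the entry point for Example #1 might look like:

import argparse

# gym, tensorflow and the DQN class are assumed to be imported/defined at
# module level, as in the original script.

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ep', type=int, default=10,
                        help='number of episodes to run')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU index to use, -1 for CPU only')
    parser.add_argument('--model_path', type=str, default=None,
                        help='path of a saved checkpoint to restore')
    main(parser.parse_args())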
Example #2
def main(args):
    set_random_seed(args.seed)

    env = gym.make("CartPole-v0")
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load a pre-trained model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute the action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # shaped reward to speed up learning: small bonus per step, -1 on failure
            reward = 0.1 if not done else -1
            # learn and update network parameters
            agent.learn(state, action, reward, next_state, done)

            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(rewards_history[-1] * 0.9 +
                                   ep_rewards * 0.1)

        # decay epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon -
                              args.final_epsilon) / args.max_ep

        # evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.max_episode_steps):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save the model if it outperforms the previous best
            if best_mean_rewards is None or (current_mean_rewards >=
                                             best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
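agent.sample_action(state, policy=...) is never shown in these snippets; judging from how it is called ('egreedy' while training, 'greedy' while evaluating) it presumably switches between epsilon-greedy exploration and a purely greedy policy. A minimal sketch of such a method, under the assumption that the agent exposes a TensorFlow session, a Q-value output tensor and an epsilon attribute (the names q_output and state_input below are invented, not taken from the original class), could be:

import numpy as np

def sample_action(self, state, policy='egreedy'):
    # Hypothetical method of the (unshown) DQN class.
    # 'egreedy': with probability epsilon pick a random action, else act greedily.
    # 'greedy': always pick the action with the highest estimated Q-value.
    q_values = self.sess.run(self.q_output,
                             feed_dict={self.state_input: [state]})[0]
    if policy == 'egreedy' and np.random.rand() < self.epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))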
Example #3
def main(args):
    set_random_seed(args.seed)
    env = gym.make('CartPole-v0')
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load a pre-trained model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # Execute the action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # Shaped reward to speed up learning: small bonus per step, -1 on failure
            reward = 0.1 if not done else -1
            # Learn and update network parameters
            agent.learn(state, action, reward, next_state, done)

            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(
                rewards_history[-1] * 0.9 + ep_rewards * 0.1)
        # Decay epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save the model if it outperforms the previous best
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    # plot training rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
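In both of the last two examples, restoring a model also recovers the episode counter and the best mean reward by parsing the checkpoint file name, which saver.save writes as the concatenation of args.save_path, the rounded mean reward, an underscore and the absolute episode number. A quick trace with a hypothetical checkpoint name shows how the two split() calls in the restore branch undo that naming:

# hypothetical path, assuming args.save_path was 'checkpoints/'
model_path = 'checkpoints/195.3_1200'
ep_base = int(model_path.split('_')[-1])                             # 1200
best_mean_rewards = float(model_path.split('/')[-1].split('_')[0])   # 195.3

Note that save_name is built by plain string concatenation, so this scheme only works if args.save_path ends with a path separator.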
Example #4
def main(args):
    # Hyperparameters
    MAX_EPISODE = 10000  # number of training episodes
    INITIAL_EPSILON = 0.5  # starting value of epsilon
    FINAL_EPSILON = 0.01  # final value of epsilon
    TEST_EPISODE = 100

    env = gym.make('CartPole-v0')
    agent = DQN(env, double_q=args.double)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=2)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # Training
    for ep in range(MAX_EPISODE):
        state = env.reset()

        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # Execute the action
            next_state, reward, done, debug = env.step(action)
            # Shaped reward to speed up learning: small bonus per step, -1 on failure
            reward = 0.1 if not done else -1
            # Learn and update network parameters
            agent.learn(state, action, reward, next_state, done)

            state = next_state
            if done:
                break

        # Update epsilon
        if agent.epsilon > FINAL_EPSILON:
            agent.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / MAX_EPISODE

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(TEST_EPISODE):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            mean_rewards = total_reward / float(TEST_EPISODE)
            print('Episode:', ep + 1, ' Average Reward:', mean_rewards)
            print('Global steps:', agent.global_step)

            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base + ep + 1)
            saver.save(agent.sess, save_name)
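Example #4 is the only snippet that passes double_q=args.double to the agent, i.e. it can toggle Double DQN. The learning step itself is not part of these snippets; as a rough illustration of what the flag changes, and not the original implementation (the helper below and its argument names are invented), vanilla DQN bootstraps from the target network's own maximum, while Double DQN selects the action with the online network and evaluates it with the target network:

import numpy as np

def td_target(reward, done, q_online_next, q_target_next,
              gamma=0.99, double_q=True):
    # Hypothetical helper: q_online_next / q_target_next are the Q-value
    # vectors for next_state from the online and target networks.
    if done:
        return reward
    if double_q:
        best_action = int(np.argmax(q_online_next))          # select with online net
        return reward + gamma * q_target_next[best_action]   # evaluate with target net
    return reward + gamma * np.max(q_target_next)            # vanilla DQN target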