Example #1
def main(args):

    with tf.Session() as sess:

        env = ArmEnv()
        # np.random.seed(int(args['random_seed']))
        # tf.set_random_seed(int(args['random_seed']))

        state_dim = env.state_dim
        action_dim = env.action_dim
        action_bound = max(env.action_bound)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())

        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                                   sigma=args['sigma'])

        # Build the saver only after the actor/critic variables exist;
        # tf.train.Saver() with no variables in the graph raises an error.
        saver = tf.train.Saver()

        train(sess, env, args, actor, critic, actor_noise)
        if args['save_model']:
            del_pre_model()
            saver.save(sess, './results/tf_model/model')

        # if args['use_gym_monitor']:
        #     env.close()
    sess.close()
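
# For reference: `main` reads `args` like a dict of hyperparameters. The keys
# below are exactly the ones consumed in this example and in Example #4; the
# values are hypothetical defaults, not taken from the original project.
if __name__ == '__main__':
    args = {
        'actor_lr': 0.0001,      # assumed default
        'critic_lr': 0.001,      # assumed default
        'gamma': 0.99,           # assumed default
        'tau': 0.001,            # assumed default
        'sigma': 0.2,            # OU-noise scale, assumed default
        'minibatch_size': 64,
        'buffer_size': 1000000,
        'random_seed': 1234,
        'max_episodes': 500,
        'max_episode_len': 200,
        'render_env': False,
        'save_model': True,
    }
    main(args)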
Example #2
from env import ArmEnv
from rl import DDPG

MAX_EPISODES = 900
MAX_EP_STEPS = 200
ON_TRAIN = False

# set env
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method (continuous)
rl = DDPG(a_dim, s_dim, a_bound)

steps = []
def train():
    # start training
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_r = 0.
        for j in range(MAX_EP_STEPS):
            env.render()

            a = rl.choose_action(s)

            s_, r, done = env.step(a)

            rl.store_transition(s, a, r, s_)
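            # The example is truncated here. A plausible continuation, following
            # the pattern visible in Examples #5 and #8; rl.memory_full() and
            # rl.learn() are assumed to exist with these names, and the print
            # format is illustrative only:
            ep_r += r
            if rl.memory_full():
                # learn only once the replay memory has been filled
                rl.learn()
            s = s_
            if done or j == MAX_EP_STEPS - 1:
                print('Ep: %i | ep_r: %.1f | steps: %i' % (i, ep_r, j))
                break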
Example #3
            "n_arms": args.arms,
            "max_ep": 1001,
            "max_step": 150,
            "soft_replace": True,
            "random_target": True,
            "tau": 0.001,
            "gamma": 0.8,
            "lr": 0.0001,
            "memory_capacity": 9000
        }

    # set env
    print(PARAMS)
    env = ArmEnv(n_arms=PARAMS["n_arms"],
                 random_goal=PARAMS["random_target"],
                 on_mouse=not PARAMS["training"],
                 show_fps=args.show_fps,
                 fps=args.fps)
    s_dim = env.state_dim
    a_dim = env.action_dim
    a_bound = env.action_bound

    # set RL method (continuous)
    rl = DDPG(
        a_dim,
        s_dim,
        a_bound,
        soft_replace=PARAMS["soft_replace"],
        tau=PARAMS["tau"],
        gamma=PARAMS["gamma"],
        lr=PARAMS["lr"],
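        # The call is truncated here; a plausible closing, assuming DDPG also
        # accepts the remaining PARAMS entry (hypothetical):
        memory_capacity=PARAMS["memory_capacity"],
    )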
Example #4
def train(sess, env, args, actor, critic, actor_noise):
    def eval_reward(env, actor, max_episode_len, episode_i):
        # Evaluate the actor's target network without exploration noise
        ep_num = 5
        ep_reward = 0
        for i in range(ep_num):
            # s=env.reset_to_value(rad_unit*i)
            s = env.reset()
            for k in range(max_episode_len):
                a = actor.predict_target(np.reshape(s, (1, actor.s_dim)))
                s2, r, terminal = env.step(a[0])
                ep_reward += r
                if terminal:
                    break
                s = s2
        ep_reward /= ep_num  # average over the evaluation episodes
        # print('Episodic Reward: %d, Elapsed time: %.4f' % (int(ep_reward), elapsed))
        print('episode: %d, episodic reward: %d' % (episode_i, ep_reward))
        return ep_reward

    def save_reward(lst, args):
        base_dir = './results/rewards/'
        time_stamp = time.strftime('%m%d__%H%M%S')
        base_dir += time_stamp
        os.makedirs(base_dir, exist_ok=True)
        save_file_name = os.path.join(base_dir, 'rwd.dat')
        dump_dict = {
            'rewards': lst,
            'actor_lr': args['actor_lr'],
            'critic_lr': args['critic_lr'],
            'sigma': args['sigma'],
        }
        # Use a context manager so the file is closed before the figure is saved.
        with open(save_file_name, 'wb') as f:
            pickle.dump(dump_dict, f, 1)
        plt.plot(lst)
        plt.title(time_stamp)
        plt.xlabel('Episodes')
        plt.ylabel('Average Reward')
        plt.ylim([-200, 0])
        fig_name = base_dir + '/reward_fig.png'
        plt.savefig(fig_name)
        print('Rewards successfully written!')

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    env_eval = ArmEnv()
    reward_list = []

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Add exploration noise to the actor's action
            #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                # summary_str = sess.run(summary_ops, feed_dict={
                #     summary_vars[0]: ep_reward,
                #     summary_vars[1]: ep_ave_max_q / float(j)
                # })

                # writer.add_summary(summary_str, i)
                # writer.flush()

                # print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                #         i, (ep_ave_max_q / float(j))))
                break
        eval_r = eval_reward(env_eval, actor, int(args['max_episode_len']), i)
        reward_list.append(eval_r)
    save_reward(reward_list, args)
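
# The OrnsteinUhlenbeckActionNoise class used above is not shown in the snippet.
# The sketch below is a common implementation of the OU process, included only
# as an assumed reference: it matches the call pattern in Example #1
# (mu, sigma, and a no-argument __call__) and assumes `import numpy as np`.
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # Start from x0 if given, otherwise from zero noise
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)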
Example #5
File: main.py Project: unasm/utils
# Mail         :    [email protected]
# Create Time  :    2017-12-04 23:36:17
############################################### 

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from env import ArmEnv
from rl import DDPG

MAX_EPISODES = 500
MAX_EP_STEPS = 200

env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound


rl = DDPG(a_dim, s_dim, a_bound)
for i in range(MAX_EPISODES):
    s = env.reset()
    for j in range(MAX_EP_STEPS):
        env.render()
        a = rl.choose_action(s)
        s_, r, done = env.step(a)
        rl.store_transition(s, a, r, s_)
        if rl.memory_full():
            rl.learn()
Example #6
    display.clear_output(wait=True)
    display.display(plt.gcf())
    plt.figure(figsize=(20,10))
    plt.clf()

    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(score)
    plt.plot(mean)
    plt.text(len(score)-1, score[-1], str(score[-1]))
    plt.text(len(mean)-1, mean[-1], str(mean[-1]))

if __name__ == '__main__':

    env = ArmEnv()

    params = {
        'gamma': 0.9,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 200, 
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 32,
        'state_space_dim': env.state_dim,
        'action_space_dim': env.action_dim,
        }
    agent = Agent(**params)

    score = []
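    # The snippet stops here. A plausible continuation of the bookkeeping that
    # feeds the plotting code above through `score` and `mean`; the Agent
    # method names act/store/learn are hypothetical (they are not shown in the
    # original snippet):
    mean = []
    for episode in range(500):
        s = env.reset()
        duration = 0
        for t in range(200):
            a = agent.act(s)            # hypothetical Agent API
            s_, r, done = env.step(a)
            agent.store(s, a, r, s_)    # hypothetical Agent API
            agent.learn()               # hypothetical Agent API
            s = s_
            duration += 1
            if done:
                break
        score.append(duration)
        mean.append(sum(score[-100:]) / len(score[-100:]))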
Example #7
from env import ArmEnv
from brain import DDPG

MAX_EPISODES = 500
MAX_EP_STEPS = 200
ON_TRAIN = True
# set env
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method
rl = DDPG(a_dim, s_dim, a_bound)
steps = []

# start training


def train():
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_r = 0
        for j in range(MAX_EP_STEPS):
            env.render()

            a = rl.choose_action(s)

            s_, r, done = env.step(a)

            rl.store_transition(s, a, r, s_)
Example #8
# Main
from env import ArmEnv
from rl import DDPG
import random

MAX_EPISODES = 300
MAX_EP_STEPS = 200
ON_TRAIN = True

# set env
env = ArmEnv()
env.get_train_state = ON_TRAIN
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method (continuous)
rl = DDPG(a_dim, s_dim, a_bound)
rl.get_train_state = ON_TRAIN


def train():
    # start training
    sample_goal = [None] * 36
    for incx in range(6):
        for incy in range(6):
            sample_goal[incy * 6 + incx] = {
                'x': (100. + incx * 40),
                'y': (100. + incy * 40),
                'l': 40
            }
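    # Equivalent construction of the same 6 x 6 grid of goals (x and y each run
    # from 100 to 300 in steps of 40); illustrative only, the behaviour matches
    # the nested loops above:
    sample_goal = [{'x': 100. + incx * 40, 'y': 100. + incy * 40, 'l': 40}
                   for incy in range(6) for incx in range(6)]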
Example #9
#        env.reset()
#        env.render()
#        env.step()
#
#                                                           #
# ========================================================================= #

from env import ArmEnv
from rl import DDPG

# Global variables
MAX_EPISODES = 500
MAX_EP_STEPS = 500

# Set the environment
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set the RL method
rl = DDPG(a_dim, s_dim, a_bound)

# start Training

for i in range(MAX_EPISODES):
    s = env.reset()
    for j in range(MAX_EP_STEPS):
        env.render()

        a = rl.choose_action(s)