Example #1
def main():
    """
        DDPG run
    """
    en_nm = 'InvertedPendulum-v2'
    env = gym.make(en_nm)
    test_env = gym.make(en_nm)

    ac_kwargs = {'hidden_sizes': [256, 256], 'actor_critic': MLPActorCritic}
    agent_args = {'env_name': 'HCv2'}
    train_args = {
        'eval_episodes': 5,
        'seed': 0,
        'save_frequency': 120,
        'load_model': False,
        'device': 'cpu',
        'max_eps_len': 150,
        'test_env': test_env,
        'evaluate_agent': False,
        'q_lr': 1e-4,
        'pi_lr': 1e-4,
        'exploration_steps': 10000,
        'steps_per_epoch': 1000,
        'batch_size': 128
    }

    args = {'ac_kwargs': ac_kwargs, **agent_args, **train_args}

    ddpg(env, **args)
Example #2
def launch(args):
    # create the ddpg_agent
    # env = gym.make(args.env_name)
    rend = False
    discreteAction = 0
    numControlledJoints = 9
    actionRepeat = 1
    fixed = False
    maxStep = 1000
    # env = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0, isDiscrete=discreteAction,
    #                          numControlledJoints=numControlledJoints, fixedPositionObj=fixed, includeVelObs=True)
    env = kukaPickGymEnvHer(urdfRoot=robot_data.getDataPath(),
                            maxSteps=maxStep,
                            renders=rend,
                            useIK=0,
                            isDiscrete=discreteAction,
                            actionRepeat=actionRepeat,
                            numControlledJoints=numControlledJoints,
                            fixedPositionObj=fixed,
                            includeVelObs=True,
                            reward_type=1)

    # get the environment parameters
    env_params = get_env_params(env)
    # create the ddpg agent to interact with the environment
    args.replay_strategy = 'normal'
    ddpg_trainer = ddpg(args, env, env_params)
    ddpg_trainer.learn()
Example #3
def launch(args):
    # create the ddpg_agent
    # env = gym.make(args.env_name)
    rend = False
    discreteAction = 0
    numControlledJoints = 6
    fixed = False
    actionRepeat = 1
    reward_type = args.reward_type
    if args.env_name in ('reach', 'Reach'):
        env = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), actionRepeat=actionRepeat,
                                 renders=rend, useIK=0, isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints, fixedPositionObj=fixed,
                                 includeVelObs=True, reward_type=reward_type)
    elif args.env_name in ('push', 'Push'):
        env = kukaPushGymEnvHer(urdfRoot=robot_data.getDataPath(), actionRepeat=actionRepeat,
                                renders=rend, useIK=0, isDiscrete=discreteAction,
                                numControlledJoints=numControlledJoints, fixedPositionObj=fixed,
                                includeVelObs=True, reward_type=reward_type)
    elif args.env_name in ('reachob', 'Reachob'):
        env = kukaReachGymEnvOb(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                                isDiscrete=discreteAction, numControlledJoints=numControlledJoints,
                                fixedPositionObj=fixed, includeVelObs=True, reward_type=reward_type)
    else:
        env = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), actionRepeat=actionRepeat,
                                 renders=rend, useIK=0, isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints, fixedPositionObj=fixed,
                                 includeVelObs=True, reward_type=reward_type)
                             
    # get the environment parameters
    env_params = get_env_params(env, actionRepeat)
    # create the ddpg agent to interact with the environment
    ddpg_trainer = ddpg(args, env, env_params)
    ddpg_trainer.learn()
Example #4
    def __init__(self, action_size=2, buffer_size=buffer_size, n_agents=2,
                 batch_size=batch_size, seed=2, update_every=1, gamma=1):

        self.madagents = [
            ddpg.ddpg(24, 2, 256, 128, 64),
            ddpg.ddpg(24, 2, 256, 128, 64)
        ]

        self.update_every = update_every
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.memory = buffer.ReplayBuffer(action_size,
                                          buffer_size,
                                          batch_size,
                                          seed=2)
        #self.t_step = 0
        self.n_agents = n_agents
        self.gamma = gamma
Example #5
    def train():
        agent, scores = ddpg()

        plt.plot(np.arange(1, len(scores) + 1), scores)
        plt.ylabel('Score')
        plt.xlabel('Episode #')
        plt.show()

        return agent
Example #6
def main(trainable=True):

    env = gym_carla_car_following("127.0.0.1", 2000, 15)
    agent = ddpg(env.observation_space.shape[0], env.action_space.shape[0], trainable)

    try:
        agent.load()
    except:
        traceback.print_exc()

    while True:
        try:
            interactive_with_environment(agent, env)
        except:
            traceback.print_exc()
        finally:
            agent.save(-1)
Example #7
    # assumes `import argparse` / `import json` earlier in the script
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name')
    parser.add_argument('--exp_name', default=None)
    parser.add_argument('--exp_variant', default=None)
    parser.add_argument('--logdir', default='out')
    parser.add_argument('--seeds', type=int, default=0, nargs='*')
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--steps_per_epoch', type=int, default=1000)
    parser.add_argument('--discount', type=float, default=.99)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--polyak', type=float, default=0.001)
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--exploration_steps', type=int, default=0)
    parser.add_argument('--rand_proc', default='ou')
    parser.add_argument('--rand_proc_kwargs', type=json.loads, default=dict())
    args = parser.parse_args()

    seeds = args.seeds if isinstance(args.seeds, list) else [args.seeds]
    rand_proc_dir = {'normal': core.NormalProcess, 
                     'ou': core.OrnsteinUhlenbeckProcess}
    rand_proc = rand_proc_dir[args.rand_proc]

    for seed in seeds:
        print("\nNEW EXPERIMENT: SEED {}\n".format(seed))
        ddpg(env_name=args.env_name, exp_name=args.exp_name, exp_variant=args.exp_variant,
                logdir=args.logdir, seed=seed,
                epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, 
                batch_size=args.batch_size, discount=args.discount,
                polyak=args.polyak, weight_decay=args.weight_decay, 
                exploration_steps=args.exploration_steps,
                rand_proc=rand_proc, rand_proc_kwargs=args.rand_proc_kwargs)
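Example #7 picks its exploration noise from a small dictionary (`core.NormalProcess` or `core.OrnsteinUhlenbeckProcess`), and Example #13 below relies on a similar `OUNoise` helper. Those classes are not reproduced on this page; as a rough, self-contained sketch of what an Ornstein-Uhlenbeck process for DDPG usually looks like (class name and parameter defaults are illustrative, not taken from either repository):

import numpy as np

class OrnsteinUhlenbeckNoise:
    """Temporally correlated exploration noise commonly paired with DDPG."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2, seed=None):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.rng = np.random.default_rng(seed)
        self.x = self.mu.copy()

    def reset(self):
        # restart the process at its mean (typically once per episode)
        self.x = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * self.rng.standard_normal(self.mu.shape))
        self.x = self.x + dx
        return self.x

A typical use is to add `noise.sample()` to the deterministic action and clip to the action bounds, calling `noise.reset()` at the start of each episode.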
Example #8
def main():

    # Creating necessary directories
    collect_track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    datas_dir = experiment_dir + "datas-track-no-%d/" % collect_track_no

    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return

    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    if not os.path.exists(datas_dir):
        os.mkdir(datas_dir)

    action_dim = 1
    state_dim = 30
    env_name = 'torcs'

    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir)
    agent.load_network()

    vision = True
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   text_mode=False,
                   track_no=collect_track_no,
                   random_track=False,
                   track_range=(0, 3))

    print("Collecting Start.")
    max_data_entry_count = 2000
    data_entry_count = 0
    start_time = time.time()
    i = 0
    step = 0
    try:
        file = open(datas_dir + 'state-action-scalar', 'w')
        while data_entry_count < max_data_entry_count:
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, 0.0))
            pre_a_t = 0.0
            while data_entry_count < max_data_entry_count:
                a_t = agent.action(s_t)

                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])

                print("Step", step, "Action", a_t, "Reward", r_t)

                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, a_t[0]))

                image = ob.img
                if step > 20:
                    plt.imsave(
                        datas_dir + ("%d-%d.jpg" %
                                     (collect_track_no, data_entry_count)),
                        image)
                    ret = file.write(
                        "%f %f %f %f %f\n" %
                        (ob.speedX, ob.speedY, ob.speedZ, pre_a_t, a_t[0]))
                    if ret == 0:
                        print("File Write error")
                    data_entry_count += 1

                s_t = s_t1
                step += 1
                pre_a_t = a_t[0]

                if done:
                    break

            print(("TOTAL REWARD @ " + str(i) + "Collect", data_entry_count))
            print(("Total Step: " + str(step)))
            print("")

    except:
        traceback.print_exc()
        with open((datas_dir + "exception"), 'w') as file:
            file.write(str(traceback.format_exc()))

    finally:

        file.close()

        env.end()
        end_time = time.time()

        with open(datas_dir + "log", 'w') as file:
            file.write("total_step = %d\n" % step)
            file.write("total_time = %s (s)\n" % str(end_time - start_time))

        print("Finish.")
Example #9
from unityagents import UnityEnvironment
import numpy as np
from ddpg import ddpg
import matplotlib.pyplot as plt
from ddpg_agent import Agent

env = UnityEnvironment(file_name='env/Reacher_Linux/Reacher_Linux/Reacher.x86')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]

# number of actions and states
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)

scores = ddpg(env,
              brain_name,
              agent,
              n_episodes=300,
              max_t=1000,
              print_every=100)
print(scores)
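Examples #9 and #12 hand a Unity environment, a brain name and an agent to a `ddpg(...)` helper and plot the scores it returns. That helper is not reproduced on this page; a minimal sketch of a compatible training loop, assuming the usual `agent.reset() / agent.act() / agent.step()` interface (function and method names here are assumptions, not taken from the example), could look like this:

from collections import deque
import numpy as np

def ddpg_loop(env, brain_name, agent, n_episodes=300, max_t=1000, print_every=100):
    """Hypothetical stand-in for the ddpg() helper used above."""
    scores = []                    # score of every episode
    window = deque(maxlen=100)     # last 100 scores for a running average
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()              # assumed: resets the exploration noise
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)  # assumed: store + learn
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
        window.append(score)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(window)))
    return scores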
Example #10
            discountFactor = d.get('discountFactor')
            explorationRate = d.get('explorationRate')
            learnStart = d.get('learnStart')
            memorySize = d.get('memorySize')
            current_epoch = d.get('current_epoch')
            stepCounter = d.get('stepCounter')
            loadsim_seconds = d.get('loadsim_seconds')

        clear_monitor_files(outdir)
        copy_tree(monitor_path, outdir)
        env = gym.wrappers.Monitor(env, outdir, resume=True)
    ddpg = ddpg.ddpg(S_DIM=S_DIM,
                     A_DIM=A_DIM,
                     EP_MAX=epochs,
                     EP_LEN=episode_steps,
                     GAMMA=discountFactor,
                     A_LR=A_learningRate,
                     C_LR=C_learningRate,
                     BATCH=minibatch_size,
                     propeller_hovering_speed=0.0)
    last100Rewards = [0] * 100
    last100RewardsIndex = 0
    last100Filled = False
    all_ep_r = []

    start_time = time.time()

    # start iterating from 'current epoch'.
    for epoch in range(current_epoch + 1, epochs + 1, 1):
        observation = env.reset()
        cumulated_reward = 0
Example #11
import gym
import numpy as np
import matplotlib.pyplot as plt
from ddpg import ddpg


def smooth(x):
    # last 100
    n = len(x)
    y = np.zeros(n)
    for i in range(n):
        start = max(0, i - 99)
        y[i] = float(x[start:(i + 1)].sum()) / (i - start + 1)
    return y


returns, q_losses, mu_losses = ddpg(lambda: gym.make('Pendulum-v0'),
                                    num_train_episodes=50)

plt.plot(returns)
plt.plot(smooth(np.array(returns)))
plt.title("Train returns")
plt.show()

# plt.plot(test_returns)
# plt.plot(smooth(np.array(test_returns)))
# plt.title("Test returns")
# plt.show()

plt.plot(q_losses)
plt.title('q_losses')
plt.show()
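A note on this example's smooth(): for each index i it returns the mean of the trailing window x[max(0, i - 99):i + 1]. The Python loop is fine for a few dozen episodes; if needed, the same result can be computed without a loop via a cumulative sum (helper name is illustrative):

import numpy as np

def smooth_fast(x, window=100):
    # same trailing-window mean as smooth(), computed with a cumulative sum
    x = np.asarray(x, dtype=float)
    n = len(x)
    c = np.cumsum(np.insert(x, 0, 0.0))
    idx = np.arange(n)
    starts = np.maximum(0, idx - window + 1)
    return (c[idx + 1] - c[starts]) / (idx - starts + 1)

It can be dropped in directly, e.g. plt.plot(smooth_fast(np.array(returns))).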
Example #12
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agents = Agents(num_agents=num_agents,
                state_size=state_size,
                action_size=action_size,
                random_seed=0)
scores = ddpg(env,
              brain_name,
              agents,
              n_episodes=n_episodes,
              eps_start=eps_start,
              eps_end=eps_end,
              eps_decay=eps_decay,
              resume=resume)

# plot the scores
plt.plot(np.arange(1, len(scores) + 1), np.mean(scores, axis=-1))
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# close the environment
env.close()
Example #13
    seed = 239
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    gamma = 0.99
    max_episodes = 50
    buffer_maxlen = 5000
    batch_size = 640
    critic_lr = 1e-3
    actor_lr = 1e-4
    tau = 1e-3

    agent = ddpg(state_dim, action_dim, gamma, tau, buffer_maxlen, batch_size,
                 critic_lr, actor_lr)
    noise = OUNoise(env.action_space)

    step = 0

    for episode in range(max_episodes):
        state = env.reset()
        total = 0
        done = False
        while True:
            action = agent.act(state)
            action = noise.get_action(action, step)

            next_state, reward, done, _ = env.step(action)
            total += reward
Example #14
def main():

    EXPLORE = total_explore
    MAX_STEPS = max_steps
    MAX_STEPS_EP = max_steps_ep
    epsilon = epsilon_start

    # Creating necessary directories
    experiment_name = "img-0"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    logs_train_dir = experiment_dir + "logs-train/"
    if not os.path.exists(experiment_dir):
        os.mkdir(experiment_dir)
    if not os.path.exists(logs_train_dir):
        os.mkdir(logs_train_dir)
    if not os.path.exists(models_dir):
        os.mkdir(models_dir)

    description = (
        'Using raw pixels as input, output (steer)\n'
        'Training from scratch\n\n'
        'throttle = 0.16\n\n'
        'brake = 0\n\n'
        'sp*np.cos(obs["angle"]) - np.abs(sp*np.sin(obs["angle"])) - sp * np.abs(obs["trackPos"]) '
        '- sp * np.abs(action_torcs["steer"]) * 4\n\n'
        'env = TorcsEnv(vision=False, throttle=True, text_mode=False, track_no=5, random_track=False, track_range=(5, 8))\n\n'
        'abs(trackPos) > 0.9 is out of track\n\n'
    )

    with open(experiment_dir + "README.md", 'w') as file:
        file.write(description)
        file.write("\n\n")
        file.write(formatted_timestamp())

    action_dim = 1
    state_dim = 4
    img_dim = [304, 412, 3]
    env_name = 'torcs'

    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir, img_dim)
    agent.load_network()

    vision = True
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   text_mode=False,
                   track_no=5,
                   random_track=False,
                   track_range=(5, 8))

    rewards_every_steps = np.zeros([MAX_STEPS])
    actions_every_steps = np.zeros([MAX_STEPS, action_dim])

    # sess.run(tf.initialize_all_variables())

    # Using tensorboard to visualize data
    with tf.name_scope('summary'):
        critic_cost = tf.placeholder(dtype=tf.float32)
        actor_action = tf.placeholder(dtype=tf.float32)
        reward = tf.placeholder(dtype=tf.float32)
        state = tf.placeholder(dtype=tf.float32, shape=(state_dim, ))
        img = tf.placeholder(dtype=tf.float32,
                             shape=(1, img_dim[0], img_dim[1], img_dim[2]))
        tf.summary.scalar("critic_cost", critic_cost)
        tf.summary.scalar('actor_action', actor_action)
        tf.summary.scalar('reward', reward)
        tf.summary.histogram('state', state)
        tf.summary.image("img", img)
        merged_summary = tf.summary.merge_all()

    writer = tf.summary.FileWriter(logs_train_dir, sess.graph)

    print("Training Start.")
    start_time = time.time()
    i = 0
    step = 0
    try:
        while step < MAX_STEPS:
            # if ((np.mod(i, 10) == 0 ) and (i>20)):
            #     train_indicator= 0
            # else:
            #     train_indicator=is_training

            # restart because of memory leak bug in torcs
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()

            # Early episode annealing for out of track driving and small progress
            # During early training phases - out of track and slow driving is allowed as humans do ( Margin of error )
            # As one learns to drive the constraints become stricter

            # random_number = random.random()
            # eps_early = max(epsilon,0.10)
            # if (random_number < (1.0-eps_early)) and (train_indicator == 1):
            #     early_stop = 1
            # else:
            #     early_stop = 0
            print(("Episode : " + str(i) + " Replay Buffer " +
                   str(agent.replay_buffer.count())))

            # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm,
            #                  0.0))
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
            i_t = ob.img
            # cv2.imshow("img", ob.img)
            # cv2.waitKey(0)
            # x_t = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, 0.0))
            # s_t = np.hstack((x_t, x_t, x_t, x_t))

            total_reward = 0
            step_ep = 0
            while (step < MAX_STEPS) and (step_ep < MAX_STEPS_EP):
                # Take noisy actions during training
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.0)
                a_t = agent.noise_action(s_t, epsilon, i_t)

                #ob, r_t, done, info = env.step(a_t[0], early_stop)

                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])

                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm,
                #                   a_t[0]))

                s_t1 = np.hstack((ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                i_t1 = ob.img

                # x_t1 = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                # s_t1 = np.hstack((np.roll(s_t, -6)[:18], x_t1))
                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))

                cost = agent.perceive(s_t, a_t, r_t, s_t1, done, i_t, i_t1)
                summary = sess.run(merged_summary,
                                   feed_dict={
                                       critic_cost: cost,
                                       actor_action: a_t[0],
                                       reward: r_t,
                                       state: s_t,
                                       img: i_t.reshape(1, img_dim[0],
                                                        img_dim[1], img_dim[2])
                                   })

                writer.add_summary(summary, step)

                total_reward += r_t
                s_t = s_t1
                i_t = i_t1

                print("Ep", i, "Total steps", step, "Reward", r_t, " Actions ",
                      a_t, " Epsilon ", epsilon, "Step ep", step_ep)

                rewards_every_steps[step] = r_t
                actions_every_steps[step] = a_t
                step += 1
                step_ep += 1

                if done:
                    break

                if np.mod(step + 1, 10000) == 0:
                    print("Now we save model with step = ", step)
                    agent.save_network(step + 1)

            print(("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
                   str(total_reward)))
            print(("Total Step: " + str(step)))
            print("")
            i += 1

    except:
        traceback.print_exc()
        with open((logs_train_dir + "exception"), 'w') as file:
            file.write(str(traceback.format_exc()))

    finally:
        env.end()
        end_time = time.time()

        np.save(logs_train_dir + "reward.npy", rewards_every_steps)
        np.save(logs_train_dir + "action.npy", actions_every_steps)

        with open(logs_train_dir + "log", 'w') as file:
            file.write("epsilon_start = %d\n" % epsilon_start)
            file.write("total_explore = %d\n" % total_explore)
            file.write("total_episode = %d\n" % i)
            file.write("total_step = %d\n" % step)
            file.write("total_time = %s (s)\n" % str(end_time - start_time))

        print("Finish.")
Example #15
def main():

    MAX_EP = 1
    MAX_STEPS_EP = 2000

    # Creating necessary directories
    test_track_no = 6
    experiment_name = "tensorboard-11"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    logs_test_dir = experiment_dir + "logs-test-track-no-%d/" % test_track_no

    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return

    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    if not os.path.exists(logs_test_dir):
        os.mkdir(logs_test_dir)

    action_dim = 1
    state_dim = 25
    env_name = 'torcs'

    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir)
    agent.load_network()

    vision = False
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   text_mode=False,
                   track_no=test_track_no,
                   random_track=False,
                   track_range=(0, 3))

    # rewards_every_steps = np.zeros([MAX_EP, MAX_STEPS_EP])
    # actions_every_steps = np.zeros([MAX_EP, MAX_STEPS_EP, action_dim])

    # Using tensorboard to visualize data
    with tf.name_scope('summary'):
        actor_action = tf.placeholder(dtype=tf.float32)
        reward = tf.placeholder(dtype=tf.float32)
        state = tf.placeholder(dtype=tf.float32, shape=(state_dim, ))
        tf.summary.scalar('actor_action', actor_action)
        tf.summary.scalar('reward', reward)
        tf.summary.histogram('state', state)
        merged_summary = tf.summary.merge_all()

    writer = tf.summary.FileWriter(logs_test_dir, sess.graph)

    print("Testing Start.")
    start_time = time.time()
    step = 0
    try:
        for i in range(MAX_EP):
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()

            print(("Episode : " + str(i) + " Replay Buffer " +
                   str(agent.replay_buffer.count())))

            # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm,
            #                  0.0))
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                             ob.speedY, ob.speedZ, 0.0))

            # x_t = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, 0.0))
            # s_t = np.hstack((x_t, x_t, x_t, x_t))
            # s_t = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, 0.0))
            # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, 0.0))

            total_reward = 0
            step_ep = 0
            while (step_ep < MAX_STEPS_EP):
                a_t = agent.action(s_t)
                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])
                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm,
                #                   a_t[0]))
                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                                  ob.speedY, ob.speedZ, a_t[0]))

                # x_t1 = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                # s_t1 = np.hstack((np.roll(s_t, -6)[:18], x_t1))

                # s_t1 = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))

                summary = sess.run([merged_summary],
                                   feed_dict={
                                       actor_action: a_t[0],
                                       reward: r_t,
                                       state: s_t
                                   })

                writer.add_summary(summary[0], step)

                total_reward += r_t
                s_t = s_t1

                print("Ep", i, "Total steps", step, "Reward", r_t, " Actions ",
                      a_t, "Step ep", step_ep)

                # rewards_every_steps[step] = r_t
                # actions_every_steps[step] = a_t
                step += 1
                step_ep += 1

                if done:
                    break

            print(("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
                   str(total_reward)))
            print(("Total Step: " + str(step)))
            print("")

    except:
        traceback.print_exc()
        with open((logs_test_dir + "exception"), 'w') as file:
            file.write(str(traceback.format_exc()))

    finally:
        env.end()
        end_time = time.time()

        # np.save(logs_test_dir + "reward.npy", rewards_every_steps)
        # np.save(logs_test_dir + "action.npy", actions_every_steps)

        with open(logs_test_dir + "log", 'w') as file:
            file.write("total_episode = %d\n" % MAX_EP)
            file.write("max_steps_ep = %d\n" % MAX_STEPS_EP)
            file.write("total_step = %d\n" % step)
            file.write("total_time = %s (s)\n" % str(end_time - start_time))

        print("Finish.")