Example #1
def evaluate(port=8080):
    env = SimstarEnv(track=simstar.TrackName.Austria,
                     port=port,
                     synronized_mode=True,
                     speed_up=1,
                     hz=10,
                     add_agent=False)
    
    # total length of chosen observation states: 4 scalars
    # (angle, trackPos, speedX, speedY) plus the track sensor readings
    insize = 4 + env.track_sensor_size

    hyperparams = {
                "lrvalue": 5e-4,
                "lrpolicy": 1e-4,
                "gamma": 0.97,
                "buffersize": 100000,
                "tau": 1e-2,
                "batchsize": 64,
                "start_sigma": 0.3,
                "end_sigma": 0,
                "sigma_decay_len": 15000,
                "theta": 0.15,
                "clipgrad": True
    }
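    # pack the hyperparameters into a namedtuple for attribute-style access (e.g. hyprm.gamma)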
    HyperParams = namedtuple("HyperParams", hyperparams.keys())
    hyprm = HyperParams(**hyperparams)

    # Load actor network from checkpoint
    agent = DDPGagent(env, hyprm, insize=insize, device=device)
    agent.to(device)
    load_checkpoint(agent)

    total_reward = 0

    for eps in range(NUM_EVAL_EPISODE):
        obs = env.reset()
        state = np.hstack((obs.angle, obs.track,
                    obs.trackPos, obs.speedX, obs.speedY))

        lap_start_time = time.time()
        episode_reward = 0

        for i in range(NUM_EVAL_STEPS):
            action = agent.get_action(state)
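            # clip the action components to their valid ranges:
            # index 0 (presumably steering) to [-1, 1],
            # indices 1 and 2 (presumably throttle and brake) to [0, 1]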
            a_1 = np.clip(action[0], -1, 1)
            a_2 = np.clip(action[1], 0, 1)
            a_3 = np.clip(action[2], 0, 1)

            action = np.array([a_1, a_2, a_3])

            obs, reward, done, summary = env.step(action)

            next_state = np.hstack((obs.angle, obs.track,
                    obs.trackPos, obs.speedX, obs.speedY))

            episode_reward += reward

            if done:
                break

            state = next_state

        lap_progress = env.get_lap_progress()
        lap_time_passed = time.time() - lap_start_time
        total_reward += episode_reward
        print("Episode: %d, Reward: %.1f, lap progress%.2f time passed: %.0fs "%(i,epsisode_reward,lap_progress,lap_time_passed))
    
    print("Average reward over %d episodes: %.1f"%(NUM_EVAL_EPISODE,total_reward/NUM_EVAL_EPISODE))
Example #2
import sys
import numpy as np
from torch.utils.tensorboard import SummaryWriter
# project-local imports, following the other examples in this listing
# (the module names for QuadcopterEnv and NormalizedEnv are assumptions)
from ddpg import DDPGagent
from utils import *
from quadcopter_env import QuadcopterEnv

BATCH_SIZE = 32

env = QuadcopterEnv()
env = NormalizedEnv(env)

# K, M and omega_0 are quadcopter model constants, presumably defined in an
# imported module; c1 and W0 derive a baseline rotor-speed scaling from them
c1 = (((2 * K) / M) * omega_0)**(-1)
W0 = np.array([1, 1, 1, 1]).reshape((4, )) * omega_0
F1 = np.array([[0.25, 0.25, 0.25, 0.25], [1, 1, 1, 1]]).T

if len(sys.argv) == 1:
    hidden_sizes = [64, 64]
else:
    hidden_sizes = sys.argv[1:]
    hidden_sizes = [int(i) for i in hidden_sizes]

agent = DDPGagent(env, hidden_sizes)
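# Ornstein-Uhlenbeck process noise for temporally correlated exploration (standard in DDPG)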
noise = OUNoise(env.action_space)

writer_train = SummaryWriter()
writer_test = SummaryWriter()


def get_score(state):
    w, z = state
    # success if z is within the [14.9, 15.1] band with near-zero w;
    # abs(w) < 0.0 can never hold, so a small positive tolerance
    # (0.1, an assumed value) is used instead
    if 14.9 < z < 15.1 and abs(w) < 0.1:
        return 1
    else:
        return 0


def training_loop(agent, noise, pbar, test=False):
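    # Minimal sketch: the source snippet ends at this signature. Assumes a
    # gym-style env.step, the agent/noise interfaces used by the other
    # examples (get_action, memory.push, update), a tqdm-style progress bar
    # `pbar`, and an assumed episode cap of 500 steps.
    state = env.reset()
    noise.reset()
    episode_reward = 0
    for step in range(500):
        action = agent.get_action(state)
        if not test:
            action = noise.get_action(action, step)
        new_state, reward, done, _ = env.step(action)
        if not test:
            agent.memory.push(state, action, reward, new_state, done)
            if len(agent.memory) > BATCH_SIZE:
                agent.update(BATCH_SIZE)
        state = new_state
        episode_reward += reward
        pbar.update(1)
        if done:
            break
    return episode_reward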
Example #3
import sys
import gym
import numpy as np
import pandas as pd
import pdb
import matplotlib.pyplot as plt
from ddpg import DDPGagent
from utils import *
import random

env = gym.make("FetchReach-v1")
#env.env.reward_type = 'dense'  # WARNING! HER implemented currently only for sparse rewards. Dense will break it!
agent = DDPGagent(env)
noise = OUNoise(env.action_space)
batch_size = 128
rewards = []
avg_rewards = []

for episode in range(10000):
    state = env.reset()
    noise.reset()
    episode_reward = 0
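    # HER relabels goals per trajectory, so the per-episode trajectory
    # buffer is presumably cleared at each reset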
    agent.memory.clear_trajectory()

    for step in range(500):
        # if episode%100 == 0:
        #     env.render()
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, new_state, done)

        # episode bookkeeping (reconstructed in the standard pattern of the
        # other examples; with HER, the replay/update details depend on this
        # repo's DDPGagent implementation)
        state = new_state
        episode_reward += reward
        if done:
            break

    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))
    print('Episode {}: reward {:.2f}'.format(episode, episode_reward))

# When finished, you can close the environment.

#***--------------BEGIN MY STUFF

from ddpg import DDPGagent
from utils import *

load_modelz = False

modelz_list = []

agent = DDPGagent(load_modelz, modelz_list, env_info)
noise = OUNoise(env_info.previous_vector_actions)
batch_size = 20
rewards = []
avg_rewards = []

#env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
#states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)  # initialize the score (for each agent)
all_scores = []
last_20 = []
max_games = 0
noise_set = True  #** do we want temporary exploration?
total20 = 0
train_model = True  #** do we wish to train the model, or just play the game?
LR_update_max = 10
Example #5
import sys
import gym
import numpy as np
import matplotlib.pyplot as plt
from ddpg import DDPGagent
from utils import *
from pid_env import PidEnv

env = PidEnv()
agent = DDPGagent(env, 4, 256, 4)
noise = OUNoise(4)
batch_size = 256
rewards = []
avgRewards = []
normalized = []
metalearn = False
random = False

for episode in range(300):

    sp = 50 if not random else np.random.random() * 100
    env = PidEnv(sp)
    state = env.reset()
    noise.reset()
    episodeReward = 0
    stepCounter = 0

    for step in range(250):
        stepCounter += 1
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward, done = env.step(action)
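        # the snippet ends at env.step; below is a minimal sketch closing the
        # loop in the standard pattern of the other examples
        agent.memory.push(state, action, reward, new_state, done)
        if len(agent.memory) > batch_size:
            agent.update(batch_size)

        state = new_state
        episodeReward += reward
        if done:
            break

    rewards.append(episodeReward)
    avgRewards.append(np.mean(rewards[-10:]))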
Example #6
def train(save_name="checkpoint", port=8080, hz=10):
    env = SimstarEnv(track=simstar.TrackName.HungaryGrandPrix,
                     port=port,
                     synronized_mode=True,
                     speed_up=6,
                     hz=hz,
                     lower_speed_limit=5,
                     add_agent=ADD_AGENT)

    # total length of chosen observation states: 4 scalars
    # (angle, trackPos, speedX, speedY) plus the track sensor readings
    insize = 4 + env.track_sensor_size

    outsize = env.action_space.shape[0]
    hyperparams = {
        "lrvalue": 5e-4,
        "lrpolicy": 1e-3,
        "gamma": 0.97,
        "episodes": 9000,
        "buffersize": 100000,
        "tau": 1e-2,
        "batchsize": 64,
        "start_sigma": 0.3,
        "end_sigma": 0,
        "sigma_decay_len": 15000,
        "theta": 0.15,
        "maxlength": 5000,
        "clipgrad": True
    }
    HyperParams = namedtuple("HyperParams", hyperparams.keys())
    hyprm = HyperParams(**hyperparams)

    datalog = defaultdict(list)

    agent = DDPGagent(env, hyprm, insize=insize, device=device)
    noise = OUNoise(env.action_space, hyprm)
    agent.to(device)
    step_counter = 0
    best_reward = 0

    if START_FROM_CHECKPOINT:
        step_counter, best_reward = load_checkpoint(agent, load_name=save_name)

    for eps in range(hyprm.episodes):
        obs = env.reset()
        noise.reset()
        state = np.hstack(
            (obs.angle, obs.track, obs.trackPos, obs.speedX, obs.speedY))

        episode_reward = 0
        episode_value = 0
        lap_start = env.get_lap_progress()

        for i in range(hyprm.maxlength):
            action = agent.get_action(state)
            if TRAIN:
                action = noise.get_action(action, step_counter)

            a_1 = np.clip(action[0], -1, 1)
            a_2 = np.clip(action[1], 0, 1)
            a_3 = np.clip(action[2], 0, 1)

            action = np.array([a_1, a_2, a_3])

            obs, reward, done, _ = env.step(action)

            next_state = np.hstack(
                (obs.angle, obs.track, obs.trackPos, obs.speedX, obs.speedY))

            if not AUTOPILOT_OTHER_AGENTS:
                # drive the other (non-ego) agents with the same policy network
                agent_actions = []
                agents_obs = env.get_agent_obs()
                for j in range(len(agents_obs)):
                    a_obs = agents_obs[j]
                    agent_state = np.hstack(
                        (a_obs.angle, a_obs.track, a_obs.trackPos,
                         a_obs.speedX, a_obs.speedY))
                    agent_action = agent.get_action(agent_state)
                    agent_actions.append(agent_action)
                env.set_agent_action(agent_actions)

            agent.memory.push(state, action, reward, next_state, done)

            episode_reward += reward

            if TRAIN:
                if len(agent.memory) > hyprm.batchsize:
                    agent.update(hyprm.batchsize)

            if done:
                break

            state = next_state
            step_counter += 1

            # save a checkpoint every SAVE_MODEL_EACH steps
            if not np.mod(step_counter, SAVE_MODEL_EACH):
                save_checkpoint(agent,
                                step_counter,
                                episode_reward,
                                save_name=save_name + "_" + str(step_counter))

        if episode_reward > best_reward:
            best_reward = episode_reward
            print("best episode reward achieved: ", best_reward)
            round_reward = int(episode_reward)
            save_checkpoint(agent,
                            step_counter,
                            episode_reward,
                            save_name="best___" + str(round_reward))

        datalog["epsiode length"].append(i)
        datalog["total reward"].append(epsisode_reward)

        average_reward = torch.mean(
            torch.tensor(datalog["total reward"][-20:])).item()
        lap_progress = env.get_lap_progress() - lap_start
        print(
            "\r Progress: {:2.1f}%, Average reward: {:2.3f}, lap progress: {:2.1f}%"
            .format(eps / hyprm.episodes * 100, average_reward,
                    lap_progress * 100),
            flush=True)

    print("")
Example #7
#
# Of course, as part of the project, you'll have to change the code so that the agent is able to use its experience to gradually choose better actions when interacting with the environment!

# In[5]:

states = env_info.vector_observations  # get the current state (for each agent)

load_modelz = False
modelz_list = []
modelz_list.append(
    "MODEL_CHECKPOINT.5097780.actor.pt")  #** this checkpoint has exploding gradients
modelz_list.append("MODEL_CHECKPOINT.5097780.actor_target.pt")
modelz_list.append("MODEL_CHECKPOINT.5097780.critic.pt")
modelz_list.append("MODEL_CHECKPOINT.5097780.critic_target.pt")

agent = DDPGagent(load_modelz, modelz_list, env_info)
noise = OUNoise(env_info.previous_vector_actions)
batch_size = 10
rewards = []
avg_rewards = []

#env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
#states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)  # initialize the score (for each agent)
all_scores = []
last_20 = []
max_games = 0
noise_set = True  #** do we want temporary exploration?
total20 = 0
train_model = False  #** do we wish to train the model, or just play the game?
LR_update_max = 10
Example #8
def train():
    env = SimstarEnv(synronized_mode=True, speed_up=5, hz=6)
    # total length of chosen observation states: 4 scalars
    # (angle, trackPos, speedX, speedY) plus 19 track sensor readings
    insize = 23
    outsize = env.action_space.shape[0]
    hyperparams = {
        "lrvalue": 5e-4,
        "lrpolicy": 1e-3,
        "gamma": 0.97,
        "episodes": 30000,
        "buffersize": 100000,
        "tau": 1e-2,
        "batchsize": 64,
        "start_sigma": 0.3,
        "end_sigma": 0,
        "sigma_decay_len": 15000,
        "theta": 0.15,
        "maxlength": 5000,
        "clipgrad": True
    }
    HyperParams = namedtuple("HyperParams", hyperparams.keys())
    hyprm = HyperParams(**hyperparams)

    datalog = defaultdict(list)

    agent = DDPGagent(env, hyprm, device=device)
    noise = OUNoise(env.action_space, hyprm)
    agent.to(device)
    step_counter = 0
    best_reward = 0

    if START_FROM_CHECKPOINT:
        step_counter, best_reward = load_checkpoint(agent)

    for eps in range(hyprm.episodes):
        obs = env.reset()
        noise.reset()

        state = np.hstack(
            (obs.angle, obs.track, obs.trackPos, obs.speedX, obs.speedY))

        episode_reward = 0
        episode_value = 0

        for i in range(hyprm.maxlength):
            action = agent.get_action(state)
            if TRAIN:
                action = noise.get_action(action, step_counter)

            a_1 = np.clip(action[0], -1, 1)
            a_2 = np.clip(action[1], 0, 1)
            a_3 = np.clip(action[2], 0, 1)

            action = np.array([a_1, a_2, a_3])

            obs, reward, done, _ = env.step(action)

            next_state = np.hstack(
                (obs.angle, obs.track, obs.trackPos, obs.speedX, obs.speedY))

            agent.memory.push(state, action, reward, next_state, done)

            episode_reward += reward

            if TRAIN:
                if len(agent.memory) > hyprm.batchsize:
                    agent.update(hyprm.batchsize)

            if done:
                break

            state = next_state
            step_counter += 1

            # save a checkpoint every SAVE_MODEL_EACH steps, and keep a
            # separate "best" checkpoint for the highest episode reward
            if not np.mod(step_counter, SAVE_MODEL_EACH):
                save_checkpoint(agent, step_counter, episode_reward)
                if episode_reward > best_reward:
                    best_reward = episode_reward
                    save_checkpoint(agent,
                                    step_counter,
                                    episode_reward,
                                    save_name="best")

        datalog["epsiode length"].append(i)
        datalog["total reward"].append(epsisode_reward)

        average_reward = torch.mean(
            torch.tensor(datalog["total reward"][-20:])).item()
        print(
            "\r Progress: {:2.1f}%, Average reward: {:2.3f}".format(
                eps / hyprm.episodes * 100, average_reward),
            end="",
            flush=True)

    print("")