def evaluate(port=8080):
    env = SimstarEnv(track=simstar.TrackName.Austria, port=port,
                     synronized_mode=True, speed_up=1, hz=10, add_agent=False)

    # total length of chosen observation states
    insize = 4 + env.track_sensor_size

    hyperparams = {
        "lrvalue": 5e-4,
        "lrpolicy": 1e-4,
        "gamma": 0.97,
        "buffersize": 100000,
        "tau": 1e-2,
        "batchsize": 64,
        "start_sigma": 0.3,
        "end_sigma": 0,
        "sigma_decay_len": 15000,
        "theta": 0.15,
        "clipgrad": True
    }
    HyperParams = namedtuple("HyperParams", hyperparams.keys())
    hyprm = HyperParams(**hyperparams)

    # Load actor network from checkpoint
    agent = DDPGagent(env, hyprm, insize=insize, device=device)
    agent.to(device)
    load_checkpoint(agent)

    total_reward = 0
    for eps in range(NUM_EVAL_EPISODE):
        obs = env.reset()
        state = np.hstack((obs.angle, obs.track, obs.trackPos,
                           obs.speedX, obs.speedY))
        lap_start_time = time.time()
        episode_reward = 0

        for i in range(NUM_EVAL_STEPS):
            action = agent.get_action(state)
            # clip actions to their valid ranges: steer [-1, 1], throttle/brake [0, 1]
            a_1 = np.clip(action[0], -1, 1)
            a_2 = np.clip(action[1], 0, 1)
            a_3 = np.clip(action[2], 0, 1)
            action = np.array([a_1, a_2, a_3])

            obs, reward, done, summary = env.step(action)
            next_state = np.hstack((obs.angle, obs.track, obs.trackPos,
                                    obs.speedX, obs.speedY))
            episode_reward += reward
            if done:
                break
            state = next_state

        lap_progress = env.get_lap_progress()
        lap_time_passed = time.time() - lap_start_time
        total_reward += episode_reward
        print("Episode: %d, Reward: %.1f, lap progress: %.2f, time passed: %.0fs"
              % (eps, episode_reward, lap_progress, lap_time_passed))

    print("Average reward over %d episodes: %.1f"
          % (NUM_EVAL_EPISODE, total_reward / NUM_EVAL_EPISODE))
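# Illustrative entry point (not part of the original file): a minimal sketch of how
# evaluate() could be launched from the command line. The --port flag name and its
# default are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate a trained DDPG agent in SimStar")
    parser.add_argument("--port", type=int, default=8080,
                        help="port of the running SimStar instance")
    args = parser.parse_args()
    evaluate(port=args.port)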
# NOTE: this fragment assumes QuadcopterEnv, NormalizedEnv, DDPGagent, OUNoise,
# SummaryWriter and the physical constants K, M, omega_0 are imported/defined
# earlier in the original script.
BATCH_SIZE = 32

env = QuadcopterEnv()
env = NormalizedEnv(env)

c1 = (((2 * K) / M) * omega_0) ** (-1)
W0 = np.array([1, 1, 1, 1]).reshape((4,)) * omega_0
F1 = np.array([[0.25, 0.25, 0.25, 0.25], [1, 1, 1, 1]]).T

# hidden layer sizes can be passed as command-line arguments
if len(sys.argv) == 1:
    hidden_sizes = [64, 64]
else:
    hidden_sizes = [int(i) for i in sys.argv[1:]]

agent = DDPGagent(env, hidden_sizes)
noise = OUNoise(env.action_space)
writer_train = SummaryWriter()
writer_test = SummaryWriter()


def get_score(state):
    w, z = state
    # NOTE: `abs(w) < 0.0` can never be true as written; the intended velocity
    # tolerance is presumably a small positive threshold.
    if 14.9 < z < 15.1 and abs(w) < 0.0:
        return 1
    else:
        return 0


def training_loop(agent, noise, pbar, test=False):
    # (function body not included in this fragment)
    ...
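# Illustrative sketch only: the body of training_loop is not part of this fragment,
# so the function below shows a typical DDPG episode loop for this setup instead.
# The 4-tuple returned by env.step, the writer tag, and the step limit are
# assumptions, not the original code.
def _training_loop_sketch(agent, noise, writer, max_steps=500, batch_size=BATCH_SIZE):
    state = env.reset()
    noise.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        action = noise.get_action(action, step)        # add exploration noise
        new_state, reward, done, _ = env.step(action)  # assumed gym-style step
        agent.memory.push(state, action, reward, new_state, done)
        if len(agent.memory) > batch_size:
            agent.update(batch_size)                   # one DDPG gradient update
        episode_reward += reward
        state = new_state
        if done:
            break
    writer.add_scalar("reward/episode", episode_reward)
    return episode_reward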
import sys
import gym
import numpy as np
import pandas as pd
import pdb
import matplotlib.pyplot as plt
from ddpg import DDPGagent
from utils import *
import random

env = gym.make("FetchReach-v1")
# env.env.reward_type = 'dense'
# WARNING! HER implemented currently only for sparse rewards. Dense will break it!

agent = DDPGagent(env)
noise = OUNoise(env.action_space)
batch_size = 128
rewards = []
avg_rewards = []

for episode in range(10000):
    state = env.reset()
    noise.reset()
    episode_reward = 0
    agent.memory.clear_trajectory()

    for step in range(500):
        # if episode % 100 == 0:
        #     env.render()
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, new_state, done)
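# Illustrative sketch of the hindsight relabeling referenced by the warning above
# (not the project's actual implementation). FetchReach observations are dicts with
# 'observation', 'achieved_goal' and 'desired_goal', and gym's robotics goal envs
# expose env.compute_reward(); the trajectory/buffer handling here is an assumption.
def her_final_relabel(env, trajectory, replay_buffer):
    """Re-store a finished trajectory with the final achieved goal as the desired goal."""
    # achieved goal of the last next_state in the trajectory
    final_goal = trajectory[-1][3]["achieved_goal"]
    for state, action, _, new_state, done in trajectory:
        relabeled_state = dict(state, desired_goal=final_goal)
        relabeled_new_state = dict(new_state, desired_goal=final_goal)
        # sparse reward recomputed against the substituted goal
        reward = env.compute_reward(new_state["achieved_goal"], final_goal, None)
        replay_buffer.push(relabeled_state, action, reward, relabeled_new_state, done)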
        break   # (tail of the interaction loop from the preceding, omitted cell)

print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

# When finished, you can close the environment.

#***--------------BEGIN MY STUFF
from ddpg import DDPGagent
from utils import *

load_modelz = False
modelz_list = []
agent = DDPGagent(load_modelz, modelz_list, env_info)
noise = OUNoise(env_info.previous_vector_actions)

batch_size = 20
rewards = []
avg_rewards = []

# env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
# states = env_info.vector_observations               # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
all_scores = []
last_20 = []
max_games = 0
noise_set = True    #**do we want temporary exploration?
total20 = 0
train_model = True  #**Do we wish to train model? or just play the game?
LR_update_max = 10
import sys
import gym
import numpy as np   # needed below for np.random.random()
import matplotlib.pyplot as plt
from ddpg import DDPGagent
from utils import *
from pid_env import PidEnv

env = PidEnv()
agent = DDPGagent(env, 4, 256, 4)
noise = OUNoise(4)
batch_size = 256

rewards = []
avgRewards = []
normalized = []
metalearn = False
random = False

for episode in range(300):
    # fixed setpoint of 50 unless random setpoints are enabled
    sp = 50 if not random else np.random.random() * 100
    env = PidEnv(sp)
    state = env.reset()
    noise.reset()
    episodeReward = 0
    stepCounter = 0

    for step in range(250):
        stepCounter += 1
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward, done = env.step(action)
def train(save_name="checkpoint", port=8080, hz=10):
    env = SimstarEnv(track=simstar.TrackName.HungaryGrandPrix, port=port,
                     synronized_mode=True, speed_up=6, hz=hz,
                     lower_speed_limit=5, add_agent=ADD_AGENT)

    # total length of chosen observation states
    insize = 4 + env.track_sensor_size
    outsize = env.action_space.shape[0]

    hyperparams = {
        "lrvalue": 5e-4,
        "lrpolicy": 1e-3,
        "gamma": 0.97,
        "episodes": 9000,
        "buffersize": 100000,
        "tau": 1e-2,
        "batchsize": 64,
        "start_sigma": 0.3,
        "end_sigma": 0,
        "sigma_decay_len": 15000,
        "theta": 0.15,
        "maxlength": 5000,
        "clipgrad": True
    }
    HyperParams = namedtuple("HyperParams", hyperparams.keys())
    hyprm = HyperParams(**hyperparams)

    datalog = defaultdict(list)

    agent = DDPGagent(env, hyprm, insize=insize, device=device)
    noise = OUNoise(env.action_space, hyprm)
    agent.to(device)

    step_counter = 0
    best_reward = 0

    if START_FROM_CHECKPOINT:
        step_counter, best_reward = load_checkpoint(agent, load_name=save_name)

    for eps in range(hyprm.episodes):
        obs = env.reset()
        noise.reset()
        state = np.hstack((obs.angle, obs.track, obs.trackPos,
                           obs.speedX, obs.speedY))
        episode_reward = 0
        episode_value = 0
        lap_start = env.get_lap_progress()

        for i in range(hyprm.maxlength):
            action = agent.get_action(state)
            if TRAIN:
                action = noise.get_action(action, step_counter)
            # clip actions to their valid ranges: steer [-1, 1], throttle/brake [0, 1]
            a_1 = np.clip(action[0], -1, 1)
            a_2 = np.clip(action[1], 0, 1)
            a_3 = np.clip(action[2], 0, 1)
            action = np.array([a_1, a_2, a_3])

            obs, reward, done, _ = env.step(action)
            next_state = np.hstack((obs.angle, obs.track, obs.trackPos,
                                    obs.speedX, obs.speedY))

            if not AUTOPILOT_OTHER_AGENTS:
                # drive the other agents with the same policy network
                agent_actions = []
                agents_obs = env.get_agent_obs()
                for j in range(len(agents_obs)):
                    a_obs = agents_obs[j]
                    # same observation layout as the ego vehicle
                    agent_state = np.hstack((a_obs.angle, a_obs.track, a_obs.trackPos,
                                             a_obs.speedX, a_obs.speedY))
                    agent_action = agent.get_action(agent_state)
                    agent_actions.append(agent_action)
                env.set_agent_action(agent_actions)

            agent.memory.push(state, action, reward, next_state, done)
            episode_reward += reward

            if TRAIN:
                if len(agent.memory) > hyprm.batchsize:
                    agent.update(hyprm.batchsize)

            if done:
                break

            state = next_state
            step_counter += 1

            if not np.mod(step_counter, SAVE_MODEL_EACH):
                save_checkpoint(agent, step_counter, episode_reward,
                                save_name=save_name + "_" + str(step_counter))

            if episode_reward > best_reward:
                best_reward = episode_reward
                print("best episode reward achieved: ", best_reward)
                round_reward = int(episode_reward)
                save_checkpoint(agent, step_counter, episode_reward,
                                save_name="best___" + str(round_reward))

        datalog["episode length"].append(i)
        datalog["total reward"].append(episode_reward)

        average_reward = torch.mean(
            torch.tensor(datalog["total reward"][-20:])).item()
        lap_progress = env.get_lap_progress() - lap_start
        print("\r Process percentage: {:2.1f}%, Average reward: {:2.3f}, lap progress: {:2.1f}"
              .format(eps / hyprm.episodes * 100, average_reward, lap_progress * 100),
              flush=True)

    print("")
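# The checkpoint helpers used above are not shown in this fragment. Below is a
# minimal sketch of what they could look like using torch.save/torch.load; the
# agent attribute names (actor, critic) and the on-disk layout are assumptions,
# not the project's actual format.
import os
import torch

CHECKPOINT_DIR = "checkpoints"  # assumed location


def save_checkpoint_sketch(agent, step_counter, episode_reward, save_name="checkpoint"):
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    torch.save({
        "actor": agent.actor.state_dict(),
        "critic": agent.critic.state_dict(),
        "step_counter": step_counter,
        "best_reward": episode_reward,
    }, os.path.join(CHECKPOINT_DIR, save_name + ".pth"))


def load_checkpoint_sketch(agent, load_name="checkpoint"):
    ckpt = torch.load(os.path.join(CHECKPOINT_DIR, load_name + ".pth"), map_location="cpu")
    agent.actor.load_state_dict(ckpt["actor"])
    agent.critic.load_state_dict(ckpt["critic"])
    return ckpt["step_counter"], ckpt["best_reward"]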
# # Of course, as part of the project, you'll have to change the code so that the
# agent is able to use its experience to gradually choose better actions when
# interacting with the environment!

# In[5]:

states = env_info.vector_observations   # get the current state (for each agent)

load_modelz = False
modelz_list = []
modelz_list.append("MODEL_CHECKPOINT.5097780.actor.pt")   #**<model has exploding gradients
modelz_list.append("MODEL_CHECKPOINT.5097780.actor_target.pt")
modelz_list.append("MODEL_CHECKPOINT.5097780.critic.pt")
modelz_list.append("MODEL_CHECKPOINT.5097780.critic_target.pt")

agent = DDPGagent(load_modelz, modelz_list, env_info)
noise = OUNoise(env_info.previous_vector_actions)

batch_size = 10
rewards = []
avg_rewards = []

# env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
# states = env_info.vector_observations               # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
all_scores = []
last_20 = []
max_games = 0
noise_set = True     #**do we want temporary exploration?
total20 = 0
train_model = False  #**Do we wish to train model? or just play the game?
LR_update_max = 10
def train():
    env = SimstarEnv(synronized_mode=True, speed_up=5, hz=6)

    # total length of chosen observation states
    insize = 23
    outsize = env.action_space.shape[0]

    hyperparams = {
        "lrvalue": 5e-4,
        "lrpolicy": 1e-3,
        "gamma": 0.97,
        "episodes": 30000,
        "buffersize": 100000,
        "tau": 1e-2,
        "batchsize": 64,
        "start_sigma": 0.3,
        "end_sigma": 0,
        "sigma_decay_len": 15000,
        "theta": 0.15,
        "maxlength": 5000,
        "clipgrad": True
    }
    HyperParams = namedtuple("HyperParams", hyperparams.keys())
    hyprm = HyperParams(**hyperparams)

    datalog = defaultdict(list)

    agent = DDPGagent(env, hyprm, device=device)
    noise = OUNoise(env.action_space, hyprm)
    agent.to(device)

    step_counter = 0
    best_reward = 0

    if START_FROM_CHECKPOINT:
        step_counter, best_reward = load_checkpoint(agent)

    for eps in range(hyprm.episodes):
        obs = env.reset()
        noise.reset()
        state = np.hstack((obs.angle, obs.track, obs.trackPos,
                           obs.speedX, obs.speedY))
        episode_reward = 0
        episode_value = 0

        for i in range(hyprm.maxlength):
            action = agent.get_action(state)
            if TRAIN:
                action = noise.get_action(action, step_counter)
            # clip actions to their valid ranges: steer [-1, 1], throttle/brake [0, 1]
            a_1 = np.clip(action[0], -1, 1)
            a_2 = np.clip(action[1], 0, 1)
            a_3 = np.clip(action[2], 0, 1)
            action = np.array([a_1, a_2, a_3])

            obs, reward, done, _ = env.step(action)
            next_state = np.hstack((obs.angle, obs.track, obs.trackPos,
                                    obs.speedX, obs.speedY))

            agent.memory.push(state, action, reward, next_state, done)
            episode_reward += reward

            if TRAIN:
                if len(agent.memory) > hyprm.batchsize:
                    agent.update(hyprm.batchsize)

            if done:
                break

            state = next_state
            step_counter += 1

            if not np.mod(step_counter, SAVE_MODEL_EACH):
                save_checkpoint(agent, step_counter, episode_reward)

            if episode_reward > best_reward:
                best_reward = episode_reward  # track the best episode so the "best" checkpoint is not overwritten every step
                save_checkpoint(agent, step_counter, episode_reward, save_name="best")

        datalog["episode length"].append(i)
        datalog["total reward"].append(episode_reward)

        average_reward = torch.mean(
            torch.tensor(datalog["total reward"][-20:])).item()
        print("\r Process percentage: {:2.1f}%, Average reward: {:2.3f}".format(
            eps / hyprm.episodes * 100, average_reward), end="", flush=True)

    print("")
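# OUNoise is imported from utils and not shown in this fragment. A minimal sketch
# of an Ornstein-Uhlenbeck exploration process with a linearly annealed sigma,
# matching the theta/start_sigma/end_sigma/sigma_decay_len hyperparameters above.
# The constructor signature and the clipping to the action-space bounds are assumptions.
import numpy as np


class OUNoiseSketch:
    def __init__(self, action_space, hyprm, mu=0.0):
        self.mu = mu
        self.theta = hyprm.theta
        self.start_sigma = hyprm.start_sigma
        self.end_sigma = hyprm.end_sigma
        self.decay_len = hyprm.sigma_decay_len
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self, sigma):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, step=0):
        # linearly anneal sigma from start_sigma to end_sigma over decay_len steps
        frac = min(1.0, step / self.decay_len)
        sigma = self.start_sigma + frac * (self.end_sigma - self.start_sigma)
        noisy = action + self.evolve_state(sigma)
        return np.clip(noisy, self.low, self.high)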