def main():
    action_high = 2
    action_low = -2
    action_high = np.array([action_high])
    action_low = np.array([action_low])
    buffer_size = 100000
    minibatch_size = 256
    num_episode = 500

    env = gym.make("Pendulum-v0")
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, buffer_size, minibatch_size,
                  action_high, action_low)

    reward_list = []
    for i_episode in range(num_episode):
        print("episode: %d" % i_episode)
        state = env.reset()
        total_reward = 0
        for t_timesteps in range(env.spec.timestep_limit):
            env.render()
            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            transition = [state, action, next_state, reward, done]
            agent.train(transition)
            state = next_state
            if done or t_timesteps == env.spec.timestep_limit - 1:
                print("Episode finish---time steps: %d" % t_timesteps)
                print("total reward: %d" % total_reward)
                reward_list.append(total_reward)
                break
    np.save('reward', reward_list)
def __init__(self, state_size, action_size):
    super(MADDPG, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, 1)
    self.maddpg_agent = [
        Agent(self.state_size, self.action_size, 0),
        Agent(self.state_size, self.action_size, 13)
    ]
def act(self, states):
    # Collect one action per agent for the corresponding per-agent state
    actions = []
    for agent, state in zip(self.maddpg_agent, states):
        action = agent.act(state)
        actions.append(action)
    return actions
def __init__(self, state_size, action_size, random_seed):
    super(maddpg, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    # Since the number of agents is fixed at two, both are
    # hard-coded into the class.
    self.maddpg_agent = [
        Agent(state_size, action_size, random_seed),
        Agent(state_size, action_size, random_seed)
    ]
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, n_agents, state_size, action_size, seed):
    """Initializes a MultiAgent object

    PARAMS
    ======
        n_agents: Number of agents
        state_size: The dimension of the state space
        action_size: The dimension of the action space
        seed: The seed to use
    """
    self.n_agents = n_agents
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed
    self.agents = [
        Agent(self.state_size, self.action_size, self.seed)
        for i in range(n_agents)
    ]
    # Single shared replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Discount factor
    self.Gamma = GAMMA
    self.t_step = 0
def main():
    env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64", no_graphics=True)

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size, action_size)
    scores = train(env, agent, n_episodes=1000)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.savefig('scores.png')

    env.close()
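# NOTE: The helper `train` called above is not shown in this snippet. The sketch
# below is a minimal, assumed implementation for the two-agent Unity Tennis
# environment; it presumes the Agent exposes reset(), act(states) and
# step(states, actions, rewards, next_states, dones), which may differ from the
# original code.
import numpy as np

def train(env, agent, n_episodes=1000):
    brain_name = env.brain_names[0]
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        episode_scores = np.zeros(len(env_info.agents))
        agent.reset()
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            episode_scores += rewards
            states = next_states
            if np.any(dones):
                break
        # Tennis is scored on the better of the two agents per episode
        scores.append(np.max(episode_scores))
    return scores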
def main():
    agent = Agent()
    agent.load()
    total_reward = 0
    obs = env.reset()
    env.render()
    for _ in range(10000):
        act = agent.predict(obs)
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        env.render()
        if done:
            print(f'total_reward: {total_reward}')
            env.close()
            break
def __init__(self, num_agents, state_size, action_size, random_seed):
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.agents = [
        Agent(state_size, action_size, random_seed, i)
        for i in range(num_agents)
    ]
    self.memory = ReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, num_agents, state_size, action_size, random_seed): """ Initialize multiple Agents each with a Actor-Critic network but they share the replay buffer to learn from experience """ self.num_agents = num_agents self.agents = [] for _ in range(num_agents): agent = Agent(state_size, action_size, random_seed) self.agents.append(agent) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_size, action_size, n_agents, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.n_agents = n_agents
    self.seed = random.seed(seed)
    # Actor-Critic agents
    self.ActorCriticAgents = [
        Agent(state_size, action_size, n_agents, seed)
        for _ in range(n_agents)
    ]
    # Replay memory
    self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, seed)
def main():
    with tf.Session() as sess:
        while True:
            try:
                env = CarlaEnv()
                break
            except Exception as e:
                print(e)
        agent = Agent(sess=sess,
                      state_size=env.observation_space.shape[0],
                      action_size=env.action_space.shape[0])
        max_episodes = 1000
        max_steps = 1800
        for i in range(int(max_episodes)):
            state = env.reset()
            print(state.shape)
            ep_reward = 0
            ep_ave_max_q = 0
            # plt.clf()
            # if i:
            #     with open("ddpg_memory.pkl", "wb") as hand:
            #         pickle.dump(replay_buffer, hand)
            #     actor.save_model()
            #     critic.save_model()
            #     print("Agent saved")
            for j in range(int(max_steps)):
                print("epoch: {}, step: {}".format(i, j))
                # env.render()
                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                action = agent.get_action(state)
                # a = controller(s[0], s[1], s[3])
                # a = [a]
                next_state, reward, done, info = env.step(action)
                print("reward: {}".format(reward))
                agent.remember(state, action, reward, done, next_state)
                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                agent.train()
def main():
    env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64")

    # get action_size and state_size
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size, action_size)
    agent.actor_local.load_state_dict(torch.load('files/checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load('files/checkpoint_critic.pth'))

    play(env, agent)
    env.close()
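# NOTE: `play` is not defined in the snippet above. A minimal evaluation-loop
# sketch is given below, assuming the Agent exposes an `act(states)` method and
# that the environment is the two-agent Unity Tennis environment; the original
# implementation may differ.
import numpy as np

def play(env, agent, n_episodes=3):
    brain_name = env.brain_names[0]
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(len(env_info.agents))
        while True:
            actions = agent.act(states)              # greedy policy, no exploration noise
            env_info = env.step(actions)[brain_name]
            scores += env_info.rewards
            states = env_info.vector_observations
            if np.any(env_info.local_done):
                break
        print('Episode {}: score {:.2f}'.format(i_episode, np.max(scores)))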
def __init__(self, num_agents, state_size, action_size, random_seed):
    super(MADDPG, self).__init__()
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.random_seed = random_seed
    self.maddpg_agent = [
        Agent(self.state_size, self.action_size,
              self.num_agents * self.state_size,
              self.num_agents * self.action_size,
              self.random_seed)
        for i in range(self.num_agents)
    ]
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.noise_amplitud = 1
    self.noise_reduction = 0.9995
    self.t_step = 0
def main():
    world = World()
    agent = Agent(state_size=world.state_size, action_size=world.action_size)
    while True:
        loop(agent, world)
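# NOTE: `loop` and `World` are not shown above. The sketch below is purely
# illustrative and rests on assumptions: a gym-like World with reset() and
# step(action), and an Agent with act() and step(...) methods.
def loop(agent, world):
    state = world.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = world.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state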
from env.pid import PidEnv
import numpy as np
from ddpg import Agent
from OUNoise import Noise
import matplotlib.pyplot as plt

env = PidEnv(setpoint=20)
batch_size = 128
rewards = []

agent = Agent(num_states=5, num_actions=3)
noise = Noise(num_actions=3)

for episode in range(30):
    state = env.reset()
    noise.reset()
    eps_reward = 0
    for step in range(500):
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward = env.step(action)
        agent.mem.push(state, action, reward, new_state)
        agent.learn(batch_size)
        state = new_state
        eps_reward += reward
    rewards.append(eps_reward)
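# NOTE: The `Noise` class imported from OUNoise above is not shown. Below is a
# minimal Ornstein-Uhlenbeck sketch with the same interface (reset(),
# get_action(action, step)); the parameter values are assumptions, not the
# original ones.
import numpy as np

class Noise:
    def __init__(self, num_actions, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=500):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.num_actions = num_actions
        self.reset()

    def reset(self):
        # Restart the OU process at its mean
        self.state = np.ones(self.num_actions) * self.mu

    def evolve_state(self):
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.randn(self.num_actions))
        self.state = self.state + dx
        return self.state

    def get_action(self, action, step=0):
        # Anneal sigma over the episode and add the OU sample to the action
        self.sigma = (self.max_sigma
                      - (self.max_sigma - self.min_sigma)
                      * min(1.0, step / self.decay_period))
        return action + self.evolve_state()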
import gym
import os
import numpy as np
from ddpg import Agent
from utils import plot_learning_curve

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    agent = Agent(input_dims=env.observation_space.shape, env=env,
                  n_actions=env.action_space.shape[0])
    n_episodes: int = 250
    base_dir: str = os.path.dirname(__file__)
    figure_file = os.path.abspath(os.path.join(base_dir, 'plots/pendulum.png'))

    best_score = env.reward_range[0]
    score_history = []
    load_checkpoint = False

    if load_checkpoint:
        n_steps = 0
        while n_steps <= agent.batch_size:
            observation = env.reset()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            n_steps += 1
        agent.learn()
        agent.load_models()
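# NOTE: `plot_learning_curve` from utils is not shown above. A plausible sketch,
# assuming the common (x, scores, figure_file) signature, plots a running
# average of the previous 100 scores:
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, figure_file):
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i - 100):i + 1])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)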
env = gym.make('Pendulum-v0')
env.reset()
env.render()

params = {
    'env': env,
    'gamma': 0.99,
    'actor_lr': 0.001,
    'critic_lr': 0.001,
    'tau': 0.02,
    'capacity': 10000,
    'batch_size': 32,
}

agent = Agent(**params)

for episode in range(100):
    s0 = env.reset()
    episode_reward = 0
    for step in range(500):
        env.render()
        a0 = agent.act(s0)
        s1, r1, done, _ = env.step(a0)
        agent.put(s0, a0, r1, s1)
        episode_reward += r1
        s0 = s1
        agent.learn()
from collections import deque

import gym
import numpy as np

from ddpg import Agent

env_name = 'Pendulum-v0'
env = gym.make(env_name)
env = env.unwrapped
env.seed(1)

state_shape = env.observation_space.shape
num_actions = env.action_space.shape[0]

MAX_EPISODES = 10000
MAX_STEPS = 500
n_iter = 0
action_scale = env.action_space.high[0]

learner = Agent(state_shape, num_actions, action_scale)

def exploration(mu, scale, size=None):
    return np.random.normal(mu, scale, size)

episode_history = deque(maxlen=100)
for i in xrange(MAX_EPISODES):
    # initialize
    state = env.reset()
    total_rewards = 0
    noise = exploration(0.0, 0.2, MAX_STEPS)
from utils import fetch_protein
from protein import ProteinState
from ddpg import Agent, ReplayBuffer

EPISODES = 10000
STEPS = 500

if __name__ == "__main__":
    goal_state = fetch_protein("2jof")
    state_dim = goal_state.n_residues() * 2
    action_dim = goal_state.n_residues() * 2
    buffer = ReplayBuffer(10000)
    agent = Agent(state_dim, action_dim, (0, 360))

    for _ in range(EPISODES):
        data = {"state": ProteinState(n_residues=goal_state.n_residues())}
        for _ in range(STEPS):
            action = agent.get_action(data["state"])
            next_state = data["state"].do_action(action)
            reward = data["state"].eval_state() - next_state.eval_state()
            buffer.append(data["state"], action, reward, next_state)
            agent.update(buffer)
            print(data["state"].l2_norm(goal_state))
            data["state"] = ProteinState(angles=next_state.angles())
def execute_ddpg(ddpg_agent: Agent,
                 num_episodes: int = 3000,
                 max_episode_t: int = 2000,
                 learn_each: int = 5,
                 consec_learn_iter: int = 10) -> list:
    """
    DDPG - Execution Algorithm Implementation
    :param ddpg_agent: agent in charge of controlling both Actor and Critic neural networks' behaviour
    :param num_episodes: number of episodes the algorithm will train
    :param max_episode_t: maximum number of time steps to play at each episode
    :param learn_each: steps in an episode before triggering the learning procedure
    :param consec_learn_iter: number of consecutive learning iterations
    :return: results obtained during the training procedure
    """
    # 1| Initialization
    global_score = []
    global_score_deque = deque(maxlen=100)

    # 2| Episode run
    for i_episode in range(1, num_episodes + 1):
        # 2.0| Initialization of episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        ddpg_agent.reset()

        # 2.1| Episode run
        for t_step in range(max_episode_t):
            # 2.1.1| Agent decision and interaction
            actions = ddpg_agent.act(states)
            env_info = env.step(actions)[brain_name]

            # 2.1.2| Feedback on action
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # 2.1.3| Experience saving
            for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                               next_states, dones):
                ddpg_agent.memorize(state, action, reward, next_state, done)

            # 2.1.4| Update values
            scores += rewards
            states = next_states

            # 2.1.5| Agent learning
            if t_step % learn_each == 0:
                for _ in range(consec_learn_iter):
                    ddpg_agent.trigger_learning()

            # 2.1.6| Episode ending
            if np.any(dones):
                break

        # 2.2| Episode post-processing
        # 2.2.1| Scoring
        global_score_deque.append(np.max(scores))
        global_score.append(np.max(scores))

        if i_episode % 10 == 0:
            print('Episode {}\tTotal Average Score: {:.2f}\tMean: {:.2f}'.format(
                i_episode, np.mean(global_score_deque), np.mean(scores)))

        if i_episode % 50 == 0:
            torch.save(ddpg_agent.actor_local.state_dict(),
                       model_dir + 'checkpoint__actor_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.actor_target.state_dict(),
                       model_dir + 'checkpoint__actor_target__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_local.state_dict(),
                       model_dir + 'checkpoint__critic_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_target.state_dict(),
                       model_dir + 'checkpoint__critic_target__episode_' + str(i_episode) + '.pth')

        if np.mean(global_score_deque) >= 0.5 and i_episode >= 100:
            print('\rEpisode employed for completing the challenge {}'.format(i_episode))
            torch.save(ddpg_agent.actor_local.state_dict(),
                       model_dir + 'checkpoint__actor_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.actor_target.state_dict(),
                       model_dir + 'checkpoint__actor_target__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_local.state_dict(),
                       model_dir + 'checkpoint__critic_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_target.state_dict(),
                       model_dir + 'checkpoint__critic_target__episode_' + str(i_episode) + '.pth')
            break

    return global_score
env = CityLearn(data_path, building_attributes, weather_file, solar_profile,
                building_ids, buildings_states_actions=building_state_actions,
                cost_function=objective_function)
observations_spaces, actions_spaces = env.get_state_action_spaces()

# Provides information on building type, climate zone, annual DHW demand,
# annual cooling demand, annual electricity demand, solar capacity, and
# correlations among buildings
building_info = env.get_building_information()

# RL CONTROLLER
# Instantiating the control agent(s)
agents = Agent(env, building_info, observations_spaces, actions_spaces)

# Select the number of episodes for training. In the final run this value is
# set to 1 (the buildings run for one year).
episodes = 10

k, c = 0, 0
cost, cum_reward = {}, {}

start = time.time()
# The number of episodes can be replaced by a stopping criterion
# (i.e. convergence of the average reward)
for e in range(episodes):
    cum_reward[e] = 0
    rewards = []
    state = env.reset()
    done = False
    while not done:
        if k % (1000) == 0:
if __name__ == "__main__":
    scores_mat = {}
    average_scores_CV = []
    parameters = {}
    env = gym.make('LunarLanderContinuous-v2')

    for j in range(10):
        score_j = []
        param_dist = {'alpha': float(np.random.uniform(0.00009, 0.000009, 1)),
                      'beta': float(np.random.uniform(0.0009, 0.00009, 1)),
                      'tau': float(np.random.uniform(0.009, 0.0007, 1)),
                      'gamma': float(np.random.uniform(1, 0.95, 1))}
        agent = Agent(alpha=param_dist.get('alpha'), beta=param_dist.get('beta'),
                      input_dims=[8], tau=param_dist.get('tau'), env=env,
                      gamma=param_dist.get('gamma'), batch_size=64,
                      layer1_size=400, layer2_size=300, n_actions=2)
        print('Iteration {} runs with the following values: alpha={}, beta={}, tau={}, gamma={}'.format(
            j, param_dist.get('alpha'), param_dist.get('beta'),
            param_dist.get('tau'), param_dist.get('gamma')))
        parameters[j] = [param_dist.get('alpha'), param_dist.get('beta'),
                         param_dist.get('tau'), param_dist.get('gamma')]
        # agent.load_models()
        n_episodes = 1000
num_agents = len(env_info.agents)
print('number of agents: ', num_agents)

action_size = brain.vector_action_space_size
print('action size: ', action_size)

states = env_info.vector_observations
state_size = states.shape[1]
print('state size: ', state_size)

#--------------------------------------------------------------------------------------
#buffer_type = 'standard'
buffer_type = 'prioritized'

agent = Agent(state_size, action_size, buffer_type)

Nepisodes = 1500
Nsteps = 1000

#--------------------------------------------------------------------------------------
def train_agent():
    mean_ep_rewards = []
    for ep in range(Nepisodes):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
def reset(self):
    # Reset the exploration noise of every agent
    for agent in self.maddpg_agent:
        agent.reset()
import gym
import numpy as np
from ddpg import Agent
from utils import plotLearning

env = gym.make('Pendulum-v0')
agent = Agent(alpha=0.0001, beta=0.001, input_dims=[3], tau=0.001,
              env=env, n_actions=1)

np.random.seed(0)
score_history = []

for episode in range(1000):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, int(done))
        agent.learn()
env.seed(1)
start = time.time()

# Hyperparameters
params = {
    'env': env,
    'gamma': 0.99,                               # reward discount factor
    'actor_lr': 0.001,                           # learning rate
    'critic_lr': 0.001,
    'tau': 0.02,                                 # soft-update coefficient
    'capacity': 10000,                           # replay buffer capacity
    'batch_size': 32,                            # minibatch size for SGD on replayed experience
    'train_with_render': True,                   # whether to render during training
    'save_reward': -800,                         # stop training and save the model once the episode reward reaches this value
    'actor_model_path': 'model/DDPG_actor.pt',   # model save paths
    'critic_model_path': 'model/DDPG_critic.pt',
    'Reset_parameters': False,                   # whether to train from scratch
}
agent = Agent(**params)
agent.train_model(200)   # train for this many episodes
# agent.test_model(3)    # test for this many episodes

train_time = time.time() - start
env.close()
print("Time: %.4f" % train_time)

# To inspect the loss curves, run in a terminal:
# tensorboard --logdir=./log --port=6007
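# NOTE: Not part of the snippet above. As a reminder of what the `tau` parameter
# controls, here is a minimal sketch of the standard DDPG soft target update,
# assuming PyTorch modules `local_net` and `target_net`:
def soft_update(local_net, target_net, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)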
from env.pid_env import PidEnvSingle
import torch
import numpy as np
from ddpg import Agent
from OUNoise import Noise
import matplotlib.pyplot as plt

batch_size = 128
rewards = []
avg_rewards = []

env = PidEnvSingle()
agent = Agent(num_states=2, num_actions=3, gamma=0.99)
agent2 = Agent(num_states=2, num_actions=3, gamma=0.99)
noise = Noise(num_actions=3)

zeros = [0]
normalized = []
all_steps = [-1] * 10
inlook = []
metalearn = False
random = False
setpoints = []
total_steps = 600

if metalearn:
    for i in range(20):
        curr = 20 if not random else np.random.random() * 100
        setpoints.append(curr)
    agent.metalearn(setpoints)
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
print('number of agents: ', num_agents)

action_size = brain.vector_action_space_size
print('action size: ', action_size)

states = env_info.vector_observations
state_size = states.shape[1]
print('state size: ', state_size)

#--------------------------------------------------------------------------------------
agent = Agent(state_size, action_size)

Nepisodes = 5000
Nsteps = 5000

#--------------------------------------------------------------------------------------
def train_agent():
    mean_ep_rewards = []
    ibackup = 0
    thresh = 0.05
    for ep in range(Nepisodes):
from ddpg import Agent
import gym
import numpy as np

env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env,
              batch_size=64, layer1_size=400, layer2_size=300, n_actions=2)

np.random.seed(42)
score_history = []

for i in range(1000):
    done = False
    score = 0
    obs = env.reset()
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
    score_history.append(score)
    print("Episode - {} Score - {} 100 game average {}".format(
        i, score, np.mean(score_history[-100:])))
    if i % 25 == 0:
        agent.save_models()

filename = l
#%% Load Tennis environment
env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64")

# Get brain information
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
action_size = brain.vector_action_space_size

# Environment information
env_info = env.reset(train_mode=False)[brain_name]
state_size = env_info.vector_observations.shape[1]
num_agents = len(env_info.agents)

#%% DDPG - Agent Training
agent = Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
score = execute_ddpg(ddpg_agent=agent)

# Plot results
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(score) + 1), score)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

#%% Environment - Close
env.close()