            else:
                state = next_state

        # Update monitoring variables & params for the next episode
        scores.append(score)
        print('Episode/Test {} average score: {}'.format(e, score))

    return scores


if __name__ == "__main__":
    # set environment and get state & action size
    env, brain_name, state_size, action_size, num_agents = defineEnvironment(
        path, verbose=True)

    # define agent
    agent = Agent(state_size, action_size, num_agents,
                  SEED, GAMMA, TAU, LR_ACTOR, LR_CRITIC,
                  BUFFER_SIZE, BUFFER_TYPE, POLICY_UPDATE)

    if mode == 'train':
        # train
        scores, checkpoint = train_agent(agent, env, brain_name,
                                         n_episodes=EPISODES,
                                         batch_size=BATCH_SIZE)
        # export data
        with open(results_filename, 'wb') as f:
            pickle.dump([scores, checkpoint], f)
    elif mode == 'evaluation':
        weights_filename = 'weights/twenty_agents/actor_batch64_model_weights.pth'
        agent.actor.load_state_dict(torch.load(weights_filename))
        agent.actor.eval()
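# --- Hedged sketch (assumption): defineEnvironment is called above but not shown.
# This is a minimal version that wraps UnityEnvironment and returns the tuple the
# caller unpacks; the original implementation may differ.
from unityagents import UnityEnvironment

def defineEnvironment(path, verbose=False):
    env = UnityEnvironment(file_name=path)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]
    if verbose:
        print('Agents: {}, state size: {}, action size: {}'.format(
            num_agents, state_size, action_size))
    return env, brain_name, state_size, action_size, num_agents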
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
# print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
# print('The state for the first agent looks like:', states[0])

seed = 0
agent = Agent(state_size, action_size, seed)


def ddpg(n_episodes=2000, max_t=1000):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        episode_scores = np.zeros(num_agents)
        for t in range(max_t):
            actions = agent.act(states)
""" ################################### STEP 5: Initialize DDPG Agents from the Agent Class in dqn_agent.py A DDPG agent initialized with the following parameters. ====== state_size (int): dimension of each state (required) action_size (int): dimension of each action (required) num_agents (int): number of agents in the unity environment seed (int): random seed for initializing training point (default = 0) Here we initialize two agents We set the states size to 48 (24*2), so we can feed each agent boths agent's state observations. """ #Initialize Agent agent_1 = Agent(state_size=48, action_size=action_size, num_agents=1, random_seed=0) agent_2 = Agent(state_size=48, action_size=action_size, num_agents=1, random_seed=0) # Load trained model weights for agent 1 agent_1.actor_local.load_state_dict(torch.load('ddpgActor1_Model.pth')) agent_1.critic_local.load_state_dict(torch.load('ddpgCritic1_Model.pth')) # Load trained model weights for agent 2 agent_2.actor_local.load_state_dict(torch.load('ddpgActor2_Model.pth')) agent_2.critic_local.load_state_dict(torch.load('ddpgCritic2_Model.pth')) """ ################################### STEP 6: Play Banana for specified number of Episodes """ # loop from num_episodes
    def __init__(self):
        super(MultiAgentDDPG, self).__init__()
        self.config = Config()
        self.agents = [Agent() for _ in range(self.config.num_agents)]
        self.buffer = ReplayBuffer()
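# --- Hedged sketch (assumption, not this repo's actual class): a minimal uniform
# replay buffer matching the no-argument ReplayBuffer() construction above.
# Capacity and batch-size defaults are illustrative.
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    def __init__(self, buffer_size=int(1e6), batch_size=128):
        self.memory = deque(maxlen=buffer_size)  # oldest entries dropped when full
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)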
        self.lr_critic = 1e-4
        self.discount_rate = 0.99  # discount factor
        self.tau = 1e-3
        self.weight_decay = 0
        self.theta = 0.15
        self.sigma = 0.2

config = Config()

from collections import deque
import matplotlib.pyplot as plt
from ddpg_agent import Agent

agent1 = Agent(state_size=config.state_size, action_size=config.action_size,
               random_seed=1, config=config)
agent2 = Agent(state_size=config.state_size, action_size=config.action_size,
               random_seed=0, config=config)

wandb.watch((agent1.actor_local, agent1.critic_local))

from tqdm import tqdm


def combine_state_action(states, actions=np.zeros((1, config.action_size))):
    states = np.expand_dims(states, 0)
    augmented_state = np.hstack([states, actions])
    return augmented_state
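# --- Hedged sketch (assumption): Ornstein-Uhlenbeck exploration noise showing how
# the theta/sigma values in Config above are typically consumed. The class name and
# interface mirror common DDPG implementations, not necessarily this repo's.
import copy
import numpy as np

class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta   # pull strength toward the mean
        self.sigma = sigma   # scale of the random perturbation
        np.random.seed(seed)
        self.reset()

    def reset(self):
        self.state = copy.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state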
class MADDPG():
    def __init__(self, state_size, action_size, random_seed):
        """Initialize 2 Agent objects.

        Params
        ======
            state_size (int): dimension of one agent's observation
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Initialize the agents
        self.ddpg_agent0 = Agent(state_size, action_size, random_seed=0)
        self.ddpg_agent1 = Agent(state_size, action_size, random_seed=1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, states, rand=False):
        """Agents act with actor_local."""
        if not rand:
            action0 = self.ddpg_agent0.act(states[0])
            action1 = self.ddpg_agent1.act(states[1])
            return [action0, action1]
        actions = np.random.randn(2, 2)
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, learn=True):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        state0, state1 = states[0], states[1]
        action0, action1 = actions[0], actions[1]
        reward0, reward1 = rewards[0], rewards[1]
        next_state0, next_state1 = next_states[0], next_states[1]
        done0, done1 = dones[0], dones[1]
        self.memory.add(state0, state1, action0, action1, reward0, reward1,
                        next_state0, next_state1, done0, done1)

        if learn and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1 = experiences

        # next actions (for the CRITIC network)
        a_next0 = self.ddpg_agent0.actor_target(next_s0)
        a_next1 = self.ddpg_agent1.actor_target(next_s1)

        # action predictions (for the ACTOR network)
        a_pred0 = self.ddpg_agent0.actor_local(s0)
        a_pred1 = self.ddpg_agent1.actor_local(s1)

        # The DDPG agents learn separately, each from its own perspective,
        # which is why the states, actions, etc. are swapped between the calls.
        self.ddpg_agent0.learn(s0, s1, a0, a1, r0, r1, next_s0, next_s1,
                               d0, d1, a_next0, a_next1, a_pred0, a_pred1)
        self.ddpg_agent1.learn(s1, s0, a1, a0, r1, r0, next_s1, next_s0,
                               d1, d0, a_next1, a_next0, a_pred1, a_pred0)
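# --- Hedged sketch (assumption): what each Agent.learn() plausibly does with the
# swapped arguments above, following the usual centralized-critic MADDPG recipe:
# the critic conditions on both agents' states and actions, the actor only on its
# own predicted action. The attribute names (critic_local, critic_target,
# *_optimizer) and the detach choices are assumptions; this def belongs inside the
# assumed Agent class.
import torch
import torch.nn.functional as F

def learn(self, s_self, s_other, a_self, a_other, r_self, r_other,
          ns_self, ns_other, d_self, d_other,
          a_next_self, a_next_other, a_pred_self, a_pred_other):
    # Centralized critic: joint next observation/action for the TD target
    q_next = self.critic_target(torch.cat([ns_self, ns_other], dim=1),
                                torch.cat([a_next_self, a_next_other], dim=1))
    q_target = r_self + GAMMA * q_next.detach() * (1 - d_self)
    q_expected = self.critic_local(torch.cat([s_self, s_other], dim=1),
                                   torch.cat([a_self, a_other], dim=1))
    critic_loss = F.mse_loss(q_expected, q_target)
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    # Actor: ascend the critic's value of this agent's predicted action,
    # treating the other agent's sampled action as fixed
    actor_loss = -self.critic_local(
        torch.cat([s_self, s_other], dim=1),
        torch.cat([a_pred_self, a_other], dim=1)).mean()
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()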
def create_agent(memory):
    return Agent(state_size=states.shape[1],
                 action_size=brain.vector_action_space_size,
                 random_seed=random_seed,
                 memory=memory,
                 batch_size=128)
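# --- Hedged usage sketch (assumption): passing the replay memory into the factory
# above suggests sharing one buffer across several agents. The ReplayBuffer
# constructor arguments and the agent count here are illustrative only.
shared_memory = ReplayBuffer(buffer_size=int(1e6), batch_size=128)
n_agents = 2  # illustrative
agents = [create_agent(shared_memory) for _ in range(n_agents)]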
        self.batch_size = 128
        self.lr_actor = 1e-4
        self.lr_critic = 1e-4
        self.discount_rate = 0.99
        self.tau = 1e-3
        self.weight_decay = 0

config = Config()

from collections import deque
import matplotlib.pyplot as plt
from ddpg_agent import Agent

agent = Agent(state_size=state_size, action_size=action_size,
              random_seed=1, config=config)

from tqdm import tqdm


# Defining the main training loop.
def ddpg(n_episodes=300, max_t=1000):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    for i_episode in tqdm(range(1, n_episodes + 1)):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = np.zeros(config.no_agents)
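# --- Hedged sketch (assumption): how the tau value in Config above is typically
# consumed: a Polyak soft update that makes the target networks slowly track the
# local ones. This mirrors the standard DDPG recipe, not necessarily this repo's
# exact helper.
def soft_update(local_model, target_model, tau):
    """target = tau * local + (1 - tau) * target, parameter-wise."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)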
env_info = env.reset(train_mode=True)[brain_name]

num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent = Agent(state_size, action_size, random_seed=0)
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

scores_total = 0
NUM_GAMES = 50
max_time = 1000

for _ in range(NUM_GAMES):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    score = np.zeros(num_agents)
    for t in range(max_time):
        actions = []
        for j in range(num_agents):
            actions.append(agent.act(state[j], add_noise=False))
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

random_seed = 1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
scores = np.zeros(num_agents)

if torch.cuda.is_available():
    trained_model = torch.load('checkpoint_actor.pth')
else:
    trained_model = torch.load('checkpoint_actor.pth',
                               map_location={'cuda:0': 'cpu'})

agent = Agent(state_size=state_size, action_size=action_size,
              random_seed=random_seed)
agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
agent.actor_local.load_state_dict(trained_model)

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)

while True:
    action = agent.act(states, add_noise=False)
    env_info = env.step(action)[brain_name]
    states = env_info.vector_observations  # get next state (for each agent)
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# number of actions
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
# print('States look like:', state)
state_size = states.shape[1]
print('States have length:', state_size)

agent_0 = Agent(state_size=state_size, action_size=action_size,
                num_agents=1, random_seed=0)
agent_1 = Agent(state_size=state_size, action_size=action_size,
                num_agents=1, random_seed=0)

agent_0.actor_local.load_state_dict(torch.load(result.actor_0_model, map_location='cpu'))
agent_0.critic_local.load_state_dict(torch.load(result.critic_0_model, map_location='cpu'))
agent_1.actor_local.load_state_dict(torch.load(result.actor_1_model, map_location='cpu'))
agent_1.critic_local.load_state_dict(torch.load(result.critic_1_model, map_location='cpu'))

# Set environment to evaluation mode
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
from ddpg_agent import Agent
from collections import deque

if __name__ == "__main__":
    # Get the default financial and AC Model parameters
    financial_params, ac_params = utils.get_env_param()
    print(financial_params)
    print(ac_params)

    # Create simulation environment
    env = sca.MarketEnvironment()

    # Initialize feed-forward DNNs for the Actor and Critic models.
    agent1 = Agent(state_size=env.observation_space_dimension(),
                   action_size=env.action_space_dimension(),
                   random_seed=1225)
    agent2 = Agent(state_size=env.observation_space_dimension(),
                   action_size=env.action_space_dimension(),
                   random_seed=108)

    # Set the liquidation time
    lqt = 60

    # Set the number of trades
    n_trades = 60

    # Set trader's risk aversion
    tr1 = 1e-6
    tr2 = 1e-6

    # Set the number of episodes to run the simulation
# number of agents
num_agents = len(env_info.agents)
print("Number of agents:", num_agents)

# size of each action
action_size = brain.vector_action_space_size
print("Size of each action:", action_size)

# examine the state space
states = env_info.vector_observations
cstate_size = states.shape[1]
print("There are {} agents. Each observes a state with length: {}".format(
    states.shape[0], cstate_size))
print("The state for the first agent looks like:", states[0])

agent = Agent(state_size=cstate_size, action_size=action_size, random_seed=42)


def ddpg(n_episodes=5000, max_t=2000, print_every=10):
    scores_deque = deque(maxlen=print_every)
    t_scores = []
    agent.reset()
    print(str(datetime.datetime.now()) + " Training started")
    for i_episode in range(0, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)          # initialize the score (for each agent)
        dones = env_info.local_done            # see if episode finished
        for _ in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]  # send actions to the environment
def train(env_pth, n_episodes=500, output='output'):
    BATCH_SIZE = 64  # minibatch size
    SEED = 2

    os.makedirs(output, exist_ok=True)

    # load environment
    env = UnityEnvironment(file_name=env_pth)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info(f'There are {states.shape[0]} agent(s). Each observes a state with length: {state_size}')
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # create agent
    agent = Agent(state_size, action_size, SEED, BATCH_SIZE)

    def ddpg(n_episodes, average_window=100, output='output'):
        scores_deque = deque(maxlen=average_window)
        scores_all = []
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
            states = env_info.vector_observations  # get the current state (for each agent)
            scores = np.zeros(num_agents)          # initialize the score (for each agent)
            while True:
                actions = agent.act(states)                 # select an action (for each agent)
                env_info = env.step(actions)[brain_name]    # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards                  # get reward (for each agent)
                dones = env_info.local_done                 # see if episode finished
                for state, action, reward, next_state, done in zip(
                        states, actions, rewards, next_states, dones):
                    agent.step(state, action, reward, next_state, done)
                scores += rewards       # update the score (for each agent)
                states = next_states    # roll over states to next time step
                if np.any(dones):       # exit loop if episode finished
                    break
            average_score_episode = np.mean(scores)
            scores_deque.append(average_score_episode)
            scores_all.append(average_score_episode)
            average_score_queue = np.mean(scores_deque)

            logger.info(f'\rEpisode {i_episode}\tScores: {average_score_episode:.2f}\tAverage Score: {average_score_queue:.2f}')
            torch.save(agent.actor_local.state_dict(), f'{output}/checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), f'{output}/checkpoint_critic.pth')
            if i_episode > average_window and average_score_queue > 30:
                break
        return scores_all

    scores = ddpg(n_episodes=n_episodes, output=output)
    plot_rewards(scores, output)
    env.close()
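# --- Hedged sketch (assumption): plot_rewards is called above but not shown; a
# minimal matplotlib version that saves the score curve to the output directory.
import matplotlib.pyplot as plt

def plot_rewards(scores, output):
    fig, ax = plt.subplots()
    ax.plot(range(1, len(scores) + 1), scores)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Average score')
    fig.savefig(f'{output}/rewards.png')
    plt.close(fig)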
env = UnityEnvironment(file_name='./Reacher_Linux/Reacher',
                       no_graphics=not args.visualize and not args.watch_one_episode)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

agent = Agent(state_size=len(env_info.vector_observations[0]),
              action_size=brain.vector_action_space_size,
              actor_hidden_layers=args.actor_hidden_layers,
              critic_hidden_layers=args.critic_hidden_layers,
              ou_theta=args.ou_theta,
              ou_sigma=args.ou_sigma,
              ou_theta_decay=args.ou_theta_decay,
              ou_sigma_decay=args.ou_sigma_decay,
              energy_penalty=args.energy_penalty,
              random_seed=0)

if args.load:
    load_agent()
if args.watch_one_episode:
    watch_one_episode(args.slow_by)
if args.train:
    scores = ddpg(n_episodes=args.n_episodes,
                  slow_every=args.slow_every,
                  slow_by=args.slow_by)
    print(scores)
    outfile = open('scores.txt', 'w')
import gym
import torch
import numpy as np
from ddpg_agent import Agent
import matplotlib.pyplot as plt
from noise import OUNoise

env = gym.make('BipedalWalker-v3')
state_dim = int(env.observation_space.shape[0])
action_dim = int(env.action_space.shape[0])
agent = Agent(state_size=state_dim, action_size=action_dim)


def ddpg(episodes, step, pretrained=False, noise=True):
    if pretrained:
        agent.actor_local.load_state_dict(
            torch.load('./models/weights/checkpoint_actor.pth', map_location="cpu"))
        agent.critic_local.load_state_dict(
            torch.load('./models/weights/checkpoint_critic.pth', map_location="cpu"))
        agent.actor_target.load_state_dict(
            torch.load('./models/weights/checkpoint_actor.pth', map_location="cpu"))
        agent.critic_target.load_state_dict(
            torch.load('./models/weights/checkpoint_critic.pth', map_location="cpu"))
    reward_list = []
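    # --- Hedged sketch (assumption): a plausible continuation of ddpg() above, whose
    # original body is truncated here. The agent.act add_noise keyword and the classic
    # four-value gym step API (pre-Gymnasium) are assumptions.
    for episode in range(episodes):
        state = env.reset()
        score = 0
        for t in range(step):
            action = agent.act(state, add_noise=noise)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        reward_list.append(score)
        print('Episode {} reward: {:.2f}'.format(episode, score))
    return reward_list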
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# number of actions and state size
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Instantiate the agent
agent = Agent(state_size=state_size, action_size=action_size * 2, random_seed=64)

# Load the weights from file
agent.actor_local.load_state_dict(torch.load(actor_file))
agent.critic_local.load_state_dict(torch.load(critic_file))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]             # get the current state
score = 0                                           # initialize the score

while True:
    action = agent.act(state, False)              # don't add noise during test!
    env_info = env.step(action)[brain_name]       # send the action to the environment
    next_state = env_info.vector_observations[0]  # get the next state
    reward = env_info.rewards[0]                  # get the reward
def ddpg(n_episodes=500, max_t=200, train_mode=True):
    env = UnityEnvironment(file_name='./1_agent/Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]
    states = env_info.vector_observations
    agent = Agent(state_size=states.shape[1], action_size=action_size, random_seed=2)

    scores = []
    scores_deque = deque(maxlen=100)
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        num_agents = len(env_info.agents)
        # agent.reset()
        score = 0
        states = env_info.vector_observations
        # while True:
        for t in range(max_t):
            agent.reset()
            actions = agent.act(states)
            # actions = np.clip(actions, -1, 1)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            # rewards = [1.0 if x > 0.0 else 0.0 for x in rewards]
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(env_info.rewards)
            if np.any(dones):
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
            i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_deque), score), end="")
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    env.close()
    return scores
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
# print('The state for the first agent looks like:', states[0])

agent = Agent(state_size=state_size, action_size=action_size,
              num_agents=num_agents, random_seed=2)


def save_model(filename):
    torch.save(agent.actor_local.state_dict(), '{}_actor.pth'.format(filename))
    torch.save(agent.critic_local.state_dict(), '{}_critic.pth'.format(filename))


def ddpg(n_episodes=2000, print_every=100, save_every=100):
    avg_solved = 0
    scores_deque = deque(maxlen=100)
    scores_global = []
    avg_global = []
financial_params
ac_params

import numpy as np
import syntheticChrissAlmgren as sca
from ddpg_agent import Agent
from collections import deque

# Create simulation environment
env = sca.MarketEnvironment()

# Initialize feed-forward DNNs for the Actor and Critic models.
agent = Agent(state_size=env.observation_space_dimension(),
              action_size=env.action_space_dimension(),
              random_seed=0)

# Set the liquidation time
lqt = 60

# Set the number of trades
n_trades = 60

# Set trader's risk aversion
tr = 1e-6

# Set the number of episodes to run the simulation
episodes = 10000

shortfall_hist = np.array([])
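# --- Hedged sketch (assumption): a plausible episode loop over the Almgren-Chriss
# market environment using the parameters above. The reset/step signatures,
# start_transactions helper, and info.implementation_shortfall attribute are
# assumptions modeled on the usual DDPG interaction pattern; the actual
# sca.MarketEnvironment API may differ.
for episode in range(episodes):
    cur_state = env.reset(seed=episode, liquid_time=lqt,
                          num_trades=n_trades, lamb=tr)  # assumed signature
    env.start_transactions()                             # assumed helper
    agent.reset()
    while True:
        action = agent.act(cur_state, add_noise=True)
        new_state, reward, done, info = env.step(action)
        agent.step(cur_state, action, reward, new_state, done)
        cur_state = new_state
        if done:
            shortfall_hist = np.append(shortfall_hist,
                                       info.implementation_shortfall)
            break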
env1_path = Path("./Reacher_Windows_x86_64_v1/Reacher.app")
env2_path = Path("./Reacher_Windows_x86_64_v2/Reacher.app")
env_path = str(env1_path.resolve())
env = UnityEnvironment(file_name=env_path)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
state_size = env_info.vector_observations.shape[1]
action_size = env_info.previous_vector_actions.shape[1]

NUM_AGENTS = 20
SEED = 72

agent = Agent(state_size=state_size, action_size=action_size, random_seed=SEED)
writer = tbx.SummaryWriter()


def ddpg(n_episodes=2000, max_t=1000):
    scores_deque = deque(maxlen=100)
    scores = []
    timesteps = 0
    tbx_counter = 0
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        state = env.reset(train_mode=True)[brain_name].vector_observations
        agent.reset()
        score = 0
        for t in range(max_t):
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', action='store_true',
                    help='train agent locally')
parser.add_argument('--test', dest='test', action='store_true',
                    help='test agent locally')
args = parser.parse_args()

agent = Agent(state_size=state_size, action_size=action_size, random_seed=0)


def ddpg(n_episodes=300, max_t=1000, solved_score=30.0, print_every=100):
    scores_window = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        score = np.zeros(num_agents)           # initialize the score (for each agent)
        agent.reset()
        for t in range(max_t):
            actions = agent.act(states, add_noise=True)  # select an action (for each agent)
            actions = np.clip(actions, -1, 1)            # all actions between -1 and 1
            env_info = env.step(actions)[brain_name]     # send all actions to the environment
            next_states = env_info.vector_observations   # get next state (for each agent)
            rewards = env_info.rewards                   # get reward (for each agent)
def main():
    # load version 1 of the environment (the path below points at version 1)
    env_name = "Reacher_Windows_x86_64_version1\\Reacher.exe"  # Unity environment file name
    env = UnityEnvironment(file_name=env_name)

    # Environments contain brains which are responsible for deciding the actions of
    # their associated agents. Here we check for the first brain available, and set
    # it as the default brain we will be controlling from Python.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print("Number of agents:", num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print("Size of each action:", action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print("There are {} agents. Each observes a state with length: {}".format(
        states.shape[0], state_size))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    random_seed = 12345  # 10
    agent = Agent(state_size, action_size, random_seed, device=device)

    actor_state_dict = torch.load("checkpoint_actor.pth")
    agent.actor_local.load_state_dict(actor_state_dict)
    critic_state_dict = torch.load("checkpoint_critic.pth")
    agent.critic_local.load_state_dict(critic_state_dict)

    # Run the trained agent (no noise) in the environment
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)          # initialize the score (for each agent)
    while True:
        actions = agent.act(states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[brain_name]      # send all actions to the environment
        next_states = env_info.vector_observations    # get next state (for each agent)
        rewards = env_info.rewards                    # get reward (for each agent)
        dones = env_info.local_done                   # see if episode finished
        scores += env_info.rewards                    # update the score (for each agent)
        states = next_states                          # roll over states to next time step
        if np.any(dones):                             # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(
        np.mean(scores)))

    # When finished, you can close the environment
    env.close()
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# double the state size so each agent can be fed both agents' observations
state_size = state_size * 2

agent_1 = Agent(state_size=state_size, action_size=action_size, random_seed=1)
agent_2 = Agent(state_size=state_size, action_size=action_size, random_seed=1)

actor_1_weights = "actor_1_model.pth"
actor_2_weights = "actor_2_model.pth"
critic_1_weights = "critic_1_model.pth"
critic_2_weights = "critic_2_model.pth"


def ddpg(n_episodes=100000, max_t=20000):
    scores_deque = deque(maxlen=100)
    total_scores = []
    average_scores = []
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of actions and state size
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Instantiate the agent
agent = Agent(state_size=state_size, action_size=action_size * 2, random_seed=64)


def ddpg(n_episodes=10000, max_t=10000):
    """DDPG.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
def train(
    n_episodes,
    max_t,
    env_fp,
    no_graphics,
    seed,
    save_every_nth,
    buffer_size,
    batch_size,
    gamma,
    tau,
    lr_actor,
    lr_critic,
    weight_decay,
    log,
):
    log.info("#### Initializing environment...")
    # init environment
    env = UnityEnvironment(file_name=env_fp, no_graphics=no_graphics)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    log.info(f"Number of agents: {num_agents}")

    # size of each action
    action_size = brain.vector_action_space_size
    log.info(f"Size of each action: {action_size}")

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    log.info(
        f"There are {states.shape[0]} agents. Each observes a state with length: {state_size}"
    )
    log.info(f"The state for the first agent looks like: {states[0]}")

    agent = Agent(
        num_agents=len(env_info.agents),
        state_size=state_size,
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        tau=tau,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        weight_decay=weight_decay,
        random_seed=seed,
    )

    log.info("#### Training...")
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros((len(env_info.agents), 1))
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            rewards = np.array(rewards).reshape((next_states.shape[0], 1))
            dones = env_info.local_done
            dones = np.array(dones).reshape((next_states.shape[0], 1))
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.max(score))
        scores.append(np.max(score))
        print(
            "Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(
                i_episode, np.mean(scores_deque), scores[-1]),
            end="\r",
        )
        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(
                i_episode, np.mean(scores_deque)))
        if i_episode % save_every_nth == 0:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint.pth",
            )
        if np.mean(scores_deque) >= 0.5:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint_solved.pth",
            )
            print(
                "\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                .format(i_episode - 100, np.mean(scores_deque)))
            break
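# --- Hedged sketch (assumption): save_checkpoint is called above but not shown; a
# minimal torch-based version consistent with how it is invoked.
import torch

def save_checkpoint(state, filename="checkpoint.pth"):
    """Serialize the training state dict (weights, scores, episode) to disk."""
    torch.save(state, filename)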
GRAPHICS_OFF = False
n_episodes = 3

env = UnityEnvironment(file_name=ENV_PATH, no_graphics=GRAPHICS_OFF)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

agents = Agent(state_size=state_size, action_size=action_size,
               num_agents=num_agents, random_seed=0)
agents.actor_local.load_state_dict(
    torch.load("ckpt/{}".format(ACTOR_CHECKPOINT_NAME)))
agents.critic_local.load_state_dict(
    torch.load("ckpt/{}".format(CRITIC_CHECKPOINT_NAME)))

for i_episode in range(1, n_episodes + 1):
    print('Starting episode {}'.format(i_episode))
    env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
    state = env_info.vector_observations
    agents.reset()
    score = np.zeros(num_agents)
    while True:
        action = agents.act(state)
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)

pretrained_dict_actor = torch.load(
    opt.model_pth_actor, map_location=lambda storage, location: storage)
pretrained_dict_critic = torch.load(
    opt.model_pth_critic, map_location=lambda storage, location: storage)

model_dict_actor = agent.actor_local.state_dict()
model_dict_critic = agent.critic_local.state_dict()

# 1. filter out unnecessary keys
pretrained_dict_actor = {
    k: v
    for k, v in pretrained_dict_actor.items() if k in model_dict_actor
}
pretrained_dict_critic = {
    k: v
    for k, v in pretrained_dict_critic.items() if k in model_dict_critic
print(state_size)
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
# %matplotlib inline

from ddpg_agent import Agent

print('state size, action size', state_size, action_size)
agent = Agent(state_size, action_size, random_seed=5)


def ddpg(n_episodes=200, max_t=1000, print_every=1, plot_every=25):
    scores_deque = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        # print('state size', state.shape)
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
        brain = env.brains[brain_name]
        action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        state_size = len(state)
        num_agents = len(env_info.vector_observations)
        print("num agents {} state size {} action size {}".format(
            num_agents, state_size, action_size))

        agent = Agent(state_size,
                      action_size,
                      random_seed=0,
                      buffer_size=int(1e5),
                      batch_size=512,
                      tau=1e-3,
                      lr_actor=3e-4,
                      lr_critic=3e-4,
                      critic_weight_decay=0.0,
                      update_every=4,
                      update_num_repeats=2,
                      noise_decay=0.999)

        if isForTrain is True:
            print("start training....")
            Train(env, agent, num_agents, num_episodes=3000)
        else:
            Test(env, agent, num_agents, num_episodes=200)
    except KeyboardInterrupt:
        print("Keyboard interrupted")
        env.close()
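# --- Hedged sketch (assumption): Test is called above but not shown; a minimal
# evaluation loop consistent with the call signature, running the trained policy
# without exploration noise. The agent.act add_noise keyword is an assumption.
import numpy as np

def Test(env, agent, num_agents, num_episodes=200):
    brain_name = env.brain_names[0]
    for episode in range(1, num_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            actions = agent.act(states, add_noise=False)  # assumed act signature
            env_info = env.step(actions)[brain_name]
            scores += env_info.rewards
            states = env_info.vector_observations
            if np.any(env_info.local_done):
                break
        print("Episode {}: mean score {:.2f}".format(episode, np.mean(scores)))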