def __init__(self, state_size, action_size, seed):
    super(MADDPG, self).__init__()
    self.maddpg_agents = [
        DDPGAgent(state_size, action_size, 1 * seed),
        DDPGAgent(state_size, action_size, 2 * seed)
    ]
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
def __init__(self, num_agents=2, random_seed=1):  # np.random.randint(1000)
    super(MADDPG, self).__init__()
    self.maddpg_agent = [
        DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed),
        DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed)
    ]
    self.num_agents = num_agents
    # Replay memory
    action_size = 2
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, num_agents, local_obs_dim, local_action_size, global_obs_dim,
             global_action_size, discount_factor=0.95, tau=0.02, device=device):
    super(MADDPG, self).__init__()
    # store configuration parameters
    self.device = device
    self.discount_factor = discount_factor
    self.tau = tau
    self.num_agents = num_agents
    # create the MADDPG agents
    self.maddpg_agent = [
        DDPGAgent(num_agents, local_obs_dim, local_action_size, global_obs_dim,
                  global_action_size, device=self.device)
        for _ in range(num_agents)
    ]
    # iteration counter
    self.iter = 0
def __init__(self, numAgents, state_size, action_size, random_seed,
             batch_size=BATCH_SIZE, buffer_size=BUFFER_SIZE,
             use_batch_norm=USE_BATCH_NORM):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.agents = [
        DDPGAgent(i, numAgents, state_size, action_size, random_seed, use_batch_norm)
        for i in range(numAgents)
    ]
    self.numAgents = numAgents
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    # Replay memory
    self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)
    self.batch_size = batch_size
def __init__(self, state_size, action_size, numAgent, random_seed,
             epsilon=1, epsilonDecay=0.995, minEpsilon=0.00):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = epsilon
    self.epsilonDecay = epsilonDecay
    self.minEpsilon = minEpsilon
    self.numAgent = numAgent
    self.agents = [
        DDPGAgent(state_size, action_size, random_seed)
        for i in range(numAgent)
    ]
    self.sharedMemory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, num_agents, state_size, action_size, hidden_layers, seed,
             gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
             weight_decay=WEIGHT_DECAY, buffer_size=BUFFER_SIZE,
             batch_size=BATCH_SIZE):
    """Initialize MADDPG agent."""
    super(MADDPG, self).__init__()
    self.seed = random.seed(seed)
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = gamma
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.weight_decay = weight_decay
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma,
                             tau, lr_actor, lr_critic, weight_decay, seed)
                   for _ in range(num_agents)]
    self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)
def __init__(self, state_size, action_size, n_agents, random_seed=1):
    self.actor_local = Actor(state_size, action_size, random_seed)
    self.actor_target = Actor(state_size, action_size, random_seed)
    self.ddpg_agents = [
        DDPGAgent(state_size, action_size, self.actor_local, self.actor_target,
                  random_seed)
        for _ in range(n_agents)
    ]
def __init__(self, config):
    self.config = config
    if config.shared_replay_buffer:
        self.memory = config.memory()
        self.config.memory = self.memory
    self.ddpg_agents = [DDPGAgent(self.config) for _ in range(config.num_agents)]
    self.t_step = 0
def __init__(self, state_size, action_size, num, seed):
    # shared memory for all agents
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
    # define each agent
    self.ddpg_agents = [
        DDPGAgent(state_size, action_size, seed, self.memory, device)
        for _ in range(num)
    ]
    self.t_step = 0
    self.num_agents = num
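# The constructors above all follow the same wrapper pattern: a list of DDPGAgent
# instances plus a shared ReplayBuffer. A minimal sketch of how such a wrapper is
# typically driven is shown below; it is illustrative only, and the method names
# `act`, `add`, `sample`, and `learn` are assumptions rather than code taken from
# any of the snippets in this collection.
def act(self, states, add_noise=True):
    # one action per agent, each computed from that agent's local observation
    return [agent.act(state, add_noise)
            for agent, state in zip(self.ddpg_agents, states)]


def step(self, states, actions, rewards, next_states, dones):
    # store the joint transition in the shared buffer, then let every agent
    # learn from a sampled minibatch once enough experience is available
    self.memory.add(states, actions, rewards, next_states, dones)
    if len(self.memory) > BATCH_SIZE:
        for agent in self.ddpg_agents:
            agent.learn(self.memory.sample(), GAMMA)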
def test(args):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # dim of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # dim of the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent_1 = DDPGAgent(state_size, action_size)
    # agent_2 = DDPGAgent(state_size, action_size)
    agent_2 = TD3Agent(state_size, action_size)
    agent_1_path = '../results/td3_opponent/00_best_td3_model.checkpoint'
    agent_2_path = '../results/ddgp_solo/01_best_model.checkpoint'
    agent = MAAC(state_size, action_size, agent_2, agent_1, False, False)
    agent.load(agent_1_path, 0)
    agent.load(agent_2_path, 1)

    test_scores = []
    for i_episode in tqdm(range(1, 1 + args.test_n_run)):
        # initialize the scores
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current states
        dones = [False] * num_agents
        while not np.any(dones):
            actions = agent.act(states)  # select actions
            # send the actions to the environment
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if episode has finished
            scores += rewards  # update the scores
            # roll over the states to next time step
            states = next_states
        test_scores.append(np.max(scores))

    avg_score = sum(test_scores) / len(test_scores)
    print("Test Score: {}".format(avg_score))
    return avg_score
def train_ddpg_agent_job(config):
    if config.render_game:
        env = UnityEnvironment(
            file_name="./resources/Tennis_Linux/Tennis.x86_64")
    else:
        env = UnityEnvironment(
            file_name="./resources/Tennis_Linux_NoVis/Tennis.x86_64")

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    print(states)
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:\n', states[0])

    # Train the agent
    agent = DDPGAgent(state_size, action_size, config.random_seed,
                      config.buffer_size, config.batch_size, config.gamma,
                      config.tau, config.lr_actor, config.lr_critic,
                      config.weight_decay, config.sigma, config.actor_nn_size,
                      config.critic_nn_size, config.batch_norm,
                      config.clip_grad_norm)
    scores, avg_scores, std, save_path = train(agent, env, config.n_episodes,
                                               config.score_window_size,
                                               config.print_every,
                                               config.max_score,
                                               config.damp_exploration_noise)
    config.dump(save_path + 'config.yml')
    env.close()
    return scores, avg_scores, std
def __init__(self, n_agents, state_size, action_size, seed=0):
    super(MADDPGAgent, self).__init__()
    self.n_agents = n_agents
    self.maddpg_agent = [
        DDPGAgent(state_size, action_size, n_agents, seed + i)
        for i in range(n_agents)
    ]
    # Replay memory
    self.memory = ReplayBuffer((n_agents, action_size), BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    # Noise parameters with exponential decay
    self.noise = INITIAL_NOISE
    self.noise_decay = NOISE_DECAY
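# A hedged sketch of the exponential noise decay implied by the `noise` and
# `noise_decay` fields initialized above; the method name `decay_noise` and the
# `MIN_NOISE` floor are assumptions, not part of the original snippet.
def decay_noise(self):
    # shrink the exploration-noise scale geometrically, never below the floor
    self.noise = max(self.noise * self.noise_decay, MIN_NOISE)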
def run():
    env = gym.make('Pendulum-v0')
    seed = 30
    env.seed(seed)
    agent = DDPGAgent(seed=seed,
                      n_state=env.observation_space.shape[0],
                      n_action=env.action_space.shape[0])
    '''
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0],
                  random_seed=seed)
    '''
    episodes_n = 2000
    steps_max = 300
    scores = []
    print_every = 100
    scores_deque = deque(maxlen=print_every)
    for i_episode in range(1, episodes_n):
        state = env.reset()
        agent.reset()
        score = 0
        done_step = 0
        for step in range(steps_max):
            action = agent.act(state)
            state_next, reward, done, meta = env.step(action)
            agent.step(state, action, reward, state_next, done)
            state = state_next
            score += reward
            done_step += 1
            if done:
                break
        scores.append(score)
        scores_deque.append(score)
        print_line(i_episode, scores_deque, end="")
        if i_episode % print_every == 0:
            print_line(i_episode, scores_deque, end="\n")
    return scores
def __init__(self, num_agents, state_size, action_size):
    """Initialize a MADDPGAgent wrapper.

    Params
    ======
        num_agents (int): the number of agents in the environment
        state_size (int): dimension of each state
        action_size (int): dimension of each action
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.agents = [DDPGAgent(state_size, action_size, i + 1)
                   for i in range(num_agents)]
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
    # Will help to decide when to update the model weights
    self.t_step = 0
    # Directory where to save the model
    self.model_dir = os.getcwd() + "/saved_models"
    os.makedirs(self.model_dir, exist_ok=True)
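# Given the `model_dir` created above, per-agent checkpoints could be written as
# sketched below. The method name `save_checkpoints` and the attribute names
# `actor_local`/`critic_local` are assumptions borrowed from other snippets in
# this collection, not from this one.
def save_checkpoints(self):
    # write one actor and one critic state dict per agent into model_dir
    for idx, agent in enumerate(self.agents):
        torch.save(agent.actor_local.state_dict(),
                   os.path.join(self.model_dir, "actor_agent_{}.pth".format(idx)))
        torch.save(agent.critic_local.state_dict(),
                   os.path.join(self.model_dir, "critic_agent_{}.pth".format(idx)))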
def __init__(self, num_agents, state_size, action_size, random_seed=None):
    super(MultiAgentDDPG, self).__init__()
    """Initialize an Agent object.

    Params
    ======
        num_agents (int): number of agents
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    if random_seed is None:
        random_seed = random.randint(1, 1000)
    # random.seed() returns None, so keep the integer seed itself and pass it on
    random.seed(random_seed)
    self.seed = random_seed
    self.maddpg_agent = [DDPGAgent(state_size=self.state_size,
                                   action_size=self.action_size,
                                   random_seed=self.seed)
                         for i in range(self.num_agents)]
    self.iter = 0
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state = env_info.vector_observations
state_size = state.shape[1]

# create agent
agent1 = DDPGAgent(nS=state_size, nA=action_size, lr_actor=0.0005,
                   lr_critic=0.0005, gamma=0.99, batch_size=60, tau=0.001,
                   memory_length=int(1e6), no_op=int(1e3), net_update_rate=1,
                   std_initial=0.15, std_final=0.025, std_decay_frames=200000)

# create agent
agent2 = DDPGAgent(nS=state_size, nA=action_size, lr_actor=0.0005,
                   lr_critic=0.0005, gamma=0.99, batch_size=60, tau=0.001,
                   # remaining keyword arguments assumed identical to agent1
                   # (the source snippet was truncated here)
                   memory_length=int(1e6), no_op=int(1e3), net_update_rate=1,
                   std_initial=0.15, std_final=0.025, std_decay_frames=200000)
def run(env, device, episodes, experiment_name, update_rate, action_size,
        state_size, brain_name, epsilon_start=1.0, epsilon_min=0.05,
        epsilon_decay=0.995, max_score=30., num_agents=1):
    epsilon = epsilon_start
    agent = DDPGAgent(state_space=state_size, action_space=action_size,
                      buffer_size=int(1e5), batch_size=512,
                      learning_rate_actor=0.001, learning_rate_critic=0.001,
                      update_rate=update_rate, gamma=0.995, tau=0.001,
                      device=device, seed=5, num_agents=num_agents)
    score_window = deque(maxlen=100)
    all_scores = []
    tb_writer = SummaryWriter('{}/{}'.format('logs', experiment_name))

    for episode in range(episodes):
        agent.reset()
        scores = np.zeros(num_agents)
        dones = np.zeros((num_agents), dtype=bool)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        while not np.any(dones):
            actions = agent.act(states, epsilon)
            actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards  # update the score (for each agent)
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)
            states = next_states

        episode_score = np.mean(scores)
        score_window.append(episode_score)
        all_scores.append(episode_score)

        # decay epsilon every 10 episodes
        if episode % 10 == 0:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        print('\rEpisode: {}\tAverage Score: {}'.format(
            episode, np.mean(score_window)), end="")
        if episode % 100 == 0:
            tb_writer.add_scalar('Episode_Accum_score', np.mean(score_window), episode)
            print('\rEpisode: {}\tAverage Score: {}'.format(
                episode, np.mean(score_window)))
        if np.mean(score_window) >= max_score:
            torch.save(agent.actor_local_network.state_dict(),
                       'actor_checkpoint_{}.pth'.format(experiment_name))
            torch.save(agent.critic_local_network.state_dict(),
                       'critic_checkpoint_{}.pth'.format(experiment_name))
            break
        # (excerpt: the enclosing try/if-elif statements precede this fragment)
        env = UnityEnvironment(
            file_name="./resources/Tennis_Linux/Tennis.x86_64")
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=False)[brain_name]
        num_agents = len(env_info.agents)
        action_size = brain.vector_action_space_size
        states = env_info.vector_observations
        state_size = states.shape[1]

        agent = DDPGAgent(state_size, action_size, conf.random_seed,
                          conf.buffer_size, conf.batch_size, conf.gamma,
                          conf.tau, conf.lr_actor, conf.lr_critic,
                          conf.weight_decay, conf.sigma, conf.actor_nn_size,
                          conf.critic_nn_size, conf.batch_norm,
                          conf.clip_grad_norm)
        agent.load_weights(
            actor_weights_file=checkpoint_path + "checkpoint_actor.pth",
            critic_weights_file=checkpoint_path + "checkpoint_critic.pth")
        demo(agent, env, num_episodes=100)
    finally:
        env.close()
else:
    raise ValueError(f"Unknown command {command}")
# State space information
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
print('========================================')

# ### Training the DDPG agent to solve the environment ###
#
# In the following the training takes place.
# When training the environment, setting `train_mode=True` accelerates the
# simulation environment so that training can be done much quicker.

# Make a new agent from the DDPGAgent class in 'ddpg_agent.py'
agent = DDPGAgent(state_size=state_size, action_size=action_size,
                  num_agents=num_agents,
                  random_seed=1)  # , seed=random.randint(1,100000000)

# Print information about the network architecture
print('====================')
print('Actor-Critic network architecture')
print(agent.actor_local)
print(agent.critic_local)
print('====================')


# Set up a function that takes in information for training the agent for n_episodes.
def trainDDPG(n_episodes=1000, print_every=10):
    '''Deep Deterministic Policy Gradient (DDPG) agent that can be trained
    for a set amount of episodes.

    Keyword Arguments:
def __init__(self, random_seed, num_agents, state_size, action_size):
    self.agents = [DDPGAgent(state_size, action_size, random_seed)
                   for x in range(num_agents)]
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

for config in generate_grid_config():
    agent = DDPGAgent(state_size, action_size, ddpg_config=config,
                      num_agents=num_agents)
    agent.ddpg_actor_local.load_state_dict(
        torch.load('ddpg_weights/actor_{}.pth'.format(config)))
    agent.ddpg_critic_local.load_state_dict(
        torch.load('ddpg_weights/critic_{}.pth'.format(config)))
    evaluate_current_weights(env, agent, brain_name, num_agents, 5)
    # torch.save(agent.actor_critic.state_dict(), 'weights/{}.pth'.format(agent.ddpg_config))
    # agent.actor_critic.load_state_dict(torch.load('weights/{}.pth'.format(agent.ddpg_config)))
    # test_agent(env, agent, brain_name, num_agents)
MAX_ITERATIONS = 200


def save_rewards(rewards):
    with open("reward_data.pickle", "wb") as handle:
        pickle.dump(rewards, handle)


# with tf.Session() as sess:
env = gym.make(ENV_NAME)
# print(env.observation_space.shape)
# print(env.action_space.shape)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPGAgent(env)
rewards = []
for episode in range(MAX_EPISODES):
    state = env.reset()
    total_reward = 0
    for itr in range(MAX_ITERATIONS):
        action = agent.getNoisyAction(state)
        # print("##### ", action, "####")
        state_, reward, done, _ = env.step(action[0])
        agent.observe(state, action, reward, state_, done)
        state = state_
        total_reward += reward
        if done:
            break
import gym
from puckworld_env import PuckWorldEnv
from ddpg_agent import DDPGAgent
from utils import learning_curve
import numpy as np

env = PuckWorldEnv()
agent = DDPGAgent(env)

data = agent.learning(max_episode_num=200, display=True)
learning_curve(data, 2, 1,
               # title="DDPGAgent performance on PuckWorld with continuous action space",
               x_name="episodes",
               y_name="rewards of episode")
def train_agent(episodes=100, model='DDPG', print_every=10):
    if model.lower() == 'd4pg':
        agent = D4PGAgent()
        print('Use D4PG agent......\n')
    else:
        agent = DDPGAgent()
        print('Use default DDPG agent......\n')
    print('Batch size: ', BATCH_SIZE)
    print('Actor learning rate: ', LR_ACTOR)
    print('Critic learning rate: ', LR_CRITIC)
    print('\n')

    env = EnvWrapper(file_name='Reacher_Windows_x86_64\\Reacher.exe',
                     train_mode=True)
    scores = []
    scores_window = deque(maxlen=100)
    for ep in range(1, episodes + 1):
        agent.reset()
        agent.states = env.reset()
        for s in range(agent.max_steps):
            agent.actions = agent.act(add_noise=True)
            agent.rewards, agent.next_states, agent.dones = env.step(agent.actions)
            agent.step()
            agent.states = agent.next_states
        scores.append(agent.scores.mean())
        scores_window.append(agent.scores.mean())
        if ep % print_every == 0:
            print('Episode %d, avg score: %.2f' % (ep, agent.scores.mean()))
        if np.mean(scores_window) >= 30:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(ep - 100, np.mean(scores_window)))
            torch.save(agent.actor.state_dict(),
                       'checkpoints/reacher_%s_actor_checkpoint.pth' % model)
            torch.save(agent.critic.state_dict(),
                       'checkpoints/reacher_%s_critic_checkpoint.pth' % model)
    env.close()
    return scores, agent
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Set device between cuda:0 and cpu
torch_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device =', torch_device)

memory = ReplayBuffer(action_size, memory_params['buffer_size'],
                      memory_params['batch_size'], memory_params['seed'],
                      torch_device)
ddpg_agents = [DDPGAgent(state_size, action_size, memory, torch_device, params)
               for _ in range(num_agents)]
ddpg_scores = train(300, 5000, ddpg_agents,
                    ["model_ddpg_actor.pth", "model_ddpg_critic.pth"],
                    benchmark_score, rolling_n_episodes)
plot_scores(ddpg_scores, benchmark_score, rolling_n_episodes)

# Test
ddpg_agents = [DDPGAgent(state_size, action_size, memory, torch_device, params)
               for _ in range(num_agents)]
for agent in ddpg_agents:
    agent.load_weights(["model_ddpg_actor.pth", "model_ddpg_critic.pth"])
test(ddpg_agents)
# Set the minimum score the agent has to reach in order to solve this task
threshold = 0.5
max_episodes = 5000
max_t = 100000
threshold = 2.0  # overrides the 0.5 assigned above
conseq_episodes = 5
print_every = 1

mode = False
if mode == True:
    # train(args)
    # exit()
    agent_1 = DDPGAgent(state_size, action_size)
    # agent_2 = DDPGAgent(state_size, action_size)
    agent_2 = TD3Agent(state_size, action_size)
    agent_1_path = 'results/ddgp_solo/00_best_model.checkpoint'
    # agent_2_path = 'results/temp/new_ddpg_model.checkpoint'
    agent_2_path = 'results/temp/new_td3_model.checkpoint'
    agent = MAAC(state_size, action_size, agent_1, agent_2, False, True)
    agent.load(agent_1_path, 0)
    df = train_CollabAndCompete(env, brain_name, max_episodes, max_t, threshold,
                                conseq_episodes, print_every, agent,
                                agent_1_path, agent_2_path)
    plot_minmax(df)
def main():
    test_mode = len(sys.argv) >= 2 and sys.argv[1] == "test"
    env = get_swimmer6_env()
    max_action_val = env.action_space.high[0]
    min_action_val = env.action_space.low[0]
    agent = DDPGAgent(env.state_space_dim, env.action_space_dim,
                      min_action_val, max_action_val,
                      hidden_layer_size=512, path_to_load=MODEL_PATH)
    episode_rewards = []
    episodes_count = 1000000
    with open("avgs.log", "w") as avgs_file:
        for episode_index in range(episodes_count):
            try:
                episode_reward = 0
                state = env.reset()
                current_reward = None
                while True:
                    if test_mode:
                        action = agent.get_best_action(state)
                    else:
                        action = agent.get_action(state)
                    next_state, reward, done, _ = env.step(action)
                    if current_reward is None:
                        current_reward = reward
                        continue
                    reward_diff = reward - current_reward
                    current_reward = reward
                    if not test_mode:
                        agent.remember_step(
                            (state, action, next_state, reward_diff))
                        agent.learn()
                        agent.update_targets()
                    episode_reward += reward_diff
                    if done:
                        break
                    state = next_state
                episode_rewards.append(episode_reward)
                avg = np.mean(episode_rewards[-100:])
                print(
                    f"Episode #{episode_index}, reward: {episode_reward}, avg: {avg}"
                )
                avgs_file.write(str(episode_reward) + "\n")
                avgs_file.flush()
                if episode_index % 10 == 0 and not test_mode:
                    agent.save(MODEL_PATH)
            except KeyboardInterrupt:
                if not test_mode:
                    agent.save(MODEL_PATH)
                cmd = input("Input: ")
                if cmd == "1":
                    view_render(env, agent)
            except Exception as e:
                print(e)
n_hid1 = 400
n_hid2 = 300
lr_alpha = 1e-4
lr_beta = 1e-3
gamma = 0.99
tau = 0.99
fname = 'lunarlandercontinuous_ngames' + str(n_games) + '_memsize' + str(mem_size) + \
        '_batchsize' + str(batch_size) + '_nhid1' + str(n_hid1) + \
        '_nhid2' + str(n_hid2) + '_lralpha' + str(lr_alpha) + \
        '_lrbeta' + str(lr_beta) + '_gamma' + str(gamma) + \
        '_tau' + str(tau)
figure_file = 'plots/' + fname + '.png'
checkpoint_file = 'models/' + fname

agent = DDPGAgent(load_checkpoint, n_states, n_actions, checkpoint_file,
                  mem_size, batch_size, n_hid1, n_hid2, lr_alpha, lr_beta,
                  gamma, tau)
env = gym.make('LunarLanderContinuous-v2')

if not load_checkpoint:
    scores = []
    n_to_consider = 100  # number of previous scores to consider in the avg
    best_score = env.reward_range[0]
    for i in range(n_games):
        done = False
        score = 0
        agent.noise.reset()
        obs = env.reset()
        while not done:
            action = agent.choose_action(obs)
            obs_, reward, done, info = env.step(action)
def main(path=''):
    """Show the environment controlled by the 20 smart agents.

    Args:
        param1: (string) pathname for saved network weights
    """
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64',
                           no_graphics=False)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    config = Config()
    config.n_agents = num_agents
    config.gamma = 0.99
    config.state_dim = states.shape[1]
    config.action_dim = brain.vector_action_space_size
    config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    config.seed = 42
    config.leak = 0.001
    config.tau = 1e-3
    config.hdl1 = 256
    config.hdl2 = 128
    config.hdl3 = 128
    config.lr_actor = 0.001
    config.lr_critic = 0.001
    config.batch_size = 1024
    config.weight_decay = 0.99
    config.memory_capacity = int(1e6)

    agent = DDPGAgent(config)
    agent.actor_local.load_state_dict(torch.load(path + 'checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load(path + 'checkpoint_critic.pth'))

    for _ in range(3):
        episode_reward = []
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agent.reset_noise()
        total_steps = 0
        while True:
            total_steps += 1
            actions = agent.act(states, 0, False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            reward = env_info.rewards
            done = np.array(env_info.local_done)
            episode_reward.append(np.mean(reward))
            scores += reward
            states = next_states
            if np.any(done):
                print("total steps", total_steps)
                print(sum(episode_reward))
                print('average: ', np.mean(scores))
                print('min: ', np.min(np.array(episode_reward)))
                print('max: ', np.max(np.array(episode_reward)))
                break
def main():
    # Instantiate the specified environment.
    env = fe.FlockingEnv(size1, dynamic="first")
    # Get environment specs
    num_states = (size1 + 1) * dim * 2
    num_actions = size1 * dim
    # Print specs
    print("Number of states: %d" % num_states)
    print("Number of actions: %d" % num_actions)
    print("-----------------------------------------")
    # Instantiate the reinforcement learning agent which contains the Actor/Critic DNN.
    # agents = []
    # for i in range(0, size):
    agent = DDPGAgent(ob_shape=num_states, ac_shape=dim)
    #     agents.append(agent)

    # Exploration noise generator which uses an Ornstein-Uhlenbeck process.
    noise = OUNoise(1)

    for i in range(episodes_num):
        print("--------Episode %d--------" % i)
        reward_per_episode = 0
        observation = env.reset_mul()
        # observation = env.reset_full()
        for j in range(steps_limit):
            if is_movie_on:
                env.render()
            # Select action off-policy
            state = observation
            action = np.zeros((size1, dim), dtype=np.float32)
            # get individual observations here
            for k in range(0, size1):
                ac = agent.feed_forward_actor(
                    np.reshape(state[k], [1, num_states]))
                # print(noise.generate())
                if i % 2 == 0:
                    action[k][0] = ac[0][0] + noise.generate()
                    action[k][1] = ac[0][1] + noise.generate()
                else:
                    action[k][0] = ac[0][0] + noise.generate()
                    action[k][1] = ac[0][1] + noise.generate()
            '''
            action = agent.feed_forward_actor(np.reshape(state, [1, num_states]))
            action = np.reshape(action, [size1, dim])
            for k in range(0, size1):
                if i % 2 == 0:
                    action[k][0] += noise.generate()
                    action[k][1] += noise.generate()
            '''
            # Throw the action to the environment
            observation, reward, done, info = env.step_mul(action)
            # observation, reward, done, info = env.step_full(action)
            for k in range(0, size1):
                agent.add_experience(np.reshape(state[k], [num_states]),
                                     action[k],
                                     np.reshape(observation[k], [num_states]),
                                     reward[k], done)
            # action = np.reshape(action, [num_actions])
            # agent.add_experience(np.reshape(state, [num_states]), action,
            #                      np.reshape(observation, [num_states]), reward, done)

            # Train the actor/critic networks
            if len(agent.replay_buffer) > MINI_BATCH_SIZE:
                agent.train()
            reward_per_episode = reward.sum()
            if j % 100 == 0:
                print(j, "step finished. reward=", reward_per_episode, "info=", info)
                # print("action=", action, "observation=", observation)
            if done or j == steps_limit - 1:
                print("Steps count: %d" % j)
                print("Total reward: %d" % reward_per_episode)
                env.render()
                # noise.reset()
                with open("reward_log.csv", "a") as f:
                    f.write("%d,%f\n" % (i, reward_per_episode))
                break