def evolve(self, population, save=True):
    """
    Evolve agents over generations.

    :param population: population of agents to evolve
    :type population: Population
    :param save: save agents' weights and scores
    :type save: bool
    :return: agents' weights and scores per generation
    """
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    print("Optimization - started", timestamp)
    agents = population.create_population()
    for i in range(population.max_generations):
        if not self.terminate(population, i):
            try:
                population.agents_weights[i] = np.array(
                    [a.model.get_weights() for a in agents],
                    dtype=np.ndarray)
                for j, agent in enumerate(agents):  # TODO parallelize
                    score = agent.run_agent()
                    population.scores[i][j] = score
                print_scores(i + 1, population.scores[i])
                if save and (i + 1) % 50 == 0:
                    save_results(population.agents_weights[:i],
                                 population.scores[:i],
                                 timestamp)
                if i < population.max_generations - 1:
                    self.generate_next_generation(population=population,
                                                  generation=i)
                    for k, a in enumerate(agents):
                        agents[k].model.set_weights(
                            population.agents_weights[i + 1][k])
            except KeyboardInterrupt:
                LOGGER.log(environment=ENVIRONMENT.name,
                           timestamp=timestamp,
                           algorithm=self.__class__.__name__,
                           parameters=vars(self),
                           generations=i,
                           score=np.max(population.scores[i - 1]))
                save_results(population.agents_weights[:i - 1],
                             population.scores[:i - 1],
                             timestamp)
                sys.exit()
        else:
            population.agents_weights = population.agents_weights[:i]
            population.scores = population.scores[:i]
            break
    if save:
        LOGGER.log(environment=ENVIRONMENT.name,
                   timestamp=timestamp,
                   algorithm=self.__class__.__name__,
                   parameters=vars(self),
                   generations=i,
                   score=np.max(population.scores[i]))
        save_results(population.agents_weights, population.scores, timestamp)
    return population.agents_weights, population.scores
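# generate_next_generation() is implemented elsewhere; as a point of reference,
# the sketch below shows one possible elitist selection step (a hypothetical
# illustration, not the project's actual implementation): keep the best-scoring
# agents and refill the population with Gaussian-perturbed copies of their
# weights.
import numpy as np


def select_elite_weights(weights, scores, elite_frac=0.2, noise_std=0.02):
    """Simple (mu, lambda)-style step: perturb copies of the elite agents' weights."""
    n = len(scores)
    n_elite = max(1, int(n * elite_frac))
    elite_idx = np.argsort(scores)[-n_elite:]  # indices of the best agents
    next_gen = []
    for i in range(n):
        parent = weights[elite_idx[i % n_elite]]  # cycle through the elites
        child = [w + noise_std * np.random.randn(*w.shape) for w in parent]
        next_gen.append(child)
    return next_gen


# usage: 4 agents, each with a single 2x2 weight matrix
pop_weights = [[np.zeros((2, 2))] for _ in range(4)]
pop_scores = np.array([1.0, 3.0, 2.0, 0.5])
new_weights = select_elite_weights(pop_weights, pop_scores)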
def run(self, train=True):
    """
    Run DDPG agent.

    :param train: train indicator
    :type train: bool
    :return:
    """
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    step = 0
    self.load_weights()
    try:
        for i in range(self.n_episodes):
            # get initial state
            state = self.environment.env.reset()
            total_reward = 0
            for j in range(self.environment.max_time):
                self.environment.env.render()
                # predict action and add exploration noise
                action = self.actor.model.predict(
                    state.reshape(1, state.shape[0]))
                action = self.noise.get_noisy_action(action, j)
                new_state, reward, done, info = self.environment.env.step(
                    action[0])
                # store transition and sample a training batch
                self.buffer.add(state, action[0], reward, new_state, done)
                s, a, r, new_s, d = self.buffer.get_batch(self.batch_size)
                # Bellman targets from the target networks
                target_q = self.critic.target_model.predict(
                    [new_s, self.actor.target_model.predict(new_s)])
                y = r
                for k in range(len(d)):
                    if d[k]:
                        y[k] = r[k]
                    else:
                        y[k] = r[k] + self.gamma * target_q[k]
                if train:
                    self.critic.model.train_on_batch([s, a], y)
                    a_grads = self.actor.model.predict(s)
                    grads = self.critic.gradients(s, a_grads)
                    self.actor.train(s, grads)
                    self.actor.update_target()
                    self.critic.update_target()
                total_reward += reward
                if (i + 1) % 200 == 0:
                    LOGGER.log(environment=self.environment.name,
                               timestamp=timestamp,
                               algorithm=self.__class__.__name__,
                               parameters=self.get_params(),
                               total_steps=i,
                               score=total_reward)
                    self.save_weights()
                if done:
                    previous_reward = total_reward
                    break
                state = new_state
                step += 1
            print("Episode: {:<5d} Total Reward: {:<+10.3f} Total Steps: {:<10d} "
                  " Replay Buffer size: {}".format(
                      i, total_reward, step, self.buffer.n_experiences))
    except KeyboardInterrupt:
        print("Training interrupted.")
        print("Saving weights...")
        self.save_weights()
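# The exploration noise object (self.noise) used above is defined elsewhere; a
# common choice for DDPG is an Ornstein-Uhlenbeck process. The following is a
# minimal, self-contained sketch of such a process (an assumption, not
# necessarily the noise class used in run()):
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for continuous actions."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) dt + sigma * dW, discretized with dt = 1
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.randn(len(self.state)))
        self.state = self.state + dx
        return self.state


# usage: noisy_action = np.clip(action + ou_noise.sample(), -1.0, 1.0)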
def train(self):
    """
    Train BipedalWalker-v2 agent.

    :return:
    """
    # get start timestamp
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    # saving paths
    model_path = os.path.join(td3_cfg.models_path,
                              '{}-{}'.format(env_cfg.name, timestamp),
                              'model.ckpt')
    results_path = os.path.join(td3_cfg.models_path,
                                '{}-{}'.format(env_cfg.name, timestamp),
                                'results.npy')
    distances_path = os.path.join(td3_cfg.models_path,
                                  '{}-{}'.format(env_cfg.name, timestamp),
                                  'distances.npy')
    weights_path = os.path.join(td3_cfg.models_path,
                                '{}-{}'.format(env_cfg.name, timestamp),
                                'weights_init_fin.npy')
    video_dir = os.path.join(td3_cfg.models_path,
                             '{}-{}'.format(env_cfg.name, timestamp),
                             'video')

    # read environment information from config
    env = env_cfg.env
    # record a video every td3_cfg.record_videos episodes
    if td3_cfg.record_videos:
        env = wrappers.Monitor(
            env, video_dir,
            video_callable=lambda ep: ep % td3_cfg.record_videos == 0)

    # arrays with rewards and distances for later optimality analysis
    rewards = []
    distances_consecutive = np.zeros(2, dtype=np.ndarray)
    distances_init = np.zeros(2, dtype=np.ndarray)

    with tf.Session() as sess:
        # initialization
        self.agent = TD3(sess)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        sess.run(init)
        self.agent.initialize()
        global_step = 0
        weights_init = get_actor_weights(sess)
        weights_old = weights_init

        try:
            for i in range(td3_cfg.n_episodes):
                s = env.reset()  # get initial state
                ep_reward = 0
                ep_steps = 0
                noises = []
                actions = []
                done = False
                while not done:
                    env.render()
                    if ep_steps < 10:
                        action = self.agent.get_random_action()
                    else:
                        action, action_org, noise = self.agent.get_noisy_action(s)
                        noises.append(noise)
                        actions.append(action_org)
                    action = action.squeeze()
                    s2, r, done, info = env.step(action.tolist())
                    ep_reward += r
                    ep_steps += 1
                    global_step += 1

                    # store transition in replay buffer
                    self.agent.store_experience(s, action, r, done, s2)
                    # use symmetry of leg 1 and leg 2
                    mirrored_s = mirror_state(s)
                    mirrored_s2 = mirror_state(s2)
                    mirrored_a = mirror_action(action)
                    self.agent.store_experience(mirrored_s, mirrored_a, r,
                                                done, mirrored_s2)

                    # train agent
                    temp = self.agent.train(global_step)
                    if temp:
                        weights = temp
                    s = s2

                    if done:
                        # end of the episode
                        count = i + 1
                        # get trained weights
                        weights = get_actor_weights(sess)
                        for iw, w in enumerate(weights):
                            # compute the distances to the previous-episode
                            # weights and to the initial weights
                            con, init = compute_distance_episodes(
                                weights_init[iw], weights_old[iw], weights[iw])
                            distances_consecutive[iw] = np.append(
                                distances_consecutive[iw], con)
                            distances_init[iw] = np.append(
                                distances_init[iw], init)
                        weights_old = weights

                        # evaluation
                        if count % td3_cfg.test_every == 0:
                            eval_ep_reward, eval_ep_steps = self.evaluate(env)
                            print("Episode: {:<10d} Evaluation Reward: {:<+10.3f} "
                                  "Total Training Steps: {:10d}".format(
                                      count, eval_ep_reward, global_step))
                            rewards.append(eval_ep_reward)

                        # saving
                        if count % td3_cfg.save_every == 0:
                            saver.save(sess, model_path, global_step=count)
                            np.save(results_path, rewards)
                            np.save(distances_path,
                                    np.vstack((distances_consecutive,
                                               distances_init)))
                            np.save(weights_path,
                                    np.append(weights_init, weights))
        except KeyboardInterrupt:
            print("Training interrupted.")

        # finalize training and save results
        print('Total steps:', global_step)
        print("Saving results...")
        LOGGER.log(environment=env_cfg.name,
                   timestamp=timestamp,
                   algorithm=self.agent.__class__.__name__,
                   parameters=vars(td3_cfg),
                   total_steps=global_step,
                   score=eval_ep_reward)
        saver.save(sess, model_path, global_step=count)
        np.save(results_path, rewards)
        np.save(distances_path,
                np.vstack((distances_consecutive, distances_init)))
        np.save(weights_path, np.append(weights_init, weights))
        env.close()
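# mirror_state() and mirror_action() are defined elsewhere; the sketch below
# shows one plausible implementation (an assumption, not the project's code),
# based on the standard BipedalWalker-v2 layout: observation indices 4-8
# describe leg 1 (hip/knee angle and speed, ground contact), indices 9-13
# describe leg 2, actions 0-1 drive leg 1 and actions 2-3 drive leg 2.
# Swapping the two leg blocks relabels the legs and yields an equally valid
# transition for the replay buffer.
import numpy as np


def mirror_state(state):
    """Swap the leg-1 and leg-2 observation blocks (hull and lidar unchanged)."""
    s = np.array(state, dtype=np.float32)
    s[4:9], s[9:14] = s[9:14].copy(), s[4:9].copy()
    return s


def mirror_action(action):
    """Swap the hip/knee torques of leg 1 and leg 2."""
    a = np.array(action, dtype=np.float32)
    a[0:2], a[2:4] = a[2:4].copy(), a[0:2].copy()
    return a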