def train(self, episodes=1000, max_steps=800, plot_rewards=True):
    # Initialize target network weights
    self.actor.update_target_model(copy=True)
    self.critic.update_target_model(copy=True)
    scores, steps = np.empty(episodes), np.empty(episodes)
    start = time.time()
    for e in range(episodes):
        score, step = self.run_episode(max_steps)
        scores[e], steps[e] = score, step
        print("Episode:", e, " steps:", step, " score:", score,
              " time:", time.time() - start)
    ensure_saved_models_dir()
    if plot_rewards:
        t_time = time.time() - start
        print("Mean score:", np.mean(scores), " Total steps:", np.sum(steps),
              " total time:", t_time)
        plot(scores)
        plot_running_avg(scores)
        np.save("./train_data/ddpg_enc_actions" + str(self.state_size) +
                str(self.n_neighbors) + "_scores", scores)
        np.save("./train_data/ddpg_enc_actions" + str(self.state_size) +
                str(self.n_neighbors) + "_time", t_time)
        np.save("./train_data/ddpg_enc_actions" + str(self.state_size) +
                str(self.n_neighbors) + "_steps", steps)

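# ensure_saved_models_dir() is called before every checkpoint below but is defined
# elsewhere in the repository. A minimal sketch, assuming it only has to create the
# checkpoint directory (the "./saved_models/" name is an assumption, not taken from
# this section):
import os

def ensure_saved_models_dir(path="./saved_models/"):
    # Create the checkpoint directory if it does not exist yet.
    os.makedirs(path, exist_ok=True)
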
def train(self, episodes=1000, max_steps=1000, plot_rewards=True):
    scores, steps = np.empty(episodes), np.empty(episodes)
    start = time.time()
    for e in range(episodes):
        score, step = self.run_episode(max_steps)
        scores[e], steps[e] = score, step
        print("Episode:", e, " steps:", step, " score:", score,
              " epsilon:", self.epsilon, " time:", time.time() - start)
        # Periodic checkpointing, kept for reference but disabled in favour of
        # saving after every episode:
        # if e % 100 == 0:
        #     ensure_saved_models_dir()
        #     self.model.save_weights(FINAL_WEIGHTS_PATH)
        #     print("Weights Saved")
        ensure_saved_models_dir()
        self.model.save_weights(FINAL_WEIGHTS_PATH)
    if plot_rewards:
        t_time = time.time() - start
        print("Mean score:", np.mean(scores), " Total steps:", np.sum(steps),
              " total time:", t_time)
        plot(scores)
        plot_running_avg(scores)
        np.save("./train_data/ddqn_" + str(self.state_size) + "_scores", scores)
        np.save("./train_data/ddqn_" + str(self.state_size) + "_time", t_time)
        np.save("./train_data/ddqn_" + str(self.state_size) + "_steps", steps)

def train(self, episodes=1000, max_steps=1000, plot_rewards=True):
    scores = np.empty(episodes)
    for e in range(episodes):
        score = self.run_episode(max_steps)
        scores[e] = score
        print("Episode:", e, " score:", score, " epsilon:", self.epsilon)
        ensure_saved_models_dir()
        self.model.save_weights(FINAL_WEIGHTS_PATH)
    if plot_rewards:
        plot(scores)
        plot_running_avg(scores)

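# plot() and plot_running_avg() are used by every train() variant above and below
# but are defined elsewhere. A minimal matplotlib sketch of the assumed behaviour
# (the 100-episode window and the default titles are assumptions):
import numpy as np
import matplotlib.pyplot as plt

def plot(values, title="Rewards"):
    # Plot the raw per-episode values.
    plt.plot(values)
    plt.title(title)
    plt.show()

def plot_running_avg(values, title="Running Average", window=100):
    # Plot the mean of the last `window` values at each episode.
    running_avg = np.array([np.mean(values[max(0, t - window):t + 1])
                            for t in range(len(values))])
    plt.plot(running_avg)
    plt.title(title)
    plt.show()
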
def serial_pretrain(self, rows=10000, epochs=10):
    targets = np.empty((rows, self.action_size))
    states = np.empty((rows, self.state_size))
    for i in range(rows):
        p = self.env.observation_space.sample()
        states[i] = self.state_transformer.transform(p)
        self.fill_target(p, targets[i])
        if i % 100 == 0:
            print("%.1f %%" % (i / rows * 100))
    self.model.fit(states, targets, batch_size=self.batch_size, epochs=epochs,
                   verbose=1, validation_split=0.1)
    self.update_target_model()
    ensure_saved_models_dir()
    self.model.save_weights(PRETRAIN_WEIGHTS_PATH)

def serial_pretrain(self, rows=16000000, batch_size=32, act_size=10, epochs=3):
    # Supervised pretraining: label act_size random actions per sampled state with
    # v_upperbound_breakpoints() and fit the critic and actor on each mini-batch.
    start = time.time()
    r = int(rows / batch_size)
    for j in range(r):
        states = np.empty((batch_size, self.state_size))
        actions = np.random.randint(self.action_size, size=(batch_size, act_size))
        targets = np.empty((batch_size, act_size))
        for i in range(batch_size):
            p = self.env.observation_space.sample()
            states[i] = self.state_transformer.transform(p)
            for k in range(act_size):
                _, ia, ja, ka = self.env.actions[actions[i][k]]
                targets[i][k] = v_upperbound_breakpoints(p, ia, ja, ka)
            if i % 100 == 0:
                print(j, "-- %.6f %%" % ((j * batch_size + i) / rows * 100),
                      time.time() - start)
        print(j, "-- %.6f %%" % ((j * batch_size + i) / rows * 100),
              time.time() - start, "-- UPDATE")
        for i in range(epochs):
            self.critic_train_model(
                np.repeat(states, act_size, axis=0),
                self.enc_actions[actions.reshape(-1, )].todense(),
                targets.reshape(-1, 1))
            self.actor_train_model(states)
        # Release the batch arrays before the next iteration.
        targets, states, actions = None, None, None
        gc.collect()
        if j % 100000 == 0:
            ensure_saved_models_dir()
            saver = tf.train.Saver()
            saver.save(self.session, self.pretrain_path)
            print("Pretrain weights saved")
    ensure_saved_models_dir()
    saver = tf.train.Saver()
    saver.save(self.session, self.pretrain_path)
    print("Pretrain weights saved")
    # Copy the pretrained weights into the target networks.
    self.session.run(self.target_init)

def serial_pretrain(self, rows=100000, batch_size=64, epochs=10):
    # Supervised pretraining of the Q-network on targets produced by fill_target().
    start = time.time()
    r = int(rows / batch_size)
    for j in range(r):
        targets = np.empty((batch_size, self.action_size))
        states = np.empty((batch_size, self.state_size))
        for i in range(batch_size):
            p = self.env.observation_space.sample()
            states[i] = self.state_transformer.transform(p)
            self.fill_target(p, targets[i])
            if i % 100 == 0:
                print(j, "-- %.6f %%" % ((j * batch_size + i) / rows * 100),
                      time.time() - start)
        actions = np.argmax(targets, axis=1)
        print(j, "-- %.6f %%" % ((j * batch_size + i) / rows * 100),
              time.time() - start, "-- UPDATE")
        for i in range(epochs):
            self.update(states, actions, targets)
        targets, states = None, None
        gc.collect()
    ensure_saved_models_dir()
    saver = tf.train.Saver()
    saver.save(self.session, self.pretrain_path)
    self.update_target_model()

def train(self, episodes=1000, max_steps=800, plot_rewards=True):
    scores, steps, losses = np.zeros(episodes), np.zeros(episodes), np.zeros(episodes)
    start = time.time()
    saver = tf.train.Saver()
    if self.fill_mem:
        self.fill_memory()
    for e in range(episodes):
        score, step, loss = self.run_episode(max_steps)
        scores[e], steps[e], losses[e] = score, step, loss
        print("Episode:", e, " steps:", step, " score: %.1f" % score, " loss:", loss,
              " epsilon:", self.epsilon, " time:", time.time() - start)
        # Stop if the loss diverged.
        if math.isnan(loss):
            break
        # Checkpoint the TensorFlow session after every episode.
        ensure_saved_models_dir()
        saver.save(self.session, self.train_path)
    if plot_rewards:
        t_time = time.time() - start
        print("Mean score:", np.mean(scores), " Total steps:", np.sum(steps),
              " total time:", t_time)
        np.save("./train_data/ddqn_tf_" + str(self.state_size) + "_scores", scores)
        np.save("./train_data/ddqn_tf_" + str(self.state_size) + "_time", t_time)
        np.save("./train_data/ddqn_tf_" + str(self.state_size) + "_steps", steps)
        plot(steps)
        plot_running_avg(steps)
        plot_running_avg(losses, title="Losses")

def parallel_pretrain(self, rows=10000, epochs=10, n_threads=8):
    def f(i):
        # Worker: each call fills one row of the shared states/targets arrays.
        p = self.env.observation_space.sample()
        states[i] = self.state_transformer.transform(p)
        self.fill_target(p, targets[i])
        cur = progress.inc()
        if cur % 100 == 0:
            print("%.1f %%" % (cur / rows * 100))

    progress = AtomicInteger()
    targets = np.empty((rows, self.action_size))
    states = np.empty((rows, self.state_size))
    pool = ThreadPool(n_threads)
    pool.map(f, range(rows))
    self.model.fit(states, targets, batch_size=self.batch_size, epochs=epochs,
                   verbose=1, validation_split=0.1)
    self.update_target_model()
    ensure_saved_models_dir()
    self.model.save_weights(PRETRAIN_WEIGHTS_PATH)

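# parallel_pretrain() tracks progress across worker threads with AtomicInteger and
# distributes work with ThreadPool (assumed to be multiprocessing.pool.ThreadPool).
# A minimal sketch of the assumed AtomicInteger interface, i.e. a lock-protected
# counter whose inc() returns the updated value:
import threading

class AtomicInteger:
    def __init__(self, value=0):
        self._value = value
        self._lock = threading.Lock()

    def inc(self):
        # Atomically increment the counter and return the new value.
        with self._lock:
            self._value += 1
            return self._value
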
def train(self, episodes=1000, max_steps=800, plot_rewards=True):
    # Initialize target network weights
    self.actor.update_target_model(copy=True)
    self.critic.update_target_model(copy=True)
    scores, steps = np.empty(episodes), np.empty(episodes)
    start = time.time()
    break_flag = 0
    for e in range(episodes):
        score, step = self.run_episode(max_steps)
        scores[e], steps[e] = score, step
        print("Episode:", e, " steps:", step, " score:", score,
              " time:", time.time() - start)
        # Periodically refill the replay memory while the agent survives the full episode.
        if e % 50 == 0 and step == max_steps and self.fill_mem:
            self.fill_memory()
        # Early stopping: after half the episodes, stop once the agent has reached
        # max_steps for more than 50 consecutive episodes.
        break_flag = break_flag + 1 if step == max_steps else 0
        if break_flag > 50 and e >= episodes / 2:
            break
    ensure_saved_models_dir()
    saver = tf.train.Saver()
    saver.save(self.session, self.train_path)
    if plot_rewards:
        t_time = time.time() - start
        print("Mean score:", np.mean(scores), " Total steps:", np.sum(steps),
              " total time:", t_time)
        np.save("./train_data/ddpg_enc_actions" + str(self.state_size) +
                str(self.n_neighbors) + "_scores", scores)
        np.save("./train_data/ddpg_enc_actions" + str(self.state_size) +
                str(self.n_neighbors) + "_time", t_time)
        np.save("./train_data/ddpg_enc_actions" + str(self.state_size) +
                str(self.n_neighbors) + "_steps", steps)
        plot(steps)
        plot_running_avg(steps)

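# The np.save() calls above write the score/step/time arrays under ./train_data/
# (np.save appends the .npy extension). A small loading sketch for later analysis;
# the file prefix and state_size value are illustrative assumptions:
import numpy as np

state_size = 14  # illustrative value, not taken from this section
scores = np.load("./train_data/ddqn_" + str(state_size) + "_scores.npy")
steps = np.load("./train_data/ddqn_" + str(state_size) + "_steps.npy")
print("mean score:", scores.mean(), " total steps:", steps.sum())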