# Run the student agent for `episodes` episodes. After the first episode and
# after every 20th one, save the student's weights and print a progress line;
# the 20-episode checkpoints also report the mean score since the previous
# averaged checkpoint and its change from the checkpoint before that.
#
# NOTE(review): this section had lost its line breaks. As written, the inner
# loop had no exit and `ep_avg` was never filled, so every reported average
# would have been the mean of an empty list (nan). The `break` and the
# per-episode `ep_avg.append(score_)` below fix both -- confirm against the
# intended script.
teacher_step_nums = []  # not referenced in this section
step_nums = []  # not referenced in this section
ep_avgs = [0]  # checkpoint averages; the leading 0 is the baseline for the first delta
ep_avg = []  # scores of episodes finished since the last averaged checkpoint

for ee in range(episodes):
    ob = env.reset()
    score_ = 0
    steps = 0  # steps taken in the current episode
    t_steps = 0  # not referenced in this section
    while True:
        steps += 1
        action = student.act(ob)
        next_ob, reward, done, _ = env.step(action)
        score_ += reward
        student.build_memory(ob, action, reward, next_ob, done)
        ob = next_ob
        if not done:
            continue

        # Record the score before the checkpoint logic so that the averaged
        # window includes the episode that just finished.
        ep_avg.append(score_)

        if (ee + 1) % 20 == 0 or ee == 0:
            student.model.save_weights("teacher_student_weights.h5")
            # Single-argument parenthesised print behaves the same under
            # Python 2 (print statement) and Python 3 (print function).
            if ee == 0:
                print(
                    "Completed {}/{} with a score of {}.".format(
                        ee + 1, episodes, score_
                    )
                )
            else:
                ep_avgs.append(np.mean(ep_avg))
                ep_avg = []
                report = (
                    "Completed {}/{} with a score of {}. "
                    "Average over the last 20 epochs was {} ({})."
                )
                print(
                    report.format(
                        ee + 1,
                        episodes,
                        score_,
                        ep_avgs[-1],
                        ep_avgs[-1] - ep_avgs[-2],
                    )
                )

        # The episode is over: leave the step loop and start the next episode.
        break
for i in range(5000): print "===== {} =====".format(str(i)) ob = env.reset() sum_r = 0 while True: act_t = pred(ob[None, :, :, :])[0][0].argmax( ) # Record Teacher Action for Training act_s = student.predict(np.array(ob[None, :, :, :]) / 255.)[0].argmax() # Students Action if act_t != act_s: observations_.append(ob / 255.) # Append Observation teacher_actions.append(act_t) ob, r, isOver, info = env.step(act_s) # Take Step sum_r += r if isOver: print "Total Score \t {}".format(str(sum_r)) break scrambled_idx = np.random.choice(len(np.array(observations_)), size=(len(np.array(observations_)), ), replace=False) print "Training Count \t {}".format(str(len(scrambled_idx))) student.fit(np.array(observations_)[scrambled_idx], np_utils.to_categorical(teacher_actions, num_classes=9)[scrambled_idx], epochs=5, batch_size=8,