def curiosity(world):
    world = ActionNoise(world, stddev=0.2)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adam(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1, lr=0.05, memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)

        # Stack three consecutive observations into one flat observation
        # for the oracle.
        for_oracle = [
            [(np.asarray([o1, o2, o3]).flatten(), a1, r1)
             for (o1, a1, r1), (o2, a2, r2), (o3, a3, r3)
             in zip(t, t[1:], t[2:])]
            for t in agent_trajs
        ]
        memory.add_trajectories(for_oracle)

        predictions = retrace(for_oracle, model=oracle)
        save_plot(log_dir + "/%04d.png" % (episode + 1), agent_trajs, predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        # Curiosity reward: log mean squared error between the actual
        # observation change over a 10-step gap and the oracle's prediction.
        curiosity_trajs = [
            [(o1, a1, np.log(np.mean(np.square((o2 - o1) - delta_p))))
             for (o1, a1, r1), (o2, a2, r2), delta_p
             in zip(t, t[10:], p)]
            for t, p in zip(agent_trajs, predictions)
        ]
        # curiosity_trajs = replace_rewards(curiosity_trajs,
        #     episode=lambda rs: np.max(rs))

        print_reward(curiosity_trajs, max_value=5000.0)
        print_reward(agent_trajs, max_value=90.0, episode=np.sum)

        curiosity_trajs = discount(curiosity_trajs, horizon=500)
        curiosity_trajs = normalize(curiosity_trajs)

        agent_trajs = discount(agent_trajs, horizon=500)
        agent_trajs = normalize(agent_trajs)
        # Trim the real-reward trajectories so their length matches the
        # curiosity trajectories (which lose 10 steps to the lookahead).
        agent_trajs = [traj[:-10] for traj in agent_trajs]

        agent_weight = 0.5  # + 0.4*(0.5 * (1 - np.cos(np.pi * episode / 20)))
        curiosity_weight = 1. - agent_weight
        comb_trajs = combine_rewards(
            [curiosity_trajs, agent_trajs],
            [curiosity_weight, agent_weight]
        )

        grad = policy_gradient(comb_trajs, policy=agent)
        agent_opt.apply_gradient(grad)

        # Train the oracle to predict the change in the first two observation
        # dimensions over a 10-step gap, using trajectories from memory.
        oracle_trajs = [
            [(o1, (o2 - o1)[:2], 1.0)
             for (o1, a1, r1), (o2, a2, r2) in zip(t, t[10:])]
            for t in memory.trajectories(None, 4)
        ]
        grad = policy_gradient(oracle_trajs, policy=oracle)
        oracle_opt.apply_gradient(grad)
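# A minimal sketch (not part of the training loop above) that pulls the
# curiosity reward out of the list comprehension for readability: the reward
# at a step is the log mean squared error between the actual observation
# change over the 10-step gap and the oracle's predicted change. The function
# name is illustrative only.
import numpy as np

def curiosity_reward(o_now, o_future, predicted_delta):
    actual_delta = o_future - o_now
    return np.log(np.mean(np.square(actual_delta - predicted_delta)))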
def run():
    model = Input(4)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    world = StochasticPolicy(Gym(make_env, max_steps=500))

    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)

    for _ in range(50):
        model.load_params(opt.get_value())

        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=5000)

        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)

        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

    while True:
        world.render(model)
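# `make_env` is not defined in this snippet. A minimal sketch, assuming the
# policy's 4 inputs and 2 softmax outputs correspond to a classic-control
# Gym task such as CartPole; swap in whatever environment the project
# actually targets.
import gym

def make_env():
    return gym.make("CartPole-v1")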
def train_one(carrOpt):
    nonlocal oldTrajs
    classOpt = Adam(
        np.random.randn(classifier.n_params) * 1.,
        lr=0.5,
        memory=0.9,
    )
    if carrOpt is None:
        carrOpt = Adam(
            np.random.randn(curCarr.n_params),
            lr=0.10,
            memory=0.5,
        )
    curScore = 0.
    curAccuracy = 0.
    for i in range(250):
        classifier.load_params(classOpt.get_value())
        curCarr.load_params(carrOpt.get_value())

        # Mix 50 trajectories sampled from the replay buffer with 50 fresh ones.
        oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
        trajs = [oldTrajs[i] for i in oldTrajIdx]
        trajs += world.trajectories(curCarr, 50)

        # Tag old trajectories as class [1, 0] and fresh ones as [0, 1].
        trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
        trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
        plot_tagged_trajs(trajsForClass)

        accTrajs = accuracy(trajsForClass, model=classifier)
        print_reward(accTrajs, max_value=1.0,
                     episode=np.mean, label="Cla reward: ")
        curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
        # Stop once the classifier clears a bar that loosens over time.
        if curAccuracy > 1. - i / 500:
            break

        grad = policy_gradient(trajsForClass, policy=classifier)
        classOpt.apply_gradient(grad)

        trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
        print_reward(trajs2, max_value=1.0,
                     episode=np.max, label="Car reward: ")
        curScore = np.mean(get_rewards(trajs2, episode=np.max))

        trajs2 = replace_rewards(trajs2, episode=np.max)
        trajs2 = normalize(trajs2)

        grad2 = policy_gradient(trajs2, policy=curCarr)
        carrOpt.apply_gradient(grad2)

        if i % 10 == 0:
            print("%d episodes in." % i)

    oldTrajs += world.trajectories(curCarr, 800)
    world.render(curCarr)
    if curScore > 0.11:
        return carrOpt
    else:
        return None
def train_one(carrOpt):
    if carrOpt is None:
        carrOpt = Adam(
            np.random.randn(curCarr.n_params),
            lr=0.10,
            memory=0.5,
        )
    nextBreak = 5
    for i in range(250):
        curCarr.load_params(carrOpt.get_value())

        realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
        print_reward(realTrajs, max_value=90.0,
                     episode=np.sum, label="Real reward: ")
        print_reward(curiosityTrajs, max_value=1.0,
                     episode=np.max, label="Curiosity reward: ")

        # Stop only after curiosity has stayed saturated for several
        # consecutive iterations.
        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
        if curCuriosity > 0.98:
            if nextBreak == 0:
                break
            else:
                nextBreak -= 1
        else:
            nextBreak = np.min([nextBreak + 1, 5])

        realTrajs = replace_rewards(realTrajs, episode=np.sum)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)

        # Crude schedule; we should probably care more about whether the
        # costs are too high.
        realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.]) * 0.998 / 0.2
        curiosityWeight = 1. - realWeight
        print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))

        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)

        grad = policy_gradient(trajs, policy=curCarr)
        carrOpt.apply_gradient(grad)

        if i % 10 == 0:
            print("%d episodes in." % i)

    world.remember_agent(curCarr)
    world.render(curCarr)
    if curScore > 0.01:
        return carrOpt
    else:
        return None
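# The reward-mixing schedule above, written out as a standalone helper for
# clarity (the name is illustrative, not part of the code): the weight on the
# real reward ramps linearly from 0.001 to 0.999 as the normalized score
# grows from 0 to 0.2, and curiosity gets the remainder.
def reward_weights(cur_score):
    real = 0.001 + min(max(cur_score, 0.0), 0.2) * 0.998 / 0.2
    return real, 1.0 - real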
def train(world, model):
    opt = Adam(np.random.randn(model.n_params), lr=0.3, memory=0.9)

    for _ in range(20):
        model.load_params(opt.get_value())

        trajs = world.trajectories(None, 100)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

        trajs = cross_entropy(trajs, model=model)
        print_reward(trajs, episode=np.mean,
                     label="Surprise/byte:", max_value=8.0)
def curiosity(world):
    world = ActionNoise(world, stddev=0.1)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adam(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1, lr=0.05, memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)
        memory.add_trajectories(agent_trajs)

        predictions = retrace(agent_trajs, model=oracle)
        save_plot(log_dir + "/%04d.png" % (episode + 1), agent_trajs, predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        # Replace rewards with the curiosity signal: log mean squared error
        # between the actual observation change over a 10-step gap and the
        # oracle's predicted change.
        agent_trajs = [
            [(o1, a1, np.log(np.mean(np.square((o2 - o1) - delta_p))))
             for (o1, a1, r1), (o2, a2, r2), delta_p in zip(t, t[10:], p)]
            for t, p in zip(agent_trajs, predictions)
        ]
        agent_trajs = replace_rewards(
            agent_trajs,
            episode=lambda rs: np.max(rs) / len(rs)
        )
        print_reward(agent_trajs, max_value=10.0)
        agent_trajs = normalize(agent_trajs)

        grad = policy_gradient(agent_trajs, policy=agent)
        agent_opt.apply_gradient(grad)

        # Train the oracle to predict the observation change over a 10-step gap.
        oracle_trajs = [
            [(o1, o2 - o1, 1.0)
             for (o1, a1, r1), (o2, a2, r2) in zip(t, t[10:])]
            for t in memory.trajectories(None, 4)
        ]
        grad = policy_gradient(oracle_trajs, policy=oracle)
        oracle_opt.apply_gradient(grad)
def train(model):
    world = Mnist()
    opt = Adam(np.random.randn(model.n_params), lr=0.1)

    for i in range(600):
        # Evaluate with a small amount of parameter noise around the
        # optimizer's current point.
        model.load_params(opt.get_value()
                          + np.random.randn(model.n_params) * 0.01)

        trajs = world.trajectories(None, 256)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

        if i % 20 == 19:
            print("%4d) " % (i + 1), flush=True, end="")
            trajs = world.trajectories(None, 2000)
            trajs = accuracy(trajs, model=model, percent=True)
            print_reward(trajs, max_value=100, label="Train accuracy:")

    return opt.get_value()
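# Hypothetical usage of train(), reusing the layer constructors seen in run()
# above to build a 784-input, 10-class softmax classifier. The input/output
# sizes are assumptions about the Mnist() world, not taken from this code.
def build_mnist_model():
    model = Input(28 * 28)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 10)
    model = Softmax(model)
    return model

# Example call:
# trained_params = train(build_mnist_model())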
def train_one():
    gaussOpt = Adam(
        [0., 0.],
        lr=0.010,
        memory=0.5,
    )
    classOpt = Adam(
        np.random.randn(classifier.n_params) * 0.1,
        lr=0.5,
        memory=0.99
    )
    gaussCenterer = Constant(2)
    gausses.append(gaussCenterer)
    curAccuracy = 0.
    while curAccuracy < 0.98:
        classifier.load_params(classOpt.get_value())
        gaussCenterer.load_params(gaussOpt.get_value())

        # 500 samples from the previously placed gaussians (class [1, 0])
        # and 500 from the newest one (class [0, 1]).
        trajs = [[(gauss_observation(gausses[:-1]), [1, 0], 1.)]
                 for _ in range(500)]
        trajs += [[(gauss_observation(gausses[-1:]), [0, 1], 1.)]
                  for _ in range(500)]

        accTrajs = accuracy(trajs, model=classifier)
        print_reward(accTrajs, max_value=1.0)
        accs = [traj[0][2] for traj in accTrajs]
        curAccuracy = np.mean(accs)

        grad = policy_gradient(trajs, policy=classifier)
        classOpt.apply_gradient(grad)

        trajs2 = learn_from_classifier(classifier, trajs[500:], 1)
        trajs2 = normalize(trajs2)
        grad2 = policy_gradient(trajs2, policy=gaussCenterer)
        gaussOpt.apply_gradient(grad2)

        # Scatter-plot both classes of samples.
        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_ylim([-1, 1])
        plt.gcf().axes[0].set_xlim([-1, 1])
        x, y = zip(*[o for ((o, _, _),) in trajs[:500]])
        plt.scatter(x, y, color="blue")
        x, y = zip(*[o for ((o, _, _),) in trajs[500:]])
        plt.scatter(x, y, color="red")
        plt.pause(0.01)
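# `gauss_observation` is not defined in this snippet. A rough sketch of the
# assumed behavior, written against plain 2-D centers rather than the
# Constant models used above: pick one center at random and return a gaussian
# sample around it. The noise scale and the exact interface are guesses.
import numpy as np

def gauss_observation_sketch(centers, stddev=0.1):
    center = centers[np.random.randint(len(centers))]
    return np.asarray(center) + np.random.randn(2) * stddev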