def run():
    # Softmax policy: 4 observation inputs, one hidden layer, 2 actions.
    model = Input(4)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    world = StochasticPolicy(Gym(make_env, max_steps=500))

    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)

    # Plain policy-gradient training loop.
    for _ in range(50):
        model.load_params(opt.get_value())
        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=5000)

        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

    # Show the trained policy until interrupted.
    while True:
        world.render(model)
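# Illustrative sketch only (not the library's implementation): one plausible
# reading of what discount(trajs, horizon=...) and normalize(trajs) do to the
# reward column of a trajectory, using plain numpy. The horizon-based decay
# factor below is an assumption.
import numpy as np

def discounted_returns_sketch(rewards, horizon):
    # Accumulate future rewards with decay gamma = 1 - 1/horizon.
    gamma = 1.0 - 1.0 / horizon
    returns = np.zeros(len(rewards))
    acc = 0.0
    for t in reversed(range(len(rewards))):
        acc = rewards[t] + gamma * acc
        returns[t] = acc
    return returns

def normalize_sketch(values):
    # Shift to zero mean and scale to unit variance.
    values = np.asarray(values, dtype=float)
    return (values - values.mean()) / (values.std() + 1e-8)

# Example: a short episode with a reward of 1.0 at every step.
print(normalize_sketch(discounted_returns_sketch(np.ones(5), horizon=500)))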
def curiosity(world):
    world = ActionNoise(world, stddev=0.2)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # The agent is trained on a mix of real and curiosity rewards;
    # the oracle learns to predict future observation changes.
    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params),
                      lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1,
                      lr=0.05, memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)

        # Stack three consecutive observations as the oracle's input.
        for_oracle = [
            [(np.asarray([o1, o2, o3]).flatten(), a1, r1)
             for (o1, a1, r1), (o2, a2, r2), (o3, a3, r3)
             in zip(t, t[1:], t[2:])]
            for t in agent_trajs
        ]
        memory.add_trajectories(for_oracle)
        predictions = retrace(for_oracle, model=oracle)

        save_plot(log_dir + "/%04d.png" % (episode + 1),
                  agent_trajs, predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        # Curiosity reward: log MSE between the predicted and the actual
        # observation change 10 steps ahead.
        curiosity_trajs = [
            [(o1, a1, np.log(np.mean(np.square((o2 - o1) - delta_p))))
             for (o1, a1, r1), (o2, a2, r2), delta_p in zip(t, t[10:], p)]
            for t, p in zip(agent_trajs, predictions)
        ]
        # curiosity_trajs = replace_rewards(curiosity_trajs,
        #     episode=lambda rs: np.max(rs))

        print_reward(curiosity_trajs, max_value=5000.0)
        print_reward(agent_trajs, max_value=90.0, episode=np.sum)

        curiosity_trajs = discount(curiosity_trajs, horizon=500)
        curiosity_trajs = normalize(curiosity_trajs)

        agent_trajs = discount(agent_trajs, horizon=500)
        agent_trajs = normalize(agent_trajs)
        agent_trajs = [traj[:-10] for traj in agent_trajs]

        agent_weight = 0.5  # + 0.4*(0.5 * (1 - np.cos(np.pi * episode / 20)))
        curiosity_weight = 1. - agent_weight
        comb_trajs = combine_rewards([curiosity_trajs, agent_trajs],
                                     [curiosity_weight, agent_weight])

        grad = policy_gradient(comb_trajs, policy=agent)
        agent_opt.apply_gradient(grad)

        # Train the oracle on remembered trajectories to predict the first
        # two components of the observation change 10 steps ahead.
        oracle_trajs = [
            [(o1, (o2 - o1)[:2], 1.0)
             for (o1, a1, r1), (o2, a2, r2) in zip(t, t[10:])]
            for t in memory.trajectories(None, 4)
        ]
        grad = policy_gradient(oracle_trajs, policy=oracle)
        oracle_opt.apply_gradient(grad)
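# Illustrative sketch only: the per-step curiosity reward above is the log of
# the mean squared error between the oracle's predicted observation change
# (delta_p) and the change that actually happened (o2 - o1). Shown here on
# made-up vectors; the larger the prediction error, the larger the reward.
import numpy as np

o1 = np.array([0.0, 0.0, 1.0])       # observation now (made up)
o2 = np.array([0.3, -0.1, 1.2])      # observation several steps later (made up)
delta_p = np.array([0.1, 0.0, 0.1])  # oracle's predicted change (made up)

curiosity_reward = np.log(np.mean(np.square((o2 - o1) - delta_p)))
print(curiosity_reward)  # more negative = better prediction = less curiosity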
def train_one(carrOpt):
    # Nested helper: classifier, curCarr, world and oldTrajs come from
    # the enclosing scope (hence the nonlocal declaration).
    nonlocal oldTrajs

    classOpt = Adam(
        np.random.randn(classifier.n_params) * 1.,
        lr=0.5,
        memory=0.9,
    )
    if carrOpt is None:
        carrOpt = Adam(
            np.random.randn(curCarr.n_params),
            lr=0.10,
            memory=0.5,
        )

    curScore = 0.
    curAccuracy = 0.
    for i in range(250):
        classifier.load_params(classOpt.get_value())
        curCarr.load_params(carrOpt.get_value())

        # Mix 50 remembered trajectories with 50 fresh ones.
        oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
        trajs = [oldTrajs[idx] for idx in oldTrajIdx]
        trajs += world.trajectories(curCarr, 50)

        # Tag old trajectories as class [1, 0] and fresh ones as [0, 1].
        trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
        trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
        plot_tagged_trajs(trajsForClass)

        accTrajs = accuracy(trajsForClass, model=classifier)
        print_reward(accTrajs, max_value=1.0,
                     episode=np.mean, label="Cla reward: ")
        curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
        if curAccuracy > 1. - i / 500:
            break

        grad = policy_gradient(trajsForClass, policy=classifier)
        classOpt.apply_gradient(grad)

        # Use the classifier's output to score the fresh trajectories.
        trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
        print_reward(trajs2, max_value=1.0,
                     episode=np.max, label="Car reward: ")
        curScore = np.mean(get_rewards(trajs2, episode=np.max))

        trajs2 = replace_rewards(trajs2, episode=np.max)
        trajs2 = normalize(trajs2)
        grad2 = policy_gradient(trajs2, policy=curCarr)
        carrOpt.apply_gradient(grad2)

        if i % 10 == 0:
            print("%d episodes in." % i)

    oldTrajs += world.trajectories(curCarr, 800)
    world.render(curCarr)

    if curScore > 0.11:
        return carrOpt
    else:
        return None
def train_one(carrOpt):
    # Nested helper: curCarr and world come from the enclosing scope.
    if carrOpt is None:
        carrOpt = Adam(
            np.random.randn(curCarr.n_params),
            lr=0.10,
            memory=0.5,
        )

    nextBreak = 5
    for i in range(250):
        curCarr.load_params(carrOpt.get_value())

        realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
        print_reward(realTrajs, max_value=90.0,
                     episode=np.sum, label="Real reward: ")
        print_reward(curiosityTrajs, max_value=1.0,
                     episode=np.max, label="Curiosity reward: ")

        # Stop once the curiosity reward stays saturated for several iterations.
        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
        if curCuriosity > 0.98:
            if nextBreak == 0:
                break
            else:
                nextBreak -= 1
        else:
            nextBreak = np.min([nextBreak + 1, 5])

        realTrajs = replace_rewards(realTrajs, episode=np.sum)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)

        # This is crude; we should probably care more if the costs get too high.
        realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.]) * 0.998 / 0.2
        curiosityWeight = 1. - realWeight
        print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))

        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=curCarr)
        carrOpt.apply_gradient(grad)

        if i % 10 == 0:
            print("%d episodes in." % i)

    world.remember_agent(curCarr)
    world.render(curCarr)

    if curScore > 0.01:
        return carrOpt
    else:
        return None
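# Illustrative sketch only (an assumption about combine_rewards semantics):
# the call above plausibly blends the two reward streams as a per-step
# weighted sum, with the real-reward weight ramping from about 0.001 up to
# about 0.999 as curScore grows from 0 to 0.2. Plain-numpy version:
import numpy as np

cur_score = 0.1                                  # made-up normalized score
real_weight = 0.001 + np.max([np.min([cur_score, 0.2]), 0.]) * 0.998 / 0.2
curiosity_weight = 1. - real_weight

real_rewards = np.array([1.0, 0.0, 2.0])         # made-up per-step rewards
curiosity_rewards = np.array([0.2, 0.9, 0.1])
print(real_weight * real_rewards + curiosity_weight * curiosity_rewards)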
def train(world, model):
    opt = Adam(np.random.randn(model.n_params), lr=0.3, memory=0.9)

    for _ in range(20):
        model.load_params(opt.get_value())
        trajs = world.trajectories(None, 100)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

        trajs = cross_entropy(trajs, model=model)
        print_reward(trajs, episode=np.mean,
                     label="Surprise/byte:", max_value=8.0)
def curiosity(world):
    world = ActionNoise(world, stddev=0.1)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params),
                      lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1,
                      lr=0.05, memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)
        memory.add_trajectories(agent_trajs)
        predictions = retrace(agent_trajs, model=oracle)

        save_plot(log_dir + "/%04d.png" % (episode + 1),
                  agent_trajs, predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        # Replace the environment reward with the oracle's prediction error
        # (log MSE of the predicted vs. actual observation change 10 steps ahead).
        agent_trajs = [
            [(o1, a1, np.log(np.mean(np.square((o2 - o1) - delta_p))))
             for (o1, a1, r1), (o2, a2, r2), delta_p in zip(t, t[10:], p)]
            for t, p in zip(agent_trajs, predictions)
        ]
        agent_trajs = replace_rewards(agent_trajs,
                                      episode=lambda rs: np.max(rs) / len(rs))
        print_reward(agent_trajs, max_value=10.0)

        agent_trajs = normalize(agent_trajs)
        grad = policy_gradient(agent_trajs, policy=agent)
        agent_opt.apply_gradient(grad)

        # Train the oracle on remembered trajectories to predict the
        # observation change 10 steps ahead.
        oracle_trajs = [
            [(o1, o2 - o1, 1.0)
             for (o1, a1, r1), (o2, a2, r2) in zip(t, t[10:])]
            for t in memory.trajectories(None, 4)
        ]
        grad = policy_gradient(oracle_trajs, policy=oracle)
        oracle_opt.apply_gradient(grad)
def run():
    model = Input(28, 28)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 10)
    model = Softmax(model)

    train_world = StochasticPolicy(Accuracy(Mnist()))

    opt = Adams(np.random.randn(model.n_params), lr=0.00002, memory=0.99)

    for i in range(600):
        model.load_params(opt.get_value())
        trajs = train_world.trajectories(model, 128)
        print_reward(trajs, max_value=1)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)
def curiosity(world):
    world = ActionNoise(world, stddev=0.1)
    memory = Cache(delay=32 * 25)

    log_dir = "__car"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params),
                      lr=0.00015, memory=0.5)

    classifier = build_classifier()
    classifier_opt = Adams(np.random.randn(classifier.n_params),
                           lr=0.00005, memory=0.9)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        classifier.load_params(classifier_opt.get_value())

        agent_trajs = world.trajectories(agent, 32)
        memory.add_trajectory(*agent_trajs)

        # Train the classifier to tell remembered states ([1, 0])
        # from the agent's fresh states ([0, 1]).
        classifier_trajs = (
            supervised(memory.trajectories(None, 32), label=lambda *_: [1, 0])
            + supervised(agent_trajs, label=lambda *_: [0, 1])
        )
        grad = policy_gradient(classifier_trajs, policy=classifier)
        classifier_opt.apply_gradient(grad)

        # Reward the agent with the classifier's "fresh" probability.
        agent_trajs = replace_rewards(
            agent_trajs,
            model=classifier,
            reward=lambda o: o[1],
            episode=lambda rs: np.square(rs) / len(rs)
        )
        save_plot(log_dir + "/%04d.png" % (episode + 1),
                  classifier, classifier_trajs)
        print_reward(agent_trajs, max_value=1.0)

        agent_trajs = discount(agent_trajs, horizon=100)
        agent_trajs = normalize(agent_trajs)
        grad = policy_gradient(agent_trajs, policy=agent)
        agent_opt.apply_gradient(grad)
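# Illustrative sketch only: the replace_rewards(...) call above takes the
# classifier's "this looks new" probability (its second output, o[1]) as the
# per-step reward, then squares each value and divides by the episode length.
# Plain-numpy illustration on made-up classifier outputs:
import numpy as np

p_new = np.array([0.1, 0.4, 0.9, 0.8])        # p("new") at each step (made up)
shaped_rewards = np.square(p_new) / len(p_new)
print(shaped_rewards)  # larger when the classifier finds the states novel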
def train(model):
    world = Mnist()
    opt = Adam(np.random.randn(model.n_params), lr=0.1)

    for i in range(600):
        # Evaluate slightly perturbed parameters on each batch.
        model.load_params(opt.get_value()
                          + np.random.randn(model.n_params) * 0.01)
        trajs = world.trajectories(None, 256)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

        if i % 20 == 19:
            print("%4d) " % (i + 1), flush=True, end="")
            trajs = world.trajectories(None, 2000)
            trajs = accuracy(trajs, model=model, percent=True)
            print_reward(trajs, max_value=100, label="Train accuracy:")

    return opt.get_value()
def train(world, model):
    opt = Adams(
        np.random.randn(model.n_params) * 0.1,
        lr=0.0001,
        memory=0.8
    )

    while True:
        model.load_params(opt.get_value())
        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=500)

        if np.mean(get_rewards(trajs, episode=np.sum)) >= 498:
            return opt.get_value()

        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)
def train_one():
    gaussOpt = Adam(
        [0., 0.],
        lr=0.010,
        memory=0.5,
    )
    classOpt = Adam(
        np.random.randn(classifier.n_params) * 0.1,
        lr=0.5,
        memory=0.99
    )
    gaussCenterer = Constant(2)
    gausses.append(gaussCenterer)

    curAccuracy = 0.
    while curAccuracy < 0.98:
        classifier.load_params(classOpt.get_value())
        gaussCenterer.load_params(gaussOpt.get_value())

        # 500 samples from the older Gaussians (class [1, 0]) and
        # 500 from the newest one (class [0, 1]).
        trajs = [[(gauss_observation(gausses[:-1]), [1, 0], 1.)]
                 for _ in range(500)]
        trajs += [[(gauss_observation(gausses[-1:]), [0, 1], 1.)]
                  for _ in range(500)]

        accTrajs = accuracy(trajs, model=classifier)
        print_reward(accTrajs, max_value=1.0)
        accs = [traj[0][2] for traj in accTrajs]
        curAccuracy = np.mean(accs)

        grad = policy_gradient(trajs, policy=classifier)
        classOpt.apply_gradient(grad)

        # Move the newest Gaussian using feedback from the classifier.
        trajs2 = learn_from_classifier(classifier, trajs[500:], 1)
        trajs2 = normalize(trajs2)
        grad2 = policy_gradient(trajs2, policy=gaussCenterer)
        gaussOpt.apply_gradient(grad2)

        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_ylim([-1, 1])
        plt.gcf().axes[0].set_xlim([-1, 1])
        x, y = zip(*[o for ((o, _, _),) in trajs[:500]])
        plt.scatter(x, y, color="blue")
        x, y = zip(*[o for ((o, _, _),) in trajs[500:]])
        plt.scatter(x, y, color="red")
        plt.pause(0.01)
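# Hypothetical helper for illustration only: gauss_observation(...) is not
# defined in this snippet. Given the 2-D scatter plots above, one plausible
# reading is "sample a point near one of the tracked Gaussian centers".
# Plain-numpy sketch with made-up centers, independent of the library objects:
import numpy as np

def gauss_observation_sketch(centers, stddev=0.2):
    center = centers[np.random.randint(len(centers))]
    return center + np.random.randn(2) * stddev

print(gauss_observation_sketch([np.zeros(2), np.array([0.5, -0.5])]))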
def run():
    model = Input(28, 28)
    model = Conv2d(model, size=3, channels=8)
    model = LReLU(model)
    model = Maxpool(model, size=2)
    model = Conv2d(model, size=5, channels=16)
    model = LReLU(model)
    model = Maxpool(model, size=2)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 10)
    model = Softmax(model)

    if len(sys.argv) >= 2:
        params = np.load(sys.argv[1])
    else:
        params = train(model)
        np.save("__mnist.npy", params)

    model.load_params(params)

    test_world = Mnist(test=True)
    trajs = test_world.trajectories(None, 5000)
    trajs = accuracy(trajs, model=model, percent=True)
    print_reward(trajs, max_value=100, label="Test accuracy:")
def run():
    # Classifier used by the Curiosity wrapper: 7 inputs, 2 classes.
    classifier = Input(7)
    classifier = Affine(classifier, 32)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    agent = walker()
    agent.load_params(np.random.randn(agent.n_params) * 1.5)

    MAX_TRAIN_TIME = 200
    trainTimeLeft = MAX_TRAIN_TIME
    curAgentId = -1
    curMemoryId = 0

    def plot_tagged_trajs(trajs):
        nonlocal trainTimeLeft, curAgentId, curMemoryId
        COLORS = ["blue", "red"]
        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_xlim([-1.25, 1.25])
        plt.gcf().axes[0].set_ylim([-1.25, 1.25])
        plt.suptitle("Episode %d of agent %d, memories: %d"
                     % (MAX_TRAIN_TIME - trainTimeLeft, curAgentId, curMemoryId))
        for traj in trajs:
            tag = traj[0][1]
            xs, ys = [], []
            for state, _, _ in traj:
                xs.append(state[2])
                ys.append(state[3])
            plt.plot(xs, ys, color=COLORS[np.argmax(tag)], alpha=0.1)
        plt.gcf().set_size_inches(10, 8)
        plt.gcf().savefig("__step_a%03d_t%03d.png"
                          % (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                          dpi=100)

    world = Gym("BipedalWalker-v2", max_steps=MAX_STEPS)
    world = ActionNoise(world, stddev=0.2)
    world = Curiosity(
        world,
        classifier=classifier,
        history_length=50,
        for_classifier=lambda ts: change_obs_space(ts, changer=interesting_part),
        plot=plot_tagged_trajs
    )

    MAX_BOREDOM = 3
    boredom = MAX_BOREDOM
    MAX_MOTIVATION = 3
    motivation = MAX_MOTIVATION

    agentOpt = None
    lastScores = None

    def memorize():
        nonlocal boredom, curMemoryId
        print("Memorizing %d..." % curMemoryId)
        world.remember(agent)
        boredom = MAX_BOREDOM
        curMemoryId += 1

    def save_agent():
        np.save("__ranger_a%03d_t%03d.npy"
                % (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                agentOpt.get_value())

    def reset_agent():
        nonlocal agentOpt, trainTimeLeft, lastScores, curAgentId, motivation
        if agentOpt is not None:
            save_agent()
        print("Resetting agent %d." % curAgentId)
        agentOpt = Adam(
            np.random.randn(agent.n_params) * 1.5,
            lr=0.05,
            memory=0.9,
        )
        trainTimeLeft = MAX_TRAIN_TIME
        lastScores = [-0.4]
        curAgentId += 1
        motivation = MAX_MOTIVATION

    reset_agent()
    while True:
        agent.load_params(agentOpt.get_value())
        realTrajs, curiosityTrajs = world.trajectories(agent, 30)

        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 300.
        lastScores.append(curScore)
        lastScores = lastScores[-10:]
        scoreDev = np.std(lastScores)
        scoreMean = np.max([np.abs(np.mean(lastScores)), 1.])
        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))

        print_reward(realTrajs, max_value=300.0,
                     episode=np.sum, label="Real reward: ")
        print_reward(curiosityTrajs, max_value=1.0,
                     episode=np.max, label="Curiosity reward: ")

        # Bored: curiosity has saturated, so remember the current behavior.
        if curCuriosity > 0.85:
            if boredom == 0:
                save_agent()
                memorize()
            else:
                boredom -= 1
        else:
            boredom = np.min([boredom + 1, MAX_BOREDOM])

        # Unmotivated: the score has flatlined or the time budget ran out.
        if scoreDev / scoreMean < 0.010 or trainTimeLeft < 0:
            if motivation == 0:
                print("Not really learning.")
                save_agent()
                motivation = MAX_MOTIVATION
                trainTimeLeft = MAX_TRAIN_TIME
                if curScore < 0.01:
                    memorize()
                    reset_agent()
                    continue
            else:
                motivation -= 1
        else:
            motivation = np.min([motivation + 1, MAX_MOTIVATION])

        realTrajs = discount(realTrajs, horizon=200)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)

        realWeight = np.min([scoreDev / scoreMean * 10., 0.9])
        curiosityWeight = 1. - realWeight
        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)

        grad = policy_gradient(trajs, policy=agent)
        agentOpt.apply_gradient(grad)
        trainTimeLeft -= 1
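# Illustrative sketch only: the "not really learning" test above is a
# coefficient-of-variation check on the last 10 episode scores (with the mean
# clamped to at least 1). Plain-numpy version with made-up numbers:
import numpy as np

last_scores = [0.31, 0.30, 0.32, 0.31, 0.30]   # made-up recent mean scores
score_dev = np.std(last_scores)
score_mean = np.max([np.abs(np.mean(last_scores)), 1.])
print(score_dev / score_mean < 0.010)  # True -> scores have flatlined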