def build_classifier():
    model = Input(2)
    model = LReLU(Affine(model, 32))
    model = LReLU(Affine(model, 32))
    model = Affine(model, 2)
    model = Softmax(model)
    return model
def walker():
    walker = Input(STATE_SIZE)
    walker = Affine(walker, 64)
    walker = LReLU(walker)
    walker = Affine(walker, ACTION_SIZE)
    walker = Tanh(walker)
    return walker
def run():
    # Policy network: 4 inputs -> softmax over 2 actions.
    model = Input(4)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    world = StochasticPolicy(Gym(make_env, max_steps=500))
    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)

    for _ in range(50):
        model.load_params(opt.get_value())
        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=5000)
        # Turn raw rewards into discounted, normalized returns before the update.
        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

    while True:
        world.render(model)
def run():
    # Classifier: 2-d points -> softmax over 2 classes (old clusters vs. the newest one).
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    # Start with a single Gaussian centered at the origin.
    gausses = [Constant(2)]
    gausses[0].load_params([0., 0.])

    plt.ion()

    def train_one():
        gaussOpt = Adam(
            [0., 0.],
            lr=0.010,
            memory=0.5,
        )
        classOpt = Adam(
            np.random.randn(classifier.n_params) * 0.1,
            lr=0.5,
            memory=0.99,
        )
        # Add a new Gaussian and train its center together with the classifier.
        gaussCenterer = Constant(2)
        gausses.append(gaussCenterer)
        curAccuracy = 0.
        while curAccuracy < 0.98:
            classifier.load_params(classOpt.get_value())
            gaussCenterer.load_params(gaussOpt.get_value())
            # 500 samples from the old Gaussians (class 0) and 500 from the new one (class 1).
            trajs = [[(gauss_observation(gausses[:-1]), [1, 0], 1.)]
                     for _ in range(500)]
            trajs += [[(gauss_observation(gausses[-1:]), [0, 1], 1.)]
                      for _ in range(500)]
            accTrajs = accuracy(trajs, model=classifier)
            print_reward(accTrajs, max_value=1.0)
            accs = [traj[0][2] for traj in accTrajs]
            curAccuracy = np.mean(accs)
            grad = policy_gradient(trajs, policy=classifier)
            classOpt.apply_gradient(grad)
            # Use the classifier's output as a reward signal for the new Gaussian's center.
            trajs2 = learn_from_classifier(classifier, trajs[500:], 1)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=gaussCenterer)
            gaussOpt.apply_gradient(grad2)
            # Plot old samples in blue and the newest Gaussian's samples in red.
            plt.clf()
            plt.grid()
            plt.gcf().axes[0].set_ylim([-1, 1])
            plt.gcf().axes[0].set_xlim([-1, 1])
            x, y = zip(*[o for ((o, _, _),) in trajs[:500]])
            plt.scatter(x, y, color="blue")
            x, y = zip(*[o for ((o, _, _),) in trajs[500:]])
            plt.scatter(x, y, color="red")
            plt.pause(0.01)

    for i in range(10):
        print("Teaching agent %d." % i)
        train_one()

    plt.pause(10000000000000.)
def run():
    # MNIST classifier: 28x28 input -> softmax over 10 digits.
    model = Input(28, 28)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 10)
    model = Softmax(model)

    train_world = StochasticPolicy(Accuracy(Mnist()))

    opt = Adam(np.random.randn(model.n_params), lr=0.00002, memory=0.99)

    for i in range(600):
        model.load_params(opt.get_value())
        trajs = train_world.trajectories(model, 128)
        print_reward(trajs, max_value=1)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)
def run():
    world = StochasticPolicy(Gym("CartPole-v1"))

    model = Input(4)
    model = Affine(model, 64)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    if len(sys.argv) >= 2:
        params = np.load(sys.argv[1])
    else:
        params = train(world, model)
        np.save("__cartpole.npy", params)

    model.load_params(params)
    world.render(model)
def run():
    model = Input(28, 28)
    model = Conv2d(model, size=3, channels=8)
    model = LReLU(model)
    model = Maxpool(model, size=2)
    model = Conv2d(model, size=5, channels=16)
    model = LReLU(model)
    model = Maxpool(model, size=2)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 10)
    model = Softmax(model)

    if len(sys.argv) >= 2:
        params = np.load(sys.argv[1])
    else:
        params = train(model)
        np.save("__mnist.npy", params)

    model.load_params(params)

    test_world = Mnist(test=True)
    trajs = test_world.trajectories(None, 5000)
    trajs = accuracy(trajs, model=model, percent=True)
    print_reward(trajs, max_value=100, label="Test accuracy:")
def run():
    world = Bytes(b"aabbaab", max_steps=4, charset=b'abcd')

    print("\nConstant model:\n")
    model = Constant(4)
    model = Softmax(model)
    train(world, model)

    print("\nLast character:\n")
    model = Input(4)
    model = Affine(model, 4)
    model = LReLU(model)
    model = Affine(model, 4)
    model = Softmax(model)
    train(world, model)

    print("\nLast two characters:\n")
    model = Input(2, 4)
    model = Affine(model, 4)
    model = LReLU(model)
    model = Affine(model, 4)
    model = History(model, length=2)
    model = Softmax(model)
    train(world, model)
def run():
    if len(sys.argv) < 2:
        print("Usage: imitate.py <file>")
        return

    # Read the file as raw bytes and build the character set from it.
    with open(sys.argv[1], "rb") as f:
        data = f.read()
    charset = set(data)

    world = Bytes(data, max_steps=100, charset=charset)
    print("Charset size: %d" % len(charset))

    model = Input(len(charset))
    model = LSTM(model)
    model = Affine(model, len(charset))
    model = Softmax(model)

    train(world, model)

    for _ in range(10):
        world.render(model)
def build_oracle():
    model = Input(2)
    model = LReLU(Affine(model, 32))
    model = LReLU(Affine(model, 32))
    model = Affine(model, 2)
    return model
def build_agent():
    model = Input(2)
    model = LReLU(Affine(model, 32))
    model = LReLU(Affine(model, 32))
    model = Affine(model, 1)
    return model
def run():
    # Classifier used by the Curiosity wrapper to score how novel a trajectory is.
    classifier = Input(7)
    classifier = Affine(classifier, 32)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    agent = walker()
    agent.load_params(np.random.randn(agent.n_params) * 1.5)

    MAX_TRAIN_TIME = 200
    trainTimeLeft = MAX_TRAIN_TIME
    curAgentId = -1
    curMemoryId = 0

    def plot_tagged_trajs(trajs):
        nonlocal trainTimeLeft, curAgentId, curMemoryId
        COLORS = ["blue", "red"]
        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_xlim([-1.25, 1.25])
        plt.gcf().axes[0].set_ylim([-1.25, 1.25])
        plt.suptitle("Episode %d of agent %d, memories: %d"
                     % (MAX_TRAIN_TIME - trainTimeLeft, curAgentId, curMemoryId))
        for traj in trajs:
            tag = traj[0][1]
            xs, ys = [], []
            for state, _, _ in traj:
                x = state[2]
                y = state[3]
                xs.append(x)
                ys.append(y)
            plt.plot(xs, ys, color=COLORS[np.argmax(tag)], alpha=0.1)
        plt.gcf().set_size_inches(10, 8)
        plt.gcf().savefig("__step_a%03d_t%03d.png"
                          % (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                          dpi=100)

    world = Gym("BipedalWalker-v2", max_steps=MAX_STEPS)
    world = ActionNoise(world, stddev=0.2)
    world = Curiosity(world, classifier=classifier, history_length=50,
                      for_classifier=lambda ts: change_obs_space(
                          ts, changer=interesting_part),
                      plot=plot_tagged_trajs)

    # "Boredom" counts down while curiosity stays high (then the behavior is memorized);
    # "motivation" counts down while the real score stagnates.
    MAX_BOREDOM = 3
    boredom = MAX_BOREDOM
    MAX_MOTIVATION = 3
    motivation = MAX_MOTIVATION

    agentOpt = None
    lastScores = None

    def memorize():
        nonlocal boredom, curMemoryId
        print("Memorizing %d..." % curMemoryId)
        world.remember(agent)
        boredom = MAX_BOREDOM
        curMemoryId += 1

    def save_agent():
        np.save("__ranger_a%03d_t%03d.npy"
                % (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                agentOpt.get_value())

    def reset_agent():
        nonlocal agentOpt, trainTimeLeft, lastScores, curAgentId, motivation
        if agentOpt is not None:
            save_agent()
        print("Resetting agent %d." % curAgentId)
        agentOpt = Adam(
            np.random.randn(agent.n_params) * 1.5,
            lr=0.05,
            memory=0.9,
        )
        trainTimeLeft = MAX_TRAIN_TIME
        lastScores = [-0.4]
        curAgentId += 1
        motivation = MAX_MOTIVATION

    reset_agent()
    while True:
        agent.load_params(agentOpt.get_value())
        realTrajs, curiosityTrajs = world.trajectories(agent, 30)
        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 300.
        lastScores.append(curScore)
        lastScores = lastScores[-10:]
        scoreDev = np.std(lastScores)
        scoreMean = np.max([np.abs(np.mean(lastScores)), 1.])
        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
        print_reward(realTrajs, max_value=300.0,
                     episode=np.sum, label="Real reward: ")
        print_reward(curiosityTrajs, max_value=1.0,
                     episode=np.max, label="Curiosity reward: ")
        if curCuriosity > 0.85:
            if boredom == 0:
                save_agent()
                memorize()
            else:
                boredom -= 1
        else:
            boredom = np.min([boredom + 1, MAX_BOREDOM])
        if scoreDev / scoreMean < 0.010 or trainTimeLeft < 0:
            if motivation == 0:
                print("Not really learning.")
                save_agent()
                motivation = MAX_MOTIVATION
                trainTimeLeft = MAX_TRAIN_TIME
                if curScore < 0.01:
                    memorize()
                    reset_agent()
                    continue
            else:
                motivation -= 1
        else:
            motivation = np.min([motivation + 1, MAX_MOTIVATION])
        realTrajs = discount(realTrajs, horizon=200)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
        # Mix real and curiosity rewards; the weight shifts toward curiosity
        # as the real score plateaus.
        realWeight = np.min([scoreDev / scoreMean * 10., 0.9])
        curiosityWeight = 1. - realWeight
        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=agent)
        agentOpt.apply_gradient(grad)
        trainTimeLeft -= 1
def carr():
    carr = Input(2)
    carr = Affine(carr, 32)
    carr = LReLU(carr)
    carr = Affine(carr, 1)
    return carr
def run():
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    curCarr = carr()
    curCarr.load_params(np.random.randn(curCarr.n_params))

    # The Curiosity wrapper uses the classifier to reward trajectories that
    # look different from the remembered ones.
    world = Gym("MountainCarContinuous-v0", max_steps=500)
    world = ActionNoise(world, stddev=0.1)
    world = Curiosity(world, classifier=classifier, history_length=800,
                      plot=True)

    def train_one(carrOpt):
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        nextBreak = 5
        for i in range(250):
            curCarr.load_params(carrOpt.get_value())
            realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
            curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
            print_reward(realTrajs, max_value=90.0,
                         episode=np.sum, label="Real reward: ")
            print_reward(curiosityTrajs, max_value=1.0,
                         episode=np.max, label="Curiosity reward: ")
            curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
            # Stop once curiosity has stayed very high for several batches.
            if curCuriosity > 0.98:
                if nextBreak == 0:
                    break
                else:
                    nextBreak -= 1
            else:
                nextBreak = np.min([nextBreak + 1, 5])
            realTrajs = replace_rewards(realTrajs, episode=np.sum)
            realTrajs = normalize(realTrajs)
            curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
            # This is naive; we should probably care more if the costs are too high.
            realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.]) * 0.998 / 0.2
            curiosityWeight = 1. - realWeight
            print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))
            trajs = combine_rewards([realTrajs, curiosityTrajs],
                                    [realWeight, curiosityWeight])
            trajs = normalize(trajs)
            grad = policy_gradient(trajs, policy=curCarr)
            carrOpt.apply_gradient(grad)
            if i % 10 == 0:
                print("%d episodes in." % i)
                world.remember_agent(curCarr)
                world.render(curCarr)
        if curScore > 0.01:
            return carrOpt
        else:
            return None

    theCarOpt = None
    for i in range(50):
        print("Teaching agent %d." % i)
        theCarOpt = train_one(theCarOpt)
def run():
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    world = Gym("MountainCar-v0")
    world = StochasticPolicy(world)

    curCarr = carr()
    curCarr.load_params(np.random.randn(curCarr.n_params))

    # Trajectories gathered by earlier versions of the agent.
    oldTrajs = world.trajectories(curCarr, 800)

    def train_one(carrOpt):
        nonlocal oldTrajs
        classOpt = Adam(
            np.random.randn(classifier.n_params) * 1.,
            lr=0.5,
            memory=0.9,
        )
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        curScore = 0.
        curAccuracy = 0.
        for i in range(250):
            classifier.load_params(classOpt.get_value())
            curCarr.load_params(carrOpt.get_value())
            # Train the classifier to tell old trajectories ([1, 0]) from the
            # current agent's trajectories ([0, 1]).
            oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
            trajs = [oldTrajs[i] for i in oldTrajIdx]
            trajs += world.trajectories(curCarr, 50)
            trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
            trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
            plot_tagged_trajs(trajsForClass)
            accTrajs = accuracy(trajsForClass, model=classifier)
            print_reward(accTrajs, max_value=1.0,
                         episode=np.mean, label="Cla reward: ")
            curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
            if curAccuracy > 1. - i / 500:
                break
            grad = policy_gradient(trajsForClass, policy=classifier)
            classOpt.apply_gradient(grad)
            # Reward the car according to the classifier's output on its trajectories.
            trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
            print_reward(trajs2, max_value=1.0,
                         episode=np.max, label="Car reward: ")
            curScore = np.mean(get_rewards(trajs2, episode=np.max))
            trajs2 = replace_rewards(trajs2, episode=np.max)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=curCarr)
            carrOpt.apply_gradient(grad2)
            if i % 10 == 0:
                print("%d episodes in." % i)
                oldTrajs += world.trajectories(curCarr, 800)
                world.render(curCarr)
        if curScore > 0.11:
            return carrOpt
        else:
            return None

    theCarOpt = None
    for i in range(10):
        print("Teaching agent %d." % i)
        theCarOpt = train_one(theCarOpt)
def carr():
    carr = Input(2)
    carr = Affine(carr, 32)
    carr = LReLU(carr)
    carr = Affine(carr, 3)
    return Softmax(carr)