Example #1
0
def build_classifier():
    """Assemble the classifier network and return its final layer.

    Layer chain: Input(2) -> Affine(32) -> LReLU -> Affine(32) -> LReLU
    -> Affine(2) -> Softmax.
    """
    first_hidden = LReLU(Affine(Input(2), 32))
    second_hidden = LReLU(Affine(first_hidden, 32))
    logits = Affine(second_hidden, 2)
    return Softmax(logits)
Example #2
0
def walker():
    """Assemble the walker network and return its final layer.

    Layer chain: Input(STATE_SIZE) -> Affine(64) -> LReLU
    -> Affine(ACTION_SIZE) -> Tanh.
    """
    hidden = LReLU(Affine(Input(STATE_SIZE), 64))
    return Tanh(Affine(hidden, ACTION_SIZE))
def run():
    """Train a softmax policy with policy gradients, then render it forever.

    The policy is Input(4) -> Affine(128) -> LReLU -> Affine(2) -> Softmax.
    It is trained for 50 iterations of 16 trajectories each; rewards are
    discounted and normalized before the gradient is computed. After
    training the function never returns: it keeps rendering the policy.
    """
    policy = Softmax(Affine(LReLU(Affine(Input(4), 128)), 2))

    env = StochasticPolicy(Gym(make_env, max_steps=500))

    start_params = np.random.randn(policy.n_params) * 0.1
    optimizer = Adam(start_params, lr=0.01)

    for _ in range(50):
        # The optimizer owns the parameters; push its current value into
        # the policy before collecting experience.
        policy.load_params(optimizer.get_value())

        episodes = env.trajectories(policy, 16)
        print_reward(episodes, max_value=5000)

        shaped = normalize(discount(episodes, horizon=500))
        optimizer.apply_gradient(policy_gradient(shaped, policy=policy))

    # Intentional endless loop: show the trained policy until interrupted.
    while True:
        env.render(policy)
Example #4
0
def run():
    """Train ten "gauss centerers" one after another against a classifier.

    A shared classifier (Input(2) -> Affine(16) -> LReLU -> Affine(2)
    -> Softmax) is trained to tell observations drawn from all previously
    added gausses (tag [1, 0]) from observations drawn from the newest one
    (tag [0, 1]). The newest gauss is simultaneously trained from the
    classifier's output via `learn_from_classifier`. Each round ends once
    the mean classifier accuracy reaches 0.98. Samples are plotted live.
    """
    classifier = Softmax(Affine(LReLU(Affine(Input(2), 16)), 2))

    # The first gauss is pinned at the origin and never trained.
    origin = Constant(2)
    origin.load_params([0., 0.])
    gausses = [origin]

    plt.ion()

    def redraw(trajs):
        # First 500 trajectories are the "old" samples, the rest the "new".
        plt.clf()
        plt.grid()
        axes = plt.gcf().axes[0]
        axes.set_ylim([-1, 1])
        axes.set_xlim([-1, 1])
        for color, group in (("blue", trajs[:500]), ("red", trajs[500:])):
            xs, ys = zip(*[obs for ((obs, _, _), ) in group])
            plt.scatter(xs, ys, color=color)
        plt.pause(0.01)

    def train_one():
        centerer_opt = Adam([0., 0.], lr=0.010, memory=0.5)
        classifier_opt = Adam(
            np.random.randn(classifier.n_params) * 0.1,
            lr=0.5,
            memory=0.99,
        )
        newcomer = Constant(2)
        gausses.append(newcomer)

        accuracy_now = 0.
        while accuracy_now < 0.98:
            classifier.load_params(classifier_opt.get_value())
            newcomer.load_params(centerer_opt.get_value())

            # Single-step trajectories: (observation, tag, reward).
            old_samples = [[(gauss_observation(gausses[:-1]), [1, 0], 1.)]
                           for _ in range(500)]
            new_samples = [[(gauss_observation(gausses[-1:]), [0, 1], 1.)]
                           for _ in range(500)]
            trajs = old_samples + new_samples

            scored = accuracy(trajs, model=classifier)
            print_reward(scored, max_value=1.0)
            accuracy_now = np.mean([episode[0][2] for episode in scored])

            classifier_opt.apply_gradient(
                policy_gradient(trajs, policy=classifier))

            # Only the newcomer's own samples are used to move it.
            centerer_trajs = normalize(
                learn_from_classifier(classifier, trajs[500:], 1))
            centerer_opt.apply_gradient(
                policy_gradient(centerer_trajs, policy=newcomer))

            redraw(trajs)

    for agent_index in range(10):
        print("Teaching agent %d." % agent_index)
        train_one()
    # Keep the final plot window open.
    plt.pause(10000000000000.)
Example #5
0
def run():
    """Train a dense softmax network on the `Mnist` accuracy world.

    The network is Input(28, 28) -> Affine(128) -> LReLU -> Affine(10)
    -> Softmax. It runs 600 optimizer steps, each on 128 trajectories drawn
    from StochasticPolicy(Accuracy(Mnist())), printing the reward each step.
    """
    net = Softmax(Affine(LReLU(Affine(Input(28, 28), 128)), 10))

    train_world = StochasticPolicy(Accuracy(Mnist()))

    optimizer = Adams(np.random.randn(net.n_params), lr=0.00002, memory=0.99)

    for _ in range(600):
        net.load_params(optimizer.get_value())
        batch = train_world.trajectories(net, 128)
        print_reward(batch, max_value=1)
        optimizer.apply_gradient(policy_gradient(batch, policy=net))
Example #6
0
def run():
    """Render a CartPole policy, training it first unless weights are given.

    With a command-line argument, parameters are loaded from that file via
    `np.load`. Otherwise `train(world, model)` is called and the result is
    saved to "__cartpole.npy" before rendering.
    """
    env = StochasticPolicy(Gym("CartPole-v1"))

    policy = Softmax(Affine(LReLU(Affine(Input(4), 64)), 2))

    if len(sys.argv) < 2:
        weights = train(env, policy)
        np.save("__cartpole.npy", weights)
    else:
        weights = np.load(sys.argv[1])

    policy.load_params(weights)
    env.render(policy)
Example #7
0
def run():
    """Evaluate a convolutional network on the `Mnist` test world.

    Layer chain: Input(28, 28) -> Conv2d(size=3, channels=8) -> LReLU
    -> Maxpool(2) -> Conv2d(size=5, channels=16) -> LReLU -> Maxpool(2)
    -> Affine(128) -> LReLU -> Affine(10) -> Softmax.

    With a command-line argument, parameters are loaded from that file.
    Otherwise `train(model)` is called and its result is saved to
    "__mnist.npy". Accuracy over 5000 test trajectories is then printed.
    """
    stage = Input(28, 28)
    stage = Maxpool(LReLU(Conv2d(stage, size=3, channels=8)), size=2)
    stage = Maxpool(LReLU(Conv2d(stage, size=5, channels=16)), size=2)
    hidden = LReLU(Affine(stage, 128))
    net = Softmax(Affine(hidden, 10))

    if len(sys.argv) < 2:
        weights = train(net)
        np.save("__mnist.npy", weights)
    else:
        weights = np.load(sys.argv[1])

    net.load_params(weights)
    test_world = Mnist(test=True)
    samples = test_world.trajectories(None, 5000)
    scored = accuracy(samples, model=net, percent=True)
    print_reward(scored, max_value=100, label="Test accuracy:")
Example #8
0
def run():
    """Train three progressively richer models on the same `Bytes` world.

    The world is built from b"aabbaab" with charset b'abcd' and
    max_steps=4. The models are, in order: a constant softmax, a network
    fed by Input(4), and a network fed by Input(2, 4) wrapped in
    History(length=2).
    """
    world = Bytes(b"aabbaab", max_steps=4, charset=b'abcd')

    print("\nConstant model:\n")
    constant_model = Softmax(Constant(4))
    train(world, constant_model)

    print("\nLast character:\n")
    last_char_model = Softmax(Affine(LReLU(Affine(Input(4), 4)), 4))
    train(world, last_char_model)

    print("\nLast two characters:\n")
    windowed = Affine(LReLU(Affine(Input(2, 4), 4)), 4)
    last_two_model = Softmax(History(windowed, length=2))
    train(world, last_two_model)
Example #9
0
def run():
    """Train a byte-level LSTM model on a file and render ten samples.

    Expects the path of the training file as the first command-line
    argument; prints a usage line and returns if it is missing. The set of
    distinct byte values in the file is used as the charset, and its size
    fixes both the input and output width of the model
    (Input -> LSTM -> Affine -> Softmax).
    """
    if len(sys.argv) < 2:
        print("Usage: imitate.py <file>")
        return

    # Open in binary mode and read the bytes directly. The previous code
    # opened the file in text mode and reached through `f.buffer`, which
    # set up an unused text decoder just to bypass it.
    with open(sys.argv[1], "rb") as f:
        data = f.read()
    # Iterating over bytes yields ints, so this is a set of byte values.
    charset = set(data)
    world = Bytes(data, max_steps=100, charset=charset)
    print("Charset size: %d" % len(charset))

    model = Input(len(charset))
    model = LSTM(model)
    model = Affine(model, len(charset))
    model = Softmax(model)

    train(world, model)

    for _ in range(10):
        world.render(model)
Example #10
0
def build_oracle():
    """Assemble the oracle network and return its final layer.

    Layer chain: Input(2) -> Affine(32) -> LReLU -> Affine(32) -> LReLU
    -> Affine(2). No output nonlinearity is applied.
    """
    first_hidden = LReLU(Affine(Input(2), 32))
    second_hidden = LReLU(Affine(first_hidden, 32))
    return Affine(second_hidden, 2)
Example #11
0
def build_agent():
    """Assemble the agent network and return its final layer.

    Layer chain: Input(2) -> Affine(32) -> LReLU -> Affine(32) -> LReLU
    -> Affine(1). No output nonlinearity is applied.
    """
    first_hidden = LReLU(Affine(Input(2), 32))
    second_hidden = LReLU(Affine(first_hidden, 32))
    return Affine(second_hidden, 1)
Example #12
0
def run():
    """Train walker agents on a blend of real and curiosity rewards, forever.

    Builds a classifier (used by the `Curiosity` world wrapper) and a walker
    agent, then loops without end: collect 30 trajectories, print both
    reward kinds, update the boredom/motivation counters, and apply one
    policy-gradient step on the combined rewards. Agent parameters are saved
    to "__ranger_a*_t*.npy" files and plots to "__step_a*_t*.png" files.
    The function has no exit path other than an exception or interrupt.
    """
    classifier = Input(7)
    classifier = Affine(classifier, 32)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    agent = walker()
    agent.load_params(np.random.randn(agent.n_params) * 1.5)

    # Training-step budget per agent; trainTimeLeft counts down from it.
    MAX_TRAIN_TIME = 200
    trainTimeLeft = MAX_TRAIN_TIME
    # Starts at -1 because reset_agent() increments it before first use.
    curAgentId = -1
    curMemoryId = 0

    def plot_tagged_trajs(trajs):
        """Plot state[2] against state[3] for every trajectory and save a PNG.

        Each trajectory is colored by the argmax of the tag stored in its
        first step (index 0 -> blue, index 1 -> red).
        """
        # These names are only read here; nonlocal is not strictly needed.
        nonlocal trainTimeLeft, curAgentId, curMemoryId
        COLORS = ["blue", "red"]
        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_xlim([-1.25, 1.25])
        plt.gcf().axes[0].set_ylim([-1.25, 1.25])
        plt.suptitle("Episode %d of agent %d, memories: %d" %
                     (MAX_TRAIN_TIME - trainTimeLeft, curAgentId, curMemoryId))
        for traj in trajs:
            # The second element of the first step carries the tag.
            tag = traj[0][1]
            xs, ys = [], []
            for state, _, _ in traj:
                x = state[2]
                y = state[3]
                xs.append(x)
                ys.append(y)
            plt.plot(xs, ys, color=COLORS[np.argmax(tag)], alpha=0.1)
        plt.gcf().set_size_inches(10, 8)
        plt.gcf().savefig("__step_a%03d_t%03d.png" %
                          (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                          dpi=100)

    world = Gym("BipedalWalker-v2", max_steps=MAX_STEPS)
    world = ActionNoise(world, stddev=0.2)
    world = Curiosity(world,
                      classifier=classifier,
                      history_length=50,
                      for_classifier=lambda ts: change_obs_space(
                          ts, changer=interesting_part),
                      plot=plot_tagged_trajs)
    # boredom counts down while curiosity reward stays high; at 0 the agent
    # is memorized.
    MAX_BOREDOM = 3
    boredom = MAX_BOREDOM

    # motivation counts down while the score is stagnant or the training
    # budget has run out.
    MAX_MOTIVATION = 3
    motivation = MAX_MOTIVATION

    # Both are initialized by reset_agent() below.
    agentOpt = None
    lastScores = None

    def memorize():
        """Have the world remember the current agent and reset boredom."""
        nonlocal boredom, curMemoryId
        print("Memorizing %d..." % curMemoryId)
        world.remember(agent)
        boredom = MAX_BOREDOM
        curMemoryId += 1

    def save_agent():
        """Save the optimizer's current parameter value to an .npy file."""
        np.save(
            "__ranger_a%03d_t%03d.npy" %
            (curAgentId, MAX_TRAIN_TIME - trainTimeLeft), agentOpt.get_value())

    def reset_agent():
        """Save the current agent (if any) and start a fresh optimizer."""
        nonlocal agentOpt, trainTimeLeft, lastScores, curAgentId, motivation
        if agentOpt is not None:
            save_agent()
        # Printed before the increment below, so this shows the id of the
        # agent being replaced (-1 on the very first call).
        print("Resetting agent %d." % curAgentId)
        agentOpt = Adam(
            np.random.randn(agent.n_params) * 1.5,
            lr=0.05,
            memory=0.9,
        )
        trainTimeLeft = MAX_TRAIN_TIME
        # Seed the score history with a single placeholder value.
        lastScores = [-0.4]
        curAgentId += 1
        motivation = MAX_MOTIVATION

    reset_agent()
    while True:
        agent.load_params(agentOpt.get_value())

        realTrajs, curiosityTrajs = world.trajectories(agent, 30)
        # Mean per-episode reward sum, scaled by 1/300.
        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 300.
        lastScores.append(curScore)
        # Keep only the ten most recent scores.
        lastScores = lastScores[-10:]
        scoreDev = np.std(lastScores)
        # Floor the denominator at 1 so the ratio below cannot blow up.
        scoreMean = np.max([np.abs(np.mean(lastScores)), 1.])

        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))

        print_reward(realTrajs,
                     max_value=300.0,
                     episode=np.sum,
                     label="Real reward:      ")
        print_reward(curiosityTrajs,
                     max_value=1.0,
                     episode=np.max,
                     label="Curiosity reward: ")
        # High curiosity reward lowers boredom; once it hits 0 the agent is
        # saved and memorized. Otherwise boredom recovers, capped at the max.
        if curCuriosity > 0.85:
            if boredom == 0:
                save_agent()
                memorize()
            else:
                boredom -= 1
        else:
            boredom = np.min([boredom + 1, MAX_BOREDOM])

        # Stagnation check: relative score deviation below 1%, or the
        # training budget is used up.
        if scoreDev / scoreMean < 0.010 or trainTimeLeft < 0:
            if motivation == 0:
                print("Not really learning.")
                save_agent()
                motivation = MAX_MOTIVATION
                trainTimeLeft = MAX_TRAIN_TIME
                # Only a low-scoring agent is memorized and replaced; the
                # gradient step for this iteration is then skipped.
                if curScore < 0.01:
                    memorize()
                    reset_agent()
                    continue
            else:
                motivation -= 1
        else:
            motivation = np.min([motivation + 1, MAX_MOTIVATION])

        realTrajs = discount(realTrajs, horizon=200)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
        # Real reward weight grows with relative score deviation, capped at
        # 0.9; the curiosity reward gets the remainder.
        realWeight = np.min([scoreDev / scoreMean * 10., 0.9])
        curiosityWeight = 1. - realWeight
        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=agent)
        agentOpt.apply_gradient(grad)

        trainTimeLeft -= 1
Example #13
0
def carr():
    """Assemble the car network and return its final layer.

    Layer chain: Input(2) -> Affine(32) -> LReLU -> Affine(1).
    """
    hidden = LReLU(Affine(Input(2), 32))
    return Affine(hidden, 1)
Example #14
0
def run():
    """Train mountain-car agents on a blend of real and curiosity rewards.

    Wraps "MountainCarContinuous-v0" in `ActionNoise` and `Curiosity`
    (which uses the classifier built here), then runs 50 training rounds.
    A round's optimizer is carried over to the next round only when the
    agent finished with a real score above 0.01; otherwise the next round
    starts from a fresh optimizer.
    """
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    curCarr = carr()
    curCarr.load_params(np.random.randn(curCarr.n_params))

    world = Gym("MountainCarContinuous-v0", max_steps=500)
    world = ActionNoise(world, stddev=0.1)
    world = Curiosity(world,
                      classifier=classifier,
                      history_length=800,
                      plot=True)

    def train_one(carrOpt):
        """Run one training round of up to 250 steps.

        Args:
            carrOpt: optimizer to continue from, or None to start from a
                freshly initialized `Adam`.

        Returns:
            The optimizer if the last real score exceeded 0.01, else None.
        """
        # Identity comparison: `== None` would dispatch to the optimizer's
        # __eq__, which is not what is meant here.
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        # Countdown of high-curiosity steps remaining before stopping early.
        nextBreak = 5
        for i in range(250):
            curCarr.load_params(carrOpt.get_value())

            realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
            # Mean per-episode reward sum, scaled by 1/90.
            curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
            print_reward(realTrajs,
                         max_value=90.0,
                         episode=np.sum,
                         label="Real reward:      ")
            print_reward(curiosityTrajs,
                         max_value=1.0,
                         episode=np.max,
                         label="Curiosity reward: ")
            curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
            # Stop once curiosity has been above 0.98 often enough to run
            # the countdown to zero; low-curiosity steps refill it (max 5).
            if curCuriosity > 0.98:
                if nextBreak == 0:
                    break
                else:
                    nextBreak -= 1
            else:
                nextBreak = np.min([nextBreak + 1, 5])

            realTrajs = replace_rewards(realTrajs, episode=np.sum)
            realTrajs = normalize(realTrajs)
            curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
            # this is stupid, we should care more(?) if the costs are too high
            # Clamp curScore to [0, 0.2] and map it linearly onto
            # [0.001, 0.999].
            realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.
                                         ]) * 0.998 / 0.2
            curiosityWeight = 1. - realWeight
            print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))
            trajs = combine_rewards([realTrajs, curiosityTrajs],
                                    [realWeight, curiosityWeight])
            trajs = normalize(trajs)
            grad = policy_gradient(trajs, policy=curCarr)
            carrOpt.apply_gradient(grad)
            if i % 10 == 0:
                print("%d episodes in." % i)
        world.remember_agent(curCarr)
        world.render(curCarr)
        if curScore > 0.01:
            return carrOpt
        else:
            return None

    theCarOpt = None
    for i in range(50):
        print("Teaching agent %d." % i)
        theCarOpt = train_one(theCarOpt)
Example #15
0
def run():
    """Train mountain-car agents to differ from previously seen trajectories.

    A classifier is trained to tell trajectories sampled from a growing
    pool of old trajectories (tag [1, 0]) from fresh trajectories of the
    current agent (tag [0, 1]); the agent is trained from the classifier's
    output via `learn_from_classifier`. Ten rounds are run. A round's agent
    optimizer is carried over only when its last score exceeded 0.11.
    """
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    world = Gym("MountainCar-v0")
    world = StochasticPolicy(world)

    curCarr = carr()
    curCarr.load_params(np.random.randn(curCarr.n_params))
    # Initial pool of "old" trajectories, from the randomly initialized agent.
    oldTrajs = world.trajectories(curCarr, 800)

    def train_one(carrOpt):
        """Run one training round of up to 250 steps.

        Args:
            carrOpt: agent optimizer to continue from, or None to start
                from a freshly initialized `Adam`.

        Returns:
            The agent optimizer if the last score exceeded 0.11, else None.
        """
        nonlocal oldTrajs
        # The classifier always restarts from a fresh optimizer.
        classOpt = Adam(
            np.random.randn(classifier.n_params) * 1.,
            lr=0.5,
            memory=0.9,
        )
        # Identity comparison: `== None` would dispatch to the optimizer's
        # __eq__, which is not what is meant here.
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        curScore = 0.
        curAccuracy = 0.
        for i in range(250):
            classifier.load_params(classOpt.get_value())
            curCarr.load_params(carrOpt.get_value())

            # 50 old trajectories (sampled with replacement) followed by
            # 50 fresh ones from the current agent.
            oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
            # `idx` rather than `i`, so the step counter is not shadowed.
            trajs = [oldTrajs[idx] for idx in oldTrajIdx]
            trajs += world.trajectories(curCarr, 50)
            trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
            trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
            plot_tagged_trajs(trajsForClass)
            accTrajs = accuracy(trajsForClass, model=classifier)
            print_reward(accTrajs,
                         max_value=1.0,
                         episode=np.mean,
                         label="Cla reward: ")
            curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
            # The accuracy needed to stop falls linearly with the step
            # count, from 1.0 at step 0.
            if curAccuracy > 1. - i / 500:
                break

            grad = policy_gradient(trajsForClass, policy=classifier)
            classOpt.apply_gradient(grad)
            # Only the fresh trajectories are used to train the agent.
            trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
            print_reward(trajs2,
                         max_value=1.0,
                         episode=np.max,
                         label="Car reward: ")
            curScore = np.mean(get_rewards(trajs2, episode=np.max))
            trajs2 = replace_rewards(trajs2, episode=np.max)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=curCarr)
            carrOpt.apply_gradient(grad2)
            if i % 10 == 0:
                print("%d episodes in." % i)
        # Add the trained agent's behavior to the pool for later rounds.
        oldTrajs += world.trajectories(curCarr, 800)
        world.render(curCarr)
        if curScore > 0.11:
            return carrOpt
        else:
            return None

    theCarOpt = None
    for i in range(10):
        print("Teaching agent %d." % i)
        theCarOpt = train_one(theCarOpt)
Example #16
0
def carr():
    """Assemble the car policy network and return its final layer.

    Layer chain: Input(2) -> Affine(32) -> LReLU -> Affine(3) -> Softmax.
    """
    hidden = LReLU(Affine(Input(2), 32))
    logits = Affine(hidden, 3)
    return Softmax(logits)