Example #1
def run():
    world = Gym("MountainCarContinuous-v0", max_steps=500)

    if len(sys.argv) >= 2:
        agent = build_agent()
        for fn in sys.argv[1:]:
            agent.load_params(np.load(fn))
            world.render(agent)
    else:
        curiosity(world)
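build_agent() and curiosity() are defined elsewhere in the original script and are not part of this snippet. A minimal hypothetical sketch of build_agent(), assuming the same layer builders used in the other examples (MountainCarContinuous-v0 has a 2-dimensional observation and one continuous action); the real definition may differ:

def build_agent():
    # Hypothetical sketch only: map the 2-dimensional observation
    # to a single continuous action value.
    agent = Input(2)
    agent = Affine(agent, 64)
    agent = LReLU(agent)
    agent = Affine(agent, 1)
    return agent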
Example #2
def run():
    model = Input(4)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    world = StochasticPolicy(Gym(make_env, max_steps=500))

    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)

    for _ in range(50):
        model.load_params(opt.get_value())

        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=5000)

        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)

        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

    while True:
        world.render(model)
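make_env is not shown in this snippet. Given the 4-input / 2-action softmax policy above, it presumably constructs a CartPole environment; a hedged sketch (the actual helper may configure the environment differently):

def make_env():
    # Assumption: a plain CartPole instance matching the 4-input,
    # 2-action policy network defined above.
    import gym
    return gym.make("CartPole-v1")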
Example #3
def run():
    world = StochasticPolicy(Gym("CartPole-v1"))

    model = Input(4)
    model = Affine(model, 64)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    if len(sys.argv) >= 2:
        params = np.load(sys.argv[1])
    else:
        params = train(world, model)
        np.save("__cartpole.npy", params)

    model.load_params(params)
    world.render(model)
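train(world, model) is not included in this excerpt. A sketch of what it could look like, modeled on the policy-gradient loop from Example #2; the hyperparameters here are illustrative, not the original values:

def train(world, model):
    # Illustrative sketch based on Example #2: plain policy gradient
    # with discounted, normalized rewards.
    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)
    for _ in range(50):
        model.load_params(opt.get_value())
        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=500)
        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)
    return opt.get_value()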
Example #4
def run():
    classifier = Input(7)
    classifier = Affine(classifier, 32)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    agent = walker()
    agent.load_params(np.random.randn(agent.n_params) * 1.5)

    MAX_TRAIN_TIME = 200
    trainTimeLeft = MAX_TRAIN_TIME
    curAgentId = -1
    curMemoryId = 0

    def plot_tagged_trajs(trajs):
        nonlocal trainTimeLeft, curAgentId, curMemoryId
        COLORS = ["blue", "red"]
        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_xlim([-1.25, 1.25])
        plt.gcf().axes[0].set_ylim([-1.25, 1.25])
        plt.suptitle("Episode %d of agent %d, memories: %d" %
                     (MAX_TRAIN_TIME - trainTimeLeft, curAgentId, curMemoryId))
        for traj in trajs:
            tag = traj[0][1]
            xs, ys = [], []
            for state, _, _ in traj:
                x = state[2]
                y = state[3]
                xs.append(x)
                ys.append(y)
            plt.plot(xs, ys, color=COLORS[np.argmax(tag)], alpha=0.1)
        plt.gcf().set_size_inches(10, 8)
        plt.gcf().savefig("__step_a%03d_t%03d.png" %
                          (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                          dpi=100)

    world = Gym("BipedalWalker-v2", max_steps=MAX_STEPS)
    world = ActionNoise(world, stddev=0.2)
    world = Curiosity(world,
                      classifier=classifier,
                      history_length=50,
                      for_classifier=lambda ts: change_obs_space(
                          ts, changer=interesting_part),
                      plot=plot_tagged_trajs)
    # Boredom counts down while the curiosity reward stays high; at zero the
    # current behaviour is saved and handed to the curiosity memory.
    MAX_BOREDOM = 3
    boredom = MAX_BOREDOM

    # Motivation counts down while the score stops improving (or time runs out);
    # at zero the agent is saved and, if it learned nothing, replaced.
    MAX_MOTIVATION = 3
    motivation = MAX_MOTIVATION

    agentOpt = None
    lastScores = None

    def memorize():
        nonlocal boredom, curMemoryId
        print("Memorizing %d..." % curMemoryId)
        world.remember(agent)
        boredom = MAX_BOREDOM
        curMemoryId += 1

    def save_agent():
        np.save(
            "__ranger_a%03d_t%03d.npy" %
            (curAgentId, MAX_TRAIN_TIME - trainTimeLeft), agentOpt.get_value())

    def reset_agent():
        nonlocal agentOpt, trainTimeLeft, lastScores, curAgentId, motivation
        if agentOpt is not None:
            save_agent()
        print("Resetting agent %d." % curAgentId)
        agentOpt = Adam(
            np.random.randn(agent.n_params) * 1.5,
            lr=0.05,
            memory=0.9,
        )
        trainTimeLeft = MAX_TRAIN_TIME
        lastScores = [-0.4]
        curAgentId += 1
        motivation = MAX_MOTIVATION

    reset_agent()
    while True:
        agent.load_params(agentOpt.get_value())

        realTrajs, curiosityTrajs = world.trajectories(agent, 30)
        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 300.
        lastScores.append(curScore)
        lastScores = lastScores[-10:]
        scoreDev = np.std(lastScores)
        scoreMean = np.max([np.abs(np.mean(lastScores)), 1.])

        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))

        print_reward(realTrajs,
                     max_value=300.0,
                     episode=np.sum,
                     label="Real reward:      ")
        print_reward(curiosityTrajs,
                     max_value=1.0,
                     episode=np.max,
                     label="Curiosity reward: ")
        if curCuriosity > 0.85:
            if boredom == 0:
                save_agent()
                memorize()
            else:
                boredom -= 1
        else:
            boredom = np.min([boredom + 1, MAX_BOREDOM])

        if scoreDev / scoreMean < 0.010 or trainTimeLeft < 0:
            if motivation == 0:
                print("Not really learning.")
                save_agent()
                motivation = MAX_MOTIVATION
                trainTimeLeft = MAX_TRAIN_TIME
                if curScore < 0.01:
                    memorize()
                    reset_agent()
                    continue
            else:
                motivation -= 1
        else:
            motivation = np.min([motivation + 1, MAX_MOTIVATION])

        realTrajs = discount(realTrajs, horizon=200)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
        realWeight = np.min([scoreDev / scoreMean * 10., 0.9])
        curiosityWeight = 1. - realWeight
        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=agent)
        agentOpt.apply_gradient(grad)

        trainTimeLeft -= 1
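walker(), MAX_STEPS, and the observation helpers (change_obs_space, interesting_part) come from elsewhere in the original file and are not shown here. A hypothetical sketch of walker(), assuming the same builders as the other examples (BipedalWalker exposes a 24-dimensional observation and 4 continuous action values); the real network is likely different:

def walker():
    # Hypothetical sketch only.
    agent = Input(24)
    agent = Affine(agent, 64)
    agent = LReLU(agent)
    agent = Affine(agent, 4)
    return agent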
Example #5
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        world = Gym("BipedalWalker-v2", max_steps=MAX_STEPS)
        agent = walker()
        for fn in sys.argv[1:]:
            agent.load_params(np.load(fn))
            world.render(agent)
    else:
        run()
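Usage note: run without arguments, the script calls run() and trains from scratch; run with one or more .npy files as arguments, it loads each saved parameter vector into the walker and renders it, for example one of the __ranger_a*_t*.npy files written by save_agent() in Example #4.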
Example #6
def run():
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    curCarr = carr()
    curCarr.load_params(np.random.randn(curCarr.n_params))

    world = Gym("MountainCarContinuous-v0", max_steps=500)
    world = ActionNoise(world, stddev=0.1)
    world = Curiosity(world,
                      classifier=classifier,
                      history_length=800,
                      plot=True)

    def train_one(carrOpt):
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        # Stop early once the curiosity reward stays saturated for several batches.
        nextBreak = 5
        for i in range(250):
            curCarr.load_params(carrOpt.get_value())

            realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
            curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
            print_reward(realTrajs,
                         max_value=90.0,
                         episode=np.sum,
                         label="Real reward:      ")
            print_reward(curiosityTrajs,
                         max_value=1.0,
                         episode=np.max,
                         label="Curiosity reward: ")
            curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
            if curCuriosity > 0.98:
                if nextBreak == 0:
                    break
                else:
                    nextBreak -= 1
            else:
                nextBreak = np.min([nextBreak + 1, 5])

            realTrajs = replace_rewards(realTrajs, episode=np.sum)
            realTrajs = normalize(realTrajs)
            curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
            # This is crude; we should probably care more when the costs are too high.
            realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.]) * 0.998 / 0.2
            curiosityWeight = 1. - realWeight
            print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))
            trajs = combine_rewards([realTrajs, curiosityTrajs],
                                    [realWeight, curiosityWeight])
            trajs = normalize(trajs)
            grad = policy_gradient(trajs, policy=curCarr)
            carrOpt.apply_gradient(grad)
            if i % 10 == 0:
                print("%d episodes in." % i)
        world.remember_agent(curCarr)
        world.render(curCarr)
        if curScore > 0.01:
            return carrOpt
        else:
            return None

    theCarOpt = None
    for i in range(50):
        print("Teaching agent %d." % i)
        theCarOpt = train_one(theCarOpt)
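carr() is defined elsewhere in the original script. For this continuous MountainCar setup it presumably maps the 2-dimensional observation to one action value; a hypothetical sketch (note that Example #7 reuses the name carr() for the discrete MountainCar-v0 task, where the network would instead end in a softmax over the three actions):

def carr():
    # Hypothetical sketch for the continuous-action case only.
    agent = Input(2)
    agent = Affine(agent, 32)
    agent = LReLU(agent)
    agent = Affine(agent, 1)
    return agent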
Example #7
def run():
    classifier = Input(2)
    classifier = Affine(classifier, 16)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    world = Gym("MountainCar-v0")
    world = StochasticPolicy(world)

    curCarr = carr()
    curCarr.load_params(np.random.randn(curCarr.n_params))
    oldTrajs = world.trajectories(curCarr, 800)

    def train_one(carrOpt):
        nonlocal oldTrajs
        classOpt = Adam(
            np.random.randn(classifier.n_params) * 1.,
            lr=0.5,
            memory=0.9,
        )
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        curScore = 0.
        curAccuracy = 0.
        for i in range(250):
            classifier.load_params(classOpt.get_value())
            curCarr.load_params(carrOpt.get_value())

            oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
            trajs = [oldTrajs[j] for j in oldTrajIdx]
            trajs += world.trajectories(curCarr, 50)
            trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
            trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
            plot_tagged_trajs(trajsForClass)
            accTrajs = accuracy(trajsForClass, model=classifier)
            print_reward(accTrajs,
                         max_value=1.0,
                         episode=np.mean,
                         label="Cla reward: ")
            curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
            if curAccuracy > 1. - i / 500:
                break

            grad = policy_gradient(trajsForClass, policy=classifier)
            classOpt.apply_gradient(grad)
            trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
            print_reward(trajs2,
                         max_value=1.0,
                         episode=np.max,
                         label="Car reward: ")
            curScore = np.mean(get_rewards(trajs2, episode=np.max))
            trajs2 = replace_rewards(trajs2, episode=np.max)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=curCarr)
            carrOpt.apply_gradient(grad2)
            if i % 10 == 0:
                print("%d episodes in." % i)
        oldTrajs += world.trajectories(curCarr, 800)
        world.render(curCarr)
        if curScore > 0.11:
            return carrOpt
        else:
            return None

    theCarOpt = None
    for i in range(10):
        print("Teaching agent %d." % i)
        theCarOpt = train_one(theCarOpt)
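tag_traj, plot_tagged_trajs, accuracy, and learn_from_classifier belong to the surrounding project and are not shown here. Judging from how the classifier is trained with policy_gradient, tag_traj most likely relabels every step of a trajectory with the given one-hot tag; a hedged sketch, assuming trajectories are lists of (observation, action, reward) tuples as in Example #4:

def tag_traj(traj, tag):
    # Assumption: replace each action with the class tag and give a constant
    # reward, so policy_gradient pushes the classifier towards predicting
    # the tag for observations from this trajectory.
    return [(obs, tag, 1.0) for obs, _, _ in traj]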