Example #1
0
def curiosity(world):
    """Jointly train an agent and a forward-model oracle.

    The agent is rewarded for visiting states the oracle predicts badly
    (curiosity), mixed 50/50 with the environment's own reward.  The oracle
    is trained on stacked observation triples drawn from a replay cache.
    """
    world = ActionNoise(world, stddev=0.2)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1,
                      lr=0.05,
                      memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)

        # Stack three consecutive observations so the oracle sees a short
        # history window; keep the first step's action and reward.
        for_oracle = []
        for traj in agent_trajs:
            stacked = []
            for (o1, a1, r1), (o2, _, _), (o3, _, _) in zip(
                    traj, traj[1:], traj[2:]):
                stacked.append((np.asarray([o1, o2, o3]).flatten(), a1, r1))
            for_oracle.append(stacked)
        memory.add_trajectories(for_oracle)

        predictions = retrace(for_oracle, model=oracle)
        save_plot(log_dir + "/%04d.png" % (episode + 1), agent_trajs,
                  predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        # Curiosity reward: log of the oracle's squared error on the
        # observation delta 10 steps ahead.
        curiosity_trajs = []
        for traj, preds in zip(agent_trajs, predictions):
            scored = []
            for (o1, a1, _), (o2, _, _), delta_p in zip(
                    traj, traj[10:], preds):
                surprise = np.log(np.mean(np.square((o2 - o1) - delta_p)))
                scored.append((o1, a1, surprise))
            curiosity_trajs.append(scored)
        print_reward(curiosity_trajs, max_value=5000.0)
        print_reward(agent_trajs, max_value=90.0, episode=np.sum)

        curiosity_trajs = normalize(discount(curiosity_trajs, horizon=500))
        agent_trajs = normalize(discount(agent_trajs, horizon=500))
        # Trim the tail so lengths match the 10-step-lookahead curiosity trajs.
        agent_trajs = [traj[:-10] for traj in agent_trajs]
        agent_weight = 0.5
        curiosity_weight = 1. - agent_weight
        comb_trajs = combine_rewards([curiosity_trajs, agent_trajs],
                                     [curiosity_weight, agent_weight])
        agent_opt.apply_gradient(policy_gradient(comb_trajs, policy=agent))

        # Oracle learns to predict the first two components of the
        # 10-step observation delta from replayed trajectories.
        oracle_trajs = []
        for traj in memory.trajectories(None, 4):
            oracle_trajs.append(
                [(o1, (o2 - o1)[:2], 1.0)
                 for (o1, _, _), (o2, _, _) in zip(traj, traj[10:])])
        oracle_opt.apply_gradient(policy_gradient(oracle_trajs, policy=oracle))
def run():
    """Train a small softmax policy by vanilla policy gradient, then render.

    Builds a 4-input / 2-action network, runs 50 optimization rounds of
    16 trajectories each, and finally renders the trained policy forever.
    """
    model = Softmax(Affine(LReLU(Affine(Input(4), 128)), 2))

    world = StochasticPolicy(Gym(make_env, max_steps=500))

    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)

    for _ in range(50):
        model.load_params(opt.get_value())

        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=5000)

        trajs = normalize(discount(trajs, horizon=500))

        opt.apply_gradient(policy_gradient(trajs, policy=model))

    # Show off the trained policy indefinitely.
    while True:
        world.render(model)
Example #3
0
    def train_one(carrOpt):
        nonlocal oldTrajs
        classOpt = Adam(
            np.random.randn(classifier.n_params) * 1.,
            lr=0.5,
            memory=0.9,
        )
        if carrOpt == None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        curScore = 0.
        curAccuracy = 0.
        for i in range(250):
            classifier.load_params(classOpt.get_value())
            curCarr.load_params(carrOpt.get_value())

            oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
            trajs = [oldTrajs[i] for i in oldTrajIdx]
            trajs += world.trajectories(curCarr, 50)
            trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
            trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
            plot_tagged_trajs(trajsForClass)
            accTrajs = accuracy(trajsForClass, model=classifier)
            print_reward(accTrajs,
                         max_value=1.0,
                         episode=np.mean,
                         label="Cla reward: ")
            curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
            if curAccuracy > 1. - i / 500:
                break

            grad = policy_gradient(trajsForClass, policy=classifier)
            classOpt.apply_gradient(grad)
            trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
            print_reward(trajs2,
                         max_value=1.0,
                         episode=np.max,
                         label="Car reward: ")
            curScore = np.mean(get_rewards(trajs2, episode=np.max))
            trajs2 = replace_rewards(trajs2, episode=np.max)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=curCarr)
            carrOpt.apply_gradient(grad2)
            if i % 10 == 0:
                print("%d episodes in." % i)
        oldTrajs += world.trajectories(curCarr, 800)
        world.render(curCarr)
        if curScore > 0.11:
            return carrOpt
        else:
            return None
Example #4
0
    def train_one(carrOpt):
        """Train the carrier on a weighted mix of real and curiosity rewards.

        The loop runs up to 250 iterations, breaking early only after the
        curiosity reward has stayed saturated (> 0.98) for several
        consecutive iterations (hysteresis via ``nextBreak``).

        Returns the carrier optimizer if the final real score exceeds 0.01,
        otherwise None (caller should restart with fresh parameters).

        NOTE(review): relies on enclosing-scope names (world, curCarr, ...)
        that are not visible in this chunk.
        """
        # Reuse the carrier optimizer across rounds when one is supplied.
        if carrOpt is None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        nextBreak = 5
        for i in range(250):
            curCarr.load_params(carrOpt.get_value())

            realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
            curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
            print_reward(realTrajs,
                         max_value=90.0,
                         episode=np.sum,
                         label="Real reward:      ")
            print_reward(curiosityTrajs,
                         max_value=1.0,
                         episode=np.max,
                         label="Curiosity reward: ")
            curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
            # Hysteresis: count down while curiosity is saturated, recover
            # (up to 5) while it is not; break only at zero.
            if curCuriosity > 0.98:
                if nextBreak == 0:
                    break
                else:
                    nextBreak -= 1
            else:
                nextBreak = np.min([nextBreak + 1, 5])

            realTrajs = replace_rewards(realTrajs, episode=np.sum)
            realTrajs = normalize(realTrajs)
            curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
            # Real-reward weight ramps from 0.001 to 0.999 as curScore goes
            # from 0 to 0.2, then saturates.
            # TODO(review): original author noted costs should perhaps
            # matter more when they are high.
            realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.
                                         ]) * 0.998 / 0.2
            curiosityWeight = 1. - realWeight
            print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))
            trajs = combine_rewards([realTrajs, curiosityTrajs],
                                    [realWeight, curiosityWeight])
            trajs = normalize(trajs)
            grad = policy_gradient(trajs, policy=curCarr)
            carrOpt.apply_gradient(grad)
            if i % 10 == 0:
                print("%d episodes in." % i)
        world.remember_agent(curCarr)
        world.render(curCarr)
        if curScore > 0.01:
            return carrOpt
        else:
            return None
Example #5
0
def train(world, model):
    """Fit *model* to the world's trajectories by policy-gradient ascent.

    Runs 20 rounds of 100 trajectories each and prints the model's
    cross-entropy ("surprise") on the sampled data after every update.
    """
    opt = Adam(np.random.randn(model.n_params), lr=0.3, memory=0.9)

    for _ in range(20):
        model.load_params(opt.get_value())
        trajs = world.trajectories(None, 100)
        opt.apply_gradient(policy_gradient(trajs, policy=model))

        # Report how surprised the model is by the data it just saw.
        scored = cross_entropy(trajs, model=model)
        print_reward(scored,
                     episode=np.mean,
                     label="Surprise/byte:",
                     max_value=8.0)
def curiosity(world):
    """Curiosity-driven training: the agent maximizes oracle prediction error.

    Raw agent trajectories are cached for oracle training; the agent's
    reward is the log squared error of the oracle's 10-step-ahead
    observation-delta prediction.
    """
    world = ActionNoise(world, stddev=0.1)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1,
                      lr=0.05,
                      memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)
        memory.add_trajectories(agent_trajs)

        predictions = retrace(agent_trajs, model=oracle)
        save_plot(log_dir + "/%04d.png" % (episode + 1), agent_trajs,
                  predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        # Curiosity reward per step: log squared error of the oracle's
        # prediction of the 10-step observation delta.
        rewarded = []
        for traj, preds in zip(agent_trajs, predictions):
            steps = []
            for (o1, a1, _), (o2, _, _), delta_p in zip(
                    traj, traj[10:], preds):
                err = np.log(np.mean(np.square((o2 - o1) - delta_p)))
                steps.append((o1, a1, err))
            rewarded.append(steps)
        agent_trajs = replace_rewards(
            rewarded, episode=lambda rs: np.max(rs) / len(rs))
        print_reward(agent_trajs, max_value=10.0)

        agent_trajs = normalize(agent_trajs)
        agent_opt.apply_gradient(policy_gradient(agent_trajs, policy=agent))

        # Oracle learns to predict the full 10-step observation delta from
        # replayed trajectories.
        oracle_trajs = [
            [(o1, o2 - o1, 1.0)
             for (o1, _, _), (o2, _, _) in zip(traj, traj[10:])]
            for traj in memory.trajectories(None, 4)
        ]
        oracle_opt.apply_gradient(policy_gradient(oracle_trajs, policy=oracle))
Example #7
0
def train(model):
    """Train *model* on MNIST with parameter-noise policy gradients.

    Evaluates the policy at a slightly perturbed parameter point each
    step, and prints training accuracy every 20 steps.  Returns the final
    optimizer parameter vector.
    """
    world = Mnist()

    opt = Adam(np.random.randn(model.n_params), lr=0.1)

    for step in range(600):
        # Perturb the current parameters slightly before each rollout.
        noise = np.random.randn(model.n_params) * 0.01
        model.load_params(opt.get_value() + noise)

        batch = world.trajectories(None, 256)
        opt.apply_gradient(policy_gradient(batch, policy=model))

        # Every 20th step, measure accuracy on a larger sample.
        if step % 20 == 19:
            print("%4d) " % (step + 1), flush=True, end="")
            sample = world.trajectories(None, 2000)
            sample = accuracy(sample, model=model, percent=True)
            print_reward(sample, max_value=100, label="Train accuracy:")

    return opt.get_value()
    def train_one():
        """Add one Gaussian centerer and train it against a classifier.

        The classifier learns to separate samples of the previously added
        gausses from the newest one; the new gaussian's center is moved to
        fool the classifier.  Loops until the classifier reaches 98%
        accuracy.

        NOTE(review): relies on enclosing-scope names (classifier, gausses,
        gauss_observation, learn_from_classifier, Adam, Constant, plt, ...)
        not visible in this chunk.
        """
        # Optimizer for the 2-D gaussian center, starting at the origin.
        gaussOpt = Adam(
            [0., 0.],
            lr=0.010,
            memory=0.5,
        )
        # Fresh classifier parameters each round.
        classOpt = Adam(np.random.randn(classifier.n_params) * 0.1,
                        lr=0.5,
                        memory=0.99)
        gaussCenterer = Constant(2)
        gausses.append(gaussCenterer)
        curAccuracy = 0.
        while curAccuracy < 0.98:
            classifier.load_params(classOpt.get_value())
            gaussCenterer.load_params(gaussOpt.get_value())

            # 500 one-step "trajectories" sampled from the old gausses
            # (label [1, 0]) and 500 from the newest one (label [0, 1]).
            trajs = [[(gauss_observation(gausses[:-1]), [1, 0], 1.)]
                     for _ in range(500)]
            trajs += [[(gauss_observation(gausses[-1:]), [0, 1], 1.)]
                      for _ in range(500)]
            accTrajs = accuracy(trajs, model=classifier)
            print_reward(accTrajs, max_value=1.0)
            # Mean per-sample accuracy drives the stopping condition.
            accs = [traj[0][2] for traj in accTrajs]
            curAccuracy = np.mean(accs)

            # Classifier update.
            grad = policy_gradient(trajs, policy=classifier)
            classOpt.apply_gradient(grad)
            # Move the gaussian toward samples the classifier mislabels.
            trajs2 = learn_from_classifier(classifier, trajs[500:], 1)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=gaussCenterer)
            gaussOpt.apply_gradient(grad2)
            # Scatter-plot both sample sets (old=blue, new=red).
            plt.clf()
            plt.grid()
            plt.gcf().axes[0].set_ylim([-1, 1])
            plt.gcf().axes[0].set_xlim([-1, 1])
            x, y = zip(*[o for ((o, _, _), ) in trajs[:500]])
            plt.scatter(x, y, color="blue")
            x, y = zip(*[o for ((o, _, _), ) in trajs[500:]])
            plt.scatter(x, y, color="red")
            plt.pause(0.01)