Example #1
0
def generate_data(num_blocks, base_name):
    """Sample a random block-restacking episode and serialize it.

    Draws random start and goal configurations, solves the instance with a
    Restacker while a DataDump records every frame, then writes episode
    metadata to <base_name>/meta.pkl and one tensor file per command to
    <base_name>/NNN.pt.  Returns the final symbolic reward (0 = success).
    """

    # random start and goal stackings
    thing_below = random_thing_below(num_blocks, max_levels=3)
    goal_thing_below = random_thing_below(num_blocks, max_levels=3)

    # the dump hook records a frame on every control step
    dump = DataDump(goal_thing_below, hook_period=1)
    env = BlocksWorldEnv(pb.POSITION_CONTROL,
                         show=False,
                         control_period=12,
                         step_hook=dump.step_hook)
    env.load_blocks(thing_below)

    # solve the instance
    restacker = Restacker(env, goal_thing_below, dump)
    restacker.run()

    # episode-level metadata
    reward = compute_symbolic_reward(env, goal_thing_below)
    final_thing_below = env.thing_below
    commands = [frame["command"] for frame in dump.data]
    meta = (thing_below, goal_thing_below, final_thing_below, reward, commands)
    with open("%s/meta.pkl" % base_name, "wb") as f:
        pk.dump(meta, f)

    env.close()

    # one tensor file per recorded command
    for index, frame in enumerate(dump.data):
        _, (thing, block) = frame["command"]
        position, action, rgba, coords_of, _ = zip(*frame["records"])

        position = tr.tensor(np.stack(position)).float()
        action = tr.tensor(np.stack(action)).float()
        rgba = tr.tensor(np.stack(rgba))
        block_coords = tr.tensor(
            np.stack([co[block] for co in coords_of])).float()
        thing_coords = tr.tensor(
            np.stack([co[thing] for co in coords_of])).float()

        # preprocessing
        rgb, block_coords, thing_coords = preprocess(
            rgba, block_coords, thing_coords)

        tr.save((position, action, rgb, block_coords, thing_coords),
                "%s/%03d.pt" % (base_name, index))

    print(" success=%s (start, end, goal)" % (reward == 0))
    print("  ", thing_below)
    print("  ", env.thing_below)
    print("  ", goal_thing_below)
    return reward
def run_machine(machine, goal_thing_below, reset_dict):
    """Run *machine* toward a goal configuration and measure the run.

    Encodes the inverted goal into the machine's memory, resets it with
    reset_dict, runs it to completion, and scores the resulting environment.

    Returns (ticks, running_time, sym_reward, spa_reward).
    """

    # the machine's memory encoding wants thing-above with "nil" for empty
    goal_thing_above = machine.env.invert(goal_thing_below)
    for block, above in goal_thing_above.items():
        if above == "none":
            goal_thing_above[block] = "nil"

    # time the memorize/reset/run sequence end to end
    start = time.perf_counter()
    memorize_env(machine, goal_thing_above)
    machine.reset(reset_dict)
    ticks = machine.run()
    running_time = time.perf_counter() - start

    sym_reward = compute_symbolic_reward(machine.env, goal_thing_below)
    spa_reward = compute_spatial_reward(machine.env, goal_thing_below)

    return ticks, running_time, sym_reward, spa_reward
Example #3
0
def run_episode(env,
                thing_below,
                goal_thing_below,
                nvm,
                init_regs,
                init_conns,
                sigma=0):
    """Run one nvm-controlled restacking episode.

    Reloads thing_below into env, encodes the inverted goal into the nvm,
    then ticks the machine; whenever the decoded "tar" register changes, a
    joint target is read from "jnt" (perturbed by Gaussian noise of scale
    sigma when sigma > 0) and executed in the environment.

    Returns (reward, log_prob): the combined symbolic/spatial reward and the
    accumulated log-likelihood of the sampled joint targets.
    """

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # Fix: goal_thing_above was referenced below but never defined in this
    # scope (NameError on every call).  Derive it from goal_thing_below the
    # same way run_machine does, padding empty slots with "nil" for the nvm
    # encoding.
    goal_thing_above = env.invert(goal_thing_below)
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0  # accumulate over episode

    dbg = False
    if dbg: nvm.dbg()
    target_changed = True  # True so the first tick issues an initial motion
    while True:
        done = nvm.tick()
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print("     tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content  # nvm's mean joint target
            if sigma > 0:
                # exploration: white Gaussian noise around the nvm's target
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs = dist.log_prob(position)
                log_prob += log_probs.sum()  # multivariate white noise
            else:
                position = mu
            nvm.env.goto_position(position.detach().numpy())
        tar = nvm.registers["tar"]
        target_changed = (tar.decode(tar.content) != tar.decode(
            tar.old_content))
        if done: break

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    reward = calc_reward(sym_reward, spa_reward)

    return reward, log_prob
Example #4
0
def rvm_baseline(env, thing_below, goal_thing_above, rvm):
    """Run the rvm once from thing_below toward the goal and score it.

    Returns (running_time, reward) where reward combines symbolic and
    spatial reward via calc_reward.

    NOTE(review): `goal_thing_below`, used for the reward computations
    below, is NOT defined in this function — the parameter is
    `goal_thing_above`.  Unless a module-level `goal_thing_below` exists,
    this raises NameError.  Confirm against the caller and either pass
    goal_thing_below in or derive it here.
    """

    start = time.perf_counter()

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # reset rvm, input new env, mount main program
    rvm.env = env
    memorize_env(rvm, goal_thing_above)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")

    # run
    ticks = rvm.run()
    running_time = time.perf_counter() - start

    sym_reward = compute_symbolic_reward(env, goal_thing_below)
    spa_reward = compute_spatial_reward(env, goal_thing_below)
    reward = calc_reward(sym_reward, spa_reward)

    return running_time, reward
 def step_hook(self, env, action):
     """Per-step hook: log the movement penalty and current symbolic reward."""
     penalty = env.movement_penalty()
     self.mp.append(penalty)
     reward = compute_symbolic_reward(env, self.goal_thing_below)
     self.sym.append(reward)
def run_episodes(problem, nvm, W_init, v_init, num_time_steps, num_episodes,
                 penalty_tracker, sigma):
    """REINFORCE-style batch update for one problem instance.

    Runs the nvm network once for num_time_steps, samples num_episodes joint
    trajectories around its "jnt" output (episode 0 is noiseless), simulates
    each trajectory to collect per-motion movement penalties plus a terminal
    symbolic reward, then backpropagates a policy-gradient loss against a
    rewards-to-go baseline (noiseless episode excluded from the baseline).

    Mutates W_init in place (refreshes trainable connection weights) and
    calls loss.backward().  Returns (sym, rewards_to_go, baseline): the
    per-episode symbolic rewards, per-episode reward-to-go tensors, and the
    de-biased baseline at the first motion.
    """

    memorize_problem(nvm, problem)
    # refresh the trainable connection weights inside the initial weight dict
    for name in ["obj", "loc", "goal"]:
        W_init[name][0] = nvm.connections[name].W.unsqueeze(dim=0)

    perf_counter = time.perf_counter()
    # W, v = nvm.net.run(W_init, v_init, num_time_steps)
    nvm.net.clear_ticks()
    for t in range(num_time_steps):
        nvm.net.tick(W_init, v_init)
        # nvm.pullback(t)
        # nvm.dbg()
        # input('.')
    W, v = nvm.net.weights, nvm.net.activities
    print("    NVM run took %fs (%d timesteps)" %
          (time.perf_counter() - perf_counter, num_time_steps))

    perf_counter = time.perf_counter()
    # per-episode lists of sampled joint positions and their log-probs
    positions, log_probs = tuple({b: list()
                                  for b in range(num_episodes)}
                                 for _ in [0, 1])
    tar = nvm.registers["tar"]
    for t in range(2, num_time_steps):
        # a motion is issued whenever the decoded target register changed
        if nvm.decode("tar", t - 2) != nvm.decode("tar", t - 1):
            # assumes activities are shaped (1, dim, 1) — TODO confirm
            mu = v["jnt"][t][0, :, 0]
            dist = tr.distributions.normal.Normal(mu, sigma)
            for b in range(num_episodes):
                position = dist.sample(
                ) if b > 0 else mu  # first episode noiseless
                positions[b].append(position)
                log_probs[b].append(
                    dist.log_prob(position).sum())  # multivariate white noise
    # debugging aid: dump machine state for any episode whose log-prob is NaN
    for b in range(num_episodes):
        if any([tr.isnan(lp) for lp in log_probs[b]]):
            print(" " * 6, log_probs[b])
            for t in range(2, num_time_steps):
                if nvm.decode("tar", t - 2, b) != nvm.decode("tar", t - 1, b):
                    nvm.pullback(t, b)
                    nvm.dbg()
            #         input('.')
            # input('.')
    print("    log probs took %fs (%d motions)" %
          (time.perf_counter() - perf_counter, len(positions[0])))

    perf_counter = time.perf_counter()
    # env = BlocksWorldEnv(show=False, step_hook=penalty_tracker.step_hook)
    env = nvm.env
    rewards, sym = [], []
    for b in range(num_episodes):
        rewards.append([])
        env.reset()
        env.load_blocks(problem.thing_below)
        for position in positions[b]:
            penalty_tracker.reset()
            env.goto_position(position.detach().numpy(), speed=1.5)
            # each motion's reward is its negated movement penalty
            rewards[b].append(-penalty_tracker.penalty)
        sym_reward = compute_symbolic_reward(env, problem.goal_thing_below)
        # spa_reward = compute_spatial_reward(env, problem.goal_thing_below)
        # end_reward = calc_reward(sym_reward, spa_reward)
        # rewards[b][-1] += end_reward
        rewards[b][-1] += sym_reward
        sym.append(sym_reward)
        # env.reset()
    # env.close()
    print("    simulation rewards took %fs" %
          (time.perf_counter() - perf_counter))

    perf_counter = time.perf_counter()
    rewards_to_go = []
    for b in range(num_episodes):
        rewards[b] = tr.tensor(rewards[b]).float()
        # reward-to-go at step i = sum of rewards from step i to the end
        rtg = tr.cumsum(rewards[b], dim=0)
        rtg = rtg[-1] - rtg + rewards[b]
        rewards_to_go.append(rtg)
    baselines = tr.stack(rewards_to_go[1:]).mean(dim=0)  # exclude noiseless
    baselines *= (num_episodes - 1) / (num_episodes - 2)  # de-bias
    baseline = baselines[0]
    # policy-gradient loss averaged over the noisy episodes and motions
    loss = tr.sum(
        tr.stack([
            -((rewards_to_go[b] - baselines) * tr.stack(log_probs[b])).sum() /
            (num_episodes - 1) / len(positions[0])
            for b in range(1, num_episodes)
        ]))
    # loss = tr.tensor(0.)
    # for b in range(1,num_episodes): # exclude noiseless
    #     loss -= ((rewards_to_go[b] - baselines) * tr.stack(log_probs[b])).sum() / (num_episodes - 1) / len(positions[0])
    #     # loss = - ((rewards_to_go[b] - baselines) * tr.stack(log_probs[b])).sum() / (num_episodes - 1) / len(positions[0])
    #     # loss.backward(retain_graph=(b+1 < len(rewards)))
    loss.backward()
    print("    backprop took %fs" % (time.perf_counter() - perf_counter))

    return sym, rewards_to_go, baseline
                             σ=nv.default_activator,
                             detach_gates=detach_gates)
            nvm.mount("main")
            memorize_problem(nvm, problem)
            for name in ["obj", "loc", "goal"]:
                W_init[name][0] = nvm.connections[name].W.unsqueeze(dim=0)

            while True:
                done = rvm.tick()
                if rvm.registers["jnt"].content != rvm.registers[
                        "jnt"].old_content:
                    position = rvm.ik[rvm.registers["jnt"].content]
                    env.goto_position(position, speed=1.5)
                if done: break
            num_time_steps = rvm.tick_counter
            rvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
            rvm_mps, rvm_joints, rvm_grips = tracker.penalties[
                10:], tracker.joints, tracker.grips
            tracker.penalties, tracker.joints, tracker.grips = [], [], []
            print(rvm_sym, sum(rvm_mps))
            if showpb: input('...')

            env.reset()
            env.load_blocks(problem.thing_below)
            nvm.net.clear_ticks()
            for t in range(num_time_steps):
                nvm.net.tick(W_init, v_init)
                if t > 1 and nvm.decode("tar", t - 2) != nvm.decode(
                        "tar", t - 1):
                    position = nvm.net.activities["jnt"][t][0, :, 0]
                    env.goto_position(position.detach().numpy(), speed=1.5)
                 positions[b][e].append(position)
     num_motions = [len(positions[b][0]) for b in range(batch_size)]
     print("    actions took %fs (%d-%d motions)" % (time.perf_counter() - perf_counter, min(num_motions), max(num_motions)))
 
     # simulate to get rewards
     perf_counter = time.perf_counter()
     rewards, sym = tuple(np.zeros((batch_size, num_episodes)) for _ in [0,1])
     for b, problem in enumerate(problems):
         for e in range(num_episodes):
             env.reset()
             env.load_blocks(problem.thing_below)
             for position in positions[b][e]:
                 mp_tracker.reset()
                 env.goto_position(position.detach().numpy(), speed=1.5)
                 rewards[b,e] -= mp_tracker.penalty
             sym[b,e] = compute_symbolic_reward(env, problem.goal_thing_below)
             rewards[b,e] += sym[b,e]
             # print("      %d,%d: %f" % (b,e,rewards[b,e]))
     print("    simulation rewards took %fs" % (time.perf_counter() - perf_counter))
     
     avg_reward = rewards[:,0].mean() # noiseless episodes
     if batch_iter+1 == num_batch_iters:
         print("   batch iter %d took %fs, avg reward = %f" % (batch_iter, time.perf_counter() - batch_iter_counter, avg_reward))
         results.append((avg_reward, rewards, {}, []))
         with open(results_file, "wb") as f: pk.dump(results, f)
         break
     
     # set up dual descent
     perf_counter = time.perf_counter()
     opt_index = rewards.argmax(axis=1)
     print("    %d problems with better noisy episodes" % (opt_index > 0).sum())
Example #9
0
def run_episode(env,
                thing_below,
                goal_thing_below,
                nvm,
                init_regs,
                init_conns,
                penalty_tracker,
                sigma=0):
    """Run one nvm-controlled episode, tracking per-motion penalties.

    Reloads thing_below into env, encodes the inverted goal into the nvm,
    then ticks the machine; whenever the decoded "tar" register changes, a
    joint target is read from "jnt" (perturbed by Gaussian noise of scale
    sigma when sigma > 0) and executed, with -penalty logged as that
    motion's reward.

    Returns (end_reward, log_prob, rewards, log_probs): the terminal reward
    (also folded into the last motion's reward), the summed sample
    log-likelihood, and the per-motion reward / log-prob lists.
    """

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below,
                              num_blocks=len(thing_below),
                              num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"  # nvm encoding uses "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0  # accumulate over episode
    log_probs, rewards = [], []

    dbg = False
    if dbg: nvm.dbg()
    target_changed = False
    while True:
        done = nvm.tick()  # reliable if core is not trained
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print("     tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                # exploration noise around the nvm's joint target
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs.append(
                    dist.log_prob(position).sum())  # multivariate white noise
                log_prob += log_probs[-1]
            else:
                position = mu

            penalty_tracker.reset()
            # nvm.dbg()
            # print("       pos:", position.detach().numpy())
            nvm.env.goto_position(position.detach().numpy())
            rewards.append(-penalty_tracker.penalty)
            # print("net penalty: %.5f" % penalty_tracker.penalty)
            # input('...')

        tar = nvm.registers["tar"]
        # decode has some robustness to noise even if tar connections are trained
        target_changed = (tar.decode(tar.content) != tar.decode(
            tar.old_content))
        if done: break

    if len(rewards) == 0:  # target never changed
        # fallback: near-deterministic distribution so a log-prob exists,
        # with a fixed -10 penalty for producing no motion at all
        mu = nvm.registers["jnt"].content
        dist = tr.distributions.normal.Normal(mu, 0.001)
        log_probs.append(dist.log_prob(mu).sum())  # multivariate white noise
        rewards = [-10]

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    end_reward = calc_reward(sym_reward, spa_reward)
    rewards[-1] += end_reward

    return end_reward, log_prob, rewards, log_probs
def run_trial(domain):
    """Compare the rvm and its virtualized nvm on one random instance.

    Builds a fresh environment, samples a problem from *domain*, runs the
    abstract machine (rvm) and the neural virtual machine (nvm) on the same
    instance, and scores each run.

    Returns (rvm_results, nvm_results, nvm.size(), problem), where each
    *_results tuple is (ticks, runtime, sym_reward, spa_reward).
    """

    env = BlocksWorldEnv(show=False)

    # rejection sample non-trivial instance
    problem = domain.random_problem_instance()
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)

    # set up rvm and virtualize
    rvm = make_abstract_machine(env, domain)
    memorize_problem(rvm, problem)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")
    nvm = virtualize(rvm, σ=nv.default_activator, detach_gates=True)
    nvm.mount("main")
    # batched initial weights/activities for the nvm network
    W_init = {
        name: {
            0: nvm.net.batchify_weights(conn.W)
        }
        for name, conn in nvm.connections.items()
    }
    v_init = {
        name: {
            0: nvm.net.batchify_activities(reg.content)
        }
        for name, reg in nvm.registers.items()
    }
    # start the joint register at the "rest" pose
    v_init["jnt"][0] = nvm.net.batchify_activities(
        tr.tensor(rvm.ik["rest"]).float())

    # rvm_results = run_machine(rvm, problem.goal_thing_below, {"jnt": "rest"})
    start = time.perf_counter()
    tar_changed = False
    while True:
        done = rvm.tick()
        # move whenever the target register changed on the previous tick
        if tar_changed:
            position = rvm.ik[rvm.registers["jnt"].content]
            env.goto_position(position, speed=1.5)
        if done: break
        tar_changed = (rvm.registers["tar"].content !=
                       rvm.registers["tar"].old_content)
    rvm_ticks = rvm.tick_counter
    rvm_runtime = time.perf_counter() - start
    rvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    rvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    rvm_results = rvm_ticks, rvm_runtime, rvm_sym, rvm_spa

    # nvm_results = run_machine(nvm, problem.goal_thing_below, {"jnt": tr.tensor(rvm.ik["rest"]).float()})
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)
    start = time.perf_counter()
    while True:
        t = nvm.net.tick_counter
        # terminate when the instruction-pointer register stops changing
        if t > 0 and nvm.decode("ipt", t, 0) == nvm.decode("ipt", t - 1, 0):
            break
        nvm.net.tick(W_init, v_init)
        nvm.pullback(t)
        # move whenever the decoded target register changed
        if t > 1 and nvm.decode("tar", t - 2, 0) != nvm.decode(
                "tar", t - 1, 0):
            position = nvm.net.activities["jnt"][t][0, :, 0].detach().numpy()
            env.goto_position(position, speed=1.5)
    nvm_ticks = nvm.net.tick_counter
    nvm_runtime = time.perf_counter() - start
    nvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    nvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    nvm_results = nvm_ticks, nvm_runtime, nvm_sym, nvm_spa

    env.close()
    return rvm_results, nvm_results, nvm.size(), problem
Example #11
0
 def step_hook(self, env, action):
     """Per-step hook: log the movement penalty and current symbolic reward.

     Fix: removed a leftover interactive debug pause
     (`if pen > 0.01: input("pen...")`) that blocked unattended runs
     waiting on stdin whenever the penalty exceeded the threshold.
     """
     pen = env.movement_penalty()
     self.mp.append(pen)
     self.sym.append(
         compute_symbolic_reward(env, self.goal_thing_below))