def generate_data(num_blocks, base_name):
    # sample random initial and goal configurations
    thing_below = random_thing_below(num_blocks, max_levels=3)
    goal_thing_below = random_thing_below(num_blocks, max_levels=3)

    dump = DataDump(goal_thing_below, hook_period=1)
    env = BlocksWorldEnv(pb.POSITION_CONTROL, show=False, control_period=12,
                         step_hook=dump.step_hook)
    env.load_blocks(thing_below)

    # run the scripted restacker and score the outcome
    restacker = Restacker(env, goal_thing_below, dump)
    restacker.run()
    reward = compute_symbolic_reward(env, goal_thing_below)
    final_thing_below = env.thing_below
    commands = [frame["command"] for frame in dump.data]

    # save episode metadata
    data_file = "%s/meta.pkl" % base_name
    data = (thing_below, goal_thing_below, final_thing_below, reward, commands)
    with open(data_file, "wb") as f:
        pk.dump(data, f)
    env.close()

    # save one tensor file per recorded command frame
    for d, frame in enumerate(dump.data):
        _, (thing, block) = frame["command"]
        position, action, rgba, coords_of, _ = zip(*frame["records"])
        position = tr.tensor(np.stack(position)).float()
        action = tr.tensor(np.stack(action)).float()
        rgba = tr.tensor(np.stack(rgba))
        block_coords = tr.tensor(np.stack([co[block] for co in coords_of])).float()
        thing_coords = tr.tensor(np.stack([co[thing] for co in coords_of])).float()

        # preprocessing
        rgb, block_coords, thing_coords = preprocess(rgba, block_coords, thing_coords)

        data_file = "%s/%03d.pt" % (base_name, d)
        tr.save((position, action, rgb, block_coords, thing_coords), data_file)

    print(" success=%s (start, end, goal)" % (reward == 0))
    print(" ", thing_below)
    print(" ", env.thing_below)
    print(" ", goal_thing_below)
    return reward
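# Hypothetical driver for generate_data (a sketch, not part of the original
# module): base_name is assumed to be a per-episode directory, which must exist
# before the "%s/meta.pkl" and "%s/%03d.pt" writes above can succeed.
if __name__ == "__main__":
    import os
    num_blocks, num_episodes = 5, 10
    for ep in range(num_episodes):
        base_name = "episodes/%03d" % ep
        os.makedirs(base_name, exist_ok=True)
        reward = generate_data(num_blocks, base_name)
        print("episode %d: reward = %s" % (ep, reward))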
def run_machine(machine, goal_thing_below, reset_dict):
    # invert the goal and rename empty slots for the machine's vocabulary
    goal_thing_above = machine.env.invert(goal_thing_below)
    for key, val in goal_thing_above.items():
        if val == "none":
            goal_thing_above[key] = "nil"

    start = time.perf_counter()
    memorize_env(machine, goal_thing_above)
    machine.reset(reset_dict)
    ticks = machine.run()
    running_time = time.perf_counter() - start

    sym_reward = compute_symbolic_reward(machine.env, goal_thing_below)
    spa_reward = compute_spatial_reward(machine.env, goal_thing_below)
    return ticks, running_time, sym_reward, spa_reward
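# Example call (a sketch; `rvm` is the abstract machine built by
# make_abstract_machine elsewhere in this file):
#   ticks, running_time, sym, spa = run_machine(rvm, goal_thing_below, {"jnt": "rest"})
# sym == 0 indicates the goal stacking was reached exactly, as in the
# success check printed by generate_data above.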
def run_episode(env, thing_below, goal_thing_below, nvm, init_regs, init_conns, sigma=0):
    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below,
                              num_blocks=len(thing_below), num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none":
            goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")
    log_prob = 0.0  # accumulate over episode

    dbg = False
    if dbg: nvm.dbg()
    target_changed = True
    while True:
        done = nvm.tick()
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print(" tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_prob += dist.log_prob(position).sum()  # multivariate white noise
            else:
                position = mu
            nvm.env.goto_position(position.detach().numpy())
        tar = nvm.registers["tar"]
        target_changed = (tar.decode(tar.content) != tar.decode(tar.old_content))
        if done: break

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    reward = calc_reward(sym_reward, spa_reward)
    return reward, log_prob
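# A minimal REINFORCE outer loop over run_episode (a sketch, not the original
# training code; it assumes `opt` is a torch optimizer over the trainable nvm
# connection weights, `problems` is a list of (thing_below, goal_thing_below)
# pairs, and sigma > 0 so that log_prob carries gradients):
def train_sketch(env, problems, nvm, init_regs, init_conns, opt, sigma=0.1, num_updates=100):
    baseline = 0.0
    for i in range(num_updates):
        thing_below, goal_thing_below = problems[i % len(problems)]
        reward, log_prob = run_episode(
            env, thing_below, goal_thing_below, nvm, init_regs, init_conns, sigma=sigma)
        loss = -(reward - baseline) * log_prob  # policy-gradient surrogate
        opt.zero_grad()
        loss.backward()
        opt.step()
        baseline = 0.9 * baseline + 0.1 * reward  # running-average baseline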
def rvm_baseline(env, thing_below, goal_thing_below, goal_thing_above, rvm):
    start = time.perf_counter()
    # reload blocks
    env.reset()
    env.load_blocks(thing_below)
    # reset rvm, input new env, mount main program
    rvm.env = env
    memorize_env(rvm, goal_thing_above)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")
    # run
    ticks = rvm.run()
    running_time = time.perf_counter() - start
    sym_reward = compute_symbolic_reward(env, goal_thing_below)
    spa_reward = compute_spatial_reward(env, goal_thing_below)
    reward = calc_reward(sym_reward, spa_reward)
    return running_time, reward
def step_hook(self, env, action):
    self.mp.append(env.movement_penalty())
    self.sym.append(compute_symbolic_reward(env, self.goal_thing_below))
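# The hooks in this file assume a tracker object that accumulates movement
# penalties between explicit resets (see penalty_tracker.reset() / .penalty in
# run_episodes below). A minimal sketch of that pattern; the class name and
# internals are illustrative, the real tracker lives elsewhere in the codebase:
class PenaltyTrackerSketch:
    def __init__(self, goal_thing_below):
        self.goal_thing_below = goal_thing_below
        self.penalty = 0.0
        self.mp, self.sym = [], []
    def reset(self):
        # clear the accumulator before each motion
        self.penalty = 0.0
    def step_hook(self, env, action):
        pen = env.movement_penalty()
        self.penalty += pen
        self.mp.append(pen)
        self.sym.append(compute_symbolic_reward(env, self.goal_thing_below))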
def run_episodes(problem, nvm, W_init, v_init, num_time_steps, num_episodes, penalty_tracker, sigma):

    # run the nvm forward to generate the joint-target trajectory
    memorize_problem(nvm, problem)
    for name in ["obj", "loc", "goal"]:
        W_init[name][0] = nvm.connections[name].W.unsqueeze(dim=0)

    perf_counter = time.perf_counter()
    # W, v = nvm.net.run(W_init, v_init, num_time_steps)
    nvm.net.clear_ticks()
    for t in range(num_time_steps):
        nvm.net.tick(W_init, v_init)
        # nvm.pullback(t)
        # nvm.dbg()
        # input('.')
    W, v = nvm.net.weights, nvm.net.activities
    print(" NVM run took %fs (%d timesteps)" % (time.perf_counter() - perf_counter, num_time_steps))

    # sample noisy joint positions at each target change; episode 0 is noiseless
    perf_counter = time.perf_counter()
    positions, log_probs = tuple({b: list() for b in range(num_episodes)} for _ in [0, 1])
    tar = nvm.registers["tar"]
    for t in range(2, num_time_steps):
        if nvm.decode("tar", t - 2) != nvm.decode("tar", t - 1):
            mu = v["jnt"][t][0, :, 0]
            dist = tr.distributions.normal.Normal(mu, sigma)
            for b in range(num_episodes):
                position = dist.sample() if b > 0 else mu  # first episode noiseless
                positions[b].append(position)
                log_probs[b].append(dist.log_prob(position).sum())  # multivariate white noise

    # debug dump if any log-prob went NaN
    for b in range(num_episodes):
        if any([tr.isnan(lp) for lp in log_probs[b]]):
            print(" " * 6, log_probs[b])
            for t in range(2, num_time_steps):
                if nvm.decode("tar", t - 2, b) != nvm.decode("tar", t - 1, b):
                    nvm.pullback(t, b)
                    nvm.dbg()
                    # input('.')
            # input('.')
    print(" log probs took %fs (%d motions)" % (time.perf_counter() - perf_counter, len(positions[0])))

    # simulate each episode to collect movement penalties and final symbolic reward
    perf_counter = time.perf_counter()
    # env = BlocksWorldEnv(show=False, step_hook=penalty_tracker.step_hook)
    env = nvm.env
    rewards, sym = [], []
    for b in range(num_episodes):
        rewards.append([])
        env.reset()
        env.load_blocks(problem.thing_below)
        for position in positions[b]:
            penalty_tracker.reset()
            env.goto_position(position.detach().numpy(), speed=1.5)
            rewards[b].append(-penalty_tracker.penalty)
        sym_reward = compute_symbolic_reward(env, problem.goal_thing_below)
        # spa_reward = compute_spatial_reward(env, problem.goal_thing_below)
        # end_reward = calc_reward(sym_reward, spa_reward)
        # rewards[b][-1] += end_reward
        rewards[b][-1] += sym_reward
        sym.append(sym_reward)
        # env.reset()
    # env.close()
    print(" simulation rewards took %fs" % (time.perf_counter() - perf_counter))

    # REINFORCE with rewards-to-go and a de-biased leave-one-out baseline
    perf_counter = time.perf_counter()
    rewards_to_go = []
    for b in range(num_episodes):
        rewards[b] = tr.tensor(rewards[b]).float()
        rtg = tr.cumsum(rewards[b], dim=0)
        rtg = rtg[-1] - rtg + rewards[b]  # suffix sums: reward-to-go per motion
        rewards_to_go.append(rtg)
    baselines = tr.stack(rewards_to_go[1:]).mean(dim=0)  # exclude noiseless
    baselines *= (num_episodes - 1) / (num_episodes - 2)  # de-bias
    baseline = baselines[0]

    loss = tr.sum(tr.stack([
        -((rewards_to_go[b] - baselines) * tr.stack(log_probs[b])).sum()
        / (num_episodes - 1) / len(positions[0])
        for b in range(1, num_episodes)]))
    # equivalently, accumulate per-episode and backprop incrementally:
    # for b in range(1, num_episodes):  # exclude noiseless
    #     loss = -((rewards_to_go[b] - baselines) * tr.stack(log_probs[b])).sum() / (num_episodes - 1) / len(positions[0])
    #     loss.backward(retain_graph=(b + 1 < num_episodes))
    loss.backward()
    print(" backprop took %fs" % (time.perf_counter() - perf_counter))

    return sym, rewards_to_go, baseline
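# Sanity check for the rewards-to-go identity used above: with c = cumsum(r),
# c[-1] - c + r yields the suffix sums r_t + r_{t+1} + ... for each t.
# A standalone worked example:
#   r = tr.tensor([1., 2., 3.])
#   c = tr.cumsum(r, dim=0)      # [1., 3., 6.]
#   rtg = c[-1] - c + r          # [6., 5., 3.] == [1+2+3, 2+3, 3]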
    # virtualize the rvm and copy its problem encoding (as in run_trial below)
    nvm = virtualize(rvm, σ=nv.default_activator, detach_gates=detach_gates)
    nvm.mount("main")
    memorize_problem(nvm, problem)
    for name in ["obj", "loc", "goal"]:
        W_init[name][0] = nvm.connections[name].W.unsqueeze(dim=0)

    # run the rvm in simulation, moving whenever the joint target changes
    while True:
        done = rvm.tick()
        if rvm.registers["jnt"].content != rvm.registers["jnt"].old_content:
            position = rvm.ik[rvm.registers["jnt"].content]
            env.goto_position(position, speed=1.5)
        if done: break
    num_time_steps = rvm.tick_counter

    rvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    rvm_mps, rvm_joints, rvm_grips = tracker.penalties[10:], tracker.joints, tracker.grips
    tracker.penalties, tracker.joints, tracker.grips = [], [], []
    print(rvm_sym, sum(rvm_mps))
    if showpb: input('...')

    # replay the same number of ticks through the nvm
    env.reset()
    env.load_blocks(problem.thing_below)
    nvm.net.clear_ticks()
    for t in range(num_time_steps):
        nvm.net.tick(W_init, v_init)
        if t > 1 and nvm.decode("tar", t - 2) != nvm.decode("tar", t - 1):
            position = nvm.net.activities["jnt"][t][0, :, 0]
            env.goto_position(position.detach().numpy(), speed=1.5)
                positions[b][e].append(position)
    num_motions = [len(positions[b][0]) for b in range(batch_size)]
    print(" actions took %fs (%d-%d motions)" % (
        time.perf_counter() - perf_counter, min(num_motions), max(num_motions)))

    # simulate to get rewards
    perf_counter = time.perf_counter()
    rewards, sym = tuple(np.zeros((batch_size, num_episodes)) for _ in [0, 1])
    for b, problem in enumerate(problems):
        for e in range(num_episodes):
            env.reset()
            env.load_blocks(problem.thing_below)
            for position in positions[b][e]:
                mp_tracker.reset()
                env.goto_position(position.detach().numpy(), speed=1.5)
                rewards[b, e] -= mp_tracker.penalty
            sym[b, e] = compute_symbolic_reward(env, problem.goal_thing_below)
            rewards[b, e] += sym[b, e]
            # print(" %d,%d: %f" % (b, e, rewards[b, e]))
    print(" simulation rewards took %fs" % (time.perf_counter() - perf_counter))

    avg_reward = rewards[:, 0].mean()  # noiseless episodes
    if batch_iter + 1 == num_batch_iters:
        print(" batch iter %d took %fs, avg reward = %f" % (
            batch_iter, time.perf_counter() - batch_iter_counter, avg_reward))
        results.append((avg_reward, rewards, {}, []))
        with open(results_file, "wb") as f:
            pk.dump(results, f)
        break

    # set up dual descent
    perf_counter = time.perf_counter()
    opt_index = rewards.argmax(axis=1)
    print(" %d problems with better noisy episodes" % (opt_index > 0).sum())
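# Small numpy illustration of the opt_index selection above (assumed shapes:
# rewards is (batch_size, num_episodes), with episode 0 the noiseless one):
#   rewards = np.array([[1.0, 0.5, 2.0],
#                       [3.0, 1.0, 2.5]])
#   opt_index = rewards.argmax(axis=1)   # [2, 0]: best episode per problem
#   (opt_index > 0).sum()                # 1 problem where a noisy episode won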
def run_episode(env, thing_below, goal_thing_below, nvm, init_regs, init_conns, penalty_tracker, sigma=0):
    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below,
                              num_blocks=len(thing_below), num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none":
            goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0  # accumulate over episode
    log_probs, rewards = [], []

    dbg = False
    if dbg: nvm.dbg()
    target_changed = False
    while True:
        done = nvm.tick()  # reliable if core is not trained
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print(" tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs.append(dist.log_prob(position).sum())  # multivariate white noise
                log_prob += log_probs[-1]
            else:
                position = mu
            penalty_tracker.reset()
            # nvm.dbg()
            # print(" pos:", position.detach().numpy())
            nvm.env.goto_position(position.detach().numpy())
            rewards.append(-penalty_tracker.penalty)
            # print("net penalty: %.5f" % penalty_tracker.penalty)
            # input('...')
        tar = nvm.registers["tar"]
        # decode has some robustness to noise even if tar connections are trained
        target_changed = (tar.decode(tar.content) != tar.decode(tar.old_content))
        if done: break

    if len(rewards) == 0:  # target never changed
        mu = nvm.registers["jnt"].content
        dist = tr.distributions.normal.Normal(mu, 0.001)
        log_probs.append(dist.log_prob(mu).sum())  # multivariate white noise
        rewards = [-10]

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    end_reward = calc_reward(sym_reward, spa_reward)
    rewards[-1] += end_reward
    return end_reward, log_prob, rewards, log_probs
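# A sketch of turning this episode's per-motion rewards and log-probs into a
# policy-gradient loss (assumes a running scalar `baseline` maintained by the
# caller; mirrors the rewards-to-go scheme in run_episodes above):
def episode_loss_sketch(rewards, log_probs, baseline=0.0):
    r = tr.tensor(rewards).float()
    rtg = tr.cumsum(r, dim=0)
    rtg = rtg[-1] - rtg + r  # suffix sums: reward-to-go per motion
    return -((rtg - baseline) * tr.stack(log_probs)).sum() / len(log_probs)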
def run_trial(domain):
    env = BlocksWorldEnv(show=False)
    # rejection sample non-trivial instance
    problem = domain.random_problem_instance()
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)

    # set up rvm and virtualize
    rvm = make_abstract_machine(env, domain)
    memorize_problem(rvm, problem)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")

    nvm = virtualize(rvm, σ=nv.default_activator, detach_gates=True)
    nvm.mount("main")
    W_init = {name: {0: nvm.net.batchify_weights(conn.W)}
              for name, conn in nvm.connections.items()}
    v_init = {name: {0: nvm.net.batchify_activities(reg.content)}
              for name, reg in nvm.registers.items()}
    v_init["jnt"][0] = nvm.net.batchify_activities(tr.tensor(rvm.ik["rest"]).float())

    # rvm_results = run_machine(rvm, problem.goal_thing_below, {"jnt": "rest"})
    start = time.perf_counter()
    tar_changed = False
    while True:
        done = rvm.tick()
        if tar_changed:
            position = rvm.ik[rvm.registers["jnt"].content]
            env.goto_position(position, speed=1.5)
        if done: break
        tar_changed = (rvm.registers["tar"].content != rvm.registers["tar"].old_content)
    rvm_ticks = rvm.tick_counter
    rvm_runtime = time.perf_counter() - start
    rvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    rvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    rvm_results = rvm_ticks, rvm_runtime, rvm_sym, rvm_spa

    # nvm_results = run_machine(nvm, problem.goal_thing_below, {"jnt": tr.tensor(rvm.ik["rest"]).float()})
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)
    start = time.perf_counter()
    while True:
        t = nvm.net.tick_counter
        if t > 0 and nvm.decode("ipt", t, 0) == nvm.decode("ipt", t - 1, 0): break
        nvm.net.tick(W_init, v_init)
        nvm.pullback(t)
        if t > 1 and nvm.decode("tar", t - 2, 0) != nvm.decode("tar", t - 1, 0):
            position = nvm.net.activities["jnt"][t][0, :, 0].detach().numpy()
            env.goto_position(position, speed=1.5)
    nvm_ticks = nvm.net.tick_counter
    nvm_runtime = time.perf_counter() - start
    nvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    nvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    nvm_results = nvm_ticks, nvm_runtime, nvm_sym, nvm_spa

    env.close()
    return rvm_results, nvm_results, nvm.size(), problem
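# Hypothetical experiment driver for run_trial (a sketch; the trial count and
# results filename are assumptions, not part of the original code). Each
# *_results tuple is (ticks, runtime, sym_reward, spa_reward) per run_trial:
def run_experiment_sketch(domain, num_trials=30, results_file="trial_results.pkl"):
    results = []
    for trial in range(num_trials):
        rvm_results, nvm_results, nvm_size, problem = run_trial(domain)
        results.append((rvm_results, nvm_results, nvm_size))
        print("trial %d: rvm sym=%s, nvm sym=%s" % (trial, rvm_results[2], nvm_results[2]))
    with open(results_file, "wb") as f:
        pk.dump(results, f)
    return results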
def step_hook(self, env, action):
    pen = env.movement_penalty()
    if pen > 0.01: input("pen...")  # debug: pause on large per-step penalties
    self.mp.append(pen)
    self.sym.append(compute_symbolic_reward(env, self.goal_thing_below))