import time
import torch as tr
import pybullet as pb
# (make_abstract_machine, virtualize, memorize_problem, BlocksWorldEnv,
#  compute_symbolic_reward, compute_spatial_reward, and nv are assumed to be
#  imported from the surrounding project modules)

def get_rvm_timesteps(rvm, problem, simulate=False, dbg=False):
    # run the rvm on `problem` to completion and return its tick count,
    # optionally simulating the joint motions in the environment
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")  # sets tick counter to 0
    memorize_problem(rvm, problem)
    if dbg:
        rvm.dbg()
    while True:
        done = rvm.tick()
        if dbg:
            rvm.dbg()
        # whenever the joint register changes, move the arm to the new target
        if simulate and rvm.registers["jnt"].content != rvm.registers["jnt"].old_content:
            position = rvm.ik[rvm.registers["jnt"].content]
            rvm.env.goto_position(position)
        if done:
            break
    return rvm.tick_counter
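# Usage sketch for get_rvm_timesteps (illustrative only; assumes a sampled
# `problem`/`domain` and the project's env/machine factories are in scope):
#
#   env = BlocksWorldEnv(show=False)
#   env.load_blocks(problem.thing_below)
#   rvm = make_abstract_machine(env, domain)
#   num_time_steps = get_rvm_timesteps(rvm, problem, simulate=False)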
def run_episodes(problem, nvm, W_init, v_init, num_time_steps, num_episodes,
                 penalty_tracker, sigma):
    memorize_problem(nvm, problem)
    for name in ["obj", "loc", "goal"]:
        W_init[name][0] = nvm.connections[name].W.unsqueeze(dim=0)

    # run the nvm forward for the full horizon
    perf_counter = time.perf_counter()
    # W, v = nvm.net.run(W_init, v_init, num_time_steps)
    nvm.net.clear_ticks()
    for t in range(num_time_steps):
        nvm.net.tick(W_init, v_init)
        # nvm.pullback(t)
        # nvm.dbg()
        # input('.')
    W, v = nvm.net.weights, nvm.net.activities
    print(" NVM run took %fs (%d timesteps)" % (time.perf_counter() - perf_counter, num_time_steps))

    # at each target change, sample noisy joint positions per episode
    perf_counter = time.perf_counter()
    positions, log_probs = tuple({b: list() for b in range(num_episodes)} for _ in [0, 1])
    for t in range(2, num_time_steps):
        if nvm.decode("tar", t - 2) != nvm.decode("tar", t - 1):
            mu = v["jnt"][t][0, :, 0]
            dist = tr.distributions.normal.Normal(mu, sigma)
            for b in range(num_episodes):
                position = dist.sample() if b > 0 else mu  # first episode noiseless
                positions[b].append(position)
                log_probs[b].append(dist.log_prob(position).sum())  # multivariate white noise
    # debug any episodes whose log-probs went NaN
    for b in range(num_episodes):
        if any([tr.isnan(lp) for lp in log_probs[b]]):
            print(" " * 6, log_probs[b])
            for t in range(2, num_time_steps):
                if nvm.decode("tar", t - 2, b) != nvm.decode("tar", t - 1, b):
                    nvm.pullback(t, b)
                    nvm.dbg()
                    # input('.')
            # input('.')
    print(" log probs took %fs (%d motions)" % (time.perf_counter() - perf_counter, len(positions[0])))

    # replay each episode's motions in simulation and collect rewards
    perf_counter = time.perf_counter()
    # env = BlocksWorldEnv(show=False, step_hook=penalty_tracker.step_hook)
    env = nvm.env
    rewards, sym = [], []
    for b in range(num_episodes):
        rewards.append([])
        env.reset()
        env.load_blocks(problem.thing_below)
        for position in positions[b]:
            penalty_tracker.reset()
            env.goto_position(position.detach().numpy(), speed=1.5)
            rewards[b].append(-penalty_tracker.penalty)
        sym_reward = compute_symbolic_reward(env, problem.goal_thing_below)
        # spa_reward = compute_spatial_reward(env, problem.goal_thing_below)
        # end_reward = calc_reward(sym_reward, spa_reward)
        # rewards[b][-1] += end_reward
        rewards[b][-1] += sym_reward
        sym.append(sym_reward)
    # env.reset()
    # env.close()
    print(" simulation rewards took %fs" % (time.perf_counter() - perf_counter))

    # REINFORCE with a de-biased baseline over the noisy episodes
    perf_counter = time.perf_counter()
    rewards_to_go = []
    for b in range(num_episodes):
        rewards[b] = tr.tensor(rewards[b]).float()
        rtg = tr.cumsum(rewards[b], dim=0)
        rtg = rtg[-1] - rtg + rewards[b]  # rtg[t] = sum of rewards from t onward
        rewards_to_go.append(rtg)
    baselines = tr.stack(rewards_to_go[1:]).mean(dim=0)  # exclude noiseless episode
    baselines *= (num_episodes - 1) / (num_episodes - 2)  # de-bias
    baseline = baselines[0]
    loss = tr.sum(tr.stack([
        -((rewards_to_go[b] - baselines) * tr.stack(log_probs[b])).sum()
            / (num_episodes - 1) / len(positions[0])
        for b in range(1, num_episodes)  # exclude noiseless episode
    ]))
    loss.backward()
    print(" backprop took %fs" % (time.perf_counter() - perf_counter))

    return sym, rewards_to_go, baseline
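# A quick sanity check of the reward-to-go identity used above (standalone
# sketch; only depends on torch): for rewards r, cumsum gives c[t] = sum of
# r[0..t], so c[-1] - c[t] + r[t] = sum of r[t..]:
#
#   r = tr.tensor([1., 0., 2., -1.])
#   c = tr.cumsum(r, dim=0)                           # [1., 1., 3., 2.]
#   assert tr.allclose(c[-1] - c + r, tr.tensor([2., 1., 1., -1.]))
#
# Usage sketch for run_episodes (illustrative; `params`, the optimizer, and
# sigma value are assumptions, not this codebase's confirmed training loop --
# note run_episodes already calls loss.backward(), so the caller only steps):
#
#   opt = tr.optim.Adam(params, lr=1e-3)
#   for epoch in range(num_epochs):
#       opt.zero_grad()
#       sym, rewards_to_go, baseline = run_episodes(
#           problem, nvm, W_init, v_init, num_time_steps,
#           num_episodes, penalty_tracker, sigma=0.001)
#       opt.step()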
        goal_thing_above)

    env = BlocksWorldEnv(show=showpb, step_hook=tracker.step_hook)
    yaw, pitch, dist, targ = 0, -7, 1.1, (0, 0.75, 0)  # got from running blocks_world.py
    pb.resetDebugVisualizerCamera(dist, yaw, pitch, targ)
    env.reset()
    env.load_blocks(problem.thing_below)
    if showpb:
        input('...')

    # set up rvm and virtualize
    rvm = make_abstract_machine(env, domain)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")
    memorize_problem(rvm, problem)

    nvm = virtualize(rvm, σ=nv.default_activator, detach_gates=detach_gates)
    nvm.mount("main")
    memorize_problem(nvm, problem)
    for name in ["obj", "loc", "goal"]:
        W_init[name][0] = nvm.connections[name].W.unsqueeze(dim=0)

    # run the rvm, moving the arm whenever the joint register changes
    while True:
        done = rvm.tick()
        if rvm.registers["jnt"].content != rvm.registers["jnt"].old_content:
            position = rvm.ik[rvm.registers["jnt"].content]
            env.goto_position(position, speed=1.5)
        if done:
            break
def run_trial(domain):
    env = BlocksWorldEnv(show=False)

    # rejection sample non-trivial instance
    problem = domain.random_problem_instance()
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)

    # set up rvm and virtualize
    rvm = make_abstract_machine(env, domain)
    memorize_problem(rvm, problem)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")

    nvm = virtualize(rvm, σ=nv.default_activator, detach_gates=True)
    nvm.mount("main")
    W_init = {name: {0: nvm.net.batchify_weights(conn.W)}
              for name, conn in nvm.connections.items()}
    v_init = {name: {0: nvm.net.batchify_activities(reg.content)}
              for name, reg in nvm.registers.items()}
    v_init["jnt"][0] = nvm.net.batchify_activities(tr.tensor(rvm.ik["rest"]).float())

    # time the rvm running the problem in simulation
    # rvm_results = run_machine(rvm, problem.goal_thing_below, {"jnt": "rest"})
    start = time.perf_counter()
    tar_changed = False
    while True:
        done = rvm.tick()
        if tar_changed:
            position = rvm.ik[rvm.registers["jnt"].content]
            env.goto_position(position, speed=1.5)
        if done:
            break
        tar_changed = (rvm.registers["tar"].content != rvm.registers["tar"].old_content)
    rvm_ticks = rvm.tick_counter
    rvm_runtime = time.perf_counter() - start
    rvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    rvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    rvm_results = rvm_ticks, rvm_runtime, rvm_sym, rvm_spa

    # time the nvm running the same problem from a fresh environment
    # nvm_results = run_machine(nvm, problem.goal_thing_below, {"jnt": tr.tensor(rvm.ik["rest"]).float()})
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)
    start = time.perf_counter()
    while True:
        t = nvm.net.tick_counter
        # halt once the ipt register repeats
        if t > 0 and nvm.decode("ipt", t, 0) == nvm.decode("ipt", t - 1, 0):
            break
        nvm.net.tick(W_init, v_init)
        nvm.pullback(t)
        if t > 1 and nvm.decode("tar", t - 2, 0) != nvm.decode("tar", t - 1, 0):
            position = nvm.net.activities["jnt"][t][0, :, 0].detach().numpy()
            env.goto_position(position, speed=1.5)
    nvm_ticks = nvm.net.tick_counter
    nvm_runtime = time.perf_counter() - start
    nvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    nvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    nvm_results = nvm_ticks, nvm_runtime, nvm_sym, nvm_spa

    env.close()
    return rvm_results, nvm_results, nvm.size(), problem
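# Usage sketch for run_trial (illustrative; how `domain` is constructed is an
# assumption -- substitute this codebase's actual domain factory):
#
#   if __name__ == "__main__":
#       domain = ...  # e.g. a blocks-world domain object with num_bases etc.
#       rvm_results, nvm_results, nvm_size, problem = run_trial(domain)
#       print("rvm (ticks, runtime, sym, spa):", rvm_results)
#       print("nvm (ticks, runtime, sym, spa):", nvm_results)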