def test_c51(args=get_args()):
    args.cfg_path = f"maps/{args.task}.cfg"
    args.wad_path = f"maps/{args.task}.wad"
    args.res = (args.skip_num, 84, 84)
    env = Env(args.cfg_path, args.frames_stack, args.res)
    args.state_shape = args.res
    args.action_shape = env.action_space.shape or env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = ShmemVectorEnv([
        lambda: Env(args.cfg_path, args.frames_stack, args.res)
        for _ in range(args.training_num)
    ])
    test_envs = ShmemVectorEnv([
        lambda: Env(args.cfg_path, args.frames_stack, args.res, args.save_lmp)
        for _ in range(min(os.cpu_count() - 1, args.test_num))
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = C51(*args.state_shape, args.action_shape, args.num_atoms, args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # define policy
    policy = C51Policy(
        net,
        optim,
        args.gamma,
        args.num_atoms,
        args.v_min,
        args.v_max,
        args.n_step,
        target_update_freq=args.target_update_freq
    ).to(args.device)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = VectorReplayBuffer(
        args.buffer_size,
        buffer_num=len(train_envs),
        ignore_obs_next=True,
        save_only_last_obs=True,
        stack_num=args.frames_stack
    )
    # collector
    train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_path = os.path.join(args.logdir, args.task, 'c51')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    def train_fn(epoch, env_step):
        # nature DQN setting, linear decay in the first 1M steps
        if env_step <= 1e6:
            eps = args.eps_train - env_step / 1e6 * \
                (args.eps_train - args.eps_train_final)
        else:
            eps = args.eps_train_final
        policy.set_eps(eps)
        if env_step % 1000 == 0:
            logger.write("train/env_step", env_step, {"train/eps": eps})

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        if args.save_buffer_name:
            print(f"Generate buffer with size {args.buffer_size}")
            buffer = VectorReplayBuffer(
                args.buffer_size,
                buffer_num=len(test_envs),
                ignore_obs_next=True,
                save_only_last_obs=True,
                stack_num=args.frames_stack
            )
            collector = Collector(policy, test_envs, buffer, exploration_noise=True)
            result = collector.collect(n_step=args.buffer_size)
            print(f"Save buffer into {args.save_buffer_name}")
            # Unfortunately, pickle will cause oom with 1M buffer size
            buffer.save_hdf5(args.save_buffer_name)
        else:
            print("Testing agent ...")
            test_collector.reset()
            result = test_collector.collect(n_episode=args.test_num, render=args.render)
        rew = result["rews"].mean()
        lens = result["lens"].mean() * args.skip_num
        print(f'Mean reward (over {result["n/ep"]} episodes): {rew}')
        print(f'Mean length (over {result["n/ep"]} episodes): {lens}')

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
        update_per_step=args.update_per_step,
        test_in_train=False
    )

    pprint.pprint(result)
    watch()
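# A quick worked check of the linear epsilon schedule implemented in train_fn
# above, assuming eps_train = 1.0 and eps_train_final = 0.05 (hypothetical
# values; the real defaults come from get_args(), which is not shown here):
#   env_step = 0       ->  eps = 1.0
#   env_step = 5e5     ->  eps = 1.0 - 0.5 * (1.0 - 0.05) = 0.525
#   env_step >= 1e6    ->  eps = 0.05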
config.n_mix = 10
config.z_dim = 4
config.x_transformed = 100
config.decode.net_size = 100
config.encode.net_size = 100
config.weight_factor = 0.5

layers_num = 2
dim_size = 1
rnn_layers = 1
n_mix = 10

env = Env("vrae", clear_pics=True)

encode_cell = tf.nn.rnn_cell.MultiRNNCell(
    rnn_layers * [tf.nn.rnn_cell.LSTMCell(config.encode.net_size)]
)
decode_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers * [
    ProjectionCell(tf.nn.rnn_cell.LSTMCell(config.decode.net_size), n_mix, activation=tf.nn.tanh)
])

input = tf.placeholder(tf.float32, shape=(seq_size, batch_size, dim_size), name="Input")

# state = tf.placeholder(tf.float32, [batch_size, net_size], name="state")
state = tuple(
    tf.nn.rnn_cell.LSTMStateTuple(*[tf.zeros((batch_size, sz)) for sz in sz_outer])
    for sz_outer in encode_cell.state_size
)
u, a, a_m, x_hat_flat = u_ta.stack(), a_ta.stack(), a_m_ta.stack(), x_hat_flat_ta.stack()

x_hat = tf.reshape(x_hat_flat, (seq_size, batch_size, filter_len, input_size))

grads_and_vars = []
for li, s in enumerate(finstate):
    dF = s[-1]
    grads_and_vars += [
        (-tf.reduce_mean(dF, 0), net._cells[li].F_flat),
    ]

sess = tf.Session()

env = Env("lca_simple")

x_v = np.zeros((seq_size, batch_size, input_size))
for bi in xrange(batch_size):
    for ni in xrange(input_size):
        x_v[:, bi, ni] = generate_ts(seq_size)

x_v = x_v.reshape((seq_size, batch_size, input_size))

state_v = get_zero_state()

sample_size = 20
E = np.zeros((sample_size, sample_size))
    MalNil,
    MalBoolean,
    MalFunctionCompiled,
    MalFunctionRaw,
    MalAtom,
    MalVector,
    MalHash_map,
)
from mal_types import (
    MalUnknownSymbolException,
    MalSyntaxException,
    MalInvalidArgumentException,
    MalString,
)

repl_env = Env(None)
for key in core.ns:
    repl_env.set(key, core.ns[key])


def eval_func(args: List[MalExpression]) -> MalExpression:
    a0 = args[0]
    assert isinstance(a0, MalExpression)
    return EVAL(a0, repl_env)


repl_env.set("eval", MalFunctionCompiled(lambda args: eval_func(args)))


def swap(args: List[MalExpression]) -> MalExpression:
    atom = args[0]
def EVAL(ast: MalExpression, env: Env) -> MalExpression: while True: dbgeval = env.get("DEBUG-EVAL") if (dbgeval is not None and not isinstance(dbgeval, MalNil) and (not isinstance(dbgeval, MalBoolean) or dbgeval.native())): print("EVAL: " + str(ast)) ast_native = ast.native() if isinstance(ast, MalSymbol): key = str(ast) val = env.get(key) if val is None: raise MalUnknownSymbolException(key) return val if isinstance(ast, MalVector): return MalVector([EVAL(x, env) for x in ast_native]) if isinstance(ast, MalHash_map): new_dict = {} # type: Dict[str, MalExpression] for key in ast_native: new_dict[key] = EVAL(ast_native[key], env) return MalHash_map(new_dict) if not isinstance(ast, MalList): return ast elif len(ast_native) == 0: return ast first_str = str(ast_native[0]) if first_str == "def!": name: str = str(ast_native[1]) value: MalExpression = EVAL(ast_native[2], env) return env.set(name, value) elif first_str == "let*": assert len(ast_native) == 3 let_env = Env(env) bindings: MalExpression = ast_native[1] assert isinstance(bindings, MalList) or isinstance( bindings, MalVector) bindings_list: List[MalExpression] = bindings.native() assert len(bindings_list) % 2 == 0 for i in range(0, len(bindings_list), 2): assert isinstance(bindings_list[i], MalSymbol) assert isinstance(bindings_list[i + 1], MalExpression) let_env.set(str(bindings_list[i]), EVAL(bindings_list[i + 1], let_env)) env = let_env ast = ast_native[2] continue elif first_str == "do": for x in range(1, len(ast_native) - 1): EVAL(ast_native[x], env) ast = ast_native[len(ast_native) - 1] continue elif first_str == "if": condition = EVAL(ast_native[1], env) if isinstance(condition, MalNil) or (isinstance(condition, MalBoolean) and condition.native() is False): if len(ast_native) >= 4: ast = ast_native[3] continue else: return MalNil() else: ast = ast_native[2] continue elif first_str == "fn*": raw_ast = ast_native[2] raw_params = ast_native[1] def fn(args: List[MalExpression]) -> MalExpression: f_ast = raw_ast f_env = Env(outer=env, binds=raw_params.native(), exprs=args) return EVAL(f_ast, f_env) return MalFunctionRaw(fn=fn, ast=raw_ast, params=raw_params, env=env) elif first_str == "quote": return (MalList(ast_native[1].native()) if isinstance( ast_native[1], MalVector) else ast_native[1]) elif first_str == "quasiquote": ast = quasiquote(ast_native[1]) continue else: f, *args = (EVAL(form, env) for form in ast_native) if isinstance(f, MalFunctionRaw): ast = f.ast() env = Env( outer=f.env(), binds=f.params().native(), exprs=args, ) continue elif isinstance(f, MalFunctionCompiled): return f.call(args) else: raise MalInvalidArgumentException(f, "not a function")
def EVAL(ast: MalExpression, env: Env) -> MalExpression: while True: ast = macroexpand(ast, env) ast_native = ast.native() if not isinstance(ast, MalList): return eval_ast(ast, env) elif len(ast_native) == 0: return ast first_str = str(ast_native[0]) if first_str == "macroexpand": return macroexpand(ast.native()[1], env) elif first_str == "def!": name: str = str(ast_native[1]) value: MalExpression = EVAL(ast_native[2], env) return env.set(name, value) if first_str == "defmacro!": name = str(ast_native[1]) value = EVAL(ast_native[2], env) assert isinstance(value, MalFunctionCompiled) or isinstance( value, MalFunctionRaw) value.make_macro() return env.set(name, value) elif first_str == "let*": assert len(ast_native) == 3 let_env = Env(env) bindings: MalExpression = ast_native[1] assert isinstance(bindings, MalList) or isinstance( bindings, MalVector) bindings_list: List[MalExpression] = bindings.native() assert len(bindings_list) % 2 == 0 for i in range(0, len(bindings_list), 2): assert isinstance(bindings_list[i], MalSymbol) assert isinstance(bindings_list[i + 1], MalExpression) let_env.set(str(bindings_list[i]), EVAL(bindings_list[i + 1], let_env)) env = let_env ast = ast_native[2] continue elif first_str == "do": for x in range(1, len(ast_native) - 1): EVAL(ast_native[x], env) ast = ast_native[len(ast_native) - 1] continue elif first_str == "if": condition = EVAL(ast_native[1], env) if isinstance(condition, MalNil) or (isinstance(condition, MalBoolean) and condition.native() is False): if len(ast_native) >= 4: ast = ast_native[3] continue else: return MalNil() else: ast = ast_native[2] continue elif first_str == "fn*": raw_ast = ast_native[2] raw_params = ast_native[1] def fn(args: List[MalExpression]) -> MalExpression: f_ast = raw_ast f_env = Env(outer=env, binds=raw_params.native(), exprs=args) return EVAL(f_ast, f_env) return MalFunctionRaw(fn=fn, ast=raw_ast, params=raw_params, env=env) elif first_str == "quote": return (MalList(ast_native[1].native()) if isinstance( ast_native[1], MalVector) else ast_native[1]) elif first_str == "quasiquote": ast = quasiquote(ast_native[1]) continue elif first_str == "try*": try: return EVAL(ast_native[1], env) except MalException as e: if len(ast_native) < 3: raise e catch_block = ast_native[2] assert (isinstance(catch_block, MalList) and isinstance(catch_block.native()[0], MalSymbol) and str(catch_block.native()[0]) == "catch*" and len(catch_block.native()) == 3) exception_symbol = catch_block.native()[1] assert isinstance(exception_symbol, MalSymbol) env = Env(env) env.set(str(exception_symbol), e.native()) ast = catch_block.native()[2] continue else: evaled_ast = eval_ast(ast, env) f = evaled_ast.native()[0] args = evaled_ast.native()[1:] if isinstance(f, MalFunctionRaw): ast = f.ast() env = Env( outer=f.env(), binds=f.params().native(), exprs=evaled_ast.native()[1:], ) continue elif isinstance(f, MalFunctionCompiled): return f.call(args) else: raise MalInvalidArgumentException(f, "not a function")
def test(args, T, dqn, val_mem, evaluate=False): global Ts, rewards, Qs, best_avg_reward env = Env(args) env.eval() Ts.append(T) T_rewards, T_Qs = [], [] # Test performance over several episodes done = True ramitha_frame_number = 0 for number in range(args.evaluation_episodes): while True: if done: state, reward_sum, done = env.reset(), 0, False action = dqn.act_e_greedy(state) # Choose an action ε-greedily state, reward, done= env.step(action) # Step if(number==0): ################################################ if(action == dqn.act(state)): ramitha_frame_number += 1 #Pixel Saliency : print(ramitha_frame_number) #dqn.pixel_saliency(state) #Guided backprop and Guided GradCAM #dqn.guided_backprop_and_CAM(state) RGBc_state = deepcopy(env.retRGB_state()) RGBc_state = cv2.cvtColor(RGBc_state,cv2.COLOR_RGB2BGR) #plt.imsave('./saliency_outputs_objectsaliency/current_state'+str(ramitha_frame_number)+'.jpg',RGBc_state,cmap='gray') masked_state = {} positions_listx = {} positions_listy = {} compset = [] comp_set = [] adv_obs_big = deepcopy(RGBc_state) sigma = 0.3 k = 300 min_val = 20 ''' sigma = 0.0 k = 300 min_val = 5 ''' adv_obs = adv_obs_big[45:185] #adv_obs = adv_obs_big[23:] plt.imsave('./saliency_outputs_objectsaliency/current_state'+str(ramitha_frame_number)+'.png',adv_obs,cmap='gray') #I change the jpg file into png——kunlun due to an error occur u,width,height = segment(adv_obs, sigma, k, min_val,999) compset = [] object_saliency = np.zeros((210,160)) for y in range(height): for x in range(width): comp = u.find(y * width + x) compset.append(comp) final = np.array(list(set(compset))) for i in range(len(final)): current_comp = final[i] copy_state = deepcopy(adv_obs) masked_state = np.zeros((210,160,1)) masked_state = masked_state[45:185] #I uncomment this for this game full_state = deepcopy(adv_obs_big) for y in range(height): for x in range(width): comp = u.find(y * width + x) if(comp == current_comp): copy_state[y, x,:] = 0 #Spoiled image masked_state[y,x,0] = 255 # Maksed Image masked_state = masked_state.astype('uint8') if( np.count_nonzero(masked_state) <500): dst = cv2.inpaint(copy_state,masked_state,3,cv2.INPAINT_TELEA) full_state[45:185] = dst #full_state[23:] = dst full_state = np.array(full_state) for y in range(height): for x in range(width): comp = u.find(y * width + x) if(comp == current_comp): full_state_m = cv2.resize(cv2.cvtColor( full_state, cv2.COLOR_RGB2GRAY ), (84, 84), interpolation=cv2.INTER_LINEAR) full_state_m = torch.tensor(full_state_m, dtype=torch.float32).div_(255) object_saliency[y,x] = (dqn.evaluate_q(full_state_m.unsqueeze(0).expand(4,-1,-1)) - dqn.evaluate_q(state)) #plt.imsave('./result/inpaint'+str(i)+'.png',full_state) plt.imsave('./saliency_outputs_objectsaliency/object_saliency'+str(ramitha_frame_number)+'.png',object_saliency,cmap='gray') ''' tv_beta = 3 learning_rate = 0.1 l1_coeff = 0.01 tv_coeff = 0.2 max_iterations = 500 original_img = deepcopy(RGBc_state) reduced_img = cv2.resize(original_img, (84, 84), interpolation=cv2.INTER_LINEAR) img = np.float32(original_img) / 255 blurred_img1 = cv2.GaussianBlur(img, (11, 11), 5) blurred_img2 = np.float32(cv2.medianBlur(original_img, 11)) / 255 blurred_img_numpy = (blurred_img1 + blurred_img2) / 2 mask_init = np.ones((28, 28), dtype=np.float32) # Convert to torch variables img_m = cv2.resize(cv2.cvtColor( img, cv2.COLOR_RGB2GRAY ), (84, 84), interpolation=cv2.INTER_LINEAR) img_m = torch.tensor(img_m, dtype=torch.float32).div_(255) blurred_m = cv2.resize(cv2.cvtColor( blurred_img2, cv2.COLOR_RGB2GRAY ), (84, 
84), interpolation=cv2.INTER_LINEAR) blurred_m = torch.tensor(blurred_m, dtype=torch.float32).div_(255) mask = numpy_to_torch(mask_init) upsample = torch.nn.UpsamplingBilinear2d(size=(84, 84)) optimizer = torch.optim.Adam([mask], lr=learning_rate) #img_m = torch.mean(img_m, 1).unsqueeze(0) logp = nn.Softmax()( ( dqn.q_full(img_m.unsqueeze(0).expand(4,-1,-1)) ) ) category = np.argmax(logp.cpu().data.numpy()) for i in range(max_iterations): upsampled_mask = upsample(mask) upsampled_mask = \ upsampled_mask.expand(1, 3, upsampled_mask.size(2), \ upsampled_mask.size(3)) # Use the mask to perturbated the input image. perturbated_input = img_m.mul(upsampled_mask) + \ blurred_m.mul(1 - upsampled_mask) noise = np.zeros((84, 84, 3), dtype=np.float32) noise = noise + cv2.randn(noise, 0, 0.2) noise = numpy_to_torch(noise) perturbated_input = perturbated_input + noise pi = np.squeeze(perturbated_input.detach().numpy()) pi = np.transpose(pi, (1, 2, 0)) pi = cv2.resize(cv2.cvtColor( pi, cv2.COLOR_RGB2GRAY ), (84, 84), interpolation=cv2.INTER_LINEAR) pi = torch.tensor(pi, dtype=torch.float32).div_(255) #perturbated_m = cv2.resize(cv2.cvtColor( perturbated_input, cv2.COLOR_RGB2GRAY ), (84, 84), interpolation=cv2.INTER_LINEAR) #perturbated_m = torch.tensor(perturbated_m, dtype=torch.float32).div_(255) #perturbated_input = torch.mean(perturbated_input, 1).unsqueeze(0) optimizer.zero_grad() logp = nn.Softmax()((dqn.q_full(pi.unsqueeze(0).expand(4,-1,-1)))) loss = l1_coeff * torch.mean(torch.abs(1 - mask)) + \ tv_coeff * tv_norm(mask, tv_beta) + logp[0, category] loss.backward() optimizer.step() # Optional: clamping seems to give better results mask.data.clamp_(0, 1) upsampled_mask = upsample(mask) mm = save(upsampled_mask, reduced_img, blurred_img_numpy,ramitha_frame_number) ''' reward_sum += reward if args.render: env.render() if done: T_rewards.append(reward_sum) break env.close() # Test Q-values over validation memory for state in val_mem: # Iterate over valid states T_Qs.append(dqn.evaluate_q(state)) avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs) if not evaluate: # Append to results rewards.append(T_rewards) Qs.append(T_Qs) # Plot _plot_line(Ts, rewards, 'Reward', path='results') _plot_line(Ts, Qs, 'Q', path='results') # Save model parameters if improved if avg_reward > best_avg_reward: best_avg_reward = avg_reward dqn.save('results') # Return average reward and Q-value return avg_reward, avg_Q
ALPHA = 0.1
GAMMA = 0.9
MAX_STEP = 30

np.random.seed(0)


def epsilon_greedy(Q, state):
    if (np.random.uniform() > 1 - EPSILON) or ((Q[state, :] == 0).all()):
        action = np.random.randint(0, 4)  # 0~3
    else:
        action = Q[state, :].argmax()
    return action


e = Env()
Q = np.zeros((e.state_num, 4))

for i in range(200):
    e = Env()
    while (e.is_end is False) and (e.step < MAX_STEP):
        action = epsilon_greedy(Q, e.present_state)
        state = e.present_state
        reward = e.interact(action)
        new_state = e.present_state
        Q[state, action] = (1 - ALPHA) * Q[state, action] + \
            ALPHA * (reward + GAMMA * Q[new_state, :].max())
        e.print_map()
        time.sleep(0.1)
    print('Episode:', i, 'Total Step:', e.step, 'Total Reward:', e.total_reward)
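# Note: EPSILON is referenced by epsilon_greedy() above but never defined in
# this excerpt. A definition such as the following (the value is a hypothetical
# default, not taken from the original source) would have to precede the
# training loop for the snippet to run standalone:
# EPSILON = 0.9  # probability of exploiting the current greedy action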
def train(args, model, optimizer=None, video_train=None): reward_avg = AverageMeter() loss_avg = AverageMeter() value_loss_avg = AverageMeter() policy_loss_avg = AverageMeter() root_dir = '/home/piaozx/VOT16' data_type = 'VOT' model.train() # save_path='dataset/Result/VOT' env = Env(seqs_path=root_dir, data_set_type=data_type, save_path='save') for video_name in video_train: actions = [] rewards = [] values = [] entropies = [] logprobs = [] # reset for new video observation1, observation2 = env.reset(video_name) img1 = ReadSingleImage(observation2) img1 = Variable(img1).cuda() hidden_prev = model.init_hidden_state( batch_size=1) # variable cuda tensor _, _, _, _, hidden_pres = model(imgs=img1, hidden_prev=hidden_prev) # for loop init parameter hidden_prev = hidden_pres observation = observation2 FLAG = 1 loss_dd = 0 i = 2 while FLAG: img = ReadSingleImage(observation) img = Variable(img).cuda() action_prob, action_logprob, action_sample, value, hidden_pres = model( imgs=img, hidden_prev=hidden_prev) entropy = -(action_logprob * action_prob).sum(1, keepdim=True) entropies.append(entropy) actions.append(action_sample.long()) # list, Variable cuda inner action_np = action_sample.data.cpu().numpy() # print('train:', action_np) # import pdb; pdb.set_trace() # print(action_prob[0, 1]) loss_dd += torch.abs(0.5 - action_prob[0, 1]).pow(2) hidden_prev = hidden_pres sample = Variable(torch.LongTensor(action_np).cuda()).unsqueeze(0) logprob = action_logprob.gather(1, sample) logprobs.append(logprob) reward, new_observation, done = env.step(action=action_np) env.show_all() print( 'train:', 'frame{%d}' % (i), 'Action:{%1d}' % action_np[0], 'rewards:{%.6f}' % reward, 'probability:{%.6f}, {%.6f}' % (action_prob.data.cpu().numpy()[0, 0], action_prob.data.cpu().numpy()[0, 1])) i += 1 rewards.append(reward) # just list values.append(value) # list, Variable cuda inner observation = new_observation if done: FLAG = 0 num_seqs = len(rewards) running_add = Variable(torch.FloatTensor([0])).cuda() value_loss = 0 policy_loss = 0 gae = torch.FloatTensor([0]).cuda() values.append(running_add) for i in reversed(range(len(rewards))): running_add = args.gamma * running_add + rewards[i] advantage = running_add - values[i] value_loss = value_loss + 0.5 * advantage.pow(2) delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data gae = gae * args.gamma * args.tau + delta_t # gae = delta_t policy_loss = policy_loss - logprobs[i] * Variable( gae) - args.entropy_coef * entropies[i] value_loss = value_loss / num_seqs policy_loss = policy_loss / num_seqs # values.append(running_add) # for i in reversed(range(len(rewards))): # running_add = args.gamma * running_add + rewards[i] # advantage = running_add - values[i] # value_loss = value_loss + 0.5 * advantage.pow(2) # policy_loss = policy_loss - logprobs[i] * advantage - args.entropy_coef * entropies[i] # # value_loss = value_loss / num_seqs # policy_loss = policy_loss/num_seqs optimizer.zero_grad() loss = args.value_loss_coef * value_loss + policy_loss loss += 0.005 * loss_dd[0] # print model.actor.fc1.weight loss.backward() # viz_ = viz.get_viz('main') # # viz_.update_plot() torch.nn.utils.clip_grad_norm(model.critic.parameters(), args.max_grad_norm) torch.nn.utils.clip_grad_norm(model.actor.parameters(), args.max_grad_norm) optimizer.step() print(video_name, 'rewards:{%.6f}' % np.mean(rewards), 'loss:{%.6f}' % loss.data[0], 'value_loss:{%6f}' % value_loss.data[0], 'policy_loss:{%.6f}' % policy_loss.data[0]) # update the loss 
        loss_avg.update(loss.data.cpu().numpy())
        value_loss_avg.update(value_loss.data.cpu().numpy())
        policy_loss_avg.update(policy_loss.data.cpu().numpy())
        reward_avg.update(np.mean(rewards))

    return reward_avg.avg, loss_avg.avg, value_loss_avg.avg, policy_loss_avg.avg
            print(max(env.vm_time))
            print(np.sum(env.vm_cost))  # TODO(hang): env.vm_cost
            if episode % 10 == 0:
                print('episode:' + str(episode) + ' steps:' + str(step) +
                      ' reward:' + str(rwd) + ' eps_greedy:' + str(dqn.epsilon))
            rewards.append(rwd)
            break


if __name__ == '__main__':
    rewards = []
    env = Env(N_VM)
    memories = Memory(MEMORY_SIZE)
    dqn = DeepQNetwork(env.n_actions, env.n_features,
                       learning_rate=0.001,
                       replace_target_iter=200,
                       e_greedy_increment=3e-5)
    run_env(EPISODES, MINI_BATCH)
    dqn.plot_cost()

    plt.plot(np.arange(len(rewards)), rewards)
    plt.plot(np.arange(len(rewards)), [138 for i in range(len(rewards))])
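# The constants below are used in __main__ but not defined in this excerpt.
# Hypothetical placeholders (values are illustrative only, not from the
# original source) that would need to be defined before the main block:
# N_VM = 10            # number of virtual machines in the scheduling env
# MEMORY_SIZE = 10000  # replay memory capacity
# EPISODES = 1000      # training episodes passed to run_env()
# MINI_BATCH = 32      # mini-batch size passed to run_env()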
def run(self, config_fp, model_name, profile_export=None, keep_data=False, given_data=None):
    print("Running evaluation of {} with given data ? {}".format(
        model_name, given_data is not None))
    # Loading Config
    config_abs_fp = os.path.join(os.path.dirname(__file__), config_fp)
    config = Box(yaml.load(open(config_abs_fp, 'r').read()))
    # Load environment
    env = Env(config=config.env, model_name=model_name)
    app_config = config.app[model_name]
    # Loading Inference Server
    inference_server = PBServer(config=app_config)
    if profile_export is not None:
        profile_export = profile_export + '/' + model_name
        if not os.path.isdir(profile_export):
            print("{} does not exist, creating one.".format(profile_export))
            pathlib.Path(profile_export).mkdir(parents=True, exist_ok=True)
    all_inference_data = []
    all_training_obj = []
    if env.fs_mode():
        all_raw_data = env.prepare_all_testing_data()
        if given_data is None:
            for _id, _example_raw_data in enumerate(all_raw_data):
                _inference_data, _training_obj = env.prepare_one_inference_data(
                    _example_raw_data)
                if keep_data:
                    all_inference_data.append(_inference_data)
                    all_training_obj.append(_training_obj)
                assert len(_inference_data) == 2
                assert len(_inference_data[0]) == len(_inference_data[1])
                if profile_export is not None:
                    _profile_export = profile_export + '/{}'.format(_id)
                else:
                    _profile_export = None
                _y_hat_se3param = inference_server.inference(
                    data=_inference_data, profile_export=_profile_export)
                env.write_report(example=_training_obj, pred=_y_hat_se3param)
        else:
            _given_inference_data, _given_training_obj = given_data
            for _id, _inference_data in enumerate(_given_inference_data):
                assert len(_inference_data) == 2
                assert len(_inference_data[0]) == len(_inference_data[1])
                if profile_export is not None:
                    _profile_export = profile_export + '/{}'.format(_id)
                else:
                    _profile_export = None
                _y_hat_se3param = inference_server.inference(
                    data=_inference_data, profile_export=_profile_export)
                env.write_report(example=_given_training_obj[_id],
                                 pred=_y_hat_se3param)
    if keep_data:
        return all_inference_data, all_training_obj
out_dir = os.path.join('results', '%s_%s_%d_%s_%d' %
                       (shape_id, args.category, args.cnt_id, primact_type, trial_id))
if os.path.exists(out_dir):
    shutil.rmtree(out_dir)
os.makedirs(out_dir)
flog = open(os.path.join(out_dir, 'log.txt'), 'w')
out_info = dict()

# set random seed
if args.random_seed is not None:
    np.random.seed(args.random_seed)
    out_info['random_seed'] = args.random_seed

# setup env
env = Env(flog=flog, show_gui=(not args.no_gui))

# setup camera
cam = Camera(env, random_position=True)
out_info['camera_metadata'] = cam.get_metadata_json()
if not args.no_gui:
    env.set_controller_camera_pose(cam.pos[0], cam.pos[1], cam.pos[2],
                                   np.pi + cam.theta, -cam.phi)

# load shape
object_urdf_fn = '../data/where2act_original_sapien_dataset/%s/mobility_vhacd.urdf' % shape_id
flog.write('object_urdf_fn: %s\n' % object_urdf_fn)
object_material = env.get_material(4, 4, 0.01)
state = 'random-closed-middle'
if np.random.random() < 0.5:
    state = 'closed'
def eval(ast, env): while True: if ast.type != "list": return eval_ast(ast, env) if len(ast.value) == 0: return ast symbol = ast.value[0].value if symbol == "def!": evaluated = eval(ast.value[2], env) env.set(ast.value[1].value, evaluated) return evaluated if symbol == "let*": new_env = Env(env) params = ast.value[1].value i = 0 while i < len(params): new_env.set(params[i].value, eval(params[i + 1], new_env)) i += 2 ast = ast.value[2] env = new_env continue if symbol == "do": evaluated = None for i in ast.value[1:-1]: eval(i, env) ast = ast.value[len(ast.value) -1] continue if symbol == "if": evaluated = None cond = eval(ast.value[1], env) if cond.type != "nil" and (cond.type != "bool" or cond.value == True): ast = ast.value[2] elif len(ast.value) > 3: ast = ast.value[3] else: ast = Val("nil", []) continue if symbol == "fn*": def func(*exprs_t): new_env = Env(env) params = ast.value[1].value exprs = list(exprs_t) i = 0 while i < len(params): if params[i].value == "&": new_env.set(params[i + 1].value, Val("list", exprs[i:])) break new_env.set(params[i].value, exprs[i]) i += 1 return eval(ast.value[2], new_env) return Val("custom_fn", { "fn": func, "ast": ast.value[2], "params": ast.value[1].value, "env": env }) new_ast = eval_ast(ast, env) fn = new_ast.value[0] if fn.type == 'custom_fn': exprs = new_ast.value[1:] ast = fn.value["ast"] params = fn.value["params"] i = 0 new_env = Env(fn.value["env"]) while i < len(params): if params[i].value == "&": new_env.set(params[i + 1].value, Val("list", exprs[i:])) break new_env.set(params[i].value, exprs[i]) i += 1 env = new_env continue return fn.value(*new_ast.value[1:])
parser.add_argument("--device", type=str, default='cpu') parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--n_imgs", type=int, default=16) parser.add_argument("--seed", default=1, type=int, help="Random seed") args = parser.parse_args() if __name__ == "__main__": torch.manual_seed(args.seed) np.random.seed(args.seed) agent = Agent(args.n_imgs) agent.init_from_save(filename=f'{args.output_dir}/{args.model_name}.pkl') agent.prep_eval() env = Env(args.n_imgs) env.batch_size = args.batch_size env.load_labels(data_path='data/train.txt') env.load_video(video_path='data/train.mp4') env.shuffle_data() env.prep_eval() criterion = nn.MSELoss() test_records = EvalRecords() for i_ep in range(1000): state, labels = env.get_data() torch_state = Variable(torch.from_numpy(state)) torch_labels = Variable(torch.from_numpy(labels)) # forward
def get_command():
    if line.linear.x == 2:
        return 0
    elif line.linear.x == -2:
        return 1
    elif line.angular.z == 2:
        return 2
    else:
        return 3


global line
line = Twist()

if __name__ == '__main__':
    rospy.init_node('listener', anonymous=True)
    rate = rospy.Rate(200)
    pub = rospy.Publisher('/gazebo/set_model_state', ModelState, queue_size=1)
    multi_env = Env()
    while (True):
        listener()
        tele_action = get_command()
        multi_env.step(tele_action)
        for car in multi_env.Cars:
            talker(car.modelstate)
        for car in multi_env.Waiting_cars:
            talker(car.modelstate)
        rate.sleep()
        safe_R = int(value)
    elif op in ("-e", "--seed"):
        seed = int(value)
    elif op in ("-a", "--astar"):
        use_astar = True
    elif op in ("-d", "--dstar"):
        use_dstar = True

if use_astar is None and use_dstar is None:
    print("Error: You need to select one algorithm to plan path: Astar or Dstar!")
    sys.exit()
else:
    print('Space: ', space_boundary)
    print('Agents number: ', agents_num)
    print('Safe Radius: ', safe_R)
    print('Seed: ', seed)

    env = Env(space_boundary=space_boundary, agents_num=agents_num, seed=seed, safe_R=safe_R)

    if use_astar:
        print('Algorithm: Astar')
        astar = Astar(env.agents_pos[:, 0], env.agents_targ[:, 0],
                      env.agents_pos, env.space_boundary, env.walk_dirs)
        pathData = astar.search()
        pathsData = {"Plan Path": pathData}
        draw_path(env.space_boundary, env.agents_pos, pathsData, env.agents_targ,
                  title="Path Plan with A*")
    elif use_dstar:
        print('Algorithm: Dstar')
        space_map = Map(env.space_boundary, env.walk_dirs)
        dstar = Dstar(space_map, env.space_boundary)
        paths = dstar.search(env.agents_pos[:, 0], env.agents_targ[:, 0], env.agents_pos)
        pathsData = {"Path Without Obstacle": paths[0], "Path With Obstacle": paths[1]}
        draw_path(env.space_boundary, env.agents_pos, pathsData,
def test(args, T, dqn, val_mem, metrics, results_dir, evaluate=False):
    env = Env(args)
    env.eval()
    metrics['steps'].append(T)
    # for val in metrics:
    #     print(val, metrics[val])
    T_rewards, T_Qs = [], []
    T_hidden_reward = []

    # Test performance over several episodes
    done = True
    for _ in range(args.evaluation_episodes):
        # print("outer loop", flush=True)
        while True:
            if done:
                state, reward_sum, done = env.reset(), 0, False
                # print("Infinite Loop?", flush=True)

            action = dqn.act_e_greedy(state)  # Choose an action ε-greedily
            state, reward, done = env.step(action)  # Step
            reward_sum += reward
            if args.render:
                env.render()

            if done:
                T_rewards.append(reward_sum)
                T_hidden_reward.append(env.grid.get_last_performance())
                break
    # print("Point 8", flush=True)
    env.close()
    # print("Point 9", flush=True)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))

    avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    avg_hidden_reward = sum(T_hidden_reward) / len(T_hidden_reward)
    if not evaluate:
        # Save model parameters if improved
        if avg_reward > metrics['best_avg_reward']:
            metrics['best_avg_reward'] = avg_reward
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        metrics['hidden'].append(T_hidden_reward)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot
        _plot_line(metrics['steps'], metrics['rewards'], 'Reward', path=results_dir)
        _plot_line(metrics['steps'], metrics['Qs'], 'Q', path=results_dir)
        _plot_line(metrics['steps'], metrics['hidden'], 'Hidden Reward', path=results_dir)

    # The timesteps that it hit the water
    # print(getRewardHistory())
    # Plot times in water vs steps
    xaxis = getRewardHistory()
    yaxis = [i for i in range(1, len(xaxis) + 1)]
    if (len(xaxis) > 0):
        _plot_water(xaxis, yaxis, "Steps vs Water", path=results_dir)
    # Plot ...
    printTerminations()

    # Return average reward and Q-value
    return avg_reward, avg_Q
def __init__(self, size=4):
    self.grid_size = size
    self.env = Env(self.grid_size)
    self.a_id = dict([(a, i) for i, a in enumerate(self.env.actions())])
    self.policy = EspionGreedyPolicy(self.env.actions(), range(self.grid_size**2))
from agent import Car
from env import Racetrack, Env
from racetrack import Generator

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_NAME = 'example'
RACETRACK_FILE = os.path.join(SCRIPT_DIR, 'racetrack.pkl')
NUMBER_OF_EPISODES_TO_LEARN = 50000
TEST = True

# ### To generate a new racetrack and save it:
# gen = Generator()
# racetrack = gen.generate()
# plt.imshow(racetrack)
# plt.show()
# with open(RACETRACK_FILE, 'wb') as ofile: pkl.dump(racetrack, ofile)

if __name__ == "__main__":
    env = Env(Racetrack(RACETRACK_FILE))
    agent = Car(env)
    if TEST:
        if agent.load_model(MODEL_NAME):
            tr = agent.play()
            print(f'test episode reward: {tr}')
    else:
        agent.learn(NUMBER_OF_EPISODES_TO_LEARN)
        agent.save(MODEL_NAME)
else:
    args.device = torch.device('cpu')
metrics = {
    'steps': [],
    'episodes': [],
    'train_rewards': [],
    'test_episodes': [],
    'test_rewards': [],
    'observation_loss': [],
    'reward_loss': [],
    'kl_loss': []
}

print("Initializing environment!")

# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)
if args.load_experience:
    D = torch.load(os.path.join(results_dir, 'experience.pth'))
    metrics['steps'], metrics['episodes'] = [D.steps] * D.episodes, list(
        range(1, D.episodes + 1))
else:
    D = ExperienceReplay(args.experience_size, args.symbolic_env, env.observation_size,
                         env.action_size, args.bit_depth, args.device)
    # Initialise dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
def EVAL(ast, env): while True: #print("EVAL %s" % printer._pr_str(ast)) if not types._list_Q(ast): return eval_ast(ast, env) # apply list ast = macroexpand(ast, env) if not types._list_Q(ast): return eval_ast(ast, env) if len(ast) == 0: return ast a0 = ast[0] if isinstance(a0, MalSym): a0sym = a0.value else: a0sym = u"__<*fn*>__" if u"def!" == a0sym: a1, a2 = ast[1], ast[2] res = EVAL(a2, env) return env.set(a1, res) elif u"let*" == a0sym: a1, a2 = ast[1], ast[2] let_env = Env(env) for i in range(0, len(a1), 2): let_env.set(a1[i], EVAL(a1[i + 1], let_env)) ast = a2 env = let_env # Continue loop (TCO) elif u"quote" == a0sym: return ast[1] elif u"quasiquote" == a0sym: ast = quasiquote(ast[1]) # Continue loop (TCO) elif u"defmacro!" == a0sym: func = EVAL(ast[2], env) func.ismacro = True return env.set(ast[1], func) elif u"macroexpand" == a0sym: return macroexpand(ast[1], env) elif u"do" == a0sym: if len(ast) == 0: return nil elif len(ast) > 1: eval_ast(ast.slice2(1, len(ast) - 1), env) ast = ast[-1] # Continue loop (TCO) elif u"if" == a0sym: a1, a2 = ast[1], ast[2] cond = EVAL(a1, env) if cond is nil or cond is false: if len(ast) > 3: ast = ast[3] # Continue loop (TCO) else: return nil else: ast = a2 # Continue loop (TCO) elif u"fn*" == a0sym: a1, a2 = ast[1], ast[2] return MalFunc(None, a2, env, a1, EVAL) else: el = eval_ast(ast, env) f = el.values[0] if isinstance(f, MalFunc): if f.ast: ast = f.ast env = f.gen_env(el.rest()) # Continue loop (TCO) else: return f.apply(el.rest()) else: raise Exception("%s is not callable" % f)
import reader
import printer
from env import Env


class EvalException(Exception):
    """Won't cause the repl to crash, but aborts reading the current form."""
    pass


def s(name):
    "Return a MalSymbol with the given name."
    return reader.MalSymbol(name)


REPL_ENV = Env(None)
REPL_ENV.set(s('+'), lambda a, b: a + b)
REPL_ENV.set(s('-'), lambda a, b: a - b)
REPL_ENV.set(s('*'), lambda a, b: a * b)
REPL_ENV.set(s('/'), lambda a, b: int(a / b))


def eval_ast(form, env):
    if isinstance(form, reader.MalSymbol):
        result = env.get(form)
        if result is None:
            raise EvalException('could not find symbol: ' + form.name)
        return result
    elif isinstance(form, reader.MalList):
        return [EVAL(x, env) for x in form.value]
    elif isinstance(form, list):
def fn(args: List[MalExpression]) -> MalExpression:
    f_ast = raw_ast
    f_env = Env(outer=env, binds=raw_params.native(), exprs=args)
    return EVAL(f_ast, f_env)
N = 1
lrate = 1e-02

batch_size = 1
input_size = 100
net_size = 10
layer_dims = [net_size, net_size]
num_of_layers = len(layer_dims)

tau_syn = dt_ms / 10.0
tau_mem = dt_ms / 10.0
tau_refr = dt_ms / 2.0
tau_learn = dt_ms / 100.0
amp_refr = 50.0

env = Env("simple_test")

_GLMStateTuple = collections.namedtuple(
    "GLMStateTuple", ("u", "s", "r", "spikes", "dW", "log_ll"))
_GLMOutputTuple = collections.namedtuple(
    "GLMOutputTuple", ("input", "target", "a"))


class GLMStateTuple(_GLMStateTuple):
    __slots__ = ()


class GLMOutputTuple(_GLMOutputTuple):
    __slots__ = ()


def safe_log(v):
# import torchvision.transforms as T
import sys
import argparse

from argument import get_args
args = get_args('DQN')

# args.game = 'MountainCar-v0'
# args.max_step = 200
# args.action_space = 3
# args.state_space = 2
# args.memory_capacity = 1000
args.learn_start = 1000
# args.render = True

from env import Env
env = Env(args)

from memory import ReplayMemory
memory = ReplayMemory(args)
# args.memory_capacity = 1000
# args.learn_start = 1000
# args.render = True

from agent import Agent
agent = Agent(args)

print(args.cuda)

""" define test function """
from plot import _plot_line
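# The excerpt stops at the "define test function" marker above. A minimal
# sketch of what such an evaluation loop could look like, assuming the Env and
# Agent interfaces implied by the setup code (env.reset(), env.step(),
# agent.act()); the function name, episode count, and step signature are
# illustrative assumptions, not taken from the original source.
def test_agent(n_episodes=5):
    returns = []
    for _ in range(n_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            action = agent.act(state)              # greedy action from the agent
            state, reward, done = env.step(action)  # assumed 3-tuple step API
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)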
from env import Env
from algorithm import DP
import numpy as np

if __name__ == '__main__':
    env = Env(np.array([3, 1]))
    learner = DP(env, 1, 1e-4)
    learner.run()
    print(learner.values[1:-1, 1:-1])
    print(learner.env.policy[1:-1, 1:-1])
        return -2.0 + PENALTY, dist_
    if dist_ < dist:
        if dist_ < DIST:
            return 100.0, dist_
        return 1.0 + PENALTY, dist_
    if dist_ > dist:
        return -1.0 + PENALTY, dist_
    return 0.0 + PENALTY, dist_


if __name__ == "__main__":
    # maze game
    env = Env()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.0001,
                      reward_decay=0.9,
                      e_greedy=0.75,
                      replace_target_iter=2000,
                      memory_size=MEMORYCAPACITY,
                      batch_size=64
                      # output_graph=True
                      )
    RL.restore_model()
    for episode in range(EPS):
        env.build_map()
        value = 0
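# PENALTY, DIST, MEMORYCAPACITY and EPS are referenced above but not defined in
# this excerpt. Hypothetical placeholders (values are illustrative only, not
# from the original source) that would need to precede their first use:
# PENALTY = -0.1          # small per-step shaping penalty
# DIST = 1.0              # distance threshold treated as "goal reached"
# MEMORYCAPACITY = 10000  # replay memory size for DeepQNetwork
# EPS = 500               # number of evaluation episodes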
def base_env():
    return Env(defaults=lang.builtin_fn_vals)
    else:
        el = eval_ast(ast, env)
        f = el.values[0]
        if isinstance(f, MalFunc):
            return f.apply(el.values[1:])
        else:
            raise Exception("%s is not callable" % f)


# print
def PRINT(exp):
    return printer._pr_str(exp)


# repl
repl_env = Env()


def REP(str, env):
    return PRINT(EVAL(READ(str), env))


def plus(args):
    a, b = args[0], args[1]
    assert isinstance(a, MalInt)
    assert isinstance(b, MalInt)
    return MalInt(a.value + b.value)


def minus(args):
    a, b = args[0], args[1]
torch.manual_seed(config.SEED)
torch.cuda.manual_seed(config.SEED)

# --- load data
data, meta, meta_full = datalib.load_data(config.DATA_FILE, config.META_FILE)
config.init_dataset(meta_full)

print("config =", config)
print(
    f"Using dataset {meta_full['name']} with {meta_full['samples']} samples and {meta_full['classes']} classes."
)

data_trn, data_val, data_tst = datalib.split(data, config.DATASEED)

net = Net(meta).to(config.DEVICE)
env = Env(data_trn, meta)
agent = Agent(env, net, meta)

log_trn = Log(data_trn, net, meta)
log_val = Log(data_val, net, meta)
log_tst = Log(data_tst, net, meta)

print(net)

fps = utils.Fps()
fps.start()


def set_lr(ep_steps):
    ep = ep_steps // (config.EPOCH_STEPS * 10)
    lr = config.OPT_LR * (config.OPT_LR_FACTOR**ep)