def test(self, test_case_count=200, load_dir=None):
    self.target_net = self.target_net.eval()
    if load_dir is not None:
        self.target_net.load_state_dict(torch.load(load_dir))
    count = 0
    total_length = 0
    for _ in tqdm(range(test_case_count)):
        env = Env()
        s = env.get_current_state()
        ep_r = 0
        for i in range(4):
            x = torch.unsqueeze(torch.FloatTensor(s), 0)  # input only one sample
            root_result, leaf_result = self.target_net(x)
            root_action = torch.argmax(root_result).item()
            if root_action != 3:
                leaf_action = torch.argmax(leaf_result[root_action]).item()
                # step
                s_, r, done = env.step(root_action, leaf_action)
            else:
                find_path_result = leaf_result[3]
                find_path_source = torch.argmax(
                    find_path_result[:, :int(find_path_result.shape[1] / 2)]).item()
                find_path_target = torch.argmax(
                    find_path_result[:, int(find_path_result.shape[1] / 2):]).item()
                # step
                s_, r, done = env.step(
                    root_action, (find_path_source, find_path_target))
            ep_r += r
            s = s_
            if done:
                if ep_r > 0:
                    total_length += i
                break
        if ep_r > 0:
            count += 1
    acc = float(count) / test_case_count
    if acc > self.max_acc and load_dir is None:
        torch.save(self.target_net.state_dict(), 'models/dqn.pkl')
        self.max_acc = acc
    print("acc is: ", acc)
    if count > 0:
        # the recorded length is one step short, so add 1 back here
        print("length is: ", float(total_length) / count + 1)
def test_attack():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)

    # load adv input, by default general attack perturbation
    delta_s = np.load('param/adv_general.npy')
    if args.attack_type != 'general':
        file_path = 'param/adv_' + args.attack_type
        if args.attack_type == 'patch':
            file_path += '_' + args.patch_type
        file_path += '.npy'
        delta_s = np.load(file_path)

    # show adv
    fig = plt.figure(figsize=(8, 8))
    plt.title('Stack of ' + str(args.img_stack) +
              ' adversarial signals seen by Agent')
    plt.axis('off')
    columns, rows = args.img_stack // 2, args.img_stack // 2
    for i in range(1, columns * rows + 1):
        # denormalize while showing the image
        img = (delta_s[i - 1] + 1) * 128
        fig.add_subplot(rows, columns, i)
        plt.imshow(img, cmap='gray')
    plt.show()

    for i_ep in range(10):
        score = 0
        state = env.reset()
        for t in range(1000):
            # steps range to render attack in 1000
            attack_render = [30, 40]
            if t in np.arange(attack_render[0], attack_render[1] + 1):
                if t in attack_render:
                    s_with_ds = (state + delta_s)
                    # clip the image limits and denormalize for displaying
                    s_with_ds = np.clip(s_with_ds, -1, 0.9921875)
                    s_with_ds = (s_with_ds + 1) * 128
                    title = 'Attack Started' if t == attack_render[0] else 'Attack ended'
                    title += ' (showing first frame of 4 frames visible to policy)'
                    plt.imshow(s_with_ds[0], cmap='gray')
                    plt.axis('off')
                    plt.title(title)
                    plt.show()
                state += delta_s
            action = agent.select_action(state)
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done:
                break
        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
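# Illustrative sanity check (not repository code): the clip limit 0.9921875 and
# the "(x + 1) * 128" denormalization above are consistent with frames having
# been normalized from 8-bit gray values via gray / 128. - 1, which maps
# [0, 255] onto [-1, 0.9921875]. This assumption can be verified directly:
assert 255 / 128. - 1 == 0.9921875  # brightest pixel after normalization
assert 0 / 128. - 1 == -1.0         # darkest pixel after normalization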
def main(test=False):
    if test:
        dqn = DQN()
        dqn.test(test_case_count=10000, load_dir='models/dqn.pkl')
    else:
        dqn = DQN()
        env = Env()
        # dqn.load("models/pretrained.pkl")
        print('\nCollecting experience...')
        for i_episode in range(60000):
            s = env.reset()
            ep_r = 0
            for _count in range(4):
                root_action, leaf_action = dqn.choose_action(s)
                # take action
                s_, r, done = env.step(root_action, leaf_action)
                dqn.store_transition(s, (root_action, leaf_action), r, s_)
                ep_r += r
                if dqn.memory_counter > MEMORY_CAPACITY:
                    dqn.learn()
                if done:
                    break
                s = s_
            # print('ep_r:', ep_r)
            if i_episode % 1000 == 1:
                dqn.test()
        dqn.save('models/dqn_final_no_pretrain.pkl')
def generate_dataset_from_raw(self, raw_data_file_path):
    raw_data = np.load(raw_data_file_path)
    dataset = []
    for item in raw_data:
        state = item[0]
        init_target = state[int(len(state) / 2):]
        actions = item[1]
        env = Env(target_state=init_target)
        for action in actions:
            dataset.append({"state": state, "action": action})
            state, reward, done = env.step(action[0], action[1])
            if done:
                break
    return dataset
def generate_dataset(raw_data_file_path):
    raw_data = np.load(raw_data_file_path)
    dataset = []
    for item in raw_data:
        state = item[0]
        init_target = state[int(len(state) / 2):]
        actions = item[1]
        env = Env(target_state=init_target)
        for action in actions:
            dataset.append((state, action))
            state, reward, done = env.step(action[0], action[1])
            if done:
                break
    np.save('dataset/dataset_2.npy', dataset)
    print(dataset)
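# A minimal sketch (assumption, not repository code) of reading the file saved
# above. The entries are heterogeneous (state, action) tuples, so numpy stores
# them as an object array and loading them back likely needs allow_pickle=True.
def load_generated_dataset_sketch(path='dataset/dataset_2.npy'):
    loaded = np.load(path, allow_pickle=True)
    for state, action in loaded:
        # each record pairs an environment state with the action taken in it
        print(len(state), action)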
def play2(self, arg):
    step_for = 16
    step_bck = 15
    match = False
    for_inter_state = []
    bck_inter_state = []
    _, _, heuristic = arg
    while not match:
        env_for = Env()
        env_bck = DCAEnv()
        non_ter = False
        for _ in range(step_for):
            action = self.naive_policy(env_for, heuristic,
                                       env_for.feasible_actions)
            _, _, end = env_for.step(action)
            if end:
                non_ter = True
                break
        if non_ter:
            for_inter_state.append(np.ones((7, 7)).tolist())
        else:
            for_inter_state.append(env_for.state[:, :, 0].tolist())
        non_ter = False
        for _ in range(step_bck):
            action = self.naive_policy(env_bck, heuristic,
                                       env_bck.feasible_actions, False)
            _, _, end = env_bck.step(action)
            if end:
                non_ter = True
                break
        if non_ter:
            bck_inter_state.append(np.ones((7, 7)).tolist())
        else:
            bck_inter_state.append(env_bck.state.tolist())
        if env_for.state[:, :, 0].tolist() in bck_inter_state:
            match = True
        if env_bck.state.tolist() in for_inter_state:
            match = True
        if len(for_inter_state) == 1000:
            return 1000, False
    return len(for_inter_state), True
def run_agent():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)
    state = env.reset()

    # Prepare attack
    attack = AdvAttack(args.attack_type)
    attack.initialize_perturbation(state.shape)
    attack.load_networks()

    for i_ep in range(50):
        score = 0
        state = env.reset()
        for t in range(1000):
            action = agent.select_action(state)
            # update buffer for training the attack
            attack.update_buffer(state)
            # write to tensorboard
            input_imgs_to_net = torch.tensor(
                (attack.buffer['s'] + attack.buffer['d_s']))
            input_imgs_grid = make_grid(
                input_imgs_to_net[0].reshape(4, 1, 96, 96))
            writer.add_image('Four stack of input state with adversarial',
                             input_imgs_grid)
            writer.add_graph(attack.net, input_imgs_to_net)
            writer.close()
            # train attack
            attack.train()
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done or die:
                break
        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    time_horizon = 10

    com_pos = np.array([0.0, 0, 0.1])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape(
        (12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    state = env.reset()
    t = 0
    while t < 10:
        com_pos = env.model.com_pos
        rpy = env.model.base_rpy
        com_vel = env.model.base_vel
        base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
        init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
        init_x = init_x.reshape((-1, 1))

        delta_time_list = np.array([0.01] * time_horizon)
        foot_pos_list = np.array(
            [env.model.foot_pos_list for i in range(time_horizon + 1)])
        contact_phi_list = np.array(
            [[1, 1, 1, 1] for i in range(time_horizon + 1)])
        target_x_list = np.array([target_x for i in range(time_horizon + 1)])
        target_u_list = np.array([target_u for i in range(time_horizon)])

        action, u_list = agent.get_action(init_x, init_u_list,
                                          delta_time_list, foot_pos_list,
                                          contact_phi_list, target_x_list,
                                          target_u_list)
        init_u_list = deepcopy(u_list)

        state = env.step(action)
        # time.sleep(env.time_step)
        t += env.time_step
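# Illustrative note (not from the repository): target_u above gives every foot
# a vertical force of mass * 0.25 * 9.8, i.e. one quarter of the robot's
# weight, so the four stance forces sum to m * g and statically support the
# body. Quick arithmetic check with a hypothetical 12 kg mass:
example_mass = 12.0                       # hypothetical example value
per_leg_fz = example_mass * 0.25 * 9.8    # vertical force per foot
assert abs(4 * per_leg_fz - example_mass * 9.8) < 1e-9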
def beam_search(net, beam_size=3):
    states = []
    probs = []
    trajectories = []
    env = Env()
    for i in range(beam_size):
        states.append(deepcopy(env))
        probs.append(1.0)
        trajectories.append([])
    for _ in range(4):
        candidate_states = []
        for k in range(beam_size):
            s = states[k].get_current_state()
            x = torch.unsqueeze(torch.FloatTensor(s), 0)  # input only one sample
            actions_value = net(x)
            candidates = topk_actions(actions_value, beam_size)
            for i in range(beam_size):
                # step
                env = deepcopy(states[k])
                action = candidates[i][1]
                temp_traj = copy.copy(trajectories[k])
                temp_traj.append((states[k].get_current_state(), action))
                s_, r, done = env.step(action[0], action[1])
                new_state = env
                if r > 0:
                    return True, temp_traj
                # if (new_state, candidates[i][0] * probs[k], temp_traj) not in candidate_states:
                #     candidate_states.append((new_state, candidates[i][0] * probs[k], temp_traj))
                candidate_states.append(
                    (new_state, candidates[i][0] * probs[k], temp_traj))
        candidate_states = sorted(candidate_states,
                                  key=lambda x: x[1],
                                  reverse=True)
        for i in range(beam_size):
            states[i] = candidate_states[i][0]
            probs[i] = candidate_states[i][1]
            trajectories[i] = candidate_states[i][2]
    return False, None
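# beam_search assumes a helper topk_actions(actions_value, k) that returns the
# k best (score, (first_action, second_action)) pairs. The sketch below is an
# assumption about its behaviour, not the repository's implementation; it
# additionally takes the width of the second action dimension (n_cols) as a
# hypothetical parameter and expects actions_value to be a
# 1 x (n_rows * n_cols) tensor of action values.
def topk_actions_sketch(actions_value, k, n_cols):
    # turn raw values into probabilities so beam scores can be chained
    probabilities = torch.softmax(actions_value.flatten(), dim=0)
    scores, flat_indices = torch.topk(probabilities, k)
    candidates = []
    for score, flat_idx in zip(scores.tolist(), flat_indices.tolist()):
        # unravel the flat index into the pair passed to env.step(a[0], a[1])
        candidates.append((score, (flat_idx // n_cols, flat_idx % n_cols)))
    return candidates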
def play_greedy_game(verbose=True):
    """
    Play a Tichu game with four "greedy" players.

    Uses greedyAgent, an agent with very simple heuristic play moves:
    it always tries to win a stack except when an opponent is leading.

    Raises an Exception if 10 consecutive false moves are made. (This should
    not happen when the environment and greedyAgent are bug-free.)
    """
    agent = greedyAgent()
    env = Env(train_mode=not verbose)
    state, rewards, done, active_player = env.reset()
    conseq_active_counter = 0
    cumulative_reward = [0, 0, 0, 0]
    while True:
        my_state = state[active_player]
        action = agent.act(my_state)
        last_active = active_player
        if not env.game.players[active_player].finished:
            cumulative_reward[active_player] += rewards[active_player]
        state, rewards, done, active_player = env.step(active_player, action)
        new_active = active_player
        if last_active == new_active:
            conseq_active_counter += 1
        else:
            conseq_active_counter = 0
        if done:
            if verbose:
                print('-----')
            for i in range(4):
                cumulative_reward[i] += rewards[i]
                if verbose:
                    print('Cumulative reward of player {}: {}'.format(
                        i, cumulative_reward[i]))
            return
        if conseq_active_counter > 10:
            raise Exception(
                "Active counter exceeded. Possible infinity loop detected.")
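# Minimal usage sketch; the entry-point guard below is an illustration and is
# not necessarily present in the repository.
if __name__ == '__main__':
    # play one verbose game between four greedy players
    play_greedy_game(verbose=True)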
def collect_random_data(agent):
    env = Env()
    random_agent = RandomAgent()
    end = False
    states = []
    actions = []
    rewards = []
    data = []
    discount_G = 1.0
    G = 0.
    t = 0
    while not end:
        states.append(env.state)
        action = random_agent.select_action(env.feasible_actions)
        action_index = 4 * action[0] + action[1]
        actions.append(action_index)
        reward, _, end = env.step(action)
        rewards.append(reward)
        # discount = gamma
        # for s in range(t):
        #     values[t-s-1] += discount * reward
        #     discount = discount * gamma
        t += 1
        G += discount_G * reward
        discount_G = discount_G * agent.gamma
    R = 0.
    # evaluate state values of all states encountered in a batch to save time
    state_values = agent.net.get_value(
        np.array(states).reshape(-1, 7, 7, agent.state_channels)).reshape(-1)
    for s in range(t):
        R = rewards[t - s - 1] + agent.gamma * R
        advantage = R - state_values[t - s - 1]
        data = [
            dict({
                "state": states[t - s - 1],
                "advantage": advantage,
                "action": actions[t - s - 1],
                "critic_target": R
            })
        ] + data
    assert (G == R)
    assert (len(state_values) == len(states) == len(actions) == len(rewards)
            == t)
    # data = []
    # for s in range(len(states)-1):
    #     advantage = rewards[s] + values[s+1] - values[s]
    #     data.append(dict({"state": states[s],
    #                       "advantage": advantage,
    #                       "critic_target": values[s],
    #                       "action": actions[s]}))
    # T = len(states)-1
    # advantage = rewards[T] - values[T]  # next state value is 0 because it is terminal
    # data.append(dict({"state": states[T],
    #                   "advantage": advantage,
    #                   "critic_target": values[T],
    #                   "action": actions[T]}))
    return data
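# The assert (G == R) above relies on the identity that the forward sum
# G = sum_t gamma^t * r_t equals the backward recursion R <- r_t + gamma * R
# applied from the last reward to the first. A small self-contained check of
# that identity (the rewards and gamma below are arbitrary example values):
def _check_return_identity(rewards=(1.0, 0.0, 2.0, -1.0), gamma=0.9):
    G, discount = 0.0, 1.0
    for r in rewards:                # forward accumulation
        G += discount * r
        discount *= gamma
    R = 0.0
    for r in reversed(rewards):      # backward recursion
        R = r + gamma * R
    assert abs(G - R) < 1e-12        # equal up to floating-point rounding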
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    delta_time = 0.025
    time_horizon = 10

    com_pos = np.array([0.0, 0, 0.25])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape(
        (12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    temp_length = int(0.3 / delta_time)
    temp_contact_phi_list = ([[0, 1, 1, 0]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length +
                             [[1, 0, 0, 1]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length)
    total_contact_phi_list = np.array([[1, 1, 1, 1]] * temp_length +
                                      temp_contact_phi_list * 1000)

    state = env.reset()
    t = 0
    last_t = 0
    while t < 100:
        if last_t == 0 or t - last_t >= delta_time:
            last_t = t
            com_pos = env.model.com_pos
            print(com_pos)
            rpy = env.model.base_rpy
            com_vel = env.model.base_vel
            base_ang_vel = np.matmul(env.model.base_rot.T,
                                     env.model.base_ang_vel)
            init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
            init_x = init_x.reshape((-1, 1))

            delta_time_list = np.array([delta_time] * time_horizon)
            foot_pos_list = np.array(
                [env.model.foot_pos_list for i in range(time_horizon + 1)])
            contact_phi_list = total_contact_phi_list[:time_horizon + 1]
            total_contact_phi_list = total_contact_phi_list[1:]
            target_x_list = np.array(
                [target_x for i in range(time_horizon + 1)])
            target_u_list = np.array([target_u for i in range(time_horizon)])

            action, u_list = agent.get_action(init_x, init_u_list,
                                              delta_time_list, foot_pos_list,
                                              contact_phi_list, target_x_list,
                                              target_u_list)
            init_u_list = deepcopy(u_list)

            for leg_idx in range(4):
                if contact_phi_list[0, leg_idx] == 0.0:
                    action[leg_idx * 3:(leg_idx + 1) * 3] = [0, 0, -3.0]

        state = env.step(action, contact_phi_list[0, :])
        t += env.time_step
old_action = None
for iter_num in range(settings.TRAIN_ITERATIONS):
    action = sess.run(net.chosen_action,
                      feed_dict={net.layer_input: [state]})[0]
    # if action == old_action and action != 0:
    #     action += 1
    old_action = action
    logger.debug(
        f'Iter {iter_num}: action={action} '
        f'(avg reward={np.mean(history.get_rewards()[-100:])})')

    new_state, step_reward = env.step(action)
    history.add(state, action, step_reward, new_state)
    state = new_state

    grads = sess.run(net.gradients,
                     feed_dict={
                         net.reward_holder:
                             history.discounted_rewards(
                                 gamma=settings.REWARD_HISTORY_DISCOUNT),
                         net.action_holder: history.get_actions(),
                         net.layer_input: np.vstack(history.get_states())
                     })
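    # A hedged sketch (not repository code) of the step that usually follows in
    # this TF1-style setup: feeding the computed gradients back into placeholder
    # "holders" and running an apply-gradients op. net.gradient_holders and
    # net.update_batch are hypothetical names used only for illustration.
    apply_feed = dict(zip(net.gradient_holders, grads))  # hypothetical placeholders
    sess.run(net.update_batch, feed_dict=apply_feed)     # hypothetical apply op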