def trained_disturber(variant):
    """Evaluate a policy against a previously trained (restored) disturber
    and dump the evaluation diagnostics to CSV."""
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']
    eval_params = variant['eval_params']
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']
    build_func = get_policy(variant['algorithm_name'])
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = (env.observation_space.spaces['observation'].shape[0]
                 + env.observation_space.spaces['achieved_goal'].shape[0]
                 + env.observation_space.spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    d_dim = env_params['disturbance dim']

    policy = build_func(a_dim, s_dim, d_dim, policy_params)

    # Only the channels with a non-zero magnitude are disturbed.
    disturbance_chanel_list = np.nonzero(
        disturber_params['disturbance_magnitude'])[0]
    disturber_params['disturbance_chanel_list'] = disturbance_chanel_list
    disturber = Disturber(d_dim, s_dim, disturber_params)
    disturber.restore(eval_params['path'])

    log_path = variant['log_path'] + '/eval/trained_disturber'
    variant['eval_params'].update({'magnitude': 0})
    logger.configure(dir=log_path, format_strs=['csv'])
    diagnostic_dict, _ = evaluation(variant, env, policy, disturber)

    string_to_print = []
    for key in diagnostic_dict.keys():
        string_to_print.extend(
            [key, ':', str(round(diagnostic_dict[key], 2)), '|'])
    print(''.join(string_to_print))

    for key in diagnostic_dict.keys():
        logger.logkv(key, diagnostic_dict[key])
    logger.dumpkvs()
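# ---------------------------------------------------------------------------
# Minimal sketch of the `variant` dict that `trained_disturber` above expects.
# The key names are exactly the ones the function reads; every value here is a
# hypothetical placeholder (env id, algorithm name, paths), not a default of
# this repository, and the nested dicts omit whatever extra keys the policy
# builder, `Disturber`, and `evaluation` additionally require.
# ---------------------------------------------------------------------------
EXAMPLE_EVAL_VARIANT = {
    'env_name': 'oscillator',                    # placeholder, resolved by get_env_from_name
    'algorithm_name': 'RLAC',                    # placeholder, resolved by get_policy
    'log_path': './log/oscillator',              # CSVs land in <log_path>/eval/trained_disturber
    'env_params': {'disturbance dim': 2},        # d_dim passed to the policy builder
    'alg_params': {},                            # forwarded untouched to build_func
    'disturber_params': {
        # Non-zero entries select the disturbed channels.
        'disturbance_magnitude': np.array([0.1, 0.0]),
    },
    'eval_params': {'path': './log/oscillator/disturber'},  # checkpoint for Disturber.restore
}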
def train_v2(variant):
    """Alternately train the control policy and the adversarial disturber.

    Each outer epoch first runs `iter_of_actor_train_per_epoch` episodes in
    which the policy is updated against the current disturber, and then
    `iter_of_disturber_train_per_epoch` episodes in which the disturber is
    updated against the current policy.  Training stops once `global_step`
    exceeds `max_global_steps`.
    """
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_build_fun = get_policy(variant['algorithm_name'])
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']
    iter_of_actor_train = policy_params['iter_of_actor_train_per_epoch']
    iter_of_disturber_train = policy_params['iter_of_disturber_train_per_epoch']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = (policy_params['lr_a'], policy_params['lr_c'],
                        policy_params['lr_l'])
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = (env.observation_space.spaces['observation'].shape[0]
                 + env.observation_space.spaces['achieved_goal'].shape[0]
                 + env.observation_space.spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    # The disturber only acts on the channels with a non-zero magnitude.
    d_dim = np.nonzero(disturber_params['disturbance_magnitude'])[0].shape[0]
    disturbance_chanel_list = np.nonzero(
        disturber_params['disturbance_magnitude'])[0]
    disturber_params['disturbance_chanel_list'] = disturbance_chanel_list

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fun(a_dim, s_dim, d_dim, policy_params)
    disturber = Disturber(d_dim, s_dim, disturber_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': d_dim,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training settings
    t1 = time.time()
    global_step = 0
    last_actor_training_paths = deque(maxlen=store_last_n_paths)
    last_disturber_training_paths = deque(maxlen=store_last_n_paths)
    actor_training_started = False
    disturber_training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('ita', policy_params['ita'])
    logger.logkv('energy_decay_rate', disturber_params['energy_decay_rate'])
    logger.logkv('magnitude', disturber_params['disturbance_magnitude'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for epoch in range(max_episodes):
        # --- Actor training phase ---
        for i in range(iter_of_actor_train):
            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'a_loss': [],
                'alpha': [],
                'lyapunov_error': [],
                'labda': [],
                'critic_error': [],
                'entropy': [],
            }
            if global_step > max_global_steps:
                break
            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                # Rescale the squashed [-1, 1] action into the env's bounds.
                action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)

                # Run in simulator
                # disturbance = np.array([0])
                disturbance_input = np.zeros([a_dim + s_dim])
                disturbance_input[disturbance_chanel_list] = disturbance
                s_, r, done, info = env.step(action,
                                             process_noise=disturbance_input)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if actor_training_started:
                    global_step += 1
                if j == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

                # Learn
                if pool.memory_pointer > min_memory_size \
                        and global_step % steps_per_cycle == 0:
                    actor_training_started = True
                    for _ in range(train_per_cycle):
                        batch = pool.sample(batch_size)
                        labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = \
                            policy.learn(lr_a_now, lr_c_now, lr_l_now, batch)

                if actor_training_started:
                    current_path['rewards'].append(r)
                    current_path['labda'].append(labda)
                    current_path['critic_error'].append(min(c1_loss, c2_loss))
                    current_path['lyapunov_error'].append(l_loss)
                    current_path['alpha'].append(alpha)
                    current_path['entropy'].append(entropy)
                    current_path['a_loss'].append(a_loss)
                    current_path['disturbance_mag'].append(
                        np.linalg.norm(disturbance))

                if actor_training_started \
                        and global_step % evaluation_frequency == 0 \
                        and global_step > 0:
                    logger.logkv('total_timesteps', global_step)
                    training_diagnostics = evaluate_training_rollouts(
                        last_actor_training_paths)
                    if training_diagnostics is not None:
                        for key in training_diagnostics.keys():
                            logger.logkv(key, training_diagnostics[key])
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)
                        string_to_print = [
                            'Actor training!time_step:', str(global_step), '|'
                        ]
                        for key in training_diagnostics.keys():
                            string_to_print.extend([
                                key, ':',
                                str(round(training_diagnostics[key], 2)), '|'
                            ])
                        print(''.join(string_to_print))
                        logger.dumpkvs()

                # Update the state
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if actor_training_started:
                        last_actor_training_paths.appendleft(current_path)
                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                    break

        if global_step > max_global_steps:
            break

        # --- Disturber training phase ---
        for i in range(iter_of_disturber_train):
            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'd_loss': [],
                'alpha': [],
                'disturber_critic_error': [],
                'entropy': [],
            }
            if global_step > max_global_steps:
                break
            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)

                # Run in simulator
                # disturbance = np.array([0])
                s_, r, done, info = env.step(action, disturbance)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if disturber_training_started:
                    global_step += 1
                if j == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

                # Learn
                if pool.memory_pointer > min_memory_size \
                        and global_step % disturber_params['steps_per_cycle'] == 0:
                    disturber_training_started = True
                    for _ in range(disturber_params['train_per_cycle']):
                        batch = pool.sample(disturber_params['batch_size'])
                        d_alpha, d_c1_loss, d_c2_loss, d_entropy, d_loss = \
                            disturber.learn(lr_a_now, lr_c_now, batch)
                        # d_c1_loss = 0
                        # d_c2_loss = 0
                        # d_loss = 0

                if disturber_training_started:
                    current_path['rewards'].append(r)
                    current_path['disturber_critic_error'].append(
                        min(d_c1_loss, d_c2_loss))
                    current_path['d_loss'].append(d_loss)
                    current_path['alpha'].append(d_alpha)
                    current_path['entropy'].append(d_entropy)
                    current_path['disturbance_mag'].append(
                        np.linalg.norm(disturbance))

                if disturber_training_started \
                        and global_step % evaluation_frequency == 0 \
                        and global_step > 0:
                    logger.logkv('total_timesteps', global_step)
                    training_diagnostics = evaluate_training_rollouts(
                        last_disturber_training_paths)
                    if training_diagnostics is not None:
                        for key in training_diagnostics.keys():
                            logger.logkv(key, training_diagnostics[key])
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)
                        string_to_print = [
                            'Disturber training!time_step:', str(global_step),
                            '|'
                        ]
                        for key in training_diagnostics.keys():
                            string_to_print.extend([
                                key, ':',
                                str(round(training_diagnostics[key], 2)), '|'
                            ])
                        print(''.join(string_to_print))
                        logger.dumpkvs()

                # Update the state
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if disturber_training_started:
                        last_disturber_training_paths.appendleft(current_path)
                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                    break

        if global_step > max_global_steps:
            break

    policy.save_result(log_path)
    disturber.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
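# ---------------------------------------------------------------------------
# Sketch of the hyperparameter layout `train_v2` reads.  The key names are
# taken directly from the dictionary accesses above; the numeric values are
# illustrative guesses, not the tuned defaults shipped with this repository.
# ---------------------------------------------------------------------------
EXAMPLE_TRAIN_VARIANT = {
    'env_name': 'oscillator',                # placeholder environment id
    'algorithm_name': 'RLAC',                # placeholder algorithm name
    'log_path': './log/oscillator',
    'store_last_n_paths': 10,                # rolling window for training diagnostics
    'evaluation_frequency': 2048,            # global steps between diagnostic dumps
    'env_params': {
        'max_episodes': int(1e5),
        'max_ep_steps': 512,
        'max_global_steps': int(1e6),
        'eval_render': False,
        'disturbance dim': 2,
    },
    'alg_params': {
        'iter_of_actor_train_per_epoch': 50,
        'iter_of_disturber_train_per_epoch': 50,
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 50,
        'batch_size': 256,
        'lr_a': 1e-4,
        'lr_c': 3e-4,
        'lr_l': 3e-4,
        'memory_capacity': int(1e6),
        'finite_horizon': False,
        # 'value_horizon': 5,                # optional; defaults to None above
        'tau': 5e-3,
        'ita': 1.0,
        'alpha3': 0.1,
    },
    'disturber_params': {
        'disturbance_magnitude': np.array([0.1, 0.0]),
        'energy_decay_rate': 0.5,
        'steps_per_cycle': 100,
        'train_per_cycle': 50,
        'batch_size': 256,
        # ...plus whatever else the Disturber constructor expects.
    },
}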