def train(variant):
    """Off-policy trainer with an optional Lyapunov 'edge' replay memory for
    transitions near the safety boundary; also dumps visited states to data_all.mat."""
    s_save = []
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])

    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # For analysis
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    for i in range(max_episodes):
        ep_reward = 0
        l_r = 0
        current_path = {
            'rewards': [],
            'l_rewards': [],
            'violation': [],
        }
        [current_path.update({key: []}) for key in policy.diag_names]
        if global_step > max_global_steps:
            break

        s = env.reset()
        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s)
            # Map the squashed action in [-1, 1] to the environment's action box
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            if training_started:
                global_step += 1
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.
            violation_of_constraint = info['violation_of_constraint']

            # Store s, a, s_next and the rewards for DDPG-style off-policy learning
            policy.store_transition(s, a, r, l_r, terminal, s_)
            s_save.append(s_)
            # NOTE: this rewrites data_all.mat on every step, which gets expensive for long runs
            sio.savemat('data_all.mat', {'s': s_save, })

            # If the state is close to the safety boundary, also store the transition
            # in the edge memory
            # if policy.use_lyapunov is True and np.abs(s[0]) > env.cons_pos:
            #     # or np.abs(s[2]) > env.theta_threshold_radians*0.8
            if policy.use_lyapunov is True and judge_safety_func(
                    s_, r, done, info):  # or np.abs(s[2]) > env.theta_threshold_radians*0.8
                policy.store_edge_transition(s, a, r, l_r, terminal, s_)

            # Learn (the update schedule is identical with or without the Lyapunov critic,
            # so the original duplicated if/else branches are collapsed here)
            if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                # Decay the action randomness
                training_started = True
                for _ in range(train_per_cycle):
                    train_diagnotic = policy.learn(lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['violation'].append(violation_of_constraint)
                [current_path[key].append(value)
                 for key, value in zip(policy.diag_names, train_diagnotic)]

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    # [diagnotics[key].append(diagnotic[key]) for key in diagnotic.keys()]
                    print(
                        'training_step:', global_step,
                        'average eval reward:', diagnotic['return-average'],
                        'average eval lreward:', diagnotic['lreturn-average'],
                        'average eval violations:', diagnotic['violation-avg'],
                        'average length:', diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eplrewmean', diagnotic['lreturn-average'])
                    logger.logkv('eval_eplenmean', diagnotic['episode-length-avg'])
                    logger.logkv('eval_violation_times', diagnotic['violation-avg'])
                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                if training_diagnotic is not None:
                    # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]
                    logger.logkv('eprewmean', training_diagnotic['rewards'])
                    logger.logkv('eplrewmean', training_diagnotic['l_rewards'])
                    logger.logkv('eplenmean', training_diagnotic['len'])
                    logger.logkv('end_cost', training_diagnotic['end_cost'])
                    [logger.logkv(key, training_diagnotic[key])
                     for key in policy.diag_names]
                    logger.logkv('violation_times', training_diagnotic['violation'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:', global_step,
                        'average reward:', round(training_diagnotic['rewards'], 2),
                        'average lreward:', round(training_diagnotic['l_rewards'], 2),
                        'average violations:', training_diagnotic['violation'],
                        'end cost:', round(training_diagnotic['end_cost'], 2),
                        'average length:', round(training_diagnotic['len'], 1),
                        'lyapunov error:', round(training_diagnotic['lyapunov_error'], 6),
                        'critic1 error:', round(training_diagnotic['critic1_error'], 6),
                        'critic2 error:', round(training_diagnotic['critic2_error'], 6),
                        'policy_loss:', round(training_diagnotic['policy_loss'], 6),
                        'alpha:', round(training_diagnotic['alpha'], 6),
                        'lambda:', round(training_diagnotic['labda'], 6),
                        'entropy:', round(training_diagnotic['entropy'], 6),
                    )  # 'max_grad:', round(training_diagnotic['max_grad'], 6)
                logger.dumpkvs()

            # State update
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
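# ---------------------------------------------------------------------------
# Hypothetical usage sketch (added for clarity, not part of the original code):
# the keys below mirror exactly what the train() above reads from `variant`;
# every value is a placeholder, and the env/algorithm names are assumptions.
# get_safety_constraint_func(variant) may require additional keys.
#
# example_variant = {
#     'env_name': 'CartPolecons-v0',          # assumed env id
#     'evaluate': False,
#     'env_params': {'max_episodes': 10000, 'max_ep_steps': 250,
#                    'max_global_steps': 1000000, 'eval_render': False},
#     'store_last_n_paths': 10,
#     'evaluation_frequency': 2048,
#     'num_of_paths': 10,
#     'algorithm_name': 'LAC',                # assumed algorithm key
#     'alg_params': {'min_memory_size': 1000, 'steps_per_cycle': 100,
#                    'train_per_cycle': 50, 'lr_a': 1e-4, 'lr_c': 3e-4,
#                    'lr_l': 3e-4, 'tau': 5e-3, 'alpha3': 0.2,
#                    'batch_size': 256},
#     'log_path': './log/example_run',
# }
# train(example_variant)
# ---------------------------------------------------------------------------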
def train(variant):
    """Off-policy trainer that also supports goal-based Fetch/Hand environments
    by flattening their dict observations, and logs per-update diagnostics."""
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])

    if 'Fetch' in env_name or 'Hand' in env_name:
        # Goal-based environments return dict observations; use the flattened size.
        s_dim = env.observation_space.spaces['observation'].shape[0] \
            + env.observation_space.spaces['achieved_goal'].shape[0] \
            + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # For analysis
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    for i in range(max_episodes):
        ep_reward = 0
        l_r = 0
        current_path = {
            'rewards': [],
            'l_rewards': [],
            'l_error': [],
            'critic1_error': [],
            'critic2_error': [],
            'alpha': [],
            'lambda': [],
            'entropy': [],
            'a_loss': [],
            'violation': [],
        }
        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s, True)
            # Map the squashed action in [-1, 1] to the environment's action box
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True
            if training_started:
                global_step += 1
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.
            violation_of_constraint = info['violation_of_constraint']

            # Store s, a, s_next and the rewards for DDPG-style off-policy learning
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # Learn
            if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                # Decay the action randomness
                training_started = True
                for _ in range(train_per_cycle):
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['l_error'].append(l_loss)
                current_path['critic1_error'].append(c1_loss)
                current_path['critic2_error'].append(c2_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['violation'].append(violation_of_constraint)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    # [diagnotics[key].append(diagnotic[key]) for key in diagnotic.keys()]
                    print(
                        'training_step:', global_step,
                        'average eval reward:', diagnotic['return-average'],
                        'average eval lreward:', diagnotic['lreturn-average'],
                        'average eval violations:', diagnotic['violation-avg'],
                        'average length:', diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eplrewmean', diagnotic['lreturn-average'])
                    logger.logkv('eval_eplenmean', diagnotic['episode-length-avg'])
                    logger.logkv('eval_violation_times', diagnotic['violation-avg'])
                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                if training_diagnotic is not None:
                    # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]
                    logger.logkv('eprewmean', training_diagnotic['train-return-average'])
                    logger.logkv('eplrewmean', training_diagnotic['train-lreturn-average'])
                    logger.logkv('eplenmean', training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lyapunov_lambda', training_diagnotic['train-lambda-avg'])
                    logger.logkv('alpha', training_diagnotic['train-alpha-avg'])
                    logger.logkv('entropy', training_diagnotic['train-entropy-avg'])
                    logger.logkv('critic1 error', training_diagnotic['train-critic1-error-avg'])
                    logger.logkv('critic2 error', training_diagnotic['train-critic2-error-avg'])
                    logger.logkv('lyapunov error', training_diagnotic['train-lyapunov-error-avg'])
                    logger.logkv('policy_loss', training_diagnotic['train-a-loss-avg'])
                    logger.logkv('average_cost',
                                 training_diagnotic['train-return-average'] /
                                 training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:', global_step,
                        'average reward:', round(training_diagnotic['train-return-average'], 2),
                        'average lreward:', round(training_diagnotic['train-lreturn-average'], 2),
                        'average violations:', training_diagnotic['train-violation-avg'],
                        'average length:', round(training_diagnotic['train-episode-length-avg'], 1),
                        'lyapunov error:', round(training_diagnotic['train-lyapunov-error-avg'], 6),
                        'critic1 error:', round(training_diagnotic['train-critic1-error-avg'], 6),
                        'critic2 error:', round(training_diagnotic['train-critic2-error-avg'], 6),
                        'policy_loss:', round(training_diagnotic['train-a-loss-avg'], 6),
                        'alpha:', round(training_diagnotic['train-alpha-avg'], 6),
                        'lambda:', round(training_diagnotic['train-lambda-avg'], 6),
                        'entropy:', round(training_diagnotic['train-entropy-avg'], 6),
                    )
                logger.dumpkvs()

            # State update
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
def eval(variant):
    """Evaluate a restored policy. For CartPole environments, an impulse
    disturbance of magnitude env_params['impulse_mag'] is applied at step 100."""
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    root_path = variant['log_path']

    if 'Fetch' in env_name or 'Hand' in env_name:
        # Goal-based environments return dict observations; use the flattened size.
        s_dim = env.observation_space.spaces['observation'].shape[0] \
            + env.observation_space.spaces['achieved_goal'].shape[0] \
            + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)

    if 'CartPole' in env_name:
        mag = env_params['impulse_mag']
    else:
        # Impulse disturbances are only defined for CartPole; keep `mag` bound so the
        # summary print below does not fail for other environments.
        mag = None

    # For analysis
    Render = env_params['eval_render']

    # Evaluation setting
    t1 = time.time()
    die_count = 0
    for i in range(variant['num_of_trials']):
        log_path = variant['log_path'] + '/eval/' + str(0)
        policy.restore(variant['log_path'] + '/' + str(0))
        logger.configure(dir=log_path, format_strs=['csv'])

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s, True)
            # Map the squashed action in [-1, 1] to the environment's action box
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            if j == 100 and 'CartPole' in env_name:
                impulse = mag * np.sign(s[0])
                # print('impulse coming:', impulse)
                # Run in simulator
                s_, r, done, info = env.step(action, impulse=impulse)
            else:
                s_, r, done, info = env.step(action)
            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True
            logger.logkv('rewards', r)
            logger.logkv('timestep', j)
            logger.dumpkvs()
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True
            s = s_
            if done:
                if j < 200:
                    die_count += 1
                print('episode:', i, 'death:', die_count, 'mag:', mag)
                break

    print('Running time: ', time.time() - t1)
    return
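# ---------------------------------------------------------------------------
# Clarifying sketch (hypothetical helper, not called by the functions above):
# train() and eval() both map a squashed action a in [-1, 1] to the
# environment's action box with the same affine transform, written out here.
def scale_action(a, a_lowerbound, a_upperbound):
    """Map an action in [-1, 1] (elementwise) onto [a_lowerbound, a_upperbound]."""
    return a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
# ---------------------------------------------------------------------------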
def train(variant):
    """On-policy trainer: collects policy.N rollouts per iteration, computes
    GAE(lambda) advantages, and performs a clipped policy/critic update."""
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    evaluation_env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    batch_size = policy_params['batch_size']
    lr_c = policy_params['lr_c']
    cliprange = policy_params['cliprange']
    cliprangenow = cliprange
    lr_c_now = lr_c  # learning rate for critic
    gamma = policy_params['gamma']
    gae_lamda = policy_params['gae_lamda']

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=policy_params['output_format'])
    logger.logkv('safety_threshold', policy_params['safety_threshold'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', batch_size)

    if 'Fetch' in env_name or 'Hand' in env_name:
        # Goal-based environments return dict observations; use the flattened size.
        s_dim = env.observation_space.spaces['observation'].shape[0] \
            + env.observation_space.spaces['achieved_goal'].shape[0] \
            + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=policy.N)

    for j in range(max_global_steps):
        if global_step > max_global_steps:
            break
        mb_obs, mb_obs_, mb_rewards, mb_actions, mb_values, mb_terminals, mb_t = \
            [], [], [], [], [], [], []

        # Collect policy.N on-policy rollouts
        for n in range(policy.N):
            current_path = {
                'rewards': [],
                'obs': [],
                'obs_': [],
                'done': [],
                'value': [],
                't': [],
                'action': [],
            }

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            # For n in range number of steps
            for t in range(max_ep_steps):
                # Given observations, get action value and neglogpacs
                # We already have self.obs because Runner superclass runs self.obs[:] = env.reset() on init
                [a], [value] = policy.choose_action(s)
                action = np.tanh(a)
                # Map the squashed action in [-1, 1] to the environment's action box
                action = a_lowerbound + (action + 1.) * (a_upperbound - a_lowerbound) / 2

                # Run in simulator
                s_, r, done, info = env.step(action)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                if t == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.
                if Render:
                    env.render()

                current_path['rewards'].append(r)
                current_path['action'].append(a)
                current_path['obs'].append(s)
                current_path['obs_'].append(s_)
                current_path['done'].append(terminal)
                current_path['value'].append(value)
                current_path['t'].append(t)

                if done:
                    global_step += t + 1
                    last_training_paths.appendleft(current_path)
                    break
                else:
                    s = s_

        # mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        # mb_values = np.asarray(mb_values, dtype=s.dtype)
        # mb_l_values = np.asarray(mb_l_values, dtype=s.dtype)
        # mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        # mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        # mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        # mb_l_rewards = np.asarray(mb_l_rewards, dtype=np.float32)
        # mb_terminals = np.asarray(mb_terminals, dtype=np.float32)
        # last_value, last_l_value = policy.predict_values([s_])

        # Generalized advantage estimation (GAE) over the stored paths
        rescale = np.mean([len(path) for path in last_training_paths])
        initial_return = []
        mb_advs = []
        for path in last_training_paths:
            lastgaelam = 0
            path_advs = np.zeros_like(path['rewards'])
            path_values = path['value']
            path_next_values = path['value'][1:]
            path_next_values.append(policy.predict_values(path['obs_'][-1]))
            for t in reversed(range(len(path_values))):
                delta = path['rewards'][t] + gamma * path_next_values[t] * (
                    1 - path['done'][t]) - path_values[t]
                path_advs[t] = lastgaelam = delta + gamma * gae_lamda * (
                    1 - path['done'][t]) * lastgaelam
            path_returns = path_advs + path_values
            initial_return.append(path_returns[0])

            mb_advs.extend(path_advs)
            mb_obs.extend(path['obs'])
            mb_obs_.extend(path['obs_'])
            mb_values.extend(path['value'])
            mb_terminals.extend(path['done'])
            mb_t.extend(path['t'])
            mb_actions.extend(path['action'])

        initial_return = np.asarray(initial_return, dtype=np.float32)
        mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        mb_values = np.asarray(mb_values, dtype=s.dtype)
        mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_terminals = np.asarray(mb_terminals, dtype=np.float32)
        mb_advs = np.asarray(mb_advs, dtype=np.float32)
        mb_t = np.asarray(mb_t, dtype=np.float32)
        mb_returns = mb_advs + mb_values

        mblossvals = []
        inds = np.arange(len(mb_advs), dtype=int)
        initial_return = np.mean(initial_return)
        # Randomize the indexes
        np.random.shuffle(inds)
        # 0 to batch_size with batch_train_size step
        # if sum(current_path['l_rewards']) > 0:
        #     policy.ALPHA3 = min(policy.ALPHA3 * 1.5, policy_params['alpha3'])
        # else:
        #     policy.ALPHA3 = min(policy.ALPHA3 * 1.01, policy_params['alpha3'])
        slices = (arr[inds] for arr in (mb_obs, mb_obs_, mb_returns, mb_advs,
                                        mb_actions, mb_values, mb_t))
        # print(**slices)
        mblossvals.append(
            policy.update(*slices, initial_return, cliprangenow, lr_c_now, rescale))
        mblossvals = np.mean(mblossvals, axis=0)

        # Learning rate and clip range decay
        frac = 1.0 - (global_step - 1.0) / max_global_steps
        cliprangenow = cliprange * frac
        lr_c_now = lr_c * frac  # learning rate for critic
        # lr_l_now = lr_l * frac  # learning rate for Lyapunov critic

        logger.logkv("total_timesteps", global_step)
        training_diagnotic = evaluate_training_rollouts(last_training_paths)
        if training_diagnotic is not None:
            # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]
            eval_diagnotic = training_evaluation(variant, evaluation_env, policy)
            [logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys()]
            training_diagnotic.pop('return')
            [logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys()]
            logger.logkv('lr_c', lr_c_now)
            [logger.logkv(name, value)
             for name, value in zip(policy.diagnosis_names, mblossvals)]

            string_to_print = ['time_step:', str(global_step), '|']
            [string_to_print.extend([key, ':', str(eval_diagnotic[key]), '|'])
             for key in eval_diagnotic.keys()]
            [string_to_print.extend([key, ':', str(round(training_diagnotic[key], 2)), '|'])
             for key in training_diagnotic.keys()]
            print(''.join(string_to_print))
        logger.dumpkvs()

    print('Running time: ', time.time() - t1)
    return
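# ---------------------------------------------------------------------------
# Reference sketch (added for clarity, not called anywhere in this file):
# a standalone GAE(lambda) helper equivalent to the inline advantage loop in
# the on-policy train() above. It assumes NumPy is imported as `np`, as in the
# rest of this module; the function name is hypothetical.
def gae_advantages(rewards, values, next_values, dones, gamma, lam):
    """Compute GAE(lambda) advantages for one trajectory.

    All arguments are equal-length sequences; dones[t] == 1. masks the
    bootstrap term when the episode terminates at step t.
    """
    advs = np.zeros(len(rewards), dtype=np.float32)
    lastgaelam = 0.
    for t in reversed(range(len(rewards))):
        nonterminal = 1. - dones[t]
        delta = rewards[t] + gamma * next_values[t] * nonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    return advs  # the corresponding value targets are advs + values
# ---------------------------------------------------------------------------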