def update(self, arm, reward):
    """Update the statistics for the selected arm.

    :param arm: index of the selected arm
    :param reward: observed reward
    """
    Policy.update(self, arm, reward)
    x = np.asarray(self.context[arm]).reshape(self.d, 1)  # context as a column vector
    self.b[arm] = self.b[arm] + reward * x.ravel()
    self.A[arm] = self.A[arm] + x.dot(x.T)                # rank-1 update: A += x x^T
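# A minimal sketch (not part of the original class) of how arm selection could
# use the A/b statistics maintained in update() above, assuming the standard
# LinUCB rule: theta_a = A_a^{-1} b_a, score = theta_a^T x + alpha * sqrt(x^T A_a^{-1} x).
import numpy as np

def linucb_choose_arm(A, b, contexts, alpha):
    """Return the index of the arm with the highest upper confidence bound."""
    scores = []
    for a in range(len(A)):
        A_inv = np.linalg.inv(A[a])
        theta = A_inv.dot(b[a])          # ridge-regression estimate of the arm's weights
        x = np.asarray(contexts[a]).reshape(-1)
        bonus = alpha * np.sqrt(x.dot(A_inv).dot(x))
        scores.append(theta.dot(x) + bonus)
    return int(np.argmax(scores))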
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)
    env = gym.make(env_name)
    env.seed(seed)
    print('New model')
    policy = Policy('actor_critic', env.observation_space.shape[0], env.action_space.n)
    policy.to(device)
    optimizer = PPO(policy, clip_param, ppo_epoch, mini_batch_size, value_loss_coef,
                    entropy_coef, learning_rate, max_grad_norm)
    episode_rewards = deque(maxlen=50)
    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)
        policy.eval()
        episode_rewards.append(test_env(policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))
        for step in range(n_steps):
            state = torch.FloatTensor(state).to(device)
            with torch.no_grad():
                value, action, log_prob = policy.act(state)
            next_state, reward, done, _ = env.step(action.item())
            storage.push(state, action, log_prob, value, reward, done)
            state = next_state
            if done:
                state = env.reset()
        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            next_value = policy.get_value(next_state).detach()
        storage.compute(next_value)
        policy.train()
        value_loss, action_loss, dist_entropy = optimizer.update(storage)
        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{},{}\n'.format(value_loss, action_loss, dist_entropy))
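# Hypothetical sketch of the test_env helper used above; the real implementation
# is not shown in this snippet. It rolls out one evaluation episode with the
# current policy and returns the undiscounted episode reward, reusing the
# script's `torch` import and `device` global and the (value, action, log_prob)
# interface of policy.act seen in the training loop.
def test_env(policy, env, max_steps=10000):
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        state = torch.FloatTensor(state).to(device)
        with torch.no_grad():
            _, action, _ = policy.act(state)
        state, reward, done, _ = env.step(action.item())
        total_reward += reward
        if done:
            break
    return total_reward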
def __init__(self, args): """ 我们人为生成一些上下文来模拟 :param args: 臂个数参数,以及各个臂的穿越参数等 """ Policy.__init__(self, args) self.alpha = args[1] self.travel_args = args[2:] # 穿越过来的臂均值参数 self.d = 3 # 上下文维度 self.A = np.array([np.identity(self.d) for _ in range(self.n_bandits)]) self.b = np.array([np.zeros(self.d) for _ in range(self.n_bandits)]) self.context = None
def play_full_episode(agents: ParallelAgentsWrapper, policy: Policy, step: int, params: argparse.Namespace,
                      is_train: bool) \
        -> Tuple[ParallelAgentsWrapper, int, bool, bool, float, int, Dict[str, float]]:
    eval_required = False
    checkpoint_reached = False
    epoch_reward = 0
    rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(
        ['new game' for _ in range(params.number_of_agents)], is_train)  # Restart all the agents.
    log_dict = {}
    start_step = step
    successful_agents = [0 for _ in range(params.number_of_agents)]
    while not all([t or t is None for t in terminals]):  # Loop ends only when all agents have terminated.
        action = policy.get_action(states, is_train)
        rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(action, is_train)
        # `rewards` is a list. Passing it to update_observation changes its values, hence all references should be
        # performed prior to calling update_observation.
        for idx, reward in enumerate(rewards):
            if reward is not None:
                epoch_reward += reward
                if success[idx]:
                    successful_agents[idx] = 1
        logging.debug('step: %s, reward: %s, terminal: %s, terminal_due_to_timeout: %s, success: %s',
                      step, rewards, terminals, terminals_due_to_timeout, success)
        policy.update_observation(rewards, terminals, terminals_due_to_timeout, success, is_train)
        if is_train:
            single_log_dict = policy.train(states)
        else:
            single_log_dict = {}
        step += 1
        if step % params.eval_frequency == 0:
            eval_required = True
        if step % params.checkpoint_interval == 0:
            checkpoint_reached = True
        for item in single_log_dict:
            if item in log_dict:
                log_dict[item] = log_dict[item] + single_log_dict[item]
            else:
                log_dict[item] = single_log_dict[item]
    for item in log_dict:
        log_dict[item] = log_dict[item] * 1.0 / (step - start_step)
    return agents, step, eval_required, checkpoint_reached, epoch_reward, sum(successful_agents), log_dict
def _collect_trajectory(
        cls,
        replay_description_: replay_description.ReplayDescription,
        frozen_policy: policy.Policy):
    new_transitions = {}
    while not _process_env.needs_reset:
        state = _process_env.state
        action, action_logprob = frozen_policy.sample(state, return_logprob=True)
        next_state, reward, is_terminal, info = _process_env.step(action)
        is_timeout = _process_env.needs_reset
        terminal_weight = 0. if is_terminal else 1.
        timeout_weight = 0. if is_timeout else 1.
        new_transition = {
            'states': state,
            'actions': action,
            'rewards': reward,
            'next_states': next_state,
            'timeout_weight': timeout_weight,
            'terminal_weight': terminal_weight,
            'action_log_prob': action_logprob,
            **info
        }
        for key in new_transition:
            if key not in new_transitions:
                new_transitions[key] = [new_transition[key]]
            else:
                new_transitions[key].append(new_transition[key])
    new_transitions = replay_description_.prepare_samples(
        (len(new_transitions['states']), ), new_transitions)
    cumulative_return = _process_env.cumulative_return()
    _process_env.reset()
    del frozen_policy
    return new_transitions, cumulative_return
# E2C Parameters
num_episodes = 50  # total overall cycles
B = 100            # num minibatches per cycle
batch_size = 128
data_size = 500
k = .1
A = int(k * data_size)  # number of samples we gather on each cycle


class RandomPolicy(Policy):
    def __init__(self, batch_size, x_dim, u_dim):
        super(RandomPolicy, self).__init__(batch_size, x_dim, u_dim)

    def eval(self, sess, x):
        return np.random.uniform(low=-5., high=5., size=self.u_dim)  # np.random.randn(self.u_dim)


Policy.register(RandomPolicy)

DATA_PATH = '/ltmp/e2c-boxbot-rand'
robot_type = "polyp"  # "octoarm"  # walker, polyp


def run_experiment():
    # tmp - verify E2C model builds properly
    x0v = np.zeros((120, 320, 6))
    u_dim = 20
    u = tf.placeholder(tf.float32, [batch_size, u_dim])
    e2c = E2CBoxbotModel(x0v, u, batch_size)
    for v in tf.all_variables():
        print("%s : %s" % (v.name, v.get_shape()))
def __init__(self, args):
    Policy.__init__(self, args)
    self.try_perSlot = int(args[1])
def __init__(self, args):
    Policy.__init__(self, args)
    self.gamma = args[1]
    # Use float weights so the multiplicative update in update() does not get
    # truncated by integer dtype.
    self._weights = np.ones(self.n_bandits)
    self._probs = None
def __init__(self, args):
    Policy.__init__(self, args)
def __init__(self, args):
    Policy.__init__(self, args)
    self.anneal = args[2] > 0.0
    self.decay = args[1]
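# A hedged sketch (an assumption about this policy, not its original code) of
# how the decay/anneal parameters stored above could drive an annealed
# epsilon-greedy choice: with probability epsilon pick a random arm, otherwise
# the best empirical mean, shrinking epsilon by `decay` each round when
# annealing is enabled.
import numpy as np

def epsilon_greedy_choose(mean_rewards, epsilon, decay, anneal, rng=np.random):
    """Return (arm, new_epsilon) for a single decision."""
    if rng.random() < epsilon:
        arm = int(rng.randint(len(mean_rewards)))   # explore
    else:
        arm = int(np.argmax(mean_rewards))          # exploit
    if anneal:
        epsilon *= decay                            # anneal epsilon toward zero
    return arm, epsilon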
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)
    env = gym.make(env_name)
    env.seed(42)
    print('New model')
    policy = Policy('dqn', env.observation_space.shape[0], env.action_space.n)
    target_policy = Policy('dqn', env.observation_space.shape[0], env.action_space.n)
    policy.to(device)
    target_policy.to(device)
    target_policy.load_state_dict(policy.state_dict())
    optimizer = DQNOptimizer(policy, target_policy, mini_batch_size, discount, learning_rate, update_epochs)
    episode_rewards = deque(maxlen=50)
    get_epsilon = lambda episode: np.exp(-episode * e_decay)
    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)
        episode_rewards.append(test_env(target_policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))
        for step in range(n_steps):
            state = torch.FloatTensor(state).to(device)
            with torch.no_grad():
                action = policy.act(state, get_epsilon(eps))
            next_state, reward, done, _ = env.step(action.item())
            storage.push(state, action, reward, next_state, done)
            state = next_state
            if done:
                state = env.reset()
        storage.compute()
        loss = optimizer.update(storage)
        if eps % target_policy_update == 0:  # sync the target network every `target_policy_update` episodes
            target_policy.load_state_dict(policy.state_dict())
        with open('metrics.csv', 'a') as metrics:
            metrics.write('{}\n'.format(loss))
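# A hedged sketch (an assumption, not the repository's DQNOptimizer) of the
# one-step Bellman target a DQN update like the one above typically regresses
# Q(s, a) toward: y = r + discount * max_a' Q_target(s', a') for non-terminal
# transitions. Assumes `target_policy(next_states)` returns a (batch, n_actions)
# tensor of Q-values and `dones` is a float tensor of 0/1 flags.
import torch

def dqn_targets(target_policy, rewards, next_states, dones, discount):
    with torch.no_grad():
        next_q = target_policy(next_states).max(dim=1)[0]   # greedy value of the next state
    return rewards + discount * next_q * (1.0 - dones)      # mask out terminal transitions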
def __init__(self, args):
    Policy.__init__(self, args)
    # Track the Beta distribution parameters (alpha, beta) for each arm.
    self.betaArgs = [[args[1], args[2]] for _ in range(self.n_bandits)]
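# A minimal sketch (not part of the original class) of how the per-arm Beta
# parameters initialised above are typically used for Thompson sampling with
# Bernoulli rewards: sample one value per arm, play the argmax, then add the
# observed 0/1 reward to the winner's (alpha, beta) counts.
import numpy as np

def thompson_choose_arm(beta_args, rng=np.random):
    samples = [rng.beta(a, b) for a, b in beta_args]
    return int(np.argmax(samples))

def thompson_update(beta_args, arm, reward):
    beta_args[arm][0] += reward        # alpha: successes
    beta_args[arm][1] += 1 - reward    # beta: failures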
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)
    env = MountainCarEnvInherit()
    env.seed(42)
    meta_policy = Policy('dqn', env.observation_space.shape[0], goal_object.get_size())
    target_meta_policy = Policy('dqn', env.observation_space.shape[0], goal_object.get_size())
    policy = Policy('dqn', env.observation_space.shape[0] + goal_object.get_size(), env.action_space.n)
    target_policy = Policy('dqn', env.observation_space.shape[0] + goal_object.get_size(), env.action_space.n)
    meta_policy.to(device)
    target_meta_policy.to(device)
    target_meta_policy.load_state_dict(meta_policy.state_dict())
    policy.to(device)
    target_policy.to(device)
    target_policy.load_state_dict(policy.state_dict())
    optimizer_meta_policy = DQNOptimizer(meta_policy, target_meta_policy, mini_batch_size, discount,
                                         learning_rate, update_epochs)
    optimizer_policy = DQNOptimizer(policy, target_policy, mini_batch_size, discount, learning_rate, update_epochs)
    episode_rewards = deque(maxlen=50)
    get_meta_epsilon = lambda episode: np.exp(-episode * e_meta_decay)
    get_epsilon = lambda episode: np.exp(-episode * e_decay)
    frame = 0
    meta_frame = 0
    for eps in range(0, n_eps + 1):
        if eps % 1 == 0:
            episode_rewards.append(test_env(meta_policy, policy, MountainCarEnvInherit()))
            print('Avg reward', np.mean(episode_rewards))
        storage = Storage(device=device)
        storage_meta = Storage(device=device)
        print('Game', eps)
        state0 = env.reset()
        state = state0.copy()
        state = torch.FloatTensor(state).to(device)
        done = False
        for step in range(100):
            extrinsic_reward = 0
            goal = meta_policy.act(state, get_meta_epsilon(step))
            onehot_goal = to_onehot(goal, goal_object.get_size())
            print('Goal', goal)
            goal_reached = False
            for i in range(100):
                joint_state = torch.FloatTensor(np.concatenate([state.cpu().numpy(), onehot_goal],
                                                               axis=0)).to(device)
                with torch.no_grad():
                    action = policy.act(joint_state, get_epsilon(frame))
                next_state, reward, done, _ = env.step(action.item())
                intrinsic_reward = get_intrinsic_reward(goal, next_state)
                goal_reached = True if intrinsic_reward else False
                joint_next_state = np.concatenate([next_state, onehot_goal], axis=0)
                storage.push(joint_state, action, intrinsic_reward, joint_next_state, done)
                extrinsic_reward += reward
                state = next_state
                state = torch.FloatTensor(state).to(device)
                frame += 1
                if done or goal_reached:
                    break
            goal = torch.LongTensor([goal]).to(device)
            storage_meta.push(torch.FloatTensor(state0).to(device), goal, extrinsic_reward, next_state, done)
            meta_frame += 1
            if done:
                break
        storage.compute()
        storage_meta.compute()
        loss_meta = optimizer_meta_policy.update(storage_meta)
        loss = optimizer_policy.update(storage)
        if eps % target_policy_update == 0:  # sync both target networks periodically
            target_meta_policy.load_state_dict(meta_policy.state_dict())
            target_policy.load_state_dict(policy.state_dict())
        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{}\n'.format(loss_meta, loss))
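# Hypothetical sketches of two helpers referenced in main() above; their real
# implementations are not shown in this snippet. to_onehot is the usual one-hot
# encoder for the chosen goal index; get_intrinsic_reward is assumed to pay 1
# when the car's x-position reaches the chosen goal's target position (the
# `goal_positions` defaults and `tolerance` are illustrative assumptions).
import numpy as np

def to_onehot(index, size):
    onehot = np.zeros(size, dtype=np.float32)
    onehot[int(index)] = 1.0
    return onehot

def get_intrinsic_reward(goal, state, goal_positions=(-0.6, -0.2, 0.5), tolerance=0.1):
    # 1.0 when the MountainCar x-position is within `tolerance` of the goal's position.
    goal = int(goal)
    return 1.0 if abs(state[0] - goal_positions[goal]) < tolerance else 0.0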
def update(self, arm, reward):
    Policy.update(self, arm, reward)
    ratio = math.exp(self.gamma * reward / (self.n_bandits * self._probs[arm]))
    # TODO: won't the weights keep growing without bound?
    self._weights[arm] *= ratio
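# A minimal sketch (not the original code) of how the EXP3 probabilities
# self._probs used in the update above are usually derived from the weights:
# the normalised weights mixed with a uniform distribution controlled by gamma.
import numpy as np

def exp3_probs(weights, gamma):
    weights = np.asarray(weights, dtype=float)
    k = len(weights)
    return (1.0 - gamma) * weights / weights.sum() + gamma / k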
def get_policies(env, goal_object):
    meta_policy = Policy(env.observation_space.shape[0], goal_object.get_size())
    target_meta_policy = Policy(env.observation_space.shape[0], goal_object.get_size())
    policy = Policy(env.observation_space.shape[0] + 1, env.action_space.n)
    target_policy = Policy(env.observation_space.shape[0] + 1, env.action_space.n)
    meta_policy.to(device)
    target_meta_policy.to(device)
    policy.to(device)
    target_policy.to(device)
    target_meta_policy.load_state_dict(meta_policy.state_dict())
    target_policy.load_state_dict(policy.state_dict())
    return meta_policy, target_meta_policy, policy, target_policy
def __init__(self, args):
    Policy.__init__(self, args)
    # Temperature parameter: at high temperature choices are almost random (like molecules in a gas);
    # at low temperature they settle into an ordered, greedy pattern (like a solid).
    self.temperature = args[1]
    self.anneal = args[2] > 0.0
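# A hedged sketch (an assumption about this policy, not its original code) of a
# Boltzmann / softmax choice driven by the temperature stored above: high
# temperature flattens the distribution (more exploration), low temperature
# concentrates it on the best empirical mean.
import numpy as np

def softmax_choose_arm(mean_rewards, temperature, rng=np.random):
    logits = np.asarray(mean_rewards, dtype=float) / max(temperature, 1e-8)
    logits -= logits.max()                              # for numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return int(rng.choice(len(probs), p=probs))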
batch_size = 128
data_size = 500
k = .1
A = int(k * data_size)  # number of samples we gather on each cycle


class RandomPolicy(Policy):
    def __init__(self, batch_size, x_dim, u_dim):
        super(RandomPolicy, self).__init__(batch_size, x_dim, u_dim)

    def eval(self, sess, x):
        return np.random.uniform(low=-5., high=5., size=self.u_dim)  # np.random.randn(self.u_dim)


Policy.register(RandomPolicy)

DATA_PATH = '/ltmp/e2c-boxbot-rand'
robot_type = "polyp"  # "octoarm"  # walker, polyp


def run_experiment():
    # tmp - verify E2C model builds properly
    x0v = np.zeros((120, 320, 6))
    u_dim = 20
    u = tf.placeholder(tf.float32, [batch_size, u_dim])
    e2c = E2CBoxbotModel(x0v, u, batch_size)
    for v in tf.all_variables():
        print("%s : %s" % (v.name, v.get_shape()))
    sess = tf.InteractiveSession()
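# A hedged usage sketch (an assumption, not from the original experiment):
# collect A random-control transitions with RandomPolicy. `env_step` and `x_dim`
# are hypothetical stand-ins for the simulator step function and state dimension
# used by the real boxbot environment, which are not shown in this snippet.
def collect_random_samples(sess, env_step, x0, x_dim, u_dim, n_samples=A):
    policy = RandomPolicy(batch_size, x_dim, u_dim)
    samples, x = [], x0
    for _ in range(n_samples):
        u = policy.eval(sess, x)       # uniform random control in [-5, 5]^u_dim
        x_next = env_step(x, u)        # advance the simulator by one step
        samples.append((x, u, x_next))
        x = x_next
    return samples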
def __init__(self, args):
    Policy.__init__(self, args)
    self.squared_reward = [0.0] * self.n_bandits
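# A hedged sketch (an assumption about what squared_reward is for, not the
# original code) of a UCB1-Tuned style index that uses per-arm sums of squared
# rewards to estimate the variance term V_j:
# index_j = mean_j + sqrt(ln(t) / n_j * min(1/4, V_j)).
import math

def ucb_tuned_choose_arm(counts, reward_sums, squared_reward_sums, total_plays):
    scores = []
    for n_j, s_j, sq_j in zip(counts, reward_sums, squared_reward_sums):
        if n_j == 0:
            return len(scores)                          # play every arm once first
        mean = s_j / n_j
        variance = sq_j / n_j - mean ** 2
        v_j = variance + math.sqrt(2.0 * math.log(total_plays) / n_j)
        scores.append(mean + math.sqrt(math.log(total_plays) / n_j * min(0.25, v_j)))
    return max(range(len(scores)), key=scores.__getitem__)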