Code Example #1
 def update(self, arm, reward):
     """更新收益
     :param reward: 收益
     :type arm: 选中的臂的下标
     """
     Policy.update(self, arm, reward)
     self.b[arm] = self.b[arm] + reward * self.context[arm]
     self.context[arm].shape = (self.d, 1)
     self.A[arm] = self.A[arm] + self.context[arm].dot(np.transpose(self.context[arm]))
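The snippet above only shows the statistics update of a LinUCB-style policy. As a reading aid, here is a minimal sketch of the matching selection step, assuming the standard LinUCB rule; the method name choose_arm and the assumption that self.context already holds this round's per-arm context vectors are mine, not the original project's (self.alpha is the exploration weight set in the __init__ shown in Code Example #3).

 def choose_arm(self):
     # Sketch only: ridge-regression estimate theta = A^-1 b plus an
     # alpha-scaled exploration bonus, as in standard LinUCB.
     scores = np.zeros(self.n_bandits)
     for arm in range(self.n_bandits):
         A_inv = np.linalg.inv(self.A[arm])
         theta = A_inv.dot(self.b[arm])
         x = np.asarray(self.context[arm]).reshape(self.d)
         scores[arm] = theta.dot(x) + self.alpha * np.sqrt(x.dot(A_inv).dot(x))
     return int(np.argmax(scores))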
Code Example #2
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)

    env = gym.make(env_name)
    env.seed(seed)

    print('New model')
    policy = Policy('actor_critic', env.observation_space.shape[0], env.action_space.n)
    policy.to(device)

    optimizer = PPO(policy, clip_param, ppo_epoch, mini_batch_size,
                value_loss_coef, entropy_coef, learning_rate,
                max_grad_norm)

    episode_rewards = deque(maxlen=50)

    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)

        policy.eval()

        episode_rewards.append(test_env(policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))

        for step in range(n_steps):

            state = torch.FloatTensor(state).to(device)

            with torch.no_grad():
                value, action, log_prob = policy.act(state)

            next_state, reward, done, _ = env.step(action.item())

            storage.push(state, action, log_prob, value, reward, done)

            state = next_state

            if done:
                state = env.reset()

        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            next_value = policy.get_value(next_state).detach()

        storage.compute(next_value)

        policy.train()

        value_loss, action_loss, dist_entropy = optimizer.update(storage)

        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{},{}\n'.format(value_loss, action_loss, dist_entropy))
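The training loop above leans on Storage.compute(next_value) to turn the rollout into learning targets before the PPO update. A minimal sketch of what that computation could look like follows, assuming generalized advantage estimation (GAE); the attribute names (self.rewards, self.values, self.dones, self.returns) and the gamma / gae_lambda defaults are assumptions, not the project's actual implementation.

 def compute(self, next_value, gamma=0.99, gae_lambda=0.95):
     # Sketch only: walk the rollout backwards, bootstrap from next_value,
     # and turn GAE advantages into discounted returns.
     gae = 0.0
     self.returns = [0.0] * len(self.rewards)
     values = list(self.values) + [next_value]
     for t in reversed(range(len(self.rewards))):
         mask = 1.0 - float(self.dones[t])
         delta = self.rewards[t] + gamma * values[t + 1] * mask - values[t]
         gae = delta + gamma * gae_lambda * mask * gae
         self.returns[t] = gae + values[t]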
Code Example #3
 def __init__(self, args):
     """
     我们人为生成一些上下文来模拟
     :param args: 臂个数参数,以及各个臂的穿越参数等
     """
     Policy.__init__(self, args)
     self.alpha = args[1]
     self.travel_args = args[2:]  # 穿越过来的臂均值参数
     self.d = 3  # 上下文维度
     self.A = np.array([np.identity(self.d) for _ in range(self.n_bandits)])
     self.b = np.array([np.zeros(self.d) for _ in range(self.n_bandits)])
     self.context = None
Code Example #4
File: helpers.py Project: tesslerc/malmo_rl
def play_full_episode(agents: ParallelAgentsWrapper, policy: Policy, step: int, params: argparse.Namespace, is_train: bool) \
        -> Tuple[ParallelAgentsWrapper, int, bool, bool, float, int, Dict[str, float]]:
    eval_required = False
    checkpoint_reached = False
    epoch_reward = 0
    rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(
        ['new game' for _ in range(params.number_of_agents)], is_train)  # Restart all the agents.

    log_dict = {}
    start_step = step
    successful_agents = [0 for _ in range(params.number_of_agents)]
    while not all([t or t is None for t in terminals]):  # Loop ends only when all agents have terminated.
        action = policy.get_action(states, is_train)
        rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(action, is_train)

        # rewards is a list. update_observation mutates its values, so anything that reads the rewards must
        # run before update_observation is called.
        for idx, reward in enumerate(rewards):
            if reward is not None:
                epoch_reward += reward
                if success[idx]:
                    successful_agents[idx] = 1
        logging.debug('step: %s, reward: %s, terminal: %s, terminal_due_to_timeout: %s, success: %s', step, rewards,
                      terminals, terminals_due_to_timeout, success)

        policy.update_observation(rewards, terminals, terminals_due_to_timeout, success, is_train)

        if is_train:
            single_log_dict = policy.train(states)
        else:
            single_log_dict = {}

        step += 1

        if step % params.eval_frequency == 0:
            eval_required = True
        if step % params.checkpoint_interval == 0:
            checkpoint_reached = True

        for item in single_log_dict:
            if item in log_dict:
                log_dict[item] = log_dict[item] + single_log_dict[item]
            else:
                log_dict[item] = single_log_dict[item]

    for item in log_dict:
        log_dict[item] = log_dict[item] * 1.0 / (step - start_step)
    return agents, step, eval_required, checkpoint_reached, epoch_reward, sum(successful_agents), log_dict
Code Example #5
    def _collect_trajectory(
            cls, replay_description_: replay_description.ReplayDescription,
            frozen_policy: policy.Policy):
        new_transitions = {}
        while not _process_env.needs_reset:
            state = _process_env.state
            action, action_logprob = frozen_policy.sample(state,
                                                          return_logprob=True)

            next_state, reward, is_terminal, info = _process_env.step(action)
            is_timeout = _process_env.needs_reset
            terminal_weight = 0. if is_terminal else 1.
            timeout_weight = 0. if is_timeout else 1.
            new_transition = {
                'states': state,
                'actions': action,
                'rewards': reward,
                'next_states': next_state,
                'timeout_weight': timeout_weight,
                'terminal_weight': terminal_weight,
                'action_log_prob': action_logprob,
                **info
            }
            for key in new_transition:
                if key not in new_transitions:
                    new_transitions[key] = [new_transition[key]]
                else:
                    new_transitions[key].append(new_transition[key])

        new_transitions = replay_description_.prepare_samples(
            (len(new_transitions['states']), ), new_transitions)

        cumulative_return = _process_env.cumulative_return()
        _process_env.reset()
        del frozen_policy
        return new_transitions, cumulative_return
Code Example #6
File: boxbot_rand.py Project: ericjang/adaptive-e2c
# E2C Parameters
num_episodes = 50  # total overall cycles
B = 100  # num minibatches per cycle
batch_size = 128
data_size = 500
k = .1
A = int(k * data_size)  # number of samples we gather on each cycle

class RandomPolicy(Policy):
  def __init__(self, batch_size, x_dim, u_dim):
    super(RandomPolicy, self).__init__(batch_size, x_dim, u_dim)
  def eval(self, sess, x):
    return np.random.uniform(low=-5.,high=5.,size=self.u_dim)
    #np.random.randn(self.u_dim)
Policy.register(RandomPolicy)


DATA_PATH='/ltmp/e2c-boxbot-rand'


robot_type = "polyp" #"octoarm" # walker, polyp


def run_experiment():
  #tmp - verify E2C model builds properly
  x0v = np.zeros((120,320,6))
  u_dim=20
  u=tf.placeholder(tf.float32, [batch_size, u_dim])
  e2c = E2CBoxbotModel(x0v, u, batch_size)
  for v in tf.all_variables():
Code Example #7
File: naive.py Project: lzp1712/MAB
 def __init__(self, args):
     Policy.__init__(self, args)
     self.try_perSlot = int(args[1])
Code Example #8
 def __init__(self, args):
     Policy.__init__(self, args)
     self.gamma = args[1]
     self._weights = np.array([1.0] * self.n_bandits)  # float weights, so the multiplicative EXP3 update is not truncated to int
     self._probs = None
Code Example #9
File: random.py Project: lzp1712/MAB
 def __init__(self, args):
     Policy.__init__(self, args)
Code Example #10
 def __init__(self, args):
     Policy.__init__(self, args)
     self.anneal = args[2] > 0.0
     self.decay = args[1]
Code Example #11
File: main_dqn.py Project: ARVILab/rl-framework
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)

    env = gym.make(env_name)
    env.seed(42)

    print('New model')
    policy = Policy('dqn', env.observation_space.shape[0], env.action_space.n)
    target_policy = Policy('dqn', env.observation_space.shape[0],
                           env.action_space.n)
    policy.to(device)
    target_policy.to(device)
    target_policy.load_state_dict(policy.state_dict())
    optimizer = DQNOptimizer(policy, target_policy, mini_batch_size, discount,
                             learning_rate, update_epochs)

    episode_rewards = deque(maxlen=50)

    get_epsilon = lambda episode: np.exp(-episode * e_decay)

    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)

        episode_rewards.append(test_env(target_policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))

        for step in range(n_steps):

            state = torch.FloatTensor(state).to(device)

            with torch.no_grad():
                action = policy.act(state, get_epsilon(eps))

            next_state, reward, done, _ = env.step(action.item())

            storage.push(state, action, reward, next_state, done)

            state = next_state

            if done:
                state = env.reset()

        storage.compute()

        loss = optimizer.update(storage)

        if eps % target_policy_update == 0:
            target_policy.load_state_dict(policy.state_dict())

        with open('metrics.csv', 'a') as metrics:
            metrics.write('{}\n'.format(loss))
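policy.act above receives the decayed epsilon from get_epsilon. Here is a minimal sketch of what an epsilon-greedy act for the 'dqn' head could look like, purely as an assumption for illustration; self.num_actions and the Q-value forward pass are hypothetical names, not necessarily the project's.

 def act(self, state, epsilon):
     # Sketch only: explore with probability epsilon, otherwise act greedily
     # on the predicted Q-values.
     if np.random.rand() < epsilon:
         return torch.tensor(np.random.randint(self.num_actions))
     q_values = self.forward(state)  # assumed to return one Q-value per action
     return q_values.argmax()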
Code Example #12
 def __init__(self, args):
     Policy.__init__(self, args)
     # Beta distribution parameters [alpha, beta] tracked for each arm
     self.betaArgs = [[args[1], args[2]] for _ in range(self.n_bandits)]
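For context, these Beta parameters are what a Thompson-sampling policy would draw from. A minimal sketch of the selection and update steps follows, assuming Bernoulli rewards; it is an illustration, not the project's actual methods.

 def choose_arm(self):
     # Sketch only: draw one sample from each arm's Beta posterior and
     # play the arm with the largest draw.
     samples = [np.random.beta(a, b) for a, b in self.betaArgs]
     return int(np.argmax(samples))

 def update(self, arm, reward):
     Policy.update(self, arm, reward)
     # Bernoulli reward assumed: success increments alpha, failure increments beta.
     if reward > 0:
         self.betaArgs[arm][0] += 1
     else:
         self.betaArgs[arm][1] += 1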
Code Example #13
File: main_hdqn.py Project: ARVILab/rl-framework
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)

    env = MountainCarEnvInherit()
    env.seed(42)

    meta_policy = Policy('dqn', env.observation_space.shape[0], goal_object.get_size()) 
    target_meta_policy = Policy('dqn', env.observation_space.shape[0], goal_object.get_size()) 

    policy = Policy('dqn', env.observation_space.shape[0] + goal_object.get_size(), env.action_space.n)
    target_policy = Policy('dqn', env.observation_space.shape[0] + goal_object.get_size(), env.action_space.n)

    meta_policy.to(device)
    target_meta_policy.to(device)
    target_meta_policy.load_state_dict(meta_policy.state_dict())

    policy.to(device)
    target_policy.to(device)
    target_policy.load_state_dict(policy.state_dict())

    optimizer_meta_policy = DQNOptimizer(meta_policy, target_meta_policy, mini_batch_size, discount, learning_rate, update_epochs)

    optimizer_policy = DQNOptimizer(policy, target_policy, mini_batch_size, discount, learning_rate, update_epochs)

    episode_rewards = deque(maxlen=50)

    get_meta_epsilon = lambda episode: np.exp(-episode * e_meta_decay)
    get_epsilon = lambda episode: np.exp(-episode * e_decay)

    frame = 0
    meta_frame = 0

    for eps in range(0, n_eps + 1):

        if eps % 1 == 0:
            episode_rewards.append(test_env(meta_policy, policy, MountainCarEnvInherit()))
            print('Avg reward', np.mean(episode_rewards))

        storage = Storage(device=device)
        storage_meta = Storage(device=device)
        print('Game', eps)

        state0 = env.reset()
        state = state0.copy()
        state = torch.FloatTensor(state).to(device)

        done = False

        for step in range(100):

            extrinsic_reward = 0
            goal = meta_policy.act(state, get_meta_epsilon(step))
            onehot_goal = to_onehot(goal, goal_object.get_size())

            print('Goal', goal)

            goal_reached = False

            for i in range(100):

                joint_state = torch.FloatTensor(np.concatenate([state.cpu().numpy(), onehot_goal], axis=0)).to(device)

                with torch.no_grad():
                    action = policy.act(joint_state, get_epsilon(frame))

                next_state, reward, done, _ = env.step(action.item())

                intrinsic_reward = get_intrinsic_reward(goal, next_state)
                goal_reached = True if intrinsic_reward else False

                joint_next_state = np.concatenate([next_state, onehot_goal], axis=0)
                storage.push(joint_state, action, intrinsic_reward, joint_next_state, done)

                extrinsic_reward += reward

                state = next_state
                state = torch.FloatTensor(state).to(device)

                frame += 1

                if done or goal_reached:
                    break

            goal = torch.LongTensor([goal]).to(device)
            storage_meta.push(torch.FloatTensor(state0).to(device), goal, extrinsic_reward, next_state, done)

            meta_frame += 1

            if done:
                break

        storage.compute()
        storage_meta.compute()

        loss_meta = optimizer_meta_policy.update(storage_meta)
        loss = optimizer_policy.update(storage)

        if eps % target_policy_update == 0:
            target_meta_policy.load_state_dict(meta_policy.state_dict())
            target_policy.load_state_dict(policy.state_dict())

        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{}\n'.format(loss_meta, loss))
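The script references two helpers that are not shown in this excerpt: to_onehot and get_intrinsic_reward. A minimal sketch of to_onehot follows, purely as an assumption about its behavior; get_intrinsic_reward is presumably the function that returns a nonzero reward when next_state satisfies the selected goal, but its definition is not shown here.

def to_onehot(index, size):
    # Assumed helper: one-hot encode a goal index as a float vector that can be
    # concatenated with the environment state.
    vec = np.zeros(size, dtype=np.float32)
    vec[int(index)] = 1.0
    return vec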
Code Example #14
 def update(self, arm, reward):
     Policy.update(self, arm, reward)
     ratio = math.exp(self.gamma * reward /
                      (self.n_bandits * self._probs[arm]))
     # TODO: won't the weights keep growing without bound?
     self._weights[arm] *= ratio
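The update above is the EXP3 weight step; the missing piece is how self._probs gets filled in. A minimal sketch of the standard EXP3 selection rule follows, assuming a choose_arm method; it mixes the normalized weights with a uniform distribution controlled by gamma and then samples an arm. In practice the weights are often renormalized periodically to avoid the overflow the TODO above worries about.

 def choose_arm(self):
     # Sketch only: EXP3 probabilities p_i = (1 - gamma) * w_i / sum(w) + gamma / K,
     # then sample an arm from that distribution.
     total = np.sum(self._weights)
     self._probs = (1.0 - self.gamma) * self._weights / total + self.gamma / self.n_bandits
     return int(np.random.choice(self.n_bandits, p=self._probs))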
Code Example #15
def get_policies(env, goal_object):
    meta_policy = Policy(env.observation_space.shape[0],
                         goal_object.get_size())
    target_meta_policy = Policy(env.observation_space.shape[0],
                                goal_object.get_size())
    policy = Policy(env.observation_space.shape[0] + 1, env.action_space.n)
    target_policy = Policy(env.observation_space.shape[0] + 1,
                           env.action_space.n)

    meta_policy.to(device)
    target_meta_policy.to(device)
    policy.to(device)
    target_policy.to(device)

    target_meta_policy.load_state_dict(meta_policy.state_dict())
    target_policy.load_state_dict(policy.state_dict())

    return meta_policy, target_meta_policy, policy, target_policy
Code Example #16
File: softmax.py Project: lzp1712/MAB
 def __init__(self, args):
     Policy.__init__(self, args)
     self.temperature = args[1]  # temperature: higher -> more random choices (like gas molecules); lower -> orderly, near-greedy choices (like a solid)
     self.anneal = args[2] > 0.0
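For context, here is a minimal sketch of the Boltzmann (softmax) selection this temperature would drive; self.values, assumed to hold the per-arm empirical mean rewards, is a placeholder name, not necessarily the project's.

 def choose_arm(self):
     # Sketch only: softmax over the empirical means, scaled by temperature.
     prefs = np.array(self.values, dtype=float) / self.temperature
     prefs -= prefs.max()  # numerical stability
     probs = np.exp(prefs) / np.sum(np.exp(prefs))
     return int(np.random.choice(self.n_bandits, p=probs))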
Code Example #17
File: boxbot_rand.py Project: ericjang/adaptive-e2c
batch_size = 128
data_size = 500
k = .1
A = int(k * data_size)  # number of samples we gather on each cycle


class RandomPolicy(Policy):
    def __init__(self, batch_size, x_dim, u_dim):
        super(RandomPolicy, self).__init__(batch_size, x_dim, u_dim)

    def eval(self, sess, x):
        return np.random.uniform(low=-5., high=5., size=self.u_dim)
        #np.random.randn(self.u_dim)


Policy.register(RandomPolicy)

DATA_PATH = '/ltmp/e2c-boxbot-rand'

robot_type = "polyp"  #"octoarm" # walker, polyp


def run_experiment():
    #tmp - verify E2C model builds properly
    x0v = np.zeros((120, 320, 6))
    u_dim = 20
    u = tf.placeholder(tf.float32, [batch_size, u_dim])
    e2c = E2CBoxbotModel(x0v, u, batch_size)
    for v in tf.all_variables():
        print("%s : %s" % (v.name, v.get_shape()))
    sess = tf.InteractiveSession()
Code Example #18
File: ucb1_normal.py Project: lzp1712/MAB
 def __init__(self, args):
     Policy.__init__(self, args)
     self.squared_reward = [0.0] * self.n_bandits
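The squared-reward sums above are what the UCB1-Normal index of Auer et al. (2002) needs. A minimal sketch of that selection rule follows; self.counts and self.values (per-arm play counts and empirical means) are assumed attribute names, not necessarily the project's.

 def choose_arm(self):
     # Sketch only: UCB1-Normal. Make sure every arm has been played at least
     # ceil(8 * log n) times, then pick the arm with the largest index.
     n = sum(self.counts)
     if n < 2:
         return int(np.argmin(self.counts))
     for arm in range(self.n_bandits):
         if self.counts[arm] < math.ceil(8 * math.log(n)):
             return arm
     scores = []
     for arm in range(self.n_bandits):
         n_j, mean_j, q_j = self.counts[arm], self.values[arm], self.squared_reward[arm]
         var_term = max(q_j - n_j * mean_j ** 2, 0.0)
         bonus = math.sqrt(16.0 * var_term / (n_j - 1) * math.log(n - 1) / n_j)
         scores.append(mean_j + bonus)
     return int(np.argmax(scores))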