Example 1
    def get_data(self, data_dict: DataDict, model_dict: ModelDict,
                 batch_idx: np.ndarray):
        # select the batch rows, move them to the target device, flatten each
        # example, and pass the result through the configured scaler
        data = data_dict.get(self.data_key)[batch_idx]
        tensor = torch.as_tensor(data, dtype=self.dtype).to(device).reshape(
            len(batch_idx), -1)
        scaler = model_dict.get(self.scaler_key)
        scaled = scaler.forward(tensor)
        return scaled
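
The examples in this collection only show call sites for the two container classes. Below is a minimal sketch of the interface they appear to assume (a keyed array store and a keyed module store); it is reconstructed from the usages in these snippets, not taken from the project, and anything beyond get/set/n_examples/dict/as_list is an assumption.

# Assumed container interface, reconstructed from the call sites in these examples.
from typing import Dict, List

import numpy as np
import torch.nn as nn


class DataDict:
    """Keyed store of numpy arrays with a fixed number of examples."""

    def __init__(self, n_examples: int):
        self.n_examples = n_examples
        self.dict: Dict[object, np.ndarray] = {}

    def set(self, key, array: np.ndarray):
        self.dict[key] = array

    def get(self, key) -> np.ndarray:
        return self.dict[key]


class ModelDict:
    """Keyed store of torch modules."""

    def __init__(self):
        self.dict: Dict[object, nn.Module] = {}

    def set(self, key, model: nn.Module):
        self.dict[key] = model

    def get(self, key) -> nn.Module:
        return self.dict[key]

    def as_list(self) -> List[nn.Module]:
        return list(self.dict.values())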
Example 2
    def train_one_episode(self, data_dict: DataDict, model_dict: ModelDict,
                          batch_size=64):

        # shuffle every array in the data dict with one shared permutation
        random_idx = np.random.choice(range(data_dict.n_examples), data_dict.n_examples, replace=False)
        for k, v in data_dict.dict.items():
            data_dict.set(k, v[random_idx])

        # move every model to the target device (CUDA or CPU)
        for model in model_dict.as_list():
            model.to(device)

        total_loss = 0
        for epoch in tqdm(range(self.n_epochs)):
            total_loss += self.train_one_epoch(data_dict, model_dict, batch_size)
        episode_loss = total_loss / self.n_epochs

        torch.cuda.empty_cache()
        return episode_loss
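
Example 2 delegates the actual optimization to train_one_epoch, which is not shown here. A minimal sketch of what it might look like, assuming get_data providers like those in Examples 1 and 9 and a single loss/optimizer pair; self.data_getters, self.compute_loss, and self.optimizer are hypothetical names, not taken from the examples.

    # Hypothetical per-epoch loop; the real train_one_epoch is not shown in
    # these examples, so self.data_getters, self.compute_loss and
    # self.optimizer are assumptions.
    def train_one_epoch(self, data_dict: DataDict, model_dict: ModelDict,
                        batch_size: int) -> float:
        n = data_dict.n_examples
        losses = []
        for start in range(0, n, batch_size):
            batch_idx = np.arange(start, min(start + batch_size, n))
            # each provider works like the get_data methods in Examples 1 and 9
            batch = [getter.get_data(data_dict, model_dict, batch_idx)
                     for getter in self.data_getters]
            loss = self.compute_loss(model_dict, *batch)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        return float(np.mean(losses))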
Example 3
def main():
    model_dicts_path = config_dict["model_dicts_path"]
    data_dict_paths = config_dict["data_dict_paths"]

    model_dicts: List[ModelDict] = list(torch.load(model_dicts_path, map_location="cpu").values())
    data_dicts = [torch.load(path, map_location="cpu") for path in data_dict_paths]

    encoded_states_list = []
    # encode each dataset's states with its own scaler + encoder pair
    for model_dict, data_dict in zip(model_dicts, data_dicts):
        states = data_dict.get(DataKey.states)
        state_scaler = model_dict.get(ModelKey.state_scaler)
        state_encoder = model_dict.get(ModelKey.state_encoder)

        states_tensor = torch.as_tensor(states).float()
        states_scaled_tensor = state_scaler.forward(states_tensor)
        encoded_states_tensor, _ = state_encoder.forward(states_scaled_tensor)
        encoded_states = encoded_states_tensor.detach().numpy()
        encoded_states_list.append(encoded_states)

    # stack the per-dataset encodings and save them as a single DataDict
    encoded_states = np.concatenate(encoded_states_list, axis=0)
    n_states, _ = encoded_states.shape
    data_dict = DataDict(n_states)
    data_dict.set(DataKey.encoded_states, encoded_states)
    torch.save(data_dict, "{:s}_encoded_states.pkl".format(args.output_prefix))
Example 4
def main():
    # three unit squares: [0, 1] x [0, 1], [1, 2] x [1, 2], and [1, 2] x [0, 1]
    samples1 = np.random.uniform((0, 0), (1, 1), (10000, 2))
    samples2 = np.random.uniform((1, 1), (2, 2), (10000, 2))
    samples3 = np.random.uniform((1, 0), (2, 1), (10000, 2))
    samples = np.concatenate([samples1, samples2, samples3], axis=0)
    # shift the whole shape away from the origin
    samples[:, 0] += 1
    samples[:, 1] += 1

    # standard 2D rotation matrix for a 90-degree angle
    angle = np.pi / 2
    rotation_matrix = np.asarray([[np.cos(angle), -np.sin(angle)],
                                  [np.sin(angle), np.cos(angle)]])
    data_dict = DataDict(30000)
    data_dict.set(DataKey.states, samples)
    torch.save(data_dict, "./data/shapes0.pkl")

    rotated_samples = samples @ rotation_matrix
    data_dict = DataDict(30000)
    data_dict.set(DataKey.states, rotated_samples)
    torch.save(data_dict, "./data/shapes1.pkl")

    plt.figure()
    plt.scatter(samples[:, 0], samples[:, 1])
    plt.scatter(rotated_samples[:, 0], rotated_samples[:, 1])
    plt.show()
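
One detail worth flagging in Example 4: rotation_matrix is the standard counter-clockwise matrix for column vectors, so right-multiplying row vectors with samples @ rotation_matrix rotates the points clockwise by 90 degrees. If a counter-clockwise rotation were intended, the usual form for row-major data is samples @ rotation_matrix.T, as the quick check below illustrates.

import numpy as np

angle = np.pi / 2
rotation_matrix = np.asarray([[np.cos(angle), -np.sin(angle)],
                              [np.sin(angle), np.cos(angle)]])

# rotating the row vector (1, 0) by 90 degrees (up to floating-point error)
point = np.array([[1.0, 0.0]])
print(point @ rotation_matrix)    # [[ 0., -1.]]  clockwise
print(point @ rotation_matrix.T)  # [[ 0.,  1.]]  counter-clockwise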
Example 5
def main():
    config_name = args.config_name
    with open('./configs/simulate_character.yml', 'r') as f:
        config_dict = yaml.safe_load(f)[config_name]
    noise_scale = config_dict["noise_scale"]
    reset_every = config_dict["reset_every"]
    num_examples = config_dict["num_examples"]
    push_scale = config_dict["push_scale"]

    factory = get_env_factory(config_dict, config_name, True, False)

    env = factory.make_env()
    env.seed()

    if config_name == "pendulum2d":
        state_scaler = DummyNet()
        actor_model = Pendulum2DSolver(env)
        model_dict = ModelDict()
        model_dict.set(ModelKey.state_scaler, state_scaler)
        model_dict.set(ModelKey.actor, actor_model)
    else:
        model_dict = torch.load(config_dict['expert_path'], map_location='cpu')

    state_scaler = model_dict.get(ModelKey.state_scaler)
    actor_model = model_dict.get(ModelKey.actor)

    action_getter = ActionGetterFromState(state_scaler, actor_model)

    perturb_every = 61  # apply a random push this often (in recorded steps)

    # collect state-action pairs
    tups = []
    print('Collect state-action pairs')
    state = env.reset()
    done = False
    burn_in_counter = 40  # discard the first few ill-defined states
    pbar = tqdm(total=num_examples)
    example_counter = 0
    trajectory_counter = 0
    while example_counter < num_examples:

        if done or trajectory_counter >= reset_every:
            state = env.reset()
            burn_in_counter = 40
            trajectory_counter = 0  # start trajectory over

        phase = env.phase  # current phase in the 0..19 cycle
        next_phase = (phase + 1) % 20
        log_prob = 0  # placeholder log-probability stored with each tuple
        action = action_getter.get_action(state)

        # every perturb_every recorded steps, apply a random, clipped push to
        # the root velocity so the dataset also covers perturbed states
        if trajectory_counter > 0 and trajectory_counter % perturb_every == 0:
            push_strength = np.random.normal(-push_scale, push_scale)
            push_strength = np.clip(push_strength, -push_scale, push_scale)
            dq = env.robot_skeleton.dq
            dq[0] += push_strength
            env.robot_skeleton.set_velocities(dq)

        # add exploration noise to the action before stepping the environment
        noise = np.random.normal(0, noise_scale)
        next_state, reward, done, _ = env.step(action + noise)

        state_ = state.copy()
        action_ = action.copy()
        next_state_ = next_state.copy()

        if config_dict["mirror"]:
            if phase >= 10:
                state_ = env.state_getter._mirror_state(state_)
                action_ = reflect_control_vector(action_)
            if next_phase >= 10:
                next_state_ = env.state_getter._mirror_state(next_state_)

        if burn_in_counter <= 0:
            tups.append(
                TupleSARSP(state_[:-1], phase, action_, log_prob, reward,
                           next_state_[:-1], done))
            example_counter += 1
            trajectory_counter += 1
            pbar.update(1)

        burn_in_counter = max(burn_in_counter - 1, 0)
        state = next_state

    pbar.close()

    print('Turn state-action pairs into a dataset')
    dataset = DatasetSARSP.from_tuple_list(tups)

    data_dict = DataDict(len(tups))
    data_dict.set(DataKey.states, dataset.states)
    data_dict.set(DataKey.actions, dataset.actions)
    data_dict.set(DataKey.next_states, dataset.next_states)
    data_dict.set(DataKey.rewards, dataset.rewards)
    data_dict.set(DataKey.dones, dataset.dones)
    data_dict.set(DataKey.phases, dataset.phases)
    data_dict.set(DataKey.log_probs, dataset.log_probs)

    print('Save dataset as a pickle')
    save_filename = '{:s}_{:07d}.pkl'.format(args.output_prefix, num_examples)
    torch.save(data_dict, save_filename)
    print('Saved to {:s}'.format(save_filename))
Example 6
def main():
    states0 = []
    actions0 = []
    next_states0 = []
    states1 = []
    actions1 = []
    next_states1 = []

    n_steps = 10
    n_trajectories = 1000
    # two fans of straight-line trajectories, one starting along the bottom
    # edge of the unit square and one along the left edge, all ending at (1, 1)
    starting_states0 = np.linspace((0, 0), (1, 0), n_trajectories)
    starting_states1 = np.linspace((0, 0), (0, 1), n_trajectories)
    goal_state = np.array([1., 1.])

    for starting_state0, starting_state1 in zip(starting_states0,
                                                starting_states1):
        state0 = starting_state0
        state1 = starting_state1
        vel0 = (goal_state - starting_state0) / n_steps
        vel1 = (goal_state - starting_state1) / n_steps
        for t in range(n_steps):
            next_state0 = state0 + vel0
            next_state1 = state1 + vel1

            states0.append(state0)
            next_states0.append(next_state0)
            states1.append(state1)
            next_states1.append(next_state1)
            actions0.append(np.asarray([0., 0.]))
            actions1.append(np.asarray([0., 0.]))

            state0 = next_state0
            state1 = next_state1

    n_examples = n_steps * n_trajectories
    # shuffle each dataset independently
    random_idx0 = np.random.choice(range(n_examples),
                                   n_examples,
                                   replace=False)
    states0 = np.stack(states0)[random_idx0]
    next_states0 = np.stack(next_states0)[random_idx0]
    actions0 = np.stack(actions0)[random_idx0]

    random_idx1 = np.random.choice(range(n_examples),
                                   n_examples,
                                   replace=False)
    states1 = np.stack(states1)[random_idx1]
    next_states1 = np.stack(next_states1)[random_idx1]
    actions1 = np.stack(actions1)[random_idx1]

    dataset0 = DataDict(n_examples)
    dataset0.set(DataKey.states, states0)
    dataset0.set(DataKey.next_states, next_states0)
    dataset0.set(DataKey.actions, actions0)

    dataset1 = DataDict(n_examples)
    dataset1.set(DataKey.states, states1)
    dataset1.set(DataKey.next_states, next_states1)
    dataset1.set(DataKey.actions, actions1)

    torch.save(dataset0, "./data/wedges0.pkl")
    torch.save(dataset1, "./data/wedges1.pkl")
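
The pickles written in this and the other examples can be read back with torch.load. A minimal sketch, assuming the project's DataDict and DataKey classes are importable at load time (torch.save pickles the whole object, so the class definitions must be available):

import torch

# DataDict / DataKey come from the project's own modules and must be importable.
data_dict = torch.load("./data/wedges0.pkl", map_location="cpu")
states = data_dict.get(DataKey.states)
print(data_dict.n_examples, states.shape)  # 10000 (10000, 2)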
Example 7
    def collect_data_dict(self, model_dict: ModelDict, envs_container: EnvsContainer) -> DataDict:
        if self.reset_every_collection:
            envs_container.states = envs_container.envs.reset()
        tup_matrix = self.collect_experience_tuple_matrix(model_dict, envs_container)
        state_dim = envs_container.env.observation_space.shape[0]
        action_dim = envs_container.env.action_space.shape[0]
        n, t = tup_matrix.shape
        # use builtin-backed dtypes (the np.float / np.bool / np.int aliases
        # were removed in newer NumPy releases)
        states_matrix = np.zeros((n, t, state_dim), dtype=np.float64)
        next_states_matrix = np.zeros((n, t, state_dim), dtype=np.float64)
        actions_matrix = np.zeros((n, t, action_dim), dtype=np.float64)
        rewards_matrix = np.zeros((n, t), dtype=np.float64)
        dones_matrix = np.zeros((n, t), dtype=bool)
        # phases_matrix = np.zeros((n, t), dtype=np.int64)
        log_probs_matrix = np.zeros((n, t), dtype=np.float64)
        value_predictions_matrix = np.zeros((n, t), dtype=np.float64)

        critic = model_dict.get(ModelKey.critic)
        state_scaler = model_dict.get(ModelKey.state_scaler)

        for i in range(n):
            for j in range(t):
                tup = tup_matrix[i, j]
                states_matrix[i, j] = tup.state
                actions_matrix[i, j] = tup.action
                next_states_matrix[i, j] = tup.next_state
                rewards_matrix[i, j] = tup.reward
                dones_matrix[i, j] = tup.done
                # phases_matrix[i, j] = tup.phase
                log_probs_matrix[i, j] = tup.log_prob
                state_tensor = torch.as_tensor(tup.state.reshape(1, -1)).float()
                state_scaled_tensor = state_scaler.forward(state_tensor)
                value_prediction, _ = critic.forward(state_scaled_tensor)
                value_predictions_matrix[i, j] = value_prediction.cpu().detach().numpy().squeeze()

        cumulative_rewards_matrix = compute_cumulative_rewards_matrix(value_predictions_matrix, next_states_matrix,
                                                                      rewards_matrix, dones_matrix,
                                                                      critic, state_scaler)
        # standardize advantages to zero mean / unit variance before training
        advantages_matrix = cumulative_rewards_matrix - value_predictions_matrix
        advantages_reshaped = advantages_matrix.reshape(-1, 1)
        advantage_scaler = StandardScaler()
        advantages_reshaped_scaled = advantage_scaler.fit_transform(advantages_reshaped)

        data_dict = DataDict(n * t)
        data_dict.set(DataKey.states, states_matrix.reshape(-1, state_dim))
        data_dict.set(DataKey.actions, actions_matrix.reshape(-1, action_dim))
        data_dict.set(DataKey.next_states, next_states_matrix.reshape(-1, state_dim))
        data_dict.set(DataKey.log_probs, log_probs_matrix.reshape(-1, ))
        data_dict.set(DataKey.rewards, rewards_matrix.reshape(-1, ))
        data_dict.set(DataKey.dones, dones_matrix.reshape(-1, ))
        data_dict.set(DataKey.advantages, advantages_reshaped_scaled.reshape(-1, ))
        data_dict.set(DataKey.cumulative_rewards, cumulative_rewards_matrix.reshape(-1, ))

        return data_dict
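
The StandardScaler fit above is, for a single flattened column, the usual zero-mean / unit-variance normalization of the advantages. A small equivalent sketch in plain NumPy (the 1e-8 guard against a zero standard deviation is an addition, not something the example uses):

import numpy as np

def normalize_advantages(cumulative_rewards, value_predictions, eps=1e-8):
    # subtract the mean and divide by the (population) standard deviation,
    # matching what StandardScaler does on a single flattened column
    adv = cumulative_rewards - value_predictions
    return (adv - adv.mean()) / (adv.std() + eps)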
Example 8
    def collect_data_dict(self, model_dict: ModelDict, envs_container: EnvsContainer,
                          use_action_replay=True) -> DataDict:
        tuple_array = self.collect_experience_tuple_array(model_dict, envs_container)
        tuple_list = np.stack(tuple_array).reshape(-1, ).tolist()
        if use_action_replay:
            # mix the freshly collected tuples with samples drawn from the
            # replay buffer, and push the new tuples into the buffer
            tups_from_buffer = self.sample_from_buffer()
            for tup in tuple_list:
                self.buffer.append(tup)
            tuple_list.extend(tups_from_buffer)
        dataset = DatasetSARS.from_tuple_list(tuple_list)
        # size the DataDict to the full (possibly extended) tuple list
        data_dict = DataDict(len(tuple_list))
        data_dict.set(DataKey.states, dataset.states)
        data_dict.set(DataKey.actions, dataset.actions)
        data_dict.set(DataKey.next_states, dataset.next_states)
        data_dict.set(DataKey.log_probs, dataset.log_probs)
        data_dict.set(DataKey.rewards, dataset.rewards)
        data_dict.set(DataKey.dones, dataset.dones)
        # data_dict.set(DataKey.phases, dataset.phases)
        return data_dict
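
The buffer members used here (self.buffer and sample_from_buffer) are not shown in the example. One plausible shape, sketched as a standalone class for clarity: the capacity and sample size are made-up parameters, and in the example the two members live directly on the collector rather than on a separate object.

import random
from collections import deque


class ReplayBuffer:
    """Hypothetical fixed-capacity buffer matching the calls in Example 8."""

    def __init__(self, capacity: int = 100_000, sample_size: int = 4096):
        self.buffer = deque(maxlen=capacity)
        self.sample_size = sample_size

    def append(self, tup):
        self.buffer.append(tup)

    def sample_from_buffer(self):
        # uniform sampling without replacement, capped at the buffer size
        k = min(self.sample_size, len(self.buffer))
        return random.sample(list(self.buffer), k)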
Example 9
    def get_data(self, data_dict: DataDict, model_dict: ModelDict,
                 batch_idx: np.ndarray):
        # pass-through provider: return the raw batch rows without scaling
        data = data_dict.get(self.data_key)
        return data[batch_idx]