Example 1
def sample_paths_one_core(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None,
                          mode='sample'):
    """
    params:
    N               : minimum number of samples (env steps) to collect
    policy          : policy to be used to sample the data
    T               : maximum length of trajectory
    env             : env object to sample from
    env_name        : name of env to be sampled from
                      (one of env or env_name must be specified)
    pegasus_seed    : seed for environment (numpy seed must be set externally)
    mode            : 'sample' for training-time rollouts, 'evaluation' for test-time rollouts
    """

    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None: env = get_environment(env_name)
    # if pegasus_seed is not None: env.env._seed(pegasus_seed)
    T = min(T, env.horizon)

    start_time = timer.time()

    # print("####### Gathering Samples #######")
    sampled_so_far = 0
    paths = []
    seed = pegasus_seed if pegasus_seed is not None else 0

    while sampled_so_far < N:
        if mode == 'sample':
            this_path = base_sampler.do_rollout(1, policy, T, env, env_name,
                                                seed)  # do 1 rollout
        elif mode == 'evaluation':
            this_path = eval_sampler.do_evaluation_rollout(
                1, policy, env, env_name, seed)
        else:
            # print("Mode has to be either 'sample' for training time or 'evaluation' for test time performance")
            break
        paths.append(this_path[0])
        seed += 1
        sampled_so_far += len(this_path[0]["rewards"])

    # print("======= Samples Gathered  ======= | >>>> Time taken = %f " % (timer.time()-start_time) )
    # print("................................. | >>>> # samples = %i # trajectories = %i " % (sampled_so_far, len(paths)) )
    return paths
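
A minimal usage sketch for the sampler above, assuming the surrounding project's helpers (base_sampler, eval_sampler, get_environment) are importable; the env id and the policy object are placeholders, not values from the original code.

# Hypothetical usage: 'relocate-v0' is a placeholder env id and `policy`
# stands for any policy object accepted by the project's samplers.
train_paths = sample_paths_one_core(N=5000,            # collect at least 5000 env steps
                                    policy=policy,
                                    T=500,              # cap each rollout at 500 steps
                                    env_name='relocate-v0',
                                    pegasus_seed=123,   # rollouts get seeds 123, 124, ...
                                    mode='sample')
num_steps = sum(len(p["rewards"]) for p in train_paths)
num_trajs = len(train_paths)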
Example 2
def do_evaluation_rollout(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None):
    """
    params:
    N               : number of trajectories
    policy          : policy to be used to sample the data
    T               : maximum length of trajectory
    env             : env object to sample from
    env_name        : name of env to be sampled from
                      (one of env or env_name must be specified)
    pegasus_seed    : seed for environment (numpy seed must be set externally)
    """

    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None: env = get_environment(env_name)
    if pegasus_seed is not None:
        try:
            env.env._seed(pegasus_seed)
        except AttributeError:
            env.env.seed(pegasus_seed)
    T = min(T, env.horizon)

    # print("####### Worker started #######")

    paths = []

    for ep in range(N):

        # Set pegasus seed if asked
        if pegasus_seed is not None:
            seed = pegasus_seed + ep
            try:
                env.env._seed(seed)
            except AttributeError:
                env.env.seed(seed)
            np.random.seed(seed)
        else:
            np.random.seed()

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        done = False
        t = 0

        while t < T and not done:
            _, agent_info = policy.get_action(o)
            a = agent_info['evaluation']
            next_o, r, done, env_info = env.step(a)
            # observations.append(o.ravel())
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)

        paths.append(path)

    # print("====== Worker finished ======")

    return paths
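
A sketch of how this evaluation rollout function might be called, with a dummy policy exposing the get_action interface the loop above expects; the env id, action dimension, and policy class are illustrative assumptions.

import numpy as np

class RandomEvalPolicy:
    """Stand-in policy exposing get_action(o) -> (action, agent_info)."""
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def get_action(self, o):
        a = np.random.uniform(-1.0, 1.0, self.action_dim)
        return a, {'evaluation': a}  # the rollout reads agent_info['evaluation']

# 'relocate-v0' is a placeholder env id resolved by the project's get_environment.
eval_paths = do_evaluation_rollout(N=5,
                                   policy=RandomEvalPolicy(action_dim=4),
                                   T=200,
                                   env_name='relocate-v0',
                                   pegasus_seed=500)
mean_return = np.mean([np.sum(p["rewards"]) for p in eval_paths])
print(f"mean return over {len(eval_paths)} episodes: {mean_return:.2f}")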
Example 3
    def trajectory_generator(self, beta, dagger_ep):
        if self.env_name is None and self.env is None:
            print("No environment specified! Error will be raised")
        if self.env is None: self.env = get_environment(self.env_name)

        T = self.env.horizon

        paths = []
        print('Generating trajectories')
        for ep in tqdm(range(self.num_traj_gen)):
            if self.seed is not None:
                seed = self.seed + ep + dagger_ep * self.num_traj_gen
                self.env.env.env._seed(seed)
                np.random.seed(seed)
            else:
                np.random.seed()
            observations = []
            actions = []
            rewards = []
            agent_infos = []
            env_infos = []
            path_image_pixels = []
            all_robot_info = []

            o = self.env.reset()
            robot_info = None
            if self.has_robot_info:
                o, env_info = self.env.reset()
                robot_info = env_info['robot_info']
            done = False
            t = 0

            while t < T and not done:
                # Draw for beta-mixing; r is later overwritten by the reward from env.step
                r = np.random.random()
                image_pix = self.env.get_pixels(frame_size=FRAME_SIZE,
                                                camera_name=self.camera_name,
                                                device_id=self.device_id)

                a_expert, agent_info_expert = self.expert_policy.get_action(o)

                img = image_pix
                prev_img = image_pix
                prev_prev_img = image_pix
                if t > 0:
                    prev_img = path_image_pixels[t - 1]
                if t > 1:
                    prev_prev_img = path_image_pixels[t - 2]

                prev_prev_img = np.expand_dims(prev_prev_img, axis=0)
                prev_img = np.expand_dims(prev_img, axis=0)
                img = np.expand_dims(img, axis=0)

                o_img = np.concatenate((prev_prev_img, prev_img, img), axis=0)
                a_viz, agent_info_viz = self.viz_policy.get_action(
                    o_img,
                    use_seq=self.use_seq,
                    use_cuda=self.use_cuda,
                    robot_info=robot_info)

                # With probability beta, execute the expert's action; otherwise the visual policy's
                if r <= beta:
                    a = agent_info_expert['evaluation']
                    agent_info = agent_info_expert
                else:
                    a = a_viz
                    agent_info = agent_info_viz

                next_o, r, done, env_info = self.env.step(a)
                observations.append(o)
                # DAgger: always record the expert action as the supervision label
                actions.append(agent_info_expert['evaluation'])
                rewards.append(r)
                agent_infos.append(agent_info)
                env_infos.append(env_info)
                path_image_pixels.append(image_pix)
                if self.has_robot_info:
                    all_robot_info.append(robot_info)
                    robot_info = env_info['robot_info']

                o = next_o
                t += 1

            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
                agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
                env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                terminated=done,
                image_pixels=np.array(path_image_pixels))
            if self.has_robot_info:
                path['robot_info'] = np.array(all_robot_info)

            paths.append(path)
        return paths
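
The generator above feeds the visual policy a stack of the current frame and the two previous frames along a new leading axis. A self-contained sketch of that stacking with dummy frames (the frame size is an assumption; the real FRAME_SIZE is project-specific):

import numpy as np

FRAME_SIZE = (128, 128)  # assumed (height, width)
frames = [np.zeros((*FRAME_SIZE, 3), dtype=np.uint8) for _ in range(5)]  # dummy pixels

t = 4
img = frames[t]
prev_img = frames[t - 1] if t > 0 else img        # repeat the current frame at t = 0
prev_prev_img = frames[t - 2] if t > 1 else img   # and at t = 1
o_img = np.stack((prev_prev_img, prev_img, img), axis=0)
assert o_img.shape == (3, *FRAME_SIZE, 3)  # (history, H, W, channels)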
Example 4
    def __init__(self,
                 dagger_epochs,
                 expert_policy,
                 viz_policy,
                 old_data_loader: DataLoader,
                 val_data_loader: DataLoader,
                 log_dir,
                 pol_dir_name,
                 has_robot_info=False,
                 beta_start=1.0,
                 beta_decay=0.9,
                 beta_cutoff=0.0,
                 optimizer=None,
                 camera_name=None,
                 lr=3e-4,
                 log_step=10,
                 bins=0,
                 use_img=True,
                 use_seq=True,
                 trainer_epochs=5,
                 num_traj_gen=20,
                 env_name=None,
                 env=None,
                 save_epoch=1,
                 eval_num_traj=25,
                 seed=500,
                 sliding_window=0,
                 device_id=None,
                 use_cuda=False):

        self.beta = beta_start
        self.dagger_epochs = dagger_epochs
        self.expert_policy = expert_policy
        self.viz_policy = viz_policy
        self.old_data_loader = old_data_loader
        self.beta_decay = beta_decay
        self.camera_name = camera_name
        self.has_robot_info = has_robot_info
        self.beta_cutoff = beta_cutoff
        self.log_step = log_step
        self.bins = bins
        self.pol_dir_name = pol_dir_name
        self.use_img = use_img
        self.use_seq = use_seq
        self.trainer_epochs = trainer_epochs
        self.val_data_loader = val_data_loader
        self.num_traj_gen = num_traj_gen
        self.eval_num_traj = eval_num_traj
        self.env = env
        self.env_name = env_name
        self.save_epoch = save_epoch
        self.sliding_window = sliding_window
        self.device_id = device_id
        self.use_cuda = use_cuda

        # filewriters
        self.log_tf_train = Logger(os.path.join(LOG_DIR, log_dir))
        self.log_tf_val = Logger(os.path.join(LOG_DIR, log_dir, 'validation'))
        self.log_expert = Logger(os.path.join(LOG_DIR, log_dir, 'expert'))
        self.log_viz = Logger(os.path.join(LOG_DIR, log_dir, 'viz'))

        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.viz_policy.trainable_params,
            lr=lr) if optimizer is None else optimizer

        self.seed = seed
        if self.env_name is None and self.env is None:
            print("No environment specified! Error will be raised")
        if self.env is None: self.env = get_environment(self.env_name)

        self.expert_reward, _, _ = self.env.evaluate_policy(
            self.expert_policy,
            num_episodes=self.eval_num_traj,
            mean_action=True,
            seed=self.seed,
            device_id=self.device_id)
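
The constructor's beta_start, beta_decay, and beta_cutoff parameters suggest a multiplicative decay of the expert-mixing coefficient across DAgger epochs. A sketch of such a schedule, assuming the (not shown) training loop multiplies beta by beta_decay each epoch and floors it at beta_cutoff:

# Hypothetical schedule; the actual update rule lives in the project's training loop.
beta_start, beta_decay, beta_cutoff = 1.0, 0.9, 0.0
beta = beta_start
for dagger_ep in range(10):
    # trajectory_generator(beta, dagger_ep) would be called here to collect data
    print(f"DAgger epoch {dagger_ep}: beta = {beta:.3f}")
    beta = max(beta * beta_decay, beta_cutoff)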