Example #1
0
 def get_shard_size(self):
     """Return the number of datapoints stored in a single on-disk shard.

     Reads the ``y`` array of the first shard listed in the metadata and
     reports its length.
     """
     if len(self.metadata_df) == 0:
         raise ValueError("No data in dataset.")
     _, first_row = next(self.metadata_df.iterrows())
     y_path = os.path.join(self.data_dir, first_row['y'])
     return len(load_from_disk(y_path))
Example #2
0
 def __len__(self):
     """Count the total number of datapoints across every shard.

     Sums the length of each shard's ``ids`` array as listed in the
     metadata dataframe.
     """
     shard_lengths = (
         len(load_from_disk(os.path.join(self.data_dir, row['ids'])))
         for _, row in self.metadata_df.iterrows())
     return sum(shard_lengths)
Example #3
0
    def get_data_shape(self):
        """Return the per-datapoint shape of this dataset's feature arrays.

        Loads the first shard's ``X`` array and strips its leading
        (datapoint) axis, leaving only the dimensions of one datapoint.
        """
        if len(self.metadata_df) == 0:
            raise ValueError("No data in dataset.")
        _, first_row = next(self.metadata_df.iterrows())
        first_X = load_from_disk(
            os.path.join(self.data_dir, first_row['X']))
        return np.shape(first_X)[1:]
Example #4
0
    def get_shard(self, i):
        """Load the i-th shard's ``(X, y, w, ids)`` arrays from disk.

        ``y`` and ``w`` come back as None when their metadata entries are
        absent (e.g. an unlabelled dataset). When the ``w`` entry is present
        but its file is missing, all-ones weights are substituted.
        """
        row = self.metadata_df.iloc[i]

        def _load(column):
            # Read this shard's array for one metadata column.
            return np.array(
                load_from_disk(os.path.join(self.data_dir, row[column])))

        X = _load('X')
        y = _load('y') if row['y'] is not None else None

        if row['w'] is None:
            w = None
        else:
            # TODO (ytz): Under what condition does this exist but the file itself doesn't?
            w_filename = os.path.join(self.data_dir, row['w'])
            if os.path.exists(w_filename):
                w = np.array(load_from_disk(w_filename))
            else:
                w = np.ones(y.shape)

        ids = np.array(load_from_disk(os.path.join(self.data_dir, row['ids'])),
                       dtype=object)
        return (X, y, w, ids)
Example #5
0
        def iterate(dataset):
            """Yield one ``(X, y, w, ids)`` tuple per shard, in metadata order."""
            data_dir = dataset.data_dir
            for _, row in dataset.metadata_df.iterrows():
                X = np.array(
                    load_from_disk(os.path.join(data_dir, row['X'])))
                ids = np.array(
                    load_from_disk(os.path.join(data_dir, row['ids'])),
                    dtype=object)
                # y and w may be missing if the dataset is unlabelled.
                if row['y'] is None:
                    y = None
                else:
                    y = np.array(
                        load_from_disk(os.path.join(data_dir, row['y'])))
                if row['w'] is None:
                    w = None
                else:
                    w_filename = os.path.join(data_dir, row['w'])
                    if os.path.exists(w_filename):
                        w = np.array(load_from_disk(w_filename))
                    else:
                        # Weight entry exists but the file is missing;
                        # fall back to all-ones weights.
                        w = np.ones(y.shape)
                yield (X, y, w, ids)
Example #6
0
    def load_metadata(self):
        """Load dataset tasks and shard metadata from disk.

        Tries the current on-disk format first (a JSON tasks file plus a
        gzip-compressed CSV of shard metadata). If that fails for any
        reason, falls back to the obsolete joblib format and, when found,
        rewrites it in the new format.

        Returns:
          Tuple of ``(tasks, metadata_df)``.

        Raises:
          ValueError: if metadata in neither format is found on disk.
        """
        try:
            tasks_filename, metadata_filename = self._get_metadata_filename()
            with open(tasks_filename) as fin:
                tasks = json.load(fin)
            metadata_df = pd.read_csv(metadata_filename, compression='gzip')
            # Normalize pandas NaN entries to None so callers can test
            # columns with `is not None`.
            metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
            return tasks, metadata_df
        except Exception:
            # Deliberate best-effort: any failure here (missing or corrupt
            # new-format files) means we should try the legacy format below
            # rather than crash.
            pass

        # Load obsolete format -> save in new format
        metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
        if os.path.exists(metadata_filename):
            tasks, metadata_df = load_from_disk(metadata_filename)
            # These columns existed only in the legacy layout; drop them so
            # the rewritten metadata matches the current schema.
            del metadata_df['task_names']
            del metadata_df['basename']
            save_metadata(tasks, metadata_df, self.data_dir)
            return tasks, metadata_df
        raise ValueError("No Metadata Found On Disk")