def get_shard_size(self):
    """Gets size of shards on disk.

    Returns
    -------
    int
      Number of datapoints stored in a single shard.

    Raises
    ------
    ValueError
      If the dataset contains no shards.
    """
    if not len(self.metadata_df):
        raise ValueError("No data in dataset.")
    # Count via the 'ids' column rather than 'y': unlabelled datasets store
    # None in the 'y' column (see get_shard), while 'ids' is always written
    # to disk, so this works for labelled and unlabelled data alike.
    sample_ids = load_from_disk(
        os.path.join(self.data_dir,
                     next(self.metadata_df.iterrows())[1]['ids']))
    return len(sample_ids)
def __len__(self):
    """Return the total number of datapoints across all shards."""
    return sum(
        len(load_from_disk(os.path.join(self.data_dir, shard_row['ids'])))
        for _, shard_row in self.metadata_df.iterrows())
def get_data_shape(self):
    """Return the array shape of a single datapoint in this dataset.

    The leading (per-shard batch) axis is stripped off, leaving only the
    per-datapoint feature shape.

    Raises
    ------
    ValueError
      If the dataset contains no shards.
    """
    if not len(self.metadata_df):
        raise ValueError("No data in dataset.")
    first_row = next(self.metadata_df.iterrows())[1]
    sample_X = load_from_disk(os.path.join(self.data_dir, first_row['X']))
    return np.shape(sample_X)[1:]
def get_shard(self, i):
    """Retrieve the data for the i-th shard from disk.

    Returns a tuple ``(X, y, w, ids)``; ``y`` and ``w`` are None when the
    corresponding metadata entries are None (e.g. unlabelled datasets).
    """
    row = self.metadata_df.iloc[i]

    def _load(filename):
        # All shard files live under the dataset's data directory.
        return load_from_disk(os.path.join(self.data_dir, filename))

    X = np.array(_load(row['X']))
    ids = np.array(_load(row['ids']), dtype=object)

    y = np.array(_load(row['y'])) if row['y'] is not None else None

    if row['w'] is None:
        w = None
    else:
        # TODO (ytz): Under what condition does this exist but the file
        # itself doesn't?
        w_path = os.path.join(self.data_dir, row['w'])
        if os.path.exists(w_path):
            w = np.array(load_from_disk(w_path))
        else:
            w = np.ones(y.shape)

    return (X, y, w, ids)
def iterate(dataset):
    """Yield ``(X, y, w, ids)`` for every shard of *dataset*, in metadata order."""
    data_dir = dataset.data_dir
    for _, row in dataset.metadata_df.iterrows():
        X = np.array(load_from_disk(os.path.join(data_dir, row['X'])))
        ids = np.array(
            load_from_disk(os.path.join(data_dir, row['ids'])), dtype=object)
        # These columns may be missing if the dataset is unlabelled.
        y = (np.array(load_from_disk(os.path.join(data_dir, row['y'])))
             if row['y'] is not None else None)
        if row['w'] is None:
            w = None
        else:
            w_path = os.path.join(data_dir, row['w'])
            if os.path.exists(w_path):
                w = np.array(load_from_disk(w_path))
            else:
                w = np.ones(y.shape)
        yield (X, y, w, ids)
def load_metadata(self):
    """Load ``(tasks, metadata_df)`` for this dataset from disk.

    First tries the current on-disk format (a tasks JSON file plus a
    gzip-compressed CSV).  If that fails for any reason, falls back to the
    obsolete joblib format and, when found, converts it to the new format
    on the way out.

    Returns
    -------
    tuple
      ``(tasks, metadata_df)``.

    Raises
    ------
    ValueError
      If metadata in neither format is found on disk.
    """
    try:
        tasks_filename, metadata_filename = self._get_metadata_filename()
        with open(tasks_filename) as fin:
            tasks = json.load(fin)
        metadata_df = pd.read_csv(metadata_filename, compression='gzip')
        # Normalize pandas NaN entries to None so shard loaders can use
        # simple `is None` checks on the 'y'/'w' columns.
        metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
        return tasks, metadata_df
    except Exception:
        # Deliberate broad catch: any failure here (missing files, parse
        # errors) means we should attempt the legacy format below.
        pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
        tasks, metadata_df = load_from_disk(metadata_filename)
        # These columns existed only in the legacy layout; drop them before
        # persisting in the new format.
        del metadata_df['task_names']
        del metadata_df['basename']
        save_metadata(tasks, metadata_df, self.data_dir)
        return tasks, metadata_df

    raise ValueError("No Metadata Found On Disk")