def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
    """Build an exam-level dataset for one split and register it with Emmental.

    Reads `<dataset_dir>/<split_str>.csv` (indexed by exam id) and the labels
    CSV (two-level column header), optionally restricts the split to exams
    that carry instance-level labels, then collects per-exam labels via
    `self.get_y` into the Emmental X/Y dicts.

    Args:
        dataset_dir: directory containing one `<split>.csv` file per split.
        split_str: split name, e.g. 'train' / 'valid' / 'test'.
        labels_path: path to the labels CSV read with `header=[0, 1]`.
        transform_fns: list of dicts with keys 'fn' and 'args' describing
            transforms; recognized here: 'shuffle' and 'extract_instance'.
    """
    split_path = path.join(dataset_dir, f'{split_str}.csv')
    self.split_df = pd.read_csv(split_path, index_col=0)
    self.split_str = split_str
    self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])

    self.exam_ids = list(self.split_df.index.unique())
    self.transform_fns = transform_fns
    self.shuffle_transform = 'shuffle' in [f['fn'] for f in transform_fns]

    self.instance_transform = None
    for f in transform_fns:
        # only extract instances if asked to do so and specified for split
        if 'extract_instance' == f['fn'] and split_str in f['args']['splits']:
            self.instance_transform = f['args']
            logger.info(f"using instance extraction on {f['args']['splits']} splits")
            break

    # Idiom fix: `is not None` instead of `!= None`.
    if self.instance_transform is not None and self.instance_transform.get('instance_only', False):
        # only access exam_ids with instance level labels
        # NOTE(review): assumes a 'label.lv' column marks instance-level
        # labels; NaN means absent — confirm against the split CSV schema.
        exam_ids = []
        for exam_id in self.exam_ids:
            rows = self.split_df.loc[exam_id]
            if isinstance(rows, pd.Series):
                # exam id maps to a single row
                if not np.isnan(rows['label.lv']):
                    exam_ids.append(exam_id)
            else:
                # exam id maps to multiple rows; first row decides
                if not np.isnan(rows.iloc[0]['label.lv']):
                    exam_ids.append(exam_id)
        logger.info(f'using {len(exam_ids)} of {len(self.exam_ids)} exam_ids')
        self.exam_ids = exam_ids
    else:
        logger.info(f'using {len(self.exam_ids)} exam_ids')

    X_dict = {'exam_ids': []}
    Y_dict = {
        'primary': [],
        'primary_multiclass': [],
        '2normal_binary': []  # labels: control, 1,2 (normal) | 3,4 (abnormal),
    }
    # Unused enumerate index removed; iterate exam ids directly.
    for exam_id in self.exam_ids:
        X_dict['exam_ids'].append(exam_id)
        y_dict = self.get_y(exam_id)
        for t, label in y_dict.items():
            Y_dict[t].append(label)
    Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
    EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
    """Build a loop-level dataset for one split and register it with Emmental.

    Reads `<dataset_dir>/<split_str>.csv` and the labels CSV (two-level column
    header), keeps only the rows whose loop-type-specific label column exists
    and is non-NaN, then collects per-loop labels via `self.get_y` into the
    Emmental X/Y dicts.

    Args:
        dataset_dir: directory containing one `<split>.csv` file per split.
        split_str: split name, e.g. 'train' / 'valid' / 'test'.
        labels_path: path to the labels CSV read with `header=[0, 1]`.
        transform_fns: list of transform descriptors; stored on the instance
            but not interpreted here.
    """
    split_path = path.join(dataset_dir, f'{split_str}.csv')
    self.split_df = pd.read_csv(split_path, index_col=0)
    self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])

    self.loop_idxs = range(len(self.split_df))
    # use df.iloc because loop_ids are not unique...
    loop_idxs = []
    for loop_idx in self.loop_idxs:
        row = self.split_df.iloc[loop_idx]
        loop_type = row['exdir.loop_type']
        # Keep the row only if its loop-type label column is present and
        # labeled (non-NaN). `in row.index` is the Series-idiomatic form of
        # the original `in row.keys()` membership check.
        if f'label.{loop_type}' in row.index and not np.isnan(row[f'label.{loop_type}']):
            loop_idxs.append(loop_idx)
    logger.info(f'using {len(loop_idxs)} of {len(self.loop_idxs)} loop_idxs')
    self.loop_idxs = loop_idxs

    self.transform_fns = transform_fns

    X_dict = {'loop_idxs': []}
    Y_dict = {
        'primary': [],
        'primary_multiclass': [],
        '2normal_binary': []  # labels: control, 1,2 (normal) | 3,4 (abnormal),
    }
    # Unused enumerate index removed; iterate loop indices directly.
    for loop_idx in self.loop_idxs:
        X_dict['loop_idxs'].append(loop_idx)
        y_dict = self.get_y(loop_idx)
        for t, label in y_dict.items():
            Y_dict[t].append(label)
    Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
    EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)