def process_raw_path(self, data_path, label_path):
    """Read per-category point clouds (`*.pts`) and segmentation labels
    (`*.seg`), remapping each category's raw labels into one global,
    contiguous label space.

    Returns:
        (data_list, y_mask): the list of `Data` objects and a boolean mask
        of shape [num_categories, num_global_labels] marking which global
        labels belong to each category.
    """
    y_offset = 0
    data_list = []
    cat_ys = []
    for cat_idx, cat in enumerate(self.categories):
        idx = self.category_ids[cat]
        point_paths = sorted(glob.glob(osp.join(data_path, idx, '*.pts')))
        y_paths = sorted(glob.glob(osp.join(label_path, idx, '*.seg')))
        points = [read_txt_array(path) for path in point_paths]
        ys = [read_txt_array(path, dtype=torch.long) for path in y_paths]
        # Remember each sample's point count so the concatenated label
        # vector can be split back per sample afterwards.
        lens = [y.size(0) for y in ys]
        # Concatenate all labels of this category and compress them to a
        # contiguous 0..K-1 range, then shift by y_offset so labels are
        # globally unique across categories.
        y = torch.cat(ys).unique(return_inverse=True)[1] + y_offset
        cat_ys.append(y.unique())
        y_offset = y.max().item() + 1
        ys = y.split(lens)
        for (pos, y) in zip(points, ys):
            data = Data(y=y, pos=pos, category=cat_idx)
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
    # One row per category; True where a global label belongs to it.
    y_mask = torch.zeros((len(self.categories), y_offset), dtype=torch.bool)
    for i in range(len(cat_ys)):
        y_mask[i, cat_ys[i]] = 1
    return data_list, y_mask
def process_raw_path(self, data_path, label_path):
    """Read per-category point clouds (`*.pts`) and segmentation labels
    (`*.seg`), remapping each category's raw labels into one global,
    contiguous label space.

    Debug `print` statements and commented-out code were removed; the
    computation is unchanged.

    Returns:
        (data_list, y_mask): the list of `Data` objects and a boolean mask
        of shape [num_categories, num_global_labels] marking which global
        labels belong to each category.
    """
    y_offset = 0
    data_list = []
    cat_ys = []
    for cat_idx, cat in enumerate(self.categories):
        idx = self.category_ids[cat]
        point_paths = sorted(glob.glob(osp.join(data_path, idx, '*.pts')))
        y_paths = sorted(glob.glob(osp.join(label_path, idx, '*.seg')))
        points = [read_txt_array(path) for path in point_paths]
        ys = [read_txt_array(path, dtype=torch.long) for path in y_paths]

        # Remember each sample's point count so the concatenated label
        # vector can be split back per sample afterwards.
        lens = [y.size(0) for y in ys]
        # Compress this category's labels to a contiguous 0..K-1 range,
        # then shift by y_offset so labels are globally unique.
        y = torch.cat(ys).unique(return_inverse=True)[1] + y_offset
        cat_ys.append(y.unique())
        y_offset = y.max().item() + 1
        ys = y.split(lens)

        for (pos, y) in zip(points, ys):
            data = Data(y=y, pos=pos, category=cat_idx)
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)

    # One row per category; True where a global label belongs to it.
    y_mask = torch.zeros((len(self.categories), y_offset), dtype=torch.bool)
    for i in range(len(cat_ys)):
        y_mask[i, cat_ys[i]] = 1
    return data_list, y_mask
def _process_filenames(self, filenames):
    """Parse raw per-shape text files into `Data` objects.

    Returns `(raw_list, transformed_list)` when a ``pre_transform`` is set,
    otherwise ``([], raw_list)``.
    """
    raw_items = []
    transformed_items = []
    wanted_ids = [self.category_ids[cat] for cat in self.categories]
    id_to_idx = {cid: i for i, cid in enumerate(wanted_ids)}
    has_pre_transform = self.pre_transform is not None

    id_scan = -1
    for name in tq(filenames):
        cat = name.split(osp.sep)[0]
        if cat not in wanted_ids:
            continue
        id_scan += 1

        raw = read_txt_array(osp.join(self.raw_dir, name))
        pos = raw[:, :3]
        x = raw[:, 3:6]
        y = raw[:, -1].type(torch.long)
        # Every point carries its category index.
        category = torch.ones(x.shape[0], dtype=torch.long) * id_to_idx[cat]
        id_scan_tensor = torch.from_numpy(np.asarray([id_scan])).clone()

        data = Data(pos=pos, x=x, y=y, category=category,
                    id_scan=id_scan_tensor)
        data = SaveOriginalPosId()(data)
        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        raw_items.append(data.clone() if has_pre_transform else data)
        if has_pre_transform:
            transformed_items.append(self.pre_transform(data))

    if not has_pre_transform:
        return [], raw_items
    return raw_items, transformed_items
def process_set(self, dataset):
    """Build and collate the ModelNet split named by `dataset`."""
    names_path = osp.join(
        self.raw_dir, "modelnet{}_shape_names.txt".format(self.name))
    with open(names_path, "r") as f:
        categories = sorted(f.read().splitlines())

    split_path = osp.join(
        self.raw_dir, "modelnet{}_{}.txt".format(self.name, dataset))
    with open(split_path, "r") as f:
        split_objects = f.read().splitlines()

    data_list = []
    for target, category in enumerate(categories):
        folder = osp.join(self.raw_dir, category)
        # Keep only the split entries belonging to this category.
        for obj in split_objects:
            if category not in obj:
                continue
            raw = read_txt_array("{}/{}.txt".format(folder, obj.strip()),
                                 sep=",")
            data = Data(pos=raw[:, :3], norm=raw[:, 3:],
                        y=torch.tensor([target]))
            data_list.append(data)

    if self.pre_filter is not None:
        data_list = [d for d in data_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(d) for d in data_list]
    return self.collate(data_list)
def process_set(self, dataset):
    """Build and collate one ModelNet split, deriving labels from filenames."""
    names_path = osp.join(self.raw_dir,
                          f'modelnet{self.name}_shape_names.txt')
    with open(names_path, 'r') as fp:
        categories = fp.read().split('\n')[:-1]
    cate_id = {cate: i for i, cate in enumerate(categories)}

    split_path = osp.join(self.raw_dir, f'modelnet{self.name}_{dataset}.txt')
    with open(split_path, 'r') as fp:
        file_list = fp.read().split('\n')[:-1]

    data_list = []
    with tqdm(file_list) as t:
        for file_name in t:
            # Filenames look like '<category>_<index>'; the category itself
            # may contain underscores, so drop only the last chunk.
            category = '_'.join(file_name.split('_')[:-1])
            sample_path = osp.join(self.raw_dir, category,
                                   f'{file_name}.txt')
            raw = read_txt_array(sample_path, sep=',')
            data = Data(pos=raw[:, :3], norm=raw[:, 3:])
            data.y = torch.tensor([cate_id[category]])
            data_list.append(data)

    if self.pre_filter is not None:
        data_list = [d for d in data_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(d) for d in data_list]
    return self.collate(data_list)
def process_graph(self, triple_path, feature_path):
    """Load one knowledge graph: triples plus entity names.

    Raw entity ids (possibly sparse) are re-indexed to a dense 0..N-1
    range via `assoc`. Commented-out dead code (unused `xs`/pad_sequence
    remnants) was removed.

    Returns:
        (edge_index, rel, assoc, names): sorted [2, E] edge index, the
        relation per edge, the raw-id -> dense-id mapping, and the entity
        name per dense id.
    """
    g1 = read_txt_array(triple_path, sep='\t', dtype=torch.long)
    subj, rel, obj = g1.t()

    name_dict = {}
    with open(feature_path, 'r') as f:
        for line in f:
            info = line.strip().split('\t')
            # Lines missing the name column get an empty name.
            info = info if len(info) == 2 else info + ['']
            seq_str = remove_punc(info[1]).strip()
            if seq_str == "":
                seq_str = '<unk>'
            name_dict[int(info[0])] = seq_str

    # Dense re-indexing: assoc[raw_id] = dense_id, -1 for unseen ids.
    idx = torch.tensor(list(name_dict.keys()))
    assoc = torch.full((idx.max().item() + 1,), -1, dtype=torch.long)
    assoc[idx] = torch.arange(idx.size(0))

    subj, obj = assoc[subj], assoc[obj]
    edge_index = torch.stack([subj, obj], dim=0)
    edge_index, rel = sort_edge_index(edge_index, rel)

    names = [None for _ in range(idx.size(0))]
    for i in name_dict.keys():
        names[assoc[i]] = name_dict[i]

    return edge_index, rel, assoc, names
def process_events(self) -> torch.Tensor:
    """Concatenate all raw event files into one [N, 4] long tensor.

    The fourth column (timestamp) is integer-divided by 15.
    """
    chunks = []
    for path in self.raw_paths:
        chunk = read_txt_array(path, sep='\t', end=4, dtype=torch.long)
        # Coarsen the timestamp column.
        chunk[:, 3] = chunk[:, 3] // 15
        chunks.append(chunk)
    return torch.cat(chunks, dim=0)
def process_graph(
    self,
    triple_path: str,
    feature_path: str,
    embeddings: Dict[str, Tensor],
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Load one knowledge graph with word-embedding node features.

    Raw entity ids are re-indexed to a dense 0..N-1 range; each entity's
    words are embedded (unknown words fall back to '**UNK**') and padded
    into a single feature tensor.
    """
    triples = read_txt_array(triple_path, sep='\t', dtype=torch.long)
    subj, rel, obj = triples.t()

    x_dict = {}
    with open(feature_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                fields = fields + ['**UNK**']
            words = fields[1].lower().split()
            vectors = [embeddings.get(w, embeddings['**UNK**'])
                       for w in words]
            x_dict[int(fields[0])] = torch.stack(vectors, dim=0)

    # assoc[raw_id] = dense_id; -1 for ids without features.
    idx = torch.tensor(list(x_dict.keys()))
    assoc = torch.full((idx.max().item() + 1, ), -1, dtype=torch.long)
    assoc[idx] = torch.arange(idx.size(0))

    subj, obj = assoc[subj], assoc[obj]
    edge_index = torch.stack([subj, obj], dim=0)
    edge_index, rel = sort_edge_index(edge_index, rel)

    xs = [None for _ in range(idx.size(0))]
    for raw_id in x_dict.keys():
        xs[assoc[raw_id]] = x_dict[raw_id]
    x = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True)

    return x, edge_index, rel, assoc
def process_filenames(self, filenames):
    """Parse raw files, tagging each sample with its *global* category index
    (its position in the full `category_ids` table, not the selected
    subset)."""
    data_list = []
    wanted_ids = [self.category_ids[cat] for cat in self.categories]
    all_categories = list(self.category_ids)
    cat_idx = {
        self.category_ids[cat]: all_categories.index(cat)
        for cat in self.categories
    }

    for name in filenames:
        cat = name.split(osp.sep)[0]
        if cat not in wanted_ids:
            continue
        raw = read_txt_array(osp.join(self.raw_dir, name))
        data = Data(pos=raw[:, :3], x=raw[:, 3:6],
                    y=raw[:, -1].type(torch.long), category=cat_idx[cat])
        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        if self.pre_transform is not None:
            data = self.pre_transform(data)
        data_list.append(data)
    return data_list
def _read_file(self, filename):
    """Read one raw sample: columns 0-2 are positions, 3-5 features, and an
    optional 7th column holds integer labels."""
    raw = read_txt_array(filename)
    if raw.shape[1] == 7:
        labels = raw[:, 6].type(torch.long)
    else:
        labels = None
    return Data(pos=raw[:, :3], x=raw[:, 3:6], y=labels)
def process_y_n2id(self, link_path, n2i_curr, n2i_dataset):
    # Load ground-truth entity-alignment links and translate both columns
    # into this instance's node ids. Returns a [2, num_links] LongTensor
    # on `self.device`.
    # NOTE(review): pairs starting with "en" swap the two columns —
    # presumably the link file stores them in the opposite order for
    # those pairs; confirm against the raw data.
    curr_reverse = self.pair[:2] == "en"
    # assoc0/assoc1 map the dataset's node ids to our node ids for each
    # side of the pair.
    assoc0, assoc1 = tuple([self.get_assoc(n2i_curr[i], n2i_dataset[i]) for i in range(2)])
    g1, g2 = read_txt_array(link_path, "\t", dtype=torch.long).t()
    if curr_reverse:
        g1, g2 = g2, g1
    g1 = assoc0[g1]
    g2 = assoc1[g2]
    return torch.stack([g1, g2], dim=0).to(self.device)
def process(self):
    """Read `.vert`/`.tri` mesh pairs per category and save the collated
    dataset to the first processed path."""
    data_list = []
    for cat in self.categories:
        stems = [p[:-4] for p in
                 glob.glob(osp.join(self.raw_dir, f'{cat}*.tri'))]
        # Natural-ish ordering: shorter names first, then lexicographic.
        for stem in sorted(stems, key=lambda e: (len(e), e)):
            pos = read_txt_array(f'{stem}.vert')
            face = read_txt_array(f'{stem}.tri', dtype=torch.long)
            data = Data(pos=pos, face=face.t().contiguous())
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
    torch.save(self.collate(data_list), self.processed_paths[0])
def process(self):
    """Build a single graph with random 70/10/20 node split masks and save
    the collated result.

    Fixes: the docs and labels files were opened without ever being
    closed (leaked handles) — now managed with `with`; an unused
    `data_list` accumulator was removed. Masks stay `uint8` to preserve
    the saved format. The split uses the global NumPy RNG, so it depends
    on the ambient seed.
    """
    edge_path = osp.join(self.raw_dir, '{}_edgelist.txt'.format(self.name))
    edge_index = read_txt_array(edge_path, sep=',', dtype=torch.long).t()

    docs_path = osp.join(self.raw_dir, '{}_docs.txt'.format(self.name))
    content_list = []
    with open(docs_path, 'rb') as f:
        for line in f.readlines():
            line = str(line, encoding="utf-8")
            content_list.append(line.split(","))
    x = torch.from_numpy(np.array(content_list, dtype=float)).to(torch.float)

    label_path = osp.join(self.raw_dir, '{}_labels.txt'.format(self.name))
    content_list = []
    with open(label_path, 'rb') as f:
        for line in f.readlines():
            line = str(line, encoding="utf-8")
            line = line.replace("\r", "").replace("\n", "")
            content_list.append(line)
    y = torch.from_numpy(np.array(content_list, dtype=int)).to(torch.int64)

    data = Data(edge_index=edge_index, x=x, y=y)

    # Random 70% train / 10% val / 20% test node split.
    random_node_indices = np.random.permutation(y.shape[0])
    training_size = int(len(random_node_indices) * 0.7)
    val_size = int(len(random_node_indices) * 0.1)
    train_node_indices = random_node_indices[:training_size]
    val_node_indices = random_node_indices[
        training_size:training_size + val_size]
    test_node_indices = random_node_indices[training_size + val_size:]

    train_masks = torch.zeros([y.shape[0]], dtype=torch.uint8)
    train_masks[train_node_indices] = 1
    val_masks = torch.zeros([y.shape[0]], dtype=torch.uint8)
    val_masks[val_node_indices] = 1
    test_masks = torch.zeros([y.shape[0]], dtype=torch.uint8)
    test_masks[test_node_indices] = 1
    data.train_mask = train_masks
    data.val_mask = val_masks
    data.test_mask = test_masks

    if self.pre_transform is not None:
        data = self.pre_transform(data)

    data, slices = self.collate([data])
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    """Read the point-cloud samples listed in the split file and save the
    collated dataset."""
    with open(self.raw_paths[0], "r") as f:
        filenames = f.read().split('\n')[:-1]

    data_list = []
    for filename in filenames:
        base = osp.join(self.raw_dir, filename)
        pos = read_txt_array(base + '.xyz')
        normals = read_txt_array(base + '.normals')
        curv = read_txt_array(base + '.curv')
        pidx = read_txt_array(base + '.pidx', dtype=torch.long)
        # Features = normals concatenated with curvature per point.
        data = Data(pos=pos, x=torch.cat([normals, curv], dim=1), y=pidx)
        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        if self.pre_transform is not None:
            data = self.pre_transform(data)
        data_list.append(data)
    torch.save(self.collate(data_list), self.processed_paths[0])
def read_file(folder, prefix, name): path = osp.join(folder, "ind.{}.{}".format(prefix.lower(), name)) if name == "test.index": return read_txt_array(path, dtype=torch.long) with open(path, "rb") as f: if sys.version_info > (3, 0): out = pickle.load(f, encoding="latin1") else: out = pickle.load(f) if name == "graph" or name == "y.index": return out out = out.todense() if hasattr(out, "todense") else out out = torch.Tensor(out) return out
def read_file(folder, prefix, name):
    """Load a raw ``ind.<prefix>.<name>`` file (Planetoid-style layout)."""
    path = osp.join(folder, 'ind.{}.{}'.format(prefix.lower(), name))

    # Test indices are plain text, everything else is pickled.
    if name == 'test.index':
        return read_txt_array(path, dtype=torch.long)

    with open(path, 'rb') as f:
        out = (pickle.load(f, encoding='latin1')
               if sys.version_info > (3, 0) else pickle.load(f))

    if name == 'graph':
        return out

    dense = out.todense() if hasattr(out, 'todense') else out
    return torch.Tensor(dense)
def read_file(folder, prefix, name):
    """Load a raw ``ind.<prefix>.<name>`` file (Planetoid-style layout).

    The Python-2 branch was dead code: this function already uses
    f-strings, which require Python 3.6+, so the ``sys.version_info``
    check could never take the else path on a working interpreter.
    """
    path = osp.join(folder, f'ind.{prefix.lower()}.{name}')

    # Test indices are plain text, everything else is pickled.
    if name == 'test.index':
        return read_txt_array(path, dtype=torch.long)

    with open(path, 'rb') as f:
        # Old pickles may reference renamed scipy.sparse internals.
        warnings.filterwarnings('ignore', '.*`scipy.sparse.csr` name.*')
        out = pickle.load(f, encoding='latin1')

    if name == 'graph':
        return out

    out = out.todense() if hasattr(out, 'todense') else out
    return torch.Tensor(out)
def process(self):
    """Build the in-memory dataset from raw feature/label/assignment
    arrays and write one processed file per train/val/test split."""
    x = sp.load_npz(
        osp.join(self.raw_dir, f'new_{self.feature}_feature.npz'))
    x = torch.from_numpy(x.todense()).to(torch.float)

    edge_index = read_txt_array(osp.join(self.raw_dir, 'A.txt'), sep=',',
                                dtype=torch.long).t()
    edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

    # Map graph labels to a contiguous 0..C-1 range.
    y = np.load(osp.join(self.raw_dir, 'graph_labels.npy'))
    y = torch.from_numpy(y).to(torch.long)
    _, y = y.unique(sorted=True, return_inverse=True)

    # batch[i] = index of the graph that node i belongs to.
    batch = np.load(osp.join(self.raw_dir, 'node_graph_id.npy'))
    batch = torch.from_numpy(batch).to(torch.long)

    # Cumulative node/edge counts give the slice boundaries per graph.
    node_slice = torch.cumsum(batch.bincount(), 0)
    node_slice = torch.cat([torch.tensor([0]), node_slice])
    edge_slice = torch.cumsum(batch[edge_index[0]].bincount(), 0)
    edge_slice = torch.cat([torch.tensor([0]), edge_slice])
    graph_slice = torch.arange(y.size(0) + 1)
    self.slices = {
        'x': node_slice,
        'edge_index': edge_slice,
        'y': graph_slice
    }

    # Shift node ids so each graph's edge_index is local (starts at 0).
    edge_index -= node_slice[batch[edge_index[0]]].view(1, -1)
    self.data = Data(x=x, edge_index=edge_index, y=y)

    for path, split in zip(self.processed_paths, ['train', 'val', 'test']):
        idx = np.load(osp.join(self.raw_dir, f'{split}_idx.npy')).tolist()
        data_list = [self.get(i) for i in idx]
        if self.pre_filter is not None:
            data_list = [d for d in data_list if self.pre_filter(d)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(d) for d in data_list]
        torch.save(self.collate(data_list), path)
def process(self):
    """Process the reference mesh plus train and test meshes, then save
    each to its processed path."""
    ref_data = read_off(
        osp.join(self.raw_paths[0], 'null', f'{self.cat}.off'))

    name = f'{self.part}_{self.cat}_*.off'

    def sorted_stems(root):
        # Shorter names first, then lexicographic; strip the extension.
        found = glob.glob(osp.join(root, self.part, name))
        return sorted((p[:-4] for p in found), key=lambda e: (len(e), e))

    train_list = []
    for stem in sorted_stems(self.raw_paths[0]):
        data = read_off(f'{stem}.off')
        y = read_txt_array(f'{stem}.baryc_gt')
        # First column looks 1-based; shift to 0-based long labels.
        data.y = y[:, 0].to(torch.long) - 1
        data.y_baryc = y[:, 1:]
        train_list.append(data)

    test_list = [read_off(f'{stem}.off')
                 for stem in sorted_stems(self.raw_paths[1])]

    if self.pre_filter is not None:
        train_list = [d for d in train_list if self.pre_filter(d)]
        test_list = [d for d in test_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        ref_data = self.pre_transform(ref_data)
        train_list = [self.pre_transform(d) for d in train_list]
        test_list = [self.pre_transform(d) for d in test_list]

    torch.save(ref_data, self.processed_paths[0])
    torch.save(self.collate(train_list), self.processed_paths[1])
    torch.save(self.collate(test_list), self.processed_paths[2])
def process_filenames(self, filenames):
    """Parse raw files into `Data` objects with a per-point category label
    (index within the selected `self.categories`)."""
    data_list = []
    wanted_ids = [self.category_ids[cat] for cat in self.categories]
    id_to_idx = {cid: i for i, cid in enumerate(wanted_ids)}

    for name in tq(filenames):
        cat = name.split(osp.sep)[0]
        if cat not in wanted_ids:
            continue
        raw = read_txt_array(osp.join(self.raw_dir, name))
        pos = raw[:, :3]
        x = raw[:, 3:6]
        y = raw[:, -1].type(torch.long)
        # Every point carries its category index.
        category = torch.ones(x.shape[0], dtype=torch.long) * id_to_idx[cat]
        data = Data(pos=pos, x=x, y=y, category=category)
        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        if self.pre_transform is not None:
            data = self.pre_transform(data)
        data_list.append(data)
    return data_list
def read_file(folder, name, dtype=None):
    """Load ``<name>.txt`` from `folder` as a comma-separated array."""
    filename = '{}.txt'.format(name)
    return read_txt_array(osp.join(folder, filename), sep=',', dtype=dtype)
def read_file(folder, prefix, name, dtype=None):
    """Load ``<prefix>_<name>.txt`` from `folder` as a comma-separated
    array."""
    filename = f'{prefix}_{name}.txt'
    return read_txt_array(osp.join(folder, filename), sep=',', dtype=dtype)
def read_file(folder, prefix, name, dtype=None): path = osp.join(folder, "{}_{}.txt".format(prefix, name)) return read_txt_array(path, sep=",", dtype=dtype)
def process_y(self, path: str, assoc1: Tensor, assoc2: Tensor) -> Tensor:
    """Load ground-truth links, keep only rows whose third column is
    truthy, and remap both sides through the assoc tables."""
    raw = read_txt_array(path, sep='\t', dtype=torch.long)
    row, col, valid = raw.t()
    valid = valid.to(torch.bool)
    left = assoc1[row[valid]]
    right = assoc2[col[valid]]
    return torch.stack([left, right], dim=0)