# Build a random fully-connected toy graph: `num_nodes` nodes with one
# random feature each; the node holding the minimum feature is the target.
nodes = np.random.rand(num_nodes, 1)
# All (i, j) index pairs via meshgrid -> dense (fully-connected) connectivity.
edge_index = np.reshape(
    np.meshgrid(np.arange(num_nodes), np.arange(num_nodes)), (2, -1))
# edge_index = torch.tensor([[0,1], [1,2]])
x = torch.tensor(nodes)
# x = torch.tensor([[1], [2], [3]])
edge_index = torch.tensor(edge_index, dtype=torch.long)
y = torch.zeros(num_nodes, dtype=torch.bool)
y[nodes.argmin(0)] = True  # mark the arg-min node as the positive label
print(x)
print(y)
print(edge_index)
data = Data(x=x, edge_index=edge_index, y=y)

# NOTE(review): 'newtorkx' is a typo — the package is 'networkx'; this
# import raises ModuleNotFoundError as written.
import newtorkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx


def visualize(h, color, epoch=None, loss=None):
    # Scatter-plot a 2-D embedding `h`, coloured by `color`.
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])
    if torch.is_tensor(h):
        h = h.detach().cpu().numpy()
        plt.scatter(h[:, 0], h[:, 1], s=140, c=color, cmap='Set2')
        # NOTE(review): the body of this branch continues beyond the
        # visible chunk — the function is truncated here.
        if epoch is not None and loss is not None:
def process(self):
    """Build dataset tensors from the raw RDF dump and the TSV splits.

    Parses the gzipped N-Triples graph, assigns integer ids to nodes and
    relations (relations sorted by descending frequency), creates one
    forward and one inverse edge per triple, attaches train/test node
    indices and labels, and saves the collated result to
    ``self.processed_paths[0]``.
    """
    graph_file, task_file, train_file, test_file = self.raw_paths

    g = rdf.Graph()
    with gzip.open(graph_file, 'rb') as f:
        g.parse(file=f, format='nt')

    # Frequent relations get the smallest ids.  ``Counter`` already
    # returns 0 for unseen keys, so no membership check is needed.
    freq = Counter(g.predicates())
    relations = sorted(set(g.predicates()), key=lambda rel: -freq[rel])
    nodes = list(set(g.subjects()).union(set(g.objects())))

    relations_dict = {rel: i for i, rel in enumerate(relations)}
    nodes_dict = {node: i for i, node in enumerate(nodes)}

    # Each triple (s, p, o) yields a forward edge of type ``2 * rel`` and
    # an inverse edge of type ``2 * rel + 1``.
    edge_list = []
    for s, p, o in g.triples((None, None, None)):
        src, dst, rel = nodes_dict[s], nodes_dict[o], relations_dict[p]
        edge_list.append([src, dst, 2 * rel])
        edge_list.append([dst, src, 2 * rel + 1])

    edge_list = sorted(edge_list, key=lambda x: (x[0], x[1], x[2]))
    edge = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    edge_index, edge_type = edge[:2], edge[2]

    # Per-dataset column names of the task TSV files.  Note that
    # 'label_cateogory' is the (misspelled) header actually present in
    # the raw AM files — do not "fix" it.
    headers = {
        'am': ('label_cateogory', 'proxy'),
        'aifb': ('label_affiliation', 'person'),
        'mutag': ('label_mutagenic', 'bond'),
        'bgs': ('label_lithogenesis', 'rock'),
    }
    label_header, nodes_header = headers[self.name]

    labels_df = pd.read_csv(task_file, sep='\t')
    labels_set = set(labels_df[label_header].values.tolist())
    labels_dict = {lab: i for i, lab in enumerate(list(labels_set))}

    # The split files reference nodes by their string form.  ``np.unicode``
    # was removed in NumPy >= 1.24; plain ``str`` is the equivalent.
    nodes_dict = {str(key): val for key, val in nodes_dict.items()}

    def read_split(path):
        # Map one split file's node/label columns to index/label tensors.
        df = pd.read_csv(path, sep='\t')
        indices, labels = [], []
        for nod, lab in zip(df[nodes_header].values, df[label_header].values):
            indices.append(nodes_dict[nod])
            labels.append(labels_dict[lab])
        return (torch.tensor(indices, dtype=torch.long),
                torch.tensor(labels, dtype=torch.long))

    train_idx, train_y = read_split(train_file)
    test_idx, test_y = read_split(test_file)

    data = Data(edge_index=edge_index)
    data.edge_type = edge_type
    data.train_idx = train_idx
    data.train_y = train_y
    data.test_idx = test_idx
    data.test_y = test_y

    data, slices = self.collate([data])
    torch.save((data, slices), self.processed_paths[0])
# --- Build the travel graph and train a GNN regressor on case counts. ---
# `distances`, `cases`, `epochs`, `device`, `GNN`, `load_pickle` and
# `SOURCE_PATH` are defined earlier in this script.
n_flights = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/travel_matrix.pkl')

# Turn similarity into distance and normalise everything to [0, 1].
distances = 1 - distances
norm = np.max(cases)  # kept to rescale predictions back to raw case counts
cases = cases / np.max(cases)

# Edge weights combine normalised distance and normalised flight volume.
edges = distances / np.max(distances) + n_flights / np.max(n_flights)
edges /= np.max(edges)

labels = torch.FloatTensor(cases[:, 0])     # target: first time step
features = torch.FloatTensor(cases[:, 1:])  # inputs: remaining time steps
edge_attr = torch.FloatTensor(edges[np.nonzero(edges)].reshape(-1, 1))
# edge_index holds node indices, so it must be an integer (long) tensor —
# a FloatTensor here breaks PyG message passing, which indexes with it.
edge_index = torch.LongTensor(np.argwhere(edges != 0).transpose())
# edges[edges <= 0.2] = 0
edges = torch.FloatTensor(edges)
idx = torch.arange(features.size(0))
data = Data(x=features, edge_index=edge_index, edge_attr=edges, y=labels,
            idx=idx)

model = GNN(data.x.size(1), 1).to(device)
model.train()
data = data.to(device)
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

for epoch in range(epochs):
    optimizer.zero_grad()
    pred = model(data)
    # print(pred.size())
    cost = loss(pred, data.y)
    print(cost)
    cost.backward()
    optimizer.step()

# Report de-normalised predictions next to the ground truth.
for x, y in zip(pred, data.y):
    print(int(x.item() * norm), int(y.item() * norm))
def get_messages(ogn, trainloader, n_msg=250):
    # Harvest edge-message statistics from a trained graph network `ogn`
    # over up to `n_msg` sampled subgraphs, for analysing the learned
    # message function (e.g. via symbolic regression).
    #
    # Returns a dict of row-shuffled DataFrames:
    #   'msg_function'  — per-edge inputs vs. the dominant message channel;
    #   'node_function' — per-node state vs. summed incoming message and
    #                     the network output.
    #
    # NOTE(review): assumes node features are laid out as
    # [x, y, z, vx, vy, vz, M] (first 3 dims positional) and that a CUDA
    # device is available — confirm upstream.
    print(
        "Warning: this function assumes that only a single message component dominates",
        flush=True)
    all_msg_input = []
    all_msgs = []
    all_msg_sums = []
    all_nodes = []
    all_outputs = []
    X = trainloader.data.x
    y = trainloader.data.y
    batch = trainloader.batch_size
    i = 0
    for subgraph in trainloader():
        n_offset = len(subgraph.n_id)
        cur_len = n_offset  # (unused)
        # Shift source indices so the sampled neighbours occupy a disjoint
        # index range after the concatenation below.
        cur_edge_index = subgraph.blocks[0].edge_index.clone()
        cur_edge_index[0] += n_offset
        g = Data(x=torch.cat(
            (X[subgraph.n_id], X[subgraph.blocks[0].n_id])).cuda(),
                 y=torch.cat(
                     (y[subgraph.n_id], y[subgraph.blocks[0].n_id])).cuda(),
                 edge_index=cur_edge_index.cuda())
        # Per-edge message input: relative position plus both endpoints'
        # remaining state.
        s1 = g.x[g.edge_index[0]]
        s2 = g.x[g.edge_index[1]]
        msg_input = torch.cat([s1[:, :3] - s2[:, :3], s1[:, 3:], s2[:, 3:]],
                              dim=1)
        raw_msg = ogn.msg_fnc(msg_input)
        msg_input = msg_input.detach().cpu().numpy()
        all_msg_input.append(msg_input)
        # Keep only the message channel with the largest std over edges
        # (the "dominant" component referenced in the warning above).
        best_msg_idx = np.argmax(raw_msg.std(0).detach().cpu().numpy())
        best_msgs = raw_msg[:, best_msg_idx].detach().cpu().numpy()
        all_msgs.append(best_msgs)
        # Sum of incoming messages (dominant channel) per target node of
        # the current mini-batch.
        associated_sum_message = np.array([
            raw_msg[np.argwhere(
                g.edge_index[1].detach().cpu().numpy() == i).T].sum(0)
            [best_msg_idx].detach().cpu().numpy() for i in range(batch)
        ])
        all_msg_sums.append(associated_sum_message)
        node = g.x[list(range(batch))]
        output = ogn(g)
        all_nodes.append(node.detach().cpu().numpy())
        all_outputs.append(output.detach().cpu().numpy())
        i += 1
        if i > n_msg:
            break
    all_msg_input = np.concatenate(all_msg_input)
    all_msgs = np.concatenate(all_msgs)
    all_msg_sums = np.concatenate(all_msg_sums)
    all_nodes = np.concatenate(all_nodes)
    all_outputs = np.concatenate(all_outputs)
    #plt.scatter(
    #    x=np.arange(raw_msg.std(0).shape[0]),
    #    y=np.log10(np.sort(raw_msg.std(0).detach().cpu().numpy())),
    #    s=3
    #)
    msg_func_data = pd.DataFrame({
        **{
            'dx dy dz vx1 vy1 vz1 M1 vx2 vy2 vz2 M2'.split(' ')[i]:
            all_msg_input[:, i]
            for i in range(all_msg_input.shape[1])
        },
        **{
            'message': all_msgs
        }
    })
    node_func_data = pd.DataFrame({
        **{
            'x y z vx vy vz M'.split(' ')[i]: all_nodes[:, i]
            for i in range(7)
        },
        **{
            'message': all_msg_sums,
            'output': all_outputs[:, 0]
        }
    })
    # Shuffle rows so downstream fits do not see batch ordering.
    idx_node = np.arange(node_func_data.shape[0])
    np.random.shuffle(idx_node)
    idx_msg = np.arange(msg_func_data.shape[0])
    np.random.shuffle(idx_msg)
    return {
        'node_function': node_func_data.iloc[idx_node],
        #.iloc[:5000].to_csv('node_func.csv');
        'msg_function': msg_func_data.iloc[idx_msg]
        #.iloc[:5000].to_csv('msg_func.csv')
    }
def process(self):
    """Build keypoint graphs with VGG16 features for one object category.

    For every annotated object in the train/test splits: parse its VOC
    XML annotation, filter truncated/occluded/difficult instances, crop
    and rescale the image to 256x256, then sample relu4_2 and relu5_1
    VGG16 activations at each (rescaled) keypoint location as node
    features.  Saves the collated train and test sets to
    ``self.processed_paths[0]`` / ``[1]``.
    """
    from PIL import Image
    import torchvision.transforms as T
    import torchvision.models as models

    splits = np.load(osp.join(self.raw_dir, 'splits.npz'), allow_pickle=True)

    category_idx = self.categories.index(self.category)
    train_split = list(splits['train'])[category_idx]
    test_split = list(splits['test'])[category_idx]

    image_path = osp.join(self.raw_dir, 'images', 'JPEGImages')
    info_path = osp.join(self.raw_dir, 'images', 'Annotations')
    annotation_path = osp.join(self.raw_dir, 'annotations')

    labels = {}  # keypoint label name -> class index, grown on the fly

    # Collect intermediate VGG16 activations via forward hooks.
    vgg16_outputs = []

    def hook(module, x, y):
        vgg16_outputs.append(y)

    vgg16 = models.vgg16(pretrained=True).to(self.device)
    vgg16.eval()
    vgg16.features[20].register_forward_hook(hook)  # relu4_2
    vgg16.features[25].register_forward_hook(hook)  # relu5_1

    transform = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_set, test_set = [], []
    for i, name in enumerate(chain(train_split, test_split)):
        filename = '_'.join(name.split('/')[1].split('_')[:-1])
        idx = int(name.split('_')[-1].split('.')[0]) - 1

        # Path built from the parsed image filename (was a mangled
        # placeholder '(unknown)' — restored to the filename f-string).
        path = osp.join(info_path, f'{filename}.xml')
        obj = minidom.parse(path).getElementsByTagName('object')[idx]

        # Skip truncated, occluded and difficult instances.
        trunc = obj.getElementsByTagName('truncated')[0].firstChild.data
        occ = obj.getElementsByTagName('occluded')
        occ = '0' if len(occ) == 0 else occ[0].firstChild.data
        diff = obj.getElementsByTagName('difficult')[0].firstChild.data
        if bool(int(trunc)) or bool(int(occ)) or bool(int(diff)):
            continue

        if self.category == 'person' and int(filename[:4]) > 2008:
            continue

        xmin = float(obj.getElementsByTagName('xmin')[0].firstChild.data)
        xmax = float(obj.getElementsByTagName('xmax')[0].firstChild.data)
        ymin = float(obj.getElementsByTagName('ymin')[0].firstChild.data)
        ymax = float(obj.getElementsByTagName('ymax')[0].firstChild.data)
        box = (xmin, ymin, xmax, ymax)

        dom = minidom.parse(osp.join(annotation_path, name))
        keypoints = dom.getElementsByTagName('keypoint')
        poss, ys = [], []
        for keypoint in keypoints:
            label = keypoint.attributes['name'].value
            if label not in labels:
                labels[label] = len(labels)
            ys.append(labels[label])
            x = float(keypoint.attributes['x'].value)
            y = float(keypoint.attributes['y'].value)
            poss += [x, y]
        y = torch.tensor(ys, dtype=torch.long)
        pos = torch.tensor(poss, dtype=torch.float).view(-1, 2)

        if pos.numel() == 0:
            continue  # These examples do not make any sense anyway...

        # Add a small offset to the bounding box because some keypoints
        # lay outside the bounding box intervals.
        box = (min(pos[:, 0].min().floor().item(), box[0]) - 16,
               min(pos[:, 1].min().floor().item(), box[1]) - 16,
               max(pos[:, 0].max().ceil().item(), box[2]) + 16,
               max(pos[:, 1].max().ceil().item(), box[3]) + 16)

        # Rescale keypoints into the 256x256 crop's coordinate frame.
        pos[:, 0] = (pos[:, 0] - box[0]) * 256.0 / (box[2] - box[0])
        pos[:, 1] = (pos[:, 1] - box[1]) * 256.0 / (box[3] - box[1])

        # Same placeholder fix as above for the image path.
        path = osp.join(image_path, f'{filename}.jpg')
        with open(path, 'rb') as f:
            img = Image.open(f).convert('RGB').crop(box)
            img = img.resize((256, 256), resample=Image.BICUBIC)

        img = transform(img)
        data = Data(img=img, pos=pos, y=y, name=filename)

        if i < len(train_split):
            train_set.append(data)
        else:
            test_set.append(data)

    # Run VGG16 over all crops in batches and sample the two feature maps
    # at the keypoint pixels.
    data_list = list(chain(train_set, test_set))
    imgs = [data.img for data in data_list]
    loader = DataLoader(imgs, self.batch_size, shuffle=False)
    for i, batch_img in enumerate(loader):
        vgg16_outputs.clear()

        with torch.no_grad():
            vgg16(batch_img.to(self.device))

        out1 = F.interpolate(vgg16_outputs[0], (256, 256), mode='bilinear',
                             align_corners=False)
        out2 = F.interpolate(vgg16_outputs[1], (256, 256), mode='bilinear',
                             align_corners=False)

        for j in range(out1.size(0)):
            data = data_list[i * self.batch_size + j]
            idx = data.pos.round().long().clamp(0, 255)
            x_1 = out1[j, :, idx[:, 1], idx[:, 0]].to('cpu')
            x_2 = out2[j, :, idx[:, 1], idx[:, 0]].to('cpu')
            data.img = None  # drop raw pixels; keep only sampled features
            data.x = torch.cat([x_1.t(), x_2.t()], dim=-1)
        del out1
        del out2

    if self.pre_filter is not None:
        train_set = [data for data in train_set if self.pre_filter(data)]
        test_set = [data for data in test_set if self.pre_filter(data)]

    if self.pre_transform is not None:
        train_set = [self.pre_transform(data) for data in train_set]
        test_set = [self.pre_transform(data) for data in test_set]

    torch.save(self.collate(train_set), self.processed_paths[0])
    torch.save(self.collate(test_set), self.processed_paths[1])
def get_sp_info(self, img, target):
    # Segment `img` into superpixels, embed each superpixel with the
    # visual-embedding network, and pack node/edge attributes into a
    # torch_geometric `Data` graph labelled with `target`.
    # 3. Super Pixel
    deal_super_pixel = DealSuperPixel(image_data=img,
                                      ds_image_size=self.image_size,
                                      super_pixel_size=self.sp_size)
    segment, super_pixel_info, adjacency_info = deal_super_pixel.run()

    # Resize Super Pixel: bring every superpixel patch to the fixed
    # (sp_ve_size x sp_ve_size) input resolution of the embedding net.
    _now_data_list = []
    for key in super_pixel_info:
        _now_data = cv2.resize(super_pixel_info[key]["data2"] / 255,
                               (self.sp_ve_size, self.sp_ve_size),
                               interpolation=cv2.INTER_NEAREST)
        _now_data_list.append(_now_data)
        pass
    # NHWC -> NCHW batch layout expected by the torch model.
    net_data = np.transpose(_now_data_list, axes=(0, 3, 1, 2))

    # 4. Visual Embedding
    # NOTE(review): `.detach().numpy()` only works on CPU tensors — if
    # `self.device` is a CUDA device this needs `.cpu()` first; confirm.
    shape_feature, texture_feature = self.ve_model.forward(
        torch.from_numpy(net_data).float().to(self.device))
    shape_feature, texture_feature = shape_feature.detach().numpy(
    ), texture_feature.detach().numpy()
    for sp_i in range(len(super_pixel_info)):
        super_pixel_info[sp_i]["feature_shape"] = shape_feature[sp_i]
        super_pixel_info[sp_i]["feature_texture"] = texture_feature[sp_i]
        pass

    # Data for Batch: super_pixel_info — node features are the two
    # embeddings concatenated with the superpixel size.
    x, pos, area, size = [], [], [], []
    for sp_i in range(len(super_pixel_info)):
        now_sp = super_pixel_info[sp_i]
        _size = now_sp["size"]
        _area = now_sp["area"]
        _x = np.concatenate(
            [now_sp["feature_shape"], now_sp["feature_texture"], [_size]],
            axis=0)
        x.append(_x)
        size.append([_size])
        area.append(_area)
        # Presumably the bounding-box extents of the superpixel — verify
        # against DealSuperPixel's `area` layout.
        pos.append([_area[1] - _area[0], _area[3] - _area[2]])
        pass

    # Data for Batch: adjacency_info — entries are (src, dst, weight).
    edge_index, edge_w = [], []
    for edge_i in range(len(adjacency_info)):
        edge_index.append(
            [adjacency_info[edge_i][0], adjacency_info[edge_i][1]])
        # edge_w.append([adjacency_info[edge_i][2], adjacency_info[edge_i][2]])
        edge_w.append(adjacency_info[edge_i][2])
        pass
    edge_index = np.transpose(edge_index, axes=(1, 0))  # -> COO [2, E]

    # Data for Batch: Data
    g_data = Data(x=torch.from_numpy(np.asarray(x)).float(),
                  edge_index=torch.from_numpy(edge_index),
                  y=torch.tensor([target]),
                  pos=torch.from_numpy(np.asarray(pos)),
                  area=torch.from_numpy(np.asarray(area)),
                  size=torch.from_numpy(np.asarray(size)),
                  edge_w=torch.from_numpy(np.asarray(edge_w)).float())
    return g_data
def newData(nodeFeats, edgeSyms, graphLab):
    """Assemble one graph-level `Data` object.

    nodeFeats: per-node feature rows (float).
    edgeSyms:  edge pairs, transposed into COO [2, num_edges] layout.
    graphLab:  the graph label.
    """
    feature_tensor = torch.tensor(nodeFeats, dtype=torch.float)
    connectivity = torch.tensor(edgeSyms).t().contiguous()
    label_tensor = torch.tensor(graphLab)
    return Data(x=feature_tensor, edge_index=connectivity, y=label_tensor)
def get_batch(self, X):
    # Wrap input node and edge features, along with the single edge_index,
    # into a `torch_geometric.data.Batch` instance.
    graphs = [Data(x=features) for features in X]
    return Batch.from_data_list(graphs)
def forward(self, X, edge_index, edge_weight):
    """
    :param X: Input data of shape (batch_size, num_nodes, in_channels)
    :param edge_index: Graph connectivity in COO format with shape(2, num_edges)
    :param edge_weight: Edge feature matrix with shape (num_edges, num_edge_features)
    :return: Output data of shape (batch_size, num_nodes, out_channels)
    """
    if torch.is_tensor(X):
        sz = X.shape

    if self.gcn_partition == 'cluster':
        # Partition the graph with ClusterData (METIS) and run the GCN one
        # node cluster at a time, writing results back into `out`.
        out = torch.zeros(sz[0], sz[1], self.out_channels, device=X.device)
        graph_data = Data(edge_index=edge_index, edge_attr=edge_weight,
                          train_mask=torch.arange(0, sz[1]),
                          num_nodes=sz[1]).to('cpu')
        cluster_data = ClusterData(graph_data, num_parts=50,
                                   recursive=False,
                                   save_dir='./data/cluster')
        loader = ClusterLoader(cluster_data, batch_size=5, shuffle=True,
                               num_workers=0)
        for subgraph in loader:
            out[:, subgraph.train_mask] = self.gcn(
                X[:, subgraph.train_mask],
                subgraph.edge_index.to(X.device),
                subgraph.edge_attr.to(X.device))
    elif self.gcn_partition == 'sample':
        # Use NeighborSampler() to iterates over graph nodes in a mini-batch fashion
        # and constructs sampled subgraphs (use cpu for no CUDA version)
        out = torch.zeros(sz[0], sz[1], self.out_channels, device=X.device)
        graph_data = Data(edge_index=edge_index, num_nodes=sz[1]).to('cpu')
        loader = NeighborSampler(graph_data, size=[10, 5], num_hops=2,
                                 batch_size=120, shuffle=True,
                                 add_self_loops=False)
        for data_flow in loader():
            # Two-hop sampled computation: gcn1 over the outer block,
            # gcn2 over the inner block; keep only the target nodes.
            block1 = data_flow[0]
            t = self.gcn1(X, edge_index[:, block1.e_id],
                          edge_weight[block1.e_id])
            block2 = data_flow[1]
            part_out = self.gcn2(t, edge_index[:, block2.e_id],
                                 edge_weight[block2.e_id])
            out[:, data_flow.n_id] = part_out[:, data_flow.n_id]
    elif self.batch_training:
        # Full-graph pass; edge weights only when the conv supports them.
        if self.adj_available:
            out = self.gcn(X, edge_index, edge_weight)
        else:
            out = self.gcn(X, edge_index)
    else:
        # Currently, conv in [GATConv] cannot use argument node_dim for batch training
        # This is a temp solution but it's very very very slow!
        # Costing about 6 times more than batch_training
        batch = self.get_batch(X)
        if self.adj_available:
            out = self.gcn(batch.x, edge_index, edge_weight)
        else:
            out = self.gcn(batch.x, edge_index)
        out = out.view(sz[0], sz[1], -1)

    return out
# Minimal smoke test of torch-points3d's PointNet on collated point clouds.
load_local_torchpoints3d()
from torch_points3d.models.segmentation.pointnet import PointNet
from torch_geometric.data import Data, Batch
from torch_points3d.datasets.batch import SimpleBatch

##################### PARTIAL_DENSE FORMAT #####################
num_points = 500
num_classes = 10
input_nc = 3

# Two identical random clouds collated into one flat PyG batch.
pos = torch.randn((num_points, 3))
x = torch.randn((num_points, input_nc))
data = Data(pos=pos, x=x)
data = Batch.from_data_list([data, data])
print(data)
#Batch(batch=[1000], pos=[1000, 3], x=[1000, 3])

pointnet = PointNet(OmegaConf.create({'conv_type': 'PARTIAL_DENSE'}))
pointnet.set_input(data, "cpu")
data_out = pointnet.forward()
print(data_out.shape)
# torch.Size([1000, 4])

##################### DENSE FORMAT #####################
# NOTE(review): this section continues beyond the visible chunk.
num_points = 500
def __getitem__(self, idx):
    """Return the (exclude, include) subgraph pair for sample ``idx``.

    Both subgraphs are built from the full graph via
    ``self.subgraph_build_func`` using the sample's exclusion and
    inclusion masks respectively.
    """
    build = self.subgraph_build_func
    ex_x, ex_edges = build(self.X, self.edge_index, self.exclude_mask[idx])
    in_x, in_edges = build(self.X, self.edge_index, self.include_mask[idx])
    return (Data(x=ex_x, edge_index=ex_edges),
            Data(x=in_x, edge_index=in_edges))
# We convert the individual graphs into a single big one, so that sampling # neighbors does not need to care about different edge types. # This will return the following: # * `edge_index`: The new global edge connectivity. # * `edge_type`: The edge type for each edge. # * `node_type`: The node type for each node. # * `local_node_idx`: The original index for each node. # * `local2global`: A dictionary mapping original (local) node indices of # type `key` to global ones. # `key2int`: A dictionary that maps original keys to their new canonical type. out = group_hetero_graph(data.edge_index_dict, data.num_nodes_dict) edge_index, edge_type, node_type, local_node_idx, local2global, key2int = out homo_data = Data(edge_index=edge_index, edge_attr=edge_type, node_type=node_type, local_node_idx=local_node_idx, num_nodes=node_type.size(0)) homo_data.y = node_type.new_full((node_type.size(0), 1), -1) homo_data.y[local2global['paper']] = data.y_dict['paper'] homo_data.train_mask = torch.zeros((node_type.size(0)), dtype=torch.bool) homo_data.train_mask[local2global['paper'][split_idx['train']['paper']]] = True homo_data.valid_mask = torch.zeros((node_type.size(0)), dtype=torch.bool) homo_data.valid_mask[local2global['paper'][split_idx['valid']['paper']]] = True rec_loss = torch.zeros(homo_data.num_nodes) #print(homo_data) train_loader = GraphSAINTRandomWalkSampler(homo_data,
def __merge_edges__(self, x, data, edge_score, decimator):
    # Greedy edge contraction driven by `edge_score`: repeatedly pop the
    # cheapest edge from a priority queue, contract it in the `decimator`
    # mesh, and record a cluster id per original vertex.  Returns the
    # pooled node features and a new `Data` holding the coarsened mesh
    # plus the information needed for unpooling.
    # Torch tensors
    batch = data.batch
    edge_index = data.edge_index

    # Build a priority queue to store edge costs and store which nodes are still valid
    PQ = PriorityQueue([(edge_score[i].item(), i)
                        for i in range(len(edge_score))])

    # Loop over edges, contracting edges and updating node positions
    nodes_remaining = set(range(decimator.num_vertices))
    cluster = torch.empty_like(batch, device=torch.device('cpu'))
    new_edge_indices = []
    i = 0  # running cluster id / final cluster count
    while len(PQ) > 0:
        ei = PQ.popItem()
        # check if nodes have already been merged
        source, target = decimator.E[ei]
        if (source not in nodes_remaining) or (target not in nodes_remaining):
            continue
        contracted = decimator.contractEdge(ei)
        if contracted:
            # this edge was successfully contracted: both endpoints join
            # the same new cluster
            nodes_remaining.remove(source)
            cluster[source] = i
            if source != target:
                nodes_remaining.remove(target)
                cluster[target] = i
            i += 1
            new_edge_indices.append(ei)

    # The remaining nodes are simply kept (one singleton cluster each).
    for node_idx in nodes_remaining:
        cluster[node_idx] = i
        i += 1
    cluster = cluster.to(x.device)

    # We compute the new features as an addition of the old ones.
    new_x = scatter_add(x, cluster, dim=0, dim_size=i)
    if edge_score is not None:
        # Gate merged-node features by their edge score; kept singletons
        # get a neutral score of 1.
        new_edge_score = edge_score[new_edge_indices]
        if len(nodes_remaining) > 0:
            remaining_score = x.new_ones(
                (new_x.size(0) - len(new_edge_indices), ))
            new_edge_score = torch.cat([new_edge_score, remaining_score])
        new_x = new_x * new_edge_score.view(-1, 1)
    else:
        new_edge_score = x.new_ones((new_x.size(0), ))

    # Remap edges onto cluster ids and merge duplicates.
    N = new_x.size(0)
    new_edge_index, _ = coalesce(cluster[edge_index], None, N, N)

    new_batch = x.new_empty(new_x.size(0), dtype=torch.long)
    new_batch = new_batch.scatter_(0, cluster, batch)
    unpool_info = self.unpool_description(edge_index=edge_index,
                                          cluster=cluster, batch=batch,
                                          new_edge_score=new_edge_score)

    # update mesh vertices: pick one representative original vertex per
    # cluster and take its decimated position.
    #vi = torch.empty_like(new_batch, device=torch.device('cpu'))
    #vi[cluster] = torch.arange(cluster.size(0))
    #new_pos = data.pos[vi][:,0:3]
    vi = np.empty(i, dtype=np.int64)
    vi[cluster.cpu().numpy()] = np.arange(decimator.num_vertices)
    new_pos = torch.tensor(decimator.V[vi][:, 0:3])

    # update faces
    new_face = torch.empty_like(data.face)
    new_face[0, :] = cluster[data.face[
        0, :]]  # assign vertices to their new cluster id
    new_face[1, :] = cluster[data.face[
        1, :]]  # assign vertices to their new cluster id
    new_face[2, :] = cluster[data.face[
        2, :]]  # assign vertices to their new cluster id
    fi = (new_face[0, :] == new_face[1, :]) + (
        new_face[0, :] == new_face[2, :]) + (
            new_face[1, :] == new_face[2, :]
        )  # faces with duplicate vertices
    new_face = new_face[:, ~fi]  # remove duplicates

    new_data = Data(edge_index=new_edge_index, batch=new_batch,
                    pos=new_pos, face=new_face)
    new_data.unpool_info = unpool_info

    return new_x, new_data
def nx_to_graph_data_obj(
    g, center_id, allowable_features_downstream=None,
    allowable_features_pretrain=None, node_id_to_go_labels=None
):
    """Convert a NetworkX ego-network into a PyG :class:`Data` object.

    Nodes get a dummy feature of 1; edges carry the ``w1``..``w7``
    attributes plus two trailing zeros (self-loop / masking indicators)
    and are stored in both directions.  The species id is parsed from the
    first node id (``species_id.protein_id``); GO label targets are
    attached for the center node when ``node_id_to_go_labels`` is given.
    """
    n_nodes = g.number_of_nodes()

    # nodes
    nx_node_ids = [n_i for n_i in g.nodes()]  # contains list of nx node ids
    # in a particular ordering. Will be used as a mapping to convert
    # between nx node ids and data obj node indices.
    # Precompute id -> index once: repeated list.index() inside the edge
    # loop is O(n) per lookup (O(n^2) overall); the dict is O(1) and
    # behavior-identical since node ids are unique and hashable.
    node_idx = {node_id: i for i, node_id in enumerate(nx_node_ids)}
    x = torch.tensor(np.ones(n_nodes).reshape(-1, 1), dtype=torch.float)
    # we don't have any node labels, so set to dummy 1. dim n_nodes x 1

    center_node_idx = torch.tensor([node_idx[center_id]], dtype=torch.long)

    # edges
    edges_list = []
    edge_features_list = []
    for node_1, node_2, attr_dict in g.edges(data=True):
        edge_feature = [
            attr_dict["w1"], attr_dict["w2"], attr_dict["w3"],
            attr_dict["w4"], attr_dict["w5"], attr_dict["w6"],
            attr_dict["w7"], 0, 0,
        ]  # last 2 indicate self-loop and masking
        edge_feature = np.array(edge_feature, dtype=int)
        # convert nx node ids to data obj node index
        i = node_idx[node_1]
        j = node_idx[node_2]
        # store both directions so the graph is effectively undirected
        edges_list.append((i, j))
        edge_features_list.append(edge_feature)
        edges_list.append((j, i))
        edge_features_list.append(edge_feature)

    # data.edge_index: Graph connectivity in COO format with shape [2, num_edges]
    edge_index = torch.tensor(np.array(edges_list).T, dtype=torch.long)
    # data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
    edge_attr = torch.tensor(np.array(edge_features_list), dtype=torch.float)

    try:
        # nx node id is of the form: species_id.protein_id
        species_id = int(nx_node_ids[0].split(".")[0])
        species_id = torch.tensor([species_id], dtype=torch.long)
    except Exception:
        # occurs when nx node id has no species id info. For the extract
        # substructure context pair transform, where we convert a data obj
        # to a nx graph obj (which does not have original node id info)
        species_id = torch.tensor([0], dtype=torch.long)  # dummy species id

    # construct data obj
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    data.species_id = species_id
    data.center_node_idx = center_node_idx

    if node_id_to_go_labels:  # supervised case with go node labels
        # Construct a dim n_pretrain_go_classes tensor and a
        # n_downstream_go_classes tensor for the center node. 0 is no data
        # or negative, 1 is positive.
        downstream_go_node_feature = [0] * len(allowable_features_downstream)
        pretrain_go_node_feature = [0] * len(allowable_features_pretrain)
        if center_id in node_id_to_go_labels:
            go_labels = node_id_to_go_labels[center_id]
            # get indices of allowable_features_downstream that match with
            # elements in go_labels
            _, node_feature_indices, _ = np.intersect1d(
                allowable_features_downstream, go_labels, return_indices=True)
            for idx in node_feature_indices:
                downstream_go_node_feature[idx] = 1
            # get indices of allowable_features_pretrain that match with
            # elements in go_labels
            _, node_feature_indices, _ = np.intersect1d(
                allowable_features_pretrain, go_labels, return_indices=True)
            for idx in node_feature_indices:
                pretrain_go_node_feature[idx] = 1
        data.go_target_downstream = torch.tensor(
            np.array(downstream_go_node_feature), dtype=torch.long)
        data.go_target_pretrain = torch.tensor(
            np.array(pretrain_go_node_feature), dtype=torch.long)

    return data
def test_data():
    # Exercise the core `Data` API: attribute access, cat/inc dimensions,
    # contiguity, coalescing, cloning, dict round-trip, and the various
    # `num_*` properties.
    torch_geometric.set_debug(True)

    x = torch.tensor([[1, 3, 5], [2, 4, 6]], dtype=torch.float).t()
    edge_index = torch.tensor([[0, 0, 1, 1, 2], [1, 1, 0, 2, 1]])
    data = Data(x=x, edge_index=edge_index).to(torch.device('cpu'))

    N = data.num_nodes

    assert data.x.tolist() == x.tolist()
    assert data['x'].tolist() == x.tolist()

    assert sorted(data.keys) == ['edge_index', 'x']
    assert len(data) == 2
    assert 'x' in data and 'edge_index' in data and 'pos' not in data

    # Node features concatenate along dim 0; edge_index along dim -1 and
    # increments by the number of nodes when batching.
    assert data.__cat_dim__('x', data.x) == 0
    assert data.__cat_dim__('edge_index', data.edge_index) == -1
    assert data.__inc__('x', data.x) == 0
    assert data.__inc__('edge_index', data.edge_index) == data.num_nodes

    # `x` was created via .t(), hence non-contiguous until made so.
    assert not data.x.is_contiguous()
    data.contiguous()
    assert data.x.is_contiguous()

    # The duplicated (0, 1) edge keeps the graph un-coalesced initially.
    assert not data.is_coalesced()
    data.edge_index, _ = coalesce(data.edge_index, None, N, N)
    data = data.coalesce()
    assert data.is_coalesced()

    clone = data.clone()
    assert clone != data  # distinct objects, identity-based comparison
    assert len(clone) == len(data)
    assert clone.x.tolist() == data.x.tolist()
    assert clone.edge_index.tolist() == data.edge_index.tolist()

    data['x'] = x + 1
    assert data.x.tolist() == (x + 1).tolist()

    assert data.__repr__() == 'Data(edge_index=[2, 4], x=[3, 2])'

    dictionary = {'x': data.x, 'edge_index': data.edge_index}
    data = Data.from_dict(dictionary)
    assert sorted(data.keys) == ['edge_index', 'x']

    assert not data.contains_isolated_nodes()
    assert not data.contains_self_loops()
    assert data.is_undirected()
    assert not data.is_directed()

    assert data.num_nodes == 3
    assert data.num_edges == 4
    assert data.num_faces is None

    assert data.num_node_features == 2
    assert data.num_features == 2

    data.edge_attr = torch.randn(data.num_edges, 2)
    assert data.num_edge_features == 2
    data.edge_attr = None

    # num_nodes can still be inferred from edge_index after dropping x.
    data.x = None
    assert data.num_nodes == 3

    data.edge_index = None
    assert data.num_nodes is None
    assert data.num_edges is None

    # An explicitly set node count overrides inference.
    data.num_nodes = 4
    assert data.num_nodes == 4

    # Arbitrary custom attributes are stored alongside the known keys.
    data = Data(x=x, attribute=x)
    assert len(data) == 2
    assert data.x.tolist() == x.tolist()
    assert data.attribute.tolist() == x.tolist()

    face = torch.tensor([[0, 1], [1, 2], [2, 3]])
    data = Data(num_nodes=4, face=face)
    assert data.num_faces == 2
    assert data.num_nodes == 4

    data = Data(title="test")
    assert data.__repr__() == 'Data(title=test)'
    assert data.num_node_features == 0
    assert data.num_edge_features == 0

    torch_geometric.set_debug(False)
# Select the compute device.
# NOTE(review): assumes `use_cuda` holds a GPU index (e.g. 0/1); a plain
# boolean True would produce the invalid device string 'cuda:True'.
if use_cuda:
    device = torch.device('cuda:' + str(use_cuda))
else:
    device = torch.device('cpu')

# 2 read and processing data
data = Dataset(data_path='/home/zhengyi/')
# idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
adj, features, labels = data.adj, data.features, data.labels
adj = sparse_mx_to_torch_sparse_long_tensor(adj)
# features = sparse_mx_to_torch_sparse_tensor(features)
features = torch.FloatTensor(features)
labels = torch.LongTensor(labels - 1)  # shift 1-based labels to 0-based
# print(torch.min(labels))
data = Data(x=features, edge_index=adj, y=labels).to(device)

# gen idx_train, idx_val, idx_test — stratified 10/10/80 split.
_idx = np.arange(len(labels))
val_size = 0.1
test_size = 0.8
train_size = 1 - val_size - test_size
stratify = labels
idx_train_and_val, idx_test = train_test_split(_idx,
                                               random_state=None,
                                               train_size=train_size +
                                               val_size,
                                               test_size=test_size,
                                               stratify=stratify)
stratify = stratify[idx_train_and_val]
# NOTE(review): this chunk ends mid-call — the remaining arguments are
# outside the visible source.
idx_train, idx_val = train_test_split(
# Build one event graph: node features (charge, time, DOM position),
# k-NN edges on the raw DOM positions, and the truth record as target.
# NOTE(review): this fragment references loop state (`i`, `tmp_event`,
# `event_no`, `data_list`) whose enclosing loop is outside this chunk.
if tfs is not None:
    # Un-scale DOM positions with the stored per-feature transformers so
    # that k-NN edges are built in real coordinates.
    x_pos = torch.tensor(tfs['features']['dom_x'].inverse_transform(tmp_event[['dom_x']]), dtype=torch.float)
    y_pos = torch.tensor(tfs['features']['dom_y'].inverse_transform(tmp_event[['dom_y']]), dtype=torch.float)
    z_pos = torch.tensor(tfs['features']['dom_z'].inverse_transform(tmp_event[['dom_z']]), dtype=torch.float)
    x = torch.cat([torch.tensor(tmp_event[['charge_log10', 'time']].values, dtype=torch.float), x_pos, y_pos, z_pos], dim=1)
    pos = torch.cat([x_pos, y_pos, z_pos], dim=1)
else:
    x = torch.tensor(tmp_event[['charge_log10', 'time', 'dom_x', 'dom_y', 'dom_z']].values, dtype=torch.float)  #Features
    pos = torch.tensor(tmp_event[['dom_x', 'dom_y', 'dom_z']].values, dtype=torch.float)  #Position

# NOTE(review): SQL built with str.format — acceptable only if event_no
# is an internal integer; use a parameterised query for untrusted input.
query = "SELECT energy_log10, time, position_x, position_y, position_z, direction_x, direction_y, direction_z, azimuth, zenith FROM truth WHERE event_no = {}".format(event_no)
y = pd.read_sql(query, con)
y = torch.tensor(y.values, dtype=torch.float)  #Target

dat = Data(x=x, edge_index=None, edge_attr=None, y=y, pos=pos)
# T.KNNGraph(loop=True)(dat)
# defining edges by k-NN with k=6 !!! Make sure .pos is not scaled!!!
# ie. x,y,z -!-> ax,by,cz
T.KNNGraph(k=6, loop=False, force_undirected=False)(dat)
dat.adj_t = None
T.ToUndirected()(dat)
T.AddSelfLoops()(dat)
# Swap row/col so edge_index follows the (source, target) convention used
# downstream — NOTE(review): confirm the intended edge direction.
(row, col) = dat.edge_index
dat.edge_index = torch.stack([col, row], dim=0)
data_list.append(dat)

# Periodically collate and flush the accumulated events to disk.
if (i + 1) % subdivides == 0:
    data, slices = InMemoryDataset.collate(data_list)
    torch.save((data, slices), destination + '/{}k_{}{}.pt'.format(subdivides // 1000, save_filename, subset))
def main():
    # Distributed GNN training driver: initialise the process group
    # (OpenMPI / SLURM env vars, NCCL backend), bind this rank to a CUDA
    # device, load or synthesise the requested graph, then hand off to
    # `init_process`.
    global device
    global graphname

    print(socket.gethostname())
    seed = 0  # (unused here)

    if not download:
        mp.set_start_method('spawn', force=True)
        outputs = None
        if "OMPI_COMM_WORLD_RANK" in os.environ.keys():
            os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]

        # Initialize distributed environment with SLURM
        if "SLURM_PROCID" in os.environ.keys():
            os.environ["RANK"] = os.environ["SLURM_PROCID"]

        if "SLURM_NTASKS" in os.environ.keys():
            os.environ["WORLD_SIZE"] = os.environ["SLURM_NTASKS"]

        if "MASTER_ADDR" not in os.environ.keys():
            os.environ["MASTER_ADDR"] = "127.0.0.1"

        os.environ["MASTER_PORT"] = "1234"
        dist.init_process_group(backend='nccl')
        rank = dist.get_rank()
        size = dist.get_world_size()
        print("Processes: " + str(size))

        # device = torch.device('cpu')
        devid = rank_to_devid(rank, acc_per_rank)
        device = torch.device('cuda:{}'.format(devid))
        torch.cuda.set_device(device)
        curr_devid = torch.cuda.current_device()
        # print(f"curr_devid: {curr_devid}", flush=True)
        devcount = torch.cuda.device_count()

    if graphname == "Cora":
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                        graphname)
        dataset = Planetoid(path, graphname, transform=T.NormalizeFeatures())
        data = dataset[0]
        data = data.to(device)
        data.x.requires_grad = True
        inputs = data.x.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)
        edge_index = data.edge_index
        num_features = dataset.num_features
        num_classes = dataset.num_classes
    elif graphname == "Reddit":
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                        graphname)
        dataset = Reddit(path, T.NormalizeFeatures())
        data = dataset[0]
        data = data.to(device)
        data.x.requires_grad = True
        inputs = data.x.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)
        edge_index = data.edge_index
        num_features = dataset.num_features
        num_classes = dataset.num_classes
    elif graphname == 'Amazon':
        # Pre-extracted COO edge list; labels/masks are synthesised since
        # only the connectivity is needed for the benchmark.
        print(f"Loading coo...", flush=True)
        edge_index = torch.load("../data/Amazon/processed/data.pt")
        print(f"Done loading coo", flush=True)
        # edge_index = edge_index.t_()
        # n = 9430088
        n = 14249639
        # n = 14249640
        num_features = 300
        num_classes = 24
        # mid_layer = 24
        inputs = torch.rand(n, num_features)
        data = Data()
        data.y = torch.rand(n).uniform_(0, num_classes - 1).long()
        data.train_mask = torch.ones(n).long()
        # edge_index = edge_index.to(device)
        print(f"edge_index.size: {edge_index.size()}", flush=True)
        print(f"edge_index: {edge_index}", flush=True)
        data = data.to(device)
        # inputs = inputs.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)
    elif graphname == 'subgraph3':
        # Same synthetic-label treatment as the Amazon branch.
        print(f"Loading coo...", flush=True)
        edge_index = torch.load("../data/subgraph3/processed/data.pt")
        print(f"Done loading coo", flush=True)
        n = 8745542
        num_features = 128
        # mid_layer = 512
        # mid_layer = 64
        num_classes = 256
        inputs = torch.rand(n, num_features)
        data = Data()
        data.y = torch.rand(n).uniform_(0, num_classes - 1).long()
        data.train_mask = torch.ones(n).long()
        print(f"edge_index.size: {edge_index.size()}", flush=True)
        data = data.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)

    # Download-only mode stops after the dataset is materialised.
    if download:
        exit()

    if normalization:
        adj_matrix, _ = add_remaining_self_loops(edge_index,
                                                 num_nodes=inputs.size(0))
    else:
        adj_matrix = edge_index

    init_process(rank, size, inputs, adj_matrix, data, num_features,
                 num_classes, device, outputs, run)

    if outputs is not None:
        return outputs[0]
def __getitem__(self, idx):
    """Sample one matching instance: ``k`` annotated graphs with their
    ground-truth permutation matrices, keypoint graphs built by
    ``build_graphs``, and either image tensors or precomputed features.
    """
    sampling_strategy = cfg.train_sampling if self.ds.sets == "train" else cfg.eval_sampling
    if self.num_graphs_in_matching_instance is None:
        raise ValueError("Num_graphs has to be set to an integer value.")

    # In "true epoch" mode the index is deterministic; otherwise the sampler
    # picks randomly (idx=None).
    idx = idx if self.true_epochs else None
    anno_list, perm_mat_list = self.ds.get_k_samples(
        idx,
        k=self.num_graphs_in_matching_instance,
        cls=self.cls,
        mode=sampling_strategy)
    for perm_mat in perm_mat_list:
        # Skip degenerate samples by recursing onto the next index.
        # NOTE(review): by Python precedence this parses as
        # `not size or ((size < 4 and intersection) and not true_epochs)`,
        # i.e. an empty perm_mat is rejected even in true-epoch mode —
        # confirm whether `not self.true_epochs` was meant to guard both terms.
        if (not perm_mat.size
                or (perm_mat.size < 2 * 2 and sampling_strategy == "intersection")
                and not self.true_epochs):
            # 'and not self.true_epochs' because we assume all data is valid when sampling a true epoch
            next_idx = None if idx is None else idx + 1
            return self.__getitem__(next_idx)

    points_gt = [
        np.array([(kp["x"], kp["y"]) for kp in anno_dict["keypoints"]])
        for anno_dict in anno_list
    ]
    n_points_gt = [len(p_gt) for p_gt in points_gt]

    graph_list = []
    for p_gt, n_p_gt in zip(points_gt, n_points_gt):
        edge_indices, edge_features = build_graphs(p_gt, n_p_gt)

        # Add dummy node features so the __slices__ of them is saved when creating a batch
        # NOTE(review): division by 256.0 assumes keypoint coords lie in
        # [0, 256] — confirm against the dataset's image size.
        pos = torch.tensor(p_gt).to(torch.float32) / 256.0
        assert (pos > -1e-5).all(), p_gt
        graph = Data(
            edge_attr=torch.tensor(edge_features).to(torch.float32),
            edge_index=torch.tensor(edge_indices, dtype=torch.long),
            x=pos,
            pos=pos,
        )
        graph.num_nodes = n_p_gt
        graph_list.append(graph)

    ret_dict = {
        "Ps": [torch.Tensor(x) for x in points_gt],
        "ns": [torch.tensor(x) for x in n_points_gt],
        "gt_perm_mat": perm_mat_list,
        "edges": graph_list,
    }

    imgs = [anno["image"] for anno in anno_list]
    if imgs[0] is not None:
        trans = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(cfg.NORM_MEANS, cfg.NORM_STD)
        ])
        imgs = [trans(img) for img in imgs]
        ret_dict["images"] = imgs
    elif "feat" in anno_list[0]["keypoints"][0]:
        # No raw images available: fall back to precomputed per-keypoint features.
        feat_list = [
            np.stack([kp["feat"] for kp in anno_dict["keypoints"]], axis=-1)
            for anno_dict in anno_list
        ]
        ret_dict["features"] = [torch.Tensor(x) for x in feat_list]
    return ret_dict
# Validation frames: same layout as the training dict, offset start values.
val_data = {6: np.arange(24) + 6, 7: np.arange(24) + 7}

# dense_adjacency = nx.to_pandas_adjacency(graph)
# Shared graph connectivity in COO format; reused unchanged by every sample.
sparse_adj = nx.to_scipy_sparse_matrix(graph).tocoo()
sparse_adj_in_coo_format = np.stack([sparse_adj.row, sparse_adj.col])
sparse_adj_in_coo_format_tensor = torch.tensor(sparse_adj_in_coo_format,
                                               dtype=torch.long).cuda()

frame_data = pd.DataFrame.from_dict(data)
valframe = pd.DataFrame.from_dict(val_data)

# One training sample per consecutive frame pair: predict frame i+1 from frame i.
data_graphs = []
for i in range(len(frame_data) - 1):
    x = torch.tensor([frame_data.iloc[i]], dtype=torch.double).cuda()
    x = x.permute(1, 0)  # nodes, features
    y = torch.tensor([frame_data.iloc[i + 1]], dtype=torch.double).cuda()
    y = y.permute(1, 0)  # nodes, features
    data_entry = Data(x=x, y=y, edge_index=sparse_adj_in_coo_format_tensor)
    data_graphs.append(data_entry)
loader = DataLoader(data_graphs, batch_size=1)

# Validation samples use the same next-frame prediction task.
val_graphs = []
for i in range(len(valframe) - 1):
    x = torch.tensor([valframe.iloc[i]], dtype=torch.double).cuda()
    x = x.permute(1, 0)
    # FIX: the target must be the *next* frame (iloc[i + 1]). The original used
    # iloc[i], so validation measured identity reconstruction rather than the
    # next-frame prediction task used for training.
    y = torch.tensor([valframe.iloc[i + 1]], dtype=torch.double).cuda()
    y = y.permute(1, 0)
    val_data_entry = Data(x=x, y=y, edge_index=sparse_adj_in_coo_format_tensor)
    val_graphs.append(val_data_entry)
val_loader = DataLoader(val_graphs, batch_size=1)

#training_tensors = dicttotensor(data)
#val_tensors = dicttotensor(val_data)

# Model input size = number of nodes per frame.
model = Net(data_graphs[0]['x'].shape[0])
def do_training(ogn,
                graph,
                lr=1e-3,
                total_epochs=100,
                batch_per_epoch=1500,
                weight_decay=1e-8,
                batch=32,
                l1=1e-2):
    """Train ``ogn`` on sampled node-neighborhood mini-batches from ``graph``.

    Each step samples ``batch`` random nodes, gathers all their neighbors via a
    CSR view of the adjacency, builds a star-shaped subgraph (neighbor -> node
    edges) and optimizes with Adam + OneCycleLR.

    Args:
        ogn: the model (moved to CUDA each epoch).
        graph: torch_geometric Data with ``x``, ``y`` and ``edge_index``.
        lr, total_epochs, batch_per_epoch, weight_decay, batch, l1:
            optimizer/schedule/regularization hyperparameters.

    Returns:
        List of per-epoch average losses.
    """
    idx = graph.edge_index.cuda()
    X = graph.x.cuda()
    y = graph.y.cuda()
    N = graph.x.shape[0]
    device = torch.device('cuda')

    # Build a CSR view of the adjacency so each node's neighbor list is a
    # contiguous slice col[row[v]:row[v+1]].
    v = torch.ones(idx.shape[1], device=device)
    mat = sparse.IntTensor(idx, v, torch.Size([N, N]))
    mat2 = ts.tensor.SparseTensor.from_torch_sparse_coo_tensor(mat,
                                                               has_value=False)
    row, col, _ = mat2.csr()

    # Set up optimizer:
    init_lr = lr
    opt = torch.optim.Adam(ogn.parameters(),
                           lr=init_lr,
                           weight_decay=weight_decay)
    sched = OneCycleLR(
        opt,
        max_lr=init_lr,
        steps_per_epoch=batch_per_epoch,  #len(trainloader),
        epochs=total_epochs,
        final_div_factor=1e5)

    all_losses = []
    epoch = 0
    for epoch in trange(epoch, total_epochs):
        ogn.cuda()
        total_loss = 0.0
        i = 0
        num_items = 0
        while i < batch_per_epoch:
            opt.zero_grad()
            # FIX: torch.randint's upper bound is *exclusive*, so the original
            # `randint(0, N - 1, ...)` could never sample node N-1; use N so
            # every node is reachable. (row has N+1 entries, so row[v+1] with
            # v == N-1 is still in bounds.)
            node_idx = torch.randint(0, N, (batch, ), device=device)
            # All neighbors of the sampled nodes, concatenated.
            neighbor_idx = torch.cat([
                col[row[node_idx[k]]:row[node_idx[k] + 1]]
                for k in range(batch)
            ])
            # Re-number: sampled node k becomes node k in the subgraph...
            new_node_idx = torch.cat([
                torch.ones(row[node_idx[k] + 1] - row[node_idx[k]],
                           dtype=int,
                           device=device) * k for k in range(batch)
            ])
            # ...and its neighbors occupy indices batch, batch+1, ...
            new_neighbor_idx = torch.arange(batch,
                                            batch + len(neighbor_idx),
                                            device=device,
                                            dtype=int)
            Xcur = torch.cat([X[node_idx], X[neighbor_idx]], dim=0)
            ycur = torch.cat([y[node_idx], y[neighbor_idx]], dim=0)
            # Edges run neighbor -> sampled node.
            edge_index = torch.cat([
                new_neighbor_idx[None], new_node_idx[None]
            ])  #new_node_idx[None], new_neighbor_idx[None]])
            g = Data(x=Xcur, y=ycur, edge_index=edge_index)
            loss, reg = new_loss(ogn, g, batch, regularization=l1)
            # Normalize by batch size (+1) before backprop.
            ((loss + reg) / int(batch + 1)).backward()
            opt.step()
            sched.step()
            total_loss += loss.item()
            i += 1
            num_items += batch
        cur_loss = total_loss / num_items
        all_losses.append(cur_loss)
        print(cur_loss, flush=True)
    return all_losses
dataSel = data_list[0] # print(data_list[0],filename_list[0]) # print(data) loader = DataLoader(data_list, batch_size=len(data_list), shuffle=False) # for data in loader: #batch, # print(data) # print(data.x) # print(data.edge_index) for batch in loader: # print(batch.num_features) # print(batch.num_graphs) pass data = Data(x=dataSel.x, edge_index=dataSel.edge_index) # print(data.num_features) parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, default='GAE') args = parser.parse_args() assert args.model in ['GAE', 'VGAE'] kwargs = {'GAE': GAE, 'VGAE': VGAE} class Encoder(torch.nn.Module): # def __init__(self, in_channels, out_channels): def __init__(self, in_channels, out_channels): super(Encoder, self).__init__() self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True) if args.model in ['GAE']:
def load_graph_data(realization=0, cutoff=30):
    """Load (generating on demand) halo catalog ``realization`` and build a
    radius graph over halo positions.

    Args:
        realization: which catalog realization to read (``halos_<r>.h5``).
        cutoff: connection radius ("region of influence") in catalog units.

    Returns:
        dict with 'graph' (torch_geometric ``Data``), a 'column_description'
        string, and the 'pos_scale' / 'vel_scale' / 'M14_scale' factors used
        to standardize the features.
    """
    try:
        cur_data = pd.read_hdf('halos_%d.h5' % (realization, ))
    except Exception:
        # FIX: was a bare `except:` (which also swallows KeyboardInterrupt /
        # SystemExit). File missing or unreadable: generate the catalog once,
        # then retry the read.
        from generate_halo_data_nv import generate_data
        generate_data(realization, get_cluster())
        cur_data = pd.read_hdf('halos_%d.h5' % (realization, ))

    # Connect nearby halos via a KD-tree radius query.
    xyz = np.array([cur_data.x, cur_data.y, cur_data.z]).T
    tree = KDTree(xyz)

    # Aim for ~8 neighbors on average; `cutoff` controls this.
    region_of_influence = cutoff
    #plt.hist(tree.query_radius(xyz, region_of_influence, count_only=True)-1, bins=31);
    #plt.xlabel('Number with')
    #plt.ylabel('Number of neighbors')

    # Build the edge list (sender=neighbor, receiver=center node).
    neighbors = tree.query_radius(xyz,
                                  region_of_influence,
                                  sort_results=True,
                                  return_distance=True)[0]
    all_edges = []
    for j in range(len(neighbors)):
        if len(neighbors[j]) == 1:
            # Only the node itself within the radius: no edges.
            continue
        # Receiving node is second! neighbors[j][0] is j itself (sorted by
        # distance), so skip it.
        cur = np.array([neighbors[j][1:],
                        np.ones(len(neighbors[j]) - 1) * j],
                       dtype=np.int64)
        all_edges.append(cur)
    all_edges = np.concatenate(all_edges, axis=1)

    # Move the catalog into torch tensors.
    X_raw = torch.from_numpy(
        np.array(cur_data['x y z vx vy vz M14'.split(' ')]))
    y_raw = torch.from_numpy(np.array(cur_data[['delta']]))

    # Standardize: positions centered at 500 (box half-size — assumes a
    # 1000-unit box, see the edge mask below) and scaled to unit std;
    # velocities and mass scaled to unit std.
    pos_scale = X_raw[:, :3].std(0).mean(0)
    pos_mean = 500
    vel_scale = X_raw[:, 3:6].std(0).mean(0)
    M14_scale = X_raw[:, 6].std()
    X = X_raw.clone()
    X[:, :3] = (X[:, :3] - pos_mean) / pos_scale
    X[:, 3:6] = (X[:, 3:6]) / vel_scale
    X[:, 6] = (X[:, 6]) / M14_scale

    # FIX: was `torch.LongTensor(torch.from_numpy(all_edges))` — all_edges is
    # already int64, so one conversion suffices.
    edge_index = torch.from_numpy(all_edges).long()
    # (A no-op `cur_data['z'].min()` statement was removed here.)

    # Which nodes are far enough from the edge of the (assumed 1000-unit) box
    # that their neighborhood is complete? FIX: np.product is a deprecated
    # alias removed in NumPy 2.0; use np.prod.
    nodes_far_from_edge = np.prod(
        [((region_of_influence < cur_data[dim]) &
          (1000 - region_of_influence > cur_data[dim]))
         for dim in 'x y z'.split(' ')], 0).astype(np.float32)

    # We'll include this in the y-vector as a simple multiplier against the
    # loss for bad (edge-adjacent) nodes:
    y = torch.cat([y_raw, torch.from_numpy(nodes_far_from_edge)[:, None]],
                  dim=1)

    graph_data = Data(X, edge_index=edge_index, y=y)
    return {
        'graph':
        graph_data,
        'column_description':
        'x columns are [x, y, z, vx, vy, vz, M]; everything has been scaled to be std=1. y columns are [bias, mask], where mask=1 indicates that the node should be used as a receiver for training; mask=0 indicates that the node is too close to the edge. Multiply the node-wise loss by the mask during training.',
        'pos_scale': pos_scale,
        'vel_scale': vel_scale,
        'M14_scale': M14_scale
    }
# edge_index = torch.tensor(edge_lists, dtype=torch.long) # data2 = Data(edge_index=edge_index.t().contiguous()) # edge_lists = [x for x in edge_lists if not x[0]==x[1]] # remove self loop l = np.array(edge_lists) # sort by first then second column # https://stackoverflow.com/a/38194077/12859133 l = l[l[:, 1].argsort()] l = l[l[:, 0].argsort(kind='mergesort')] l = l.transpose() r = np.array([l[1, :], l[0, :]]) # 建無向圖 edge_index = torch.from_numpy(r).long() # edge_index = torch.from_numpy(r).long()-1 # node index應該都要從0開始 x = torch.arange(1, l[0].max() + 1).long() data = Data(x, edge_index) # 假設x是從node id 0 開始遞增 d = data.edge_index.data.numpy() # 建nx的無向圖 G = nx.Graph() G.add_edges_from(edge_lists) # networkx無向圖轉Data data3 = from_networkx(G) data3.x = torch.tensor(list(G.nodes)).unsqueeze( -1) # 不一定對的上, 因為node id可能早被重新編碼 # 很費時, 建好graph就直接存檔(networkx(用Data直接轉) + Data) torch.save(data, dataset + 'global_graph_start0.pt') # 如果是dynamic版的話: # 用Data()建T張獨立的graph, 不要把這graph放進dataloader => 各自train GAT到T emb
#### Node features for MeSH nodes: one-hot identity matrix, one row per term.
x_mesh = torch.eye(len(set(mesh_filtered['DescriptorName_UI'])))

#### Paper-MeSH edges: remap raw identifiers onto contiguous graph indices.
mesh_filtered['PMID'] = [
    dict_reindex[pmid] for pmid in mesh_filtered['PMID'].tolist()
]
mesh_filtered['DescriptorName_UI'] = [
    dict_reindex_mesh[term_ui]
    for term_ui in mesh_filtered['DescriptorName_UI'].tolist()
]
edge_index_paper_mesh = torch.LongTensor(
    mesh_filtered[['PMID', 'DescriptorName_UI']].values.transpose())

#Create dataset
# edge_index_paper_paper = to_undirected(edge_index_paper_paper)
# edge_index_paper_mesh = to_undirected(edge_index_paper_mesh)
n_paper_paper = edge_index_paper_paper.size(1)
n_paper_mesh = edge_index_paper_mesh.size(1)
# Relation type per edge: 0 = paper-paper, 1 = paper-mesh.
edge_type = torch.cat(
    [torch.zeros(n_paper_paper), torch.ones(n_paper_mesh)], 0)
edge_index = torch.cat([edge_index_paper_paper, edge_index_paper_mesh], 1)

dataset = Data(x_paper=x_paper,
               x_mesh=x_mesh,
               edge_index=edge_index,
               edge_type=edge_type,
               mesh_feature_dim=x_mesh.size(1),
               paper_feature_dim=x_paper.size(1))
torch.save(dataset, 'het_graph_paper_mesh.pk')
def process_single_file(self, raw_file_name):
    """Read one pickled event file and convert each event into a
    torch_geometric ``Data`` object whose edges come from the (thresholded)
    distance matrix, with candidate/gen link matrices as target edge attrs.
    """
    with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi:
        all_data = pickle.load(fi, encoding='iso-8859-1')

    batch_data = []
    for idata, data in enumerate(all_data):
        mat = data["dm"].copy()

        #set all edges with distance greater than 0.5 to 0
        md = mat.todense()
        md[md > 0.5] = 0
        mat = scipy.sparse.coo_matrix(md)

        mat_reco_cand = data["dm_elem_cand"].copy()
        mat_reco_gen = data["dm_elem_gen"].copy()

        # Keep only reco/gen links that coincide with a surviving distance
        # edge (elementwise product, then booleanize).
        mul1 = mat.multiply(mat_reco_cand)
        mul2 = mat.multiply(mat_reco_gen)
        mul1 = mul1 > 0
        mul2 = mul2 > 0

        if len(mat.row) > 0:
            # Rebuild the link matrices on exactly the surviving edge
            # coordinates so they align 1:1 with mat's edge list.
            mat_reco_cand = scipy.sparse.coo_matrix(
                (np.array(mul1[mat.row, mat.col]).squeeze(),
                 (mat.row, mat.col)),
                shape=(mat.shape[0], mat.shape[1]))
            mat_reco_gen = scipy.sparse.coo_matrix(
                (np.array(mul2[mat.row, mat.col]).squeeze(),
                 (mat.row, mat.col)),
                shape=(mat.shape[0], mat.shape[1]))
        else:
            # No edges survive the distance cut: keep empty matrices of the
            # correct shape.
            mat_reco_cand = scipy.sparse.coo_matrix(
                np.zeros((mat.shape[0], mat.shape[1])))
            mat_reco_gen = scipy.sparse.coo_matrix(
                np.zeros((mat.shape[0], mat.shape[1])))

        X = data["Xelem"]
        ygen = data['ygen']
        ycand = data['ycand']

        #node_sel = X[:, 4] > 0.2
        #row_index, col_index, dm_data = mat.row, mat.col, mat.data
        #num_elements = X.shape[0]
        #num_edges = row_index.shape[0]

        #edge_index = np.zeros((2, 2*num_edges))
        #edge_index[0, :num_edges] = row_index
        #edge_index[1, :num_edges] = col_index
        #edge_index[0, num_edges:] = col_index
        #edge_index[1, num_edges:] = row_index
        #edge_index = torch.tensor(edge_index, dtype=torch.long)

        #edge_data = dm_data
        #edge_attr = np.zeros((2*num_edges, 1))
        #edge_attr[:num_edges,0] = edge_data
        #edge_attr[num_edges:,0] = edge_data
        #edge_attr = torch.tensor(edge_attr, dtype=torch.float)

        # (edge_index, edge_weight) tuples from the sparse matrices.
        r = torch_geometric.utils.from_scipy_sparse_matrix(mat)
        rc = torch_geometric.utils.from_scipy_sparse_matrix(mat_reco_cand)
        rg = torch_geometric.utils.from_scipy_sparse_matrix(mat_reco_gen)

        #edge_index, edge_attr = torch_geometric.utils.subgraph(torch.tensor(node_sel, dtype=torch.bool),
        #    edge_index, edge_attr,
        #    relabel_nodes=True, num_nodes=len(X))

        x = torch.tensor(X, dtype=torch.float)
        ygen = torch.tensor(ygen, dtype=torch.float)
        ycand = torch.tensor(ycand, dtype=torch.float)

        data = Data(
            x=x,
            edge_index=r[0].to(dtype=torch.long),
            edge_attr=r[1].to(dtype=torch.float),
            ygen=ygen,
            ycand=ycand,
            target_edge_attr_cand=rc[1].to(dtype=torch.float),
            target_edge_attr_gen=rg[1].to(dtype=torch.float),
        )
        data_prep(data)
        batch_data += [data]

    return batch_data
def _process(self, data_list):
    """Collate ``data_list`` into one batch object.

    Returns an empty ``Data`` when there is nothing to collate; otherwise a
    ``Batch`` with its bookkeeping ``batch`` attribute stripped.
    """
    if not data_list:
        return Data()
    merged = Batch.from_data_list(data_list)
    delattr(merged, "batch")
    return merged
                                                       hidden_dim4, edge_input_dim)
        self.resmpblock4 = ResidualMessagePassingBlock(hidden_dim4, hidden_dim5,
                                                       edge_input_dim)
        # Set2Set pooling produces a graph vector of size 2 * hidden_dim5.
        self.set2set = Set2Set(hidden_dim5, processing_steps=3)
        self.ffnn_out = torch.nn.Linear(hidden_dim5 * 2, output_dim)

    def forward(self, data):
        """Run the residual message-passing stack, pool nodes per graph with
        Set2Set, and project to a flat per-graph output vector."""
        data.x = F.relu(self.ffnn(data.x))
        data = self.resmpblock0(data)
        data = self.resmpblock1(data)
        data = self.resmpblock2(data)
        data = self.resmpblock3(data)
        data = self.resmpblock4(data)
        x = self.set2set(data.x, data.batch)
        x = self.ffnn_out(x)
        return x.view(-1)


if __name__ == '__main__':
    # Smoke test with random inputs on a single block.
    from torch_geometric.data import Data
    data = Data(
        x=torch.rand([100, 18]),
        edge_attr=torch.rand([200, 7]),
        # NOTE(review): every edge is 1 -> 1 here — placeholder connectivity
        # for the smoke test only.
        edge_index=torch.ones([2, 200]).long(),
        y=torch.ones([200]),
        batch=torch.zeros([200]).long())
    resmpblock0 = ResidualMessagePassingBlock(18, 18)
    print(resmpblock0(data))
def physnet_to_datalist(self,
                        N,
                        R,
                        E,
                        D,
                        Q,
                        Z,
                        num_mol,
                        mols,
                        efgs_batch,
                        EFG_R,
                        EFG_Z,
                        num_efg,
                        sol_data=None):
    """
    load data from PhysNet structure to InMemoryDataset structure (more compact)

    Per molecule i: N[i] atom count, R[i] coordinates, E[i] energy, D[i]
    dipole, Q[i] charge, Z[i] atomic numbers (zero-padded); optional EFG
    (functional-group) tensors and an optional solvation table matched by
    InChI. Molecules present in ``sol_data`` but unmatched are skipped.

    :return: list of processed Data objects (skipped molecules excluded)
    """
    from rdkit.Chem.inchi import MolToInchi
    # NOTE(review): dtype=Data is unusual — dtype=object is the conventional
    # spelling for an object array; confirm numpy accepts the class here.
    data_array = np.empty(num_mol, dtype=Data)
    t0 = time.time()

    # Heavy-atom count of the first molecule (padding zeros and hydrogens
    # excluded); used only to tag the saved mapping file below.
    Z_0 = Z[0, :]
    n_heavy = len(Z_0) - (Z_0 == 0).sum() - (Z_0 == 1).sum()

    # 1/0 per molecule: was a solvation-table match found?
    jianing_to_dongdong_map = []
    for i in tqdm(range(num_mol)):
        if self.bond_atom_sep:
            mol = mols[i]
        else:
            mol = None

        # atomic infos
        _tmp_Data = Data()
        num_atoms = N[i]
        _tmp_Data.N = num_atoms.view(-1)
        _tmp_Data.R = R[i, :N[i]].view(-1, 3)
        _tmp_Data.E = E[i].view(-1)
        _tmp_Data.D = D[i].view(-1, 3)
        _tmp_Data.Q = Q[i].view(-1)
        _tmp_Data.Z = Z[i, :N[i]].view(-1)

        if self.cal_efg:
            # Extended-functional-group level tensors, trimmed to this
            # molecule's EFG count.
            _tmp_Data.atom_to_EFG_batch = efgs_batch[i, :N[i]].view(-1)
            _tmp_Data.EFG_R = EFG_R[i, :num_efg[i]].view(-1, 3)
            _tmp_Data.EFG_Z = EFG_Z[i, :num_efg[i]].view(-1)
            _tmp_Data.EFG_N = num_efg[i].view(-1)

        if sol_data is not None:
            # find molecule from solvation csv file based on InChI, if found, add it
            this_sol_data = sol_data.loc[sol_data["InChI"] == MolToInchi(mol)]
            if this_sol_data.shape[0] == 1:
                for key in sol_keys:
                    _tmp_Data.__setattr__(
                        key,
                        torch.as_tensor(this_sol_data.iloc[0][key]).view(-1))
                jianing_to_dongdong_map.append(1)
            else:
                # No unique match: skip this molecule entirely
                # (data_array[i] stays None and is filtered out below).
                jianing_to_dongdong_map.append(0)
                continue

        _tmp_Data = self.pre_transform(
            data=_tmp_Data,
            edge_version=self.edge_version,
            do_sort_edge=self.sort_edge,
            cal_efg=self.cal_efg,
            cutoff=self.cutoff,
            extended_bond=self.extended_bond,
            boundary_factor=self.boundary_factor,
            type_3_body=self.type_3_body,
            use_center=self.use_center,
            mol=mol,
            cal_3body_term=self.cal_3body_term,
            bond_atom_sep=self.bond_atom_sep,
            record_long_range=self.record_long_range)

        data_array[i] = _tmp_Data

    if sol_data is not None:
        torch.save(torch.as_tensor(jianing_to_dongdong_map),
                   "jianing_to_dongdong_map_{}.pt".format(n_heavy))

    # Drop molecules that were skipped above.
    data_list = [
        data_array[i] for i in range(num_mol) if data_array[i] is not None
    ]
    return data_list
def inference(self, X, y):
    """Run the model over every scenegraph sequence and collect statistics.

    Args:
        X: sequence of dicts with keys 'sequence' (list of graph dicts),
           'category', and 'folder_name'.
        y: per-sequence integer labels (1 = risky clip).

    Returns:
        Tuple of (categories, folder_names, avg_test_loss, avg_risky_seq_len,
        inference_time, attns_weights, node_attns).
    """
    labels = torch.LongTensor().to(self.config.device)
    outputs = torch.FloatTensor().to(self.config.device)
    # Dictionary storing (output, label) pair for all driving categories
    categories = dict.fromkeys(self.unique_clips)
    for key, val in categories.items():
        categories[key] = {'outputs': outputs, 'labels': labels}
    acc_loss_test = 0
    folder_names = []
    attns_weights = []
    node_attns = []
    inference_time = 0
    with torch.no_grad():
        for i in range(len(X)):  # iterate through scenegraphs
            data, label, category = X[i]['sequence'], y[i], X[i]['category']
            # One Data object per frame; batch the whole sequence at once.
            data_list = [
                Data(x=g['node_features'],
                     edge_index=g['edge_index'],
                     edge_attr=g['edge_attr']) for g in data
            ]
            self.test_loader = DataLoader(data_list,
                                          batch_size=len(data_list))
            sequence = next(iter(self.test_loader)).to(self.config.device)
            self.model.eval()
            #start = torch.cuda.Event(enable_timing=True)
            #end = torch.cuda.Event(enable_timing=True)
            #start.record()
            output, attns = self.model.forward(sequence.x,
                                               sequence.edge_index,
                                               sequence.edge_attr,
                                               sequence.batch)
            #end.record()
            #torch.cuda.synchronize()
            inference_time += 0  #start.elapsed_time(end)
            loss_test = self.loss_func(
                output.view(-1, 2),
                torch.LongTensor([label]).to(self.config.device))
            acc_loss_test += loss_test.detach().cpu().item()
            label = torch.tensor(label,
                                 dtype=torch.long).to(self.config.device)
            # store output, label statistics
            self.update_categorical_outputs(categories, output, label,
                                            category)
            folder_names.append(X[i]['folder_name'])
            if 'lstm_attn_weights' in attns:
                attns_weights.append(attns['lstm_attn_weights'].squeeze(
                ).detach().cpu().numpy().tolist())
            if 'pool_score' in attns:
                node_attn = {}
                node_attn["original_batch"] = sequence.batch.detach().cpu(
                ).numpy().tolist()
                node_attn["pool_perm"] = attns['pool_perm'].detach().cpu(
                ).numpy().tolist()
                node_attn["pool_batch"] = attns['batch'].detach().cpu(
                ).numpy().tolist()
                node_attn["pool_score"] = attns['pool_score'].detach().cpu(
                ).numpy().tolist()
                node_attns.append(node_attn)

    # Average output length over risky (label == 1) clips.
    # NOTE(review): assumes 'all' is one of self.unique_clips — confirm.
    sum_seq_len = 0
    num_risky_sequences = 0
    sequences = len(categories['all']['labels'])
    for indices in range(sequences):
        seq_output = categories['all']['outputs'][indices]
        label = categories['all']['labels'][indices]
        # risky clip
        if label == 1:
            num_risky_sequences += 1
            sum_seq_len += seq_output.shape[0]
    # FIX: guard against ZeroDivisionError when the dataset contains no
    # risky sequences (the original divided unconditionally).
    avg_risky_seq_len = (sum_seq_len / num_risky_sequences
                         if num_risky_sequences > 0 else 0.0)

    return categories, \
        folder_names, \
        acc_loss_test/len(X), \
        avg_risky_seq_len, \
        inference_time, \
        attns_weights, \
        node_attns