def _save_g(file_path, g, labels=None):
    """Forward ``g`` and the optional ``labels`` to ``save_graphs``.

    Parameters
    ----------
    file_path
        Destination passed to ``save_graphs`` as its first argument.
    g
        Graph object(s) passed to ``save_graphs`` as its second argument.
    labels
        Optional value passed on as the ``labels`` keyword. Default to None.
    """
    save_graphs(
        file_path,
        g,
        labels=labels,
    )
def create_old_heterograph_files():
    """Write the ``data/hetero1.bin`` fixture next to this module.

    The file holds the graphs produced by ``create_heterographs`` for
    ``F.int64`` followed by those for ``F.int32``, together with a single
    ``"graph_label"`` entry built from ``F.ones(54)``.
    """
    module_dir = os.path.dirname(__file__)
    path = os.path.join(module_dir, "data/hetero1.bin")

    # Build the int64 graphs first, then the int32 ones, and keep that order.
    graphs_int64 = create_heterographs(F.int64)
    graphs_int32 = create_heterographs(F.int32)
    all_graphs = graphs_int64 + graphs_int32

    graph_labels = {"graph_label": F.ones(54)}
    save_graphs(path, all_graphs, graph_labels)
train_G = nx.from_scipy_sparse_matrix( full_adj[:n_training_samples][:, :n_training_samples]) train_DGL = dgl.DGLGraph() train_DGL.from_networkx(train_G, edge_attrs=['weight']) # train_DGL.from_scipy_sparse_matrix(full_adj[:n_training_samples][:, :n_training_samples]) assert (len(train_DGL) == train_features.shape[0]) test_G = nx.from_scipy_sparse_matrix( full_adj[n_training_docs:][:, n_training_docs:]) test_DGL = dgl.DGLGraph() test_DGL.from_networkx(test_G, edge_attrs=['weight']) # test_DGL.from_scipy_sparse_matrix(full_adj[n_training_docs:][:, n_training_docs:]) assert (len(test_DGL) == test_features.shape[0]) Gs = [train_DGL, test_DGL] save_graphs('graph.bin', Gs) print(Gs[0]) print('load graph done') class Model(nn.Module): def __init__(self, feature_dim, inter_dim, final_dim): super(Model, self).__init__() self.gcn1 = GraphConv(feature_dim, inter_dim) self.gcn2 = GraphConv(inter_dim, final_dim) self.dropout = nn.Dropout(p=0.5) def forward(self, graph, features): x = self.gcn1(graph, features) # x = self.dropout(x) x = F.relu(x)
def _pre_process(self, smiles_to_graph, node_featurizer, edge_featurizer,
                 load, log_every, init_mask, n_jobs=1):
    """Pre-process the dataset

    * Convert molecules from smiles format into DGLGraphs
      and featurize their atoms
    * Set missing labels to be 0 and use a binary masking
      matrix to mask them

    The graphs, labels, optional mask and the indices of the valid
    molecules are cached at ``self.cache_file_path`` and reloaded from
    there when ``load`` is True and the file exists.

    Parameters
    ----------
    smiles_to_graph : callable, SMILES -> DGLGraph
        Function for converting a SMILES (str) into a DGLGraph.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. It only
        takes effect on the serial path, i.e. when :attr:`n_jobs` is not greater than 1.
    init_mask : bool
        Whether to initialize a binary mask indicating the existence of labels.
    n_jobs : int
        Degree of parallelism for pre processing. Default to 1.
    """
    if os.path.exists(self.cache_file_path) and load:
        # DGLGraphs have been constructed before, reload them
        print('Loading previously saved dgl graphs...')
        self.graphs, label_dict = load_graphs(self.cache_file_path)
        self.labels = label_dict['labels']
        # Always define ``self.mask`` so that both branches leave the
        # instance with the same set of attributes.
        self.mask = label_dict['mask'] if init_mask else None
        self.valid_ids = label_dict['valid_ids'].tolist()
    else:
        print('Processing dgl graphs from scratch...')
        if n_jobs > 1:
            self.graphs = pmap(smiles_to_graph,
                               self.smiles,
                               node_featurizer=node_featurizer,
                               edge_featurizer=edge_featurizer,
                               n_jobs=n_jobs)
        else:
            self.graphs = []
            for i, s in enumerate(self.smiles):
                if (i + 1) % log_every == 0:
                    print('Processing molecule {:d}/{:d}'.format(
                        i + 1, len(self)))
                self.graphs.append(
                    smiles_to_graph(s,
                                    node_featurizer=node_featurizer,
                                    edge_featurizer=edge_featurizer))

        # Keep only valid molecules
        self.valid_ids = []
        graphs = []
        for i, g in enumerate(self.graphs):
            if g is not None:
                self.valid_ids.append(i)
                graphs.append(g)
        self.graphs = graphs

        _label_values = self.df[self.task_names].values
        # np.nan_to_num will also turn inf into a very large number
        self.labels = F.zerocopy_from_numpy(
            np.nan_to_num(_label_values).astype(
                np.float32))[self.valid_ids]
        valid_ids = torch.tensor(self.valid_ids)

        # Assemble the cached tensors once; 'mask' is only stored when asked for.
        cached = {'labels': self.labels}
        if init_mask:
            self.mask = F.zerocopy_from_numpy(
                (~np.isnan(_label_values)).astype(
                    np.float32))[self.valid_ids]
            cached['mask'] = self.mask
        else:
            self.mask = None
        cached['valid_ids'] = valid_ids
        save_graphs(self.cache_file_path, self.graphs, labels=cached)

    # Drop the SMILES strings whose graph construction failed so that
    # ``self.smiles`` stays aligned with ``self.graphs``.
    self.smiles = [self.smiles[i] for i in self.valid_ids]
src = torch.tensor(graph_df.u.values) dst = torch.tensor(graph_df.i.values) label = torch.tensor(graph_df.label.values, dtype=torch.float32) timestamp = torch.tensor(graph_df.ts.values, dtype=torch.float32) edge_feat = torch.tensor(edge_features[1:], dtype=torch.float32) g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src]))) len_event = src.shape[0] g.edata['label'] = label.repeat(2).squeeze() g.edata['timestamp'] = timestamp.repeat(2).squeeze() g.edata['feat'] = edge_feat.repeat(2, 1).squeeze() print(g) save_graphs(f"./data/{args.data}.bin", g) if args.new_node_count: origin_num_edges = g.num_edges() // 2 train_eid = torch.arange(0, int(0.7 * origin_num_edges)) un_train_eid = torch.arange(int(0.7 * origin_num_edges), origin_num_edges) train_g = dgl.graph(g.find_edges(train_eid)) val_n_test_g = dgl.compact_graphs(dgl.graph(g.find_edges(un_train_eid))) print( f'total nodes: {g.num_nodes()}, training nodes: {train_g.num_nodes()}, val_n_test nodes: {val_n_test_g.num_nodes()}' ) old_nodes = val_n_test_g.num_nodes() - g.num_nodes() + train_g.num_nodes() print( f'old nodes in val_n_test: {old_nodes} ({round((old_nodes)*100/val_n_test_g.num_nodes(),4)}%)'
def gaussian_square(x0: float, xn: float, y0: float, yn: float,
                    stop: float, steps: int,
                    f: str = '0',
                    ud_top: str = '0',
                    ud_bottom: str = '0',
                    ud_left: str = '0',
                    ud_right: str = '0',
                    u0: str = '0',
                    cell_size: float = 5.,
                    tol: float = 1e-4,
                    dy: bool = False,
                    path: str = 'data/gaussian_square_static.bin'):
    '''Create Gaussian Equation Dataset

    2D Gaussian equation, dynamic process, rectangle domain. Each boundary's
    condition can be customised, and the boundary condition can be either
    static or dynamic (re-built at every time step).

    One graph is produced per time step via ``to_dgl`` and the whole list is
    written to ``path`` with ``save_graphs``.

    Args:
        x0: <float> left boundary for x
        xn: <float> right boundary for x
        y0: <float> lower boundary for y
        yn: <float> upper boundary for y
        stop: <float> process stop time
        steps: <int> number of time step
        f: <str> right part of laplace function, in cpp argument format
        ud_top: <str> boundary condition on the top of rectangle
        ud_bottom: <str> boundary condition on the bottom of rectangle
        ud_left: <str> boundary condition on the left of rectangle
        ud_right: <str> boundary condition on the right of rectangle
        u0: <str> initialization condition function, in cpp argument format
        cell_size: <float> cell size for created mesh
        tol: <float> boundary bias, e.g. (x-tol, x+tol) is a boundary on x
        dy: <bool> if the boundary condition is dynamic
        path: <str> path for saving generated dgl graph, in .bin format
    '''
    if dy:
        # Dynamic boundary: build the initial condition at time t = 0.
        mesh, function_space, bc = rectangle(x0, xn, y0, yn,
                                             ud_top, ud_bottom,
                                             ud_left, ud_right,
                                             cell_size, 0, tol)
    else:
        # NOTE(review): this call passes one positional argument fewer than
        # the dynamic one (no time value) -- confirm against ``rectangle``'s
        # signature that ``tol`` lands in the intended parameter.
        mesh, function_space, bc = rectangle(x0, xn, y0, yn,
                                             ud_top, ud_bottom,
                                             ud_left, ud_right,
                                             cell_size, tol)

    dt = stop / steps

    # Keep the string parameters intact and bind the expressions to new names.
    u0_expr = Expression(u0, degree=2)
    f_expr = Expression(f, degree=2)

    un = interpolate(u0_expr, function_space)
    u = TrialFunction(function_space)
    v = TestFunction(function_space)

    residual = (u * v * dx
                + dt * dot(grad(u), grad(v)) * dx
                - (un + dt * f_expr) * v * dx)
    a, L = lhs(residual), rhs(residual)

    u = Function(function_space)
    t = 0
    graphs = []
    for _ in range(steps):
        t += dt
        if dy:
            # The calls above unpack three values from ``rectangle``; the
            # original two-name unpacking here could not accept that. Only
            # the boundary condition (last element) is needed.
            *_, bc = rectangle(x0, xn, y0, yn,
                               ud_top, ud_bottom,
                               ud_left, ud_right,
                               cell_size, t, tol)
        solve(a == L, u, bc)
        # Feed the new solution back as the previous step for the next solve.
        un.assign(u)
        graphs.append(to_dgl(function=u, mesh=mesh))

    save_graphs(path, graphs)
def pre_process(self):
    """Load the processed DGL graph, building (and downloading) it if needed.

    If ``<root>/processed/dgl_data_processed`` exists it is loaded with
    ``load_graphs``. Otherwise the raw files under ``<root>/raw`` are read
    (after an optional download when they are missing), the resulting graph
    is saved to that path and then loaded back into ``self.graph``.

    Exits the process with status ``-1`` when the raw files are missing and
    ``decide_download`` declines the download.
    """
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph, _ = load_graphs(pre_processed_file_path)
        return

    raw_dir = osp.join(self.root, 'raw')

    ### check if the downloaded file exists
    if self.binary:
        # npz format
        required_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
    else:
        # csv file
        required_file = ('triplet-type-list.csv.gz'
                         if self.is_hetero else 'edge.csv.gz')

    if not osp.exists(osp.join(raw_dir, required_file)):
        url = self.meta_info['url']
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete folder if there exists; best effort, but only
            # filesystem errors are swallowed (not KeyboardInterrupt etc.)
            try:
                shutil.rmtree(self.root)
            except OSError:
                pass
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)
        else:
            print('Stop download.')
            exit(-1)

    def _split_file_list(value):
        # meta_info encodes "no files" as the literal string 'None'.
        return [] if value == 'None' else value.split(',')

    ### pre-process and save
    add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'
    additional_node_files = _split_file_list(
        self.meta_info['additional node files'])
    additional_edge_files = _split_file_list(
        self.meta_info['additional edge files'])

    reader = read_heterograph_dgl if self.is_hetero else read_graph_dgl
    graph = reader(raw_dir,
                   add_inverse_edge=add_inverse_edge,
                   additional_node_files=additional_node_files,
                   additional_edge_files=additional_edge_files,
                   binary=self.binary)[0]

    print('Saving...')
    save_graphs(pre_processed_file_path, graph, {})
    # Re-load so that self.graph has the same form as on a cached run.
    self.graph, _ = load_graphs(pre_processed_file_path)
def pre_process(self):
    """Load the processed graph, building (and downloading) it if needed.

    Homogeneous graphs are cached with ``save_graphs``/``load_graphs``;
    heterogeneous graphs are cached as a pickled one-element list. Both are
    stored at ``<root>/processed/dgl_data_processed``.

    Exits the process with status ``-1`` when the raw files are missing and
    ``decide_download`` declines the download.
    """
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        if not self.is_hetero:
            self.graph, _ = load_graphs(pre_processed_file_path)
        else:
            # NOTE: pickle executes arbitrary code on load; this is only
            # acceptable because the cache file is produced locally below.
            with open(pre_processed_file_path, 'rb') as f:
                self.graph = pickle.load(f)
        return

    raw_dir = osp.join(self.root, "raw")
    meta = self.meta_info[self.name]

    ### check if the downloaded file exists
    required_file = ("triplet-type-list.csv.gz"
                     if self.is_hetero else "edge.csv.gz")

    if not osp.exists(osp.join(raw_dir, required_file)):
        url = meta["url"]
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete folder if there exists; best effort, but only
            # filesystem errors are swallowed (not KeyboardInterrupt etc.)
            try:
                shutil.rmtree(self.root)
            except OSError:
                pass
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)
        else:
            print("Stop download.")
            exit(-1)

    def _split_file_list(value):
        # meta_info encodes "no files" as the literal string 'None'.
        return [] if value == 'None' else value.split(',')

    ### pre-process and save
    add_inverse_edge = meta["add_inverse_edge"] == "True"
    additional_node_files = _split_file_list(meta["additional node files"])
    additional_edge_files = _split_file_list(meta["additional edge files"])

    if self.is_hetero:
        graph = read_csv_heterograph_dgl(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)[0]

        # open(..., 'wb') does not create missing parent directories.
        os.makedirs(processed_dir, exist_ok=True)
        with open(pre_processed_file_path, 'wb') as f:
            pickle.dump([graph], f)
        # Re-load so that self.graph has the same form as on a cached run.
        with open(pre_processed_file_path, 'rb') as f:
            self.graph = pickle.load(f)
    else:
        graph = read_csv_graph_dgl(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)[0]

        print('Saving...')
        save_graphs(pre_processed_file_path, graph, {})
        self.graph, _ = load_graphs(pre_processed_file_path)
neg_score = score_func(neg_g, emb).reshape(-1, neg_sample_size) filter_bias = neg_g.edata['false_neg'].reshape( -1, neg_sample_size).to(device) pos_score = F.logsigmoid(pos_score) neg_score = F.logsigmoid(neg_score) neg_score -= filter_bias.float() pos_score = pos_score.unsqueeze(1) rankings = torch.sum(neg_score >= pos_score, dim=1) + 1 return np.mean(1.0 / rankings.cpu().numpy()) device = torch.device(('cpu', 'cuda')[torch.cuda.is_available()]) g = load_ws() g = g.to(device) g.readonly() save_graphs('./ws.bin', g) features = g.ndata['features'] features = features.to(device) in_feats = g.ndata['features'].shape[1] #Model hyperparameters n_hidden = in_feats n_layers = 1 dropout = 0.5 aggregator_type = 'gcn' # create GraphSAGE model gconv_model = GraphSAGEModel(in_feats, n_hidden, n_hidden, n_layers, F.relu, dropout, aggregator_type) eids = np.random.permutation(g.number_of_edges()) train_eids = eids[:int(len(eids) * 0.8)]
def pre_process(self):
    """Load the processed graph and node labels, building them if needed.

    If ``<root>/processed/dgl_data_processed`` exists, the graph and label
    dict are loaded with ``load_graphs``. Otherwise the raw files under
    ``<root>/raw`` are read (after an optional download when they are
    missing), labels are converted to torch tensors, everything is saved
    with ``save_graphs`` and loaded back.

    ``self.labels`` is the whole label dict for heterogeneous graphs and
    the ``'labels'`` entry otherwise.

    Exits the process with status ``-1`` when the raw files are missing and
    ``decide_download`` declines the download.
    """
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict if self.is_hetero else label_dict['labels']
        return

    raw_dir = osp.join(self.root, 'raw')

    ### check if the downloaded file exists
    if self.binary:
        # npz format
        required_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
    else:
        # csv file
        required_file = ('triplet-type-list.csv.gz'
                         if self.is_hetero else 'edge.csv.gz')

    if not osp.exists(osp.join(raw_dir, required_file)):
        url = self.meta_info['url']
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete folder if there exists; best effort, but only
            # filesystem errors are swallowed (not KeyboardInterrupt etc.)
            try:
                shutil.rmtree(self.root)
            except OSError:
                pass
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)
        else:
            print('Stop download.')
            exit(-1)

    def _split_file_list(value):
        # meta_info encodes "no files" as the literal string 'None'.
        return [] if value == 'None' else value.split(',')

    def _labels_to_tensor(array):
        # Classification labels without NaN become integer class ids;
        # everything else (regression, or labels containing NaN) is float32.
        if 'classification' in self.task_type and not np.isnan(array).any():
            return torch.from_numpy(array).to(torch.long)
        return torch.from_numpy(array).to(torch.float32)

    ### pre-process and save
    add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'
    additional_node_files = _split_file_list(
        self.meta_info['additional node files'])
    additional_edge_files = _split_file_list(
        self.meta_info['additional edge files'])

    if self.is_hetero:
        graph = read_heterograph_dgl(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files,
            binary=self.binary)[0]

        if self.binary:
            # Close the archive as soon as every array has been read.
            with np.load(osp.join(raw_dir, 'node-label.npz')) as npz:
                label_dict = {key: npz[key] for key in list(npz.keys())}
        else:
            label_dict = read_node_label_hetero(raw_dir)

        # convert into torch tensor
        label_dict = {nodetype: _labels_to_tensor(node_label)
                      for nodetype, node_label in label_dict.items()}
    else:
        graph = read_graph_dgl(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files,
            binary=self.binary)[0]

        ### adding prediction target
        if self.binary:
            with np.load(osp.join(raw_dir, 'node-label.npz')) as npz:
                node_label = npz['node_label']
        else:
            node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                     compression='gzip',
                                     header=None).values

        label_dict = {'labels': _labels_to_tensor(node_label)}

    print('Saving...')
    save_graphs(pre_processed_file_path, graph, label_dict)

    # Re-load so that the attributes have the same form as on a cached run.
    self.graph, label_dict = load_graphs(pre_processed_file_path)
    self.labels = label_dict if self.is_hetero else label_dict['labels']
def save_graph(file_path, g):
    """Forward ``file_path`` and ``g`` unchanged to ``save_graphs``.

    Parameters
    ----------
    file_path
        Destination passed to ``save_graphs`` as its first argument.
    g
        Graph object(s) passed to ``save_graphs`` as its second argument.
    """
    save_graphs(
        file_path,
        g,
    )