def process(self, xd1, xd2, xt1, xt2, y, train_mixed, smile_graph):
    """Build a pair of PyG Data objects for a two-drug sample sharing one label.

    Each SMILES string is looked up in `smile_graph`, which maps
    SMILES -> (c_size, features, edge_index) precomputed with rdkit.
    """
    label = y

    def _build(smiles, target):
        # Fetch the precomputed molecular graph for this SMILES string.
        c_size, features, edge_index = smile_graph[smiles]
        graph = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([label]))
        graph.target = torch.LongTensor([target])
        graph.train_mixed = torch.LongTensor([train_mixed])
        graph.__setitem__('c_size', torch.LongTensor([c_size]))
        return graph

    return _build(xd1, xt1), _build(xd2, xt2)
def process(self, xd, target_key, y, smile_graph, target_graph):
    """Build parallel lists of molecule graphs and protein graphs.

    Args:
        xd: list of SMILES strings.
        target_key: list of protein keys into `target_graph`.
        y: list of labels, one per (drug, protein) pair.
        smile_graph: SMILES -> (c_size, features, edge_index).
        target_graph: key -> (target_size, target_features, target_edge_index).

    Stores the (optionally filtered/transformed) lists on
    self.data_mol / self.data_pro.
    """
    assert (len(xd) == len(target_key)
            and len(xd) == len(y)), 'The three lists must be the same length!'
    data_list_mol = []
    data_list_pro = []
    for smiles, tar_key, labels in zip(xd, target_key, y):
        # Precomputed molecular graph (built from SMILES via rdkit).
        c_size, features, edge_index = smile_graph[smiles]
        target_size, target_features, target_edge_index = target_graph[tar_key]
        mol_graph = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        mol_graph.__setitem__('c_size', torch.LongTensor([c_size]))
        pro_graph = DATA.Data(
            x=torch.Tensor(target_features),
            edge_index=torch.LongTensor(target_edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        pro_graph.__setitem__('target_size', torch.LongTensor([target_size]))
        data_list_mol.append(mol_graph)
        data_list_pro.append(pro_graph)
    if self.pre_filter is not None:
        data_list_mol = [d for d in data_list_mol if self.pre_filter(d)]
        data_list_pro = [d for d in data_list_pro if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list_mol = [self.pre_transform(d) for d in data_list_mol]
        data_list_pro = [self.pre_transform(d) for d in data_list_pro]
    self.data_mol = data_list_mol
    self.data_pro = data_list_pro
def _build_compound_graph_data(self, atoms, edges, target=None):
    """Assemble a PyG Data object from atom ids and an edge list.

    Args:
        atoms: atom vocabulary ids, reshaped to an (m, 1) column.
        edges: list of [src, dst] pairs; may be empty.
        target: optional integer label; stored as `y` when given.
    """
    node_ids = np.reshape(atoms, (-1, 1))  # m x 1
    # Clamp out-of-vocabulary atom ids onto the last vocabulary slot.
    node_ids[node_ids >= self.atom_vocab_size] = self.atom_vocab_size - 1
    edge_arr = np.array(edges)
    if edge_arr.shape[0] != 0:
        # PyG expects edge_index shaped (2, num_edges).
        edge_arr = np.transpose(edge_arr)
    kwargs = {
        'x': torch.LongTensor(node_ids),
        'edge_index': torch.LongTensor(edge_arr),
    }
    if target is not None:
        kwargs['y'] = torch.LongTensor([target])
    return gDATA.Data(**kwargs)
def use(self, smiles: list, model_filename=None) -> list:
    """Run the trained model on new SMILES strings and return predictions.

    Args:
        smiles: SMILES strings to predict for.
        model_filename: optional path to a saved model; if omitted, the
            in-session model is used.

    Raises:
        RuntimeError: if no model was trained and none is supplied.
    """
    if self._model is None and model_filename is None:
        raise RuntimeError(
            'Model not previously built, or model not supplied')
    if model_filename is not None:
        self._model = torch.load(model_filename)
    self._model.eval()
    # Encode each compound as a graph on the configured device.
    graphs = []
    for smi in smiles:
        node_feats, bond_feats = self._ce.encode(smi)
        graphs.append(
            gdata.Data(x=node_feats,
                       edge_index=self._ce.connectivity(smi),
                       edge_attr=bond_feats).to(self._config['device']))
    loader_test = gdata.DataLoader(graphs, batch_size=1, shuffle=False)
    # One prediction per compound (batch_size=1, no shuffling).
    results = []
    for batch in loader_test:
        _, res = self._model(batch)
        results.append(res.detach().numpy()[0])
    return results
def process(self, xd, xt, y, smile_graph):
    """Build drug-target graphs, collate them, and save to disk.

    Args:
        xd: list of SMILES strings.
        xt: list of encoded targets.
        y: list of labels.
        smile_graph: SMILES -> (c_size, features, edge_index).
    """
    assert (len(xd) == len(xt) and len(xt) ==
            len(y)), "The three lists must be the same length!"
    data_list = []
    for smiles, target, labels in zip(xd, xt, y):
        # Precomputed molecular graph (built from SMILES via rdkit).
        c_size, features, edge_index = smile_graph[smiles]
        sample = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        sample.target = torch.LongTensor([target])
        sample['c_size'] = torch.LongTensor([c_size])
        data_list.append(sample)
    if self.pre_filter is not None:
        data_list = [d for d in data_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(d) for d in data_list]
    print('Graph construction done. Saving to file.')
    data, slices = self.collate(data_list)
    # Persist the preprocessed dataset for later loading.
    torch.save((data, slices), self.processed_paths[0])
def process(self, xd, xt, y, smile_graph):
    """Convert (SMILES, target, label) triples to PyG graphs and save them."""
    assert (len(xd) == len(xt) and len(xt) == len(y))

    def _to_graph(smiles, target, labels):
        # One Data object per drug-target pair.
        c_size, features, edge_index = smile_graph[smiles]
        g = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        g.target = torch.LongTensor([target])
        g['c_size'] = torch.LongTensor([c_size])
        return g

    data_list = [_to_graph(s, t, l) for s, t, l in zip(xd, xt, y)]
    if self.pre_filter is not None:
        data_list = [d for d in data_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(d) for d in data_list]
    print('Graph construction done. Saving to file.')
    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def binarized_data(self):
    """Return a Data copy whose edge attributes are hard one-hot vectors.

    Uses the straight-through estimator: the forward pass sees the argmax
    one-hot, while gradients flow through the soft probabilities.
    """
    probs = F.softmax(self.edge_attr, dim=-1)
    hard = torch.zeros_like(probs)
    hard.scatter_(-1, probs.argmax(-1).view(-1, 1), 1)
    # hard - probs.detach() + probs == hard in value, probs in gradient.
    ste_attr = hard - probs.detach() + probs
    return gd.Data(x=self.nodes, edge_index=self.edge_index,
                   edge_attr=ste_attr)
def prepped_to_tensor(data):
    """Wrap a prepped 5-tuple of index lists into a PyG Data of LongTensors.

    `data` is (x, position, depth, depth_count, edge_index); `depth` is
    stored under the `depth_mask` attribute name.
    """
    x, position, depth, depth_count, edge_index = data
    return th_data.Data(
        x=th.LongTensor(x),
        position=th.LongTensor(position),
        depth_mask=th.LongTensor(depth),
        depth_count=th.LongTensor(depth_count),
        edge_index=th.LongTensor(edge_index),
    )
def forward(self, data: pyg_data.Data):
    """Run one GAT sub-network per edge-attribute channel and concatenate.

    For each attribute channel i, keeps only the edges whose i-th attribute
    is non-zero, builds a subgraph over the same node features, and feeds
    it to the i-th GAT; the per-channel outputs are concatenated on dim 1.
    """
    out = []
    for i in range(self.__edge_attr_dim):
        # FIX: use a bool mask — uint8 (.byte()) masks for masked_select
        # are deprecated in modern PyTorch and emit warnings/errors.
        _mask = data.edge_attr[:, i].bool()
        # The (E,) mask broadcasts over the (2, E) edge_index; reshaping
        # back to (2, -1) recovers the selected edges.
        _edge_index = torch.masked_select(data.edge_index,
                                          mask=_mask).view(2, -1)
        _data = pyg_data.Data(x=data.x, edge_index=_edge_index)
        out.append(self.__gat_nets[i](_data))
    return torch.cat(tuple(out), dim=1)
def get_datasets(args):
    """Load the hateful-Twitter-users dataset as a single PyG Data object."""
    # mask not multiple datasets
    if args.use_refex:
        datadir = 'hate_with_refex'
    else:
        datadir = 'hate'
    feat_data, labels, edges = load_hate(args, datadir)
    # NOTE(review): `batch` is set to the first feature column — presumably
    # a precomputed assignment; confirm against downstream consumers.
    return pyg_d.Data(x=feat_data, edge_index=edges, y=labels,
                      batch=feat_data[:, 0])
def get_dataloader(graph, X, y, batch_size=1, undirected=True, shuffle=True):
    """
    Converts a graph and a dataset to a dataloader.

    Parameters:
    ----------
    graph : igraph object
        The underlying graph to be fed to the graph neural networks.

    X : numpy ndarray
        Input dataset with columns as features and rows as observations.

    y : numpy ndarray
        Class labels.

    batch_size: int, default=1
        The batch size.

    undirected: boolean
        if the input graph is undirected (symmetric adjacency matrix).

    shuffle: boolean, default=True
        Whether the dataloader shuffles graphs between epochs.

    Returns:
    --------
    dataloader : a pytorch-geometric dataloader. All of the graphs will have
        the same connectivity (given by the input graph), but the node
        features will be the features from X.
    """
    n_obs, n_features = X.shape
    rows, cols = np.where(graph == 1)
    sources = []
    targets = []
    for u, v in zip(rows.tolist(), cols.tolist()):
        sources.append(u)
        targets.append(v)
        if undirected:
            # BUG FIX: the original appended the same (u, v) direction twice,
            # which only duplicated the edge; an undirected graph needs the
            # reverse edge (v, u). coalesce() below removes any duplicates.
            sources.append(v)
            targets.append(u)
    edge_index = torch.tensor([sources, targets], dtype=torch.long)
    list_graphs = []
    y = y.tolist()
    for i in range(n_obs):
        # One graph per observation: shared connectivity, per-row features.
        y_tensor = torch.tensor(y[i])
        X_tensor = torch.tensor(X[i, :]).view(X.shape[1], 1).float()
        data = geo_dt.Data(x=X_tensor, edge_index=edge_index, y=y_tensor)
        list_graphs.append(data.coalesce())
    dataloader = geo_dt.DataLoader(list_graphs, batch_size=batch_size,
                                   shuffle=shuffle)
    return dataloader
def to_torch_geom(adj, features, graph_labels, debug=True):
    """Convert nested adjacency/feature/label containers to nested Data lists.

    Args:
        adj: adj[i][j] holds [edge_index, edge_attribute] for graph j of
            size-bucket i.
        features: features[i][j] node features for the matching graph.
        graph_labels: graph_labels[i][j] scalar label per graph.
        debug: kept for interface compatibility; has no effect on output.

    Returns:
        list of lists of PyG Data objects, one inner list per size bucket.
    """
    graphs = []
    for i in range(len(adj)):  # Graph of a given size
        print("len adj", len(adj))
        batch_i = []
        for j in range(adj[i].shape[0]):  # Number of graphs
            graph_adj = adj[i][j]
            # graph_adj stores [edge_index, edge_attribute]; index 0 is
            # the edge list.
            graph = data.Data(x=features[i][j],
                              edge_index=graph_adj[0],
                              y=graph_labels[i][j].unsqueeze(0))
            # FIX: both debug branches appended identically — collapse to one.
            batch_i.append(graph)
        graphs.append(batch_i)
    # BUG FIX: `graphs` is a plain Python list, which has no .to(device),
    # and `device` was not defined in this scope; return the list directly.
    return graphs
def to_torch_geom(adj, features, node_labels, graph_labels, device, debug):
    """Convert per-split dense adjacency tensors to nested PyG Data lists.

    Args:
        adj: dict keyed by split ('train'/'val'/'test'); adj[key][i][j] is a
            dense adjacency matrix for graph j of size-bucket i.
        features / node_labels / graph_labels: matching nested containers.
        device: kept for interface compatibility; unused here.
        debug: kept for interface compatibility; has no effect on output.

    Returns:
        dict mapping each split to a list of lists of Data objects.
    """
    graphs = {}
    for key in adj.keys():  # train, val, test
        graphs[key] = []
        for i in range(len(adj[key])):  # Graph of a given size
            batch_i = []
            for j in range(adj[key][i].shape[0]):  # Number of graphs
                graph_adj = adj[key][i][j]
                # dense_to_sparse returns (edge_index, edge_attr); only
                # the edge_index is used.
                graph = data.Data(x=features[key][i][j],
                                  edge_index=dense_to_sparse(graph_adj)[0],
                                  y=graph_labels[key][i][j].unsqueeze(0),
                                  pos=node_labels[key][i][j])
                # FIX: both debug branches appended identically — one append.
                batch_i.append(graph)
            graphs[key].append(batch_i)
    return graphs
def deal_with_mat(self):
    """Convert the raw .mat file into a one-element list of PyG Data.

    Reads the sparse network, dense node attributes and labels from the
    first raw path; shifts 1-based labels to 0-based.

    :return: [Data]
    """
    print("dealing with mat...")
    mat = loadmat(self.raw_paths[0])
    edge_index, edge_weight = utils.from_scipy_sparse_matrix(mat['network'])
    x = torch.from_numpy(mat['attributes'].todense().astype(np.float32))
    y = torch.from_numpy(mat['labels'].reshape(-1)).to(torch.long)
    # If the smallest label is not 0, assume labels start at 1 and shift.
    if int(torch.min(y)) != 0:
        y -= 1
    graph = tgd.Data(x=x, edge_index=edge_index,
                     edge_weight=edge_weight.to(torch.float32), y=y)
    return [graph]
def parse(self, line):
    """Parse one JSON match record into a PyG Data graph.

    Builds a 10-node graph (5 radiant + 5 dire heroes), each node a
    130-dim one-hot hero encoding, with fixed connectivity and per-match
    edge features; the label is whether radiant won.
    """
    match = json.loads(line)
    radiant_heroes = [int(h) for h in match['radiant_team'].split(',')]
    dire_heroes = [int(h) for h in match['dire_team'].split(',')]
    # BUG FIX: np.zeros(10, 130) passes 130 as the dtype argument and
    # raises TypeError — the shape must be a single tuple.
    onehot = np.zeros((10, 130))
    for i, h in enumerate(radiant_heroes):
        onehot[i][h] = 1
    for i, h in enumerate(dire_heroes):
        # Dire heroes occupy node slots 5-9.
        onehot[i + 5][h] = 1
    # BUG FIX: torch.Tensor(...) does not accept a dtype keyword;
    # torch.tensor(...) does.
    nodes = torch.tensor(onehot, dtype=torch.float)
    edges = self.fixed_edges
    edge_attrs = self.match_edge_features(
        radiant_heroes, dire_heroes, self.stats)
    label = float(match['radiant_win'])
    # Positional Data arguments map to (x, edge_index, edge_attr, y).
    return geodata.Data(nodes, edges, edge_attrs, label)
def idx2data(provider, idx: int):
    """Build a PyG Data graph for architecture `idx` from `provider`.

    Node 0 is the input; one node per op position thereafter. Each edge
    carries a one-hot encoding of its operation, and `y` holds the
    architecture's accuracy.
    """
    struct = gt.Structure.str2fullstructure(provider[idx])
    src, dst = [], []
    edge_attr = []
    # Constant feature 1 for every node (input + one per op position).
    nodes = [[1]] * (1 + len(struct))
    for node, ops in enumerate(struct):
        for op, pre in ops:
            src.append(pre)
            dst.append(node + 1)
            # One-hot row for the operation type.
            edge_attr.append(torch.eye(len(OP_IDX))[OP_IDX[op]])
    return gd.Data(
        x=tensor(nodes, dtype=torch.float),
        edge_index=tensor([src, dst], dtype=torch.long),
        edge_attr=torch.stack(edge_attr),
        y=tensor([idx2acc(provider, idx)]),
    )
def loadGeometricDataset(datapath, graphpath, encoder):
    """Load (input, label) pairs and attach a shared symmetric edge list.

    Returns a list of PyG Data objects, all sharing the connectivity read
    from `graphpath`, with per-sample features/labels from `datapath`.
    """
    json_graph = get_json(graphpath)
    # Getting data.
    data = LoadDataset(datapath, encoder, False)
    inputs = [element[0] for element in data]
    labels = [element[1] for element in data]
    # Getting edges; mirror each edge to make the graph symmetric.
    edges = json_graph["edges"]
    mirrored = [[v, u] for u, v in edges]
    edges += mirrored
    edge_index = torch.tensor(edges, dtype=torch.long)
    data_list = []
    for x_raw, y_raw in zip(inputs, labels):
        x = torch.tensor(x_raw, dtype=torch.float)
        y = torch.tensor(y_raw, dtype=torch.float)
        data_list.append(
            geodata.Data(x=x,
                         edge_index=edge_index.t().contiguous(),
                         y=y))
    return data_list
def train_model(log=False):
    """Load a pretrained GCN checkpoint on CPU and return it with its data.

    Relies on module-level `mat`, `x`, `y` and `load_model_file`.
    Returns (model, x, y, edge_index).
    """
    edge_idx, edge_attr = utils.from_scipy_sparse_matrix(mat)
    G = data.Data(edge_index=edge_idx, edge_attr=edge_attr, x=x, y=y)
    edge_index = G.edge_index
    # Fixed architecture hyper-parameters matching the saved checkpoint.
    num_feat, num_classes = 5, 2
    num_graph_conv_layers, graph_conv_embed_sizes = 2, 256
    num_lin_layers, lin_hidden_sizes = 3, 256
    model = GCN(num_feat, num_graph_conv_layers, graph_conv_embed_sizes,
                num_lin_layers, lin_hidden_sizes, num_classes)
    state = torch.load(load_model_file, map_location=torch.device('cpu'))
    model.load_state_dict(state)
    model.eval()
    return model, x, y, edge_index
def graph_data(data_helper):
    """Build one PyG Data graph per document from `data_helper`.

    PMI-weighted word-word edges come from cal_PMI; node features come
    from feature_etr over the helper's vocabulary.
    """
    pmi, edges_matrix, edg_nums = cal_PMI(window_size=15, mode='train')
    # Per-edge sequence weights (initialised to zero, indexed by edg_ar).
    seq_edge_w = torch.zeros((edg_nums, 1))
    data_list = []
    index = 0
    for content, label, _ in data_helper.batch_iter(batch_size=1,
                                                    num_epoch=1):
        vocab_c = data_helper.vocab
        print("content lenth each data", len(vocab_c))
        # Node features for the document vocabulary.
        f = feature_etr(vocab_c)
        print('file no', index)
        index += 1
        e, n, _, edg_ar = graphcon(content, label, edges_matrix, pmi)
        pairs = [np.array([edge[0], edge[1]]) for edge in e]
        edge_index = torch.tensor(np.array(pairs).T, dtype=torch.long)
        edge_attr = torch.tensor(seq_edge_w[edg_ar], dtype=torch.float)
        print("edge index size", edge_index.size())
        ft = torch.tensor(f, dtype=torch.float)
        y = torch.tensor(label, dtype=torch.float)
        data_list.append(
            data.Data(x=ft, edge_index=edge_index, edge_attr=edge_attr,
                      y=y))
    return data_list
def process(self, xd, xt, y, smile_graph):
    """Build and return a list of drug-target PyG graphs (no saving).

    Args:
        xd: list of SMILES strings.
        xt: list of encoded targets.
        y: list of labels.
        smile_graph: SMILES -> (c_size, features, edge_index).
    """
    assert (len(xd) == len(xt) and len(xt) ==
            len(y)), "The three lists must be the same length!"
    data_list = []
    for smiles, target, labels in zip(xd, xt, y):
        # Precomputed molecular graph (built from SMILES via rdkit).
        c_size, features, edge_index = smile_graph[smiles]
        sample = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        sample.target = torch.LongTensor([target])
        sample['c_size'] = torch.LongTensor([c_size])
        data_list.append(sample)
    return data_list
def arch2data(arch: Union[str, gt.Structure], acc=None):
    """Encode an architecture (string or Structure) as a PyG Data graph.

    Edges carry one-hot operation encodings; when `acc` is given it is
    stored as the graph label `y`.
    """
    if isinstance(arch, str):
        struct = gt.Structure.str2fullstructure(arch)
    else:
        struct = arch
    src, dst = [], []
    edge_attr = []
    # Constant scalar feature per node (input node + one per op position).
    nodes = [[1.] for _ in range(1 + len(struct))]
    for idx, ops in enumerate(struct):
        for op, pre in ops:
            src.append(pre)
            dst.append(idx + 1)
            edge_attr.append(torch.eye(len(OP_IDX))[OP_IDX[op]])
    graph = gd.Data(x=tensor(nodes),
                    edge_index=tensor([src, dst], dtype=torch.long),
                    edge_attr=torch.stack(edge_attr))
    if acc is not None:
        graph.y = tensor([acc])
    return graph
def process(self, groups, xd, xt, y, smile_graph):
    """Build drug-target graphs with a per-sample group attribute.

    Args:
        groups: list of group identifiers, one per sample.
        xd: List of SMILES.
        xt: List of encoded target (categorical or one-hot).
        y: List of labels.
        smile_graph: SMILES -> (c_size, features, edge_index).

    Stores the collated result on self.data / self.slices.
    """
    assert (len(xd) == len(xt) and len(xt) ==
            len(y)), "The three lists must be the same length!"
    data_list = []
    for group, smiles, target, labels in zip(groups, xd, xt, y):
        # Precomputed molecular graph (built from SMILES via rdkit).
        c_size, features, edge_index = smile_graph[smiles]
        sample = DATA.Data(
            g=torch.FloatTensor([group]),
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        sample.target = torch.LongTensor([target])
        sample['c_size'] = torch.LongTensor([c_size])
        data_list.append(sample)
    if self.pre_filter is not None:
        data_list = [d for d in data_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(d) for d in data_list]
    print('Graph construction done. Saving to file.')
    self.data, self.slices = self.collate(data_list)
def use(self, smiles: List[str],
        model_filename: str = None) -> List[List[float]]:
    """Predict target values for new SMILES with a trained CompoundGCN.

    Args:
        smiles (list[str]): SMILES strings to predict for
        model_filename (str, optional): filename/path of model to load,
            default = None (model trained in-session used)

    Returns:
        list[list[float]]: predicted values, shape [n_samples, n_targets]

    Raises:
        RuntimeError: if no model was trained and none is supplied.
    """
    if self._model is None and model_filename is None:
        raise RuntimeError(
            'Model not previously built, or model not supplied')
    if model_filename is not None:
        self._model = torch.load(model_filename)
    self._model.eval()
    # Encode each compound: (atom features, bond features, connectivity).
    graphs = []
    for smi in smiles:
        atoms, bonds, connectivity = self._ce.encode(smi)
        graphs.append(
            gdata.Data(x=atoms, edge_index=connectivity,
                       edge_attr=bonds).to(self._device))
    loader_test = gdata.DataLoader(graphs, batch_size=1, shuffle=False)
    # One prediction per compound (batch_size=1, no shuffling).
    results = []
    for batch in loader_test:
        pred, _, _ = self._model(batch)
        results.append(pred.detach().numpy().tolist()[0])
    return results
from dataset import cora_data, num_features, num_classes
from config import device, lr, weight_decay, hidden_features


# we only set 2-fc layers (i.e. the same config, A -> I) to train again.
class TwoLayersFC(nn.Module):
    """Two fully-connected layers; ignores graph structure (uses data.x only)."""

    def __init__(self):
        super(TwoLayersFC, self).__init__()
        self.fc_1 = nn.Linear(num_features, hidden_features)
        self.fc_2 = nn.Linear(hidden_features, num_classes)

    def forward(self, data):
        # dropout -> fc1 -> relu -> dropout -> fc2 -> relu -> log_softmax
        x = data.x
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc_1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc_2(x)
        x = F.relu(x)
        return F.log_softmax(x, dim=1)


if __name__ == '__main__':
    # import torch_geometric
    import torch_geometric.data as gdata

    x = torch.randn(3, num_features)
    edge_index = torch.tensor([
        [0, 0, 1, 1, 2],
        [1, 2, 0, 2, 1],
    ])
    # FIX: edge_index was built but never attached to the Data object;
    # include it so the smoke-test graph is complete (forward ignores it).
    test_data = gdata.Data(x=x, edge_index=edge_index)
    f = TwoLayersFC()
    y = f(test_data)
def process(self, xd, xt_mut, xt_meth, xt_ge, y, smile_graph):
    """Build drug graphs paired with three cell-line feature vectors.

    Args:
        xd: list of SMILES strings.
        xt_mut / xt_meth / xt_ge: mutation, methylation and gene-expression
            feature vectors, one per sample.
        y: list of labels.
        smile_graph: SMILES -> (c_size, features, edge_index).

    Collates the graphs and saves them to self.processed_paths[0].
    """
    assert (len(xd) == len(xt_mut) and len(xt_mut) == len(y)) and len(
        y) == len(xt_meth) and len(xt_meth) == len(
            xt_ge), "The four lists must be the same length!"
    data_list = []
    data_len = len(xd)
    for i in range(data_len):
        print('Converting SMILES to graph: {}/{}'.format(i + 1, data_len))
        smiles = xd[i]
        target_mut = xt_mut[i]
        target_meth = xt_meth[i]
        target_ge = xt_ge[i]
        labels = y[i]
        # convert SMILES to molecular representation using rdkit
        c_size, features, edge_index = smile_graph[smiles]
        # make the graph ready for PyTorch Geometrics GCN algorithms:
        GCNData = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        # Cell-line features need gradients when building saliency maps.
        # FIX: idiomatic truth test instead of `== True`.
        if self.saliency_map:
            GCNData.target_mut = torch.tensor([target_mut],
                                              dtype=torch.float,
                                              requires_grad=True)
            GCNData.target_meth = torch.tensor([target_meth],
                                               dtype=torch.float,
                                               requires_grad=True)
            GCNData.target_ge = torch.tensor([target_ge],
                                             dtype=torch.float,
                                             requires_grad=True)
        else:
            GCNData.target_mut = torch.FloatTensor([target_mut])
            GCNData.target_meth = torch.FloatTensor([target_meth])
            GCNData.target_ge = torch.FloatTensor([target_ge])
        GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
        data_list.append(GCNData)
    # (Removed a large block of commented-out dead code that duplicated
    # this loop for methylation-only targets.)
    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]
    print('Graph construction done. Saving to file.')
    data, slices = self.collate(data_list)
    # save preprocessed data:
    torch.save((data, slices), self.processed_paths[0])
def wrapper(func, *args, **kwargs):
    """Bind `func` with its arguments so timeit can call it with no args.

    :param function func: user defined function
    :param type *args: positional arguments forwarded to `func`
    :param type **kwargs: keyword arguments forwarded to `func`
    :return: zero-argument callable that invokes func(*args, **kwargs)
    :rtype: wrapped_function
    """
    def wrapped():
        return func(*args, **kwargs)

    return wrapped


if __name__ == '__main__':
    row, col = 22, 26
    # Time 100k constructions of the (row x col) grid graph.
    wrap = wrapper(create_graph, row, col)
    print(timeit.timeit(wrap, number=100000))
    indx_out, indx_in = create_graph(row, col)
    coo = torch.tensor([indx_out, indx_in], dtype=torch.long)
    graph = data.Data(edge_index=coo)
    G = utils.to_networkx(graph)
    draw_graph(G)
    print(indx_out, '\n', indx_in)
# train-test split edges genes = torch.arange(len(node_classes))[node_classes == 0] diseases = torch.arange(len(node_classes))[node_classes == 1] validation_genes_mask = torch.randint(0, 100, size=(len(node_classes) - torch.sum(node_classes).item(), )) validation_genes = torch.arange(0, len(node_classes) - torch.sum(node_classes), dtype=torch.long)[validation_genes_mask < 20] full_graph = gdata.Data( edge_index=edge_index, edge_types=edge_types, feats=features, node_classes=node_classes, num_nodes=len(node_classes), ) # positive train/val pos_val = torch.logical_or( torch.logical_and( torch.BoolTensor(np.isin(full_graph.edge_index[0], validation_genes), ), full_graph.edge_types == 1, ), torch.logical_and( torch.BoolTensor(np.isin(full_graph.edge_index[1], validation_genes), ), full_graph.edge_types == 1,
def train(self, smiles: list, target: list, model_filename: str = None,
          model_config: dict = None):
    '''
    GraphOperator.train: trains a graph neural network given SMILES
    strings, target values, supplied config (i.e. architecture,
    hyper-parameters)

    Args:
        smiles (list): list of SMILES strings (str)
        target (list): list of target values (1d, float)
        model_filename (str): if not None, saves model to this location
        model_config (dict): configuration dict; if none supplied, default
            is used

    Returns:
        None
    '''
    # Check for inequality in length of input, target data
    if len(smiles) != len(target):
        raise ValueError(
            'Supplied SMILES and targets not the same length: {}, {}'.
            format(len(smiles), len(target)))
    # Prepare data: one PyG graph per compound, moved to the configured
    # device.
    self._ce = CompoundEncoder(smiles)
    data = []
    for idx, smi in enumerate(smiles):
        # encode() returns (node features, edge features) — connectivity
        # is queried separately.
        a, b = self._ce.encode(smi)
        data.append(
            gdata.Data(x=a,
                       edge_index=self._ce.connectivity(smi),
                       edge_attr=b,
                       y=torch.tensor(target[idx]).type(torch.float)).to(
                           self._config['device']))
    # Split data into training, validation subsets
    data_train, data_valid = train_test_split(
        data, test_size=self._config['valid_size'])
    loader_train = gdata.DataLoader(data_train,
                                    batch_size=self._config['batch_size'],
                                    shuffle=True)
    loader_valid = gdata.DataLoader(data_valid,
                                    batch_size=self._config['batch_size'],
                                    shuffle=True)
    # Create model; output dimensionality follows the first target's width.
    self._model = MessagePassingNet(self._ce.ATOM_DIM, len(target[0]),
                                    task=self._config['task'],
                                    config=model_config)
    self._model.construct()
    self._model.to(self._config['device'])
    optimizer = torch.optim.Adam(self._model.parameters(),
                                 lr=self._config['learning_rate'])
    # Setup callbacks: linear LR decay + early-stopping validator.
    CBO = CallbackOperator()
    _lrdecay = LRDecayLinear(self._config['learning_rate'],
                             self._config['lr_decay'], optimizer)
    _validator = Validator(loader_valid, self._model,
                           self._config['valid_epoch_iter'],
                           self._config['valid_patience'])
    CBO.add_cb(_lrdecay)
    CBO.add_cb(_validator)
    # TRAIN BEGIN — every callback hook may abort by returning falsy.
    CBO.on_train_begin()
    # Begin training loop
    for epoch in range(self._config['epochs']):
        # EPOCH BEGIN
        if not CBO.on_epoch_begin(epoch):
            break
        train_loss = 0.0
        self._model.train()
        for b_idx, batch in enumerate(loader_train):
            # BATCH BEGIN
            if not CBO.on_batch_begin(b_idx):
                break
            optimizer.zero_grad()
            embedding, pred = self._model(batch)
            target = batch.y
            # Node-level tasks train only on the masked node subset.
            if self._config['task'] == 'node':
                pred = pred[batch.train_mask]
                target = target[batch.train_mask]
            # BATCH END, LOSS BEGIN
            if not CBO.on_batch_end(b_idx):
                break
            if not CBO.on_loss_begin(b_idx):
                break
            loss = self._model.loss(pred, target)
            loss.backward()
            # LOSS END, STEP BEGIN
            if not CBO.on_loss_end(b_idx):
                break
            if not CBO.on_step_begin(b_idx):
                break
            optimizer.step()
            # Weight batch loss by graph count for a dataset-level mean.
            train_loss += loss.detach().item() * batch.num_graphs
            # STEP END
            if not CBO.on_step_end(b_idx):
                break
        train_loss /= len(loader_train.dataset)
        # EPOCH END
        if not CBO.on_epoch_end(epoch):
            break
        if self._config['verbose']:
            print('Epoch: {} | Train Loss: {} | Valid Loss: {}'.format(
                epoch, train_loss, _validator._best_loss))
    # TRAIN END
    CBO.on_train_end()
    if model_filename is not None:
        torch.save(self._model, model_filename)
def train(self, smiles: List[str], target: List[List[float]],
          model_config: dict = None, valid_size: float = 0.2,
          valid_epoch_iter: int = 1, valid_patience: int = 16,
          batch_size: int = 1, lr: float = 0.001, lr_decay: float = 0.0,
          epochs: int = 128, verbose: int = 0, random_state: int = None,
          shuffle: bool = False,
          **kwargs) -> Tuple[List[float], List[float]]:
    """ Trains a CompoundCGN using supplied SMILES strings, target values

    Args:
        smiles (list[str]): list of SMILES strings, one per compound
        target (list[list[float]]): list of target values, shape
            [n_samples, n_targets], one per compound
        model_config (dict, optional): if not supplied, uses default model
            architecture:
            {
                'n_messages': 1,
                'n_hidden': 1,
                'hidden_dim': 32,
                'dropout': 0.00
            }
        valid_size (float, optional): proportion of training set used for
            periodic validation, default = 0.2
        valid_epoch_iter (int, optional): validation set performance is
            measured every `this` epochs, default = 1 epochs
        valid_patience (int, optional): if lower validation set loss not
            encountered after `this` many epochs, terminate to avoid
            overfitting, default = 16
        batch_size (int, optional): size of each batch during training,
            default = 1
        lr (float, optional): learning rate for Adam opt, default = 0.001
        lr_decay (float, optional): linear rate of decay of learning rate
            per epoch, default = 0.0
        epochs (int, optional): number of training epochs, default = 128
        verbose (int, optional): training and validation loss printed to
            console every `this` epochs, default = 0 (no printing)
        random_state (int, optional): if not `None`, seeds validation
            subset randomized selection with this value
        shuffle (bool, optional): if True, shuffles training and validation
            subsets between training epochs, default = False
        **kwargs: additional arguments passed to torch.optim.Adam

    Returns:
        tuple[list[float], list[float]]: (training losses, validation
        losses) over all training epochs
    """
    # Check for inequality in length of input, target data
    if len(smiles) != len(target):
        raise ValueError(
            'Supplied SMILES and targets not the same length: {}, {}'.
            format(len(smiles), len(target)))
    # Prepare data: one PyG graph per compound; encode() returns
    # (atom features, bond features, connectivity).
    self._ce = CompoundEncoder(smiles)
    data = []
    for idx, smi in enumerate(smiles):
        a, b, c = self._ce.encode(smi)
        data.append(
            gdata.Data(x=a,
                       edge_index=c,
                       edge_attr=b,
                       y=torch.tensor(
                           target[idx]).type(torch.float).reshape(
                               1, len(target[idx]))).to(self._device))
    # Split data into training, validation subsets
    data_train, data_valid = train_test_split(data,
                                              test_size=valid_size,
                                              random_state=random_state)
    loader_train = gdata.DataLoader(data_train,
                                    batch_size=batch_size,
                                    shuffle=True)
    loader_valid = gdata.DataLoader(data_valid,
                                    batch_size=batch_size,
                                    shuffle=True)
    # Create model (default architecture unless a config is supplied).
    if model_config is None:
        self._model = CompoundGCN(self._ce.ATOM_DIM, self._ce.BOND_DIM,
                                  len(target[0]))
    else:
        self._model = CompoundGCN(self._ce.ATOM_DIM, self._ce.BOND_DIM,
                                  len(target[0]),
                                  model_config['n_messages'],
                                  model_config['n_hidden'],
                                  model_config['hidden_dim'],
                                  model_config['dropout'])
    self._model.to(self._device)
    optimizer = torch.optim.Adam(self._model.parameters(), lr=lr, **kwargs)
    # Setup callbacks: linear LR decay + early-stopping validator.
    CBO = CallbackOperator()
    _lrdecay = LRDecayLinear(lr, lr_decay, optimizer)
    _validator = Validator(loader_valid, self._model, valid_epoch_iter,
                           valid_patience)
    CBO.add_cb(_lrdecay)
    CBO.add_cb(_validator)
    # Record loss for return
    train_losses = []
    valid_losses = []
    # TRAIN BEGIN — every callback hook may abort by returning falsy.
    CBO.on_train_begin()
    # Begin training loop
    for epoch in range(epochs):
        # EPOCH BEGIN
        if not CBO.on_epoch_begin(epoch):
            break
        # Optionally re-split between epochs for fresh subsets.
        if shuffle:
            data_train, data_valid = train_test_split(
                data, test_size=valid_size, random_state=random_state)
            loader_train = gdata.DataLoader(data_train,
                                            batch_size=batch_size,
                                            shuffle=True)
            loader_valid = gdata.DataLoader(data_valid,
                                            batch_size=batch_size,
                                            shuffle=True)
        train_loss = 0.0
        self._model.train()
        for b_idx, batch in enumerate(loader_train):
            # BATCH BEGIN
            if not CBO.on_batch_begin(b_idx):
                break
            optimizer.zero_grad()
            pred, _, _ = self._model(batch)
            target = batch.y
            # BATCH END, LOSS BEGIN
            if not CBO.on_batch_end(b_idx):
                break
            if not CBO.on_loss_begin(b_idx):
                break
            loss = self._model.loss(pred, target)
            loss.backward()
            # LOSS END, STEP BEGIN
            if not CBO.on_loss_end(b_idx):
                break
            if not CBO.on_step_begin(b_idx):
                break
            optimizer.step()
            # Weight batch loss by graph count for a dataset-level mean.
            train_loss += loss.detach().item() * batch.num_graphs
            # STEP END
            if not CBO.on_step_end(b_idx):
                break
        train_loss /= len(loader_train.dataset)
        # EPOCH END
        if not CBO.on_epoch_end(epoch):
            break
        if verbose > 0:
            if epoch % verbose == 0:
                print('Epoch: {} | Train Loss: {} | Valid Loss: {}'.format(
                    epoch, train_loss, _validator._most_recent_loss))
        train_losses.append(train_loss)
        valid_losses.append(_validator._most_recent_loss.detach().item())
    # TRAIN END
    CBO.on_train_end()
    return (train_losses, valid_losses)
def process(self):
    """Build the Decagon drug-pair dataset for `self.datatype` and save it.

    Each sample joins two drug molecular graphs into one disjoint graph
    (atoms of drug 2 offset by drug 1's size) with a multi-label
    side-effect vector as `y`. Skips work if the processed file exists.
    """
    # Skip processing entirely when the output file already exists.
    if osp.exists(
            os.path.join(self.processed_dir,
                         'Decagon-{}-multi.pt'.format(self.datatype))):
        return
    data_list = []
    # >>> Obtain One-Hot Encoding for Side-Effects
    # Keys are stored as stringified tuples; literal_eval recovers the
    # (smiles1, smiles2) tuple.
    json_dict = {
        literal_eval(k): v
        for k, v in self.json_load[self.datatype].items()
    }
    total = len(json_dict)
    for idx, (smiles1, smiles2) in enumerate(json_dict):
        printProgress(idx + 1, total,
                      '{} dataset preparation: '.format(self.datatype), ' ',
                      2, 50)
        mol1 = MolFromSmiles(smiles1)
        mol2 = MolFromSmiles(smiles2)
        # Multi-label side-effect vector for this drug pair.
        label = np.array(json_dict[(smiles1, smiles2)])
        # Skip pairs where either SMILES fails to parse.
        if mol1 is None or mol2 is None:
            print("There is a missing drug from the pair (%s,%s)" %
                  (mol1, mol2))
            continue
        ######################################################################
        # >>> Get pairwise graph G1, G2
        c1_size = mol1.GetNumAtoms()
        c2_size = mol2.GetNumAtoms()
        if c1_size == 0 or c2_size == 0:
            print("There is a size error from pair (%s,%s)" % (mol1, mol2))
            continue
        atoms1 = mol1.GetAtoms()
        atoms2 = mol2.GetAtoms()
        bonds1 = mol1.GetBonds()
        bonds2 = mol2.GetBonds()
        features, edges = [], []
        for atom in atoms1:
            feature = atom_features(atom)
            features.append(feature / sum(feature))  # normalize
        for atom in atoms2:
            feature = atom_features(atom)
            features.append(feature / sum(feature))  # normalize
        for bond in bonds1:
            edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
        for bond in bonds2:
            # Offset drug-2 atom indices so both molecules share one
            # disjoint node numbering.
            edges.append([
                bond.GetBeginAtomIdx() + c1_size,
                bond.GetEndAtomIdx() + c1_size
            ])
        # Drop pairs with no bonds at all (no edges to build a graph from).
        if len(edges) == 0:
            continue
        # to_directed() duplicates each undirected bond in both directions.
        G = nx.Graph(edges).to_directed()
        edge_index = [[e1, e2] for e1, e2 in G.edges]
        GraphSiameseData = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.Tensor(label).view(1, -1))
        GraphSiameseData.__setitem__('c1_size', torch.LongTensor([c1_size]))
        GraphSiameseData.__setitem__('c2_size', torch.LongTensor([c2_size]))
        data_list.append(GraphSiameseData)
    ###########################################################################
    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]
    # check this function
    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])