def get_data():
    dataset = args.name  # NOTE: unused in this function
    path = '../data/geometric/QM9'
    trainset = QM9(path)
    testset = QM9(path)
    lenTrain = len(trainset)
    lenTest = len(testset)
    print("Len Dataset:", lenTrain)
    trainLoader = DataLoader(trainset[:lenTrain], batch_size=1, shuffle=False)
    # Bug fix: the test loader previously iterated over `trainset`.
    testloader = DataLoader(testset[:lenTest], batch_size=1, shuffle=False)
    print("Len TrainLoader:", len(trainLoader))
    return trainLoader, testloader
def get_dataset(name, sparse=True, dirname=None):
    if dirname is None:
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    else:
        path = osp.join(dirname, name)
    if name == "QM9":
        dataset = QM9(path)
    elif name == "QM7b":
        dataset = QM7b(path)
    else:
        dataset = TUDataset(path, name)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())

        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)

    num_nodes = max_num_nodes = 0
    for data in dataset:
        num_nodes += data.num_nodes
        max_num_nodes = max(data.num_nodes, max_num_nodes)

    # Filter out a few really large graphs in order to apply DiffPool.
    if name == 'REDDIT-BINARY':
        num_nodes = min(int(num_nodes / len(dataset) * 1.5), max_num_nodes)
    else:
        num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)
    # num_nodes = max_num_nodes

    indices = []
    for i, data in enumerate(dataset):
        if data.num_nodes <= num_nodes:
            indices.append(i)
    dataset = dataset[torch.tensor(indices)]

    if not sparse:
        if dataset.transform is None:
            dataset.transform = T.ToDense(num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(num_nodes)])

    return dataset
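# Hedged usage sketch for `get_dataset` above (assumes the names it relies on
# -- osp, torch, T, QM9, QM7b, TUDataset, degree, NormalizedDegree -- are
# imported in this module; the first call downloads the dataset):
# dataset = get_dataset('QM9', sparse=True)
# print(len(dataset), dataset.num_features)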
def __init__(self):
    dataset = "QM9"
    path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    target = 0

    class MyTransform(object):
        def __call__(self, data):
            # Specify target.
            data.y = data.y[:, target]
            return data

    class Complete(object):
        def __call__(self, data):
            device = data.edge_index.device

            row = torch.arange(data.num_nodes, dtype=torch.long, device=device)
            col = torch.arange(data.num_nodes, dtype=torch.long, device=device)

            row = row.view(-1, 1).repeat(1, data.num_nodes).view(-1)
            col = col.repeat(data.num_nodes)
            edge_index = torch.stack([row, col], dim=0)

            edge_attr = None
            if data.edge_attr is not None:
                idx = data.edge_index[0] * data.num_nodes + data.edge_index[1]
                size = list(data.edge_attr.size())
                size[0] = data.num_nodes * data.num_nodes
                edge_attr = data.edge_attr.new_zeros(size)
                edge_attr[idx] = data.edge_attr

            edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
            data.edge_attr = edge_attr
            data.edge_index = edge_index

            return data

    transform = T.Compose([MyTransform(), Complete(), T.Distance(norm=False)])
    if not osp.exists(path):
        QM9(path)
    # NOTE: `transform` is built above but never passed along here; forward it
    # if the parent dataset class accepts a transform argument.
    super(QM9Dataset, self).__init__(path)
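# Why `idx = row * num_nodes + col` works inside `Complete` above: it maps
# each (row, col) edge of an N-node graph to a unique slot in a dense N*N
# edge-attribute table. A self-contained check (assumes only torch):
import torch

num_nodes = 3
edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])
idx = edge_index[0] * num_nodes + edge_index[1]
print(idx.tolist())  # [1, 5, 6] -- unique positions in a 9-row dense table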
def show_graph_regression():
    qm9 = QM9(root='data')
    test_loader = DataLoader(qm9[int(1000 * 0.8):1000], batch_size=1, shuffle=True)
    model = GNNStack(max(qm9.num_node_features, 1), 32, qm9.num_classes, task='graph')
    model.load_state_dict(torch.load('graphnn/savegraphmodel_32hid.pth'))

    example = next(iter(test_loader))
    emb, pred = model(example)

    fig, axes = plt.subplots(2, 1, figsize=(10, 4))
    fig.suptitle('Graph property prediction')
    axes[0].imshow(pred.detach().numpy())
    axes[0].set_title('Prediction')
    axes[1].imshow(example.y.detach().numpy())
    axes[1].set_title('Ground truth')
def __init__(self, dataset, config):
    self.qm9 = False
    if dataset == "QM9":
        self.qm9 = True
        self.data_dir = os.path.join(self.curr_dir, '../tg_datasets/QM9')
        self.data = QM9(self.data_dir, transform=self.transform_data)
    else:
        self.data = TUDataset(self.data_dir, dataset,
                              transform=self.transform_data,
                              use_node_attr=self.use_node_attr)
    # Bug fix: `shuffle()` returns a new dataset rather than shuffling in place.
    self.data = self.data.shuffle()  # Shuffle the dataset

    for key, value in config.items():
        if hasattr(self.config, key):
            # Cast the incoming value to the type of the existing config entry.
            setattr(self.config, key, type(getattr(self.config, key))(value))
        else:
            print(f'Config key \'{key}\' is not valid for PPGN')
            sys.exit()
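# Self-contained check of the config-coercion pattern above: each incoming
# value is cast to the type of the attribute it overwrites. `Cfg` is a
# hypothetical stand-in for the PPGN config object.
class Cfg:
    lr = 0.001
    epochs = 100

cfg = Cfg()
for key, value in {'lr': '0.01', 'epochs': '200'}.items():
    setattr(cfg, key, type(getattr(cfg, key))(value))
print(cfg.lr, cfg.epochs)  # 0.01 200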
def load_dataset(name):
    """Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]

    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        dataset = list(dataset)
        random.shuffle(dataset)
        has_name = hasattr(dataset[0], "name")
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                if has_name:
                    del graph.name
                graph = pyg_utils.to_networkx(graph).to_undirected()
            if i < train_len:
                train.append(graph)
            else:
                test.append(graph)
    return train, test, task
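# Hedged usage sketch for `load_dataset` above (the first call downloads the
# TUDataset files to /tmp):
# train, test, task = load_dataset("enzymes")
# print(len(train), len(test), task)  # 80/20 split of networkx graphs, "graph"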
def __init__(self, path: str):
    pyg_dataset = QM9(os.path.join(path, '_pyg'))
    if hasattr(pyg_dataset, "__data_list__"):
        delattr(pyg_dataset, "__data_list__")
    if hasattr(pyg_dataset, "_data_list"):
        delattr(pyg_dataset, "_data_list")
    super(QM9Dataset, self).__init__([
        GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {'x': data.x, 'pos': data.pos, 'z': data.z},
            data.edge_index,
            edges_data={'edge_attr': data.edge_attr},
            graph_data={'idx': data.idx, 'y': data.y})
        for data in pyg_dataset
    ])
def get_dataset(dataset_name):
    """Retrieves the dataset corresponding to the given name."""
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'flickr':
        dataset = Flickr(path)
    elif dataset_name == 'zinc':
        dataset = ZINC(root='dataset', subset=True, split='train')
    elif dataset_name == 'QM9':
        dataset = QM9(root='dataset')
    elif dataset_name == 'github':
        dataset = GitHub(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        dataset = (Amazon(path, "Computers", T.NormalizeFeatures())
                   if dataset_name == 'amazon_comp'
                   else Amazon(path, "Photo", T.NormalizeFeatures()))
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        # Note: these are index tensors, not boolean masks.
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path, name=dataset_name, split="public",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError
    return dataset
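# The amazon_* branch above splits node indices 60/20/20 via two chained
# train_test_split calls (0.4 held out, then halved). A self-contained check
# of those proportions (assumes only scikit-learn):
from sklearn.model_selection import train_test_split

idx = list(range(100))
tr, te = train_test_split(idx, test_size=0.4, random_state=42)
va, te = train_test_split(te, test_size=0.5, random_state=42)
print(len(tr), len(va), len(te))  # 60 20 20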
def load_dataset(args):
    # Automatic data loading and splitting.
    transform = add_zeros if args.dataset == 'ogbg-ppa' else None
    cls_criterion = get_loss_function(args.dataset)
    idx2word_mapper = None

    if args.dataset == 'mnist':
        train_data = MNISTSuperpixels(root='dataset', train=True, transform=T.Polar())
        dataset = train_data
        dataset.name = 'mnist'
        dataset.eval_metric = 'acc'
        validation_data = []
        test_data = MNISTSuperpixels(root='dataset', train=False, transform=T.Polar())
        train_data = list(train_data)
        test_data = list(test_data)

    elif args.dataset == 'QM9':
        # Contains 19 targets. Use only the first 12 (0-11).
        QM9_VALIDATION_START = 110000
        QM9_VALIDATION_END = 120000

        dataset = QM9(root='dataset',
                      transform=ExtractTargetTransform(args.target)).shuffle()
        dataset.name = 'QM9'
        dataset.eval_metric = 'mae'
        train_data = dataset[:QM9_VALIDATION_START]
        validation_data = dataset[QM9_VALIDATION_START:QM9_VALIDATION_END]
        test_data = dataset[QM9_VALIDATION_END:]
        train_data = list(train_data)
        validation_data = list(validation_data)
        test_data = list(test_data)

    elif args.dataset == 'zinc':
        train_data = ZINC(root='dataset', subset=True, split='train')
        dataset = train_data
        dataset.name = 'zinc'
        validation_data = ZINC(root='dataset', subset=True, split='val')
        test_data = ZINC(root='dataset', subset=True, split='test')
        dataset.eval_metric = 'mae'
        train_data = list(train_data)
        validation_data = list(validation_data)
        test_data = list(test_data)

    elif args.dataset in ['ogbg-molhiv', 'ogbg-molpcba', 'ogbg-ppa', 'ogbg-code2']:
        dataset = PygGraphPropPredDataset(name=args.dataset, transform=transform)

        if args.dataset == 'ogbg-code2':  # Bug fix: was misspelled 'obgb-code2'.
            seq_len_list = np.array([len(seq) for seq in dataset.data.y])
            max_seq_len = args.max_seq_len
            num_less_or_equal_to_max = np.sum(
                seq_len_list <= args.max_seq_len) / len(seq_len_list)
            print(f'Fraction of target sequences with length <= {max_seq_len}: '
                  f'{100 * num_less_or_equal_to_max:.2f}%.')

        split_idx = dataset.get_idx_split()

        # The following is only used in the evaluation of the ogbg-code classifier.
        if args.dataset == 'ogbg-code2':
            vocab2idx, idx2vocab = get_vocab_mapping(
                [dataset.data.y[i] for i in split_idx['train']], args.num_vocab)
            # Dataset-specific transformations for ogbg-code2.
            dataset.transform = transforms.Compose([
                augment_edge,
                lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)
            ])
            idx2word_mapper = partial(decode_arr_to_seq, idx2vocab=idx2vocab)

        train_data = list(dataset[split_idx["train"]])
        validation_data = list(dataset[split_idx["valid"]])
        test_data = list(dataset[split_idx["test"]])

    return dataset, train_data, validation_data, test_data, cls_criterion, idx2word_mapper
def load_dataset(name):
    """Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    elif name == 'aifb':
        dataset = Entities(root="/tmp/aifb", name='AIFB')  # 90 edge types
    elif name == 'wn18':
        dataset = WordNet18(root="/tmp/wn18")
    elif name == 'fb15k237':
        dataset = [None]

    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        if name not in ['aifb', 'wn18', 'fb15k237']:
            dataset = list(dataset)
            random.shuffle(dataset)
            has_name = hasattr(dataset[0], "name")
        else:
            has_name = True

        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                try:
                    if has_name:
                        del graph.name
                except AttributeError:  # narrowed from a bare `except`
                    pass
                if name in ('aifb', 'wn18'):
                    graph = pyg_utils.to_networkx(graph, edge_attrs=['edge_type'])
                elif name == 'fb15k237':
                    data = FB15k_237()
                    (graph, _, _, _) = data.load()
                    graph = graph.to_networkx()
                    # Relabel string edge labels to contiguous integer edge types.
                    edge_type_dict = []
                    for j in graph.edges:
                        edge_type_dict.append(graph.edges[j]['label'])
                    edge_type_dict = {
                        lbl: ind
                        for ind, lbl in enumerate(sorted(set(edge_type_dict)))
                    }
                    for j in graph.edges:
                        graph.edges[j]['edge_type'] = edge_type_dict[graph.edges[j]['label']]
                        del graph.edges[j]['label']
                        del graph.edges[j]['weight']
                else:
                    graph = pyg_utils.to_networkx(graph).to_undirected()

            if name in ('aifb', 'wn18', 'fb15k237'):
                # These knowledge graphs are single graphs: use the same graph
                # for train and test.
                train.append(graph)
                test.append(deepcopy(graph))
            else:
                if i < train_len:
                    train.append(graph)
                else:
                    test.append(graph)
    return train, test, task
def execute(config):
    path = 'QM9'
    dataset = QM9(path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(config['seed'])

    # Report meV instead of eV.
    units = 1000 if config['target'] in [2, 3, 4, 6, 7, 8, 9, 10] else 1

    _, datasets = SchNet.from_qm9_pretrained(path, dataset, config['target'])
    train_dataset, val_dataset, _test_dataset = datasets
    train_dataset = train_dataset[:config['ptr']]

    model = Network(
        muls=(config['mul0'], config['mul1'], config['mul2']),
        lmax=config['lmax'],
        num_layers=config['num_layers'],
        number_of_basis=config['rad_gaussians'],
        fc_neurons=[config['rad_h']] * config['rad_layers'],
        mean=config['mean'],
        std=config['std'],
        atomref=dataset.atomref(config['target']),
    )
    model = model.to(device)

    wandb.watch(model)

    # modules = [model.embedding, model.radial] + list(model.layers) + [model.atomref]
    # lrs = [0.1, 0.01] + [1] * len(model.layers) + [0.1]
    # param_groups = []
    # for lr, module in zip(lrs, modules):
    #     jac = []
    #     for data in DataLoader(train_dataset[:20]):
    #         data = data.to(device)
    #         jac += [torch.autograd.grad(model(data.z, data.pos), module.parameters())[0].flatten()]
    #     jac = torch.stack(jac)
    #     kernel = jac @ jac.T
    #     print('kernel({}) = {:.2e} +- {:.2e}'.format(module, kernel.mean().item(), kernel.std().item()), flush=True)
    #     lr = lr / (kernel.mean() + kernel.std()).item()
    #     param_groups.append({'params': list(module.parameters()), 'lr': lr})
    # lrs = torch.tensor([x['lr'] for x in param_groups])
    # lrs = config['lr'] * lrs / lrs.max().item()
    # for group, lr in zip(param_groups, lrs):
    #     group['lr'] = lr.item()

    optim = torch.optim.Adam(model.parameters(), lr=config['lr'])
    # print(optim, flush=True)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=25,
                                                           factor=0.5, verbose=True)

    dynamics = []
    wall = time.perf_counter()
    wall_print = time.perf_counter()

    for epoch in itertools.count():
        errs = []
        loader = DataLoader(train_dataset, batch_size=config['bs'], shuffle=True)
        for step, data in enumerate(loader):
            data = data.to(device)
            pred = model(data.z, data.pos, data.batch)
            optim.zero_grad()
            (pred.view(-1) - data.y[:, config['target']]).pow(2).mean().backward()
            optim.step()

            err = pred.view(-1) - data.y[:, config['target']]
            errs += [err.cpu().detach()]

            if time.perf_counter() - wall_print > 15:
                wall_print = time.perf_counter()
                w = time.perf_counter() - wall
                e = epoch + (step + 1) / len(loader)
                print((
                    f'[{e:.1f}] ['
                    f'wall={w / 3600:.2f}h '
                    f'wall/epoch={w / e:.0f}s '
                    f'wall/step={1e3 * w / e / len(loader):.0f}ms '
                    f'step={step}/{len(loader)} '
                    f'mae={units * torch.cat(errs)[-200:].abs().mean():.5f} '
                    f'lr={min(x["lr"] for x in optim.param_groups):.1e}-'
                    f'{max(x["lr"] for x in optim.param_groups):.1e}]'
                ), flush=True)

        if epoch == 0:
            called_num = [0]

            def trace_handler(p):
                print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
                p.export_chrome_trace(f"{datetime.datetime.now()}_{called_num[0]}.json")
                called_num[0] += 1

            with torch.profiler.profile(
                    activities=[torch.profiler.ProfilerActivity.CPU,
                                torch.profiler.ProfilerActivity.CUDA],
                    schedule=torch.profiler.schedule(wait=1, warmup=1, active=1),
                    on_trace_ready=trace_handler) as prof:
                for step, data in enumerate(loader):
                    data = data.to(device)
                    pred = model(data.z, data.pos, data.batch)
                    mse = (pred.view(-1) - data.y[:, config['target']]).pow(2)
                    mse.mean().backward()
                    prof.step()
                    if step == 3:
                        break

        train_err = torch.cat(errs)

        errs = []
        loader = DataLoader(val_dataset, batch_size=256)
        for data in loader:
            data = data.to(device)
            with torch.no_grad():
                pred = model(data.z, data.pos, data.batch)
            err = pred.view(-1) - data.y[:, config['target']]
            errs += [err.cpu().detach()]
        val_err = torch.cat(errs)

        lrs = [x['lr'] for x in optim.param_groups]
        dynamics += [{
            'epoch': epoch,
            'wall': time.perf_counter() - wall,
            'train': {
                'mae': {
                    'mean': units * train_err.abs().mean().item(),
                    'std': units * train_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * train_err.pow(2).mean().item(),
                    'std': units * train_err.pow(2).std().item(),
                }
            },
            'val': {
                'mae': {
                    'mean': units * val_err.abs().mean().item(),
                    'std': units * val_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * val_err.pow(2).mean().item(),
                    'std': units * val_err.pow(2).std().item(),
                }
            },
            'lrs': lrs,
        }]
        dynamics[-1]['_runtime'] = dynamics[-1]['wall']
        wandb.log(dynamics[-1])

        print(
            f'[{epoch}] Target: {config["target"]:02d}, '
            f'MAE TRAIN: {units * train_err.abs().mean():.5f} ± {units * train_err.abs().std():.5f}, '
            f'MAE VAL: {units * val_err.abs().mean():.5f} ± {units * val_err.abs().std():.5f}',
            flush=True)
        scheduler.step(val_err.pow(2).mean())

        yield {
            'args': config,
            'dynamics': dynamics,
            'state': {k: v.cpu() for k, v in model.state_dict().items()},
        }

        if dynamics[-1]['wall'] > config['wall']:
            break
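# `execute` above is a generator that yields one snapshot (config, dynamics,
# model state) per epoch until the wall-clock budget is exhausted. Hedged
# driver sketch; 'checkpoint.pt' is a hypothetical path:
# for snapshot in execute(config):
#     torch.save(snapshot, 'checkpoint.pt')  # keep only the latest epoch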
def __init__(self, root, target):
    super().__init__()
    self.root = root
    self.dataset = QM9(root=root)
    self.target = target
def main():
    np.random.seed(0)
    torch.manual_seed(0)

    # --------------------- PARSE ARGS -----------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-size", type=int, default=5000)
    parser.add_argument("--target", type=int,
                        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], default=0)
    parser.add_argument("--batch-size", type=int, default=20)
    parser.add_argument("--num-epoch", type=int, default=500)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--weight-decay", type=float, default=0.0)
    parser.add_argument("--encoder-hidden-dim", type=int, default=64)
    parser.add_argument("--lamda", type=float, default=0.001)
    parser.add_argument("--patience", type=int, default=30)
    args = parser.parse_args()

    print("- Args ----------------------")
    for k, v in vars(args).items():
        print("  - {}={}".format(k, v))
    print("-----------------------------")

    # --------------------- LOAD DATASET ---------------------
    print("Loading dataset...")
    dataset = QM9(QM9_DATASET_PATH,
                  pre_transform=T.Compose([Complete(), T.Distance(norm=False)]),
                  transform=TargetLabelSelection(args.target)).shuffle()
    mean = dataset.data.y[:, args.target].mean().item()
    std = dataset.data.y[:, args.target].std().item()
    dataset.data.y[:, args.target] = (dataset.data.y[:, args.target] - mean) / std

    test_dataset = dataset[:10000]
    val_dataset = dataset[10000:20000]
    train_dataset = dataset[20000:20000 + args.train_size]
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    unsup_train_dataset = dataset[20000:]
    unsup_train_loader = DataLoader(unsup_train_dataset, batch_size=args.batch_size,
                                    shuffle=True)

    print("- Dataset -------------------")
    print("  - # train: {:,}".format(len(train_dataset)))
    print("  - # val: {:,}".format(len(val_dataset)))
    print("  - # test: {:,}".format(len(test_dataset)))
    print("  - # train (unsup.): {:,}".format(len(unsup_train_dataset)))
    print("-----------------------------")

    # --------------------- TRAIN MODEL ----------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = InfoGraphSemi(dataset.num_features, args.encoder_hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.7, patience=5, min_lr=0.000001)

    val_error = evaluate(model, val_loader, std, device)
    print("| Epoch: {:3} | Val MAE: {:10.4f} |".format(0, val_error))

    print("Starting training...")
    start_time = time.time()
    checkpoint_path = "model_{}.pt".format(start_time)
    min_val_error = None
    min_val_epoch = 0
    for epoch in range(1, args.num_epoch + 1):
        train_loss = train(model, train_loader, unsup_train_loader, optimizer,
                           args.lamda, device)
        val_error = evaluate(model, val_loader, std, device)
        scheduler.step(val_error)

        if min_val_error is None or val_error < min_val_error:
            min_val_error = val_error
            min_val_epoch = epoch
            torch.save(model.state_dict(), checkpoint_path)

        lr = scheduler.optimizer.param_groups[0]['lr']
        elapsed_time = datetime.timedelta(seconds=int(time.time() - start_time))
        print("| Epoch: {:3} | time: {} | lr: {:7f} | Train loss: {:8.4f} | "
              "Val MAE: {:8.4f} |{}".format(epoch, elapsed_time, lr, train_loss,
                                            val_error,
                                            " *" if min_val_epoch == epoch else ""))

        if epoch - min_val_epoch > args.patience:
            print("Early stopping...")
            break

    print("Training finished!")
    print("Evaluating on test set...")
    model.load_state_dict(torch.load(checkpoint_path))
    test_error = evaluate(model, test_loader, std, device)
    print("| Val MAE: {:8.4f} | Test MAE: {:8.4f} |".format(min_val_error, test_error))
def _qm9(self, target):
    dataset = QM9('data/QM9', transform=QM9Transformer(target))
    mean = dataset.data.y.mean(dim=0, keepdim=True)
    std = dataset.data.y.std(dim=0, keepdim=True)
    dataset.data.y = (dataset.data.y - mean) / std
    # Returns the dataset, the std of the selected target (needed to scale
    # normalized errors back to physical units), and QM9's feature counts
    # (11 node features, 4 edge features).
    return dataset, std[:, target].item(), 11, 4
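# Why `_qm9` returns `std[:, target].item()`: with normalized targets the
# model's MAE is in normalized units, so it must be multiplied by the target's
# std to recover physical units. Self-contained numeric check (torch only):
import torch

y = torch.tensor([1.0, 3.0, 5.0])
mean, std = y.mean(), y.std()
y_norm = (y - mean) / std
pred_norm = y_norm + 0.1                          # pretend error of 0.1 normalized units
mae = (pred_norm - y_norm).abs().mean() * std     # scale MAE back to original units
print(mae)  # 0.1 * std = 0.2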
def __init__(self, f1_alpha, f2_alpha, f3_alpha):
    dataset = QM9('data/QM9')
    super(QM9Sampler, self).__init__(dataset, f1_alpha, f2_alpha, f3_alpha)
# Imports added so the script below runs: osp, datetime, numpy, and torch
# were used but not imported in the excerpt.
import os.path as osp
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch_geometric
from torch_geometric.data import Batch
from torch_geometric.datasets import QM9

start = datetime.now()
epochs = 1000
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
keep_data_on_gpu = torch.cuda.is_available()
target = None
in_features, out_features = 4 + 13 + 3 + 2, 12

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'eee-QM9')
dataset = QM9(path)

if target is not None:
    print(f'Target: {target}')
    out_features = 1
    dataset.data.y = dataset.data.y[:, target].unsqueeze(1)
else:
    dataset.data.y = dataset.data.y[:, :12]


def get_data_loader(dataset, batch_size, keep_on_gpu=True):
    data = preprocess_graphs(dataset, keep_on_gpu)
    data = PyGDenseDataset(data)
    n_nodes = np.array([g.x.shape[0] for g in dataset])  # number of nodes for each graph
    loader = torch.utils.data.DataLoader(data,
                                         batch_size=batch_size,
                                         shuffle=True)  # assumption: the call was
                                                        # truncated in the source
# Note: the later `from model.schnet import SchNet` shadows the schnetpack import.
from schnetpack import SchNet
from torch_geometric.datasets import QM9
import torch_geometric.transforms as T
from torch_geometric.data import DataLoader
import torch.nn.functional as F
import torch

# from torch_geometric.nn import SchNet
from model.nmp_edge import NMPEdge
from model.schnet import SchNet

# DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
dataset = QM9(root='/home/galkampel/tmp/QM9')  # , transform=T.Distance(norm=False)
train_val_set, test_set = torch.utils.data.random_split(dataset, [120000, 9433])
train_set, val_set = torch.utils.data.random_split(train_val_set, [110000, 10000])
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
model = NMPEdge(hidden_channels=256, num_filters=256, hypernet_update=True).to(device)
# model = SchNet(hidden_channels=256, num_filters=256).to(device)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
target = 7
n_iter = 1
for i in range(10):
    mae_tot = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
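# torch.utils.data.random_split requires the given lengths to sum exactly to
# len(dataset), so the hard-coded 120000/9433 split above breaks if your QM9
# release has a different size. A hedged, version-agnostic sketch:
# n_test = len(dataset) - 120000
# train_val_set, test_set = torch.utils.data.random_split(dataset, [120000, n_test])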
class CausalClassifyNet(torch.nn.Module):
    def __init__(self, h_dim, e_dim, times):
        super(CausalClassifyNet, self).__init__()
        self.conv_layer = ConvLayer(h_dim, e_dim, times)
        self.lin1 = Sequential(Linear(h_dim, h_dim), ReLU(), Linear(h_dim, 2))

    def forward(self, batch, out):
        out = self.lin1(global_mean_pool(self.conv_layer(batch, out), batch.batch))
        return F.log_softmax(out, dim=1)


if __name__ == '__main__':
    from torch_geometric.datasets import QM9
    from torch_geometric.data import DataLoader

    dataset = QM9('data/QM9')
    loader = DataLoader(dataset, batch_size=6)
    data = next(iter(loader))  # Bug fix: `iter(loader).next()` is Python 2 syntax.
    model = BaselineRegressNet(11, 32, 4, 6)
    print(model(data))
    model = DirlNet(11, 32, 4, 3)
    print(model(data, 1))
    R = CausalFeatureNet(11, 32, 4, 3)
    D = CausalClassifyNet(32, 4, 3)
    L = CausalRegressNet(32, 4, 3)
    print(R(data))
    print(D(data, R(data)), L(data, R(data)))
def process(self):
    ogq = QM9(root=self.root)
    print(ogq[0].y)
    smiles_prop_dict = {}
    datalist = []
    tar = tarfile.open(self.raw_paths[0], "r:bz2")
    for tarinfo in tqdm(tar):
        if tarinfo.isreg():
            f = tar.extractfile(tarinfo)
            lines = f.read().decode().split('\n')
            targets = [float(i) for i in lines[1].split('\t')[1:-1]]
            # hof = [float(i) for i in lines[-4].split('\t')]  # Harmonic oscillator frequencies
            smiles = lines[-3].split('\t')[1]
            # t_names = ['A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
            # target_dict = dict(zip(t_names, targets))
            try:
                mol_graph = self.mol2pyg(smiles)
                data = mol_graph
                data.y = torch.tensor(targets).float()
                datalist.append(data)
            except Exception:  # e.g. Boost.Python.ArgumentError from RDKit
                continue
        else:
            continue
    tar.close()

    ys = torch.stack([data.y for data in datalist])
    y_mean = ys.mean(dim=0)
    y_std = ys.std(dim=0)

    heterodata_list = []
    for i in tqdm(range(len(datalist))):
        data_i, data_j = random.choice(datalist), random.choice(datalist)
        outer_edge_index_i, outer_edge_index_j = self.generate_outer(
            data_i.x.size(0), data_j.x.size(0))
        data = tg.data.HeteroData()
        data['x_i'].x = data_i.x.float()
        data['x_j'].x = data_j.x.float()
        data['x_i', 'inner_edge_i', 'x_i'].edge_index = data_i.edge_index.long()
        data['x_i', 'inner_edge_i', 'x_i'].edge_attr = data_i.edge_attr.float()
        data['x_j', 'inner_edge_j', 'x_j'].edge_index = data_j.edge_index.long()
        data['x_j', 'inner_edge_j', 'x_j'].edge_attr = data_j.edge_attr.float()
        data['x_i', 'outer_edge_ij', 'x_j'].edge_index = outer_edge_index_i.long()
        data['x_j', 'outer_edge_ji', 'x_i'].edge_index = outer_edge_index_j.long()
        data['x_i', 'outer_edge_ij', 'x_j'].edge_attr = torch.ones(
            size=(outer_edge_index_i.max() + 1, data_i.edge_attr.size(1)))
        # Likely bug in the original: this assignment targeted 'inner_edge_j'
        # and overwrote the real inner edge attributes set above; the
        # 'outer_edge_ji' relation appears intended.
        data['x_j', 'outer_edge_ji', 'x_i'].edge_attr = torch.ones(
            size=(outer_edge_index_j.max() + 1, data_j.edge_attr.size(1)))
        data['y_i'].y = data_i.y.float()
        data['y_j'].y = data_j.y.float()
        data['y_i'].y_norm = (data_i.y.float() - y_mean) / y_std
        data['y_j'].y_norm = (data_j.y.float() - y_mean) / y_std
        data.binary_y = torch.tensor([0], dtype=torch.long)
        heterodata_list.append(data)

    data, slices = self.collate(heterodata_list)
    print('Saving...')
    torch.save((data, slices), self.processed_paths[0])
class MyTransform(object):  # class header restored; the excerpt began mid-class
    def __call__(self, data):
        data.y = data.y[:, int(args.target)]  # Specify target: 0 = mu
        return data


parser = argparse.ArgumentParser()
parser.add_argument('--target', default=0)
args = parser.parse_args()
target = int(args.target)
print('---- Target: {} ----'.format(target))

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', '1-2-3-QM9')
dataset = QM9(
    path,
    transform=T.Compose([MyTransform(), T.Distance()]),
    pre_transform=MyPreTransform(),
    pre_filter=MyFilter())

dataset.data.iso_type_2 = torch.unique(dataset.data.iso_type_2, True, True)[1]
num_i_2 = dataset.data.iso_type_2.max().item() + 1
dataset.data.iso_type_2 = one_hot(dataset.data.iso_type_2, num_classes=num_i_2)

dataset.data.iso_type_3 = torch.unique(dataset.data.iso_type_3, True, True)[1]
num_i_3 = dataset.data.iso_type_3.max().item() + 1
dataset.data.iso_type_3 = one_hot(dataset.data.iso_type_3, num_classes=num_i_3)

dataset = dataset.shuffle()

# Normalize targets to mean = 0 and std = 1.
tenpercent = int(len(dataset) * 0.1)
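# Self-contained check of the relabel-then-one-hot pattern above:
# torch.unique(..., return_inverse=True)[1] maps arbitrary iso-type ids to
# contiguous indices, which one_hot can then encode.
import torch
import torch.nn.functional as F

iso = torch.tensor([7, 3, 7, 9])
relabeled = torch.unique(iso, sorted=True, return_inverse=True)[1]
print(relabeled.tolist())  # [1, 0, 1, 2]
print(F.one_hot(relabeled, num_classes=3).to(torch.float))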
def execute(config):
    device = torch.device(config['device'])
    torch.manual_seed(config['seed'])

    # Report meV instead of eV.
    units = 1000 if config['target'] in [2, 3, 4, 6, 7, 8, 9, 10] else 1

    dataset = QM9(config['data_path'])
    train_dataset, val_dataset = dataset[:50000], dataset[50000:70000]

    model = Network(
        muls=(config['mul0'], config['mul1'], config['mul2']),
        sh_lmax=config['shlmax'],
        num_layers=config['num_layers'],
        max_radius=config['max_radius'],
        num_basis=config['num_basis'],
        fc_neurons=[config['radial_num_neurons']] * config['radial_num_layers'],
        num_neighbors=20.0,
        num_nodes=20.0,
        atomref=dataset.atomref(config['target']),
    )
    model = model.to(device)

    wandb.watch(model)

    optim = torch.optim.Adam(model.parameters(), lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=25,
                                                           factor=0.5, verbose=True)

    runtime = time.perf_counter()
    runtime_print = time.perf_counter()

    for epoch in itertools.count():
        errs = []
        loader = DataLoader(train_dataset, batch_size=config['bs'], shuffle=True)
        for step, data in enumerate(loader):
            data = data.to(device)
            pred = model(data)
            err = pred.view(-1) - data.y[:, config['target']]

            optim.zero_grad()
            err.pow(2).mean().backward()
            optim.step()

            errs += [err.cpu().detach()]

            if time.perf_counter() - runtime_print > 15:
                runtime_print = time.perf_counter()
                w = time.perf_counter() - runtime
                e = epoch + (step + 1) / len(loader)
                print((
                    f'[{e:.1f}] ['
                    f'runtime={w / 3600:.2f}h '
                    f'runtime/epoch={w / e:.0f}s '
                    f'runtime/step={1e3 * w / e / len(loader):.0f}ms '
                    f'step={step}/{len(loader)} '
                    f'mae={units * torch.cat(errs)[-200:].abs().mean():.5f} '
                ), flush=True)

        train_err = torch.cat(errs)

        errs = []
        loader = DataLoader(val_dataset, batch_size=256)
        for data in loader:
            data = data.to(device)
            with torch.no_grad():
                pred = model(data)
            err = pred.view(-1) - data.y[:, config['target']]
            errs += [err.cpu().detach()]
        val_err = torch.cat(errs)

        lrs = [x['lr'] for x in optim.param_groups]
        status = {
            'epoch': epoch,
            '_runtime': time.perf_counter() - runtime,
            'train': {
                'mae': {
                    'mean': units * train_err.abs().mean().item(),
                    'std': units * train_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * train_err.pow(2).mean().item(),
                    'std': units * train_err.pow(2).std().item(),
                }
            },
            'val': {
                'mae': {
                    'mean': units * val_err.abs().mean().item(),
                    'std': units * val_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * val_err.pow(2).mean().item(),
                    'std': units * val_err.pow(2).std().item(),
                }
            },
            'lrs': lrs,
        }
        wandb.log(status)

        print((
            f'[{epoch}] Target: {config["target"]:02d}, '
            f'MAE TRAIN: {units * train_err.abs().mean():.5f} ± {units * train_err.abs().std():.5f}, '
            f'MAE VAL: {units * val_err.abs().mean():.5f} ± {units * val_err.abs().std():.5f}'
        ), flush=True)
        scheduler.step(val_err.pow(2).mean())

        if status['_runtime'] > config['max_runtime']:
            break
        # (Fragment: tail of a message-passing `forward`; the enclosing class
        # definition precedes this excerpt.)
        out, h = self.gru(m.unsqueeze(0), h)
        out = out.squeeze(0)

        out = self.set2set(out, data.batch)
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
        return out


results = []
results_log = []
for _ in range(5):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', '1t-QM9')
    dataset = QM9(path, transform=T.Compose([Complete(), T.Distance(norm=False)]))
    dataset.data.y = dataset.data.y[:, 0:12]
    dataset = dataset.shuffle()

    tenpercent = int(len(dataset) * 0.1)
    print("###")
    mean = dataset.data.y.mean(dim=0, keepdim=True)
    std = dataset.data.y.std(dim=0, keepdim=True)
    dataset.data.y = (dataset.data.y - mean) / std
    mean, std = mean.to(device), std.to(device)

    print("###")
    test_dataset = dataset[:tenpercent].shuffle()
    val_dataset = dataset[tenpercent:2 * tenpercent].shuffle()
    train_dataset = dataset[2 * tenpercent:].shuffle()
class MyTransform(object):
    # Cutoff-radius graph (all pairs within `args.cutoff`, not a true k-NN
    # graph), plus feature and target selection.
    def __call__(self, data):
        dist = (data.pos.view(-1, 1, 3) - data.pos.view(1, -1, 3)).norm(dim=-1)
        dist.fill_diagonal_(float('inf'))
        mask = dist <= args.cutoff
        data.edge_index = mask.nonzero().t()
        data.edge_attr = None  # No need to maintain bond types.
        data.x = data.x[:, :5]  # Just make use of atom types as features.
        data.y = data.y[:, args.target]
        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform()).shuffle()

train_dataset = dataset[:110000]
val_dataset = dataset[110000:120000]
test_dataset = dataset[120000:]

train_loader = DataLoader(train_dataset, 44, shuffle=True, num_workers=6)
val_loader = DataLoader(val_dataset, 44, num_workers=6)
test_loader = DataLoader(test_dataset, 44, num_workers=6)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DimeNet(in_channels=dataset.num_node_features, hidden_channels=128,
                out_channels=1, num_blocks=6, num_bilinear=8, num_spherical=7,
                num_radial=6)  # assumption: the call was truncated in the source;
                               # num_radial=6 follows the common DimeNet QM9 config
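# Self-contained check of the cutoff-graph construction used in MyTransform
# above (torch only; the cutoff of 2.0 is a hypothetical value):
import torch

pos = torch.tensor([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [5.0, 0.0, 0.0]])
dist = (pos.view(-1, 1, 3) - pos.view(1, -1, 3)).norm(dim=-1)
dist.fill_diagonal_(float('inf'))
edge_index = (dist <= 2.0).nonzero().t()
print(edge_index)  # tensor([[0, 1], [1, 0]]) -- only atoms within the cutoff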
def execute(args):
    path = 'QM9'
    dataset = QM9(path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Report meV instead of eV.
    units = 1000 if args.target in [2, 3, 4, 6, 7, 8, 9, 10] else 1

    _, datasets = SchNet.from_qm9_pretrained(path, dataset, args.target)
    train_dataset, val_dataset, _test_dataset = datasets

    model = Network(
        muls=(args.mul0, args.mul1, args.mul2),
        ps=(1,) if 'shp' in args.opts else (1, -1),
        lmax=args.lmax,
        num_layers=args.num_layers,
        rad_gaussians=args.rad_gaussians,
        rad_hs=(args.rad_h,) * args.rad_layers + (args.rad_bottleneck,),
        mean=args.mean,
        std=args.std,
        atomref=dataset.atomref(args.target),
        options=args.opts)
    model = model.to(device)

    # Profile a few training steps.
    loader = DataLoader(train_dataset, batch_size=args.bs, shuffle=False)
    for step, data in enumerate(loader):
        with torch.autograd.profiler.profile(use_cuda=True, record_shapes=True) as prof:
            data = data.to(device)
            pred = model(data.z, data.pos, data.batch)
            mse = (pred.view(-1) - data.y[:, args.target]).pow(2)
            mse.mean().backward()
        if step == 5:
            break
    prof.export_chrome_trace(f"{datetime.datetime.now()}.json")

    optim = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=25,
                                                           factor=0.5, verbose=True)

    dynamics = []
    wall = time.perf_counter()
    wall_print = time.perf_counter()

    for epoch in itertools.count():
        errs = []
        loader = DataLoader(train_dataset, batch_size=args.bs, shuffle=True)
        for step, data in enumerate(loader):
            data = data.to(device)
            pred = model(data.z, data.pos, data.batch)
            optim.zero_grad()
            (pred.view(-1) - data.y[:, args.target]).pow(2).mean().backward()
            optim.step()

            err = pred.view(-1) - data.y[:, args.target]
            errs += [err.cpu().detach()]

            if time.perf_counter() - wall_print > 15:
                wall_print = time.perf_counter()
                w = time.perf_counter() - wall
                e = epoch + (step + 1) / len(loader)
                print((
                    f'[{e:.1f}] ['
                    f'wall={w / 3600:.2f}h '
                    f'wall/epoch={w / e:.0f}s '
                    f'wall/step={1e3 * w / e / len(loader):.0f}ms '
                    f'step={step}/{len(loader)} '
                    f'mae={units * torch.cat(errs)[-200:].abs().mean():.5f} '
                    f'lr={optim.param_groups[0]["lr"]:.1e}]'
                ), flush=True)

        train_err = torch.cat(errs)

        errs = []
        loader = DataLoader(val_dataset, batch_size=256)
        for data in loader:
            data = data.to(device)
            with torch.no_grad():
                pred = model(data.z, data.pos, data.batch)
            err = pred.view(-1) - data.y[:, args.target]
            errs += [err.cpu().detach()]
        val_err = torch.cat(errs)

        dynamics += [{
            'epoch': epoch,
            'wall': time.perf_counter() - wall,
            'train': {
                'mae': {
                    'mean': units * train_err.abs().mean().item(),
                    'std': units * train_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * train_err.pow(2).mean().item(),
                    'std': units * train_err.pow(2).std().item(),
                }
            },
            'val': {
                'mae': {
                    'mean': units * val_err.abs().mean().item(),
                    'std': units * val_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * val_err.pow(2).mean().item(),
                    'std': units * val_err.pow(2).std().item(),
                }
            },
            'lr': optim.param_groups[0]["lr"],
        }]

        print(
            f'[{epoch}] Target: {args.target:02d}, '
            f'MAE TRAIN: {units * train_err.abs().mean():.5f} ± {units * train_err.abs().std():.5f}, '
            f'MAE VAL: {units * val_err.abs().mean():.5f} ± {units * val_err.abs().std():.5f}',
            flush=True)
        scheduler.step(val_err.pow(2).mean())

        yield {
            'args': args,
            'dynamics': dynamics,
            'state': {k: v.cpu() for k, v in model.state_dict().items()},
        }
class MyTransform(object):
    def __call__(self, data):
        data.y = data.y[:, int(args.target)]  # Specify target: 0 = mu
        return data


parser = argparse.ArgumentParser()
parser.add_argument('--target', default=0)
args = parser.parse_args()
target = int(args.target)
print('---- Target: {} ----'.format(target))

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', '1-QM9')
dataset = QM9(path, transform=T.Compose([MyTransform(), T.Distance()]))
dataset = dataset.shuffle()

# Normalize targets to mean = 0 and std = 1.
tenpercent = int(len(dataset) * 0.1)
mean = dataset.data.y[tenpercent:].mean(dim=0)
std = dataset.data.y[tenpercent:].std(dim=0)
dataset.data.y = (dataset.data.y - mean) / std

test_dataset = dataset[:tenpercent]
val_dataset = dataset[tenpercent:2 * tenpercent]
train_dataset = dataset[2 * tenpercent:]
test_loader = DataLoader(test_dataset, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
parser = argparse.ArgumentParser()
parser.add_argument('--target', type=int, default=0)
parser.add_argument('--dim', type=int, default=64)
args = parser.parse_args()


class MyTransform:
    def __call__(self, data):
        data.y = data.y[:, args.target]
        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets', 'QM9')
transform = T.Compose([MyTransform(), T.Distance()])
dataset = QM9(path, transform=transform).shuffle()

# Normalize targets to mean=0 and std=1.
mean = dataset.data.y[:, args.target].mean().item()
std = dataset.data.y[:, args.target].std().item()
dataset.data.y[:, args.target] = (dataset.data.y[:, args.target] - mean) / std

# Dataset split.
tenpercent = int(len(dataset) * 0.1)
test_dataset = dataset[:tenpercent]
val_dataset = dataset[tenpercent:2 * tenpercent]
train_dataset = dataset[2 * tenpercent:]
test_loader = DataLoader(test_dataset, batch_size=256)
val_loader = DataLoader(val_dataset, batch_size=256)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
class MyFilter(object):  # class header restored; the excerpt began mid-class
    def __call__(self, data):
        return data.num_nodes > 6  # Remove graphs with 6 or fewer nodes.


class MyPreTransform(object):
    def __call__(self, data):
        x = data.x
        data.x = data.x[:, :5]
        data = ConnectedThreeMalkin()(data)
        data.x = x
        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', '1-23-QM9')
dataset = QM9(
    path,
    transform=T.Compose([T.Distance(norm=False)]),
    pre_transform=MyPreTransform(),
    pre_filter=MyFilter())
dataset.data.y = dataset.data.y[:, 0:12]

dataset.data.iso_type_3 = torch.unique(dataset.data.iso_type_3, True, True)[1]
num_i_3 = dataset.data.iso_type_3.max().item() + 1
dataset.data.iso_type_3 = F.one_hot(dataset.data.iso_type_3,
                                    num_classes=num_i_3).to(torch.float)


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        M_in, M_out = dataset.num_features, 32