def __init__(self, config, mode='train', transform=None):
    """Build the dataset: load cached PGL graph dicts from a pickle file,
    or convert every raw SMILES string to a graph dict and cache the result.

    Args:
        config: experiment config; must provide ``base_data_path``.
        mode: split tag (stored for callers; not used in this method).
        transform: optional per-sample transform (stored for callers).
    """
    self.config = config
    self.mode = mode
    self.transform = transform
    self.raw_dataset = PCQM4MDataset(config.base_data_path, only_smiles=True)

    log.info("preprocess graph data in %s" % self.__class__.__name__)
    processed_path = os.path.join(self.raw_dataset.folder, "pgl_processed")
    if not os.path.exists(processed_path):
        os.makedirs(processed_path)
    data_file = os.path.join(processed_path, "graph_data.pkl")

    if os.path.exists(data_file):
        log.info("loading graph data from pkl file")
        # FIX: context manager — the original leaked the file handle
        # returned by open().
        with open(data_file, "rb") as f:
            self.graph_list = pkl.load(f)
    else:
        log.info("loading graph data from smiles data")
        self.graph_list = []
        for i in tqdm.tqdm(range(len(self.raw_dataset))):
            # num_nodes, edge_index, node_feat, edge_feat, label
            smiles, label = self.raw_dataset[i]
            graph = smiles2graph(smiles)
            new_graph = {}
            new_graph["edges"] = graph["edge_index"].T
            new_graph["num_nodes"] = graph["num_nodes"]
            new_graph["node_feat"] = graph["node_feat"]
            new_graph["edge_feat"] = graph["edge_feat"]
            new_graph["label"] = label
            self.graph_list.append(new_graph)
        # FIX: context manager guarantees the pickle is flushed and closed
        # even if pkl.dump raises.
        with open(data_file, 'wb') as f:
            pkl.dump(self.graph_list, f)
def __init__(self, config, mode='train', transform=None):
    """Load a preprocessed, memory-mapped PGL graph and its label array.

    ``config.preprocess_file`` is treated as a directory expected to hold
    an "mmap_graph" folder and a "label.npy" file — TODO confirm against
    the preprocessing script that writes them.
    """
    self.config = config
    self.mode = mode
    self.transform = transform
    self.raw_dataset = PCQM4MDataset(config.base_data_path, only_smiles=True)
    log.info("preprocess graph data in %s" % self.__class__.__name__)

    # Resolve both artifacts relative to the same preprocess directory.
    base_dir = self.config.preprocess_file
    self.graph = pgl.Graph.load(os.path.join(base_dir, "mmap_graph"))
    self.label = np.load(os.path.join(base_dir, "label.npy"))
def __init__(self, config, mode="train"):
    """Build the dataset, caching transformed graphs in a pickle file.

    When ``config.debug`` is false and ``config.preprocess_file`` is set,
    graphs are loaded from (or written to) a pickle cache under
    ``base_data_path/pgl_processed``; each SMILES string is converted by
    the instance method named in ``config.transform``. Otherwise only the
    junction-tree vocabulary is loaded and ``graph_list`` stays ``None``.
    """
    log.info("dataset_type is %s" % self.__class__.__name__)
    self.config = config
    self.mode = mode
    self.transform = config.transform
    self.raw_dataset = PCQM4MDataset(config.base_data_path, only_smiles=True)
    self.graph_list = None

    if not config.debug and self.config.preprocess_file is not None:
        log.info("preprocess graph data in %s" % self.__class__.__name__)
        processed_path = os.path.join(self.config.base_data_path,
                                      "pgl_processed")
        if not os.path.exists(processed_path):
            os.makedirs(processed_path)
        data_file = os.path.join(processed_path, self.config.preprocess_file)
        if os.path.exists(data_file):
            log.info("loading graph data from pkl file")
            # FIX: context manager — the original leaked the open() handle.
            with open(data_file, "rb") as f:
                self.graph_list = pkl.load(f)
        else:
            log.info("loading graph data from smiles data using %s transform" \
                    % self.transform)
            self.graph_list = []
            for i in tqdm.tqdm(range(len(self.raw_dataset))):
                # num_nodes, edge_index, node_feat, edge_feat, label
                smiles, label = self.raw_dataset[i]
                # Dispatch to the configured transform method by name.
                g = getattr(self, self.transform)(smiles, label)
                self.graph_list.append(g)
            # FIX: context manager guarantees the cache is flushed/closed.
            with open(data_file, 'wb') as f:
                pkl.dump(self.graph_list, f)
    else:
        processed_path = os.path.join(self.config.base_data_path,
                                      "pgl_processed")
        vocab_file = os.path.join(processed_path, "junc_vocab.txt")
        self.vocab = load_vocab(vocab_file)
def main_mlp():
    """Train an MLP on fingerprint features of PCQM4M and optionally save
    a checkpoint and a test-dev submission file.

    FIX: the test TensorDataset previously paired ``X[split_idx['test-dev']]``
    with ``Y[split_idx['test']]`` — two different split keys, so features and
    labels were misaligned (and could differ in length). Both now use
    'test-dev', matching the ``mode='test-dev'`` submission below.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbgmol* data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--num_mlp_layers', type=int, default=6,
                        help='number of mlp layers (default: 6)')
    parser.add_argument('--drop_ratio', type=float, default=0.2,
                        help='dropout ratio (default: 0.2)')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--emb_dim', type=int, default=1600,
                        help='embedding dimensionality (default: 1600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--radius', type=int, default=2,
                        help='radius (default: 2)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    # Fixed seeds for reproducibility across numpy / torch / python RNGs.
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    random.seed(42)

    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() \
        else torch.device("cpu")

    dataset = PCQM4MDataset(root='dataset/', only_smiles=True)
    fp_processed_file = preprocess_fp(dataset, args.radius)

    data_dict = torch.load(fp_processed_file)
    X, Y = data_dict['X'], data_dict['Y']

    split_idx = dataset.get_idx_split()
    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        print('train subset')
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(
            split_idx["train"]))[:int(subset_ratio * len(split_idx["train"]))]
        train_dataset = TensorDataset(X[split_idx['train'][subset_idx]],
                                      Y[split_idx['train'][subset_idx]])
    else:
        train_dataset = TensorDataset(X[split_idx['train']],
                                      Y[split_idx['train']])

    valid_dataset = TensorDataset(X[split_idx['valid']], Y[split_idx['valid']])
    # FIX: use the same split key for features and labels.
    test_dataset = TensorDataset(X[split_idx['test-dev']],
                                 Y[split_idx['test-dev']])

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)

    if args.save_test_dir != '':
        test_loader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    model = MLP(num_mlp_layers=args.num_mlp_layers,
                emb_dim=args.emb_dim,
                drop_ratio=args.drop_ratio).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    if args.train_subset:
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, device, train_loader, optimizer)

        print('Evaluating...')
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'best_val_mae': best_valid_mae,
                    'num_params': num_params
                }
                torch.save(checkpoint,
                           osp.join(args.checkpoint_dir, 'checkpoint.pt'))

            if args.save_test_dir != '':
                print('Predicting on test data...')
                y_pred = test(model, device, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir,
                                               mode='test-dev')

        scheduler.step()
        print(f'Best validation MAE so far: {best_valid_mae}')

    if args.log_dir != '':
        writer.close()
def main():
    """Run test-split inference with a trained DGL GNN and save a submission."""
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with DGL')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed to use (default: 42)')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN to use, which can be from '
        '[gin, gin-virtual, gcn, gcn-virtual] (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling', type=str, default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    # Seed every RNG we rely on; CUDA seeding only when a GPU is present.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    ### automatic data loading and splitting
    ### Read in the raw SMILES strings
    smiles_dataset = PCQM4MDataset(root='dataset/', only_smiles=True)
    split_idx = smiles_dataset.get_idx_split()

    test_smiles = [smiles_dataset[i] for i in split_idx['test']]
    test_loader = DataLoader(OnTheFlyPCQMDataset(test_smiles),
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    ### automatic evaluator.
    evaluator = PCQM4MEvaluator()

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }
    # Dispatch table replaces the if/elif chain; same four model variants.
    variants = {
        'gin': dict(gnn_type='gin', virtual_node=False),
        'gin-virtual': dict(gnn_type='gin', virtual_node=True),
        'gcn': dict(gnn_type='gcn', virtual_node=False),
        'gcn-virtual': dict(gnn_type='gcn', virtual_node=True),
    }
    if args.gnn not in variants:
        raise ValueError('Invalid GNN type')
    model = GNN(**variants[args.gnn], **shared_params).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pt')
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(f'Checkpoint file not found at {checkpoint_path}')

    ## reading in checkpoint
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])

    print('Predicting on test data...')
    y_pred = test(model, device, test_loader)
    print('Saving test submission file...')
    evaluator.save_test_submission({'y_pred': y_pred}, args.save_test_dir)
def main_mlp():
    """Train an MLP on fingerprint features of PCQM4M and optionally save
    a checkpoint and a test submission file.

    FIX: every ``args.x is not ""`` was an *identity* comparison against a
    string literal — implementation-dependent (relies on interning) and a
    SyntaxWarning on CPython >= 3.8. All replaced with value comparison
    ``!= ""``.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description="GNN baselines on ogbgmol* data with Pytorch Geometrics")
    parser.add_argument("--device", type=int, default=0,
                        help="which gpu to use if any (default: 0)")
    parser.add_argument(
        "--num_mlp_layers",
        type=int,
        default=6,
        help="number of mlp layers (default: 6)",
    )
    parser.add_argument("--drop_ratio", type=float, default=0.2,
                        help="dropout ratio (default: 0.2)")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="input batch size for training (default: 256)",
    )
    parser.add_argument(
        "--emb_dim",
        type=int,
        default=1600,
        help="embedding dimensionality (default: 1600)",
    )
    parser.add_argument("--train_subset", action="store_true")
    parser.add_argument(
        "--epochs",
        type=int,
        default=100,
        help="number of epochs to train (default: 100)",
    )
    parser.add_argument("--num_workers", type=int, default=0,
                        help="number of workers (default: 0)")
    parser.add_argument("--radius", type=int, default=2,
                        help="radius (default: 2)")
    parser.add_argument("--log_dir", type=str, default="",
                        help="tensorboard log directory")
    parser.add_argument("--checkpoint_dir", type=str, default="",
                        help="directory to save checkpoint")
    parser.add_argument(
        "--save_test_dir",
        type=str,
        default="",
        help="directory to save test submission file",
    )
    args = parser.parse_args()
    print(args)

    # Fixed seeds for reproducibility across numpy / torch / python RNGs.
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    random.seed(42)

    device = (torch.device("cuda:" + str(args.device))
              if torch.cuda.is_available() else torch.device("cpu"))

    dataset = PCQM4MDataset(root="dataset/", only_smiles=True)
    fp_processed_file = preprocess_fp(dataset, args.radius)

    data_dict = torch.load(fp_processed_file)
    X, Y = data_dict["X"], data_dict["Y"]

    split_idx = dataset.get_idx_split()
    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        print("train subset")
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(
            split_idx["train"]))[:int(subset_ratio * len(split_idx["train"]))]
        train_dataset = TensorDataset(X[split_idx["train"][subset_idx]],
                                      Y[split_idx["train"][subset_idx]])
    else:
        train_dataset = TensorDataset(X[split_idx["train"]],
                                      Y[split_idx["train"]])

    valid_dataset = TensorDataset(X[split_idx["valid"]], Y[split_idx["valid"]])
    test_dataset = TensorDataset(X[split_idx["test"]], Y[split_idx["test"]])

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
    )

    # FIX: value comparison instead of `is not ""`.
    if args.save_test_dir != "":
        test_loader = DataLoader(
            test_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.num_workers,
        )

    if args.checkpoint_dir != "":
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    model = MLP(
        num_mlp_layers=args.num_mlp_layers,
        emb_dim=args.emb_dim,
        drop_ratio=args.drop_ratio,
    ).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f"#Params: {num_params}")

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != "":
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    if args.train_subset:
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print("Training...")
        train_mae = train(model, device, train_loader, optimizer)

        print("Evaluating...")
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({"Train": train_mae, "Validation": valid_mae})

        if args.log_dir != "":
            writer.add_scalar("valid/mae", valid_mae, epoch)
            writer.add_scalar("train/mae", train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != "":
                print("Saving checkpoint...")
                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "best_val_mae": best_valid_mae,
                    "num_params": num_params,
                }
                torch.save(checkpoint,
                           osp.join(args.checkpoint_dir, "checkpoint.pt"))

            if args.save_test_dir != "":
                print("Predicting on test data...")
                y_pred = test(model, device, test_loader)
                print("Saving test submission file...")
                evaluator.save_test_submission({"y_pred": y_pred},
                                               args.save_test_dir)

        scheduler.step()
        print(f"Best validation MAE so far: {best_valid_mae}")

    if args.log_dir != "":
        writer.close()