Example #1
    def __init__(self, config, mode='train', transform=None):
        self.config = config
        self.mode = mode
        self.transform = transform
        self.raw_dataset = PCQM4MDataset(config.base_data_path,
                                         only_smiles=True)

        log.info("preprocess graph data in %s" % self.__class__.__name__)
        processed_path = os.path.join(self.raw_dataset.folder, "pgl_processed")
        if not os.path.exists(processed_path):
            os.makedirs(processed_path)
        data_file = os.path.join(processed_path, "graph_data.pkl")

        if os.path.exists(data_file):
            log.info("loading graph data from pkl file")
            with open(data_file, "rb") as f:
                self.graph_list = pkl.load(f)
        else:
            log.info("loading graph data from smiles data")
            self.graph_list = []
            for i in tqdm.tqdm(range(len(self.raw_dataset))):
                # num_nodes, edge_index, node_feat, edge_feat, label
                smiles, label = self.raw_dataset[i]
                graph = smiles2graph(smiles)
                new_graph = {}
                new_graph["edges"] = graph["edge_index"].T
                new_graph["num_nodes"] = graph["num_nodes"]
                new_graph["node_feat"] = graph["node_feat"]
                new_graph["edge_feat"] = graph["edge_feat"]
                new_graph["label"] = label
                self.graph_list.append(new_graph)

            with open(data_file, "wb") as f:
                pkl.dump(self.graph_list, f)
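
The snippet above only shows __init__; a Dataset like this would normally also expose __len__ and __getitem__. A minimal sketch of those companion methods (not part of the original snippet; applying self.transform per item is an assumption):

    def __len__(self):
        return len(self.graph_list)

    def __getitem__(self, idx):
        # Hypothetical accessor: returns one preprocessed graph dict and
        # applies the optional transform passed to __init__, if any.
        graph = self.graph_list[idx]
        if self.transform is not None:
            graph = self.transform(graph)
        return graph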
Example #2
    def __init__(self, config, mode='train', transform=None):
        self.config = config
        self.mode = mode
        self.transform = transform
        self.raw_dataset = PCQM4MDataset(config.base_data_path,
                                         only_smiles=True)

        log.info("preprocess graph data in %s" % self.__class__.__name__)

        graph_path = os.path.join(self.config.preprocess_file, "mmap_graph")
        label_file = os.path.join(self.config.preprocess_file, "label.npy")

        self.graph = pgl.Graph.load(graph_path)
        self.label = np.load(label_file)
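
Example #2 only shows the loading side; the files it reads would have to be produced by an earlier preprocessing run. A rough sketch of that producer, assuming PGL 2.x (where pgl.Graph.batch merges a list of graphs into one disjoint graph and Graph.dump writes the arrays that Graph.load can later memory-map); build_mmap_graph is an invented name:

import os

import numpy as np
import pgl


def build_mmap_graph(graph_list, labels, out_dir):
    # Merge the per-molecule graphs into one disjoint graph and dump it to
    # disk; pgl.Graph.load(os.path.join(out_dir, "mmap_graph")) can then
    # memory-map the dumped arrays instead of re-reading SMILES.
    merged = pgl.Graph.batch(graph_list)
    merged.dump(os.path.join(out_dir, "mmap_graph"))
    np.save(os.path.join(out_dir, "label.npy"), np.asarray(labels))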
Example #3
    def __init__(self, config, mode="train"):
        log.info("dataset_type is %s" % self.__class__.__name__)
        self.config = config
        self.mode = mode
        self.transform = config.transform
        self.raw_dataset = PCQM4MDataset(config.base_data_path,
                                         only_smiles=True)

        self.graph_list = None
        if not config.debug and self.config.preprocess_file is not None:
            log.info("preprocess graph data in %s" % self.__class__.__name__)
            processed_path = os.path.join(self.config.base_data_path,
                                          "pgl_processed")
            if not os.path.exists(processed_path):
                os.makedirs(processed_path)
            data_file = os.path.join(processed_path,
                                     self.config.preprocess_file)

            if os.path.exists(data_file):
                log.info("loading graph data from pkl file")
                with open(data_file, "rb") as f:
                    self.graph_list = pkl.load(f)
            else:
                log.info("loading graph data from smiles data using %s transform" \
                        % self.transform)
                self.graph_list = []
                for i in tqdm.tqdm(range(len(self.raw_dataset))):
                    # num_nodes, edge_index, node_feat, edge_feat, label
                    smiles, label = self.raw_dataset[i]
                    g = getattr(self, self.transform)(smiles, label)
                    self.graph_list.append(g)

                with open(data_file, "wb") as f:
                    pkl.dump(self.graph_list, f)
        else:
            processed_path = os.path.join(self.config.base_data_path,
                                          "pgl_processed")
            vocab_file = os.path.join(processed_path, "junc_vocab.txt")
            self.vocab = load_vocab(vocab_file)
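
Here the transform is dispatched by name with getattr(self, self.transform). A hypothetical transform method compatible with that dispatch (the name base_transform is invented; the dict layout mirrors Example #1):

    def base_transform(self, smiles, label):
        # Hypothetical transform reachable via getattr(self, self.transform);
        # builds the same graph dict as Example #1.
        graph = smiles2graph(smiles)
        return {
            "edges": graph["edge_index"].T,
            "num_nodes": graph["num_nodes"],
            "node_feat": graph["node_feat"],
            "edge_feat": graph["edge_feat"],
            "label": label,
        }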
Example #4
def main_mlp():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbgmol* data with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--num_mlp_layers',
                        type=int,
                        default=6,
                        help='number of mlp layers (default: 6)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.2,
                        help='dropout ratio (default: 0.2)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--emb_dim',
                        type=int,
                        default=1600,
                        help='embedding dimensionality (default: 1600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--radius',
                        type=int,
                        default=2,
                        help='radius (default: 2)')
    parser.add_argument('--log_dir',
                        type=str,
                        default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir',
                        type=str,
                        default='',
                        help='directory to save test submission file')
    args = parser.parse_args()

    print(args)

    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    random.seed(42)

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    dataset = PCQM4MDataset(root='dataset/', only_smiles=True)
    fp_processed_file = preprocess_fp(dataset, args.radius)

    data_dict = torch.load(fp_processed_file)
    X, Y = data_dict['X'], data_dict['Y']

    split_idx = dataset.get_idx_split()
    ### automatic evaluator
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        print('train subset')
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(
            split_idx["train"]))[:int(subset_ratio * len(split_idx["train"]))]
        train_dataset = TensorDataset(X[split_idx['train'][subset_idx]],
                                      Y[split_idx['train'][subset_idx]])

    else:
        train_dataset = TensorDataset(X[split_idx['train']],
                                      Y[split_idx['train']])

    valid_dataset = TensorDataset(X[split_idx['valid']], Y[split_idx['valid']])
    test_dataset = TensorDataset(X[split_idx['test-dev']],
                                 Y[split_idx['test-dev']])

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)

    if args.save_test_dir != '':
        test_loader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    model = MLP(num_mlp_layers=args.num_mlp_layers,
                emb_dim=args.emb_dim,
                drop_ratio=args.drop_ratio).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    if args.train_subset:
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, device, train_loader, optimizer)

        print('Evaluating...')
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'best_val_mae': best_valid_mae,
                    'num_params': num_params
                }
                torch.save(checkpoint,
                           osp.join(args.checkpoint_dir, 'checkpoint.pt'))

            if args.save_test_dir != '':
                print('Predicting on test data...')
                y_pred = test(model, device, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir,
                                               mode='test-dev')

        scheduler.step()

        print(f'Best validation MAE so far: {best_valid_mae}')

    if args.log_dir != '':
        writer.close()
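
The loop above relies on train/eval helpers defined elsewhere. A minimal sketch of the eval helper, with the signature assumed from its call site and the evaluator used in the standard OGB way ({'y_true', 'y_pred'} in, {'mae'} out):

import torch


def eval(model, device, loader, evaluator):
    # Sketch of the evaluation helper called above (signature assumed from
    # the call site); keeps the name eval to match, even though it shadows
    # the builtin.
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for x, y in loader:
            pred = model(x.to(device)).view(-1)
            y_true.append(y.view(-1).cpu())
            y_pred.append(pred.cpu())
    input_dict = {"y_true": torch.cat(y_true), "y_pred": torch.cat(y_pred)}
    return evaluator.eval(input_dict)["mae"]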
Example #5
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with DGL')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed to use (default: 42)')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help='GNN to use, which can be from '
        '[gin, gin-virtual, gcn, gcn-virtual] (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling',
        type=str,
        default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir',
                        type=str,
                        default='',
                        help='directory to save test submission file')
    args = parser.parse_args()

    print(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    ### automatic data loading and splitting
    ### Read in the raw SMILES strings
    smiles_dataset = PCQM4MDataset(root='dataset/', only_smiles=True)
    split_idx = smiles_dataset.get_idx_split()

    test_smiles_dataset = [smiles_dataset[i] for i in split_idx['test']]
    onthefly_dataset = OnTheFlyPCQMDataset(test_smiles_dataset)
    test_loader = DataLoader(onthefly_dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    ### automatic evaluator.
    evaluator = PCQM4MEvaluator()

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True,
                    **shared_params).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True,
                    **shared_params).to(device)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pt')
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(f'Checkpoint file not found at {checkpoint_path}')

    ### read in the checkpoint on the right device
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    print('Predicting on test data...')
    y_pred = test(model, device, test_loader)
    print('Saving test submission file...')
    evaluator.save_test_submission({'y_pred': y_pred}, args.save_test_dir)
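
collate_dgl is referenced but not shown. A sketch of what it could look like, assuming each OnTheFlyPCQMDataset item is a (graph, label) pair and using dgl.batch to merge the graphs:

import dgl
import torch


def collate_dgl(samples):
    # Merge a list of (graph, label) pairs into one batched (disjoint) DGL
    # graph plus a label tensor.
    graphs, labels = map(list, zip(*samples))
    return dgl.batch(graphs), torch.tensor(labels)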
Example #6
def main_mlp():
    # Training settings
    parser = argparse.ArgumentParser(
        description="GNN baselines on ogbgmol* data with Pytorch Geometrics")
    parser.add_argument("--device",
                        type=int,
                        default=0,
                        help="which gpu to use if any (default: 0)")
    parser.add_argument(
        "--num_mlp_layers",
        type=int,
        default=6,
        help="number of mlp layers (default: 6)",
    )
    parser.add_argument("--drop_ratio",
                        type=float,
                        default=0.2,
                        help="dropout ratio (default: 0.2)")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="input batch size for training (default: 256)",
    )
    parser.add_argument(
        "--emb_dim",
        type=int,
        default=1600,
        help="embedding dimensionality (default: 1600)",
    )
    parser.add_argument("--train_subset", action="store_true")
    parser.add_argument(
        "--epochs",
        type=int,
        default=100,
        help="number of epochs to train (default: 100)",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=0,
                        help="number of workers (default: 0)")
    parser.add_argument("--radius",
                        type=int,
                        default=2,
                        help="radius (default: 2)")
    parser.add_argument("--log_dir",
                        type=str,
                        default="",
                        help="tensorboard log directory")
    parser.add_argument("--checkpoint_dir",
                        type=str,
                        default="",
                        help="directory to save checkpoint")
    parser.add_argument(
        "--save_test_dir",
        type=str,
        default="",
        help="directory to save test submission file",
    )
    args = parser.parse_args()

    print(args)

    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    random.seed(42)

    device = (torch.device("cuda:" + str(args.device))
              if torch.cuda.is_available() else torch.device("cpu"))

    dataset = PCQM4MDataset(root="dataset/", only_smiles=True)
    fp_processed_file = preprocess_fp(dataset, args.radius)

    data_dict = torch.load(fp_processed_file)
    X, Y = data_dict["X"], data_dict["Y"]

    split_idx = dataset.get_idx_split()
    ### automatic evaluator
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        print("train subset")
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(
            split_idx["train"]))[:int(subset_ratio * len(split_idx["train"]))]
        train_dataset = TensorDataset(X[split_idx["train"][subset_idx]],
                                      Y[split_idx["train"][subset_idx]])

    else:
        train_dataset = TensorDataset(X[split_idx["train"]],
                                      Y[split_idx["train"]])

    valid_dataset = TensorDataset(X[split_idx["valid"]], Y[split_idx["valid"]])
    test_dataset = TensorDataset(X[split_idx["test"]], Y[split_idx["test"]])

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
    )

    if args.save_test_dir != "":
        test_loader = DataLoader(
            test_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.num_workers,
        )

    if args.checkpoint_dir != "":
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    model = MLP(
        num_mlp_layers=args.num_mlp_layers,
        emb_dim=args.emb_dim,
        drop_ratio=args.drop_ratio,
    ).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f"#Params: {num_params}")

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != "":
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    if args.train_subset:
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print("Training...")
        train_mae = train(model, device, train_loader, optimizer)

        print("Evaluating...")
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({"Train": train_mae, "Validation": valid_mae})

        if args.log_dir != "":
            writer.add_scalar("valid/mae", valid_mae, epoch)
            writer.add_scalar("train/mae", train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != "":
                print("Saving checkpoint...")
                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "best_val_mae": best_valid_mae,
                    "num_params": num_params,
                }
                torch.save(checkpoint,
                           osp.join(args.checkpoint_dir, "checkpoint.pt"))

            if args.save_test_dir != "":
                print("Predicting on test data...")
                y_pred = test(model, device, test_loader)
                print("Saving test submission file...")
                evaluator.save_test_submission({"y_pred": y_pred},
                                               args.save_test_dir)

        scheduler.step()

        print(f"Best validation MAE so far: {best_valid_mae}")

    if args.log_dir != "":
        writer.close()
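
For symmetry with the eval sketch after Example #4, here is a minimal sketch of the train helper both MLP scripts call, assuming the signature from the call site and L1 loss (which matches the MAE metric being tracked):

import torch


def train(model, device, loader, optimizer):
    # Sketch of the training helper (signature assumed from the call site);
    # trains with L1 loss, so the returned average is an MAE estimate.
    model.train()
    loss_fn = torch.nn.L1Loss()
    total_loss, num_batches = 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(x).view(-1), y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches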