Example no. 1
def get_data():
    dataset = args.name
    path = '../data/geometric/QM9'
    trainset = QM9(path)
    testset = QM9(path)
    lenTrain = len(trainset)
    lenTest = len(testset)

    print("Len Dataset:", lenTrain)
    trainLoader = DataLoader(trainset[:lenTrain], batch_size=1, shuffle=False)
    testloader = DataLoader(testset[:lenTest], batch_size=1, shuffle=False)
    print("Len TrainLoader:", len(trainLoader))

    return trainLoader, testloader
Example no. 2
def get_dataset(name, sparse=True, dirname=None):
    if dirname is None:
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                        name)
    else:
        path = osp.join(dirname, name)
    if name == "QM9":
        dataset = QM9(path)
    elif name == "QM7b":
        dataset = QM7b(path)
    else:
        dataset = TUDataset(path, name)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())

        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)

    num_nodes = max_num_nodes = 0
    for data in dataset:
        num_nodes += data.num_nodes
        max_num_nodes = max(data.num_nodes, max_num_nodes)

    # Filter out a few really large graphs in order to apply DiffPool.
    if name == 'REDDIT-BINARY':
        num_nodes = min(int(num_nodes / len(dataset) * 1.5), max_num_nodes)
    else:
        num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)
        # num_nodes = max_num_nodes

    indices = []
    for i, data in enumerate(dataset):
        if data.num_nodes <= num_nodes:
            indices.append(i)
    dataset = dataset[torch.tensor(indices)]

    if not sparse:
        if dataset.transform is None:
            dataset.transform = T.ToDense(num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(num_nodes)])
    return dataset
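The NormalizedDegree transform used above is not defined in this excerpt; a minimal sketch of a compatible implementation (an assumption, modeled on the degree-based fallback used in the PyTorch Geometric kernel benchmarks) that stores the standardized node degree as a one-dimensional node feature:

import torch
from torch_geometric.utils import degree


class NormalizedDegree:
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, data):
        # Replace missing node features with the standardized node degree.
        deg = degree(data.edge_index[0], dtype=torch.float)
        deg = (deg - self.mean) / self.std
        data.x = deg.view(-1, 1)
        return data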
Example no. 3
    def __init__(self):
        dataset = "QM9"
        path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data",
                        dataset)

        target = 0

        class MyTransform(object):
            def __call__(self, data):
                # Specify target.
                data.y = data.y[:, target]
                return data

        class Complete(object):
            def __call__(self, data):
                device = data.edge_index.device
                row = torch.arange(data.num_nodes,
                                   dtype=torch.long,
                                   device=device)
                col = torch.arange(data.num_nodes,
                                   dtype=torch.long,
                                   device=device)
                row = row.view(-1, 1).repeat(1, data.num_nodes).view(-1)
                col = col.repeat(data.num_nodes)
                edge_index = torch.stack([row, col], dim=0)
                edge_attr = None
                if data.edge_attr is not None:
                    idx = data.edge_index[
                        0] * data.num_nodes + data.edge_index[1]
                    size = list(data.edge_attr.size())
                    size[0] = data.num_nodes * data.num_nodes
                    edge_attr = data.edge_attr.new_zeros(size)
                    edge_attr[idx] = data.edge_attr
                edge_index, edge_attr = remove_self_loops(
                    edge_index, edge_attr)
                data.edge_attr = edge_attr
                data.edge_index = edge_index
                return data

        transform = T.Compose(
            [MyTransform(), Complete(),
             T.Distance(norm=False)])
        if not osp.exists(path):
            QM9(path)
        super(QM9Dataset, self).__init__(path)
Example no. 4
def show_graph_regression():
    qm9 = QM9(root='data')
    test_loader = DataLoader(qm9[int(1000 * 0.8):1000],
                             batch_size=1,
                             shuffle=True)
    model = GNNStack(max(qm9.num_node_features, 1),
                     32,
                     qm9.num_classes,
                     task='graph')
    model.load_state_dict(torch.load('graphnn/savegraphmodel_32hid.pth'))
    example = next(iter(test_loader))
    emb, pred = model(example)
    fig, axes = plt.subplots(2, 1, figsize=(10, 4))
    fig.suptitle('Graph property prediction')
    axes[0].imshow(pred.detach().numpy())
    axes[0].set_title('Prediction')
    axes[1].imshow(example.y.detach().numpy())
    axes[1].set_title('Ground truth')
Example no. 5
 def __init__(self, dataset, config):
     self.qm9 = False
     if dataset == "QM9":
         self.qm9 = True
         self.data_dir = os.path.join(self.curr_dir, '../tg_datasets/QM9')
         self.data = QM9(self.data_dir,
                         transform=self.transform_data)
     else:
         self.data = TUDataset(self.data_dir,
                               dataset,
                               transform=self.transform_data,
                               use_node_attr=self.use_node_attr)
     self.data = self.data.shuffle()  # shuffle() returns a shuffled dataset; reassign it
     for key, value in config.items():
         if hasattr(self.config, key):
             setattr(self.config, key, type(getattr(self.config, key))(value))
         else:
             print(f'Config key \'{key}\' is not valid for PPGN')
             sys.exit()                
Example no. 6
def load_dataset(name):
    """ Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        dataset = list(dataset)
        random.shuffle(dataset)
        has_name = hasattr(dataset[0], "name")
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                if has_name: del graph.name
                graph = pyg_utils.to_networkx(graph).to_undirected()
            if i < train_len:
                train.append(graph)
            else:
                test.append(graph)
    return train, test, task
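A minimal usage sketch (hypothetical call; any of the dataset names handled above can be passed):

train_graphs, test_graphs, task = load_dataset("enzymes")
print(len(train_graphs), len(test_graphs), task)  # 80/20 split of NetworkX graphs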
Example no. 7
 def __init__(self, path: str):
     pyg_dataset = QM9(os.path.join(path, '_pyg'))
     if hasattr(pyg_dataset, "__data_list__"):
         delattr(pyg_dataset, "__data_list__")
     if hasattr(pyg_dataset, "_data_list"):
         delattr(pyg_dataset, "_data_list")
     super(QM9Dataset, self).__init__([
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             {
                 'x': data.x,
                 'pos': data.pos,
                 'z': data.z
             },
             data.edge_index,
             edges_data={'edge_attr': data.edge_attr},
             graph_data={
                 'idx': data.idx,
                 'y': data.y
             }) for data in pyg_dataset
     ])
Example no. 8
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'flickr':
        dataset = Flickr(path)
    elif dataset_name == 'zinc':
        dataset = ZINC(root='dataset', subset=True, split='train')
    elif dataset_name == 'QM9':
        dataset = QM9(root='dataset')
    elif dataset_name == 'github':
        dataset = GitHub(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        amazon_name = "Computers" if dataset_name == 'amazon_comp' else "Photo"
        dataset = Amazon(path, amazon_name, T.NormalizeFeatures())
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path,
                            name=dataset_name,
                            split="public",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError

    return dataset
Example no. 9
def load_dataset(args):
    # automatic data loading and splitting
    transform = add_zeros if args.dataset == 'ogbg-ppa' else None
    cls_criterion = get_loss_function(args.dataset)
    idx2word_mapper = None

    if args.dataset == 'mnist':
        train_data = MNISTSuperpixels(root='dataset',
                                      train=True,
                                      transform=T.Polar())
        dataset = train_data
        dataset.name = 'mnist'
        dataset.eval_metric = 'acc'
        validation_data = []
        test_data = MNISTSuperpixels(root='dataset',
                                     train=False,
                                     transform=T.Polar())

        train_data = list(train_data)
        test_data = list(test_data)

    elif args.dataset == 'QM9':
        # Contains 19 targets. Use only the first 12 (0-11)
        QM9_VALIDATION_START = 110000
        QM9_VALIDATION_END = 120000
        dataset = QM9(root='dataset',
                      transform=ExtractTargetTransform(args.target)).shuffle()
        dataset.name = 'QM9'
        dataset.eval_metric = 'mae'

        train_data = dataset[:QM9_VALIDATION_START]
        validation_data = dataset[QM9_VALIDATION_START:QM9_VALIDATION_END]
        test_data = dataset[QM9_VALIDATION_END:]

        train_data = list(train_data)
        validation_data = list(validation_data)
        test_data = list(test_data)

    elif args.dataset == 'zinc':
        train_data = ZINC(root='dataset', subset=True, split='train')

        dataset = train_data
        dataset.name = 'zinc'
        validation_data = ZINC(root='dataset', subset=True, split='val')
        test_data = ZINC(root='dataset', subset=True, split='test')
        dataset.eval_metric = 'mae'

        train_data = list(train_data)
        validation_data = list(validation_data)
        test_data = list(test_data)

    elif args.dataset in [
            'ogbg-molhiv', 'ogbg-molpcba', 'ogbg-ppa', 'ogbg-code2'
    ]:
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=transform)

        if args.dataset == 'ogbg-code2':
            seq_len_list = np.array([len(seq) for seq in dataset.data.y])
            max_seq_len = args.max_seq_len
            num_less_or_equal_to_max = np.sum(
                seq_len_list <= args.max_seq_len) / len(seq_len_list)
            print(
                f'Target sequence less or equal to {max_seq_len} is {num_less_or_equal_to_max}%.'
            )

        split_idx = dataset.get_idx_split()
        # The following is only used in the evaluation of the ogbg-code classifier.
        if args.dataset == 'ogbg-code2':
            vocab2idx, idx2vocab = get_vocab_mapping(
                [dataset.data.y[i] for i in split_idx['train']],
                args.num_vocab)
            # specific transformations for the ogbg-code dataset
            dataset.transform = transforms.Compose([
                augment_edge,
                lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)
            ])
            idx2word_mapper = partial(decode_arr_to_seq, idx2vocab=idx2vocab)

        train_data = list(dataset[split_idx["train"]])
        validation_data = list(dataset[split_idx["valid"]])
        test_data = list(dataset[split_idx["test"]])

    return dataset, train_data, validation_data, test_data, cls_criterion, idx2word_mapper
Example no. 10
def load_dataset(name):
    """ Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    elif name == 'aifb':
        dataset = Entities(root="/tmp/aifb", name='AIFB')  # 90 edge types
    elif name == 'wn18':
        dataset = WordNet18(root="/tmp/wn18")
    elif name == 'fb15k237':
        dataset = [None]
    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        if name not in ['aifb', 'wn18', 'fb15k237']:
            dataset = list(dataset)
            random.shuffle(dataset)
            has_name = hasattr(dataset[0], "name")
        else:
            has_name = True
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                try:
                    if has_name: del graph.name
                except:
                    pass
                if name == 'aifb':
                    graph = pyg_utils.to_networkx(graph,
                                                  edge_attrs=['edge_type'])
                elif name == 'wn18':
                    graph = pyg_utils.to_networkx(graph,
                                                  edge_attrs=['edge_type'])
                elif name == 'fb15k237':
                    data = FB15k_237()
                    (graph, _, _, _) = data.load()
                    graph = graph.to_networkx()
                    edge_type_dict = []
                    for j in graph.edges:
                        edge_type_dict.append(graph.edges[j]['label'])
                    edge_type_dict = {
                        i: ind
                        for ind, i in enumerate(sorted(set(edge_type_dict)))
                    }

                    for j in graph.edges:
                        graph.edges[j]['edge_type'] = edge_type_dict[
                            graph.edges[j]['label']]
                        del graph.edges[j]['label']
                        del graph.edges[j]['weight']
                else:
                    graph = pyg_utils.to_networkx(graph).to_undirected()
            if name == 'aifb':
                train.append(graph)
                test.append(deepcopy(graph))
            elif name == 'wn18':
                train.append(graph)
                test.append(deepcopy(graph))
            elif name == 'fb15k237':
                train.append(graph)
                test.append(deepcopy(graph))
            else:
                if i < train_len:
                    train.append(graph)
                else:
                    test.append(graph)

    return train, test, task
Example no. 11
def execute(config):
    path = 'QM9'
    dataset = QM9(path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(config['seed'])

    # Report meV instead of eV.
    units = 1000 if config['target'] in [2, 3, 4, 6, 7, 8, 9, 10] else 1

    _, datasets = SchNet.from_qm9_pretrained(path, dataset, config['target'])
    train_dataset, val_dataset, _test_dataset = datasets
    train_dataset = train_dataset[:config['ptr']]

    model = Network(
        muls=(config['mul0'], config['mul1'], config['mul2']),
        lmax=config['lmax'],
        num_layers=config['num_layers'],
        number_of_basis=config['rad_gaussians'],
        fc_neurons=[config['rad_h']] * config['rad_layers'],
        mean=config['mean'],
        std=config['std'],
        atomref=dataset.atomref(config['target']),
    )
    model = model.to(device)

    wandb.watch(model)

    # modules = [model.embedding, model.radial] + list(model.layers) + [model.atomref]
    # lrs = [0.1, 0.01] + [1] * len(model.layers) + [0.1]
    # param_groups = []
    # for lr, module in zip(lrs, modules):
    #     jac = []
    #     for data in DataLoader(train_dataset[:20]):
    #         data = data.to(device)
    #         jac += [torch.autograd.grad(model(data.z, data.pos), module.parameters())[0].flatten()]
    #     jac = torch.stack(jac)
    #     kernel = jac @ jac.T
    #     print('kernel({}) = {:.2e} +- {:.2e}'.format(module, kernel.mean().item(), kernel.std().item()), flush=True)
    #     lr = lr / (kernel.mean() + kernel.std()).item()
    #     param_groups.append({
    #         'params': list(module.parameters()),
    #         'lr': lr,
    #     })

    # lrs = torch.tensor([x['lr'] for x in param_groups])
    # lrs = config['lr'] * lrs / lrs.max().item()

    # for group, lr in zip(param_groups, lrs):
    #     group['lr'] = lr.item()

    optim = torch.optim.Adam(model.parameters(), lr=config['lr'])
    # print(optim, flush=True)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                           patience=25,
                                                           factor=0.5,
                                                           verbose=True)

    dynamics = []
    wall = time.perf_counter()
    wall_print = time.perf_counter()

    for epoch in itertools.count():

        errs = []
        loader = DataLoader(train_dataset,
                            batch_size=config['bs'],
                            shuffle=True)
        for step, data in enumerate(loader):
            data = data.to(device)

            pred = model(data.z, data.pos, data.batch)
            optim.zero_grad()
            (pred.view(-1) -
             data.y[:, config['target']]).pow(2).mean().backward()
            optim.step()

            err = pred.view(-1) - data.y[:, config['target']]
            errs += [err.cpu().detach()]

            if time.perf_counter() - wall_print > 15:
                wall_print = time.perf_counter()
                w = time.perf_counter() - wall
                e = epoch + (step + 1) / len(loader)
                print((
                    f'[{e:.1f}] ['
                    f'wall={w / 3600:.2f}h '
                    f'wall/epoch={w / e:.0f}s '
                    f'wall/step={1e3 * w / e / len(loader):.0f}ms '
                    f'step={step}/{len(loader)} '
                    f'mae={units * torch.cat(errs)[-200:].abs().mean():.5f} '
                    f'lr={min(x["lr"] for x in optim.param_groups):.1e}-{max(x["lr"] for x in optim.param_groups):.1e}]'
                ),
                      flush=True)

        if epoch == 0:
            called_num = [0]

            def trace_handler(p):
                print(p.key_averages().table(sort_by="self_cuda_time_total",
                                             row_limit=-1))
                p.export_chrome_trace(
                    f"{datetime.datetime.now()}_{called_num[0]}.json")
                called_num[0] += 1

            with torch.profiler.profile(activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA
            ],
                                        schedule=torch.profiler.schedule(
                                            wait=1, warmup=1, active=1),
                                        on_trace_ready=trace_handler) as prof:

                for step, data in enumerate(loader):
                    data = data.to(device)
                    pred = model(data.z, data.pos, data.batch)
                    mse = (pred.view(-1) - data.y[:, config['target']]).pow(2)
                    mse.mean().backward()
                    prof.step()
                    if step == 3:
                        break

        train_err = torch.cat(errs)

        errs = []
        loader = DataLoader(val_dataset, batch_size=256)
        for data in loader:
            data = data.to(device)
            with torch.no_grad():
                pred = model(data.z, data.pos, data.batch)

            err = pred.view(-1) - data.y[:, config['target']]
            errs += [err.cpu().detach()]
        val_err = torch.cat(errs)

        lrs = [x['lr'] for x in optim.param_groups]
        dynamics += [{
            'epoch': epoch,
            'wall': time.perf_counter() - wall,
            'train': {
                'mae': {
                    'mean': units * train_err.abs().mean().item(),
                    'std': units * train_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * train_err.pow(2).mean().item(),
                    'std': units * train_err.pow(2).std().item(),
                }
            },
            'val': {
                'mae': {
                    'mean': units * val_err.abs().mean().item(),
                    'std': units * val_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * val_err.pow(2).mean().item(),
                    'std': units * val_err.pow(2).std().item(),
                }
            },
            'lrs': lrs,
        }]
        dynamics[-1]['_runtime'] = dynamics[-1]['wall']
        wandb.log(dynamics[-1])

        print(
            f'[{epoch}] Target: {config["target"]:02d}, MAE TRAIN: {units * train_err.abs().mean():.5f} ± {units * train_err.abs().std():.5f}, MAE VAL: {units * val_err.abs().mean():.5f} ± {units * val_err.abs().std():.5f}',
            flush=True)

        scheduler.step(val_err.pow(2).mean())

        yield {
            'args': config,
            'dynamics': dynamics,
            'state': {k: v.cpu()
                      for k, v in model.state_dict().items()},
        }

        if dynamics[-1]['wall'] > config['wall']:
            break
Example no. 12
 def __init__(self, root, target):
     super().__init__()
     self.root = root
     self.dataset = QM9(root=root)
     self.target = target
Example no. 13
def main():
    np.random.seed(0)
    torch.manual_seed(0)
    # --------------------- PARSE ARGS -----------------------
    parser = argparse.ArgumentParser()

    parser.add_argument("--train-size", type=int, default=5000)
    parser.add_argument("--target",
                        type=int,
                        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                        default=0)
    parser.add_argument("--batch-size", type=int, default=20)
    parser.add_argument("--num-epoch", type=int, default=500)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--weight-decay", type=float, default=0.0)
    parser.add_argument("--encoder-hidden-dim", type=int, default=64)
    parser.add_argument("--lamda", type=float, default=0.001)
    parser.add_argument("--patience", type=int, default=30)

    args = parser.parse_args()

    print("- Args ----------------------")
    for k, v in vars(args).items():
        print(" - {}={}".format(k, v))
    print("-----------------------------")

    # --------------------- LOAD DATASET ---------------------
    print("Loading dataset...")
    dataset = QM9(QM9_DATASET_PATH,
                  pre_transform=T.Compose([Complete(),
                                           T.Distance(norm=False)]),
                  transform=TargetLabelSelection(args.target)).shuffle()

    mean = dataset.data.y[:, args.target].mean().item()
    std = dataset.data.y[:, args.target].std().item()
    dataset.data.y[:,
                   args.target] = (dataset.data.y[:, args.target] - mean) / std

    test_dataset = dataset[:10000]
    val_dataset = dataset[10000:20000]
    train_dataset = dataset[20000:20000 + args.train_size]

    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)

    unsup_train_dataset = dataset[20000:]
    unsup_train_loader = DataLoader(unsup_train_dataset,
                                    batch_size=args.batch_size,
                                    shuffle=True)

    print("- Dataset -------------------")
    print(" - # train: {:,}".format(len(train_dataset)))
    print(" - # val: {:,}".format(len(val_dataset)))
    print(" - # test: {:,}".format(len(test_dataset)))
    print(" - # train (unsup.): {:,}".format(len(unsup_train_dataset)))
    print("-----------------------------")

    # --------------------- TRAIN MODEL ----------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = InfoGraphSemi(dataset.num_features,
                          args.encoder_hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='min',
                                                     factor=0.7,
                                                     patience=5,
                                                     min_lr=0.000001)

    val_error = evaluate(model, val_loader, std, device)
    print("| Epoch: {:3} | Val MAE: {:10.4f} |".format(0, val_error))
    print("Starting training...")

    start_time = time.time()
    checkpoint_path = "model_{}.pt".format(start_time)
    min_val_error = None
    min_val_epoch = 0
    for epoch in range(1, args.num_epoch + 1):
        train_loss = train(model, train_loader, unsup_train_loader, optimizer,
                           args.lamda, device)
        val_error = evaluate(model, val_loader, std, device)
        scheduler.step(val_error)

        if min_val_error is None or val_error < min_val_error:
            min_val_error = val_error
            min_val_epoch = epoch
            torch.save(model.state_dict(), checkpoint_path)

        lr = scheduler.optimizer.param_groups[0]['lr']
        elapsed_time = datetime.timedelta(seconds=int(time.time() -
                                                      start_time))
        print(
            "| Epoch: {:3} | time: {} | lr: {:7f} | Train loss: {:8.4f} | Val MAE: {:8.4f} |{}"
            .format(epoch, elapsed_time, lr, train_loss, val_error,
                    " *" if min_val_epoch == epoch else ""))

        if epoch - min_val_epoch > args.patience:
            print("Early stopping...")
            break
    print("Training finished!")

    print("Evaluating on test set...")
    model.load_state_dict(torch.load(checkpoint_path))
    test_error = evaluate(model, test_loader, std, device)
    print("| Val MAE: {:8.4f} | Test MAE: {:8.4f} |".format(
        min_val_error, test_error))
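QM9_DATASET_PATH, Complete, and TargetLabelSelection are defined outside this excerpt; a minimal sketch of a TargetLabelSelection compatible with its use above (an assumption, modeled on the MyTransform target-selection classes in the other examples):

class TargetLabelSelection:
    def __init__(self, target):
        self.target = target

    def __call__(self, data):
        # Keep only the selected regression target column.
        data.y = data.y[:, self.target]
        return data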
Example no. 14
 def _qm9(self, target):
     dataset = QM9('data/QM9', transform=QM9Transformer(target))
     mean = dataset.data.y.mean(dim=0, keepdim=True)
     std = dataset.data.y.std(dim=0, keepdim=True)
     dataset.data.y = (dataset.data.y - mean) / std
     return dataset, std[:, target].item(), 11, 4
Example no. 15
 def __init__(self, f1_alpha, f2_alpha, f3_alpha):
     dataset = QM9('data/QM9')
     super(QM9Sampler, self).__init__(dataset, f1_alpha, f2_alpha, f3_alpha)
Example no. 16
import os.path as osp
from datetime import datetime
import numpy as np
import torch
import torch.nn as nn
import torch_geometric
from torch_geometric.data import Batch
from torch_geometric.datasets import QM9

start = datetime.now()

epochs = 1000
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
keep_data_on_gpu = torch.cuda.is_available()
target = None
in_features, out_features = 4 + 13 + 3 + 2, 12

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'eee-QM9')
dataset = QM9(path)

if target is not None:
    print(f'Target: {target}')
    out_features = 1
    dataset.data.y = dataset.data.y[:, target].unsqueeze(1)
else:
    dataset.data.y = dataset.data.y[:, :12]


def get_data_loader(dataset, batch_size, keep_on_gpu=True):
    data = preprocess_graphs(dataset, keep_on_gpu)
    data = PyGDenseDataset(data)
    n_nodes = np.array([g.x.shape[0]
                        for g in dataset])  # number of nodes for each graph
    loader = torch.utils.data.DataLoader(data,
Example no. 17
# from schnetpack import SchNet  # shadowed by the SchNet import from model.schnet below
from torch_geometric.datasets import QM9
import torch_geometric.transforms as T
from torch_geometric.data import DataLoader
import torch.nn.functional as F
import torch
# from torch_geometric.nn import SchNet
from model.nmp_edge import NMPEdge
from model.schnet import SchNet

# DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
dataset = QM9(root='/home/galkampel/tmp/QM9')  # , transform=T.Distance(norm=False)
train_val_set, test_set = torch.utils.data.random_split(dataset, [120000, 9433])
train_set, val_set = torch.utils.data.random_split(train_val_set, [110000, 10000])

device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
model = NMPEdge(hidden_channels=256, num_filters=256, hypernet_update=True).to(device)
# model = SchNet(hidden_channels=256, num_filters=256).to(device)


model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
target = 7
n_iter = 1
for i in range(10):
    mae_tot = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
Example no. 18
class CausalClassifyNet(torch.nn.Module):
    def __init__(self, h_dim, e_dim, times):
        super(CausalClassifyNet, self).__init__()
        self.conv_layer = ConvLayer(h_dim, e_dim, times)
        self.lin1 = Sequential(Linear(h_dim, h_dim), ReLU(), Linear(h_dim, 2))

    def forward(self, batch, out):
        out = self.lin1(global_mean_pool(self.conv_layer(batch, out), batch.batch))
        return F.log_softmax(out, dim=1)


if __name__ == '__main__':
    from torch_geometric.datasets import QM9
    from torch_geometric.data import DataLoader

    dataset = QM9('data/QM9')
    loader = DataLoader(dataset, batch_size=6)
    data = next(iter(loader))

    model = BaselineRegressNet(11, 32, 4, 6)
    print(model(data))

    model = DirlNet(11, 32, 4, 3)
    print(model(data, 1))

    R = CausalFeatureNet(11, 32, 4, 3)
    D = CausalClassifyNet(32, 4, 3)
    L = CausalRegressNet(32, 4, 3)
    print(R(data))
    print(D(data, R(data)), L(data, R(data)))
Example no. 19
    def process(self):

        ogq = QM9(root=self.root)
        print(ogq[0].y)

        smiles_prop_dict = {}
        datalist = []
        tar = tarfile.open(self.raw_paths[0], "r:bz2")
        for tarinfo in tqdm(tar):
            if tarinfo.isreg():
                f = tar.extractfile(tarinfo)
                lines = f.read().decode().split('\n')
                targets = [float(i) for i in lines[1].split('\t')[1:-1]]
                # hof = [float(i) for i in lines[-4].split('\t')]  # Harmonic oscillator frequencies
                smiles = lines[-3].split('\t')[1]
                # t_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
                # target_dict = dict(zip(t_names, targets))

                try:
                    mol_graph = self.mol2pyg(smiles)
                    data = mol_graph
                    data.y = torch.tensor(targets).float()
                    datalist.append(data)
                except:  # Boost.Python.ArgumentError
                    continue
            else:
                continue
        tar.close()

        ys = torch.stack([data.y for data in datalist])
        y_mean = ys.mean(dim=0)
        y_std = ys.std(dim=0)

        heterodata_list = []
        for i in tqdm(range(len(datalist))):
            data_i, data_j = random.choice(datalist), random.choice(datalist)
            outer_edge_index_i, outer_edge_index_j = self.generate_outer(data_i.x.size(0), data_j.x.size(0))

            data = tg.data.HeteroData()
            data['x_i'].x = data_i.x.float()
            data['x_j'].x = data_j.x.float()
            data['x_i', 'inner_edge_i', 'x_i'].edge_index = data_i.edge_index.long()
            data['x_i', 'inner_edge_i', 'x_i'].edge_attr = data_i.edge_attr.float()
            data['x_j', 'inner_edge_j', 'x_j'].edge_index = data_j.edge_index.long()
            data['x_j', 'inner_edge_j', 'x_j'].edge_attr = data_j.edge_attr.float()

            data['x_i', 'outer_edge_ij', 'x_j'].edge_index = outer_edge_index_i.long()
            data['x_j', 'outer_edge_ji', 'x_i'].edge_index = outer_edge_index_j.long()

            data['x_i', 'outer_edge_ij', 'x_j'].edge_attr = torch.ones(size=(outer_edge_index_i.max() + 1,
                                                                             data_i.edge_attr.size(1)))
            data['x_j', 'outer_edge_ji', 'x_i'].edge_attr = torch.ones(size=(outer_edge_index_j.max() + 1,
                                                                             data_j.edge_attr.size(1)))

            data['y_i'].y = data_i.y.float()
            data['y_j'].y = data_j.y.float()

            data['y_i'].y_norm = (data_i.y.float() - y_mean) / y_std
            data['y_j'].y_norm = (data_j.y.float() - y_mean) / y_std

            data.binary_y = torch.tensor([int(0)], dtype=torch.long)
            heterodata_list.append(data)

        data, slices = self.collate(heterodata_list)
        print('Saving...')
        torch.save((data, slices), self.processed_paths[0])
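mol2pyg and generate_outer are methods defined elsewhere in this class; a minimal sketch of a generate_outer compatible with its use above (an assumption: it fully connects the nodes of the two sampled molecules in both directions):

    def generate_outer(self, n_i, n_j):
        # Every node of graph i paired with every node of graph j.
        row = torch.arange(n_i).repeat_interleave(n_j)
        col = torch.arange(n_j).repeat(n_i)
        outer_ij = torch.stack([row, col], dim=0)  # edges x_i -> x_j
        outer_ji = torch.stack([col, row], dim=0)  # edges x_j -> x_i
        return outer_ij, outer_ji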
Example no. 20
    def __call__(self, data):
        data.y = data.y[:, int(args.target)]  # Specify target: 0 = mu
        return data


parser = argparse.ArgumentParser()
parser.add_argument('--target', default=0)
args = parser.parse_args()
target = int(args.target)

print('---- Target: {} ----'.format(target))

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', '1-2-3-QM9')
dataset = QM9(
    path,
    transform=T.Compose([MyTransform(), T.Distance()]),
    pre_transform=MyPreTransform(),
    pre_filter=MyFilter())

dataset.data.iso_type_2 = torch.unique(dataset.data.iso_type_2, True, True)[1]
num_i_2 = dataset.data.iso_type_2.max().item() + 1
dataset.data.iso_type_2 = one_hot(dataset.data.iso_type_2, num_classes=num_i_2)

dataset.data.iso_type_3 = torch.unique(dataset.data.iso_type_3, True, True)[1]
num_i_3 = dataset.data.iso_type_3.max().item() + 1
dataset.data.iso_type_3 = one_hot(dataset.data.iso_type_3, num_classes=num_i_3)

dataset = dataset.shuffle()

# Normalize targets to mean = 0 and std = 1.
tenpercent = int(len(dataset) * 0.1)
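The excerpt is cut off here; a plausible continuation of the target normalization, following the same pattern as Example no. 25 below, would be:

mean = dataset.data.y[tenpercent:].mean(dim=0)
std = dataset.data.y[tenpercent:].std(dim=0)
dataset.data.y = (dataset.data.y - mean) / std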
Example no. 21
def execute(config):
    device = torch.device(config['device'])
    torch.manual_seed(config['seed'])

    # Report meV instead of eV.
    units = 1000 if config['target'] in [2, 3, 4, 6, 7, 8, 9, 10] else 1

    dataset = QM9(config['data_path'])
    train_dataset, val_dataset = dataset[:50000], dataset[50000:70000]

    model = Network(
        muls=(config['mul0'], config['mul1'], config['mul2']),
        sh_lmax=config['shlmax'],
        num_layers=config['num_layers'],
        max_radius=config['max_radius'],
        num_basis=config['num_basis'],
        fc_neurons=[config['radial_num_neurons']] *
        config['radial_num_layers'],
        num_neighbors=20.0,
        num_nodes=20.0,
        atomref=dataset.atomref(config['target']),
    )
    model = model.to(device)

    wandb.watch(model)

    optim = torch.optim.Adam(model.parameters(), lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                           patience=25,
                                                           factor=0.5,
                                                           verbose=True)

    runtime = time.perf_counter()
    runtime_print = time.perf_counter()

    for epoch in itertools.count():

        errs = []
        loader = DataLoader(train_dataset,
                            batch_size=config['bs'],
                            shuffle=True)

        for step, data in enumerate(loader):
            data = data.to(device)

            pred = model(data)
            err = pred.view(-1) - data.y[:, config['target']]

            optim.zero_grad()
            err.pow(2).mean().backward()
            optim.step()

            errs += [err.cpu().detach()]

            if time.perf_counter() - runtime_print > 15:
                runtime_print = time.perf_counter()
                w = time.perf_counter() - runtime
                e = epoch + (step + 1) / len(loader)
                print((
                    f'[{e:.1f}] ['
                    f'runtime={w / 3600:.2f}h '
                    f'runtime/epoch={w / e:.0f}s '
                    f'runtime/step={1e3 * w / e / len(loader):.0f}ms '
                    f'step={step}/{len(loader)} '
                    f'mae={units * torch.cat(errs)[-200:].abs().mean():.5f} '),
                      flush=True)

        train_err = torch.cat(errs)

        errs = []
        loader = DataLoader(val_dataset, batch_size=256)
        for data in loader:
            data = data.to(device)
            with torch.no_grad():
                pred = model(data)

            err = pred.view(-1) - data.y[:, config['target']]
            errs += [err.cpu().detach()]
        val_err = torch.cat(errs)

        lrs = [x['lr'] for x in optim.param_groups]
        status = {
            'epoch': epoch,
            '_runtime': time.perf_counter() - runtime,
            'train': {
                'mae': {
                    'mean': units * train_err.abs().mean().item(),
                    'std': units * train_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * train_err.pow(2).mean().item(),
                    'std': units * train_err.pow(2).std().item(),
                }
            },
            'val': {
                'mae': {
                    'mean': units * val_err.abs().mean().item(),
                    'std': units * val_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * val_err.pow(2).mean().item(),
                    'std': units * val_err.pow(2).std().item(),
                }
            },
            'lrs': lrs,
        }
        wandb.log(status)

        print((
            f'[{epoch}] Target: {config["target"]:02d}, '
            f'MAE TRAIN: {units * train_err.abs().mean():.5f} ± {units * train_err.abs().std():.5f}, '
            f'MAE VAL: {units * val_err.abs().mean():.5f} ± {units * val_err.abs().std():.5f}'
        ),
              flush=True)

        scheduler.step(val_err.pow(2).mean())

        if status['_runtime'] > config['max_runtime']:
            break
Example no. 22
            out, h = self.gru(m.unsqueeze(0), h)
            out = out.squeeze(0)

        out = self.set2set(out, data.batch)
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
        return out


results = []
results_log = []
for _ in range(5):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    '1t-QM9')
    dataset = QM9(path,
                  transform=T.Compose([Complete(),
                                       T.Distance(norm=False)]))
    dataset.data.y = dataset.data.y[:, 0:12]
    dataset = dataset.shuffle()

    tenpercent = int(len(dataset) * 0.1)
    print("###")
    mean = dataset.data.y.mean(dim=0, keepdim=True)
    std = dataset.data.y.std(dim=0, keepdim=True)
    dataset.data.y = (dataset.data.y - mean) / std
    mean, std = mean.to(device), std.to(device)

    print("###")
    test_dataset = dataset[:tenpercent].shuffle()
    val_dataset = dataset[tenpercent:2 * tenpercent].shuffle()
    train_dataset = dataset[2 * tenpercent:].shuffle()
Example no. 23

class MyTransform(object):  # Distance-cutoff graph, and feature and target selection.
    def __call__(self, data):
        dist = (data.pos.view(-1, 1, 3) - data.pos.view(1, -1, 3)).norm(dim=-1)
        dist.fill_diagonal_(float('inf'))
        mask = dist <= args.cutoff
        data.edge_index = mask.nonzero().t()
        data.edge_attr = None  # No need to maintain bond types.
        data.x = data.x[:, :5]  # Just make use of atom types as features.
        data.y = data.y[:, args.target]
        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform()).shuffle()
train_dataset = dataset[:110000]
val_dataset = dataset[110000:120000]
test_dataset = dataset[120000:]

train_loader = DataLoader(train_dataset, 44, shuffle=True, num_workers=6)
val_loader = DataLoader(val_dataset, 44, num_workers=6)
test_loader = DataLoader(test_dataset, 44, num_workers=6)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DimeNet(in_channels=dataset.num_node_features,
                hidden_channels=128,
                out_channels=1,
                num_blocks=6,
                num_bilinear=8,
                num_spherical=7,
Example no. 24
def execute(args):
    path = 'QM9'
    dataset = QM9(path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Report meV instead of eV.
    units = 1000 if args.target in [2, 3, 4, 6, 7, 8, 9, 10] else 1

    _, datasets = SchNet.from_qm9_pretrained(path, dataset, args.target)
    train_dataset, val_dataset, _test_dataset = datasets

    model = Network(muls=(args.mul0, args.mul1, args.mul2),
                    ps=(1, ) if 'shp' in args.opts else (1, -1),
                    lmax=args.lmax,
                    num_layers=args.num_layers,
                    rad_gaussians=args.rad_gaussians,
                    rad_hs=(args.rad_h, ) * args.rad_layers +
                    (args.rad_bottleneck, ),
                    mean=args.mean,
                    std=args.std,
                    atomref=dataset.atomref(args.target),
                    options=args.opts)
    model = model.to(device)

    # profile
    loader = DataLoader(train_dataset, batch_size=args.bs, shuffle=False)
    for step, data in enumerate(loader):
        with torch.autograd.profiler.profile(use_cuda=True,
                                             record_shapes=True) as prof:
            data = data.to(device)
            pred = model(data.z, data.pos, data.batch)
            mse = (pred.view(-1) - data.y[:, args.target]).pow(2)
            mse.mean().backward()
        if step == 5:
            break
    prof.export_chrome_trace(f"{datetime.datetime.now()}.json")

    optim = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                           patience=25,
                                                           factor=0.5,
                                                           verbose=True)

    dynamics = []
    wall = time.perf_counter()
    wall_print = time.perf_counter()

    for epoch in itertools.count():

        errs = []
        loader = DataLoader(train_dataset, batch_size=args.bs, shuffle=True)
        for step, data in enumerate(loader):
            data = data.to(device)

            pred = model(data.z, data.pos, data.batch)
            optim.zero_grad()
            (pred.view(-1) - data.y[:, args.target]).pow(2).mean().backward()
            optim.step()

            err = pred.view(-1) - data.y[:, args.target]
            errs += [err.cpu().detach()]

            if time.perf_counter() - wall_print > 15:
                wall_print = time.perf_counter()
                w = time.perf_counter() - wall
                e = epoch + (step + 1) / len(loader)
                print(
                    (f'[{e:.1f}] ['
                     f'wall={w / 3600:.2f}h '
                     f'wall/epoch={w / e:.0f}s '
                     f'wall/step={1e3 * w / e / len(loader):.0f}ms '
                     f'step={step}/{len(loader)} '
                     f'mae={units * torch.cat(errs)[-200:].abs().mean():.5f} '
                     f'lr={optim.param_groups[0]["lr"]:.1e}]'),
                    flush=True)

        train_err = torch.cat(errs)

        errs = []
        loader = DataLoader(val_dataset, batch_size=256)
        for data in loader:
            data = data.to(device)
            with torch.no_grad():
                pred = model(data.z, data.pos, data.batch)

            err = pred.view(-1) - data.y[:, args.target]
            errs += [err.cpu().detach()]
        val_err = torch.cat(errs)

        dynamics += [{
            'epoch': epoch,
            'wall': time.perf_counter() - wall,
            'train': {
                'mae': {
                    'mean': units * train_err.abs().mean().item(),
                    'std': units * train_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * train_err.pow(2).mean().item(),
                    'std': units * train_err.pow(2).std().item(),
                }
            },
            'val': {
                'mae': {
                    'mean': units * val_err.abs().mean().item(),
                    'std': units * val_err.abs().std().item(),
                },
                'mse': {
                    'mean': units * val_err.pow(2).mean().item(),
                    'std': units * val_err.pow(2).std().item(),
                }
            },
            'lr': optim.param_groups[0]["lr"],
        }]

        print(
            f'[{epoch}] Target: {args.target:02d}, MAE TRAIN: {units * train_err.abs().mean():.5f} ± {units * train_err.abs().std():.5f}, MAE VAL: {units * val_err.abs().mean():.5f} ± {units * val_err.abs().std():.5f}',
            flush=True)

        scheduler.step(val_err.pow(2).mean())

        yield {
            'args': args,
            'dynamics': dynamics,
            'state': {k: v.cpu()
                      for k, v in model.state_dict().items()},
        }
Example no. 25
class MyTransform(object):
    def __call__(self, data):
        data.y = data.y[:, int(args.target)]  # Specify target: 0 = mu
        return data


parser = argparse.ArgumentParser()
parser.add_argument('--target', default=0)
args = parser.parse_args()
target = int(args.target)

print('---- Target: {} ----'.format(target))

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', '1-QM9')
dataset = QM9(path, transform=T.Compose([MyTransform(), T.Distance()]))

dataset = dataset.shuffle()

# Normalize targets to mean = 0 and std = 1.
tenpercent = int(len(dataset) * 0.1)
mean = dataset.data.y[tenpercent:].mean(dim=0)
std = dataset.data.y[tenpercent:].std(dim=0)
dataset.data.y = (dataset.data.y - mean) / std

test_dataset = dataset[:tenpercent]
val_dataset = dataset[tenpercent:2 * tenpercent]
train_dataset = dataset[2 * tenpercent:]
test_loader = DataLoader(test_dataset, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
Example no. 26
parser = argparse.ArgumentParser()
parser.add_argument('--target', type=int, default=0)
parser.add_argument('--dim', type=int, default=64)
args = parser.parse_args()


class MyTransform:
    def __call__(self, data):
        data.y = data.y[:, args.target]
        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets', 'QM9')
transform = T.Compose([MyTransform(), T.Distance()])
dataset = QM9(path, transform=transform).shuffle()

# Normalize targets to mean=0 and std=1
mean = dataset.data.y[:, args.target].mean().item()
std = dataset.data.y[:, args.target].std().item()
dataset.data.y[:, args.target] = (dataset.data.y[:, args.target] - mean) / std

# dataset split
tenpercent = int(len(dataset) * 0.1)
test_dataset = dataset[:tenpercent]
val_dataset = dataset[tenpercent:2 * tenpercent]
train_dataset = dataset[2 * tenpercent:]

test_loader = DataLoader(test_dataset, batch_size=256)
val_loader = DataLoader(val_dataset, batch_size=256)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
Example no. 27
        return data.num_nodes > 6  # Keep only graphs with more than 6 nodes.


class MyPreTransform(object):
    def __call__(self, data):
        x = data.x
        data.x = data.x[:, :5]
        data = ConnectedThreeMalkin()(data)
        data.x = x
        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                '1-23-QssM9')
dataset = QM9(path,
              transform=T.Compose([T.Distance(norm=False)]),
              pre_transform=MyPreTransform(),
              pre_filter=MyFilter())
dataset.data.y = dataset.data.y[:, 0:12]

dataset.data.iso_type_3 = torch.unique(dataset.data.iso_type_3, True, True)[1]
num_i_3 = dataset.data.iso_type_3.max().item() + 1
dataset.data.iso_type_3 = F.one_hot(dataset.data.iso_type_3,
                                    num_classes=num_i_3).to(torch.float)



class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        M_in, M_out = dataset.num_features, 32