Ejemplo n.º 1
0
    def test_statistics(self, qm9_dataset, split_path, args):
        """Exercise ``get_statistics`` against three split-file situations.

        The scenarios run in order:

        1. the split file was just created, so statistics are computed and
           the returned mean is compared with the mean of the training
           energies;
        2. the same split file is read again and the returned mean is
           compared with the ``"mean"`` entry stored in that file;
        3. a split path that does not exist must make ``get_statistics``
           raise.
        """
        # Keyword arguments that are identical for every get_statistics call.
        shared_kwargs = dict(
            train_loader=None,
            args=args,
            atomref=None,
            per_atom=False,
        )

        # --- statistics not in split file --------------------------------
        # Drop any stale split file so the split is generated from scratch.
        if os.path.exists(split_path):
            os.remove(split_path)
        os.makedirs(os.path.dirname(split_path), exist_ok=True)
        train, _val, _test = spk.data.train_test_split(
            qm9_dataset, 10, 5, split_path
        )
        train_loader = spk.data.AtomsLoader(train, batch_size=5)
        shared_kwargs["train_loader"] = train_loader

        mean, stddev = get_statistics(split_path=split_path, **shared_kwargs)
        all_energies = torch.cat(
            [batch["energy_U0"] for batch in train_loader]
        )
        assert_almost_equal(all_energies.mean(), mean["energy_U0"], 2)

        # --- statistics in split file ------------------------------------
        # Read the stored mean before asking get_statistics a second time.
        saved_mean = np.load(split_path)["mean"]
        mean, stddev = get_statistics(split_path=split_path, **shared_kwargs)
        assert_almost_equal(saved_mean, mean["energy_U0"])

        # --- wrong split file --------------------------------------------
        with pytest.raises(Exception):
            get_statistics(split_path="I/do/not/exist.npz", **shared_kwargs)
def main(args):
    """Prepare data and train a model; only the ``"train"`` mode is handled.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``cuda``, ``modelpath``, ``mode``, ``property`` and ``n_epochs``;
            the object is also forwarded to the helper functions.

    Raises:
        NotImplementedError: if ``args.mode`` is anything other than
            ``"train"``.
    """
    # setup
    train_args = setup_run(args)
    device = torch.device("cuda" if args.cuda else "cpu")

    # get dataset
    environment_provider = get_environment_provider(train_args, device=device)
    dataset = get_dataset(train_args,
                          environment_provider=environment_provider)

    # get dataloaders
    split_path = os.path.join(args.modelpath, "split.npz")
    train_loader, val_loader, test_loader = get_loaders(args,
                                                        dataset=dataset,
                                                        split_path=split_path,
                                                        logging=logging)

    # define metrics
    metrics = get_metrics(train_args)

    # train or evaluate
    if args.mode == "train":

        # get statistics
        atomref = dataset.get_atomref(args.property)
        mean, stddev = get_statistics(
            args=args,
            split_path=split_path,
            train_loader=train_loader,
            atomref=atomref,
            divide_by_atoms=get_divide_by_atoms(args),
            logging=logging,
        )

        # build model
        model = get_model(args,
                          train_loader,
                          mean,
                          stddev,
                          atomref,
                          logging=logging)

        # build trainer
        logging.info("training...")
        trainer = get_trainer(args, model, train_loader, val_loader, metrics)

        # run training
        trainer.train(device, n_epochs=args.n_epochs)
        logging.info("...training done!")

    else:
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # the intended message; raise a real exception that carries it.
        raise NotImplementedError(
            "Mode '{}' is not supported by this script. "
            "Use the original SchnetPack script instead.".format(args.mode))
Ejemplo n.º 3
0
def main(args):
    """Train a model on the OMDB data or write test-set predictions to CSV.

    In ``"train"`` mode the function builds the data loaders, trains a model,
    reloads ``best_model`` from ``args.model_path``, prints the mean absolute
    error on the test partition and calls ``plot_results``.

    In ``"pred"`` mode it loads ``best_model``, runs it over the test
    partition and writes predictions next to the reference values into
    ``./predictions.csv``.

    Any other mode is a no-op.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``cuda``, ``mode``, ``model_path``, ``datapath``, ``property``
            and ``n_epochs``; the object is also forwarded to helpers.
    """
    # building model and dataset
    device = torch.device("cuda" if args.cuda else "cpu")
    environment_provider = spk.environment.AseEnvironmentProvider(cutoff=5.0)
    omdb = './omdb'

    # Partition sizes shared by both modes; the CSV offset below is derived
    # from them instead of repeating the literal 10000.
    num_train = 9000
    num_val = 1000

    if args.mode == "train":

        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(args.model_path, exist_ok=True)

        spk.utils.spk_utils.set_random_seed(None)
        os.makedirs(omdb, exist_ok=True)

        omdData = OrganicMaterialsDatabase(
            args.datapath,
            download=False,
            load_only=[args.property],
            environment_provider=environment_provider)
        # NOTE(review): the split file is read from a fixed absolute path
        # rather than from args.model_path (which the "pred" branch uses).
        # Presumably this reuses the split of an earlier run - confirm.
        split_path = os.path.join(
            '/home/s3754715/gnn_molecule/schnetpack/model_2020-06-23-18-44-59',
            "split.npz")
        train, val, test = spk.train_test_split(data=omdData,
                                                num_train=num_train,
                                                num_val=num_val,
                                                split_file=split_path)
        print('-----------')
        print(len(train))
        print(len(val))
        print(len(test))
        print('-------------')
        train_loader = spk.AtomsLoader(train,
                                       batch_size=16,
                                       sampler=RandomSampler(train),
                                       num_workers=4)
        val_loader = spk.AtomsLoader(val, batch_size=16, num_workers=2)
        test_loader = spk.AtomsLoader(test, batch_size=16, num_workers=2)
        atomref = omdData.get_atomref(args.property)
        mean, stddev = get_statistics(
            args=args,
            split_path=split_path,
            train_loader=train_loader,
            atomref=atomref,
            divide_by_atoms=get_divide_by_atoms(args),
            logging=logging)
        model_train = model(args, omdData, atomref, mean, stddev)
        trainer = train_model(args, model_train, train_loader, val_loader)
        print('started training')
        trainer.train(device=device, n_epochs=args.n_epochs)
        print('training finished')
        sch_model = torch.load(os.path.join(args.model_path, 'best_model'))

        err = 0
        sch_model.eval()
        n_batches = len(test_loader)
        for count, batch in enumerate(test_loader):
            # move batch to GPU, if necessary
            batch = {k: v.to(device) for k, v in batch.items()}

            # apply model
            pred = sch_model(batch)

            # calculate absolute error
            tmp = torch.sum(
                torch.abs(pred[args.property] - batch[args.property]))
            # detach from graph & convert to numpy
            tmp = tmp.detach().cpu().numpy()
            err += tmp
            print(tmp)
            # log progress
            percent = '{:3.2f}'.format(count / n_batches * 100)
            print('Progress:',
                  percent + '%' + ' ' * (5 - len(percent)),
                  end="\r")

        # summed absolute error divided by the number of test entries
        err /= len(test)
        print('Test MAE', np.round(err, 3), 'eV =',
              np.round(err / (kcal / mol), 3), 'kcal/mol')

        # plot results
        plot_results(args)

    elif args.mode == "pred":
        print('predictionsss')
        # `device` is already a torch.device, so it is passed directly.
        sch_model = torch.load(os.path.join(args.model_path, 'best_model'),
                               map_location=device)
        omdData = OrganicMaterialsDatabase(
            args.datapath,
            download=True,
            load_only=[args.property],
            environment_provider=environment_provider)
        split_path = os.path.join(args.model_path, "split.npz")
        train, val, test = spk.train_test_split(data=omdData,
                                                num_train=num_train,
                                                num_val=num_val,
                                                split_file=split_path)
        print(len(test))
        test_loader = spk.AtomsLoader(
            test,
            batch_size=32,
        )
        prediction_list = []
        actual_value_list = []
        n_batches = len(test_loader)
        print('Started generating predictions')
        for count, batch in enumerate(test_loader):

            # move batch to GPU, if necessary
            print('before batch')
            batch = {k: v.to(device) for k, v in batch.items()}
            print('after batch')
            # apply model
            pred = sch_model(batch)
            # Index by args.property: only that property is loaded
            # (load_only above), so a fixed 'band_gap' key would raise
            # KeyError for any other property.
            prediction_list.extend(
                pred[args.property].detach().cpu().numpy().flatten().tolist())
            actual_value_list.extend(
                batch[args.property].detach().cpu().numpy().flatten().tolist())
            # log progress
            percent = '{:3.2f}'.format(count / n_batches * 100)
            print('Progress:',
                  percent + '%' + ' ' * (5 - len(percent)),
                  end="\r")

        # NOTE(review): the ID file is read from a fixed absolute path.
        cod_arr = np.genfromtxt(
            os.path.join(
                '/home/s3754715/gnn_molecule/schnetpack/dataset/OMDB-GAP1_v1.1',
                'CODids.csv'))
        # NOTE(review): this takes the IDs after the first
        # num_train + num_val rows; it presumably assumes the test partition
        # corresponds to exactly those rows in that order - verify against
        # the indices stored in the split file.
        cod_list = cod_arr[num_train + num_val:].tolist()
        results_df = pd.DataFrame({
            'cod': cod_list,
            'prediction': prediction_list,
            'actual': actual_value_list
        })
        results_df.to_csv('./predictions.csv')
Ejemplo n.º 4
0
def main(args):
    """Run training or evaluation depending on ``args.mode``.

    Dataset, data loaders and metrics are prepared first for every mode.
    ``"train"`` computes statistics, builds the model and trainer, and
    trains for ``args.n_epochs`` epochs. ``"eval"`` loads ``best_model``
    from ``args.modelpath`` and calls ``evaluate``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``cuda``, ``modelpath``, ``mode``, ``property``, ``n_epochs``
            and ``overwrite``; the object is also forwarded to helpers.

    Raises:
        ScriptError: if the mode is unknown, or if an evaluation file
            already exists and ``args.overwrite`` is not set.
    """
    # --- common preparation ----------------------------------------------
    train_args = setup_run(args)
    device = torch.device("cuda" if args.cuda else "cpu")

    environment_provider = get_environment_provider(train_args, device=device)
    dataset = get_dataset(
        train_args, environment_provider=environment_provider
    )

    split_path = os.path.join(args.modelpath, "split.npz")
    loaders = get_loaders(
        args, dataset=dataset, split_path=split_path, logging=logging
    )
    train_loader, val_loader, test_loader = loaders

    metrics = get_metrics(train_args)

    # --- training ----------------------------------------------------------
    if args.mode == "train":
        atomref = dataset.get_atomref(args.property)
        mean, stddev = get_statistics(
            args=args,
            split_path=split_path,
            train_loader=train_loader,
            atomref=atomref,
            divide_by_atoms=get_divide_by_atoms(args),
            logging=logging,
        )

        model = get_model(
            args, train_loader, mean, stddev, atomref, logging=logging
        )

        logging.info("training...")
        trainer = get_trainer(args, model, train_loader, val_loader, metrics)
        trainer.train(device, n_epochs=args.n_epochs)
        logging.info("...training done!")
        return

    if args.mode != "eval":
        raise ScriptError("Unknown mode: {}".format(args.mode))

    # --- evaluation --------------------------------------------------------
    # An existing evaluation file is only replaced when overwriting is allowed.
    evaluation_fp = os.path.join(args.modelpath, "evaluation.txt")
    if os.path.exists(evaluation_fp):
        if not args.overwrite:
            raise ScriptError(
                "The evaluation file does already exist at {}! Add overwrite flag"
                " to remove.".format(evaluation_fp))
        os.remove(evaluation_fp)

    logging.info("loading trained model...")
    model = torch.load(os.path.join(args.modelpath, "best_model"))

    logging.info("evaluating...")
    eval_inputs = (args, model, train_loader, val_loader, test_loader, device)
    if spk.utils.get_derivative(train_args) is None:
        # No derivative configured: evaluate with gradient tracking disabled.
        with torch.no_grad():
            evaluate(*eval_inputs, metrics=metrics)
    else:
        evaluate(*eval_inputs, metrics=metrics)
    logging.info("... evaluation done!")
Ejemplo n.º 5
0
def main(args):
    """Run training or evaluation depending on ``args.mode``.

    CUDA usage is logged, then dataset, data loaders and metrics are
    prepared for every mode. ``"train"`` computes statistics, builds the
    model and trainer, and trains for ``args.n_epochs`` epochs. ``"eval"``
    loads ``best_model`` from ``args.modelpath`` and calls ``evaluate``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``cuda``, ``modelpath``, ``mode``, ``property``, ``n_epochs``,
            ``overwrite`` and ``dataset``; the object is also forwarded to
            helpers.

    Raises:
        ScriptError: if the mode is unknown, or if an evaluation file
            already exists and ``args.overwrite`` is not set.
    """
    # --- common preparation (setup_run is not invoked in this variant) -----
    use_cuda = args.cuda
    logging.info("CUDA is used: " + str(use_cuda))
    if use_cuda:
        logging.info("CUDA is available: " + str(torch.cuda.is_available()))

    device = torch.device("cuda" if args.cuda else "cpu")

    dataset = get_dataset(args)

    split_path = os.path.join(args.modelpath, "split.npz")
    loaders = get_loaders(
        args, dataset=dataset, split_path=split_path, logging=logging
    )
    train_loader, val_loader, test_loader = loaders

    metrics = get_metrics(args)

    # --- training ----------------------------------------------------------
    if args.mode == "train":
        atomref = dataset.get_atomref(args.property)
        # Per-property settings are looked up from the `settings` tables.
        divide_by_atoms = settings.divide_by_atoms[args.property]
        mean, stddev = get_statistics(
            args=args,
            split_path=split_path,
            train_loader=train_loader,
            atomref=atomref,
            divide_by_atoms=divide_by_atoms,
            logging=logging,
        )
        aggregation_mode = settings.pooling_mode[args.property]

        model = get_model(
            args,
            train_loader,
            mean,
            stddev,
            atomref,
            aggregation_mode,
            logging=logging,
        )

        logging.info("training...")
        trainer = get_trainer(args, model, train_loader, val_loader, metrics)
        trainer.train(device, n_epochs=args.n_epochs)
        logging.info("...training done!")
        return

    if args.mode != "eval":
        raise ScriptError("Unknown mode: {}".format(args.mode))

    # --- evaluation --------------------------------------------------------
    # An existing evaluation file is only replaced when overwriting is allowed.
    evaluation_fp = os.path.join(args.modelpath, "evaluation.txt")
    if os.path.exists(evaluation_fp):
        if not args.overwrite:
            raise ScriptError(
                "The evaluation file does already exist at {}! Add overwrite flag"
                " to remove.".format(evaluation_fp))
        os.remove(evaluation_fp)

    logging.info("loading trained model...")
    model = torch.load(os.path.join(args.modelpath, "best_model"))

    logging.info("evaluating...")
    eval_inputs = (args, model, train_loader, val_loader, test_loader, device)
    if args.dataset != "md17":
        # Every dataset except md17 is evaluated with gradients disabled.
        with torch.no_grad():
            evaluate(*eval_inputs, metrics=metrics)
    else:
        evaluate(*eval_inputs, metrics=metrics)
    logging.info("... evaluation done!")