Code example #1
# Imports assumed from the CompRhys/aviary project these examples are taken
# from; exact module paths may differ between versions.
import numpy as np
import torch
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split as split

from aviary.cgcnn.data import CrystalGraphData, collate_batch
from aviary.cgcnn.model import CrystalGraphConvNet
from aviary.utils import results_multitask, train_ensemble


def test_cgcnn_clf(df_matbench_phonons):
    elem_emb = "cgcnn92"
    targets = ["phdos_clf"]
    tasks = ["classification"]
    losses = ["CSE"]
    robust = True
    model_name = "cgcnn-clf-test"
    elem_fea_len = 32
    h_fea_len = 128
    n_graph = 3
    n_hidden = 1
    ensemble = 2
    run_id = 1
    data_seed = 42
    epochs = 10
    log = False
    sample = 1
    test_size = 0.2
    resume = False
    fine_tune = None
    transfer = None
    optim = "AdamW"
    learning_rate = 3e-4
    momentum = 0.9
    weight_decay = 1e-6
    batch_size = 128
    workers = 0
    device = "cuda" if torch.cuda.is_available() else "cpu"

    task_dict = dict(zip(targets, tasks))
    loss_dict = dict(zip(targets, losses))

    dataset = CrystalGraphData(df=df_matbench_phonons,
                               elem_emb=elem_emb,
                               task_dict=task_dict)
    n_targets = dataset.n_targets
    elem_emb_len = dataset.elem_emb_len
    nbr_fea_len = dataset.nbr_fea_dim

    train_idx = list(range(len(dataset)))

    print(f"using {test_size} of training set as test set")
    train_idx, test_idx = split(train_idx,
                                random_state=data_seed,
                                test_size=test_size)
    test_set = torch.utils.data.Subset(dataset, test_idx)

    print("No validation set used, using test set for evaluation purposes")
    # NOTE when using this option, care must be taken not to peek at the
    # test set. The only valid model is the one obtained after the final
    # epoch, where the epoch count is decided in advance of the experiment.
    val_set = test_set

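    # take every `sample`-th training index; sample=1 keeps the full training set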
    train_set = torch.utils.data.Subset(dataset, train_idx[0::sample])

    data_params = {
        "batch_size": batch_size,
        "num_workers": workers,
        "pin_memory": False,
        "shuffle": True,
        "collate_fn": collate_batch,
    }

    setup_params = {
        "optim": optim,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "device": device,
    }

    restart_params = {
        "resume": resume,
        "fine_tune": fine_tune,
        "transfer": transfer,
    }

    model_params = {
        "task_dict": task_dict,
        "robust": robust,
        "n_targets": n_targets,
        "elem_emb_len": elem_emb_len,
        "nbr_fea_len": nbr_fea_len,
        "elem_fea_len": elem_fea_len,
        "n_graph": n_graph,
        "h_fea_len": h_fea_len,
        "n_hidden": n_hidden,
    }

    train_ensemble(
        model_class=CrystalGraphConvNet,
        model_name=model_name,
        run_id=run_id,
        ensemble_folds=ensemble,
        epochs=epochs,
        train_set=train_set,
        val_set=val_set,
        log=log,
        data_params=data_params,
        setup_params=setup_params,
        restart_params=restart_params,
        model_params=model_params,
        loss_dict=loss_dict,
    )

    data_params["batch_size"] = 64 * batch_size  # faster model inference
    data_params["shuffle"] = False  # need fixed data order due to ensembling

    results_dict = results_multitask(
        model_class=CrystalGraphConvNet,
        model_name=model_name,
        run_id=run_id,
        ensemble_folds=ensemble,
        test_set=test_set,
        data_params=data_params,
        robust=robust,
        task_dict=task_dict,
        device=device,
        eval_type="checkpoint",
        save_results=False,
    )

    logits = results_dict["phdos_clf"]["logits"]
    target = results_dict["phdos_clf"]["target"]

    # average the logits over the ensemble members, then compute metrics
    ens_logits = np.mean(logits, axis=0)

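    # one-hot encode the integer class labels so roc_auc_score can score the
    # per-class ensemble logits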
    target_ohe = np.zeros_like(ens_logits)
    target_ohe[np.arange(target.size), target] = 1

    ens_acc = accuracy_score(target, np.argmax(ens_logits, axis=1))
    ens_roc_auc = roc_auc_score(target_ohe, ens_logits)

    assert ens_acc > 0.85
    assert ens_roc_auc > 0.9
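
# The df_matbench_phonons fixture is defined elsewhere in the project's test
# suite. Below is a minimal sketch of what this test assumes, using matminer's
# dataset loader; the 450 cm^-1 classification threshold and the derived
# columns are assumptions here, not necessarily the project's exact conftest.
import numpy as np
import pytest
from matminer.datasets import load_dataset


@pytest.fixture
def df_matbench_phonons():
    df = load_dataset("matbench_phonons")  # columns: "structure", "last phdos peak"
    df["material_id"] = [f"mb-phdos-{i}" for i in range(len(df))]  # hypothetical IDs
    df["composition"] = [s.composition.formula for s in df["structure"]]
    # hypothetical binary label: is the highest phonon DOS peak above 450 cm^-1?
    df["phdos_clf"] = np.where(df["last phdos peak"] > 450, 1, 0)
    return df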
Code example #2
File: cgcnn-example.py Project: CompRhys/aviary
# Imports assumed from the CompRhys/aviary project this example file belongs
# to; exact module paths may differ between versions.
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split as split

from aviary.cgcnn.data import CrystalGraphData, collate_batch
from aviary.cgcnn.model import CrystalGraphConvNet
from aviary.utils import results_multitask, train_ensemble


def main(  # noqa: C901
    data_path,
    targets,
    tasks,
    losses,
    robust,
    elem_emb="cgcnn92",
    model_name="cgcnn",
    n_graph=4,
    elem_fea_len=64,
    n_hidden=1,
    h_fea_len=128,
    radius=5,
    max_num_nbr=12,
    dmin=0,
    step=0.2,
    ensemble=1,
    run_id=1,
    data_seed=42,
    epochs=100,
    patience=None,
    log=True,
    sample=1,
    test_size=0.2,
    test_path=None,
    val_size=0.0,
    val_path=None,
    resume=None,
    fine_tune=None,
    transfer=None,
    train=True,
    evaluate=True,
    optim="AdamW",
    learning_rate=3e-4,
    momentum=0.9,
    weight_decay=1e-6,
    batch_size=128,
    workers=0,
    device="cuda" if torch.cuda.is_available() else "cpu",
    **kwargs,
):

    if not len(targets) == len(tasks) == len(losses):
        raise AssertionError(
            "targets, tasks and losses must all have the same length")

    if not (evaluate or train):
        raise AssertionError(
            "No action given - at least one of the 'train' or 'evaluate'"
            " CLI flags is required"
        )

    if test_path:
        test_size = 0.0

    if not (test_path and val_path):
        if test_size + val_size >= 1.0:
            raise AssertionError(
                f"'test_size'({test_size}) "
                f"plus 'val_size'({val_size}) must be less than 1")

    if ensemble > 1 and (fine_tune or transfer):
        raise NotImplementedError(
            "If training an ensemble with fine tuning or transferring"
            " options the models must be trained one by one using the"
            " run-id flag.")

    if fine_tune and transfer:
        raise AssertionError(
            "Cannot fine-tune and transfer checkpoint(s) at the same time.")

    task_dict = dict(zip(targets, tasks))
    loss_dict = dict(zip(targets, losses))

    dist_dict = {
        "radius": radius,
        "max_num_nbr": max_num_nbr,
        "dmin": dmin,
        "step": step,
    }

    if not os.path.exists(data_path):
        raise AssertionError(f"{data_path} does not exist!")
    # NOTE make sure to use dense datasets. Here we do not use pandas'
    # default NA values, as they can clash with "NaN", which is a valid
    # material composition.
    df = pd.read_csv(data_path,
                     keep_default_na=False,
                     na_values=[],
                     comment="#")

    dataset = CrystalGraphData(df=df,
                               elem_emb=elem_emb,
                               task_dict=task_dict,
                               **dist_dict)
    n_targets = dataset.n_targets
    elem_emb_len = dataset.elem_emb_len
    nbr_fea_len = dataset.nbr_fea_dim

    train_idx = list(range(len(dataset)))

    if evaluate:
        if test_path:

            if not os.path.exists(test_path):
                raise AssertionError(f"{test_path} does not exist!")
            # NOTE make sure to use dense datasets; do not use pandas'
            # default NA values, as "NaN" is a valid material composition
            df = pd.read_csv(test_path, keep_default_na=False, na_values=[])

            print(f"using independent test set: {test_path}")
            test_set = CrystalGraphData(df=df,
                                        elem_emb=elem_emb,
                                        task_dict=task_dict,
                                        **dist_dict)
            test_set = torch.utils.data.Subset(test_set, range(len(test_set)))
        elif test_size == 0.0:
            raise ValueError("test-size must be non-zero to evaluate model")
        else:
            print(f"using {test_size} of training set as test set")
            train_idx, test_idx = split(train_idx,
                                        random_state=data_seed,
                                        test_size=test_size)
            test_set = torch.utils.data.Subset(dataset, test_idx)

    if train:
        if val_path:

            if not os.path.exists(val_path):
                raise AssertionError(f"{val_path} does not exist!")
            # NOTE make sure to use dense datasets; do not use pandas'
            # default NA values, as "NaN" is a valid material composition
            df = pd.read_csv(val_path, keep_default_na=False, na_values=[])

            print(f"using independent validation set: {val_path}")
            val_set = CrystalGraphData(df=df,
                                       elem_emb=elem_emb,
                                       task_dict=task_dict,
                                       **dist_dict)
            val_set = torch.utils.data.Subset(val_set, range(len(val_set)))
        else:
            if val_size == 0.0 and evaluate:
                print(
                    "No validation set used, using test set for evaluation purposes"
                )
                # NOTE when using this option, care must be taken not to
                # peek at the test set. The only valid model is the one
                # obtained after the final epoch, where the epoch count is
                # decided in advance of the experiment.
                val_set = test_set
            elif val_size == 0.0:
                val_set = None
            else:
                print(f"using {val_size} of training set as validation set")
                train_idx, val_idx = split(
                    train_idx,
                    random_state=data_seed,
                    test_size=val_size / (1 - test_size),
                )
                val_set = torch.utils.data.Subset(dataset, val_idx)

        train_set = torch.utils.data.Subset(dataset, train_idx[0::sample])

    data_params = {
        "batch_size": batch_size,
        "num_workers": workers,
        "pin_memory": False,
        "shuffle": True,
        "collate_fn": collate_batch,
    }

    setup_params = {
        "optim": optim,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "device": device,
    }

    if resume:
        resume = f"models/{model_name}/checkpoint-r{run_id}.pth.tar"

    restart_params = {
        "resume": resume,
        "fine_tune": fine_tune,
        "transfer": transfer,
    }

    model_params = {
        "task_dict": task_dict,
        "robust": robust,
        "n_targets": n_targets,
        "elem_emb_len": elem_emb_len,
        "nbr_fea_len": nbr_fea_len,
        "elem_fea_len": elem_fea_len,
        "n_graph": n_graph,
        "h_fea_len": h_fea_len,
        "n_hidden": n_hidden,
    }

    if train:
        train_ensemble(
            model_class=CrystalGraphConvNet,
            model_name=model_name,
            run_id=run_id,
            ensemble_folds=ensemble,
            epochs=epochs,
            patience=patience,
            train_set=train_set,
            val_set=val_set,
            log=log,
            data_params=data_params,
            setup_params=setup_params,
            restart_params=restart_params,
            model_params=model_params,
            loss_dict=loss_dict,
        )

    if evaluate:

        data_reset = {
            "batch_size": 16 * batch_size,  # faster model inference
            "shuffle": False,  # need fixed data order due to ensembling
        }
        data_params.update(data_reset)

        return results_multitask(
            model_class=CrystalGraphConvNet,
            model_name=model_name,
            run_id=run_id,
            ensemble_folds=ensemble,
            test_set=test_set,
            data_params=data_params,
            robust=robust,
            task_dict=task_dict,
            device=device,
            eval_type="checkpoint",
        )
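
# A hypothetical invocation of the main() above; the CSV path and target
# column name are placeholders, not files or columns that ship with the
# project.
if __name__ == "__main__":
    main(
        data_path="data/my-crystals.csv",  # placeholder CSV of crystal structures
        targets=["formation_energy"],      # placeholder target column name
        tasks=["regression"],
        losses=["L1"],
        robust=True,   # also predict an aleatoric uncertainty per target
        epochs=10,
        ensemble=1,
    )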
Code example #3
# Imports assumed from the CompRhys/aviary project these examples are taken
# from; exact module paths may differ between versions.
import numpy as np
import torch
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split as split

from aviary.roost.data import CompositionData, collate_batch
from aviary.roost.model import Roost
from aviary.utils import results_multitask, train_ensemble


def test_roost_regression(df_matbench_phonons):
    elem_emb = "matscholar200"
    targets = ["last phdos peak"]
    tasks = ["regression"]
    losses = ["L1"]
    robust = True
    model_name = "roost-reg-test"
    elem_fea_len = 64
    n_graph = 3
    ensemble = 2
    run_id = 1
    data_seed = 42
    epochs = 25
    log = False
    sample = 1
    test_size = 0.2
    resume = False
    fine_tune = None
    transfer = None
    optim = "AdamW"
    learning_rate = 3e-4
    momentum = 0.9
    weight_decay = 1e-6
    batch_size = 128
    workers = 0
    device = "cuda" if torch.cuda.is_available() else "cpu"

    task_dict = dict(zip(targets, tasks))
    loss_dict = dict(zip(targets, losses))

    dataset = CompositionData(
        df=df_matbench_phonons, elem_emb=elem_emb, task_dict=task_dict
    )
    n_targets = dataset.n_targets
    elem_emb_len = dataset.elem_emb_len

    train_idx = list(range(len(dataset)))

    print(f"using {test_size} of training set as test set")
    train_idx, test_idx = split(train_idx, random_state=data_seed, test_size=test_size)
    test_set = torch.utils.data.Subset(dataset, test_idx)

    print("No validation set used, using test set for evaluation purposes")
    # NOTE when using this option, care must be taken not to peek at the
    # test set. The only valid model is the one obtained after the final
    # epoch, where the epoch count is decided in advance of the experiment.
    val_set = test_set

    train_set = torch.utils.data.Subset(dataset, train_idx[0::sample])

    data_params = {
        "batch_size": batch_size,
        "num_workers": workers,
        "pin_memory": False,
        "shuffle": True,
        "collate_fn": collate_batch,
    }

    setup_params = {
        "optim": optim,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "device": device,
    }

    restart_params = {
        "resume": resume,
        "fine_tune": fine_tune,
        "transfer": transfer,
    }

    model_params = {
        "task_dict": task_dict,
        "robust": robust,
        "n_targets": n_targets,
        "elem_emb_len": elem_emb_len,
        "elem_fea_len": elem_fea_len,
        "n_graph": n_graph,
        "elem_heads": 2,
        "elem_gate": [256],
        "elem_msg": [256],
        "cry_heads": 2,
        "cry_gate": [256],
        "cry_msg": [256],
        "trunk_hidden": [256, 256],
        "out_hidden": [128, 64],
    }

    train_ensemble(
        model_class=Roost,
        model_name=model_name,
        run_id=run_id,
        ensemble_folds=ensemble,
        epochs=epochs,
        train_set=train_set,
        val_set=val_set,
        log=log,
        data_params=data_params,
        setup_params=setup_params,
        restart_params=restart_params,
        model_params=model_params,
        loss_dict=loss_dict,
    )

    data_params["batch_size"] = 64 * batch_size  # faster model inference
    data_params["shuffle"] = False  # need fixed data order due to ensembling

    results_dict = results_multitask(
        model_class=Roost,
        model_name=model_name,
        run_id=run_id,
        ensemble_folds=ensemble,
        test_set=test_set,
        data_params=data_params,
        robust=robust,
        task_dict=task_dict,
        device=device,
        eval_type="checkpoint",
        save_results=False,
    )

    pred = results_dict["last phdos peak"]["pred"]
    target = results_dict["last phdos peak"]["target"]

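    # average the predictions over the ensemble members before scoring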
    y_ens = np.mean(pred, axis=0)

    mae = np.abs(target - y_ens).mean()
    mse = np.square(target - y_ens).mean()
    rmse = np.sqrt(mse)
    r2 = r2_score(target, y_ens)

    assert r2 > 0.7
    assert mae < 150
    assert rmse < 300
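
# For reference: with robust=True the models predict an aleatoric uncertainty
# alongside each target and train against a robust L1 (Laplace negative
# log-likelihood) objective. A sketch of that loss as we read the code base;
# the exact constants in aviary's implementation may differ.
import torch


def robust_l1_loss(pred: torch.Tensor, log_std: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Laplace NLL up to an additive constant: the absolute error is scaled
    by the predicted scale exp(log_std), plus a log_std penalty term."""
    loss = 2**0.5 * torch.abs(pred - target) * torch.exp(-log_std) + log_std
    return loss.mean()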