Example #1
def test_small_moddata_featurization():
    """ This test creates a new MODData from the MP 2018.6 structures. """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")

    # Loading pickles can be dangerous, so let's at least check that the
    # SHA-512 of this file matches what it was when the file was created
    assert (get_sha512_of_file(data_file) ==
            "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
            "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218")

    old = MODData.load(data_file)
    structures = old.structures
    targets = old.targets

    names = old.names
    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    for i in range(len(old_cols)):
        print(new_cols[i], old_cols[i])
        assert new_cols[i] == old_cols[i]

    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
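The reference-file check above relies on a `get_sha512_of_file` helper. As a rough illustration, a minimal chunked SHA-512 helper built on Python's standard `hashlib` could look like the sketch below; this is an assumption about the helper's shape, not necessarily modnet's own implementation.

import hashlib
from pathlib import Path


def get_sha512_of_file(path: Path, chunk_size: int = 65536) -> str:
    """Return the hexadecimal SHA-512 digest of a file, read in chunks."""
    digest = hashlib.sha512()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()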
Example #2
def test_small_moddata_featurization(small_moddata):
    """ This test creates a new MODData from the MP 2018.6 structures. """

    old = small_moddata
    structures = old.structures
    targets = old.targets

    names = old.names
    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False, n_jobs=1)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    for i in range(len(old_cols)):
        print(new_cols[i], old_cols[i])
        assert new_cols[i] == old_cols[i]

    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
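This variant receives its reference data through a `small_moddata` pytest fixture instead of loading the file itself. A minimal sketch of how such a fixture might be defined, assuming the same MP_2018.6_small.zip reference file as in Example #1 (the fixture body and scope are assumptions, not modnet's actual conftest):

import pytest
from pathlib import Path
from modnet.preprocessing import MODData


@pytest.fixture(scope="module")
def small_moddata():
    # Load the small MP 2018.6 reference MODData shared by these tests.
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")
    return MODData.load(data_file)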
Example #3
def featurize(task, n_jobs=1):
    import os
    import warnings

    warnings.filterwarnings("ignore", category=RuntimeWarning)
    from modnet.preprocessing import MODData
    from modnet.featurizers.presets import (
        CompositionOnlyFeaturizer,
        DeBreuck2020Featurizer,
    )
    from matminer.datasets import load_dataset
    from pymatgen.core import Composition

    if task == "matbench_elastic":
        df_g = load_dataset("matbench_log_gvrh")
        df_k = load_dataset("matbench_log_kvrh")
        df = df_g.join(df_k.drop("structure", axis=1))
    else:
        df = load_dataset(task)

    # Sanitise column names: spaces become underscores and parentheses are dropped
    mapping = {
        col: col.replace(" ", "_").replace("(", "").replace(")", "")
        for col in df.columns
    }
    df.rename(columns=mapping, inplace=True)

    targets = [
        col for col in df.columns
        if col not in ("id", "structure", "composition")
    ]

    if "structure" not in df.columns:
        featurizer = CompositionOnlyFeaturizer()
    else:
        featurizer = DeBreuck2020Featurizer(fast_oxid=True)

    try:
        materials = (
            df["structure"]
            if "structure" in df.columns
            else df["composition"].map(Composition)
        )
    except KeyError:
        raise RuntimeError(
            f"Could not find any materials data dataset for task {task!r}!")

    data = MODData(
        materials=materials.tolist(),
        targets=df[targets].values,
        target_names=targets,
        featurizer=featurizer,
    )
    data.featurize(n_jobs=n_jobs)
    os.makedirs("./precomputed", exist_ok=True)
    data.save(f"./precomputed/{task}_moddata.pkl.gz")
    return data
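A possible way to call this helper for a single matbench task; the task name and job count below are illustrative choices, not part of the original script:

if __name__ == "__main__":
    # Precompute and cache the featurized MODData for one matbench task.
    data = featurize("matbench_dielectric", n_jobs=4)
    print(data.df_featurized.shape)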
Example #4
def test_small_moddata_composition_featurization(small_moddata_composition):
    """ This test creates a new MODData from the MP 2018.6 structures. """

    reference = small_moddata_composition
    compositions = reference.compositions

    new = MODData(materials=compositions)
    new.featurize(fast=False, n_jobs=1)

    new_cols = sorted(new.df_featurized.columns.tolist())
    ref_cols = sorted(reference.df_featurized.columns.tolist())

    for i in range(len(ref_cols)):
        # print(new_cols[i], ref_cols[i])
        assert new_cols[i] == ref_cols[i]

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            reference.df_featurized[col].to_numpy(),
        )
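For comparison, a standalone composition-only MODData can also be built directly from formula strings. The sketch below uses made-up formulas and the `CompositionOnlyFeaturizer` preset as an illustration; it is not part of the original test.

from pymatgen.core import Composition
from modnet.preprocessing import MODData
from modnet.featurizers.presets import CompositionOnlyFeaturizer

# Illustrative formulas only; any list of pymatgen Composition objects works.
compositions = [Composition(f) for f in ("SiO2", "GaAs", "Fe2O3")]
md = MODData(materials=compositions, featurizer=CompositionOnlyFeaturizer())
md.featurize(n_jobs=1)
print(md.df_featurized.head())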
Example #5
            materials = (
                train_df["structure"]
                if "structure" in train_df.columns
                else train_df["composition"].map(Composition)
            )
        except KeyError:
            raise RuntimeError(
                f"Could not find any materials data dataset for task {task!r}!"
            )

        fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True)
        train_data = MODData(
            materials=materials.tolist(),
            targets=train_df[targets].values,
            target_names=targets,
            featurizer=fast_oxid_featurizer,
        )
        train_data.featurize(n_jobs=32)
        train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True)

        # create model
        targets_hierarchy = [[[field for field in targets]]]
        weights = {field: 1 for field in targets}
        model = EnsembleMODNetModel(targets_hierarchy, weights)

        # fit model

        if USE_GA:
            # you can either use a GA for hyper-parameter optimization or...
            from modnet.hyper_opt import FitGenetic
            ga = FitGenetic(train_data)
            model = ga.run(
                size_pop=20,