def test_small_moddata_featurization():
    """Re-featurize the small MP 2018.6 dataset and compare with the stored features.

    A new MODData is built from the structures, targets and target names of
    the stored object, featurized with ``fast=False``, and its featurized
    DataFrame is checked column-by-column against the stored one.
    """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")

    # Loading pickles can be dangerous, so at least check that the SHA-512
    # digest matches what it was when the file was created.
    assert (get_sha512_of_file(data_file) ==
            "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
            "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218")

    old = MODData.load(data_file)
    structures = old.structures
    targets = old.targets

    names = old.names
    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    # Compare the complete (sorted) column lists in one assertion: a missing
    # or extra column then shows up as an assertion diff instead of an
    # IndexError from walking two lists of different length by index.
    assert new_cols == old_cols

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
Exemple #2
0
def test_small_moddata_featurization(small_moddata):
    """Re-featurize the ``small_moddata`` fixture and compare with its stored features.

    A new MODData is built from the fixture's structures, targets and target
    names, featurized with ``fast=False`` on a single job, and its featurized
    DataFrame is checked column-by-column against the fixture's.
    """

    old = small_moddata
    structures = old.structures
    targets = old.targets

    names = old.names
    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False, n_jobs=1)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    # Compare the complete (sorted) column lists in one assertion: a missing
    # or extra column then shows up as an assertion diff instead of an
    # IndexError from walking two lists of different length by index.
    assert new_cols == old_cols

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
Exemple #3
0
def test_load_precomputed():
    """Try to load and unpack the precomputed dataset hosted on figshare.

    Requires ~10 GB of memory.
    """
    dataset_name = "MP_2018.6"
    MODData.load_precomputed(dataset_name)
Exemple #4
0
def test_precomputed_cross_nmi(small_moddata):
    """Run feature selection with ``use_precomputed_cross_nmi=True`` on a copy of the fixture.

    The copy reuses the fixture's structures, targets, target names and
    featurized DataFrame, so no featurization is performed here.
    """
    source = small_moddata
    n_selected = 5

    new = MODData(
        materials=source.structures,
        targets=source.targets,
        target_names=source.names,
        df_featurized=source.df_featurized,
    )
    new.feature_selection(n_selected, use_precomputed_cross_nmi=True)
Exemple #5
0
    def fit(self,data:MODData, val_fraction = 0.0, val_key = None, lr=0.001, epochs = 200, batch_size = 128, xscale='minmax',yscale=None):
        """Scale the selected features of `data` and train `self.model` on them.

        Parameters:
            data: MODData providing `names`, `get_optimal_descriptors()`,
                `get_featurized_df()` and `get_target_df()`.
            val_fraction: Passed to the model's `fit` as `validation_split`;
                when > 0 the per-epoch printout also reports validation
                metrics.
            val_key: Target name used to pick the validation MAE entry in the
                per-epoch printout when `self.PP` is truthy.
            lr: Learning rate given to the Adam optimizer.
            epochs: Number of training epochs.
            batch_size: Training batch size.
            xscale: 'minmax' or 'standard'; any other value leaves the
                features unscaled.
            yscale: Not used by this method.
        """
        print('new')
        self.xscale = xscale
        self.target_names = data.names
        self.optimal_descriptors = data.get_optimal_descriptors()

        featurized = data.get_featurized_df()
        x = featurized[self.optimal_descriptors[:self.n_feat]].values
        print(x.shape)
        target_frame = data.get_target_df()
        y = target_frame[self.targets_flatten].values.transpose()
        print(y.shape)

        # Scale the input features.
        if self.xscale == 'minmax':
            self.xmin = x.min(axis=0)
            self.xmax = x.max(axis=0)
            feature_range = self.xmax - self.xmin
            x = (x - self.xmin) / feature_range - 0.5
        elif self.xscale == 'standard':
            self.scaler = StandardScaler()
            x = self.scaler.fit_transform(x)

        x = np.nan_to_num(x)

        # Pick the per-epoch reporting function; the validation variant also
        # prints the validation loss and the validation MAE.
        if val_fraction > 0:
            if self.PP:
                def _val_mae(logs):
                    return logs['val_{}_mae'.format(val_key)]
            else:
                def _val_mae(logs):
                    return logs['val_mae']

            def _report(epoch, logs):
                print("epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(
                    epoch, logs['loss'], logs['val_loss'], val_key, _val_mae(logs)))
        else:
            def _report(epoch, logs):
                print("epoch {}: loss: {:.3f}".format(epoch, logs['loss']))

        print_callback = LambdaCallback(on_epoch_end=_report)

        y_per_target = list(y)

        print('compile',flush=True)
        self.model.compile(loss = 'mse',optimizer=keras.optimizers.Adam(lr=lr),metrics=['mae'],loss_weights=self.weights)
        print('fit',flush=True)

        self.model.fit(
            x=x,
            y=y_per_target,
            epochs=epochs,
            batch_size=batch_size,
            verbose=0,
            validation_split=val_fraction,
            callbacks=[print_callback],
        )
Exemple #6
0
    def predict(self, test_data: MODData) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.

        Returns:
            A `pandas.DataFrame` of predictions whose columns are
            `self.targets_flatten` and whose index is
            `test_data.structure_ids`.

        """

        featurized = test_data.get_featurized_df()
        features = featurized[self.optimal_descriptors[:self.n_feat]].values

        # Apply the fitted feature scaler, if there is one.
        if self._scaler is not None:
            features = self._scaler.transform(features)

        features = np.nan_to_num(features)

        multi_target = self._multi_target
        raw_output = np.array(self.model.predict(features))
        # Keep only the first entry of the last axis of the raw model output.
        if multi_target:
            selected = raw_output[:, :, 0]
        else:
            selected = raw_output[:, 0]
        p = selected.transpose()

        predictions = pd.DataFrame(p)
        predictions.columns = self.targets_flatten
        predictions.index = test_data.structure_ids

        return predictions
Exemple #7
0
    def evaluate(self, test_data: MODData) -> pd.DataFrame:
        """Evaluates the target values for the passed MODData by returning the corresponding loss.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.


        Returns:
            The first element of the output of `self.model.evaluate(x, y)`.
            NOTE(review): the return annotation says `pd.DataFrame`, but the
            value returned is whatever the model's `evaluate` puts first
            (presumably the scalar total loss) -- confirm and fix the
            annotation.
        """
        # prevents Nan predictions if some features are inf
        x = (test_data.get_featurized_df().replace(
            [np.inf, -np.inf, np.nan],
            0)[self.optimal_descriptors[:self.n_feat]].values)

        # Scale the input features:
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            # Any NaN produced by the scaler is mapped to -1.
            x = np.nan_to_num(x, nan=-1)

        y = []
        for targ in self.targets_flatten:
            if self.num_classes[targ] >= 2:  # Classification
                y_inner = tf.keras.utils.to_categorical(
                    test_data.df_targets[targ].values,
                    num_classes=self.num_classes[targ],
                )
            else:
                # `np.float` was only an alias of the builtin `float` and has
                # been removed from NumPy (1.24+); the builtin gives the same
                # float64 dtype on every NumPy version.
                y_inner = test_data.df_targets[targ].values.astype(float,
                                                                   copy=False)
            y.append(y_inner)

        return self.model.evaluate(x, y)[0]
Exemple #8
0
def test_small_moddata_feature_selection_classif(small_moddata):
    """Build a three-class classification MODData and check feature selection on it.

    Four synthetic features are attached directly as the featurized
    DataFrame, a hand-written cross-NMI matrix is supplied, and the test
    checks that selecting three features yields ``['f1', 'f4', 'f3']``.
    """
    feature_names = ['f1', 'f2', 'f3', 'f4']
    n_samples = 1500

    # x1 carries the class label; the other features are derived from it and
    # from a random binary feature x2.
    x1 = np.array([0] * 500 + [1] * 500 + [2] * 500, dtype='float')
    x2 = np.random.choice(2, n_samples)
    x3 = x1 * x2
    x4 = x1 + (x2 * 0.5)
    targets = np.array(x1, dtype='int').reshape(-1, 1)
    features = np.array([x1, x2, x3, x4]).T
    names = ['my_classes']

    cross_nmi_rows = [
        [1, 0, 0.5, 0.5],
        [0, 1, 0.5, 0.5],
        [0.5, 0.5, 1, 0.5],
        [0.5, 0.5, 0.5, 1],
    ]
    c_nmi = pd.DataFrame(cross_nmi_rows,
                         columns=feature_names,
                         index=feature_names)

    classif_md = MODData(['dummy'] * n_samples,
                         targets,
                         target_names=names,
                         num_classes={"my_classes": 3})
    classif_md.df_featurized = pd.DataFrame(features, columns=feature_names)
    classif_md.feature_selection(n=3, cross_nmi=c_nmi)

    assert len(classif_md.get_optimal_descriptors()) == 3
    assert classif_md.get_optimal_descriptors() == ['f1', 'f4', 'f3']
Exemple #9
0
def test_small_moddata_composition_featurization(small_moddata_composition):
    """Re-featurize the fixture's compositions and compare with its stored features.

    A new MODData is built from the compositions of the reference object
    only, featurized with ``fast=False`` on a single job, and its featurized
    DataFrame is checked column-by-column against the reference.
    """

    reference = small_moddata_composition
    compositions = reference.compositions

    new = MODData(materials=compositions)
    new.featurize(fast=False, n_jobs=1)

    new_cols = sorted(new.df_featurized.columns.tolist())
    ref_cols = sorted(reference.df_featurized.columns.tolist())

    # Compare the complete (sorted) column lists in one assertion so that a
    # missing or extra column is reported as an assertion diff rather than
    # as an IndexError/KeyError further down.
    assert new_cols == ref_cols

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            reference.df_featurized[col].to_numpy(),
        )
def load_or_featurize(task, n_jobs=1):
    """Load a precomputed MODData from ``./precomputed`` or featurize `task`.

    Parameters:
        task: Name of the task; forwarded to `featurize` when nothing has
            been precomputed, and used to prefer the file named
            ``{task}_moddata.pkl.gz`` when several files are present.
        n_jobs: Number of jobs forwarded to `featurize`.

    Returns:
        The loaded or freshly featurized MODData.
    """
    import os

    # glob returns files in arbitrary order; sort so that the choice below is
    # reproducible across runs and filesystems.
    data_files = sorted(glob.glob("./precomputed/*.gz"))
    if not data_files:
        return featurize(task, n_jobs=n_jobs)

    # Prefer the file belonging to this task; otherwise fall back to the
    # first file found, as before.
    expected_name = f"{task}_moddata.pkl.gz"
    matching = [
        path for path in data_files if os.path.basename(path) == expected_name
    ]
    precomputed_moddata = matching[0] if matching else data_files[0]

    if len(data_files) > 1:
        print(
            f"Found multiple data files {data_files}, loading {precomputed_moddata}"
        )

    return MODData.load(precomputed_moddata)
Exemple #11
0
def _load_moddata(filename):
    """Load the pickled MODData from the test data directory after checking its hash.

    Parameters:
        filename: Name of the file inside the ``data`` directory next to
            this module; it must have an entry in `_TEST_DATA_HASHES`.

    Returns:
        The MODData loaded from the file.

    Raises:
        RuntimeError: If no reference hash is recorded for `filename`, or
            if the hash of the file does not match the recorded one.
    """
    from modnet.preprocessing import MODData

    data_file = Path(__file__).parent.joinpath(f"data/{filename}")
    if filename not in _TEST_DATA_HASHES:
        raise RuntimeError(
            f"Cannot verify hash of {filename} as it was not provided, will not load pickle."
        )
    # Loading pickles can be dangerous, so at least check that the hash
    # matches what it was when the file was created. This is raised
    # explicitly rather than asserted so the check is not stripped when
    # Python runs with -O.
    if get_hash_of_file(data_file) != _TEST_DATA_HASHES[filename]:
        raise RuntimeError(
            f"Hash of {filename} does not match the recorded value, will not load pickle."
        )

    return MODData.load(data_file)
Exemple #12
0
    def fit(
        self,
        training_data: MODData,
        n_jobs=1,
        **kwargs,
    ) -> None:
        """Train every model of the ensemble on the passed training `MODData` object.

        Parameters:
            training_data: The `MODData` used for training. If
                `self.bootstrap` is set, each model is trained on its own
                resample (with replacement) of this data; otherwise every
                model is trained on the full data.
            n_jobs: If <= 1 the models are fitted sequentially in this
                process, otherwise they are fitted in a pool of `n_jobs`
                spawned worker processes.
            **kwargs: Forwarded to the `fit` of each individual model
                (same as MODNetModel fit).
        """

        if self.bootstrap:
            LOG.info("Generating bootstrap data...")
            sample_indices = np.arange(len(training_data.df_targets))
            # Use a distinct but deterministic seed per model: with one fixed
            # seed every call to `resample` returns the same indices, so all
            # models would be trained on an identical bootstrap sample.
            train_datas = [
                training_data.split(
                    (resample(sample_indices,
                              replace=True,
                              random_state=2943 + i), [])
                )[0]
                for i in range(self.n_models)
            ]
        else:
            train_datas = [training_data for _ in range(self.n_models)]

        if n_jobs <= 1:
            for i in range(self.n_models):
                LOG.info(f"Bootstrap fitting model #{i + 1}/{self.n_models}")
                self.model[i].fit(train_datas[i], **kwargs)
                model_summary = ""
                for k in self.model[i].history.keys():
                    model_summary += "{}: {:.4f}\t".format(k, self.model[i].history[k][-1])
                LOG.info(model_summary)
        else:
            ctx = multiprocessing.get_context('spawn')
            pool = ctx.Pool(processes=n_jobs)
            try:
                tasks = []
                for i, m in enumerate(self.model):
                    m._make_picklable()
                    tasks.append({'model': m, 'training_data': train_datas[i], 'model_id': i, **kwargs})
                for res in tqdm.tqdm(pool.imap_unordered(_map_fit_MODNet, tasks, chunksize=1),
                                     total=self.n_models):
                    model, model_id = res
                    model._restore_model()
                    self.model[model_id] = model
                    model_summary = f"Model #{model_id}\t"
                    for k in model.history.keys():
                        model_summary += "{}: {:.4f}\t".format(k, model.history[k][-1])
                    LOG.info(model_summary)
                pool.close()
            except BaseException:
                # Do not leave worker processes behind if anything fails.
                pool.terminate()
                raise
            finally:
                pool.join()
def test_load_moddata_zip():
    """ This test checks that older MODData objects can still be loaded. """

    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_subset.zip")

    # Loading pickles can be dangerous, so at least check that the SHA-512
    # digest matches what it was when the file was created.
    assert (get_sha512_of_file(data_file) ==
            "d7d75e646dbde539645c8c0b065fd82cbe93f81d3500809655bd13d0acf2027c"
            "1786091a73f53985b08868c5be431a3c700f7f1776002df28ebf3a12a79ab1a1")

    data = MODData.load(data_file)
    # Every per-sample container of the loaded object must hold 100 entries.
    assert len(data.structures) == 100
    assert len(data.mpids) == 100
    assert len(data.df_structure) == 100
    assert len(data.df_featurized) == 100
    assert len(data.df_targets) == 100
Exemple #14
0
    def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.
            return_prob: For classification targets only: if true, return one
                ``{target}_prob_{class}`` column per class holding the
                normalized class scores; otherwise return the index of the
                highest-scoring class.

        Returns:
            A `pandas.DataFrame` containing the predicted values of the
            targets, indexed by `test_data.structure_ids`.

        """
        # Replace inf/NaN feature values by 0 so they cannot produce NaN
        # predictions.
        cleaned = test_data.get_featurized_df().replace(
            [np.inf, -np.inf, np.nan], 0)
        x = cleaned[self.optimal_descriptors[:self.n_feat]].values

        # Scale the input features; NaNs produced by the scaler become -1.
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            x = np.nan_to_num(x, nan=-1)

        raw = np.array(self.model.predict(x))
        if raw.ndim == 2:
            # Add a leading axis so the output can be indexed per target.
            raw = np.array([raw])

        columns = {}
        for idx, target in enumerate(self.targets_flatten):
            if self.num_classes[target] >= 2:
                if return_prob:
                    row_totals = (raw[idx, :, :].sum(axis=1)).reshape((-1, 1))
                    normalized = raw[idx, :, :] / row_totals
                    for cls in range(normalized.shape[-1]):
                        columns["{}_prob_{}".format(target, cls)] = normalized[:, cls]
                else:
                    columns[target] = np.argmax(raw[idx, :, :], axis=1)
            else:
                columns[target] = raw[idx, :, 0]

        predictions = pd.DataFrame(columns)
        predictions.index = test_data.structure_ids

        return predictions
def featurize(task, n_jobs=1):
    """Load the dataset for `task`, featurize it and save it under ``./precomputed``.

    Parameters:
        task: Name of the dataset passed to matminer's `load_dataset`. The
            special value ``"matbench_elastic"`` joins the
            ``matbench_log_gvrh`` and ``matbench_log_kvrh`` datasets.
        n_jobs: Number of jobs forwarded to `MODData.featurize`.

    Returns:
        The featurized MODData, which is also saved to
        ``./precomputed/{task}_moddata.pkl.gz``.

    Raises:
        RuntimeError: If the dataset has neither a ``structure`` nor a
            ``composition`` column.
    """
    import warnings

    # NOTE: this silences RuntimeWarning for the whole process, not only for
    # the duration of this call.
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    from modnet.preprocessing import MODData
    from modnet.featurizers.presets import DeBreuck2020Featurizer
    from matminer.datasets import load_dataset

    if task == "matbench_elastic":
        df_g = load_dataset("matbench_log_gvrh")
        df_k = load_dataset("matbench_log_kvrh")
        df = df_g.join(df_k.drop("structure", axis=1))
    else:
        df = load_dataset(task)

    # Normalize column names: spaces become underscores, parentheses are dropped.
    mapping = {
        col: col.replace(" ", "_").replace("(", "").replace(")", "")
        for col in df.columns
    }
    df.rename(columns=mapping, inplace=True)

    # Every column that does not describe the material itself is a target.
    targets = [
        col for col in df.columns
        if col not in ("id", "structure", "composition")
    ]

    if "structure" not in df.columns:
        featurizer = CompositionOnlyFeaturizer()
    else:
        featurizer = DeBreuck2020Featurizer(fast_oxid=True)

    try:
        materials = df["structure"] if "structure" in df.columns else df[
            "composition"].map(Composition)
    except KeyError as err:
        # Chain the original KeyError so the missing column stays visible.
        raise RuntimeError(
            f"Could not find any materials data dataset for task {task!r}!"
        ) from err

    data = MODData(
        materials=materials.tolist(),
        targets=df[targets].values,
        target_names=targets,
        featurizer=featurizer,
    )
    data.featurize(n_jobs=n_jobs)
    os.makedirs("./precomputed", exist_ok=True)
    data.save(f"./precomputed/{task}_moddata.pkl.gz")
    return data
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs benchmark based on final_model without training everything again.
    It also computes the Knn distance and puts it in the results pickle.
    In fine, this should be integrated inside modnet benchmark.

    One EnsembleMODNetModel per outer fold is rebuilt from slices of
    ``final_model.model``, then each fold of ``matbench_kfold_splits`` is
    evaluated with the corresponding ensemble. The training data of each fold
    is read from ``folds/train_moddata_f{fold}``.

    :param data: MODData that is split into folds; its ``df_targets`` also
        decides whether the run is treated as multi-target.
    :param final_model: object whose ``model`` attribute is a sequence of
        fitted models, sliced below into per-fold ensembles.
    :param settings: mapping with at least the key ``"task"``; the optional
        key ``"classification"`` switches to ROC-AUC scoring.
    :param save_folds: if true, write the selected train/test features (and
        the test errors) of every fold to CSV files under ``folds/``.
    :param dknn_only: if true, load the existing
        ``results/{task}_results.pkl`` and only recompute its ``"dknns"``
        entry, skipping predictions.
    :return: the results mapping with one list entry per fold under the keys
        filled in below (``"dknns"``, ``"predictions"``, ``"stds"``,
        ``"targets"``, ``"errors"``, ``"scores"``, ``"model"``).
    """

    task = settings["task"]
    # rebuild the EnsembleMODNetModels from the final model

    n_best_archs = 5 # change this (from 1 to 5 max) to adapt number of inner best archs chosen

    # Layout assumed for final_model.model: blocks of `outer_fold_size` models
    # per outer fold, each made of blocks of `inner_fold_size` models per
    # inner fold.
    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    # True as soon as there is more than one target column.
    multi_target = bool(len(data.df_targets.columns) - 1)


    for i in range(5): # outer fold
        modnet_models = []
        for j in range(5): # inner fold
                # Take the first n_best_archs * bootstrap_size models of this
                # (outer, inner) block.
                modnet_models+=(
                    final_model.model[(i * outer_fold_size) + (j * inner_fold_size):
                                      (i * outer_fold_size) + (j * inner_fold_size) + (n_best_archs * bootstrap_size)])
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with results files you produced yourself.
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
            results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data, classification=settings.get("classification", False))):
        train_data, test_data = data.split((train, test))
        # The training split computed above is replaced by the MODData stored
        # on disk for this fold; only test_data comes from the split.
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        # The stored training fold must not share any sample with the test split.
        assert len(set(train_data.df_targets.index).intersection(set(test_data.df_targets.index))) == 0
        model = models[ind]

        # compute dkNN

        # TODO: test this quickly before submitting
        # Use the feature list of the ensemble member with the largest n_feat.
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        # predict() may return either predictions alone or a
        # (predictions, stds) tuple.
        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            # Errors are computed from a second prediction made with
            # return_prob=False rather than from the probabilities.
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            # One mean absolute error per target column.
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))

        if save_folds:
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            # NOTE: this renames the columns of `errors` in place, so the
            # object appended to results["errors"] below carries the
            # "_error" suffix when save_folds is set.
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results['model'].append(model)

    return results
Exemple #17
0
    def fit_preset(
        self,
        data: MODData,
        presets: List[Dict[str, Any]] = None,
        val_fraction: float = 0.15,
        verbose: int = 0,
        classification: bool = False,
        refit: bool = True,
        fast: bool = False,
        nested: int = 5,
        callbacks: List[Any] = None,
        n_jobs=None,
    ) -> Tuple[List[List[Any]], np.ndarray, Optional[List[float]],
               List[List[float]], Dict[str, Any], ]:
        """Chooses an optimal hyper-parametered MODNet model from different presets.

        This function implements the "inner loop" of a cross-validation workflow. By
        modifying the `nested` argument, it can be run in full nested mode (i.e.
        train n_fold * n_preset models) or just with a simple random hold-out set.

        Every preset is fitted on every train/validation split in a pool of
        worker processes. The preset with the lowest mean validation loss
        across the splits is selected. If `refit` is true, a new model with
        that preset is then trained on all of `data`; otherwise the model,
        `n_feat` and scaler of the best single fit of that preset are adopted.

        Args:
            data: MODData object contain training and validation samples.
            presets: A list of dictionaries containing custom presets. If
                `None`, presets are generated with `gen_presets`. The passed
                list and its dictionaries are not modified.
            verbose: The verbosity level to pass to tf.keras
            val_fraction: The fraction of the data to use for validation
                (15% by default); only used when `nested` is falsy.
            classification: Whether or not we are performing classification.
            refit: Whether or not to refit the final model on all data with
                the best-performing settings.
            fast: Used for debugging. If `True` and at least two presets are
                available, only fit the first 2 presets and set their number
                of epochs to 100.
            nested: integer specifying whether or not to perform a full nested CV. If 0,
                a simple validation split is performed based on val_fraction argument.
                If an integer, use this number of inner CV folds, ignoring the `val_fraction` argument.
                Note: If set to 1 (or less), the value will be overwritten to a default of 5 folds.
            callbacks: Callbacks passed to every fit. If `None`, a single
                early-stopping callback monitoring the training loss is used.
            n_jobs: number of jobs for multiprocessing

        Returns:
            - A nested list of the fitted models, indexed as `[preset][split]`.
            - An array of validation losses of shape `(n_presets, n_splits)`.
            - The learning curve of the best fit of the best preset, as
              obtained during validation (not that of the refitted model).
            - A nested list of learning curves, indexed as `[preset][split]`.
            - The settings of the best-performing preset.

        """

        from modnet.matbench.benchmark import matbench_kfold_splits

        if callbacks is None:
            es = tf.keras.callbacks.EarlyStopping(
                monitor="loss",
                min_delta=0.001,
                patience=100,
                verbose=verbose,
                mode="auto",
                baseline=None,
                restore_best_weights=False,
            )
            callbacks = [es]

        if presets is None:
            from modnet.model_presets import gen_presets

            presets = gen_presets(
                len(data.optimal_features),
                len(data.df_targets),
                classification=classification,
            )

        if fast and len(presets) >= 2:
            # Work on copies so that the caller's preset dictionaries keep
            # their original number of epochs.
            presets = [{**preset, "epochs": 100} for preset in presets[:2]]

        # Build the (train, validation) splits: k-fold when nested, a single
        # random hold-out split otherwise.
        if nested:
            n_splits = nested if nested > 1 else 5
            splits = matbench_kfold_splits(data,
                                           n_splits=n_splits,
                                           classification=classification)
        else:
            n_splits = 1
            splits = [
                train_test_split(range(len(data.df_featurized)),
                                 test_size=val_fraction)
            ]
        train_val_datas = [data.split((train, val)) for train, val in splits]

        # One task per (preset, split) pair, ordered preset-major so that
        # task index == n_splits * preset_id + fold_id.
        n_available_feat = len(data.get_optimal_descriptors())
        tasks = []
        for i, params in enumerate(presets):
            n_feat = min(n_available_feat, params["n_feat"])

            for ind in range(n_splits):
                train_data, val_data = train_val_datas[ind]

                tasks.append({
                    "train_data": train_data,
                    "targets": self.targets,
                    "weights": self.weights,
                    "num_classes": self.num_classes,
                    "n_feat": n_feat,
                    "num_neurons": params["num_neurons"],
                    "lr": params["lr"],
                    "batch_size": params["batch_size"],
                    "epochs": params["epochs"],
                    "loss": params["loss"],
                    "act": params["act"],
                    "out_act": self.out_act,
                    "callbacks": callbacks,
                    "preset_id": i,
                    "fold_id": ind,
                    "verbose": verbose,
                    "val_data": val_data,
                })

        # Losses start at a huge value so that a fit that never reports back
        # can not be selected as the best one.
        val_losses = 1e20 * np.ones((len(presets), n_splits))
        learning_curves = [[None for _ in range(n_splits)]
                           for _ in range(len(presets))]
        models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

        ctx = multiprocessing.get_context("spawn")
        pool = ctx.Pool(processes=n_jobs)
        LOG.info(
            f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
        )

        try:
            for res in tqdm.tqdm(
                    pool.imap_unordered(map_validate_model, tasks, chunksize=1),
                    total=len(tasks),
            ):
                val_loss, learning_curve, model, preset_id, fold_id = res
                LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
                # reload the model object after serialization
                model._restore_model()

                val_losses[preset_id, fold_id] = val_loss
                learning_curves[preset_id][fold_id] = learning_curve
                models[preset_id][fold_id] = model
            pool.close()
        except BaseException:
            # Do not leave worker processes behind if anything fails.
            pool.terminate()
            raise
        finally:
            pool.join()

        # Best preset: lowest mean validation loss over the splits; best
        # model: the split of that preset with the lowest validation loss.
        val_loss_per_preset = np.mean(val_losses, axis=1)
        best_preset_idx = int(np.argmin(val_loss_per_preset))
        best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
        best_preset = presets[best_preset_idx]
        best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
        best_model = models[best_preset_idx][best_model_idx]

        LOG.info(
            "Preset #{} resulted in lowest validation loss with params {}".
            format(best_preset_idx + 1,
                   tasks[n_splits * best_preset_idx + best_model_idx]))

        if refit:
            LOG.info("Refitting with all data and parameters: {}".format(
                best_preset))
            # Building final model

            n_feat = min(n_available_feat, best_preset["n_feat"])
            self.model = MODNetModel(
                self.targets,
                self.weights,
                num_neurons=best_preset["num_neurons"],
                n_feat=n_feat,
                act=best_preset["act"],
                out_act=self.out_act,
                num_classes=self.num_classes,
            ).model
            self.n_feat = n_feat
            self.fit(
                data,
                val_fraction=0,
                lr=best_preset["lr"],
                epochs=best_preset["epochs"],
                batch_size=best_preset["batch_size"],
                loss=best_preset["loss"],
                callbacks=callbacks,
                verbose=verbose,
            )
        else:
            self.n_feat = best_model.n_feat
            self.model = best_model.model
            self._scaler = best_model._scaler

        return models, val_losses, best_learning_curve, learning_curves, best_preset
Exemple #18
0
    def fit(
        self,
        training_data: MODData,
        val_fraction: float = 0.0,
        val_key: Optional[str] = None,
        val_data: Optional[MODData] = None,
        lr: float = 0.001,
        epochs: int = 200,
        batch_size: int = 128,
        xscale: Optional[str] = "minmax",
        metrics: List[str] = ["mae"],
        callbacks: List[Callable] = None,
        verbose: int = 0,
        loss: str = "mse",
        **fit_params,
    ) -> None:
        """Train the model on the passed training `MODData` object.

        Parameters:
            training_data: A `MODData` that has been featurized and
                feature selected. The first `self.n_feat` entries in
                `training_data.get_optimal_descriptors()` will be used
                for training.
            val_fraction: The fraction of the training data to use as a
                validation set for tracking model performance during
                training.
            val_key: The target name to track on the validation set
                during training, if performing multi-target learning.
            lr: The learning rate.
            epochs: The maximum number of epochs to train for.
            batch_size: The batch size to use for training.
            xscale: The feature scaler to use, either `None`,
                `'minmax'` or `'standard'`.
            metrics: A list of tf.keras metrics to pass to `compile(...)`.
            loss: The built-in tf.keras loss to pass to `compile(...)`.
            fit_params: Any additional parameters to pass to `fit(...)`,
                these will be overwritten by the explicit keyword
                arguments above.

        """

        if self.n_feat > len(training_data.get_optimal_descriptors()):
            raise RuntimeError(
                "The model requires more features than computed in data. "
                f"Please reduce n_feat below or equal to {len(training_data.get_optimal_descriptors())}"
            )

        self.xscale = xscale
        self.target_names = list(self.weights.keys())
        self.optimal_descriptors = training_data.get_optimal_descriptors()

        x = training_data.get_featurized_df()[
            self.optimal_descriptors[:self.n_feat]].values

        # For compatibility with MODNet 0.1.7; if there is only one target in the training data,
        # use that for the name of the target too.
        if (len(self.targets_flatten) == 1
                and len(training_data.df_targets.columns) == 1):
            self.targets_flatten = list(training_data.df_targets.columns)

        y = []
        for targ in self.targets_flatten:
            if self.num_classes[targ] >= 2:  # Classification
                y_inner = tf.keras.utils.to_categorical(
                    training_data.df_targets[targ].values,
                    num_classes=self.num_classes[targ],
                )
                loss = "categorical_crossentropy"
            else:
                y_inner = training_data.df_targets[targ].values.astype(
                    np.float, copy=False)
            y.append(y_inner)

        # Scale the input features:
        if self.xscale == "minmax":
            self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        elif self.xscale == "standard":
            self._scaler = StandardScaler()

        x = self._scaler.fit_transform(x)
        x = np.nan_to_num(x, nan=-1)

        if val_data is not None:
            val_x = val_data.get_featurized_df()[
                self.optimal_descriptors[:self.n_feat]].values
            val_x = self._scaler.transform(val_x)
            val_x = np.nan_to_num(val_x, nan=-1)
            try:
                val_y = list(val_data.get_target_df()[
                    self.targets_flatten].values.astype(
                        np.float, copy=False).transpose())
            except Exception:
                val_y = list(val_data.get_target_df().values.astype(
                    np.float, copy=False).transpose())
            validation_data = (val_x, val_y)
        else:
            validation_data = None

        # Optionally set up print callback
        if verbose:
            if val_fraction > 0 or validation_data:
                if self._multi_target and val_key is not None:
                    val_metric_key = f"val_{val_key}_mae"
                else:
                    val_metric_key = "val_mae"
                print_callback = tf.keras.callbacks.LambdaCallback(
                    on_epoch_end=lambda epoch, logs: print(
                        f"epoch {epoch}: loss: {logs['loss']:.3f}, "
                        f"val_loss:{logs['val_loss']:.3f} {val_metric_key}:{logs[val_metric_key]:.3f}"
                    ))

            else:
                print_callback = tf.keras.callbacks.LambdaCallback(
                    on_epoch_end=lambda epoch, logs: print(
                        f"epoch {epoch}: loss: {logs['loss']:.3f}"))

                if callbacks is None:
                    callbacks = [print_callback]
                else:
                    callbacks.append(print_callback)

        fit_params = {
            "x": x,
            "y": y,
            "epochs": epochs,
            "batch_size": batch_size,
            "verbose": verbose,
            "validation_split": val_fraction,
            "validation_data": validation_data,
            "callbacks": callbacks,
        }

        self.model.compile(
            loss=loss,
            optimizer=tf.keras.optimizers.Adam(lr=lr),
            metrics=metrics,
            loss_weights=self.weights,
        )
        history = self.model.fit(**fit_params)
        self.history = history.history
Exemple #19
0
            if col not in ("id", "structure", "composition")
        ]

        try:
            materials = train_df[
                "structure"] if "structure" in train_df.columns else train_df[
                    "composition"].map(Composition)
        except KeyError:
            raise RuntimeError(
                f"Could not find any materials data dataset for task {task!r}!"
            )

        fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True)
        train_data = MODData(
            materials=materials.tolist(),
            targets=train_df[targets].values,
            target_names=targets,
            featurizer=fast_oxid_featurizer,
        )
        train_data.featurize(n_jobs=32)
        train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True)

        # create model
        targets_hierarchy = [[[field for field in targets]]]
        weights = {field: 1 for field in targets}
        model = EnsembleMODNetModel(targets_hierarchy, weights)

        # fit model

        if USE_GA:
            # you can either use a GA for hyper-parameter optimization or...
            from modnet.hyper_opt import FitGenetic
Exemple #20
0
    def predict(self,
                test_data: "MODData",
                return_prob=False,
                return_unc=False) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        The underlying model is evaluated 1000 times on the same scaled
        inputs and the per-target outputs are aggregated over those
        evaluations.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.
            return_prob: For classification tasks only: whether to return
                the probability of each class OR only the most probable
                class.
            return_unc: Whether to return a second dataframe alongside the
                predictions (see below).

        Returns:
            A `pandas.DataFrame` containing the predicted values of the
            targets, indexed by `test_data.structure_ids`.
            If `return_unc=True`, a tuple `(predictions, unc)` of two
            dataframes with identical columns. `unc` holds:

            * regression targets: the standard deviation over the repeated
              evaluations;
            * classification targets with `return_prob=True`: the standard
              deviation of each normalised class probability;
            * classification targets with `return_prob=False`: the mean
              score of the winning class (a confidence value, not a
              standard deviation).

        """
        # Keep only the descriptors used in training; replacing non-finite
        # entries prevents NaN predictions if some features are inf.
        features = test_data.get_featurized_df().replace(
            [np.inf, -np.inf, np.nan], 0)
        x = features[self.optimal_descriptors[:self.n_feat]].values

        # Scale the input features:
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            x = np.nan_to_num(x)

        single_target = len(self.targets_flatten) == 1
        all_predictions = []
        for _ in range(1000):
            p = self.model.predict(x)
            if single_target:
                # Wrap the lone output so that `p[i]` indexes targets in
                # the same way as for multi-output models.
                p = np.array([p])
            all_predictions.append(p)

        p_dic = {}
        unc_dic = {}
        for i, name in enumerate(self.targets_flatten):
            # Stack once per target; axis 0 runs over the repeated
            # evaluations, the remaining axes are those of one model output.
            samples = np.array([pred[i] for pred in all_predictions])

            if self.num_classes[name] >= 2:  # Classification
                if return_prob:
                    # Normalise every evaluation separately (keepdims keeps
                    # the class axis so the division broadcasts), then
                    # reduce over the evaluations only.
                    probs = samples / samples.sum(axis=-1, keepdims=True)
                    mean_prob = probs.mean(axis=0)
                    std_prob = probs.std(axis=0)
                    for j in range(mean_prob.shape[-1]):
                        key = "{}_prob_{}".format(name, j)
                        p_dic[key] = mean_prob[:, j]
                        unc_dic[key] = std_prob[:, j]
                else:
                    mean_scores = samples.mean(axis=0)
                    p_dic[name] = np.argmax(mean_scores, axis=1)
                    # Mean score of the predicted class, reported in place
                    # of a spread for class-label output.
                    unc_dic[name] = np.max(mean_scores, axis=1)
            else:  # Regression
                p_dic[name] = samples.mean(axis=0)[:, 0]
                unc_dic[name] = samples.std(axis=0)[:, 0]

        predictions = pd.DataFrame(p_dic)
        unc = pd.DataFrame(unc_dic)

        predictions.index = test_data.structure_ids
        unc.index = test_data.structure_ids

        if return_unc:
            return predictions, unc
        return predictions
Exemple #21
0
def matbench_benchmark(
    data: MODData,
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Optional[Dict[str, Any]] = None,
    classification: bool = False,
    model_type: Type[MODNetModel] = MODNetModel,
    save_folds: bool = False,
    save_models: bool = False,
    hp_optimization: bool = True,
    inner_feat_selection: bool = True,
    use_precomputed_cross_nmi: bool = True,
    presets: Optional[List[dict]] = None,
    fast: bool = False,
    n_jobs: Optional[int] = None,
    nested: bool = False,
    **model_init_kwargs,
) -> dict:
    """Train and cross-validate a model against Matbench data splits, optionally
    performing hyperparameter optimisation.

    Folds are prepared first (split, then optional per-fold feature
    selection) and afterwards trained one after the other with `train_fold`.

    Arguments:
        data: The entire dataset as a `MODData`.
        target: The list of target names to train on.
        target_weights: The target weights to use for the `MODNetModel`.
        fit_settings: Any settings to pass to `model.fit(...)` directly
            (typically when not performing hyperparameter optimisation).
            The passed dictionary is not modified; missing `n_feat` and
            `num_neurons` entries are filled in on an internal copy.
        classification: Whether all tasks are classification rather than regression.
        model_type: The type of the model to create and benchmark.
        save_folds: Whether to save dataframes with pre-processed fold
            data (e.g. feature selection).
        save_models: Whether to pickle all trained models according to
            their fold index and performance.
        hp_optimization: Whether to perform hyperparameter optimisation.
        inner_feat_selection: Whether to perform split-level feature
            selection or try to use pre-computed values. When enabled, the
            training part of fold `k` is cached at `folds/train_moddata_f<k>`
            and an existing file at that path is loaded instead of being
            recomputed.
        use_precomputed_cross_nmi: Whether to use the precomputed cross NMI
            from the Materials Project dataset, or recompute per fold.
        presets: Override the built-in hyperparameter grid with these presets.
        fast: Whether to perform debug training, i.e. reduced presets and epochs.
        n_jobs: Try to parallelize the inner fit_preset over this number of
            processes. Maxes out at number_of_presets*nested_folds
        nested: Whether to perform nested CV for hyperparameter optimisation.
        **model_init_kwargs: Additional arguments to pass to the model on creation.

    Returns:
        A dictionary containing all the results from the training, broken
            down by model and by fold: every key returned by `train_fold`
            maps to a list with one entry per fold.

    """

    # Work on a copy: the defaults filled in below depend on `data`, so
    # writing them into the caller's dict would leak them into later calls
    # that reuse the same settings with a different dataset.
    fit_settings = dict(fit_settings) if fit_settings else {}

    if not fit_settings.get("n_feat"):
        fit_settings["n_feat"] = len(data.df_featurized.columns)
    if not fit_settings.get("num_neurons"):
        # Pass dummy network
        fit_settings["num_neurons"] = [[4], [4], [4], [4]]

    fold_data = []
    for ind, (train, test) in enumerate(matbench_kfold_splits(data)):
        train_data, test_data = data.split((train, test))
        if inner_feat_selection:
            path = "folds/train_moddata_f{}".format(ind + 1)
            if os.path.isfile(path):
                # NOTE(review): the cached fold is trusted as-is; it is not
                # checked against `data`, and `MODData.load` presumably
                # unpickles the file -- only reuse folds you created yourself.
                train_data = MODData.load(path)
            else:
                train_data.feature_selection(
                    n=-1, use_precomputed_cross_nmi=use_precomputed_cross_nmi)
                # Only a freshly computed fold needs to be written; a fold
                # that was just loaded is already on disk at `path`.
                os.makedirs("folds", exist_ok=True)
                train_data.save(path)

        fold_data.append((train_data, test_data))

    model_kwargs = {
        "model_type": model_type,
        "hp_optimization": hp_optimization,
        "fast": fast,
        "classification": classification,
        "save_folds": save_folds,
        "presets": presets,
        "save_models": save_models,
        "nested": nested,
        "n_jobs": n_jobs,
    }
    # Extra model constructor arguments may override the entries above.
    model_kwargs.update(model_init_kwargs)

    # `train_fold` receives each fold as `(index, (train_data, test_data))`.
    fold_results = [
        train_fold(fold, target, target_weights, fit_settings, **model_kwargs)
        for fold in enumerate(fold_data)
    ]

    # Regroup the per-fold results into one list per result key.
    results = defaultdict(list)
    for fold_result in fold_results:
        for key in fold_result:
            results[key].append(fold_result[key])

    return results
Exemple #22
0
    def fit_preset(
        self,
        data: MODData,
        presets: List[Dict[str, Any]] = None,
        val_fraction: float = 0.1,
        verbose: int = 0
    ) -> None:
        """Chooses an optimal hyper-parametered MODNet model from different presets.

        The data is first fitted on several well working MODNet presets
        with a validation set (10% of the furnished data by default). Each
        preset is scored by the mean of its last 20 recorded `val_loss`
        values.

        Sets the `self.model` and `self.n_feat` attributes to those of the
        preset with the lowest validation loss. Presets whose loss is NaN
        are never selected.

        Args:
            data: MODData object contain training and validation samples.
            presets: A list of dictionaries containing custom presets. Each
                preset must provide the keys `n_feat`, `num_neurons`, `act`,
                `lr`, `epochs`, `batch_size` and `loss`. Defaults to
                `modnet.model_presets.MODNET_PRESETS`.
            verbose: The verbosity level to pass to Keras
            val_fraction: The fraction of the data to use for validation.
                Presumably has to be non-zero, since selection reads
                `val_loss` from the training history.

        Raises:
            ValueError: If `presets` is empty.
            RuntimeError: If no preset produced a usable (non-NaN)
                validation loss, so that no model could be selected.

        """

        rlr = keras.callbacks.ReduceLROnPlateau(
            monitor="loss",
            factor=0.5,
            patience=20,
            verbose=verbose,
            mode="auto",
            min_delta=0,
        )
        es = keras.callbacks.EarlyStopping(
            monitor="loss",
            min_delta=0.001,
            patience=300,
            verbose=verbose,
            mode="auto",
            baseline=None,
            restore_best_weights=True,
        )
        callbacks = [rlr, es]

        if presets is None:
            from modnet.model_presets import MODNET_PRESETS
            presets = MODNET_PRESETS

        if len(presets) == 0:
            raise ValueError("At least one preset is required to fit a model.")

        # Unfilled slots keep the large placeholder value.
        val_losses = 1e20 * np.ones((len(presets),))

        # The best candidate is tracked explicitly rather than derived from
        # `val_losses`: a NaN entry in that array would otherwise poison
        # both `min(...)` and `argmin()`.
        best_model = None
        best_n_feat = None
        best_preset = None
        best_val_loss = np.inf

        for i, params in enumerate(presets):
            logging.info("Training preset #{}/{}".format(i + 1, len(presets)))
            # Never ask for more features than the data provides.
            n_feat = min(len(data.get_optimal_descriptors()), params["n_feat"])
            self.model = MODNetModel(
                self.targets,
                self.weights,
                num_neurons=params["num_neurons"],
                n_feat=n_feat,
                act=params["act"],
            ).model
            self.n_feat = n_feat
            self.fit(
                data,
                val_fraction=val_fraction,
                lr=params["lr"],
                epochs=params["epochs"],
                batch_size=params["batch_size"],
                loss=params["loss"],
                callbacks=callbacks,
                verbose=verbose,
            )
            # Average the tail of the curve to smooth out epoch-to-epoch noise.
            val_loss = np.array(self.model.history.history["val_loss"])[-20:].mean()
            val_losses[i] = val_loss

            # A NaN loss compares False here and is therefore skipped; ties
            # keep the earlier preset.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_preset = i
                best_model = self.model
                best_n_feat = n_feat

            logging.info("Validation loss: {:.3f}".format(val_loss))

        if best_model is None:
            raise RuntimeError(
                "None of the presets produced a finite validation loss; "
                "no model could be selected."
            )

        logging.info(
            "Preset #{} resulted in lowest validation loss.\nFitting all data...".format(
                best_preset + 1
            )
        )
        # NOTE(review): only `model` and `n_feat` are restored to the winning
        # preset. Any other state `self.fit` stores on the instance reflects
        # the last preset trained -- confirm that this is acceptable.
        self.n_feat = best_n_feat
        self.model = best_model