Example #1
def get_skorch_regressor():
    X, y = make_regression(100, 5, n_informative=3, random_state=0)
    X = X.astype(np.float32)
    y = y / np.std(y)
    y = y.reshape(-1, 1).astype(np.float32)

    X_df = pd.DataFrame(X, columns=['col' + str(i) for i in range(X.shape[1])])

    class MyModule(nn.Module):
        def __init__(self, input_units=5, num_units=5, nonlin=nn.ReLU()):
            super(MyModule, self).__init__()

            self.dense0 = nn.Linear(input_units, num_units)
            self.nonlin = nonlin
            self.dense1 = nn.Linear(num_units, num_units)
            self.output = nn.Linear(num_units, 1)

        def forward(self, X, **kwargs):
            X = self.nonlin(self.dense0(X))
            X = self.nonlin(self.dense1(X))
            X = self.output(X)
            return X

    model = NeuralNetRegressor(
        MyModule,
        max_epochs=20,
        lr=0.2,
        iterator_train__shuffle=True,
    )

    model.fit(X_df.values, y)
    return model, X_df, y
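# Usage sketch (not part of the original example): skorch estimators follow the
# scikit-learn API, so the fitted regressor returned above can be scored directly.
from sklearn.metrics import mean_squared_error

model, X_df, y = get_skorch_regressor()
y_pred = model.predict(X_df.values)               # float32 features, as fitted above
print("train MSE:", mean_squared_error(y, y_pred))
print("train R^2:", model.score(X_df.values, y))  # R^2 via sklearn's RegressorMixin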
Example #2
def fit_custom_pytorch_module_w_skorch(module, X, y, hyperparams):
    """Fit a custom PyTorch module using Skorch."""

    skorch_net = NeuralNetRegressor(
        module=module,
        optimizer=torch.optim.Adam,
        lr=hyperparams["learning_rate"],
        optimizer__weight_decay=hyperparams["l2_decay"],
        max_epochs=hyperparams["max_epochs"],
        batch_size=hyperparams["batch_size"],
        iterator_train__shuffle=True,
    )

    skorch_net.fit(X, y)
    return skorch_net
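# A hypothetical call of the helper above. TinyNet and the hyperparameter values
# are illustrative assumptions; only the dictionary keys come from the function.
import numpy as np
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(5, 16), nn.ReLU(), nn.Linear(16, 1))

    def forward(self, X):
        return self.net(X)

X = np.random.rand(64, 5).astype(np.float32)
y = np.random.rand(64, 1).astype(np.float32)   # NeuralNetRegressor expects a 2D float target
hyperparams = {"learning_rate": 1e-3, "l2_decay": 1e-4,
               "max_epochs": 10, "batch_size": 16}
net = fit_custom_pytorch_module_w_skorch(TinyNet, X, y, hyperparams)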
Example #3
    def optimize(model):

        logger.info("Checkpoint2")
        X = model.predictor_src  #+ self.predictor_tgt
        y = model.predictor_tgt
        # y = model.config.sentence_level
        print(X)
        print(y)

        #Hyperparameter Tuning with Random Search
        net = NeuralNetRegressor(
            model,
            max_epochs=10,
            lr=0.1,
            # Shuffle training data on each epoch
            iterator_train__shuffle=True,
        )

        net.fit(X, y)
        y_proba = net.predict_proba(X)

        # deactivate skorch-internal train-valid split and verbose logging
        net.set_params(train_split=False, verbose=0)
        params = {
            'epochs': [7],
            'hidden_LSTM': [32, 64, 128],
            'learning_rate_batch': [(32, '1e-3'), (64, '2e-3')],
            'dropout': [0.5],
        }
        gs = RandomizedSearchCV(net,
                                params,
                                refit=False,
                                cv=3,
                                scoring='accuracy',
                                verbose=2)

        gs.fit(X, y)
        print("best score: {:.3f}, best params: {}".format(
            gs.best_score_, gs.best_params_))
        return
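# For the random search above to run, the keys of the parameter grid must be valid
# estimator parameters; with skorch these are names such as lr, max_epochs or
# module__<init-arg>. A minimal self-contained sketch (Stub is a stand-in module,
# not the model used above):
import numpy as np
import torch.nn as nn
from sklearn.model_selection import RandomizedSearchCV
from skorch import NeuralNetRegressor

class Stub(nn.Module):
    def __init__(self, num_units=32):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(10, num_units), nn.ReLU(),
                                 nn.Linear(num_units, 1))

    def forward(self, X):
        return self.net(X)

X_demo = np.random.rand(128, 10).astype(np.float32)
y_demo = np.random.rand(128, 1).astype(np.float32)

net_demo = NeuralNetRegressor(Stub, max_epochs=10, lr=0.1,
                              train_split=None, verbose=0)
param_dist = {
    "lr": [0.01, 0.05, 0.1],
    "max_epochs": [5, 10],
    "module__num_units": [16, 32, 64],
}
search = RandomizedSearchCV(net_demo, param_dist, n_iter=4, cv=3, refit=False,
                            scoring="neg_mean_squared_error", verbose=2)
search.fit(X_demo, y_demo)
print("best score: {:.3f}, best params: {}".format(search.best_score_,
                                                    search.best_params_))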
Example #4
def train_nn_model_validate1(nodes, X_train_scaled, Y_train, max_evals=10):

    #I think a test size of 0.12 is a bit much and leaves a lot of data unused; this setup should work better.
    #X_split_train, X_split_test, Y_split_train, Y_split_test = train_test_split(X_train_scaled, Y_train, test_size=0.12, stratify=Y_train)
    X_split_train, X_split_test, Y_split_train, Y_split_test = train_test_split(
        X_train_scaled, Y_train, test_size=0.14)
    #Because of issues such as weight initialization and dropout, a single network run is not stable enough.
    #The workaround is to repeat the training several times and keep the most reliable model.
    best_rmse = 99999999999.9
    best_model = 0.0
    for j in range(0, max_evals):

        rsg = NeuralNetRegressor(
            lr=nodes["lr"],
            optimizer__weight_decay=nodes["optimizer__weight_decay"],
            criterion=nodes["criterion"],
            batch_size=nodes["batch_size"],
            optimizer__betas=nodes["optimizer__betas"],
            module=create_nn_module(nodes["input_nodes"],
                                    nodes["hidden_layers"],
                                    nodes["hidden_nodes"],
                                    nodes["output_nodes"],
                                    nodes["percentage"]),
            max_epochs=nodes["max_epochs"],
            callbacks=[
                skorch.callbacks.EarlyStopping(patience=nodes["patience"])
            ],
            device=nodes["device"],
            optimizer=nodes["optimizer"])
        init_module(rsg.module, nodes["weight_mode"], nodes["bias"])
        rsg.fit(X_split_train.astype(np.float32),
                Y_split_train.astype(np.float32))

        #Y_pred = rsg.predict(X_split_test.astype(np.float32))
        metric = cal_nnrsg_rmse(rsg, X_split_test, Y_split_test)

        best_model, best_rmse, flag = record_best_model_rmse(
            rsg, metric, best_model, best_rmse)

    return best_model, best_rmse
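# cal_nnrsg_rmse, create_nn_module, init_module and record_best_model_rmse are
# project helpers defined elsewhere; one plausible reading of the RMSE helper
# (an assumption, not the author's code):
import numpy as np

def cal_nnrsg_rmse(rsg, X, y):
    # RMSE of the skorch regressor's predictions on the held-out split
    y_pred = rsg.predict(X.astype(np.float32))
    return float(np.sqrt(np.mean((y_pred - y.reshape(y_pred.shape)) ** 2)))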
Example #5
                             iterator_train__collate_fn=collate_pool,
                             iterator_train__shuffle=True,
                             iterator_valid__pin_memory=True,
                             iterator_valid__num_workers=0,
                             iterator_valid__collate_fn=collate_pool,
                             iterator_valid__shuffle=False,
                             device=device,
                             criterion=torch.nn.L1Loss,
                             dataset=MergeDataset,
                             callbacks=[cp, load_best_valid_loss, LR_schedule])

    # Assign everything to their respective dictionaries
    nets[ads] = net
    cps[ads] = cp
    best_finders[ads] = load_best_valid_loss
    lr_schedulers[ads] = LR_schedule

# Block the data
for ads, net in nets.items():
    _sdts_train = []
    _targets_train = []
    for doc, sdt, target in zip(docs_train, sdts_train, targets_train):
        if doc['adsorbate'] == ads:
            _sdts_train.append(sdt)
            _targets_train.append(target)
    _targets_train = np.array(_targets_train)

    # Fit
    net.initialize()
    net.fit(_sdts_train, _targets_train)
Example #6
class PyTorchModel(BaseModel):
    def build_model(
        self,
        network=MVRegressor,
        device: str = "cpu",
        scale_data: bool = False,
        num_layers: int = 10,
        num_units: int = 50,
        dropout: float = 0.5,
        num_epochs: int = 10,
        batch_size: int = 128,
    ):

        self.scale_data = scale_data
        self.num_layers = num_layers
        self.num_units = num_units
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.batch_size = batch_size

        if not all([hasattr(self, "input_dim"), hasattr(self, "output_dim")]):

            raise ValueError(
                "Please load dataset first to obtain proper sizes")

        if device == "cpu":
            self.device = device
        else:
            use_cuda = torch.cuda.is_available()
            self.device = torch.device("cuda" if use_cuda else "cpu")

        self.model = NeuralNetRegressor(
            network,
            device=self.device,
            module__input_dim=self.input_dim,
            module__output_dim=self.output_dim,
            module__n_layers=self.num_layers,
            module__num_units=self.num_units,
            module__p_dropout=self.dropout,
            max_epochs=self.num_epochs,
            criterion=nn.MSELoss,
            batch_size=self.batch_size,
            # Shuffle training data on each epoch
            iterator_train__shuffle=True,
            callbacks=[
                (
                    "lr_scheduler",
                    LRScheduler(policy=CyclicLR,
                                base_lr=0.001,
                                max_lr=0.01,
                                step_every="batch"),
                ),
            ],
        )

    def fit(self, X, y, **fit_params):

        if self.scale_data:
            X, y = self.scalar(X, y)

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        self.model.fit(X, y, **fit_params)

    def load_model(
        self,
        input_dim: str,
        output_dim: str,
        filename: str,
        scale_data: bool = False,
    ):

        self.scale_data = scale_data
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.build_model(scale_data=scale_data)
        self.model = pickle.load(open(filename, "rb"))

    def predict(self, X):

        if self.scale_data:
            X = self.xscalar.transform(X)
        X = torch.tensor(X).float().to(device=self.device)
        preds = self.model.predict(X)

        if self.scale_data:
            preds = self.yscalar.inverse_transform(preds)

        return preds

    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
        splitting_criteria: str = "timeseries",
        num_splits: int = 5,
        test_indices: list = None,
    ):

        start_dir = str(pathlib.Path(os.getcwd()).parent)
        module_dir = str(pathlib.Path(__file__).parent)
        # temporarily change directory to file directory and then reset
        os.chdir(module_dir)

        if self.scale_data:
            X, y = self.scalar(X, y)

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )

        if splitting_criteria.lower() == "cv":
            cv = None
        elif splitting_criteria.lower() == "timeseries":
            cv = TimeSeriesSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "grouped":
            cv = GroupShuffleSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "fixed":
            if type(test_indices) != list:
                raise ValueError(
                    "fixed split used but no test-indices provided...")
            cv = PredefinedSplit(test_fold=test_indices)
        else:
            raise ValueError(
                f"Unknown splitting criterion provided: {splitting_criteria}, "
                "should be one of [cv, timeseries, grouped, fixed]")

        if search_algorithm.lower() == "bohb":
            early_stopping = True

        if search_algorithm.lower() in ["bohb", "bayesian", "hyperopt", "optuna"]:
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                n_trials=num_trials,
                early_stopping=early_stopping,
                scoring=scoring_func,
            )
        elif search_algorithm == "grid":
            search = GridSearchCV(
                self.model,
                param_grid=params,
                refit=True,
                cv=num_trials,
                scoring=scoring_func,
            )
        elif search_algorithm == "random":
            search = RandomizedSearchCV(
                self.model,
                param_distributions=params,
                refit=True,
                cv=num_trials,
                scoring=scoring_func,
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
            )
        with mlflow.start_run() as run:
            search.fit(X, y)
        self.model = search.best_estimator_

        # set path back to initial
        os.chdir(start_dir)

        results_df = pd.DataFrame(search.cv_results_)
        logger.info(f"Best hyperparams: {search.best_params_}")

        if not pathlib.Path(results_csv_path).parent.exists():
            pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True,
                                                        parents=True)
        logger.info(f"Saving sweeping results to {results_csv_path}")
        logger.info(f"Best score: {search.best_score_}")
        results_df.to_csv(results_csv_path)
        cols_keep = [col for col in results_df if "param_" in col]
        cols_keep += ["mean_test_score"]

        results_df = results_df[cols_keep]

        return results_df
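# A hypothetical sweep over the estimator built above, assuming a PyTorchModel
# instance `model` whose dataset has already been loaded. The parameter names
# follow skorch's double-underscore convention and the module__* arguments set
# in build_model; the value ranges are placeholders, not recommendations.
params = {
    "lr": [1e-3, 1e-2],
    "max_epochs": [10, 20],
    "module__num_units": [25, 50, 100],
    "module__p_dropout": [0.25, 0.5],
}
# results_df = model.sweep(params, X, y, search_algorithm="random", num_trials=3)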
Example #7
device = 'cpu'
# device = 'cuda:0'

net = NeuralNetRegressor(
    module=FullNN(unique_atoms, [fp_length, 5, 5],
                  device,
                  forcetraining=forcetraining),
    criterion=CustomLoss,
    criterion__force_coefficient=0.3,
    optimizer=torch.optim.LBFGS,
    lr=1,
    batch_size=400,
    max_epochs=50,
    iterator_train__collate_fn=collate_amp,
    iterator_valid__collate_fn=collate_amp,
    device=device,
    train_split=None,
    callbacks=[
        EpochScoring(forces_score,
                     on_train=True,
                     use_caching=True,
                     target_extractor=target_extractor),
        EpochScoring(energy_score,
                     on_train=True,
                     use_caching=True,
                     target_extractor=target_extractor)
    ],
)

net.fit(data, None)
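# EpochScoring, as used above, logs an extra metric at the end of every epoch.
# A minimal, generic sketch with a built-in sklearn scorer (unrelated to the
# force/energy scorers of the example above):
import numpy as np
import torch.nn as nn
from skorch import NeuralNetRegressor
from skorch.callbacks import EpochScoring

X_demo = np.random.rand(200, 10).astype(np.float32)
y_demo = np.random.rand(200, 1).astype(np.float32)

net_demo = NeuralNetRegressor(
    nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 1)),
    max_epochs=5,
    lr=0.05,
    # record negative MAE on the training data after every epoch
    callbacks=[EpochScoring('neg_mean_absolute_error', on_train=True,
                            name='train_mae')],
)
net_demo.fit(X_demo, y_demo)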
Example #8
        early = EarlyStopping(patience=args.patience, threshold=args.threshold)

        #Wrapping the model in NeuralNetRegressor to configure the training parameters
        net = NeuralNetRegressor(model,
                                 max_epochs=args.epochs,
                                 lr=args.lr,
                                 batch_size=args.batch_size,
                                 optimizer__momentum=args.momentum,
                                 iterator_train__shuffle=False,
                                 iterator_valid__shuffle=False
                                 #callbacks=[early]
                                 )

        start_training = time.time()
        net.fit(X_train, y_train)

        #saving the training time
        b = open("train_temps.txt", "a+")
        b.write("Iteration: " + str(number) + '\n')
        b.write("Lenght X: " + str(len(X)) + '\n')
        b.write("Lenght X train: " + str(len(X_train)) + '\n')
        b.write("Lenght X test: " + str(len(X_test)) + '\n')
        b.write(" Time to train: " +
                str(secs2hours(time.time() - start_training)) + '\n')
        b.write(" Time to train: " + str(time.time() - start_training) + '\n')
        b.close()

        # visualize the loss as the network trained
        # plotting training and validation loss
        epochs = [i for i in range(len(net.history))]
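        # A sketch of the plotting referred to above (not in the original source);
        # train_loss and valid_loss are skorch's default history columns.
        import matplotlib.pyplot as plt

        train_loss = net.history[:, 'train_loss']
        valid_loss = net.history[:, 'valid_loss']
        plt.plot(epochs, train_loss, label='train loss')
        plt.plot(epochs, valid_loss, label='valid loss')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.legend()
        plt.savefig('loss_iteration_' + str(number) + '.png')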
Example #9
                         optimizer__momentum=0.9,
                         optimizer__weight_decay=0.001,
                         iterator_train__shuffle=True,
                         iterator_train__num_workers=10,
                         iterator_valid__shuffle=True,
                         iterator_valid__num_workers=10,
                         train_split=predefined_split(valid0df),
                         device='cuda:0,1,6')

rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4))

print("Fitting")
net.fit(train0df, y=None)
print("Fit completed")
history = net.history
train_loss0 = history[:, 'train_loss']
valid_loss0 = history[:, 'valid_loss']
ax1.plot(train_loss0)
ax1.plot(valid_loss0)
ax1.legend(['train_loss', 'valid_loss'])

net.save_params(f_params='dcs0_0005.pkl',
                f_optimizer='dcs0_0005_optimizer.pkl',
                f_history='dcs0_0005_history.json')

pred = net.predict_proba(valid0)
label = valid0.get_label()
accuracy = concordance_index(pred, label)
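# predefined_split, as used above, pins the validation fold to a fixed dataset.
# A generic sketch with skorch's Dataset wrapper (all names below are illustrative):
import numpy as np
import torch.nn as nn
from skorch import NeuralNetRegressor
from skorch.dataset import Dataset
from skorch.helper import predefined_split

X_demo = np.random.rand(100, 8).astype(np.float32)
y_demo = np.random.rand(100, 1).astype(np.float32)
valid_ds = Dataset(X_demo[:20], y_demo[:20])        # fixed validation fold

net_demo = NeuralNetRegressor(
    nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1)),
    max_epochs=5,
    lr=0.05,
    train_split=predefined_split(valid_ds),          # always validate on valid_ds
)
net_demo.fit(X_demo[20:], y_demo[20:])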
Example #10
        out = self.l1(out)
        out = self.fc2(out)
        return out


net_regr = NeuralNetRegressor(
    Net(hidden_size=500),
    max_epochs=5000,
    lr=0.01,
    device='cuda',
    optimizer=torch.optim.Adam,
    train_split=None,
    verbose=1,
)

res = net_regr.fit(t_d_inp, t_d_oup)
# save
net_regr.save_params(f_params='step1result')

pred = net_regr.predict(test_inp)
mse = ((test_oup - pred)**2).mean()
print('test error = ' + str(mse))
# plot 1 loss
loss = net_regr.history[:, 'train_loss']
plt.figure()
plt.plot(loss)
plt.ylabel('loss')
plt.ylim([0, loss[-1] * 4])
# plot 2
plt.figure()
s = 3
    max_epochs=10,
    lr=0.1,
    verbose=1,
)

# =================Split the dataset using GroupShuffleSplit===================
gss = GroupShuffleSplit(n_splits=10, test_size=0.15, random_state=42)
gss.get_n_splits(X_regr, y_regr, groups=y)

for train_index, test_index in (gss.split(X_regr, y_regr, groups=y)):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_regr_train, X_regr_test = X_regr[train_index], X_regr[test_index]
    y_regr_train, y_regr_test = y_regr[train_index], y_regr[test_index]

    # ==============================Train the model================================
    net_regr.fit(X_regr_train, y_regr_train)

    # ================Test/Validate the model via 10fold crossValidation===========
    y_regr_pred = cross_val_predict(net_regr, X_regr_test, y_regr_test, cv=10)

    # ============================ Evaluate the model =============================
    score_regression = r2_score(y_regr_test, y_regr_pred)
    score_r2 = pearsonr(y_regr_test.flatten(), y_regr_pred.flatten())[0]**2
    rmse = sqrt(mean_squared_error(y_regr_test, y_regr_pred))
    #score_regression=mean_squared_error(y_regr_pred, y_regr_test)
    accuracy_regression.append(score_regression)
    accuracy_score_r2.append(score_r2)
    accuracy_rmse.append(rmse)

print(accuracy_regression)
print(score_r2)
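# A short follow-up sketch (not in the original) that averages the per-fold
# metrics collected above:
import numpy as np

print("mean R^2 (sklearn): {:.3f}".format(np.mean(accuracy_regression)))
print("mean R^2 (Pearson): {:.3f}".format(np.mean(accuracy_score_r2)))
print("mean RMSE: {:.3f}".format(np.mean(accuracy_rmse)))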
                         iterator_valid__num_workers=0,
                         iterator_valid__collate_fn=collate_pool,
                         iterator_valid__shuffle=False,
                         device=device,
                         criterion=torch.nn.L1Loss,
                         dataset=MergeDataset,
                         callbacks=[cp, load_best_valid_loss, LR_schedule])

# # Training

# We can train a new model...

# In[ ]:

net.initialize()
net.fit(sdts_train, targets_train)

# ...or load whatever is cached

# In[ ]:

net.initialize()
net.load_params(f_history='valid_best_history.json',
                f_optimizer='valid_best_optimizer.pt',
                f_params='valid_best_params.pt')

# # Assess performance

# In[ ]:

import numpy as np
    ):
        super(RegressorModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(20, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 1)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        X = self.output(X)
        return X


net_regr = NeuralNetRegressor(
    RegressorModule,
    max_epochs=20,
    lr=0.1,
    device='cuda',
)

net_regr.fit(X_regr, y_regr)

y_pred = net_regr.predict(X_regr[:5])
y_pred

a, b = net_regr.train_split(X_regr)
net_regr = NeuralNetRegressor(
    Net(hidden_size=500,
        input_size=x_train.shape[1],
        output_size=y_train.shape[1]),
    criterion=torch.nn.MSELoss,
    optimizer=torch.optim.Adam,
    max_epochs=5000,
    lr=0.001,
    device='cuda',
    train_split=None,
    verbose=1,
    batch_size=-1,
)

res = net_regr.fit(x_train, y_train)
# save
net_regr.save_params(f_params='step1result')

#model = Net(input_size, hidden_size, output_size, dropout_rate)
## print model summary
#nodata = np.prod(x_train.shape)
#noparas = sum([param.nelement() for param in model.parameters()])
#print("Total number of data elements:"+str(nodata))
#print("Total number of parameters   :"+str(noparas))
#for name, param in model.named_parameters():
#    print(name, "\t", param.nelement(), "\t\t", param.data.shape)
#if noparas>nodata:
#    print("Use too much neurons!!!")
#else:
#    print("Network is OK!")
                             module__n_h=4,
                             optimizer=Adam,
                             iterator_train__pin_memory=True,
                             iterator_train__num_workers=0,
                             iterator_train__collate_fn=collate_pool,
                             iterator_train__shuffle=True,
                             iterator_valid__pin_memory=True,
                             iterator_valid__num_workers=0,
                             iterator_valid__collate_fn=collate_pool,
                             iterator_valid__shuffle=False,
                             device=device,
                             criterion=torch.nn.L1Loss,
                             dataset=MergeDataset,
                             callbacks=[cp, load_best_valid_loss, LR_schedule])
    net.initialize()
    net.fit(stds_train_, targets_train_)
    nets.append(net)

# # Loading models
# It takes a few hours to fit the 5-model ensemble. You can either do it via notebook (above) or via `sbatch submit_ensemble_fitting.sh`. Either way, you load the results here.

# In[5]:

import numpy as np
from sklearn.model_selection import KFold
from torch.optim import Adam
import skorch.callbacks.base
from skorch.callbacks import Checkpoint  # needs skorch >= 0.4
from skorch.callbacks.lr_scheduler import LRScheduler
from skorch import NeuralNetRegressor
from cgcnn.dropoutmodel10 import CrystalGraphConvNet
class PyTorchModel(BaseModel):
    def build_model(
        self,
        network=MVRegressor,
        device: str = "cpu",
        scale_data: bool = False,
        num_layers: int = 10,
        num_units: int = 50,
        dropout: float = 0.5,
        num_epochs: int = 10,
        batch_size: int = 128,
    ):

        self.scale_data = scale_data
        self.num_layers = num_layers
        self.num_units = num_units
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.batch_size = batch_size

        if not all([hasattr(self, "input_dim"), hasattr(self, "output_dim")]):

            raise ValueError("Please load dataset first to obtain proper sizes")

        if device == "cpu":
            self.device = device
        else:
            use_cuda = torch.cuda.is_available()
            self.device = torch.device("cuda" if use_cuda else "cpu")

        self.model = NeuralNetRegressor(
            network,
            device=self.device,
            module__input_dim=self.input_dim,
            module__output_dim=self.output_dim,
            module__n_layers=self.num_layers,
            module__num_units=self.num_units,
            module__p_dropout=self.dropout,
            max_epochs=self.num_epochs,
            criterion=nn.MSELoss,
            batch_size=self.batch_size,
            # Shuffle training data on each epoch
            iterator_train__shuffle=True,
            callbacks=[
                (
                    "lr_scheduler",
                    LRScheduler(
                        policy=CyclicLR, base_lr=0.001, max_lr=0.01, step_every="batch"
                    ),
                ),
            ],
        )

    def fit(self, X, y, **fit_params):

        if self.scale_data:
            X, y = self.scalar(X, y)

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        self.model.fit(X, y, **fit_params)

    def load_model(
        self, input_dim: str, output_dim: str, filename: str, scale_data: bool = False,
    ):

        self.scale_data = scale_data
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.build_model()
        self.model = pickle.load(open(filename, "rb"))

    def predict(self, X):

        if self.scale_data:
            X = self.xscalar.transform(X)
        X = torch.tensor(X).float().to(device=self.device)
        preds = self.model.predict(X)

        if self.scale_data:
            preds = self.yscalar.inverse_transform(preds)

        return preds

    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
    ):

        from tune_sklearn import TuneGridSearchCV, TuneSearchCV

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        tune_search = TuneSearchCV(
            self.model,
            params,
            search_optimization=search_algorithm,
            n_trials=num_trials,
            early_stopping=True,
            scoring=scoring_func,
        )
        tune_search.fit(X, y)

        return tune_search
Example #17
TRAIN_DIR = os.path.join(os.path.abspath("."), "..", "cropped_data")
cp = Checkpoint(dirname='segnet_mse_no_sigmoid_sgd_150ep_b8_lr_0.01_30enc/checkpoints')
train_end_cp = TrainEndCheckpoint(dirname='segnet_mse_no_sigmoid_sgd_150ep_b8_lr_0.01_30enc/checkpoints')
load_state = LoadInitState(checkpoint=cp)
net = NeuralNetRegressor(
    SegNet,
    module__encoding_size=30,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    max_epochs=150,
    batch_size=8,
    criterion=MSELoss,
    lr=0.01,
    iterator_train__shuffle=True,
    optimizer=torch.optim.SGD,
    optimizer__momentum=.9,
    callbacks=[cp, train_end_cp, load_state]
)

if __name__ == '__main__':
    mean = np.array([0.5020, 0.4690, 0.4199])
    std = np.array([0.2052, 0.2005, 0.1966])
    aug_tran = transforms.Compose([
        transforms.Resize(SIZE, interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)
    ])

    dataset = AutoEncoderImageDataset(TRAIN_DIR, transform=aug_tran)

    net.fit(dataset, y=None)
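    # Follow-up sketch (not in the original): skorch's predict accepts the same
    # dataset object and returns the decoder outputs as a numpy array.
    reconstructions = net.predict(dataset)
    print(reconstructions.shape)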
Example #18
class AtomsTrainer:
    def __init__(self, config):
        self.config = config
        self.pretrained = False

    def load(self):
        self.load_config()
        self.load_rng_seed()
        self.load_dataset()
        self.load_model()
        self.load_criterion()
        self.load_optimizer()
        self.load_logger()
        self.load_extras()
        self.load_skorch()

    def load_config(self):
        self.timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        self.identifier = self.config["cmd"].get("identifier", False)
        if self.identifier:
            self.identifier = self.timestamp + "-{}".format(self.identifier)
        else:
            self.identifier = self.timestamp

        self.device = torch.device(self.config["optim"].get("device", "cpu"))
        self.debug = self.config["cmd"].get("debug", False)
        run_dir = self.config["cmd"].get("run_dir", "./")
        os.chdir(run_dir)
        if not self.debug:
            self.cp_dir = os.path.join(run_dir, "checkpoints", self.identifier)
            print(f"Results saved to {self.cp_dir}")
            os.makedirs(self.cp_dir, exist_ok=True)

    def load_rng_seed(self):
        seed = self.config["cmd"].get("seed", 0)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def get_unique_elements(self, training_images):
        elements = np.array(
            [atom.symbol for atoms in training_images for atom in atoms])
        elements = np.unique(elements)
        return elements

    def load_dataset(self):
        training_images = self.config["dataset"]["raw_data"]
        # TODO: Scalability when dataset too large to fit into memory
        if isinstance(training_images, str):
            training_images = ase.io.read(training_images, ":")
        self.elements = self.config["dataset"].get(
            "elements", self.get_unique_elements(training_images))

        self.forcetraining = self.config["model"].get("get_forces", True)
        self.fp_scheme = self.config["dataset"].get("fp_scheme",
                                                    "gaussian").lower()
        self.fp_params = self.config["dataset"]["fp_params"]
        self.cutoff_params = self.config["dataset"].get(
            "cutoff_params", {"cutoff_func": "Cosine"})

        self.train_dataset = AtomsDataset(
            images=training_images,
            descriptor_setup=(
                self.fp_scheme,
                self.fp_params,
                self.cutoff_params,
                self.elements,
            ),
            forcetraining=self.forcetraining,
            save_fps=self.config["dataset"].get("save_fps", True),
        )

        self.target_scaler = self.train_dataset.target_scaler
        if not self.debug:
            normalizers = {"target": self.target_scaler}
            torch.save(normalizers, os.path.join(self.cp_dir,
                                                 "normalizers.pt"))
        self.input_dim = self.train_dataset.input_dim
        self.val_split = self.config["dataset"].get("val_split", 0)
        print("Loading dataset: {} images".format(len(self.train_dataset)))

    def load_model(self):
        elements = list_symbols_to_indices(self.elements)
        self.model = BPNN(elements=elements,
                          input_dim=self.input_dim,
                          **self.config["model"])
        print("Loading model: {} parameters".format(self.model.num_params))

    def load_extras(self):
        callbacks = []
        load_best_loss = train_end_load_best_loss(self.identifier)
        self.split = CVSplit(cv=self.val_split) if self.val_split != 0 else 0

        metrics = evaluator(
            self.val_split,
            self.config["optim"].get("metric", "mae"),
            self.identifier,
            self.forcetraining,
        )
        callbacks.extend(metrics)

        if not self.debug:
            callbacks.append(load_best_loss)
        scheduler = self.config["optim"].get("scheduler", None)
        if scheduler:
            scheduler = LRScheduler(scheduler,
                                    **self.config["optim"]["scheduler_params"])
            callbacks.append(scheduler)
        if self.config["cmd"].get("logger", False):
            from skorch.callbacks import WandbLogger

            callbacks.append(
                WandbLogger(
                    self.wandb_run,
                    save_model=False,
                    keys_ignored="dur",
                ))
        self.callbacks = callbacks

    def load_criterion(self):
        self.criterion = self.config["optim"].get("loss_fn", CustomLoss)

    def load_optimizer(self):
        self.optimizer = self.config["optim"].get("optimizer",
                                                  torch.optim.Adam)

    def load_logger(self):
        if self.config["cmd"].get("logger", False):
            import wandb

            self.wandb_run = wandb.init(
                name=self.identifier,
                config=self.config,
                id=self.timestamp,
            )

    def load_skorch(self):
        skorch.net.to_tensor = to_tensor

        collate_fn = DataCollater(train=True, forcetraining=self.forcetraining)

        self.net = NeuralNetRegressor(
            module=self.model,
            criterion=self.criterion,
            criterion__force_coefficient=self.config["optim"].get(
                "force_coefficient", 0),
            criterion__loss=self.config["optim"].get("loss", "mse"),
            optimizer=self.optimizer,
            lr=self.config["optim"].get("lr", 1e-1),
            batch_size=self.config["optim"].get("batch_size", 32),
            max_epochs=self.config["optim"].get("epochs", 100),
            iterator_train__collate_fn=collate_fn,
            iterator_train__shuffle=True,
            iterator_valid__collate_fn=collate_fn,
            iterator_valid__shuffle=False,
            device=self.device,
            train_split=self.split,
            callbacks=self.callbacks,
            verbose=self.config["cmd"].get("verbose", True),
        )
        print("Loading skorch trainer")

    def train(self, raw_data=None):
        if raw_data is not None:
            self.config["dataset"]["raw_data"] = raw_data
        if not self.pretrained:
            self.load()

        self.net.fit(self.train_dataset, None)

    def predict(self, images, batch_size=32):
        if len(images) < 1:
            warnings.warn("No images found!", stacklevel=2)
            return images

        a2d = AtomsToData(
            descriptor=self.train_dataset.descriptor,
            r_energy=False,
            r_forces=False,
            save_fps=True,
            fprimes=self.forcetraining,
            cores=1,
        )

        data_list = a2d.convert_all(images, disable_tqdm=True)

        self.net.module.eval()
        collate_fn = DataCollater(train=False,
                                  forcetraining=self.forcetraining)

        predictions = {"energy": [], "forces": []}
        for data in data_list:
            collated = collate_fn([data])
            energy, forces = self.net.module(collated)

            energy = self.target_scaler.denorm(
                energy, pred="energy").detach().tolist()
            forces = self.target_scaler.denorm(forces,
                                               pred="forces").detach().numpy()

            predictions["energy"].extend(energy)
            predictions["forces"].append(forces)

        return predictions

    def load_pretrained(self, checkpoint_path=None):
        print(f"Loading checkpoint from {checkpoint_path}")
        self.load()
        self.net.initialize()
        self.pretrained = True
        try:
            self.net.load_params(
                f_params=os.path.join(checkpoint_path, "params.pt"),
                f_optimizer=os.path.join(checkpoint_path, "optimizer.pt"),
                f_criterion=os.path.join(checkpoint_path, "criterion.pt"),
                f_history=os.path.join(checkpoint_path, "history.json"),
            )
            # TODO(mshuaibi): remove dataset load, use saved normalizers
        except NotImplementedError:
            print("Unable to load checkpoint!")
Example #19
class AtomsTrainer:
    def __init__(self, config={}):
        self.config = config
        self.pretrained = False

    def load(self, load_dataset=True):
        self.load_config()
        self.load_rng_seed()
        if load_dataset:
            self.load_dataset()
        self.load_model()
        self.load_criterion()
        self.load_optimizer()
        self.load_logger()
        self.load_extras()
        self.load_skorch()

    def load_config(self):
        dtype = self.config["cmd"].get("dtype", torch.FloatTensor)
        torch.set_default_tensor_type(dtype)
        self.timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        self.identifier = self.config["cmd"].get("identifier", False)
        if self.identifier:
            self.identifier = self.timestamp + "-{}".format(self.identifier)
        else:
            self.identifier = self.timestamp

        self.gpus = self.config["optim"].get("gpus", 0)
        if self.gpus > 0:
            self.output_device = 0
            self.device = f"cuda:{self.output_device}"
        else:
            self.device = "cpu"
            self.output_device = -1
        self.debug = self.config["cmd"].get("debug", False)
        run_dir = self.config["cmd"].get("run_dir", "./")
        os.chdir(run_dir)
        if not self.debug:
            self.cp_dir = os.path.join(run_dir, "checkpoints", self.identifier)
            print(f"Results saved to {self.cp_dir}")
            os.makedirs(self.cp_dir, exist_ok=True)

    def load_rng_seed(self):
        seed = self.config["cmd"].get("seed", 0)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def get_unique_elements(self, training_images):
        elements = np.array(
            [atom.symbol for atoms in training_images for atom in atoms]
        )
        elements = np.unique(elements)
        return elements

    def load_dataset(self):
        training_images = self.config["dataset"]["raw_data"]
        # TODO: Scalability when dataset too large to fit into memory
        if isinstance(training_images, str):
            training_images = ase.io.read(training_images, ":")
        del self.config["dataset"]["raw_data"]

        self.elements = self.config["dataset"].get(
            "elements", self.get_unique_elements(training_images)
        )

        self.forcetraining = self.config["model"].get("get_forces", True)
        self.fp_scheme = self.config["dataset"].get("fp_scheme", "gaussian").lower()
        self.fp_params = self.config["dataset"]["fp_params"]
        self.save_fps = self.config["dataset"].get("save_fps", True)
        self.cutoff_params = self.config["dataset"].get(
            "cutoff_params", {"cutoff_func": "Cosine"}
        )
        descriptor_setup = (
            self.fp_scheme,
            self.fp_params,
            self.cutoff_params,
            self.elements,
        )
        self.train_dataset = AtomsDataset(
            images=training_images,
            descriptor_setup=descriptor_setup,
            forcetraining=self.forcetraining,
            save_fps=self.config["dataset"].get("save_fps", True),
            scaling=self.config["dataset"].get(
                "scaling", {"type": "normalize", "range": (0, 1)}
            ),
        )
        self.feature_scaler = self.train_dataset.feature_scaler
        self.target_scaler = self.train_dataset.target_scaler
        self.input_dim = self.train_dataset.input_dim
        self.val_split = self.config["dataset"].get("val_split", 0)
        self.config["dataset"]["descriptor"] = descriptor_setup
        if not self.debug:
            normalizers = {
                "target": self.target_scaler,
                "feature": self.feature_scaler,
            }
            torch.save(normalizers, os.path.join(self.cp_dir, "normalizers.pt"))
            # clean/organize config
            self.config["dataset"]["fp_length"] = self.input_dim
            torch.save(self.config, os.path.join(self.cp_dir, "config.pt"))
        print("Loading dataset: {} images".format(len(self.train_dataset)))

    def load_model(self):
        elements = list_symbols_to_indices(self.elements)
        self.model = BPNN(
            elements=elements, input_dim=self.input_dim, **self.config["model"]
        )
        print("Loading model: {} parameters".format(self.model.num_params))
        self.forcetraining = self.config["model"].get("get_forces", True)
        collate_fn = DataCollater(train=True, forcetraining=self.forcetraining)
        self.parallel_collater = ParallelCollater(self.gpus, collate_fn)
        if self.gpus > 0:
            self.model = DataParallel(
                self.model,
                output_device=self.output_device,
                num_gpus=self.gpus,
            )

    def load_extras(self):
        callbacks = []
        load_best_loss = train_end_load_best_loss(self.identifier)
        self.val_split = self.config["dataset"].get("val_split", 0)
        self.split = CVSplit(cv=self.val_split) if self.val_split != 0 else 0

        metrics = evaluator(
            self.val_split,
            self.config["optim"].get("metric", "mae"),
            self.identifier,
            self.forcetraining,
        )
        callbacks.extend(metrics)

        if not self.debug:
            callbacks.append(load_best_loss)
        scheduler = self.config["optim"].get("scheduler", None)
        if scheduler:
            scheduler = LRScheduler(scheduler["policy"], **scheduler["params"])
            callbacks.append(scheduler)
        if self.config["cmd"].get("logger", False):
            from skorch.callbacks import WandbLogger

            callbacks.append(
                WandbLogger(
                    self.wandb_run,
                    save_model=False,
                    keys_ignored="dur",
                )
            )
        self.callbacks = callbacks

    def load_criterion(self):
        self.criterion = self.config["optim"].get("loss_fn", CustomLoss)

    def load_optimizer(self):
        self.optimizer = {
            "optimizer": self.config["optim"].get("optimizer", torch.optim.Adam)
        }
        optimizer_args = self.config["optim"].get("optimizer_args", False)
        if optimizer_args:
            self.optimizer.update(optimizer_args)

    def load_logger(self):
        if self.config["cmd"].get("logger", False):
            import wandb

            self.wandb_run = wandb.init(
                name=self.identifier,
                config=self.config,
            )

    def load_skorch(self):
        skorch.net.to_tensor = to_tensor

        self.net = NeuralNetRegressor(
            module=self.model,
            criterion=self.criterion,
            criterion__force_coefficient=self.config["optim"].get(
                "force_coefficient", 0
            ),
            criterion__loss=self.config["optim"].get("loss", "mse"),
            lr=self.config["optim"].get("lr", 1e-1),
            batch_size=self.config["optim"].get("batch_size", 32),
            max_epochs=self.config["optim"].get("epochs", 100),
            iterator_train__collate_fn=self.parallel_collater,
            iterator_train__shuffle=True,
            iterator_train__pin_memory=True,
            iterator_valid__collate_fn=self.parallel_collater,
            iterator_valid__shuffle=False,
            iterator_valid__pin_memory=True,
            device=self.device,
            train_split=self.split,
            callbacks=self.callbacks,
            verbose=self.config["cmd"].get("verbose", True),
            **self.optimizer,
        )
        print("Loading skorch trainer")

    def train(self, raw_data=None):
        if raw_data is not None:
            self.config["dataset"]["raw_data"] = raw_data
        if not self.pretrained:
            self.load()

        stime = time.time()
        self.net.fit(self.train_dataset, None)
        elapsed_time = time.time() - stime
        print(f"Training completed in {elapsed_time}s")

    def predict(self, images, disable_tqdm=True):
        if len(images) < 1:
            warnings.warn("No images found!", stacklevel=2)
            return images

        self.descriptor = construct_descriptor(self.config["dataset"]["descriptor"])

        a2d = AtomsToData(
            descriptor=self.descriptor,
            r_energy=False,
            r_forces=False,
            save_fps=self.config["dataset"].get("save_fps", True),
            fprimes=self.forcetraining,
            cores=1,
        )

        data_list = a2d.convert_all(images, disable_tqdm=disable_tqdm)
        self.feature_scaler.norm(data_list, disable_tqdm=disable_tqdm)

        self.net.module.eval()
        collate_fn = DataCollater(train=False, forcetraining=self.forcetraining)

        predictions = {"energy": [], "forces": []}
        for data in data_list:
            collated = collate_fn([data]).to(self.device)
            energy, forces = self.net.module([collated])

            energy = self.target_scaler.denorm(
                energy.detach().cpu(), pred="energy"
            ).tolist()
            forces = self.target_scaler.denorm(
                forces.detach().cpu(), pred="forces"
            ).numpy()

            predictions["energy"].extend(energy)
            predictions["forces"].append(forces)

        return predictions

    def load_pretrained(self, checkpoint_path=None, gpu2cpu=False):
        """
        Args:
            checkpoint_path: str, Path to checkpoint directory
            gpu2cpu: bool, True if checkpoint was trained with GPUs and you
            wish to load on cpu instead.
        """

        self.pretrained = True
        print(f"Loading checkpoint from {checkpoint_path}")
        assert os.path.isdir(
            checkpoint_path
        ), f"Checkpoint: {checkpoint_path} not found!"
        if not self.config:
            # prediction only
            self.config = torch.load(os.path.join(checkpoint_path, "config.pt"))
            self.config["cmd"]["debug"] = True
            self.elements = self.config["dataset"]["descriptor"][-1]
            self.input_dim = self.config["dataset"]["fp_length"]
            if gpu2cpu:
                self.config["optim"]["gpus"] = 0
            self.load(load_dataset=False)
        else:
            # prediction+retraining
            self.load(load_dataset=True)
        self.net.initialize()

        if gpu2cpu:
            params_path = os.path.join(checkpoint_path, "params_cpu.pt")
            if not os.path.exists(params_path):
                params = torch.load(
                    os.path.join(checkpoint_path, "params.pt"),
                    map_location=torch.device("cpu"),
                )
                new_dict = OrderedDict()
                for k, v in params.items():
                    name = k[7:]
                    new_dict[name] = v
                torch.save(new_dict, params_path)
        else:
            params_path = os.path.join(checkpoint_path, "params.pt")

        try:
            self.net.load_params(
                f_params=params_path,
                f_optimizer=os.path.join(checkpoint_path, "optimizer.pt"),
                f_criterion=os.path.join(checkpoint_path, "criterion.pt"),
                f_history=os.path.join(checkpoint_path, "history.json"),
            )
            normalizers = torch.load(os.path.join(checkpoint_path, "normalizers.pt"))
            self.feature_scaler = normalizers["feature"]
            self.target_scaler = normalizers["target"]
        except NotImplementedError:
            print("Unable to load checkpoint!")

    def get_calc(self):
        return AMPtorch(self)