Exemple #1
0
    def test_prepare_folded_data_from_file(self):
        """
        Check the fold layout returned by `prepare_folded_data_from_file`.

        The bundled `test_data/datafile.csv` is split into 5 folds with
        `testing_ratio=0.0`; the test expects an empty test set, 5 folds
        of inputs and targets, and 2 entries in the first fold of each.
        """
        n_folds = 5
        package_dir = os.path.dirname(dde.__file__)
        csv_path = os.path.join(package_dir, 'test_data', 'datafile.csv')

        # Every optional featurization switch is turned on for this test
        featurization = dict(add_extra_atom_attribute=True,
                             add_extra_bond_attribute=True,
                             differentiate_atom_type=True,
                             differentiate_bond_type=True)

        X_test, y_test, folded_Xs, folded_ys = prepare_folded_data_from_file(
            csv_path, n_folds, testing_ratio=0.0, **featurization)

        # One entry per requested fold
        for folded in (folded_Xs, folded_ys):
            self.assertEqual(len(folded), n_folds)

        # Nothing is held out when the testing ratio is zero
        for held_out in (X_test, y_test):
            self.assertEqual(len(held_out), 0)

        # Size of the first fold
        for folded in (folded_Xs, folded_ys):
            self.assertEqual(len(folded[0]), 2)
    def kfcv_train(self,
                   folds,
                   lr_func,
                   save_model_path,
                   pretrained_weights=None,
                   batch_size=1,
                   nb_epoch=150,
                   patience=10,
                   training_ratio=0.9,
                   testing_ratio=0.0):
        """
        Train the model with k-fold cross-validation and write loss reports.

        For every fold the data are split with `prepare_data_one_fold`,
        the model is trained with `train_model`, RMSE/MAE are computed with
        `self.evaluate`, and a per-fold report is saved through
        `self.save_model`. After all folds, the fold-averaged metrics are
        written to 'full_folds_loss_report.txt' under `save_model_path`.

        Args:
            folds: Number of folds; also forwarded to the data preparation.
            lr_func: Forwarded to `train_model` as `lr_func`.
            save_model_path: Forwarded to `train_model`; also the base path
                for the per-fold outputs ('fold_<i>') and the full report.
            pretrained_weights: If not None, passed to
                `self.load_parameters` after each fold; otherwise
                `self.reset_model()` is called instead.
            batch_size: Forwarded to `train_model`.
            nb_epoch: Forwarded to `train_model`.
            patience: Forwarded to `train_model`.
            training_ratio: Forwarded to `prepare_data_one_fold`.
            testing_ratio: Forwarded to `prepare_folded_data_from_file`;
                not used when the data come from `self.datasets`.
        """
        # prepare data for training
        if self.get_data_from_file:
            folded_data = prepare_folded_data_from_file(
                self.data_file,
                folds,
                add_extra_atom_attribute=self.add_extra_atom_attribute,
                add_extra_bond_attribute=self.add_extra_bond_attribute,
                differentiate_atom_type=self.differentiate_atom_type,
                differentiate_bond_type=self.differentiate_bond_type,
                padding=self.padding,
                padding_final_size=self.padding_final_size,
                save_tensors_dir=self.save_tensors_dir,
                testing_ratio=testing_ratio)
        else:
            folded_data = prepare_folded_data_from_multiple_datasets(
                self.datasets,
                folds,
                add_extra_atom_attribute=self.add_extra_atom_attribute,
                add_extra_bond_attribute=self.add_extra_bond_attribute,
                differentiate_atom_type=self.differentiate_atom_type,
                differentiate_bond_type=self.differentiate_bond_type,
                padding=self.padding,
                padding_final_size=self.padding_final_size,
                prediction_task=self.prediction_task,
                save_tensors_dir=self.save_tensors_dir)

        X_test, y_test, folded_Xs, folded_ys = folded_data

        # train_model is told to load from disk whenever a tensor
        # directory is configured
        load_from_disk = self.save_tensors_dir is not None

        losses, inner_val_losses, outer_val_losses, test_losses = [], [], [], []
        train_rmses, inner_val_rmses, outer_val_rmses, test_rmses = [], [], [], []
        train_maes, inner_val_maes, outer_val_maes, test_maes = [], [], [], []
        for fold in range(folds):
            data = prepare_data_one_fold(folded_Xs,
                                         folded_ys,
                                         current_fold=fold,
                                         training_ratio=training_ratio)

            # execute train_model
            X_train, X_inner_val, X_outer_val, y_train, y_inner_val, y_outer_val = data

            # Normalize into a fold-local name: rebinding y_test itself would
            # feed the already-normalized targets back into normalize_output
            # on the next fold, normalizing the test set again and again.
            y_test_fold = y_test
            if self.normalize:
                y_train, y_inner_val, y_outer_val, y_test_fold = self.normalize_output(
                    y_train, y_inner_val, y_outer_val, y_test)
            train_model_output = train_model(
                self.model,
                X_train,
                y_train,
                X_inner_val,
                y_inner_val,
                X_test,
                y_test_fold,
                X_outer_val,
                y_outer_val,
                nb_epoch=nb_epoch,
                batch_size=batch_size,
                lr_func=lr_func,
                patience=patience,
                load_from_disk=load_from_disk,
                save_model_path=save_model_path)

            model, loss, inner_val_loss, mean_outer_val_loss, mean_test_loss = train_model_output

            # loss and inner_val_loss each is a list
            # containing loss for each epoch
            losses.append(loss)
            inner_val_losses.append(inner_val_loss)
            outer_val_losses.append(mean_outer_val_loss)
            test_losses.append(mean_test_loss)

            # Calculate RMSEs and MAEs
            train_rmse, train_mae = self.evaluate(X_train, y_train)
            inner_val_rmse, inner_val_mae = self.evaluate(
                X_inner_val, y_inner_val)
            outer_val_rmse, outer_val_mae = self.evaluate(
                X_outer_val, y_outer_val)
            test_rmse, test_mae = self.evaluate(X_test, y_test_fold)
            train_rmses.append(train_rmse)
            train_maes.append(train_mae)
            inner_val_rmses.append(inner_val_rmse)
            inner_val_maes.append(inner_val_mae)
            outer_val_rmses.append(outer_val_rmse)
            outer_val_maes.append(outer_val_mae)
            test_rmses.append(test_rmse)
            test_maes.append(test_mae)

            # save model and write fold report
            fpath = os.path.join(save_model_path, 'fold_{0}'.format(fold))
            self.save_model(loss,
                            inner_val_loss,
                            mean_outer_val_loss,
                            mean_test_loss,
                            fpath,
                            train_rmse=train_rmse,
                            train_mae=train_mae,
                            inner_val_rmse=inner_val_rmse,
                            inner_val_mae=inner_val_mae,
                            outer_val_rmse=outer_val_rmse,
                            outer_val_mae=outer_val_mae,
                            test_rmse=test_rmse,
                            test_mae=test_mae)

            # once finish training one fold, reset the model
            if pretrained_weights is not None:
                self.load_parameters(pretrained_weights)
            else:
                self.reset_model()

        # mean inner_val_loss and outer_val_loss used for selecting parameters,
        # e.g., lr, epoch, attributes, etc
        # Only the final-epoch value of each fold's loss history is averaged;
        # folds with an empty history are skipped.
        full_folds_mean_loss = np.mean(
            [fold_loss[-1] for fold_loss in losses if len(fold_loss) > 0])
        full_folds_mean_inner_val_loss = np.mean(
            [fold_loss[-1] for fold_loss in inner_val_losses
             if len(fold_loss) > 0])
        full_folds_mean_outer_val_loss = np.mean(outer_val_losses)
        full_folds_mean_test_loss = np.mean(test_losses)
        full_folds_mean_train_rmse = np.mean(train_rmses)
        full_folds_mean_train_mae = np.mean(train_maes)
        full_folds_mean_inner_val_rmse = np.mean(inner_val_rmses)
        full_folds_mean_inner_val_mae = np.mean(inner_val_maes)
        full_folds_mean_outer_val_rmse = np.mean(outer_val_rmses)
        full_folds_mean_outer_val_mae = np.mean(outer_val_maes)
        full_folds_mean_test_rmse = np.mean(test_rmses)
        full_folds_mean_test_mae = np.mean(test_maes)

        full_folds_loss_report_path = os.path.join(
            save_model_path, 'full_folds_loss_report.txt')

        write_loss_report(full_folds_mean_loss,
                          full_folds_mean_inner_val_loss,
                          full_folds_mean_outer_val_loss,
                          full_folds_mean_test_loss,
                          full_folds_loss_report_path,
                          train_rmse=full_folds_mean_train_rmse,
                          train_mae=full_folds_mean_train_mae,
                          inner_val_rmse=full_folds_mean_inner_val_rmse,
                          inner_val_mae=full_folds_mean_inner_val_mae,
                          outer_val_rmse=full_folds_mean_outer_val_rmse,
                          outer_val_mae=full_folds_mean_outer_val_mae,
                          test_rmse=full_folds_mean_test_rmse,
                          test_mae=full_folds_mean_test_mae)

        # Delete tensor directory unless the caller asked to keep it
        if self.save_tensors_dir is not None and not self.keep_tensors:
            shutil.rmtree(self.save_tensors_dir)
    def kfcv_batch_train(self,
                         folds,
                         pretrained_weights=None,
                         batch_size=50,
                         nb_epoch=150,
                         patience=10,
                         training_ratio=0.9,
                         testing_ratio=0.0):
        """
        Run k-fold cross-validation using batched `self.model.fit` training.

        For every fold the training and inner validation data from
        `prepare_data_one_fold` are merged, the model is fitted with early
        stopping on 'val_loss', the fit history is dumped to
        'history.json_fold_<i>' in `self.out_dir`, and the outer validation
        loss (and the test loss, when a test set exists) is logged.

        Args:
            folds: Number of folds; also forwarded to the data preparation.
            pretrained_weights: If not None, passed to
                `self.load_parameters` after each fold; otherwise
                `self.reset_model()` is called instead.
            batch_size: Batch size passed to `self.model.fit`. The
                evaluations below use a fixed batch size of 50.
            nb_epoch: Forwarded to `self.model.fit`.
            patience: Patience of the `EarlyStopping` callback.
            training_ratio: Forwarded to `prepare_data_one_fold`; also
                determines `validation_split` (1.0 - training_ratio).
            testing_ratio: Forwarded to `prepare_folded_data_from_file`;
                not used when the data come from `self.datasets`.
        """

        def load_tensors(fnames, dims):
            """Load one saved array per file name into a single array."""
            tensors = np.zeros((len(fnames), ) + dims)
            for i, fname in enumerate(fnames):
                tensors[i] = np.load(fname)
            return tensors

        # prepare data for training
        if self.get_data_from_file:
            folded_data = prepare_folded_data_from_file(
                self.data_file,
                folds,
                add_extra_atom_attribute=self.add_extra_atom_attribute,
                add_extra_bond_attribute=self.add_extra_bond_attribute,
                differentiate_atom_type=self.differentiate_atom_type,
                differentiate_bond_type=self.differentiate_bond_type,
                padding=self.padding,
                padding_final_size=self.padding_final_size,
                save_tensors_dir=self.save_tensors_dir,
                testing_ratio=testing_ratio)
        else:
            folded_data = prepare_folded_data_from_multiple_datasets(
                self.datasets,
                folds,
                add_extra_atom_attribute=self.add_extra_atom_attribute,
                add_extra_bond_attribute=self.add_extra_bond_attribute,
                differentiate_atom_type=self.differentiate_atom_type,
                differentiate_bond_type=self.differentiate_bond_type,
                padding=self.padding,
                padding_final_size=self.padding_final_size,
                prediction_task=self.prediction_task,
                save_tensors_dir=self.save_tensors_dir)

        X_test, y_test, folded_Xs, folded_ys = folded_data

        # Data might be stored as file names
        if len(X_test) > 0 and isinstance(X_test[0], str):
            X_test = load_tensors(X_test, np.load(X_test[0]).shape)

        for fold in range(folds):
            data = prepare_data_one_fold(folded_Xs,
                                         folded_ys,
                                         current_fold=fold,
                                         training_ratio=training_ratio)

            X_train, X_inner_val, X_outer_val, y_train, y_inner_val, y_outer_val = data

            # The inner validation samples are merged back into the training
            # set; validation data are taken through `validation_split` in
            # the fit call below instead.
            if isinstance(X_train, np.ndarray):
                X_train = np.concatenate((X_train, X_inner_val))
            else:
                X_train.extend(X_inner_val)
            if isinstance(y_train, np.ndarray):
                y_train = np.concatenate((y_train, y_inner_val))
            else:
                y_train.extend(y_inner_val)

            # Data might be stored as file names
            if isinstance(X_train[0], str):
                # The outer validation array is sized with the shape of the
                # first training tensor.
                dims = np.load(X_train[0]).shape
                X_train = load_tensors(X_train, dims)
                X_outer_val = load_tensors(X_outer_val, dims)

            # Normalize into a fold-local name: rebinding y_test itself would
            # feed the already-normalized targets back into normalize_output
            # on the next fold, normalizing the test set again and again.
            y_test_fold = y_test
            if self.normalize:
                y_train, y_outer_val, y_test_fold = self.normalize_output(
                    y_train, y_outer_val, y_test)

            earlyStopping = EarlyStopping(monitor='val_loss',
                                          patience=patience,
                                          verbose=1,
                                          mode='auto')

            history_callback = self.model.fit(np.asarray(X_train),
                                              np.asarray(y_train),
                                              callbacks=[earlyStopping],
                                              nb_epoch=nb_epoch,
                                              batch_size=batch_size,
                                              validation_split=1.0 -
                                              training_ratio)

            loss_history = history_callback.history
            history_path = os.path.join(self.out_dir,
                                        'history.json_fold_{0}'.format(fold))
            with open(history_path, 'w') as f_out:
                json.dump(loss_history, f_out, indent=2)

            # evaluate outer validation loss
            outer_val_loss = self.model.evaluate(np.asarray(X_outer_val),
                                                 np.asarray(y_outer_val),
                                                 batch_size=50)
            logging.info("\nOuter val loss: {0}".format(outer_val_loss))

            if len(X_test) > 0:
                test_loss = self.model.evaluate(np.asarray(X_test),
                                                np.asarray(y_test_fold),
                                                batch_size=50)
                logging.info("\nTest loss: {0}".format(test_loss))

            # once finish training one fold, reset the model
            if pretrained_weights is not None:
                self.load_parameters(pretrained_weights)
            else:
                self.reset_model()

        # Delete tensor directory unless the caller asked to keep it
        if self.save_tensors_dir is not None and not self.keep_tensors:
            shutil.rmtree(self.save_tensors_dir)