def test_validation():
    """
    Check that PredictiveModel.validation returns a positive score for the
    default method, for method=1 and method=2 (2 folds each), and for a
    single fold.
    """
    # make catboostModel importable from the current directory or its parent
    for location in (".", "../"):
        sys.path.append(location)
    from catboostModel import PredictiveModel

    X, Y = getXY()

    text_columns = [
        "Unnamed: 0", "dataset_type", "Name", "RescuerID", "Description",
        "PetID",
    ]
    categorical_columns = [
        "Type", "Gender", "Vaccinated", "Dewormed", "Sterilized", "Breed1",
        "Breed2", "Color1", "Color2", "Color3", "State",
    ]
    # not read by this test; one entry per categorical column above
    # (presumably the number of distinct values of each -- TODO confirm)
    mapping_sizes = [2, 2, 3, 3, 3, 307, 307, 7, 7, 7, 15]

    # everything that is neither text, categorical nor the target is numeric
    excluded = set(text_columns) | set(categorical_columns) | {"AdoptionSpeed"}
    numeric_columns = [name for name in X.columns if name not in excluded]

    # categorical columns are placed after the numeric ones, so their
    # positional indices start right after the last numeric column
    first_categorical = len(numeric_columns)
    cat_features = list(
        range(first_categorical, first_categorical + len(categorical_columns))
    )

    X = pd.concat([X[numeric_columns], X[categorical_columns]], axis=1)

    model = PredictiveModel("catboost_by_pytest")
    keyword_sets = (
        {"n_folds": 2},
        {"method": 1, "n_folds": 2},
        {"method": 2, "n_folds": 2},
        {"n_folds": 1},
    )
    for extra in keyword_sets:
        assert model.validation(X, Y, cat_features, **extra) > 0
def test_run():
    """
    Run the load-train-predict workflow end to end.

    The data is split 80/20 (in row order) into training and validation
    parts, a model is trained, and the test checks that the evaluation
    score is positive and that the predicted class probabilities of the
    first validation row sum to 1.

    This also serves as an example usage of the model (the workflow was
    originally scripted from a kNN.ipynb notebook).
    """
    # make catboostModel importable from the current directory or its parent
    sys.path.append(".")
    sys.path.append("../")
    from catboostModel import PredictiveModel

    X, Y = getXY()

    string_cols = [
        "Unnamed: 0", "dataset_type", "Name", "RescuerID", "Description",
        "PetID",
    ]
    categorical_col = [
        "Type", "Gender", "Vaccinated", "Dewormed", "Sterilized", "Breed1",
        "Breed2", "Color1", "Color2", "Color3", "State",
    ]
    numerical_col = [
        col for col in X.columns
        if col not in string_cols
        and col not in categorical_col
        and col != "AdoptionSpeed"
    ]

    # categorical columns are concatenated after the numerical ones, so
    # their positional indices follow the numerical block
    cat_features = list(
        range(len(numerical_col), len(numerical_col) + len(categorical_col))
    )

    X = pd.concat([X[numerical_col], X[categorical_col]], axis=1)

    # split in train and validation data
    train_size = int(len(X) * 0.8)
    train_X, train_Y = X[:train_size], Y[:train_size]
    validation_X, validation_Y = X[train_size:], Y[train_size:]

    assert train_X.shape[0] == train_Y.shape[0]
    assert validation_X.shape[0] == validation_Y.shape[0]

    model = PredictiveModel("catboost_by_pytest")
    model.train(train_X, train_Y, cat_features)

    # evaluate() only receives the labels, so it presumably scores the
    # predictions produced by the preceding predict() call -- keep the call
    model.predict(validation_X)
    score = model.evaluate(validation_Y)
    assert score > 0  # a score that is not positive means something is wrong

    predictions = model.predict(validation_X, probability=True)
    assert len(predictions) > 0
    # class probabilities must sum to 1 up to floating-point error; the
    # tolerance is 1e-6 (the previous 1e6 made this check always pass)
    assert 1 - 1e-6 < sum(predictions[0]) < 1 + 1e-6
def test_grid_search():
    """
    Build a PredictiveModel from an explicit parameter dictionary and check
    that 2-fold validation returns a positive score.
    """
    # make catboostModel importable from the current directory or its parent
    for location in (".", "../"):
        sys.path.append(location)
    from catboostModel import PredictiveModel

    X, Y = getXY()

    text_columns = [
        "Unnamed: 0", "dataset_type", "Name", "RescuerID", "Description",
        "PetID",
    ]
    categorical_columns = [
        "Type", "Gender", "Vaccinated", "Dewormed", "Sterilized", "Breed1",
        "Breed2", "Color1", "Color2", "Color3", "State",
    ]
    # not read by this test; one entry per categorical column above
    # (presumably the number of distinct values of each -- TODO confirm)
    mapping_sizes = [2, 2, 3, 3, 3, 307, 307, 7, 7, 7, 15]

    # everything that is neither text, categorical nor the target is numeric
    excluded = set(text_columns) | set(categorical_columns) | {"AdoptionSpeed"}
    numeric_columns = [name for name in X.columns if name not in excluded]

    # categorical columns come after the numeric ones in the final frame
    first_categorical = len(numeric_columns)
    cat_features = list(
        range(first_categorical, first_categorical + len(categorical_columns))
    )

    X = pd.concat([X[numeric_columns], X[categorical_columns]], axis=1)

    params = dict(
        depth=3,
        iterations=20,
        learning_rate=0.5,
        l2_leaf_reg=3,
        border_count=3,
        thread_count=4,
    )

    model = PredictiveModel("catboost_by_pytest", params)
    fold_score = model.validation(X, Y, cat_features, n_folds=2)
    assert fold_score > 0
def test_meta():
    """
    Check generate_meta_train against a manual K-fold replay, then check the
    shape of the generate_meta_test output.

    For every fold the model is retrained on the training part and its
    predicted probabilities for the held-out part must equal the L0..L4
    columns of the corresponding meta_train rows.
    """
    # make catboostModel importable from the current directory or its parent
    for location in (".", "../"):
        sys.path.append(location)
    from catboostModel import PredictiveModel

    X, Y = getXY()

    text_columns = [
        "Unnamed: 0", "dataset_type", "Name", "RescuerID", "Description",
        "PetID",
    ]
    categorical_columns = [
        "Type", "Gender", "Vaccinated", "Dewormed", "Sterilized", "Breed1",
        "Breed2", "Color1", "Color2", "Color3", "State",
    ]
    # not read by this test; one entry per categorical column above
    # (presumably the number of distinct values of each -- TODO confirm)
    mapping_sizes = [2, 2, 3, 3, 3, 307, 307, 7, 7, 7, 15]

    # everything that is neither text, categorical nor the target is numeric
    excluded = set(text_columns) | set(categorical_columns) | {"AdoptionSpeed"}
    numeric_columns = [name for name in X.columns if name not in excluded]

    # categorical columns come after the numeric ones in the final frame
    first_categorical = len(numeric_columns)
    cat_features = list(
        range(first_categorical, first_categorical + len(categorical_columns))
    )

    X = pd.concat([X[numeric_columns], X[categorical_columns]], axis=1)

    model = PredictiveModel("catboost_by_pytest_generate_meta")

    n_folds = 3
    score = model.validation(X, Y, cat_features, n_folds=n_folds)

    # called twice: once with the default verbosity and once with
    # verbose=True; only the second result is compared below
    meta_train = model.generate_meta_train(
        X, Y, cat_features, n_folds=n_folds, short=True
    )
    meta_train = model.generate_meta_train(
        X, Y, cat_features, n_folds=n_folds, short=True, verbose=True
    )

    from sklearn.model_selection import KFold

    n_meta_columns = 5  # meta columns are named L0 .. L4
    folds = KFold(n_splits=n_folds)
    for train_index, test_index in folds.split(X):
        # rows produced by generate_meta_train for this held-out fold
        fold_meta = meta_train.loc[test_index]

        train_X, train_Y = X.loc[train_index], Y.loc[train_index]
        validation_X, validation_Y = X.loc[test_index], Y.loc[test_index]

        assert train_X.shape[0] == train_Y.shape[0]
        assert validation_X.shape[0] == validation_Y.shape[0]

        model.train(train_X, train_Y, cat_features, short=True)
        predictions = model.predict(validation_X, probability=True)

        # renumber the rows from 0 so they line up with prediction positions
        fold_meta = fold_meta.reset_index().drop('index', axis=1)
        for row, probabilities in enumerate(predictions):
            for k in range(n_meta_columns):
                assert probabilities[k] == fold_meta.loc[row, 'L%d' % k]

    X_test = getXY(X_test=True)
    X_test = pd.concat(
        [X_test[numeric_columns], X_test[categorical_columns]], axis=1
    )

    meta_test = model.generate_meta_test(X, Y, cat_features, X_test)
    assert len(meta_test.columns) == 5
    assert len(meta_test) == len(X_test)