def test_with_feature_attachment():
    """Check that attaching stored features raises the last experiment metric.

    Columns of the generated frame are saved one by one as features and
    dropped from ``X`` until a single column remains. The experiment is then
    run once on the reduced frame alone and once with features 0-3 attached
    from the feature directory; the last metric of the second run must be
    strictly greater.
    """
    X, y = make_classification_df(n_num_features=5, class_sep=0.7)
    params = dict(objective='binary', max_depth=8)

    with get_temp_directory() as temp_feature_path:
        # Iterate over a snapshot of the column names: X is mutated in the loop.
        for feature_id, column in enumerate(list(X.columns)):
            if X.shape[1] == 1:
                # Always leave one column behind as the base feature set.
                break
            save_feature(X[[column]], feature_id, directory=temp_feature_path)
            X.drop(column, axis=1, inplace=True)

        # shuffle=False keeps the original row order in the train/test split.
        X_train, X_test, y_train, _ = train_test_split(X, y, shuffle=False)

        with get_temp_directory() as temp_path:
            result_wo_feature = run_experiment(
                params, X_train, y_train, X_test,
                logging_directory=temp_path,
            )

        with get_temp_directory() as temp_path:
            result_w_feature = run_experiment(
                params, X_train, y_train, X_test,
                logging_directory=temp_path,
                feature_list=[0, 1, 2, 3],
                feature_directory=temp_feature_path,
            )

        metric_with = result_w_feature.metrics[-1]
        metric_without = result_wo_feature.metrics[-1]
        assert metric_with > metric_without
def test_feature_exists():
    """Saving under an already-used feature id with overwrite=False raises RuntimeError."""
    feature_id = 0
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5] + [None] * 5})
    single_column = df[['a']]

    with get_temp_directory() as tmp:
        # First save succeeds and occupies the id.
        fs.save_feature(single_column, feature_id, directory=tmp)

        # Second save to the same id must be rejected.
        with pytest.raises(RuntimeError):
            fs.save_feature(df, feature_id, overwrite=False, directory=tmp)
def test_save_feature():
    """Saving feature 0 creates a file named ``0.f`` in the target directory."""
    frame = pd.DataFrame()
    frame['a'] = np.arange(100)

    with get_temp_directory() as tmp:
        fs.save_feature(frame, 0, tmp)

        expected_path = os.path.join(tmp, '0.f')
        assert os.path.exists(expected_path)
def test_load_feature():
    """A saved single-column frame is loaded back equal to the original."""
    original = pd.DataFrame()
    original['a'] = np.arange(100)

    with get_temp_directory() as tmp:
        fs.save_feature(original, 0, tmp)
        restored = fs.load_feature(0, tmp)

        assert_frame_equal(original, restored)
def test_load_features():
    """Loading features 0 and 1 onto a base frame rebuilds the full frame.

    Column 'a' serves as the base frame; 'b' and 'c' are stored as separate
    features and are expected to be appended in the requested order.
    """
    df = pd.DataFrame()
    for name, dtype in (('a', float), ('b', int), ('c', int)):
        df[name] = np.arange(100).astype(dtype)

    with get_temp_directory() as tmp:
        for feature_id, column in enumerate(['b', 'c']):
            fs.save_feature(df[[column]], feature_id, tmp)

        base = df[['a']]
        df_loaded = fs.load_features(base, [0, 1], tmp)

        assert_frame_equal(df, df_loaded)
def test_load_feature_ignore_all_columns():
    """Ignoring every stored column (plus an unknown one) yields a column-less frame.

    The ignore list also contains 'X', which is not a column of the saved
    feature; the load is expected to succeed regardless.
    """
    df = pd.DataFrame()
    for name, dtype in (('a', float), ('b', int), ('c', int)):
        df[name] = np.arange(100).astype(dtype)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        ignored = ['a', 'b', 'c', 'X']
        df_loaded = fs.load_feature(0, tmp, ignore_columns=ignored)

        # Expected result: same rows, no columns left.
        expected = df.drop(columns=['a', 'b', 'c'])
        assert_frame_equal(df_loaded, expected)
def test_various_dtypes():
    """A frame mixing float and several signed/unsigned int dtypes survives a save/load round trip."""
    column_dtypes = [
        ('a', float),
        ('b', int),
        ('c', np.uint8),
        ('d', np.uint16),
        ('e', np.uint32),
        ('f', np.int8),
        ('g', np.int16),
        ('h', np.int32),
        ('i', np.int64),
    ]

    df = pd.DataFrame()
    for name, dtype in column_dtypes:
        df[name] = np.arange(100).astype(dtype)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)
        restored = fs.load_feature(0, tmp)

        assert_frame_equal(df, restored)
def test_load_features_no_base():
    """With no base frame, loaded columns follow the order of the requested feature ids.

    Feature ids may be ints or strings; here 0, 1 and '2' are mixed.
    """
    df = pd.DataFrame()
    for name, dtype in (('a', float), ('b', int), ('c', int)):
        df[name] = np.arange(100).astype(dtype)

    with get_temp_directory() as tmp:
        for feature_id, column in ((0, 'b'), (1, 'c'), ('2', 'a')):
            fs.save_feature(df[[column]], feature_id, tmp)

        df_loaded = fs.load_features(None, [0, 1, '2'], tmp)

        expected_columns = ['b', 'c', 'a']
        assert list(df_loaded.columns) == expected_columns
def test_invalid_feature():
    """save_feature rejects frames containing column 'a' when a reference target is given.

    Column 'a' is None in its last five rows while the reference target has
    only five entries; column 'b' has no missing values and is accepted.
    NOTE(review): presumably rows beyond len(y) are treated as the test
    portion and an all-null test portion is invalid - confirm against
    save_feature.
    """
    columns = {
        'a': [1, 2, 3, 4, 5] + [None] * 5,
        'b': np.random.randint(0, 10, size=10),
    }
    df = pd.DataFrame(columns)
    y = pd.Series([1, 0, 1, 0, 1])

    with get_temp_directory() as tmp:
        # Both the lone 'a' column and the full frame (which includes 'a') must fail.
        with pytest.raises(RuntimeError):
            fs.save_feature(df[['a']], 0, reference_target_variable=y, directory=tmp)

        with pytest.raises(RuntimeError):
            fs.save_feature(df, 0, reference_target_variable=y, directory=tmp)

        # ok
        valid_only = df[['b']]
        fs.save_feature(valid_only, 0, reference_target_variable=y, directory=tmp)
def test_load_features_duplicate_col_name():
    """Check column naming when loaded features share column names.

    With ``rename_duplicate=True`` a repeated column name is expected to get
    the feature id as a suffix (e.g. 'b_1', 'a_X'); with
    ``rename_duplicate=False`` the original names are kept, duplicates
    included.
    """
    df = pd.DataFrame()
    for name, dtype in (('a', float), ('b', int), ('c', int)):
        df[name] = np.arange(100).astype(dtype)

    feature_ids = [0, 1, 'X']

    with get_temp_directory() as tmp:
        fs.save_feature(df[['a', 'b']], 0, tmp)
        fs.save_feature(df[['b', 'c']], 1, tmp)
        fs.save_feature(df[['b', 'a']], 'X', tmp)

        renamed = fs.load_features(None, feature_ids, tmp, rename_duplicate=True)
        expected_renamed = ['a', 'b', 'b_1', 'c', 'b_X', 'a_X']
        assert list(renamed.columns) == expected_renamed

        untouched = fs.load_features(None, feature_ids, tmp, rename_duplicate=False)
        expected_untouched = ['a', 'b', 'b', 'c', 'b', 'a']
        assert list(untouched.columns) == expected_untouched