def test_prepare_deep_without_embedding_columns():
    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ['col1', 'col2']
    preprocessor3 = DeepPreprocessor(continuous_cols=['col1', 'col2'])
    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        errors.append('Fundamental Error')
    out_booleans = []
    # every scaled continuous column should end up with mean ~0 and std ~1
    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    for mean, std in zip(means, stds):
        out_booleans.append(np.isclose(mean, 0.))
        out_booleans.append(np.isclose(std, 1.))
    if not np.all(out_booleans):
        errors.append('There is something going on with the scaler')
    assert not errors, "errors occurred:\n{}".format("\n".join(errors))
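# A minimal, self-contained sketch of the behaviour the test above asserts:
# scaling two integer columns to zero mean and unit standard deviation. It is
# written against sklearn's StandardScaler directly, which is assumed (not
# shown above) to be what DeepPreprocessor applies to `continuous_cols`.
# `df_demo` and `X_scaled` are illustrative names, not part of the test suite.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df_demo = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)),
                       columns=['col1', 'col2'])
X_scaled = StandardScaler().fit_transform(df_demo)
# after standardisation every column has mean ~0 and std ~1
assert np.allclose(X_scaled.mean(axis=0), 0.0)
assert np.allclose(X_scaled.std(axis=0), 1.0)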
wide_cols = ['age_buckets', 'education', 'relationship', 'workclass',
             'occupation', 'native_country', 'gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
cat_embed_cols = [('education', 10), ('relationship', 8), ('workclass', 10),
                  ('occupation', 10), ('native_country', 10)]
continuous_cols = ["age", "hours_per_week"]
target = 'income_label'
target = df[target].values

prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols,
                                continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df)

wide = Wide(wide_dim=X_wide.shape[1], output_dim=1)
deepdense = DeepDense(hidden_layers=[64, 32],
                      dropout=[0.2, 0.2],
                      deep_column_idx=prepare_deep.deep_column_idx,
                      embed_input=prepare_deep.embeddings_input,
                      continuous_cols=continuous_cols)
model = WideDeep(wide=wide, deepdense=deepdense)

wide_opt = torch.optim.Adam(model.wide.parameters())
deep_opt = RAdam(model.deepdense.parameters())
wide_sch = torch.optim.lr_scheduler.StepLR(wide_opt, step_size=3)
deep_sch = torch.optim.lr_scheduler.StepLR(deep_opt, step_size=5)
optimizers = {'wide': wide_opt, 'deepdense': deep_opt}
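# A minimal sketch of the remaining training step, assuming the 0.3.x-style
# pytorch-widedeep API; the `compile`/`fit` method names and the keyword
# arguments below are assumptions, not taken from the code above.
model.compile(method='binary',
              optimizers=optimizers,
              lr_schedulers={'wide': wide_sch, 'deepdense': deep_sch})
model.fit(X_wide=X_wide, X_deep=X_deep, target=target,
          n_epochs=5, batch_size=256)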
])
def test_label_encoder_with_custom_encoder(input_df, encoding_dict, output_df):
    tmp_df = input_df.copy()
    for c in input_df.columns:
        tmp_df[c] = tmp_df[c].map(encoding_dict[c])
    assert tmp_df.equals(output_df)


################################################################################
# Test the DeepPreprocessor: only categorical columns to be represented with
# embeddings
###############################################################################
cat_embed_cols = [('col1', 5), ('col2', 5)]

preprocessor1 = DeepPreprocessor(cat_embed_cols)
X_letters = preprocessor1.fit_transform(df_letters)
embed_input_letters = preprocessor1.embeddings_input
decoding_dict_letters = {
    c: {k: v for v, k in preprocessor1.encoding_dict[c].items()}
    for c in preprocessor1.encoding_dict.keys()
}

preprocessor2 = DeepPreprocessor(cat_embed_cols)
X_numbers = preprocessor2.fit_transform(df_numbers)
embed_input_numbers = preprocessor2.embeddings_input
decoding_dict_numbers = {
    c: {k: v for v, k in preprocessor2.encoding_dict[c].items()}
    for c in preprocessor2.encoding_dict.keys()
}

errors = []


@pytest.mark.parametrize(
    'input_df, X_deep, embed_input, decoding_dict, error_list',
    [
        (df_letters, X_letters, embed_input_letters, decoding_dict_letters, errors),
        (df_numbers, X_numbers, embed_input_numbers, decoding_dict_numbers, errors),