def test_inplace_prediction(test_dir, data_frame):
    """predict(..., inplace=True) must hand back the very same DataFrame object."""
    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    # Minimal single-column pipeline: tf-idf encoding feeding a bag-of-words featurizer.
    imputer = Imputer(
        data_featurizers=[BowFeaturizer('features')],
        label_encoders=[CategoricalEncoder(label_col)],
        data_encoders=[TfIdfEncoder('features')],
        output_path=os.path.join(test_dir, "tmp", "out"),
    ).fit(train_df=df, num_epochs=1)

    predicted = imputer.predict(df, inplace=True)

    # identity check, not equality: inplace=True means no copy was made
    assert predicted is df
def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as inputs
    this could be run as part of integration test suite.
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2 ** 10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use a the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    # Same string column is encoded twice (bag-of-words and sequential views),
    # plus the derived categorical column.
    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col, feature_col + "_lstm",
                          max_tokens=vocab_size, seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        # NOTE(review): vocab_size=num_labels looks suspicious here — the matching
        # SequentialEncoder above uses max_tokens=vocab_size. Confirm intent before
        # changing; left as-is to preserve test behavior.
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp", "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    # transform must not drop rows, and on this synthetic data the label
    # should be recovered exactly for every row
    assert len(pred[label_col]) == len_df_before_predict
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)
    assert metrics[label_col]['avg_f1'] > 0.9

    # round-trip through serialization must preserve model quality
    deserialized = Imputer.load(imputer.output_path)
    _, metrics_deserialized = deserialized.transform_and_compute_metrics(df_test)
    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get a imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(train_df=df_train[:50],
                                                                  test_df=df_test,
                                                                  learning_rate=learning_rate,
                                                                  num_epochs=num_epochs,
                                                                  batch_size=batch_size,
                                                                  calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(df_test,
                                                    precision_threshold=.5,
                                                    imputation_suffix="_imputed")

    # Bug fix: pd.Index.contains() was deprecated in pandas 0.25 and removed in
    # 1.0 — membership is checked with the `in` operator instead.
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns