def synthetic_label_shift_simple(N, label_proportions, error_proba, covariates=None) -> pd.DataFrame:
    """
    Generate data with synthetic label shift and a single string feature column

    Each row gets a label drawn according to ``label_proportions``. The row's
    covariate is the one at the label's index, except when the error branch is
    taken; then a covariate belonging to a different label is drawn uniformly.

    :param N: Number of observations
    :param label_proportions: Dirichlet vector with label proportions
    :param error_proba: Probability of sampling from a different covariate distribution
    :param covariates: list of covariate names, random strings by default
    :return: pd.DataFrame with N rows and columns ['covariate', 'label'],
        where labels are the strings 'label_<index>'
    """
    num_labels = len(label_proportions)

    if covariates is None:
        covariates = [rand_string(6) for _ in range(num_labels)]

    # Loop-invariant: the candidate label indices never change between rows.
    label_ids = range(num_labels)

    rows = []
    for _ in range(N):
        label = np.random.choice(label_ids, p=label_proportions)
        # The uniform draw always happens first so the random stream is the
        # same as before. With fewer than two labels there is no "different"
        # covariate to pick, so the label's own covariate is used instead of
        # calling np.random.choice on an empty list (which raises).
        if np.random.rand() > error_proba or num_labels < 2:
            covariate = covariates[label]
        else:
            # choose a different covariate at random.
            other_ids = [i for i in label_ids if i != label]
            covariate = covariates[np.random.choice(other_ids)]
        rows.append((covariate, 'label_' + str(label)))

    return pd.DataFrame(rows, columns=['covariate', 'label'])
def _inner_impl(
        feature_col='features',
        label_col='labels',
        n_samples=500,
        word_length=5,
        num_words=100,
        vocab_size=100,
        num_labels=10):
    """
    Generates text features and categorical labels.

    A vocabulary of random strings is created; its first ``num_labels``
    entries act as labels and the remainder as ordinary words. Every row is
    a shuffled, blank separated sequence of ``num_words`` words plus one label.

    :param feature_col: name of feature column.
    :param label_col: name of label column.
    :param n_samples: how many rows to generate.
    :param word_length: argument handed to rand_string for each vocabulary
        entry (presumably the string length; confirm against rand_string).
    :param num_words: number of non-label tokens drawn for each row.
    :param vocab_size: number of random strings generated in total.
    :param num_labels: number of leading vocabulary entries used as labels.
    :return: pd.DataFrame with columns = [feature_col, label_col]
    """
    vocabulary = [rand_string(word_length) for _ in range(vocab_size)]
    label_set, token_pool = vocabulary[:num_labels], vocabulary[num_labels:]

    def _make_row():
        """
        Draw one label and embed it at a random position among random words.

        :return: blank separated token sequence and label
        """
        # The label is drawn before the filler words, then everything is shuffled.
        chosen_label = random.choice(label_set)
        filler = [random.choice(token_pool) for _ in range(num_words)]
        shuffled = np.random.permutation(filler + [chosen_label])
        return " ".join(shuffled), chosen_label

    rows = [_make_row() for _ in range(n_samples)]
    # Transpose the (sentence, label) pairs into two parallel tuples.
    sentences, row_labels = zip(*rows)

    return pd.DataFrame({feature_col: sentences, label_col: row_labels})
def test_simple_imputer_real_data_default_args(test_dir, data_frame):
    """
    Tests SimpleImputer with default options

    Fits on generated text data, then checks the log file, the featurizer and
    encoder classes, that input frames keep their leading columns, the
    weighted F1 of predictions, loading from a renamed output directory and
    refitting the loaded imputer.
    """

    def _leading_items_match(actual, expected):
        # zip stops at the shorter input, so only the common prefix is compared.
        # NOTE(review): trailing extra columns are therefore not detected;
        # confirm this is intended before tightening the check.
        return all(got == want for got, want in zip(actual, expected))

    feature_col = "string_feature"
    label_col = "label"
    imputed_name = label_col + "_imputed"
    imputed_proba_name = label_col + "_imputed_proba"
    deserialized_name = label_col + "_deserialized_imputed"

    # generate some random data
    generated = data_frame(
        feature_col=feature_col,
        label_col=label_col,
        vocab_size=2 ** 15,
        num_labels=3,
        num_words=100,
        n_samples=2000,
    )

    df_train, df_test, _ = random_split(generated, [.8, .1, .1])

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_simple")
    train_columns_before = df_train.columns.tolist()
    input_columns = [feature_col]

    imputer = SimpleImputer(
        input_columns=input_columns,
        output_column=label_col,
        output_path=output_path,
    ).fit(train_df=df_train)

    # fitting is expected to leave a non-empty log file in the output directory
    log_path = os.path.join(imputer.output_path, 'imputer.log')
    assert os.path.exists(log_path)
    assert os.path.getsize(log_path) > 0

    assert imputer.output_path == output_path

    inner = imputer.imputer
    assert inner.data_featurizers[0].__class__ == BowFeaturizer
    assert inner.data_encoders[0].__class__ == BowEncoder
    assert set(inner.data_encoders[0].input_columns) == set(input_columns)
    assert set(inner.label_encoders[0].input_columns) == {label_col}

    # the training frame's columns must be unchanged by fit
    assert _leading_items_match(df_train.columns, train_columns_before)

    # predict on a copy of the test data without the label column
    unlabeled = df_test.copy()
    true_labels = df_test[label_col]
    del unlabeled[label_col]

    unlabeled_columns_before = unlabeled.columns.tolist()

    df_test_imputed = imputer.predict(unlabeled, inplace=True)

    assert _leading_items_match(unlabeled.columns, unlabeled_columns_before)

    expected_columns = unlabeled_columns_before + [imputed_name, imputed_proba_name]
    # iterating a DataFrame yields its column labels
    assert _leading_items_match(df_test_imputed, expected_columns)

    f1 = f1_score(true_labels, df_test_imputed[imputed_name], average="weighted")
    assert f1 > .9

    # move the model directory and make sure the imputer loads from the new place
    new_path = imputer.output_path + "-" + rand_string()
    os.rename(imputer.output_path, new_path)

    deserialized = SimpleImputer.load(new_path)
    df_test = deserialized.predict(df_test, imputation_suffix="_deserialized_imputed")
    f1 = f1_score(df_test[label_col], df_test[deserialized_name], average="weighted")
    assert f1 > .9

    # refit the loaded imputer, passing the training frame for both arguments
    retrained_simple_imputer = deserialized.fit(df_train, df_train)

    df_train_imputed = retrained_simple_imputer.predict(df_train.copy(), inplace=True)
    f1 = f1_score(df_train[label_col], df_train_imputed[imputed_name], average="weighted")
    assert f1 > .9

    metrics = retrained_simple_imputer.load_metrics()
    assert f1 == metrics['weighted_f1']