Example #1
import numpy as np
import pandas as pd

from datawig.utils import rand_string  # random-string helper used below


def synthetic_label_shift_simple(N,
                                 label_proportions,
                                 error_proba,
                                 covariates=None) -> pd.DataFrame:
    """
    Generate data with synthetic label shift and a single string feature column

    :param N: Number of observations
    :param label_proportions: Dirichlet vector with label proportions
    :param error_proba: Probability of sampling from a different covariate distribution
    :param covariates: list of covariate names, random strings by default
    """

    if covariates is None:
        # default: one random string covariate per label
        covariates = [rand_string(6) for _ in range(len(label_proportions))]

    out = []
    for n in range(N):
        label = np.random.choice(range(len(label_proportions)),
                                 p=label_proportions)
        if np.random.rand() > error_proba:
            # with probability 1 - error_proba, use the covariate matching the label
            covariate = covariates[label]
        else:
            # otherwise choose a different covariate at random
            covariate = covariates[np.random.choice(
                [i for i in range(len(label_proportions)) if i != label])]
        out.append((covariate, 'label_' + str(label)))

    return pd.DataFrame(out, columns=['covariate', 'label'])
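A minimal usage sketch with hypothetical values: 1,000 rows, a skewed 70/20/10 label split, and a 5% chance of a mismatched covariate.

df = synthetic_label_shift_simple(N=1000,
                                  label_proportions=[0.7, 0.2, 0.1],
                                  error_proba=0.05)
print(df['label'].value_counts(normalize=True))  # roughly 0.7 / 0.2 / 0.1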
Example #2
    # requires: import random; import numpy as np; import pandas as pd
    def _inner_impl(
            feature_col='features',
            label_col='labels',
            n_samples=500,
            word_length=5,
            num_words=100,
            vocab_size=100,
            num_labels=10):

        """
        Generates text features and categorical labels.
        :param feature_col: name of feature column.
        :param label_col: name of label column.
        :param n_samples: how many rows to generate.
        :return: pd.DataFrame with columns = [feature_col, label_col]
        """

        vocab = [rand_string(word_length) for _ in range(vocab_size)]
        labels = vocab[:num_labels]   # first tokens act as labels
        words = vocab[num_labels:]    # remaining tokens are filler words

        def _sentence_with_label(labels=labels, words=words):
            """
            Generates a random token sequence containing a random label.

            :param labels: label set
            :param words: vocabulary of tokens
            :return: blank-separated token sequence and its label
            """
            label = random.choice(labels)
            tokens = [random.choice(words) for _ in range(num_words)] + [label]
            sentence = " ".join(np.random.permutation(tokens))

            return sentence, label

        sentences, labels = zip(*[_sentence_with_label(labels, words) for _ in range(n_samples)])
        df = pd.DataFrame({feature_col: sentences, label_col: labels})

        return df
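A sketch of calling this generator with hypothetical values (in the test below, the same logic is exposed through the data_frame fixture):

df = _inner_impl(feature_col='features', label_col='labels',
                 n_samples=200, num_words=20, vocab_size=50, num_labels=5)
print(df.head())  # each 'features' sentence contains its 'labels' token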
import os

from sklearn.metrics import f1_score

# datawig imports (module paths as in the datawig code base)
from datawig import SimpleImputer
from datawig.column_encoders import BowEncoder
from datawig.mxnet_input_symbols import BowFeaturizer
from datawig.utils import rand_string, random_split


def test_simple_imputer_real_data_default_args(test_dir, data_frame):
    """
    Tests SimpleImputer with default options on synthetic text data.
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 2000
    num_labels = 3
    seq_len = 100
    vocab_size = int(2**15)

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

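    # split into 80% train, 10% test, 10% validation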
    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_simple")

    df_train_cols_before = df_train.columns.tolist()

    input_columns = [feature_col]

    imputer = SimpleImputer(input_columns=input_columns,
                            output_column=label_col,
                            output_path=output_path).fit(train_df=df_train)

    logfile = os.path.join(imputer.output_path, 'imputer.log')
    assert os.path.exists(logfile)
    assert os.path.getsize(logfile) > 0

    assert imputer.output_path == output_path
    assert imputer.imputer.data_featurizers[0].__class__ == BowFeaturizer
    assert imputer.imputer.data_encoders[0].__class__ == BowEncoder
    assert set(
        imputer.imputer.data_encoders[0].input_columns) == set(input_columns)
    assert set(imputer.imputer.label_encoders[0].input_columns) == set(
        [label_col])

    assert all([
        after == before
        for after, before in zip(df_train.columns, df_train_cols_before)
    ])

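    # drop the label column from the test split and impute it back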
    df_no_label_column = df_test.copy()
    true_labels = df_test[label_col]
    del df_no_label_column[label_col]
    df_test_cols_before = df_no_label_column.columns.tolist()

    df_test_imputed = imputer.predict(df_no_label_column, inplace=True)

    assert all([
        after == before for after, before in zip(df_no_label_column.columns,
                                                 df_test_cols_before)
    ])

    imputed_columns = df_test_cols_before + [
        label_col + "_imputed", label_col + "_imputed_proba"
    ]

    assert all([
        after == before
        for after, before in zip(df_test_imputed, imputed_columns)
    ])

    f1 = f1_score(true_labels,
                  df_test_imputed[label_col + '_imputed'],
                  average="weighted")

    assert f1 > .9

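    # serialization round trip: rename the model directory and reload from disk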
    new_path = imputer.output_path + "-" + rand_string()

    os.rename(imputer.output_path, new_path)

    deserialized = SimpleImputer.load(new_path)
    df_test = deserialized.predict(df_test,
                                   imputation_suffix="_deserialized_imputed")
    f1 = f1_score(df_test[label_col],
                  df_test[label_col + '_deserialized_imputed'],
                  average="weighted")

    assert f1 > .9

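    # refit the deserialized imputer; here the training data also serves as the test split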
    retrained_simple_imputer = deserialized.fit(df_train, df_train)

    df_train_imputed = retrained_simple_imputer.predict(df_train.copy(),
                                                        inplace=True)
    f1 = f1_score(df_train[label_col],
                  df_train_imputed[label_col + '_imputed'],
                  average="weighted")

    assert f1 > .9

    metrics = retrained_simple_imputer.load_metrics()

    assert f1 == metrics['weighted_f1']
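Distilled from the test above, a minimal sketch of the SimpleImputer round trip (df_train and df_test are placeholders for the splits created earlier; the output path is hypothetical):

from datawig import SimpleImputer

imputer = SimpleImputer(input_columns=['string_feature'],
                        output_column='label',
                        output_path='./imputer_model').fit(train_df=df_train)

# predict() appends 'label_imputed' and 'label_imputed_proba' columns
predictions = imputer.predict(df_test)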