Example #1
def test_cleanlab_with_fasttext():
    '''Tests FastTextClassifier when used with cleanlab to find a label error.'''
    
    if python_version.is_compatible():
        import cleanlab

        top = 3
        label_counts = list(zip(np.unique(y_train + y_test), cleanlab.util.value_counts(y_train + y_test)))
        # Find which labels occur the most often.
        top_labels = [v for v,c in sorted(label_counts, key=lambda x: x[1])[::-1][:top]]

        # Get indices of data and labels for the top labels
        X_train_idx, y_train_top = [list(w) for w in zip(*[(i, z.split(" ", 1)[0]) for i, z in enumerate(train_data) if z.split(" ", 1)[0] in top_labels])]
        X_test_idx, y_test_top = [list(w) for w in zip(*[(i, z.split(" ", 1)[0]) for i, z in enumerate(test_data) if z.split(" ", 1)[0] in top_labels])]
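        # X_train_idx / X_test_idx are integer row indices into the fastText-formatted data files;
        # the FastTextClassifier wrapper below consumes these indices rather than raw text.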

        # Pre-train
        ftc = FastTextClassifier(
            train_data_fn = DATA_DIR + 'cooking.train.txt', 
            test_data_fn = DATA_DIR + 'cooking.test.txt', 
            kwargs_train_supervised = {
                'epoch': 20,
            },
            del_intermediate_data = True,
        )
        ftc.fit(X_train_idx, y_train_top)
        # Set epochs to 1 when computing the cross-validated predicted probabilities
        ftc.clf.epoch = 1

        # Dictionary mapping string labels to non-negative integers 0, 1, 2...
        label2num = dict(zip(np.unique(y_train_top), range(top)))
        # Map labels
        s_train = np.array([label2num[z] for z in y_train_top])
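        # cleanlab expects class labels as integers in {0, ..., K-1}, which s_train now provides.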
        # Compute confident joint and predicted probability matrix for each example
        cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
            X = np.array(X_train_idx), 
            s = s_train, 
            clf = ftc, 
            cv_n_folds=5,
        )
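        # cj: the K x K confident joint, i.e. counts of examples for each (given label, estimated true label) pair.
        # psx: the n x K matrix of out-of-sample predicted probabilities from cross-validation.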
        # Find indices of errors
        noise_idx = cleanlab.pruning.get_noise_indices(
            s_train, 
            psx, 
            confident_joint=cj, 
        )
        # Extract the errors. This works by:
        # (1) masking the training indices we used with the noise mask identified above, and
        # (2) looking up the actual train_data entries at those positions.
        errors = np.array(train_data)[np.array(X_train_idx)[noise_idx]]

        # Known error - this should be tagged as substitution, not baking.
        assert('__label__baking what can i use instead of corn syrup ?' in errors)
    assert(True)
    for test_split in [10, 11]:
        train_fn = data_dir + 'train_{}_amazon5core.preprocessed.txt'.format(test_split)
        # Get labels
        noisy_labels = np.empty(file_len(train_fn), dtype=int)
        bs = 1000000
        label_map = {'__label__1':0, '__label__3':1, '__label__5':2}
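        # data_loader (defined elsewhere) yields (labels, texts) batches of bs lines;
        # fill noisy_labels chunk by chunk with the integer-encoded labels.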
        for i, (l, t) in enumerate(data_loader(train_fn, batch_size=bs)):
            noisy_labels[bs*i:bs*(i+1)] = [label_map[lab] for lab in l]

        ftc = FastTextClassifier(
            train_data_fn=train_fn, 
            batch_size=100000, 
            labels=[1, 3, 5],
            kwargs_train_supervised = {
                'epoch': epochs,
                'thread': cpu_threads,
                'lr': lr,
                'wordNgrams': ngram,
                'bucket': 200000,
                'dim': dim,
                'loss': 'softmax',  # alternative: 'hs'
            },
        )
        pyx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
            X=np.arange(len(noisy_labels)),
            labels=noisy_labels,
            clf=ftc,
            cv_n_folds=cv_n_folds,
            seed=seed,
        )
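        # pyx: out-of-sample predicted probabilities from cv_n_folds-fold cross-validation,
        # shape (len(noisy_labels), 3).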
        # Write out
        wfn = pyx_dir + 'amazon_pyx_train_{}_cv__folds_{}__epochs_{}__lr_{}__ngram_{}__dim_{}.npy'.format(
            test_split, cv_n_folds, epochs, lr, ngram, dim)
        np.save(wfn, pyx)
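
A minimal sketch of how the saved pyx matrix could later be combined with the noisy labels to flag likely label errors, assuming the wfn path and noisy_labels array from the loop above and reusing the cleanlab.pruning.get_noise_indices call already shown in Example #1:

import numpy as np
import cleanlab

pyx = np.load(wfn)  # out-of-sample predicted probabilities written above
# Boolean mask over the training examples, True where the given label is likely wrong.
label_error_mask = cleanlab.pruning.get_noise_indices(
    s=noisy_labels,
    psx=pyx,
)
print('Flagged {} of {} reviews as likely label errors.'.format(
    label_error_mask.sum(), len(noisy_labels)))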
Example #3
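    # Each line of the fastText-formatted data is '__label__<tag> <text>'; split off the label from the text.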
    y_train, X_train = [
        list(t) for t in zip(*(z.split(" ", 1) for z in train_data))
    ]

    # Load test text data
    with open(DATA_DIR + 'cooking.test.txt', 'r') as f:
        test_data = [z.strip() for z in f.readlines()]
    y_test, X_test = [
        list(t) for t in zip(*(z.split(" ", 1) for z in test_data))
    ]

    # Set up a FastTextClassifier model and train it for five epochs.
    ftc = FastTextClassifier(
        train_data_fn=DATA_DIR + 'cooking.train.txt',
        test_data_fn=DATA_DIR + 'cooking.test.txt',
        kwargs_train_supervised={
            'epoch': 5,
        },
        del_intermediate_data=True,
    )
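    # With X=None, the wrapper trains on the full train_data_fn file rather than a subset of row indices.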
    ftc.fit(X=None)


def test_predict_proba_masking():

    if python_version.is_compatible():
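        # predict_proba takes row indices into the training file and returns one row of class probabilities per index.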
        psx = ftc.predict_proba(X=[500, 1000, 4999])
        assert (psx.shape[0] == 3)
    assert (True)


def test_predict_masking():
Example #4
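# param_list (built earlier, not shown here) is assumed to be a list of dicts with keys 'epochs', 'lr', 'ngram', 'dim', and 'cv_n_folds'.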
for i, params in enumerate(param_list):
    print(params)
    if i > 0:
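        # Estimate total runtime from the average time per completed setting; str(...)[:-7] trims the microseconds.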
        elapsed = dt.now() - start_time
        total_time = elapsed * len(param_list) / float(i)
        remaining = total_time - elapsed
        print('Elapsed:',
              str(elapsed)[:-7], '| Remaining:',
              str(remaining)[:-7])
    ftc = FastTextClassifier(
        train_data_fn=write_dir + 'amazon5core.preprocessed.txt',
        batch_size=100000,
        labels=[1, 3, 5],
        kwargs_train_supervised={
            'epoch': params['epochs'],
            'thread': 12,
            'lr': params['lr'],
            'wordNgrams': params['ngram'],
            'bucket': 200000,
            'dim': params['dim'],
            'loss': 'softmax',  # alternative: 'hs'
        },
    )
    pyx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
        X=np.arange(len(labels)),
        labels=labels,
        clf=ftc,
        cv_n_folds=params['cv_n_folds'],
        seed=seed,
    )
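    # pyx: out-of-sample predicted probabilities, one row per example and one column per class in labels=[1, 3, 5].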
    # Write out
    wfn = write_dir + 'amazon_pyx_cv__folds_{}__epochs_{}__lr_{}__ngram_{}__dim_{}.npy'.format(