def test_cleanlab_with_fasttext():
    '''Tests FastTextClassifier when used with cleanlab to find a label error.'''
    if python_version.is_compatible():
        import cleanlab
        top = 3
        label_counts = list(zip(
            np.unique(y_train + y_test),
            cleanlab.util.value_counts(y_train + y_test),
        ))
        # Find which labels occur most often.
        top_labels = [v for v, c in sorted(
            label_counts, key=lambda x: x[1])[::-1][:top]]
        # Get indices of the data and labels belonging to the top labels.
        X_train_idx, y_train_top = [list(w) for w in zip(*[
            (i, z.split(" ", 1)[0]) for i, z in enumerate(train_data)
            if z.split(" ", 1)[0] in top_labels
        ])]
        X_test_idx, y_test_top = [list(w) for w in zip(*[
            (i, z.split(" ", 1)[0]) for i, z in enumerate(test_data)
            if z.split(" ", 1)[0] in top_labels
        ])]
        # Pre-train.
        ftc = FastTextClassifier(
            train_data_fn=DATA_DIR + 'cooking.train.txt',
            test_data_fn=DATA_DIR + 'cooking.test.txt',
            kwargs_train_supervised={
                'epoch': 20,
            },
            del_intermediate_data=True,
        )
        ftc.fit(X_train_idx, y_train_top)
        # Set epochs to 1 for getting cross-validated predicted probabilities.
        ftc.clf.epoch = 1
        # Dictionary mapping string labels to non-negative integers 0, 1, 2...
        label2num = dict(zip(np.unique(y_train_top), range(top)))
        # Map labels to integers.
        s_train = np.array([label2num[z] for z in y_train_top])
        # Compute the confident joint and the predicted probability matrix
        # for each example.
        cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
            X=np.array(X_train_idx),
            s=s_train,
            clf=ftc,
            cv_n_folds=5,
        )
        # Find indices of label errors.
        noise_idx = cleanlab.pruning.get_noise_indices(
            s_train,
            psx,
            confident_joint=cj,
        )
        # Extract the errors. This works by:
        # (1) masking the training examples we used with the identified noise
        #     indices, then
        # (2) looking up the actual train_data at those indices.
        errors = np.array(train_data)[np.array(X_train_idx)[noise_idx]]
        # Known error - this should be tagged as substitution, not baking.
        assert('__label__baking what can i use instead of corn syrup ?' in errors)
    assert(True)
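

# For reference, the same confident-joint workflow in isolation, using a
# generic scikit-learn classifier in place of FastTextClassifier. This is a
# minimal sketch, not part of the test suite: LogisticRegression and the
# synthetic arrays below are illustrative assumptions; the cleanlab calls
# mirror the ones used in the test above.
def _sketch_confident_joint_workflow():
    import cleanlab
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.rand(300, 2)             # toy feature matrix
    s = rng.randint(0, 3, size=300)  # noisy integer labels 0, 1, 2
    # Cross-validated predicted probabilities plus the confident joint.
    cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
        X=X,
        s=s,
        clf=LogisticRegression(),
        cv_n_folds=5,
    )
    # Boolean mask over the examples flagged as likely label errors.
    return cleanlab.pruning.get_noise_indices(s, psx, confident_joint=cj)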
for test_split in [10, 11]:
    train_fn = data_dir + 'train_{}_amazon5core.preprocessed.txt'.format(
        test_split)
    # Read the noisy labels from the training file in batches.
    noisy_labels = np.empty(file_len(train_fn), dtype=int)
    bs = 1000000
    label_map = {'__label__1': 0, '__label__3': 1, '__label__5': 2}
    for i, (l, t) in enumerate(data_loader(train_fn, batch_size=bs)):
        noisy_labels[bs * i:bs * (i + 1)] = [label_map[lab] for lab in l]
    ftc = FastTextClassifier(
        train_data_fn=train_fn,
        batch_size=100000,
        labels=[1, 3, 5],
        kwargs_train_supervised={
            'epoch': epochs,
            'thread': cpu_threads,
            'lr': lr,
            'wordNgrams': ngram,
            'bucket': 200000,
            'dim': dim,
            'loss': 'softmax',  # alternative: 'hs' (hierarchical softmax)
        },
    )
    pyx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
        X=np.arange(len(noisy_labels)),
        labels=noisy_labels,
        clf=ftc,
        cv_n_folds=cv_n_folds,
        seed=seed,
    )
    # Write out the predicted probabilities.
    wfn = pyx_dir + 'amazon_pyx_train_{}_cv__folds_{}__epochs_{}__lr_{}__ngram_{}__dim_{}.npy'.format(
y_train, X_train = [
    list(t) for t in zip(*(z.split(" ", 1) for z in train_data))
]
# Load the test text data.
with open(DATA_DIR + 'cooking.test.txt', 'r') as f:
    test_data = [z.strip() for z in f.readlines()]
y_test, X_test = [
    list(t) for t in zip(*(z.split(" ", 1) for z in test_data))
]
# Set up a FastTextClassifier model. Train it for five epochs.
ftc = FastTextClassifier(
    train_data_fn=DATA_DIR + 'cooking.train.txt',
    test_data_fn=DATA_DIR + 'cooking.test.txt',
    kwargs_train_supervised={
        'epoch': 5,
    },
    del_intermediate_data=True,
)
ftc.fit(X=None)


def test_predict_proba_masking():
    if python_version.is_compatible():
        psx = ftc.predict_proba(X=[500, 1000, 4999])
        assert (psx.shape[0] == 3)
    assert (True)


def test_predict_masking():
for i, params in enumerate(param_list):
    print(params)
    if i > 0:
        # Estimate remaining time from the average cost of the completed runs.
        elapsed = dt.now() - start_time
        total_time = elapsed * len(param_list) / float(i)
        remaining = total_time - elapsed
        print('Elapsed:', str(elapsed)[:-7], '| Remaining:', str(remaining)[:-7])
    ftc = FastTextClassifier(
        train_data_fn=write_dir + 'amazon5core.preprocessed.txt',
        batch_size=100000,
        labels=[1, 3, 5],
        kwargs_train_supervised={
            'epoch': params['epochs'],
            'thread': 12,
            'lr': params['lr'],
            'wordNgrams': params['ngram'],
            'bucket': 200000,
            'dim': params['dim'],
            'loss': 'softmax',  # alternative: 'hs' (hierarchical softmax)
        },
    )
    pyx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
        X=np.arange(len(labels)),
        labels=labels,
        clf=ftc,
        cv_n_folds=params['cv_n_folds'],
        seed=seed,
    )
    # Write out the predicted probabilities.
    wfn = write_dir + 'amazon_pyx_cv__folds_{}__epochs_{}__lr_{}__ngram_{}__dim_{}.npy'.format(