def test_cleanlab_with_fasttext():
    '''Tests FastTextClassifier when used with cleanlab to find a label error.'''
    if python_version.is_compatible():
        import cleanlab
        top = 3
        label_counts = list(zip(
            np.unique(y_train + y_test),
            cleanlab.util.value_counts(y_train + y_test),
        ))
        # Find which labels occur most often.
        top_labels = [v for v, c in sorted(
            label_counts, key=lambda x: x[1])[::-1][:top]]
        # Get indices of the data and labels belonging to the top labels.
        X_train_idx, y_train_top = [list(w) for w in zip(*[
            (i, z.split(" ", 1)[0]) for i, z in enumerate(train_data)
            if z.split(" ", 1)[0] in top_labels
        ])]
        X_test_idx, y_test_top = [list(w) for w in zip(*[
            (i, z.split(" ", 1)[0]) for i, z in enumerate(test_data)
            if z.split(" ", 1)[0] in top_labels
        ])]
        # Pre-train.
        ftc = FastTextClassifier(
            train_data_fn=DATA_DIR + 'cooking.train.txt',
            test_data_fn=DATA_DIR + 'cooking.test.txt',
            kwargs_train_supervised={
                'epoch': 20,
            },
            del_intermediate_data=True,
        )
        ftc.fit(X_train_idx, y_train_top)
        # Set epochs to 1 for getting cross-validated predicted probabilities.
        ftc.clf.epoch = 1
        # Dictionary mapping string labels to non-negative integers 0, 1, 2...
        label2num = dict(zip(np.unique(y_train_top), range(top)))
        # Map labels to integers.
        s_train = np.array([label2num[z] for z in y_train_top])
        # Compute the confident joint and the predicted probability matrix
        # for each example.
        cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
            X=np.array(X_train_idx),
            s=s_train,
            clf=ftc,
            cv_n_folds=5,
        )
        # Find indices of label errors.
        noise_idx = cleanlab.pruning.get_noise_indices(
            s_train,
            psx,
            confident_joint=cj,
        )
        # Extract the errors. This works by:
        # (1) masking the training examples we used with the identified noise
        #     indices, then
        # (2) looking up the actual train_data at those indices.
        errors = np.array(train_data)[np.array(X_train_idx)[noise_idx]]
        # Known error - this should be tagged as substitution, not baking.
        assert('__label__baking what can i use instead of corn syrup ?' in errors)
    assert(True)
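

# For reference, the same confident-joint workflow in isolation, using a
# generic scikit-learn classifier in place of FastTextClassifier. This is a
# minimal sketch, not part of the test suite: LogisticRegression and the
# synthetic arrays below are illustrative assumptions; the cleanlab calls
# mirror the ones used in the test above.
def _sketch_confident_joint_workflow():
    import cleanlab
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.rand(300, 2)             # toy feature matrix
    s = rng.randint(0, 3, size=300)  # noisy integer labels 0, 1, 2
    # Cross-validated predicted probabilities plus the confident joint.
    cj, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
        X=X,
        s=s,
        clf=LogisticRegression(),
        cv_n_folds=5,
    )
    # Boolean mask over the examples flagged as likely label errors.
    return cleanlab.pruning.get_noise_indices(s, psx, confident_joint=cj)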
for test_split in [10, 11]:
    train_fn = data_dir + 'train_{}_amazon5core.preprocessed.txt'.format(
        test_split)
    # Read the noisy labels from the training file in batches.
    noisy_labels = np.empty(file_len(train_fn), dtype=int)
    bs = 1000000
    label_map = {'__label__1': 0, '__label__3': 1, '__label__5': 2}
    for i, (l, t) in enumerate(data_loader(train_fn, batch_size=bs)):
        noisy_labels[bs * i:bs * (i + 1)] = [label_map[lab] for lab in l]
    ftc = FastTextClassifier(
        train_data_fn=train_fn,
        batch_size=100000,
        labels=[1, 3, 5],
        kwargs_train_supervised={
            'epoch': epochs,
            'thread': cpu_threads,
            'lr': lr,
            'wordNgrams': ngram,
            'bucket': 200000,
            'dim': dim,
            'loss': 'softmax',  # alternative: 'hs' (hierarchical softmax)
        },
    )
    pyx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
        X=np.arange(len(noisy_labels)),
        labels=noisy_labels,
        clf=ftc,
        cv_n_folds=cv_n_folds,
        seed=seed,
    )
    # Write out the predicted probabilities.
    wfn = pyx_dir + 'amazon_pyx_train_{}_cv__folds_{}__epochs_{}__lr_{}__ngram_{}__dim_{}.npy'.format(
y_train, X_train = [
    list(t) for t in zip(*(z.split(" ", 1) for z in train_data))
]
# Load the test text data.
with open(DATA_DIR + 'cooking.test.txt', 'r') as f:
    test_data = [z.strip() for z in f.readlines()]
y_test, X_test = [
    list(t) for t in zip(*(z.split(" ", 1) for z in test_data))
]
# Set up a FastTextClassifier model. Train it for five epochs.
ftc = FastTextClassifier(
    train_data_fn=DATA_DIR + 'cooking.train.txt',
    test_data_fn=DATA_DIR + 'cooking.test.txt',
    kwargs_train_supervised={
        'epoch': 5,
    },
    del_intermediate_data=True,
)
ftc.fit(X=None)


def test_predict_proba_masking():
    if python_version.is_compatible():
        psx = ftc.predict_proba(X=[500, 1000, 4999])
        assert (psx.shape[0] == 3)
    assert (True)


def test_predict_masking():
for i, params in enumerate(param_list):
    print(params)
    if i > 0:
        # Estimate remaining time from the average cost of the completed runs.
        elapsed = dt.now() - start_time
        total_time = elapsed * len(param_list) / float(i)
        remaining = total_time - elapsed
        print('Elapsed:', str(elapsed)[:-7], '| Remaining:', str(remaining)[:-7])
    ftc = FastTextClassifier(
        train_data_fn=write_dir + 'amazon5core.preprocessed.txt',
        batch_size=100000,
        labels=[1, 3, 5],
        kwargs_train_supervised={
            'epoch': params['epochs'],
            'thread': 12,
            'lr': params['lr'],
            'wordNgrams': params['ngram'],
            'bucket': 200000,
            'dim': params['dim'],
            'loss': 'softmax',  # alternative: 'hs' (hierarchical softmax)
        },
    )
    pyx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
        X=np.arange(len(labels)),
        labels=labels,
        clf=ftc,
        cv_n_folds=params['cv_n_folds'],
        seed=seed,
    )
    # Write out the predicted probabilities.
    wfn = write_dir + 'amazon_pyx_cv__folds_{}__epochs_{}__lr_{}__ngram_{}__dim_{}.npy'.format(