# Tail of the per-sample generation step: `i` and `label` come from the
# enclosing loop, whose header lies above this chunk. The statements up to the
# noisy-label assignment fill one row of `data` and the matching label slots.
data[i, :] = generate_pattern_data_as_array(
    length=LENGTH,
    avg_pattern_length=pattern_length[label],
    avg_amplitude=amplitude[label],
    variance_pattern_length=var_pattern_length[label],
    variance_amplitude=var_amplitude[label])
labels[i] = label
# With a chance of PERCENT_NOISE out of 100 the stored label is swapped to the
# other class ((label + 1) % 2); otherwise the clean label is kept.
if random.randint(0, 99) < PERCENT_NOISE:
    noisy_labels[i] = (label + 1) % 2
else:
    noisy_labels[i] = label

# data, labels = generate_pattern_data_as_dataframe(length=LENGTH, numSamples=NUM_SAMPLES, numClasses=3)

# pre-process and identify data
# NOTE(review): `labels` is rebound here to the second value returned for
# (data, noisy_labels), replacing the clean labels filled in above, while
# check_dataset below still receives the pre-call `noisy_labels` -- confirm
# that this pairing is what the precision computation expects.
data, labels = preprocess_x_y_and_shuffle(data, noisy_labels)
res = check_dataset(data, noisy_labels)
prec = _precision_on_k(res["pred"][:], noisy_labels, labels, 0.02)
print("Precision is ", prec)

# return first 100 questionable indices
# print("The first 100 questionable pairs (x_i, y_i) are: {}".format(res["indices"][:100]))

# Summarise the ten most suspicious series reported by check_dataset.
print("Top 10 questionable series: ")
for i in res["indices"][:10]:
    series = data[i]
    print("\n-------Index ", i, "---------")
    print("Mean: ", np.mean(series))
    print("Max: ", np.amax(series))
    # The minimum was previously printed under a second "Max: " label.
    print("Min: ", np.amin(series))
    print("Label: ", labels[i])
from utils.visualize import visualize_image from labelfix import check_dataset, preprocess_x_y_and_shuffle import tensorflow as tf # In this example, we aim to find mislabeled instances in the fashion MNIST training data set if __name__ == "__main__": # First, construct required dictionary using the fashion mnist training data (x_train, y_train), (_, _) = tf.keras.datasets.fashion_mnist.load_data() # check the data set x_train, y_train = preprocess_x_y_and_shuffle(x_train, y_train) res = check_dataset(x_train, y_train, hyperparams={ 'activation': 'relu', 'dropout': 0.3, 'learn_rate': 0.001, 'num_hidden': 3, 'output_dim': 10, 'input_dim': 2048 }) # plot four sets of images with the most likely mislabeled pairs (x, y) and save to disk for i in range(40): visualize_image(image_data=x_train, image_labels=y_train, label_names=[ "top/shirt", "trousers", "pullover", "dress", "coat", "sandal", "shirt", "sneaker", "bag", "ankle boot" ],
raw_data = np.genfromtxt(data_file, delimiter=',') labels = np.genfromtxt(label_file, delimiter=',', dtype='int') NUM_SAMPLES = len(raw_data) NUM_CLASSES = max(labels) + 1 data_features = np.genfromtxt(feature_file, delimiter=',') bad_indexes = np.genfromtxt(index_file, delimiter=',', dtype='int') y_true = np.zeros(NUM_SAMPLES) for index in bad_indexes: y_true[index] = 1 f.write(str(len(raw_data)) + " samples in dataset\n") f.write(str(len(labels)) + " labels in dataset\n") f.write(str(NUM_CLASSES) + " distinct labels\n") preprocess_x_y_and_shuffle(raw_data, labels) for i in range(NUM_OF_RUNS): print("--------------Run Number: ", i + 1, "--------------------") res_ts = check_dataset(raw_data, labels) res_numerical = check_dataset(data_features, labels, hyperparams={ "input_dim": data_features.shape[1], "output_dim": max(labels) + 1, "num_hidden": 3, "size_hidden": 100, "dropout": 0.1, "epochs": 400, "learn_rate": 1e-2, "activation": "relu"
# Importing the submodule explicitly guarantees that `sklearn.datasets` is
# loaded; a bare `import sklearn` does not import subpackages by itself.
import sklearn.datasets

from labelfix import check_dataset, preprocess_x_y_and_shuffle
from utils.view_20_newsgroup import view_twenty

# example on how the system works on textual data
if __name__ == "__main__":
    print("working")
    # load the twenty newsgroup data set
    twenty_newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=False)
    print("data loaded")

    # "data" is required to be a list of strings. Each string is the newsgroup article to be classified.
    # "target" is an array of ints representing the labels.
    # Both entries are overwritten in place with the pre-processed versions.
    twenty_newsgroup["data"], twenty_newsgroup["target"] = preprocess_x_y_and_shuffle(
        twenty_newsgroup["data"], twenty_newsgroup["target"])
    print("data processed")

    res = check_dataset(twenty_newsgroup["data"], twenty_newsgroup["target"])

    # return first 100 questionable indices
    print("The first 100 questionable pairs (x_i, y_i) are: {}".format(
        res["indices"][:100]))

    # `twenty_newsgroup` no longer holds the raw articles, so an untouched copy
    # of the corpus is fetched for display. It is fetched once, before the loop,
    # instead of once per displayed document.
    # NOTE(review): assumes view_twenty only reads the corpus it is given.
    original_corpus = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=False)

    # iterate over the findings and display both X (from the original corpus) and the questionable labels y
    for i in res["indices"]:
        print("Loading next document .. please be patient\n")
        view_twenty(original_corpus, i)
        input("... Press Return for next")