def check_dataset_super(current_dataset_fnct, ds_name, mu, df=None):
    """
    Run the whole pipeline (noisy data set -> check -> scoring) once and record the results as one row.

    The data set is obtained by calling ``current_dataset_fnct(mu)``, its noisy labels are checked with
    ``check_dataset`` and the outcome is scored with recall and precision at alpha = 0.01, 0.02 and 0.03.

    :param current_dataset_fnct: closure, called as ``current_dataset_fnct(mu)``; the returned data set is
        read through the keys "data", "target" and "target_noisy"
    :param ds_name: str, name of the data set for display
    :param mu: float, percentage of data to which label noise will be added, has to be in [0,1]
    :param df: DataFrame, the result row is appended to the rows of this DataFrame, if None, a new one is created
    :return: DataFrame, a new frame holding the rows of ``df`` followed by the new result row
    """
    alphas = (0.01, 0.02, 0.03)

    # get the data set
    current_dataset = current_dataset_fnct(mu)
    y_noisy = current_dataset["target_noisy"]
    y_true = current_dataset["target"]

    # check the dataset
    res = check_dataset(X=current_dataset["data"], y=y_noisy)

    # calc goodness / recall (keyed by str(alpha), e.g. "0.01")
    recall = {
        str(alpha): _calc_recall(pred=res["pred"], y_noisy=y_noisy, y_true=y_true, alpha=alpha, mu=mu)
        for alpha in alphas
    }

    # calc accuracy / precision (same keys as recall)
    precision = {
        str(alpha): _precision_on_k(pred=res["pred"], y_noisy=y_noisy, y_true=y_true, alpha=alpha)
        for alpha in alphas
    }

    print("Found recall {} and precision {}".format(recall, precision))

    # Assemble the result row; dict insertion order fixes the column order.
    row = {
        "Dataset": ds_name,
        "Size": current_dataset["data"].shape,
        "Classes": len(np.unique(y_noisy, axis=0)),
        "Runtime": res["runtime"],
    }
    for key, value in recall.items():
        row["rec {}".format(key)] = round(value, 3)
    for key, value in precision.items():
        row["prec {}".format(key)] = round(value, 3)
    row["target imbalance"] = round(res["imbalance"], 3)
    row["very best hyperparams"] = res["best_params"]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
    # Wrapping the dict in a list keeps tuple / dict values (Size, hyperparams) as single cells.
    new_row = pd.DataFrame([row])
    if df is None:
        return new_row
    return pd.concat([df, new_row], ignore_index=True)
data_features = np.genfromtxt(feature_file, delimiter=',') bad_indexes = np.genfromtxt(index_file, delimiter=',', dtype='int') y_true = np.zeros(NUM_SAMPLES) for index in bad_indexes: y_true[index] = 1 f.write(str(len(raw_data)) + " samples in dataset\n") f.write(str(len(labels)) + " labels in dataset\n") f.write(str(NUM_CLASSES) + " distinct labels\n") preprocess_x_y_and_shuffle(raw_data, labels) for i in range(NUM_OF_RUNS): print("--------------Run Number: ", i + 1, "--------------------") res_ts = check_dataset(raw_data, labels) res_numerical = check_dataset(data_features, labels, hyperparams={ "input_dim": data_features.shape[1], "output_dim": max(labels) + 1, "num_hidden": 3, "size_hidden": 100, "dropout": 0.1, "epochs": 400, "learn_rate": 1e-2, "activation": "relu" }) rem_percent = int(NUM_SAMPLES * 0.01) ts_y = np.array(res_ts["indices"][:rem_percent])
length=LENGTH, avg_pattern_length=pattern_length[label], avg_amplitude=amplitude[label], variance_pattern_length=var_pattern_length[label], variance_amplitude=var_amplitude[label]) labels[i] = label if (random.randint(0, 99) < PERCENT_NOISE): noisy_labels[i] = (label + 1) % 2 else: noisy_labels[i] = label #data, labels = generate_pattern_data_as_dataframe(length=LENGTH, numSamples=NUM_SAMPLES, numClasses=3) #pre-process and identify data data, labels = preprocess_x_y_and_shuffle(data, noisy_labels) res = check_dataset(data, noisy_labels) prec = _precision_on_k(res["pred"][:], noisy_labels, labels, 0.02) print("Precision is ", prec) # return first 100 questionable indices #print("The first 100 questionable pairs (x_i, y_i) are: {}".format(res["indices"][:100])) print("Top 10 questionable series: ") for i in res["indices"][:10]: print("\n-------Index ", i, "---------") print("Mean: ", np.mean(data[i][:])) print("Max: ", np.amax(data[i][:])) print("Max: ", np.amin(data[i][:])) print("Label: ", labels[i])
f.write("--------------Run Number: " + str(iter_num + 1) + "--------------------\n") #train and test on raw features X_train, X_test, y_train, y_test = train_test_split(raw_data, labels, test_size=0.2, shuffle=True) classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) classifier.summary() if ONLY_CLEAN_TRAIN: res_ts = check_dataset(X_train, y_train) else: res_ts = check_dataset(raw_data, labels) y_train = to_categorical(y_train) classifier.fit(X_train, y_train, epochs=15, verbose=0) y_pred = classifier.predict(X_test) y_pred = decode_from_one_hot(y_pred) raw_precision[iter_num] = precision_score(y_test, y_pred, average='macro') raw_accuracy[iter_num] = accuracy_score(y_test, y_pred, normalize=True) raw_recall[iter_num] = recall_score(y_test, y_pred, average='macro') if ONLY_CLEAN_TRAIN: cleaned_data = X_train
from utils.visualize import visualize_image from labelfix import check_dataset, preprocess_x_y_and_shuffle import tensorflow as tf # In this example, we aim to find mislabeled instances in the fashion MNIST training data set if __name__ == "__main__": # First, construct required dictionary using the fashion mnist training data (x_train, y_train), (_, _) = tf.keras.datasets.fashion_mnist.load_data() # check the data set x_train, y_train = preprocess_x_y_and_shuffle(x_train, y_train) res = check_dataset(x_train, y_train, hyperparams={ 'activation': 'relu', 'dropout': 0.3, 'learn_rate': 0.001, 'num_hidden': 3, 'output_dim': 10, 'input_dim': 2048 }) # plot four sets of images with the most likely mislabeled pairs (x, y) and save to disk for i in range(40): visualize_image(image_data=x_train, image_labels=y_train, label_names=[ "top/shirt", "trousers", "pullover", "dress", "coat", "sandal", "shirt", "sneaker", "bag", "ankle boot" ], indices=res["indices"],
'#a1a1a1', '#c1c1c1' ] colors = [ 'blue', 'green', 'cyan', 'gray', 'olive', 'brown', 'gold', 'darkgreen' ] raw_data = np.genfromtxt(data_file, delimiter=',') labels = np.genfromtxt(label_file, delimiter=',', dtype='int') preprocess_x_y_and_shuffle(raw_data, labels) first_run = True for i in range(NUM_OF_RUNS): print("--------------Run Number: ", i + 1, "--------------------") res_ts = check_dataset(raw_data, labels) bad = np.array(res_ts["indices"][:1000]) if first_run: all_bad = bad first_run = False else: all_bad = np.intersect1d(bad, all_bad) print(all_bad) gc.collect() all_bad = all_bad[:10] print("Bad indexes in HuaWei hand data: ", all_bad) np.savetxt("huawei_hand_bad_indexes.csv", all_bad, delimiter=",", fmt="%d") e = tsne(n_components=2,
from labelfix import check_dataset, preprocess_x_y_and_shuffle
from utils.view_20_newsgroup import view_twenty

# example on how the system works on textual data
if __name__ == "__main__":
    print("working")
    # load the twenty newsgroup data set
    twenty_newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=False)
    print("data loaded")

    # "data" is required to be a list of strings. Each string is the newsgroup article to be classified.
    # "target" is an array of ints representing the labels.
    twenty_newsgroup["data"], twenty_newsgroup["target"] = preprocess_x_y_and_shuffle(
        twenty_newsgroup["data"], twenty_newsgroup["target"])
    print("data processed")

    res = check_dataset(twenty_newsgroup["data"], twenty_newsgroup["target"])

    # return first 100 questionable indices
    print("The first 100 questionable pairs (x_i, y_i) are: {}".format(res["indices"][:100]))

    # The "data" / "target" entries of twenty_newsgroup were overwritten above, so the original corpus
    # has to be fetched again for display. The fetch does not depend on the loop variable, so it is done
    # once here instead of once per displayed document.
    # NOTE(review): the indices come from the pre-processed (presumably shuffled) data while this corpus is
    # loaded with shuffle=False -- confirm that both refer to the same documents.
    original_corpus = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=False)

    # iterate over the findings and display both X (from the original corpus) and the questionable labels y
    for i in res["indices"]:
        print("Loading next document .. please be patient\n")
        view_twenty(original_corpus, i)
        input("... Press Return for next")
import pickle
from tensorflow.keras.datasets import cifar100
from utils.visualize import visualize_image
from labelfix import check_dataset, preprocess_x_y_and_shuffle

# Example script: search the CIFAR-100 training split for instances that are likely mislabeled.
if __name__ == "__main__":
    # Only the training split is used; the test split is discarded.
    train_split, _ = cifar100.load_data(label_mode='fine')
    images, targets = train_split

    # Pre-process the data, then run the label check on it.
    images, targets = preprocess_x_y_and_shuffle(images, targets)
    result = check_dataset(images, targets)

    # Read the fine label names from the pickled meta file shipped with the project resources.
    with open("../../res/cifar100/cifar100_names", 'rb') as names_file:
        meta = pickle.load(names_file, encoding='bytes')
    fine_names = meta[b"fine_label_names"]

    # Call visualize_image for batches 0..39 of the flagged (x, y) pairs, saving below ../../out/cifar100.
    for batch in range(40):
        visualize_image(image_data=images,
                        image_labels=targets,
                        label_names=fine_names,
                        indices=result["indices"],
                        batch_to_plot=batch,
                        save_to_path="../../out/cifar100")

    # Print the leading 3% of the returned indices.
    flagged = result["indices"]
    cutoff = int(flagged.shape[0] * 0.03)
    ids = flagged[:cutoff]
    print(ids)