def check_dataset_super(current_dataset_fnct, ds_name, mu, df=None):
    """
    Given a data-set closure and a noise level, run the whole pipeline of adding label noise and checking the data set.
    :param current_dataset_fnct:    closure, returns a data set for a given noise level mu
    :param ds_name:                 str, name of the data set for display
    :param mu:                      float, fraction of samples to which label noise will be added, has to be in [0, 1]
    :param df:                      DataFrame, results will be appended to this DataFrame; if None, a new one is created
    :return:                        DataFrame with the results appended
    """
    # if necessary create new DataFrame
    if df is None:
        df = pd.DataFrame()

    # get the data set
    current_dataset = current_dataset_fnct(mu)

    # check the dataset
    res = check_dataset(X=current_dataset["data"],
                        y=current_dataset["target_noisy"])

    # calc goodness / recall
    recall = {}
    for alpha in [0.01, 0.02, 0.03]:
        recall[str(alpha)] = _calc_recall(
            pred=res["pred"],
            y_noisy=current_dataset["target_noisy"],
            y_true=current_dataset["target"],
            alpha=alpha,
            mu=mu)

    # calc accuracy / precision
    precision = {}
    for alpha in [0.01, 0.02, 0.03]:
        precision[str(alpha)] = _precision_on_k(
            pred=res["pred"],
            y_noisy=current_dataset["target_noisy"],
            y_true=current_dataset["target"],
            alpha=alpha)
    print("Found recall {} and precision {}".format(recall, precision))

    # ================================================================================================================
    # DataFrame.append was removed in pandas 2.0, so build the result row explicitly and concatenate it
    return pd.concat(
        [df,
         pd.DataFrame([{
             "Dataset": ds_name,
             "Size": current_dataset["data"].shape,
             "Classes": len(np.unique(current_dataset["target_noisy"], axis=0)),
             "Runtime": res["runtime"],
             "rec 0.01": round(recall["0.01"], 3),
             "rec 0.02": round(recall["0.02"], 3),
             "rec 0.03": round(recall["0.03"], 3),
             "prec 0.01": round(precision["0.01"], 3),
             "prec 0.02": round(precision["0.02"], 3),
             "prec 0.03": round(precision["0.03"], 3),
             "target imbalance": round(res["imbalance"], 3),
             "very best hyperparams": res["best_params"]
         }])],
        ignore_index=True)
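
# A minimal, hypothetical usage sketch (not part of the original file): it assumes the function
# above is importable together with numpy/pandas, and uses sklearn's iris data purely for
# illustration. The closure name `_make_noisy_iris` is invented here to show the expected
# contract of `current_dataset_fnct`: given mu, return a dict with "data", "target" and "target_noisy".
def _make_noisy_iris(mu):
    from sklearn.datasets import load_iris
    iris = load_iris()
    y_noisy = iris.target.copy()
    # flip a fraction mu of the labels to simulate label noise
    flip = np.random.choice(len(y_noisy), size=int(mu * len(y_noisy)), replace=False)
    y_noisy[flip] = (y_noisy[flip] + 1) % len(np.unique(iris.target))
    return {"data": iris.data, "target": iris.target, "target_noisy": y_noisy}

# df_results = check_dataset_super(_make_noisy_iris, "iris", mu=0.05)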
Example 2
    data_features = np.genfromtxt(feature_file, delimiter=',')
    bad_indexes = np.genfromtxt(index_file, delimiter=',', dtype='int')

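    # build the ground-truth noise vector: 1 marks a sample whose index is listed in index_file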
    y_true = np.zeros(NUM_SAMPLES)
    for index in bad_indexes:
        y_true[index] = 1

    f.write(str(len(raw_data)) + " samples in dataset\n")
    f.write(str(len(labels)) + " labels in dataset\n")
    f.write(str(NUM_CLASSES) + " distinct labels\n")

    preprocess_x_y_and_shuffle(raw_data, labels)

    for i in range(NUM_OF_RUNS):
        print("--------------Run Number: ", i + 1, "--------------------")
        res_ts = check_dataset(raw_data, labels)
        res_numerical = check_dataset(data_features,
                                      labels,
                                      hyperparams={
                                          "input_dim": data_features.shape[1],
                                          "output_dim": max(labels) + 1,
                                          "num_hidden": 3,
                                          "size_hidden": 100,
                                          "dropout": 0.1,
                                          "epochs": 400,
                                          "learn_rate": 1e-2,
                                          "activation": "relu"
                                      })

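        # keep only the top 1% of samples, i.e. those ranked most likely to be mislabeled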
        rem_percent = int(NUM_SAMPLES * 0.01)
        ts_y = np.array(res_ts["indices"][:rem_percent])
Example 3
            length=LENGTH,
            avg_pattern_length=pattern_length[label],
            avg_amplitude=amplitude[label],
            variance_pattern_length=var_pattern_length[label],
            variance_amplitude=var_amplitude[label])
        labels[i] = label
        if (random.randint(0, 99) < PERCENT_NOISE):
            noisy_labels[i] = (label + 1) % 2
        else:
            noisy_labels[i] = label

    #data, labels = generate_pattern_data_as_dataframe(length=LENGTH, numSamples=NUM_SAMPLES, numClasses=3)

    #pre-process and identify data
    data, labels = preprocess_x_y_and_shuffle(data, noisy_labels)

    res = check_dataset(data, noisy_labels)

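    # precision of the top 2% flagged samples, measured against the known clean labels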
    prec = _precision_on_k(res["pred"][:], noisy_labels, labels, 0.02)
    print("Precision is ", prec)

    # return first 100 questionable indices
    #print("The first 100 questionable pairs (x_i, y_i) are: {}".format(res["indices"][:100]))
    print("Top 10 questionable series: ")
    for i in res["indices"][:10]:
        print("\n-------Index ", i, "---------")
        print("Mean: ", np.mean(data[i][:]))
        print("Max: ", np.amax(data[i][:]))
        print("Max: ", np.amin(data[i][:]))
        print("Label: ", labels[i])
Example 4
        f.write("--------------Run Number: " + str(iter_num + 1) +
                "--------------------\n")

        #train and test on raw features
        X_train, X_test, y_train, y_test = train_test_split(raw_data,
                                                            labels,
                                                            test_size=0.2,
                                                            shuffle=True)

        classifier.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])
        classifier.summary()

        if ONLY_CLEAN_TRAIN:
            res_ts = check_dataset(X_train, y_train)
        else:
            res_ts = check_dataset(raw_data, labels)

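        # one-hot encode the training labels to match the categorical_crossentropy loss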
        y_train = to_categorical(y_train)
        classifier.fit(X_train, y_train, epochs=15, verbose=0)
        y_pred = classifier.predict(X_test)
        y_pred = decode_from_one_hot(y_pred)
        raw_precision[iter_num] = precision_score(y_test,
                                                  y_pred,
                                                  average='macro')
        raw_accuracy[iter_num] = accuracy_score(y_test, y_pred, normalize=True)
        raw_recall[iter_num] = recall_score(y_test, y_pred, average='macro')

        if ONLY_CLEAN_TRAIN:
            cleaned_data = X_train
Example 5
from utils.visualize import visualize_image
from labelfix import check_dataset, preprocess_x_y_and_shuffle
import tensorflow as tf

# In this example, we aim to find mislabeled instances in the fashion MNIST training data set
if __name__ == "__main__":
    # First, construct required dictionary using the fashion mnist training data
    (x_train, y_train), (_, _) = tf.keras.datasets.fashion_mnist.load_data()

    # check the data set
    x_train, y_train = preprocess_x_y_and_shuffle(x_train, y_train)
    res = check_dataset(x_train,
                        y_train,
                        hyperparams={
                            'activation': 'relu',
                            'dropout': 0.3,
                            'learn_rate': 0.001,
                            'num_hidden': 3,
                            'output_dim': 10,
                            'input_dim': 2048
                        })

    # plot the most likely mislabeled pairs (x, y) in 40 image batches and save them to disk
    for i in range(40):
        visualize_image(image_data=x_train,
                        image_labels=y_train,
                        label_names=[
                            "top/shirt", "trousers", "pullover", "dress",
                            "coat", "sandal", "shirt", "sneaker", "bag",
                            "ankle boot"
                        ],
                        indices=res["indices"],
Example 6
        '#a1a1a1', '#c1c1c1'
    ]

    colors = [
        'blue', 'green', 'cyan', 'gray', 'olive', 'brown', 'gold', 'darkgreen'
    ]

    raw_data = np.genfromtxt(data_file, delimiter=',')
    labels = np.genfromtxt(label_file, delimiter=',', dtype='int')
    preprocess_x_y_and_shuffle(raw_data, labels)

    first_run = True

    for i in range(NUM_OF_RUNS):
        print("--------------Run Number: ", i + 1, "--------------------")
        res_ts = check_dataset(raw_data, labels)

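        # intersect the 1000 most suspicious indices across runs so only consistently flagged samples remain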
        bad = np.array(res_ts["indices"][:1000])
        if first_run:
            all_bad = bad
            first_run = False
        else:
            all_bad = np.intersect1d(bad, all_bad)
        print(all_bad)
        gc.collect()

    all_bad = all_bad[:10]
    print("Bad indexes in HuaWei hand data: ", all_bad)
    np.savetxt("huawei_hand_bad_indexes.csv", all_bad, delimiter=",", fmt="%d")

    e = tsne(n_components=2,
Example 7
import sklearn.datasets

from labelfix import check_dataset, preprocess_x_y_and_shuffle
from utils.view_20_newsgroup import view_twenty

# example on how the system works on textual data
if __name__ == "__main__":
    print("working")
    # load the twenty newsgroup data set
    twenty_newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all",
                                                           shuffle=False)
    print("data loaded")

    # "data" is required to be a list of strings. Each string is the newsgroup article to be classified.
    # "target" is an array of ints representing the labels.
    twenty_newsgroup["data"], twenty_newsgroup[
        "target"] = preprocess_x_y_and_shuffle(twenty_newsgroup["data"],
                                               twenty_newsgroup["target"])
    print("data processed")
    res = check_dataset(twenty_newsgroup["data"], twenty_newsgroup["target"])

    # return first 100 questionable indices
    print("The first 100 questionable pairs (x_i, y_i) are: {}".format(
        res["indices"][:100]))

    # iterate over the findings and display both X (from the original corpus) and the questionable labels y
    for i in res["indices"]:
        print("Loading next document .. please be patient\n")
        view_twenty(
            sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=False),
            i)
        input("... Press Return for next")
Example 8
import pickle

from tensorflow.keras.datasets import cifar100

from utils.visualize import visualize_image
from labelfix import check_dataset, preprocess_x_y_and_shuffle

# In this example, we aim to find mislabeled instances in the CIFAR-100 training data set
if __name__ == "__main__":
    # First, construct required dictionary using the CIFAR-100 training data
    (x_train, y_train), (_, _) = cifar100.load_data(label_mode='fine')

    x_train, y_train = preprocess_x_y_and_shuffle(x_train, y_train)
    res = check_dataset(x_train, y_train)

    # load label names
    with open("../../res/cifar100/cifar100_names", 'rb') as f:
        dic = pickle.load(f, encoding='bytes')
    label_names = dic[b"fine_label_names"]

    # plot the most likely mislabeled pairs (x, y) in 40 image batches and save them to disk
    for i in range(40):
        visualize_image(image_data=x_train,
                        image_labels=y_train,
                        label_names=label_names,
                        indices=res["indices"],
                        batch_to_plot=i,
                        save_to_path="../../out/cifar100")

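    # keep the top 3% most suspicious indices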
    ids = res["indices"][:int(res["indices"].shape[0] * 0.03)]
    print(ids)