Example 1
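# The imports, constants, and loop header below are a reconstruction: this
# listing begins partway through the original time-series example. All constant
# values are assumptions, not the original settings, and the helper imports
# (generate_pattern_data_as_array, _precision_on_k) are assumed to come from
# the labelfix package, as the rest of the example suggests.
import random

import numpy as np

from labelfix import (check_dataset, preprocess_x_y_and_shuffle,
                      generate_pattern_data_as_array, _precision_on_k)

if __name__ == "__main__":
    NUM_SAMPLES = 1000   # assumed: number of series to generate
    LENGTH = 100         # assumed: length of each series
    PERCENT_NOISE = 10   # assumed: percentage of labels to flip

    # assumed per-class generator parameters (two classes, see (label + 1) % 2)
    pattern_length = {0: 10, 1: 20}
    amplitude = {0: 1.0, 1: 2.0}
    var_pattern_length = {0: 5, 1: 5}
    var_amplitude = {0: 0.5, 1: 0.5}

    data = np.zeros((NUM_SAMPLES, LENGTH))
    labels = np.zeros(NUM_SAMPLES, dtype=int)
    noisy_labels = np.zeros(NUM_SAMPLES, dtype=int)

    for i in range(NUM_SAMPLES):
        label = random.randint(0, 1)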
        data[i, :] = generate_pattern_data_as_array(
            length=LENGTH,
            avg_pattern_length=pattern_length[label],
            avg_amplitude=amplitude[label],
            variance_pattern_length=var_pattern_length[label],
            variance_amplitude=var_amplitude[label])
        labels[i] = label
        # flip the label with probability PERCENT_NOISE / 100
        if random.randint(0, 99) < PERCENT_NOISE:
            noisy_labels[i] = (label + 1) % 2
        else:
            noisy_labels[i] = label

    #data, labels = generate_pattern_data_as_dataframe(length=LENGTH, numSamples=NUM_SAMPLES, numClasses=3)

    # pre-process and shuffle the data; unpack into noisy_labels (not labels)
    # so the clean labels used for evaluation below are not overwritten
    data, noisy_labels = preprocess_x_y_and_shuffle(data, noisy_labels)

    res = check_dataset(data, noisy_labels)

    prec = _precision_on_k(res["pred"][:], noisy_labels, labels, 0.02)
    print("Precision is ", prec)

    # return first 100 questionable indices
    #print("The first 100 questionable pairs (x_i, y_i) are: {}".format(res["indices"][:100]))
    print("Top 10 questionable series: ")
    for i in res["indices"][:10]:
        print("\n-------Index ", i, "---------")
        print("Mean: ", np.mean(data[i][:]))
        print("Max: ", np.amax(data[i][:]))
        print("Max: ", np.amin(data[i][:]))
        print("Label: ", labels[i])

Example 2

from utils.visualize import visualize_image
from labelfix import check_dataset, preprocess_x_y_and_shuffle
import tensorflow as tf

# In this example, we aim to find mislabeled instances in the fashion MNIST training data set
if __name__ == "__main__":
    # First, load the fashion MNIST training data
    (x_train, y_train), (_, _) = tf.keras.datasets.fashion_mnist.load_data()

    # check the data set
    x_train, y_train = preprocess_x_y_and_shuffle(x_train, y_train)
    res = check_dataset(x_train,
                        y_train,
                        hyperparams={
                            'activation': 'relu',
                            'dropout': 0.3,
                            'learn_rate': 0.001,
                            'num_hidden': 3,
                            'output_dim': 10,
                            'input_dim': 2048
                        })
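    # check_dataset returns a dict of results; as the other examples in this
    # listing show, res["indices"] ranks the samples from most to least likely
    # to be mislabeled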

    # plot images of the most likely mislabeled pairs (x, y) and save them to disk
    for i in range(40):
        visualize_image(image_data=x_train,
                        image_labels=y_train,
                        label_names=[
                            "top/shirt", "trousers", "pullover", "dress",
                            "coat", "sandal", "shirt", "sneaker", "bag",
                            "ankle boot"
                        ],
                        # the remaining arguments were cut off in this listing;
                        # the parameter names below are assumptions, not the
                        # confirmed utils.visualize API
                        index=res["indices"][i],
                        save_to_file="mislabeled_{}.png".format(i))
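
Example 3

# The imports and definitions below are a reconstruction: this listing begins
# partway through the original example, which reads a time-series data set and
# pre-computed features from CSV files. The file paths, the log file f, and
# NUM_OF_RUNS are placeholder assumptions, not the original values.
import numpy as np

from labelfix import check_dataset, preprocess_x_y_and_shuffle

if __name__ == "__main__":
    NUM_OF_RUNS = 5                  # assumed: number of repeated runs
    data_file = "data.csv"           # assumed input paths
    label_file = "labels.csv"
    feature_file = "features.csv"
    index_file = "bad_indexes.csv"
    f = open("results.txt", "w")     # assumed: log file for the results
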
    raw_data = np.genfromtxt(data_file, delimiter=',')
    labels = np.genfromtxt(label_file, delimiter=',', dtype='int')
    NUM_SAMPLES = len(raw_data)
    NUM_CLASSES = max(labels) + 1
    data_features = np.genfromtxt(feature_file, delimiter=',')
    bad_indexes = np.genfromtxt(index_file, delimiter=',', dtype='int')

    y_true = np.zeros(NUM_SAMPLES)
    for index in bad_indexes:
        y_true[index] = 1

    f.write(str(len(raw_data)) + " samples in dataset\n")
    f.write(str(len(labels)) + " labels in dataset\n")
    f.write(str(NUM_CLASSES) + " distinct labels\n")

    # the original listing discards the return value here, which suggests the
    # shuffle operates in place
    preprocess_x_y_and_shuffle(raw_data, labels)

    for i in range(NUM_OF_RUNS):
        print("--------------Run Number: ", i + 1, "--------------------")
        res_ts = check_dataset(raw_data, labels)
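        # the raw series use check_dataset's default hyperparameters; the
        # pre-computed numerical features below get an explicit configuration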
        res_numerical = check_dataset(data_features,
                                      labels,
                                      hyperparams={
                                          "input_dim": data_features.shape[1],
                                          "output_dim": max(labels) + 1,
                                          "num_hidden": 3,
                                          "size_hidden": 100,
                                          "dropout": 0.1,
                                          "epochs": 400,
                                          "learn_rate": 1e-2,
                                          "activation": "relu"

Example 4

import sklearn.datasets
from labelfix import check_dataset, preprocess_x_y_and_shuffle
from utils.view_20_newsgroup import view_twenty

# example on how the system works on textual data
if __name__ == "__main__":
    print("working")
    # load the twenty newsgroup data set
    twenty_newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all",
                                                           shuffle=False)
    print("data loaded")

    # "data" is required to be a list of strings. Each string is the newsgroup article to be classified.
    # "target" is an array of ints representing the labels.
    twenty_newsgroup["data"], twenty_newsgroup[
        "target"] = preprocess_x_y_and_shuffle(twenty_newsgroup["data"],
                                               twenty_newsgroup["target"])
    print("data processed")
    res = check_dataset(twenty_newsgroup["data"], twenty_newsgroup["target"])

    # return first 100 questionable indices
    print("The first 100 questionable pairs (x_i, y_i) are: {}".format(
        res["indices"][:100]))

    # iterate over the findings and display both X (from the original corpus) and the questionable labels y;
    # fetch the corpus once instead of re-downloading it on every iteration
    corpus = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=False)
    for i in res["indices"]:
        print("Loading next document .. please be patient\n")
        view_twenty(corpus, i)
        input("... Press Return for next")