Example #1
def merge_prune(train_folders, test_folders):
    train_datasets = maybe_pickle(train_folders, 45000)
    test_datasets = maybe_pickle(test_folders, 1800)

    train_size = 200000
    valid_size = 10000
    test_size = 10000

    valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
        train_datasets, train_size, valid_size)
    _, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

    print('Training:', train_dataset.shape, train_labels.shape)
    print('Validation:', valid_dataset.shape, valid_labels.shape)
    print('Testing:', test_dataset.shape, test_labels.shape)

    train_dataset, train_labels = randomize(train_dataset, train_labels)
    test_dataset, test_labels = randomize(test_dataset, test_labels)
    valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

    pickle_file = 'notMNIST.pickle'
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    save_obj(pickle_file, save)
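These snippets rely on save_obj and load_pickle helpers that are not shown. A minimal sketch, assuming they are thin wrappers around Python's pickle module (with load_pickle returning None when the file is missing, as the text8 examples below expect):

import os
import pickle


def save_obj(filename, obj):
    # Serialize obj to disk with the highest available pickle protocol.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_pickle(filename):
    # Return the unpickled object, or None if the file does not exist yet.
    if not os.path.exists(filename):
        return None
    with open(filename, 'rb') as f:
        return pickle.load(f)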
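maybe_pickle, merge_datasets and randomize come from the notMNIST preprocessing code and are likewise not shown here. A plausible randomize, shuffling images and labels with one shared permutation so the pairs stay aligned, might look like:

import numpy as np


def randomize(dataset, labels):
    # Apply the same random permutation to images and labels.
    permutation = np.random.permutation(labels.shape[0])
    return dataset[permutation, :, :], labels[permutation]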
Example #3
def load_train():
    datasets = load_pickle('notMNIST_clean.pickle')
    train_dataset = datasets['train_dataset']
    train_labels = datasets['train_labels']
    valid_dataset = datasets['valid_dataset']
    valid_labels = datasets['valid_labels']

    classifier_name = 'classifier.pickle'

    if os.path.exists(classifier_name):
        classifier = load_pickle(classifier_name)
    else:
        classifier = LogisticRegression()
        classifier.fit(train_dataset.reshape(train_dataset.shape[0], -1), train_labels)
        save_obj(classifier_name, classifier)

    # quick sanity check: predict a small slice of the validation set
    valid_idx_s = 3000
    valid_idx_e = 3014
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1)[valid_idx_s: valid_idx_e])
    print(x)
    print(valid_labels[valid_idx_s:valid_idx_e])

    # evaluate on the whole validation set
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1))
    fail_cnt = 0
    for i, pred in enumerate(x):
        if pred != valid_labels[i]:
            fail_cnt += 1
    print("success rate:" + str((1 - float(fail_cnt) / len(x)) * 100) + "%")
def clean():
    datasets = load_pickle('notMNIST.pickle')
    test_dataset = datasets['test_dataset']
    test_labels = datasets['test_labels']
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))

    except_valid_idx, valid_dataset = imgs_idx_hash_except(datasets['valid_dataset'], test_dataset)
    valid_labels = np.delete(datasets['valid_labels'], except_valid_idx)
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))

    # drop training images that also appear in the validation set
    except_train_idx, train_dataset = imgs_idx_hash_except(datasets['train_dataset'], valid_dataset)
    train_labels = np.delete(datasets['train_labels'], except_train_idx)
    # drop training images that also appear in the test set
    except_train_idx, train_dataset = imgs_idx_hash_except(train_dataset, test_dataset)
    train_labels = np.delete(train_labels, except_train_idx)

    print('train_dataset:%d' % len(train_dataset))
    print('train_labels:%d' % len(train_labels))
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))

    pickle_file = 'notMNIST_clean.pickle'
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    save_obj(pickle_file, save)
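imgs_idx_hash_except is not defined in the snippet; clean() only needs it to return the indices of images in the first dataset that also occur in the second one, together with the first dataset with those rows removed. A hashing-based sketch under that assumption (it only catches exact, bit-identical duplicates):

import hashlib

import numpy as np


def imgs_idx_hash_except(dataset, other_dataset):
    # Hash every image in other_dataset, then flag images in dataset
    # whose pixel content already appears there.
    other_hashes = {hashlib.sha1(img.tobytes()).hexdigest() for img in other_dataset}
    except_idx = [i for i, img in enumerate(dataset)
                  if hashlib.sha1(img.tobytes()).hexdigest() in other_hashes]
    return except_idx, np.delete(dataset, except_idx, axis=0)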
Example #5
def read_data(filename):
    # Return the contents of the first file in the zip archive as a string.
    with zipfile.ZipFile(filename) as f:
        for name in f.namelist():
            return tf.compat.as_str(f.read(name))


data_set = load_pickle('text8_text.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)

    # read data
    text = read_data(filename)
    print('Data size %d' % len(text))
    save_obj('text8_text.pickle', text)
else:
    text = data_set

# Create a small validation set.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

# Utility functions to map characters to vocabulary IDs and back.
vocabulary_size = len(string.ascii_lowercase) + 1  # [a-z] + ' '
# ascii code for character
first_letter = ord(string.ascii_lowercase[0])
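The mapping utilities themselves are not part of the snippet; given vocabulary_size and first_letter above, they are presumably along these lines (space maps to 0, 'a'..'z' to 1..26):

def char2id(char):
    # Map 'a'..'z' to 1..26; anything else is treated as a space (ID 0).
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    return 0


def id2char(dictid):
    # Inverse of char2id; ID 0 becomes a space.
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    return ' '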
Example #6
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
    save_obj('text8_embed.pickle', final_embeddings)

num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points + 1, :])


def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15, 15))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        pylab.scatter(x, y)
        pylab.annotate(label,
                       xy=(x, y),
                       xytext=(5, 2),
                       textcoords='offset points',
                       ha='right', va='bottom')
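To actually draw the map, plot would typically be called with the words for the embedded rows, e.g. (assuming pylab is matplotlib's pylab interface and reverse_dictionary from the data-building step is still in scope):

words = [reverse_dictionary[i] for i in range(1, num_points + 1)]
plot(two_d_embeddings, words)
pylab.show()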
Example #8
data_set = load_pickle('text8_data.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)

    # read data
    words = read_data(filename)
    print('Data size %d' % len(words))
    data, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size)
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    del words  # Hint to reduce memory.
    data_set = {
        'data': data,
        'count': count,
        'dictionary': dictionary,
        'reverse_dictionary': reverse_dictionary,
    }
    save_obj('text8_data.pickle', data_set)
else:
    data = data_set['data']
    count = data_set['count']
    dictionary = data_set['dictionary']
    reverse_dictionary = data_set['reverse_dictionary']

# split data
data_index = 0

print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    test_size = 8
    batch, labels = generate_batch(batch_size=test_size,
                                   num_skips=num_skips,
                                   skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
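build_dataset is referenced but not defined in the snippet. A sketch in the spirit of the TensorFlow word2vec tutorial, keeping the vocabulary_size - 1 most common words and mapping everything else to UNK (ID 0):

import collections


def build_dataset(words, vocabulary_size):
    # count[0] tracks UNK; the rest are the most common words in order.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary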
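Finally, generate_batch is the skip-gram batch generator. A compact sketch of one common implementation (not necessarily the author's), assuming the module-level data list and data_index counter shown above:

import collections
import random

import numpy as np


def generate_batch(batch_size, num_skips, skip_window):
    # Emit batch_size (center word, context word) pairs, num_skips per center.
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window | center | skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        context_positions = [p for p in range(span) if p != skip_window]
        random.shuffle(context_positions)
        for j in range(num_skips):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_positions[j]]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels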