def calc(number):
    curve = idx2numpy.convert_from_file('../MNIST/curve')
    images = idx2numpy.convert_from_file("../MNIST/spin_train_images")
    labels = idx2numpy.convert_from_file("../MNIST/train_labels")
    double = np.zeros((410, 410), dtype=np.float_)
    for n in range(10000 * number, 10000 * (number + 1)):
        for i in range(400):
            for j in range(400):
                double[i][j] += images[n][curve[i][0]][curve[i][1]] * images[n][curve[j][0]][curve[j][1]]
                if i == 0 and j == 0:
                    print(n)
    for i in range(400):
        for n in range(10000 * number, 10000 * (number + 1)):
            for j in range(10):
                if labels[n] == j:
                    double[i][400 + j] += images[n][curve[i][0]][curve[i][1]]
                    double[400 + j][i] += images[n][curve[i][0]][curve[i][1]]
                else:
                    double[i][400 + j] -= images[n][curve[i][0]][curve[i][1]]
                    double[400 + j][i] -= images[n][curve[i][0]][curve[i][1]]
    for i in range(410):
        for j in range(410):
            double[i][j] = double[i][j] / 60000
    idx2numpy.convert_to_file("../Data/double" + str(number), double)
def create_some_test_images(some_number=10):
    import idx2numpy
    import gzip
    import numpy as np
    import PIL.Image
    import os

    test_images = "../t10k-images-idx3-ubyte.gz"
    test_labels = "../t10k-labels-idx1-ubyte.gz"
    EXPORT_FOLDER = "../test_images"

    imgs = idx2numpy.convert_from_file(gzip.open(test_images))
    lbls = idx2numpy.convert_from_file(gzip.open(test_labels))
    indices = np.arange(0, len(lbls))
    for i in range(10):
        ix = np.random.choice(indices[lbls == i], size=some_number)
        assert len(ix) == some_number
        for j, image_index in enumerate(ix):
            pil_image = PIL.Image.fromarray(np.squeeze(imgs[image_index, :, :]))
            os.makedirs(os.path.join('..', 'static', 'img', 'mnist', str(i)), exist_ok=True)
            pil_image.save(f'../static/img/mnist/{i}/{j}.png')
def main():
    # Training images
    images = idx2numpy.convert_from_file("train-images-idx3-ubyte")
    # print(images.shape)
    X = images.reshape(images.shape[0], -1)
    X = X.astype('float32')
    # print(X.shape)
    # X = (X-np.min(X,0))/(np.max(X,0)+0.0001)
    X = X / 255.0
    # X = np.matrix(X)

    # Training labels; the last 10,000 samples are split off as a test set below
    Y = idx2numpy.convert_from_file("train-labels-idx1-ubyte")
    Y = Y.astype('int64')
    # print(Y.shape)
    X_test = X[50000:]
    Y_test = Y[50000:]
    X = X[:50000]
    Y = Y[:50000]

    sizes = [X.shape[1], 200, 100, 10]
    time_start = time.time()
    ANN_train(sizes, X, Y)
    time_end = time.time()
    print('totally cost', time_end - time_start)
    correct = ANN_test(sizes, 'ANN.pth', X_test, Y_test)
    print('correct:', correct)
def load_data():
    """
    Read in the training data from the data dir.
    The data comes to us from http://yann.lecun.com/exdb/mnist/
    """
    train_images = idx2numpy.convert_from_file(
        'data/train-images-idx3-ubyte').astype('float64')
    # We know from the IDX file documentation that the max value we will get
    # is 255, and we want the values of our input to be in [0, 1], so
    train_images = train_images / 255

    # # Lets look at one of the images
    # import matplotlib.pyplot as plt
    # plt.imsave("test", train_images[1, :, :])
    # Well that was fun

    # We also reshape the images so they match the first layer of the nn
    train_images = train_images.reshape(train_images.shape[0], 784, 1)

    # Labels
    # Along with the images we need the labels
    train_lables_ff = idx2numpy.convert_from_file(
        'data/train-labels-idx1-ubyte')
    # These are also in the wrong format; we need them to be activations of nodes
    train_lables = numpy.zeros([train_lables_ff.shape[0], 10, 1])
    # This may not be the most efficient solution, but it works
    for i in range(0, train_lables_ff.shape[0]):
        train_lables[i, train_lables_ff[i]] = 1
    return (train_images, train_lables)
def leerTrainingSet():
    x_training = idx2numpy.convert_from_file(
        'datos/train-images-idx3-ubyte'
    )  # There are 60000 images, each 28x28
    y_training = idx2numpy.convert_from_file('datos/train-labels-idx1-ubyte')
    x_training = np.reshape(x_training, (60000, 784))
    return (x_training, y_training)
def lire_MNIST():
    """
    Load the MNIST train and test sets from idx files.

    Returns
    -------
    train_image, test_image : ndarray
        Matrices containing the data with rows corresponding to samples
        and columns to binarized pixels.
    train_label, test_label : ndarray
        The corresponding digit labels.
    """
    train_image_file = 'data/train-images-idx3-ubyte'
    train_label_file = 'data/train-labels-idx1-ubyte'
    test_image_file = 'data/t10k-images-idx3-ubyte'
    test_label_file = 'data/t10k-labels-idx1-ubyte'

    train_image = idx2numpy.convert_from_file(train_image_file)
    train_image = to_black_white(np.array([img.flatten() for img in train_image]))
    test_image = idx2numpy.convert_from_file(test_image_file)
    test_image = to_black_white(np.array([img.flatten() for img in test_image]))
    train_label = idx2numpy.convert_from_file(train_label_file)
    test_label = idx2numpy.convert_from_file(test_label_file)
    return train_image, train_label, test_image, test_label
def labelDigitsToMultipleFiles(label_filename, instance_filename, out_filename):
    dirname = os.path.dirname(os.path.dirname(__file__))
    instance_filename = dirname + instance_filename
    label_filename = dirname + label_filename
    digits = idx2numpy.convert_from_file(instance_filename)
    labels = idx2numpy.convert_from_file(label_filename)
    n = len(labels)
    c = 0
    prefix = 0
    out_file = open(dirname + "/assets/" + str(prefix) + "_" + out_filename, "w+")
    for i in range(0, n):
        # Start a new output file every ~20000 records
        if (c == 19999):
            prefix += 1
            c = 0
            out_file.close()
            out_file = open(
                dirname + "/assets/" + str(prefix) + "_" + out_filename, "w+")
        out_file.write((str(labels[i]) + ","))
        for r in digits[i]:
            for x in r:
                out_file.write(str(x) + ",")
        out_file.write("\n")
        c += 1
    out_file.close()
def load_mnist():
    X_train = idx2numpy.convert_from_file('MNIST_data/train-images.idx3-ubyte')
    train_labels = idx2numpy.convert_from_file(
        'MNIST_data/train-labels.idx1-ubyte')
    X_test = idx2numpy.convert_from_file('MNIST_data/t10k-images.idx3-ubyte')
    test_labels = idx2numpy.convert_from_file(
        'MNIST_data/t10k-labels.idx1-ubyte')

    train_images = []  # reshape train images so that the training set
    for i in range(X_train.shape[0]):  # is of shape (60000, 1, 28, 28)
        train_images.append(np.expand_dims(X_train[i], axis=0))
    train_images = np.array(train_images)

    test_images = []  # reshape test images so that the test set
    for i in range(X_test.shape[0]):  # is of shape (10000, 1, 28, 28)
        test_images.append(np.expand_dims(X_test[i], axis=0))
    test_images = np.array(test_images)

    indices = np.random.permutation(
        train_images.shape[0])  # permute and split training data in
    training_idx, validation_idx = indices[:55000], indices[55000:]  # training and validation sets
    train_images, validation_images = train_images[training_idx, :], train_images[validation_idx, :]
    train_labels, validation_labels = train_labels[training_idx], train_labels[validation_idx]

    return {
        'train_images': train_images,
        'train_labels': train_labels,
        'validation_images': validation_images,
        'validation_labels': validation_labels,
        'test_images': test_images,
        'test_labels': test_labels
    }
def load_data():
    """Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.

    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.

    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.

    The ``validation_data`` and ``test_data`` are similar, except
    each contains only 10,000 images.

    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    below.
    """
    trainImages = idx2numpy.convert_from_file('train-images-idx3-ubyte')
    trainLabels = idx2numpy.convert_from_file('train-labels-idx1-ubyte')
    testImages = idx2numpy.convert_from_file('t10k-images-idx3-ubyte')
    testLabels = idx2numpy.convert_from_file('t10k-labels-idx1-ubyte')
    # Plain tuples: wrapping two arrays of different shapes in np.array()
    # would create a ragged object array, which recent NumPy versions reject.
    training_data = (trainImages[:50000], trainLabels[:50000])
    validation_data = (trainImages[50000:], trainLabels[50000:])
    test_data = (testImages, testLabels)
    return (training_data, validation_data, test_data)
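# A minimal usage sketch for load_data() above (it assumes the four idx files
# sit in the working directory, as the loader expects):
training_data, validation_data, test_data = load_data()
train_images, train_labels = training_data
print(train_images.shape)   # expected: (50000, 28, 28)
print(train_labels[:10])    # the first ten digit labels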
def load_full_Mnist(USE_COLAB = False, path = ''):
    """
    Function that loads data from the MNIST dataset (digits) idx files.

    Outputs:
        train_x_full: tensor of images from the Mnist training dataset. (60000,28,28)
        train_y_full: tensor of labels of those training images. (60000,)
        test_x_full: tensor of images from the Mnist test dataset. (10000,28,28)
        test_y_full: tensor of labels of those test images. (10000,)
    """
    train_images_file = 'Data/Mnist/train-images.idx3-ubyte'
    test_images_file = 'Data/Mnist/t10k-images.idx3-ubyte'
    train_labels_file = 'Data/Mnist/train-labels.idx1-ubyte'
    test_labels_file = 'Data/Mnist/t10k-labels.idx1-ubyte'
    if USE_COLAB:
        train_images_file = os.path.join(path, 'Data/Mnist/train-images.idx3-ubyte')
        test_images_file = os.path.join(path, 'Data/Mnist/t10k-images.idx3-ubyte')
        train_labels_file = os.path.join(path, 'Data/Mnist/train-labels.idx1-ubyte')
        test_labels_file = os.path.join(path, 'Data/Mnist/t10k-labels.idx1-ubyte')

    train_x_full = torch.tensor(np.array(idx2numpy.convert_from_file(train_images_file))).to(dtype=torch.float32, device='cpu')
    train_y_full = torch.tensor(np.array(idx2numpy.convert_from_file(train_labels_file))).to(dtype=torch.int32, device='cpu')
    test_x_full = torch.tensor(np.array(idx2numpy.convert_from_file(test_images_file))).to(dtype=torch.float32, device='cpu')
    test_y_full = torch.tensor(np.array(idx2numpy.convert_from_file(test_labels_file))).to(dtype=torch.int32, device='cpu')
    return train_x_full, train_y_full, test_x_full, test_y_full
def load_data(filename):
    # `filename` is a pair of paths: (data file, label file)
    if ".h5" in filename[0]:
        X, Y = read_h5(filename[0])
    else:
        X = idx2numpy.convert_from_file(filename[0])
        Y = idx2numpy.convert_from_file(filename[1])
    return X, Y
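# A hedged usage sketch for the load_data(filename) variant above; the ".h5"
# branch depends on the project-local read_h5() helper, so only the idx branch
# is exercised here (the file names are placeholders):
X, Y = load_data(["train-images-idx3-ubyte", "train-labels-idx1-ubyte"])
print(X.shape, Y.shape)  # e.g. (60000, 28, 28) (60000,) for MNIST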
def load_mnist(data_path):
    ''' Loads the MNIST data from the base path '''
    train_img_path = '%s/train-images.idx3-ubyte' % data_path
    train_lbl_path = '%s/train-labels.idx1-ubyte' % data_path
    test_img_path = '%s/t10k-images.idx3-ubyte' % data_path
    test_lbl_path = '%s/t10k-labels.idx1-ubyte' % data_path

    def encode_one_hot(y, m, k):
        y_one_hot = np.zeros((m, k))
        y_one_hot[range(m), y] = 1
        return y_one_hot

    # get the training data
    train_img = idx2numpy.convert_from_file(train_img_path)
    m, row, col = train_img.shape
    d = row * col
    X_tr = np.reshape(train_img, (m, d)) / 255.
    train_lbl = idx2numpy.convert_from_file(train_lbl_path)
    k = max(train_lbl) + 1
    y_tr = encode_one_hot(train_lbl, m, k)

    # set the data matrix for test
    test_img = idx2numpy.convert_from_file(test_img_path)
    m_te = test_img.shape[0]
    X_te = np.reshape(test_img, (m_te, d)) / 255.  # test data matrix
    test_lbl = idx2numpy.convert_from_file(test_lbl_path)
    y_te = encode_one_hot(test_lbl, m_te, k)

    return X_tr, y_tr, X_te, y_te
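# For illustration, a hand-checked miniature of the encode_one_hot() logic used
# inside load_mnist() above (the values here are made up, not from the dataset):
import numpy as np

y = np.array([2, 0, 1])        # three labels, k = 3 classes
y_one_hot = np.zeros((3, 3))
y_one_hot[range(3), y] = 1
# y_one_hot is now:
# [[0., 0., 1.],
#  [1., 0., 0.],
#  [0., 1., 0.]]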
def get_dataset():
    import idx2numpy
    files = ["train-images-idx3-ubyte", "train-labels-idx1-ubyte"]
    trains = idx2numpy.convert_from_file(files[0])
    labels = idx2numpy.convert_from_file(files[1])
    train, labels = convert(trains, labels)
    return train, labels
def run(self):
    # Get images and labels
    img = i2n.convert_from_file('t10k-images-idx3-ubyte')
    lbl = i2n.convert_from_file('t10k-labels-idx1-ubyte')

    self.hdr.write("#define MNIST_IMAGES {\\\n")
    for image in range(self.numIm):
        for row in range(28):
            for col in range(28):
                if self.norm:
                    # Normalize, then binarize around zero
                    temp = float(img[self.idxSt + image][row][col]) / 255.0
                    temp -= 0.1307
                    temp = temp / 0.3015
                    if temp >= 0:
                        temp = 1
                    else:
                        temp = 0
                    self.hdr.write(str(temp) + ", ")
                else:
                    self.hdr.write(
                        str(img[self.idxSt + image][row][col]) + ", ")
        self.hdr.write("\\\n")
    self.hdr.write("}\n\n")

    self.hdr.write("#define MNIST_LABELS {\\\n")
    for image in range(self.numIm):
        self.hdr.write(str(lbl[self.idxSt + image]) + ", ")
    self.hdr.write("\\\n")
    self.hdr.write("}\n\n")
def load_data():
    """
    Loads mnist dataset.
    """
    names = ['trainX', 'trainY', 'testX', 'testY']
    data = []
    path = easyDL.__file__.split('__init__')[0] + 'preprocessing/datasets/__datasets__/mnist/'
    if not os.path.isdir(path):
        print('downloading...')
        os.mkdir(path)
        urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
                'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
                'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
                'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz']
        sleep(0.5)
        for i, _ in enumerate(urls):
            download_url(urls[i], path + names[i] + '.gz')
            with gzip.open(path + names[i] + '.gz', 'rb') as f_in:
                with open(path + names[i] + '.idx', 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            data.append(idx2numpy.convert_from_file(path + names[i] + '.idx'))
            os.remove(path + names[i] + '.gz')
        print('\nAll Done.')
    else:
        for name in names:
            data.append(idx2numpy.convert_from_file(path + name + '.idx'))

    trainX = data[0].reshape(-1, 28, 28, 1).astype(np.int32)
    testX = data[2].reshape(-1, 28, 28, 1).astype(np.int32)
    trainY = data[1].astype(np.int32)
    testY = data[3].astype(np.int32)
    return (trainX, trainY), (testX, testY)
def mnist_school(classifier, samples_limit=5123):
    # Raw training, no caffe use.
    print('MNIST training started.')
    mnist_file = '/home/student/Downloads/MNIST/train-images.idx3-ubyte'
    if os.path.isfile(mnist_file):
        train_arr = idx2numpy.convert_from_file(mnist_file)
    else:
        print('Error, no file')
        return
    print('Train array loaded, size is ', train_arr.shape)
    label_file = '/home/student/Downloads/MNIST/train-labels.idx1-ubyte'
    label_arr = idx2numpy.convert_from_file(label_file)
    print('Train labels loaded, size is ', label_arr.shape)
    digits = set(label_arr)
    # Train for each digit
    for digit_i in digits:
        # binarize
        y = 1 * (label_arr == digit_i)[:samples_limit]
        x_train = np.vstack([i.flatten() for i in train_arr[:samples_limit]])
        classifier.fit_from_features(x_train, y, 'MNIST_' + str(digit_i))
    print('MNIST training done.')
    # Testing on last 1000 samples
    print('MNIST testing started.')
    scores = []
    for digit_i in digits:
        # binarize
        y = 1 * (label_arr == digit_i)[-1000:]
        x_train = np.vstack([i.flatten() for i in train_arr[-1000:]])
        scores.append(classifier.score(x_train, y))
    print('The mean score for MNIST Task is ', np.mean(scores))
def calcMeanAndStd(pictureFile, labelFile, out_dir):
    # Invert the images: in the idx files 0 means black and 255 means white
    pictures = ~idx2numpy.convert_from_file(pictureFile)
    labels = idx2numpy.convert_from_file(labelFile)
    mean_picture = numpy.mean(pictures, 0)
    std_picture = numpy.std(pictures, 0)

    pictures_by_classes = [[] for i in range(10)]
    pictures_by_classes_array = [None for i in range(10)]
    for picture, label in zip(pictures, labels):
        pictures_by_classes[label].append(picture)

    for i in range(10):
        pictures_by_classes_array[i] = numpy.array(pictures_by_classes[i])
        mean_image = Image.fromarray(numpy.mean(pictures_by_classes_array[i], axis=0).astype('uint8'), 'L')
        std_image = Image.fromarray(numpy.std(pictures_by_classes_array[i], axis=0).astype('uint8'), 'L')
        std_image_inv = Image.fromarray(~numpy.std(pictures_by_classes_array[i], axis=0).astype('uint8'), 'L')
        mean_image.save(out_dir + os.sep + 'mean_train_' + str(i) + '.png', 'png')
        std_image.save(out_dir + os.sep + 'std_train_' + str(i) + '.png', 'png')
        std_image_inv.save(out_dir + os.sep + 'std_train_inv_' + str(i) + '.png', 'png')

    im = Image.fromarray(mean_picture.astype('uint8'), 'L')
    im.save(out_dir + os.sep + 'mean_train.png', 'png')
    im = Image.fromarray(std_picture.astype('uint8'), 'L')
    im.save(out_dir + os.sep + 'std_train.png', 'png')
    im = Image.fromarray(~std_picture.astype('uint8'), 'L')
    im.save(out_dir + os.sep + 'std_train_inv.png', 'png')
def emnist_train(model):
    t_start = time.time()

    emnist_path = '../gzip/'  # trailing slash needed for the path joins below
    X_train = idx2numpy.convert_from_file(emnist_path + 'emnist-byclass-train-images-idx3-ubyte')
    y_train = idx2numpy.convert_from_file(emnist_path + 'emnist-byclass-train-labels-idx1-ubyte')
    X_test = idx2numpy.convert_from_file(emnist_path + 'emnist-byclass-test-images-idx3-ubyte')
    y_test = idx2numpy.convert_from_file(emnist_path + 'emnist-byclass-test-labels-idx1-ubyte')

    X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))
    X_test = np.reshape(X_test, (X_test.shape[0], 28, 28, 1))

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, len(emnist_labels))

    # Keep only 1/k of the data to speed up training
    k = 10
    X_train = X_train[:X_train.shape[0] // k]
    y_train = y_train[:y_train.shape[0] // k]
    X_test = X_test[:X_test.shape[0] // k]
    y_test = y_test[:y_test.shape[0] // k]

    # Normalize to [0, 1]
    X_train = X_train.astype(np.float32)
    X_train /= 255.0
    X_test = X_test.astype(np.float32)
    X_test /= 255.0

    y_train_cat = keras.utils.to_categorical(y_train, len(emnist_labels))
    y_test_cat = keras.utils.to_categorical(y_test, len(emnist_labels))

    learning_rate_reduction = keras.callbacks.ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=1, factor=0.5, min_lr=0.00001)

    keras.backend.get_session().run(tf.global_variables_initializer())

    model.fit(X_train, y_train_cat, validation_data=(X_test, y_test_cat), callbacks=[learning_rate_reduction], batch_size=64, epochs=30)
    print("Training done, dT:", time.time() - t_start)
def way_1():
    '''
    SOURCE : https://stackoverflow.com/questions/40427435/extract-images-from-idx3-ubyte-file-or-gzip-via-python
    installation : pip install idx2numpy
    '''
    NUMBER_OF_ITERATIONS = 1

    data = idx2numpy.convert_from_file(TRAIN_DATA)
    labels = idx2numpy.convert_from_file(TRAIN_LABEL)
    print(data.shape)
    print(labels.shape)
    for i in range(NUMBER_OF_ITERATIONS):
        idx = np.random.randint(0, len(data))
        digit = data[idx].reshape(data.shape[1:])
        plt.imshow(digit)
        plt.title('Real number : {}'.format(labels[idx]))
        plt.show()

    # ...

    data = idx2numpy.convert_from_file(VALIDATION_DATA)
    labels = idx2numpy.convert_from_file(VALIDATION_LABEL)
    print(data.shape)
    print(labels.shape)
    for i in range(NUMBER_OF_ITERATIONS):
        idx = np.random.randint(0, len(data))
        digit = data[idx].reshape(data.shape[1:])
        plt.imshow(digit)
        plt.title('Real number : {}'.format(labels[idx]))
        plt.show()
def conversion(document, new_file):
    # Determine the magic number and size of the document for error checking
    with open(document, 'rb') as f:
        bytes = f.read(8)
        magic, size = struct.unpack(">II", bytes)
        print(magic)
        print(size)

    # Convert the file contents to a numpy array
    ndarr = idx2numpy.convert_from_file(document)
    print(ndarr)

    # The same conversion also works from an open file object
    f_read = open(document, 'rb')
    print(f_read)
    ndarr = idx2numpy.convert_from_file(f_read)
    print(ndarr)

    write(ndarr, new_file)
    read_data = read(new_file)
    print(read_data)
def readIDX(path):
    import idx2numpy
    # Conversion works both from a file path ...
    ndarr = idx2numpy.convert_from_file(path)
    # ... and from an open file object
    f_read = open(path, 'rb')
    ndarr = idx2numpy.convert_from_file(f_read)
    s = f_read.read()
    #ndarr = idx2numpy.convert_from_string(s)
    return ndarr
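# The commented-out line above points at idx2numpy's third entry point,
# convert_from_string; a minimal sketch of it (the file must be re-read from
# the start, since convert_from_file has already consumed f_read):
import idx2numpy

with open(path, 'rb') as f:  # `path` as in readIDX above
    ndarr = idx2numpy.convert_from_string(f.read())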
def __init__(self, data_path, label_path):
    self.data_path = data_path
    self.label_path = label_path
    self.dataset = idx2numpy.convert_from_file(self.data_path)
    self.labels = idx2numpy.convert_from_file(self.label_path)
    super(IdxFileDataset, self).__init__()
def generate_df(data_path, labels_path):
    data_raw = idx2numpy.convert_from_file(data_path)
    labels = idx2numpy.convert_from_file(labels_path)
    df = pd.DataFrame(data_raw.reshape((data_raw.shape[0], 784)))
    df['value'] = labels
    df.loc[df['value'] == 0, df.columns[:784]] = 0
    df = df.astype(np.uint8)
    return df
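# Usage sketch for generate_df() above (the paths are placeholders):
df = generate_df('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
print(df.shape)  # expected: (10000, 785) -- 784 pixel columns plus 'value'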
def loadData(Directory):
    x_train = idx2numpy.convert_from_file(Directory + 'train-images.idx3-ubyte')
    y_train = idx2numpy.convert_from_file(Directory + 'train-labels.idx1-ubyte')
    x_test = idx2numpy.convert_from_file(Directory + 't10k-images.idx3-ubyte')
    y_test = idx2numpy.convert_from_file(Directory + 't10k-labels.idx1-ubyte')
    return x_train, y_train, x_test, y_test
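# Usage sketch for loadData() above; the directory argument is a placeholder
# and must end with a path separator, since the file names are appended directly:
x_train, y_train, x_test, y_test = loadData('./MNIST_data/')
print(x_train.shape, y_train.shape)  # expected: (60000, 28, 28) (60000,)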
def pcaReduction(trainIdxPath, testIdxPath, outTrainPath, outTestIdx, outRatioFile, pcaEigenDir):
    trainData = idx2numpy.convert_from_file(trainIdxPath)
    testData = idx2numpy.convert_from_file(testIdxPath)
    shape = trainData.shape
    if (len(shape) > 1):
        trainData = trainData.reshape(shape[0], shape[1] * shape[2])
    shape = testData.shape
    if (len(shape) > 1):
        testData = testData.reshape(shape[0], shape[1] * shape[2])

    pca = PCA()
    pca.fit(trainData)
    cumSumRatio = numpy.cumsum(pca.explained_variance_ratio_)
    indOf09 = numpy.argmax(cumSumRatio >= 0.9)
    indOf095 = numpy.argmax(cumSumRatio >= 0.95)
    indOf099 = numpy.argmax(cumSumRatio >= 0.99)
    print('explained variance ratio: ', pca.explained_variance_ratio_[0:indOf099])
    print('cumsum of explained variance ratio: ', cumSumRatio[0:indOf099])
    print('indexes of 0.9, 0.95, 0.99 ', indOf09, indOf095, indOf099)
    #print('Shape of components:', pca.components_.shape)

    A = pca.components_[:, 0:indOf095 + 1]
    for i in range(0, indOf095 + 1):
        v = A[:, i]
        v = abs(v) * 255
        v = v.reshape(28, 28)
        im = Image.fromarray(~v.astype('uint8'), 'L')
        im.save(pcaEigenDir + os.sep + str(i) + '.png', 'png')
    #print('xxx', A.shape)

    trainData = numpy.dot(trainData, A)
    print('train shape', trainData.shape, trainData.dtype)
    testData = numpy.dot(testData, A)
    print('test shape', testData.shape, trainData.dtype)

    f_write = open(outTrainPath, 'wb')
    idx2numpy.convert_to_file(f_write, trainData)
    f_write = open(outTestIdx, 'wb')
    idx2numpy.convert_to_file(f_write, testData)
    pickle.dump(cumSumRatio[0:indOf099], open(outRatioFile, 'wb'))
def readNewImage():
    index = 0
    file = './samples/train-images-idx3-ubyte/train-images.idx3-ubyte'
    label = './samples/train-labels-idx1-ubyte/train-labels.idx1-ubyte'
    arrFiles = idx2numpy.convert_from_file(file)
    arrLabels = idx2numpy.convert_from_file(label)
    for i in range(1):
        ascii_show(arrFiles[index])
        print(arrLabels[index])
def load_mnist_data(imgPath, lblPath):
    examples = idx.convert_from_file(imgPath)
    labels = idx.convert_from_file(lblPath)
    examples = examples.reshape((examples.shape[0], 784))
    labels = labels.reshape((labels.size, 1))
    data = np.concatenate((examples, labels), axis=1)
    return data
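# Because load_mnist_data() above fuses images and labels into one (n, 785)
# matrix, a sketch of splitting them back apart (column 784 holds the label;
# the file names are placeholders):
data = load_mnist_data('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
examples, labels = data[:, :784], data[:, 784]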
def __init__(self):
    print("Loading dataset from files...")
    self._load(
        idx2numpy.convert_from_file('../data/MNISTdataset/train-images-idx3-ubyte.idx'),
        idx2numpy.convert_from_file('../data/MNISTdataset/train-labels-idx1-ubyte.idx'),
        idx2numpy.convert_from_file('../data/MNISTdataset/t10k-images-idx3-ubyte.idx'),
        idx2numpy.convert_from_file('../data/MNISTdataset/t10k-labels-idx1-ubyte.idx')
    )
    print("Dataset loaded from files.")
def __init__(self, image_path=None, label_path=None):
    ### Set image/label paths; otherwise default to the handwritten digits files
    if (not image_path and not label_path):
        image_path = './lib/train-images.idx3-ubyte'
        label_path = './lib/train-labels.idx1-ubyte'
    # Import images directly with the idx2numpy library.
    # Accessing these attributes directly is not recommended, as they hold the
    # raw image data prior to preprocessing the image for fliffpress.
    (self.images, self.labels) = (idx2numpy.convert_from_file(image_path),
                                  idx2numpy.convert_from_file(label_path))
def Datasets():
    MaybeDownload()
    prefix = "./mnist/"
    train_im = idx2numpy.convert_from_file(prefix + "train-images-idx3-ubyte")
    test_im = idx2numpy.convert_from_file(prefix + "t10k-images-idx3-ubyte")
    train_labels = idx2numpy.convert_from_file(prefix + "train-labels-idx1-ubyte")
    test_labels = idx2numpy.convert_from_file(prefix + "t10k-labels-idx1-ubyte")
    return train_im, test_im, train_labels, test_labels
def train_model(model):
    t_start = time.time()
    emnist_path = 'neural/emnist/database/digits/'
    X_train = idx2numpy.convert_from_file(
        emnist_path + 'emnist-digits-train-images-idx3-ubyte')
    y_train = idx2numpy.convert_from_file(
        emnist_path + 'emnist-digits-train-labels-idx1-ubyte')
    X_test = idx2numpy.convert_from_file(
        emnist_path + 'emnist-digits-test-images-idx3-ubyte')
    y_test = idx2numpy.convert_from_file(
        emnist_path + 'emnist-digits-test-labels-idx1-ubyte')

    X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))
    X_test = np.reshape(X_test, (X_test.shape[0], 28, 28, 1))

    # Test: keep only 1/k of the data
    k = 1
    X_train = X_train[:X_train.shape[0] // k]
    y_train = y_train[:y_train.shape[0] // k]
    X_test = X_test[:X_test.shape[0] // k]
    y_test = y_test[:y_test.shape[0] // k]

    # Normalize
    X_train = X_train.astype(np.float32)
    X_train /= 255.0
    X_test = X_test.astype(np.float32)
    X_test /= 255.0

    y_train_cat = keras.utils.to_categorical(y_train, len(emnist_labels))
    y_test_cat = keras.utils.to_categorical(y_test, len(emnist_labels))

    model.compile(
        optimizer='adam',  # Optimizer
        # Loss function to be minimized
        loss=keras.losses.CategoricalCrossentropy(),
        # List of metrics to monitor
        metrics=[keras.metrics.CategoricalAccuracy()])

    print('# Training the model, validating on the test data')
    history = model.fit(X_train, y_train_cat, batch_size=64, epochs=3,
                        validation_data=(X_test, y_test_cat))
    print("Training done, dT:", time.time() - t_start)
    print('\nhistory dict:', history.history)

    # Evaluate the model on the test data using "evaluate"
    print('\n# Evaluating on the test data')
    results = model.evaluate(X_test, y_test_cat, batch_size=1024)
    print('test loss, test acc:', results)
def read_answer(self, number="test"):
    filename1 = self.path + "Resp/" + number + ".samples"
    filename2 = self.path + "Resp/" + number + ".occs"
    '''filename3 = self.path + "Resp/" + number + ".lens"
    lens = np.zeros((2), dtype = "int16")
    lens[0] = self.hidlen
    lens[1] = self.vislen'''
    self.answer = idx2numpy.convert_from_file(filename1)
    self.answer_occ = idx2numpy.convert_from_file(filename2)
def prepareData():
    """
    Loads the MNIST test dataset from files located in the MNIST_data folder.
    Inverts images and reformats them into a single array of pixel features
    and adds each to a pandas dataframe.
    Returns the aforementioned pandas dataframe along with a numpy array of
    the accompanying labels.
    """
    test_images_np = idx2numpy.convert_from_file('MNIST_data/t10k-images-idx3-ubyte')
    test_labels_np = idx2numpy.convert_from_file('MNIST_data/t10k-labels-idx1-ubyte')
    img_set = [[255 - x for x in entry.flatten()] for entry in test_images_np]
    mnist_test_df = pd.DataFrame(img_set)
    return mnist_test_df, test_labels_np
def prepareData():
    """
    Loads the MNIST training dataset from files located in the MNIST_data folder.
    Inverts images and reformats them into a single array of pixel features
    and adds each to a pandas dataframe.
    Returns the aforementioned pandas dataframe along with a numpy array of
    the accompanying labels.
    """
    train_images_np = idx2numpy.convert_from_file('MNIST_data/train-images-idx3-ubyte')
    train_labels_np = idx2numpy.convert_from_file('MNIST_data/train-labels-idx1-ubyte')
    # Reformat from a 2-d array of pixels to a 1-d array and invert so that
    # images are dark letters on a lighter background
    img_set = [[255 - x for x in entry.flatten()] for entry in train_images_np]
    mnist_df = pd.DataFrame(img_set)
    return mnist_df, train_labels_np
def load_dataset(IDX_IMG_FILE, IDX_LBL_FILE):
    imgs = idx2numpy.convert_from_file(IDX_IMG_FILE)
    lbls = idx2numpy.convert_from_file(IDX_LBL_FILE)
    dataset = []
    for i in range(len(imgs)):
        vector = []
        for row in imgs[i]:
            vector = vector + row.tolist()
        # One-hot label vector: mark the position of the digit with a 1
        lbl = [0] * 10
        lbl[lbls[i]] = 1
        img = []
        img.append(vector)
        img.append(lbl)
        dataset.append(img)
    return dataset
def load_dataset(IDX_IMG_FILE, IDX_LBL_FILE):
    imgs = idx2numpy.convert_from_file(IDX_IMG_FILE)
    lbls = idx2numpy.convert_from_file(IDX_LBL_FILE)
    lbls = lbls.tolist()
    train = []
    for i in range(len(imgs)):
        vector = []
        for row in imgs[i]:
            vector = vector + row.tolist()
        img = []
        img.append(vector)
        img.append(lbls[i])
        train.append(img)
    print(len(train))
    for i in range(10):
        print(train[i])
    return train  # assumed: the caller wants the assembled dataset back
def load_mnist(data_path):
    ''' Loads the MNIST data from the base path '''
    train_img_path = '%s/train-images.idx3-ubyte' % data_path

    # get the training data
    train_img = idx2numpy.convert_from_file(train_img_path)
    m, row, col = train_img.shape
    d = row * col
    X_tr = np.reshape(train_img, (m, d)) / 255.
    return X_tr
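# Usage sketch for this unsupervised variant of load_mnist() (the path is a
# placeholder):
X_tr = load_mnist('data')
print(X_tr.shape)  # expected: (60000, 784), pixel values scaled to [0, 1]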
def __init__(self, filepath, labels=0):
    self.data = idx2numpy.convert_from_file(filepath)
    if labels > 0:
        # Expand label indices into one-hot rows of width `labels`
        temp = np.zeros((self.data.shape[0], labels))
        for i in range(self.data.shape[0]):
            temp[i, self.data[i]] = 1
        self.data = temp
    else:
        self.data = self.data / 256
import numpy as np
import idx2numpy
import csv

"""
The idx files for the MNIST dataset can be downloaded at
http://yann.lecun.com/exdb/mnist/. This python script can then be used to
convert them into two csv files. The first containing all 70,000 images
(one row per image), and the second containing all 70,000 labels (single row).
"""

trainImages = idx2numpy.convert_from_file('/your/path/here/train-images.idx3-ubyte')
trainLabels = idx2numpy.convert_from_file('/your/path/here/train-labels.idx1-ubyte')
testImages = idx2numpy.convert_from_file('/your/path/here/t10k-images.idx3-ubyte')
testLabels = idx2numpy.convert_from_file('/your/path/here/t10k-labels.idx1-ubyte')

images = np.concatenate([trainImages.reshape(60000, 784), testImages.reshape(10000, 784)])
labels = np.concatenate([trainLabels, testLabels])

with open('/your/path/here/mnist_images.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row in images:
        writer.writerow(row)

with open('/your/path/here/mnist_labels.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(labels)
# fine-tuning
import idx2numpy
import numpy as np
from nnet import SoftmaxClassifier as scl
from nnet import Autoencoder as ae
from nnet import DeepAutoencoderClassifier as dac

# define the paths
train_img_path = '/home/avasbr/datasets/MNIST/train-images.idx3-ubyte'
train_lbl_path = '/home/avasbr/datasets/MNIST/train-labels.idx1-ubyte'
test_img_path = '/home/avasbr/datasets/MNIST/t10k-images.idx3-ubyte'
test_lbl_path = '/home/avasbr/datasets/MNIST/t10k-labels.idx1-ubyte'

# convert the raw images into feature vectors
train_img = idx2numpy.convert_from_file(train_img_path)
m_tr, row, col = train_img.shape
d = row * col  # dimensions
X_tr = np.reshape(train_img[:m_tr], (m_tr, d)).T / 255.  # train data matrix
train_lbl = idx2numpy.convert_from_file(train_lbl_path)
k = max(train_lbl) + 1

# set the targets for the training-set
y_tr = np.zeros((k, m_tr))
for i, idx in enumerate(train_lbl[:m_tr]):
    y_tr[idx, i] = 1

# set the data matrix for test
test_img = idx2numpy.convert_from_file(test_img_path)
m_te = test_img.shape[0]
X_te = np.reshape(test_img, (m_te, d)).T / 255.  # test data matrix
#!/usr/bin/env python
#-*- coding: utf-8 -*-

from __future__ import division, print_function

import idx2numpy, gzip, sys
import numpy as np

DIM_SIZE = 28

print("Reading training data from original MNIST file ...", file=sys.stderr)
training_idx_fp = gzip.open('../data/MNIST/train-images-idx3-ubyte.gz')
training_arr = idx2numpy.convert_from_file(training_idx_fp)
training_idx_fp.close()

# convert to binary
print("Converting to binary images ...", file=sys.stderr)
training_arr = training_arr.astype(bool).astype(int)

# flatten each image
training_arr = training_arr.reshape(training_arr.shape[0], DIM_SIZE ** 2)

print("Save results in csv format for IBP noisyor ...", file=sys.stderr)
# write out csv file for ibp
header_str = ','.join(['p' + str(_) for _ in range(DIM_SIZE)])
training_ibp_fp = gzip.open('../data/MNIST/train-images-binary-ibp.csv.gz', 'w')
np.savetxt(training_ibp_fp, training_arr, fmt='%d', delimiter=',', header=header_str, comments='')
training_ibp_fp.close()

print("Save results in csv format for tIBP noisyor ...", file=sys.stderr)
# write out csv file for tibp
training_tibp_fp = gzip.open('../data/MNIST/train-images-binary-tibp.csv.gz', 'w')
training_arr_tibp = np.insert(training_arr, 0, DIM_SIZE, axis=1)
from kmeans import K_means
from dbscan import DBScan
from idx2numpy import convert_from_file
import numpy
from scipy.spatial.distance import hamming

images = numpy.reshape(convert_from_file("train-images.idx3-ubyte"), (60000, 784)).astype("float64")
labels = convert_from_file("train-labels.idx1-ubyte")
images = numpy.multiply(images, 1 / 255)

kmeans = K_means(10, 1)
kmeans.fit(numpy.array([images[i] for i in range(5000)]))
print(kmeans.score(numpy.array([labels[i] for i in range(5000)])))

#dbscan = DBScan(4.795, 50)
#dbscan.fit(numpy.array([images[i] for i in range(1000)]))
#print(dbscan.score(numpy.array([labels[i] for i in range(1000)])))
#print(dbscan.clusters())
import numpy as np
import idx2numpy
from PIL import Image
import pickle

images = idx2numpy.convert_from_file("train-images-idx3-ubyte")
data = []
temp = []
for image in images:
    for i in image:
        for j in i:
            temp.append(j)
    data.append(temp)
    temp = []

X = np.asarray(data, "float32")
# print(X.shape)
# X = (X-np.min(X,0))/(np.max(X,0)+0.0001)
X = X / 255.0
# X = np.matrix(X)

Y = idx2numpy.convert_from_file("train-labels-idx1-ubyte")
X_test = X[50000:]
Y_test = Y[50000:]
X = X[:50000]
Y = Y[:50000]
sizes = [X.shape[1], 100]


class rbm:
    def __init__(self, sizes=[], learning_rate=0.01, numepochs=1):
        print("rbm init, sizes:", sizes, ", numepochs:", numepochs)
def load_dataset(ds):
    x_path = "mnist/%s-images-idx3-ubyte" % ds
    y_path = "mnist/%s-labels-idx1-ubyte" % ds
    x = preprocess_xs(idx2numpy.convert_from_file(x_path))
    y = idx2numpy.convert_from_file(y_path)
    return (x, y)
import idx2numpy
import numpy as np
from PIL import Image
import scipy
import scipy.misc
import csv

imagesidx = idx2numpy.convert_from_file('t10k-images.idx3-ubyte')
labelidx = idx2numpy.convert_from_file('t10k-labels.idx1-ubyte')

images = []
for img, label in zip(imagesidx, labelidx):
    raw_img = scipy.misc.imresize(img, (16, 16)).flatten()
    raw_img = [(255 - pixel) for pixel in raw_img]
    images.append(np.append(raw_img, label))

images = np.array(images)
images = images.astype('uint8')
print(images.dtype)

# np.savetxt("foo.csv", images, delimiter=",")
with open("foo.csv", 'w+') as f:
    csvwriter = csv.writer(f, delimiter=',')
    for image in images:
        csvwriter.writerow(image)
import warnings
import idx2numpy
from skimage.feature import hog  # assumed source of hog() used below

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from sklearn.ensemble import RandomForestClassifier as RFC

__author__ = 'vks'

import_labs(["task4p1/", "task3/"])
from random_forest import RandomForest
from kNN import Naive_kNN
from CV import k_fold

px_x = 2
px_y = 2
train_images = idx2numpy.convert_from_file("train-images.idx3-ubyte")
train_images_hog = [hog(img, orientations=8, pixels_per_cell=(px_x, px_y), cells_per_block=(1, 1)) for img in train_images]
train_labels = idx2numpy.convert_from_file("train-labels.idx1-ubyte")
test_images = idx2numpy.convert_from_file("t10k-images.idx3-ubyte")
test_images_hog = [hog(img, orientations=8, pixels_per_cell=(px_x, px_y), cells_per_block=(1, 1)) for img in test_images]
test_labels = idx2numpy.convert_from_file("t10k-labels.idx1-ubyte")


def shift(axis, dist, data):
    ans = [[[0] * len(data[0][0]) for i in range(len(data[0]))] for j in range(len(data))]
    for i in range(len(data)):
        size = len(ans[i])
        for j in range(size):
            for k in range(size):
# This demo applies the sparse autoencoder to the MNIST data to learn
# pen-stroke-like features. The self-taught learning (STL) demo explores
# the use of these features for classification purposes

import idx2numpy
import numpy as np
import matplotlib.pyplot as plt
from nnet import Autoencoder as ae
from nnet.common import dataproc as dp

# define the paths
train_img_path = '/home/avasbr/datasets/MNIST/train-images.idx3-ubyte'

# convert the raw images into feature vectors
num_img = 10000
train_img = idx2numpy.convert_from_file(train_img_path)
dummy, row, col = train_img.shape
d = row * col  # dimensions
X_tr = np.reshape(train_img[:num_img], (num_img, d)).T / 255.  # train data matrix

# Neural network initialization parameters
print('Sparse Autoencoder applied to MNIST data\n')
print('Data:')
print('------')
print('Number of samples for training:', num_img, '\n')

nnet_params = {'d': d, 'n_hid': 196, 'decay': 0.003, 'beta': 3, 'rho': 0.1}
optim_params = {'method': 'L-BFGS-B', 'n_iter': 400}
import numpy as np
import idx2numpy as i2n

xp = np
img_size = 48
train_size = 1711
test_size = 249
N = train_size
N_test = test_size
train_path = "./data/numbers-proceed"
test_path = "./data/mustread-proceed"

x_train = i2n.convert_from_file('./data/new/faxocr-training-48_train_images.idx3')
y_train = i2n.convert_from_file('./data/new/faxocr-training-48_train_labels.idx1').astype('int32')
x_test = i2n.convert_from_file('./data/new/faxocr-mustread-48_train_images.idx3')
y_test = i2n.convert_from_file('./data/new/faxocr-mustread-48_train_labels.idx1').astype('int32')

print(x_train.shape)


def reshape(data):
    # Add a channel dimension: (n, h, w) -> (n, 1, h, w)
    shape = data.shape
    n_d = np.zeros((shape[0], 1, shape[1], shape[2]), dtype="float32")
    size = shape[0]
    for i in range(size):
        n_d[i][0] = data[i]
    return n_d
#!c:/Python34/python.exe
# from here: https://www.snip2code.com/Snippet/257756/Python-script-for-converting-the-MNIST-d

import numpy as np
import idx2numpy
import csv

"""
The idx files for the MNIST dataset can be downloaded at
http://yann.lecun.com/exdb/mnist/. This python script can then be used to
convert them into two csv files. The first containing all 70,000 images
(one row per image), and the second containing all 70,000 labels (single row).
"""

trainImages = idx2numpy.convert_from_file('data/train-images-idx3-ubyte')
trainLabels = idx2numpy.convert_from_file('data/train-labels-idx1-ubyte')
testImages = idx2numpy.convert_from_file('data/t10k-images-idx3-ubyte')
testLabels = idx2numpy.convert_from_file('data/t10k-labels-idx1-ubyte')

#images = np.concatenate([trainImages.reshape(60000,784), testImages.reshape(10000,784)])
#labels = np.concatenate([trainLabels, testLabels])
train_images = trainImages.reshape(60000, 784)
test_images = testImages.reshape(10000, 784)

with open('mnist_train_images.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row in train_images:
        writer.writerow(row)
# (tail of the random-forest tuning routine; the function header is elided in
# this excerpt)
    random_forest_classifier.fit(training_image_data_hog, training_label_data)
    random_forest_classifier_accuracy = random_forest_classifier.score(testing_image_data_hog, testing_label_data)
    print("\nRandom Forest accuracy with max_depth=" + str(max_depth) + " and number of trees = " + str(number_of_trees) + " is " + str(random_forest_classifier_accuracy))
    if max_depth in best_accuracy_forest:
        if best_accuracy_forest[max_depth] < random_forest_classifier_accuracy:
            best_accuracy_forest[max_depth] = random_forest_classifier_accuracy
            best_number_of_tree_forest[max_depth] = number_of_trees
    else:
        # print("\nEntered else case in forest")
        best_accuracy_forest[max_depth] = random_forest_classifier_accuracy
        best_number_of_tree_forest[max_depth] = number_of_trees


if __name__ == '__main__':
    print("Building Data set")
    training_image_data = idx2numpy.convert_from_file("train-images.idx3-ubyte")
    training_image_data_hog = [hog(img, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(3, 3)) for img in training_image_data]
    training_label_data = idx2numpy.convert_from_file("train-labels.idx1-ubyte")
    testing_image_data = idx2numpy.convert_from_file("t10k-images.idx3-ubyte")
    testing_image_data_hog = [hog(img, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(3, 3)) for img in testing_image_data]
    testing_label_data = idx2numpy.convert_from_file("t10k-labels.idx1-ubyte")
    print("Dataset is complete")

    depth_array = [5, 6, 7]
    number_of_trees_array = [310, 350, 390]
    for depth in depth_array:
        for number_of_trees in number_of_trees_array:
            find_best_adaboost_classifier(number_of_trees, depth)
            find_best_random_forest_classifier(number_of_trees, depth)
from idx2numpy import convert_from_file
from adaboost import AdaBoostMulticlass
from numpy import reshape, multiply

train_images = reshape(convert_from_file("train-images.idx3-ubyte").astype('float64'), (60000, 784))
train_labels = convert_from_file("train-labels.idx1-ubyte")
test_images = reshape(convert_from_file("t10k-images.idx3-ubyte").astype('float64'), (10000, 784))
test_labels = convert_from_file("t10k-labels.idx1-ubyte")

train_images = multiply(train_images, 1 / 255)
test_images = multiply(test_images, 1 / 255)

ada = AdaBoostMulticlass(1000)
ada.fit(train_images, train_labels)

score = 0
for i in range(10000):
    if ada.predict(test_images[i]) == test_labels[i]:
        score += 1
print(score / 10000)
import idx2numpy

train_img_path = r'E:\VirtualDesktop\nnet\minist\train-images.idx3-ubyte'
train_index_path = r'E:\VirtualDesktop\nnet\minist\train-labels.idx1-ubyte'
t10k_img_path = r'E:\VirtualDesktop\nnet\minist\t10k-images.idx3-ubyte'
t10k_index_path = r'E:\VirtualDesktop\nnet\minist\t10k-labels.idx1-ubyte'

train_images = idx2numpy.convert_from_file(train_img_path)
train_labels = idx2numpy.convert_from_file(train_index_path)

print('End')
#!/usr/bin/env python
#
import array
import os
import numpy
from PIL import Image
import idx2numpy


def isint(x):
    try:
        int(x)
    except:
        return False
    else:
        return True


if __name__ == "__main__":
    trainingImages = idx2numpy.convert_from_file("train-images-idx3-ubyte")
    trainingLabels = idx2numpy.convert_from_file("train-labels-idx1-ubyte")
    testImages = idx2numpy.convert_from_file("t10k-images-idx3-ubyte")
    testLabels = idx2numpy.convert_from_file("t10k-labels-idx1-ubyte")

    labelSet = set()
    labelSet.update(set(trainingLabels))
    labelSet.update(set(testLabels))
    # ints = [i for i in labelSet if isint(i)]
    # labelSet.difference_update(ints)
    # labelSet = [str(j) for j in (sorted([int(i) for i in ints]) + sorted(labelSet))]
    labelSet = [str(j) for j in sorted(labelSet)]
    extension = "png"
import numpy as np
import math
import idx2numpy as inp
from sklearn.neighbors import KNeighborsClassifier

trimgs = inp.convert_from_file('train-images.idx3-ubyte')
trlbls = inp.convert_from_file('train-labels.idx1-ubyte')
tsimgs = inp.convert_from_file('t10k-images.idx3-ubyte')
tslbls = inp.convert_from_file('t10k-labels.idx1-ubyte')

trimgs = np.resize(trimgs, (60000, 784))
tsimgs = np.resize(tsimgs, (10000, 784))

neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(trimgs, trlbls)
pred = neigh.predict(tsimgs)

corr = 0
for i in range(10000):
    if (pred[i] == tslbls[i]):
        corr += 1
print("accuracy: ", corr / 100)
def test_correct_file_on_disk(self):
    file = os.path.join(self.files_dir, 'correct.idx')
    self.assertSequenceEqual(
        [0x0A, 0x0B, 0x0C],
        self._to_list(idx2numpy.convert_from_file(file)))
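# A hedged companion test, assuming the same 'correct.idx' fixture as above:
# the string-based entry point should decode the identical sequence.
def test_correct_file_as_string(self):
    file = os.path.join(self.files_dir, 'correct.idx')
    with open(file, 'rb') as f:
        self.assertSequenceEqual(
            [0x0A, 0x0B, 0x0C],
            self._to_list(idx2numpy.convert_from_string(f.read())))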
def get_mnist_trainset():
    x_train = idx2numpy.convert_from_file('train-images.idx3-ubyte')
    x_train = np.array([x.flatten() for x in x_train])
    y_train = idx2numpy.convert_from_file('train-labels.idx1-ubyte')
    return x_train, y_train
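# Usage sketch for get_mnist_trainset() above (idx files assumed in the
# working directory):
x_train, y_train = get_mnist_trainset()
print(x_train.shape)  # expected: (60000, 784) after flattening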
import idx2numpy
import numpy as np
from sklearn.lda import LDA
from multiprocessing import Pool
import pdb

imagestra = idx2numpy.convert_from_file('train-images.idx3-ubyte')
labelstra = idx2numpy.convert_from_file('train-labels.idx1-ubyte')
labelsmaptra = {x: [] for x in set(labelstra)}
imagesArray = []
for i in range(len(imagestra)):
    imagesArray.append(imagestra[i].reshape((1, (28 * 28)))[0])
for lab, img in zip(list(labelstra), imagesArray):
    labelsmaptra[lab].append(img)

imagestes = idx2numpy.convert_from_file('t10k-images.idx3-ubyte')
labelstes = idx2numpy.convert_from_file('t10k-labels.idx1-ubyte')
labelsmaptes = {x: [] for x in set(labelstes)}
imagesArray = []
for i in range(len(imagestes)):
    imagesArray.append(imagestes[i].reshape((1, (28 * 28)))[0])
for lab, img in zip(list(labelstes), imagesArray):
    labelsmaptes[lab].append(img)

#numbers = [ (x, y) for x in range(10) for y in range(x,10) if x != y ]


def runTestPairs(e):
import numpy as np
import idx2numpy
import csv

"""
The idx files for the MNIST dataset can be downloaded at
http://yann.lecun.com/exdb/mnist/. This python script can then be used to
convert them into two csv files. The first containing all 70,000 images
(one row per image), and the second containing all 70,000 labels (single row).
"""

trainImages = idx2numpy.convert_from_file('/home/simjay/workspace/NaiveBayes/others/mnist/train-images.idx3-ubyte')
trainLabels = idx2numpy.convert_from_file('/home/simjay/workspace/NaiveBayes/others/mnist/train-labels.idx1-ubyte')
testImages = idx2numpy.convert_from_file('/home/simjay/workspace/NaiveBayes/others/mnist/t10k-images.idx3-ubyte')
testLabels = idx2numpy.convert_from_file('/home/simjay/workspace/NaiveBayes/others/mnist/t10k-labels.idx1-ubyte')

trainImages = np.concatenate([trainImages.reshape(60000, 784)])
testImages = np.concatenate([testImages.reshape(10000, 784)])
trainLabels = np.concatenate([trainLabels])
testLabels = np.concatenate([testLabels])

with open('/home/simjay/workspace/NaiveBayes/mnistCSV/trainImages.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row in trainImages:
        writer.writerow(row)
print("done")

with open('/home/simjay/workspace/NaiveBayes/mnistCSV/testImages.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)