def preprocess(self, copy=False):
    """
    Builds the preprocessing function pair (one callable per view) according
    to the dataset settings; the pair is applied by consumers of the dataset.
    When whitening is enabled, the train, test and tuning sets are also
    PCA-whitened in place here.

    Note that the options below are not composable: each enabled option
    overwrites self.preprocessors, so only the last one takes effect.

    :param copy: whether the preprocessors should copy the input matrix
                 instead of transforming it in place.
    """
    if self.normalize_data:
        self.preprocessors = (functools.partial(preprocessing.normalize, copy=copy),
                              functools.partial(preprocessing.normalize, copy=copy))

    if self.scale:
        self.preprocessors = (preprocessing.StandardScaler(copy=copy).fit(self.trainset[0]).transform,
                              preprocessing.StandardScaler(copy=copy).fit(self.trainset[1]).transform)

    if self.scale_rows:
        self.preprocessors = (functools.partial(preprocessing.scale, copy=copy, axis=1),
                              functools.partial(preprocessing.scale, copy=copy, axis=1))

    if self.pca[0] != 0:
        self.preprocessors = (PCA(self.pca[0], copy=copy, whiten=self.whiten).fit(self.trainset[0].copy()).transform,
                              lambda x: x)

    if self.pca[1] != 0:
        # Fixed: the Y-view PCA previously took its dimensionality from self.pca[0].
        self.preprocessors = (lambda x: x,
                              PCA(self.pca[1], copy=copy, whiten=self.whiten).fit(self.trainset[1].copy()).transform)

    if self.whiten:
        OutputLog().write('using whiten')
        pca_dim1 = PCA(whiten=True)
        pca_dim2 = PCA(whiten=True)
        pca_dim1.fit(self.trainset[0])
        pca_dim2.fit(self.trainset[1])

        self.trainset = (pca_dim1.transform(self.trainset[0]), pca_dim2.transform(self.trainset[1]))
        self.testset = (pca_dim1.transform(self.testset[0]), pca_dim2.transform(self.testset[1]))
        if self.tuning is not None:
            self.tuning = (pca_dim1.transform(self.tuning[0]), pca_dim2.transform(self.tuning[1]))
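
# Standalone usage sketch (not part of the class): how a consumer might apply
# the preprocessor pair built above. The shapes and data here are hypothetical;
# only the functools/preprocessing usage mirrors the 'normalize' branch.
import functools

import numpy
from sklearn import preprocessing

view_x = numpy.random.rand(100, 392)  # hypothetical X view: 100 samples x 392 dims
view_y = numpy.random.rand(100, 392)  # hypothetical Y view

preprocessors = (functools.partial(preprocessing.normalize, copy=True),
                 functools.partial(preprocessing.normalize, copy=True))

# One preprocessor per view, applied before batches are fed to the model.
processed_x = preprocessors[0](view_x)
processed_y = preprocessors[1](view_y)

# After l2 normalization every row has unit norm.
print(numpy.linalg.norm(processed_x, axis=1)[:3])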
def __init__(self, data_set_parameters):
    OutputLog().write('Loading dataset: ' + data_set_parameters['name'])

    self.dataset_path = data_set_parameters['path']
    self.trainset = None
    self.testset = None
    self.tuning = None
    self.reduce_val = 0
    self.x_y_mapping = {'train': None, 'dev': None, 'test': None}
    self.x_reduce = {'train': None, 'dev': None, 'test': None}

    self.data_set_parameters = data_set_parameters
    self.scale = bool(int(data_set_parameters['scale']))
    self.scale_rows = bool(int(data_set_parameters['scale_samples']))
    self.whiten = bool(int(data_set_parameters['whiten']))
    # 'pca' is a space-separated pair of per-view dimensionalities, e.g. '50 50'.
    self.pca = map(int, data_set_parameters['pca'].split())
    self.normalize_data = bool(int(data_set_parameters['normalize']))
    self.preprocessors = None
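
# Hedged example of the parameter dictionary __init__ consumes. The keys are
# taken from the lookups above; the values are invented placeholders.
example_parameters = {
    'name': 'mnist_halves',        # hypothetical dataset name
    'path': '/path/to/dataset',    # directory (or a file inside it) holding the cache
    'scale': '0',                  # flags arrive as strings, parsed via bool(int(...))
    'scale_samples': '0',
    'whiten': '0',
    'normalize': '1',
    'pca': '50 50',                # space-separated per-view PCA dims; '0 0' disables PCA
}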
def load(self):
    """
    Loads the dataset from a local cache of three .npy files if one exists;
    otherwise the data is built through the build_dataset method. The dataset
    consists of three tuples, for training, testing and validation. Each tuple
    holds two matrices, one per view: X is MxD1 and Y is MxD2, where M is the
    number of samples and D1 and D2 are the dimensionalities of views X and Y
    respectively.
    """
    path = self.dataset_path
    if not os.path.isdir(self.dataset_path):
        path = os.path.dirname(os.path.abspath(self.dataset_path))

    params = os.path.join(path, 'params.p')

    try:
        self.trainset = self.load_cache(path, 'train')
        self.testset = self.load_cache(path, 'test')
        self.tuning = self.load_cache(path, 'validate')

        try:
            self.x_y_mapping['test'] = numpy.load(os.path.join(path, 'mapping_test.npy'), mmap_mode='r')
            self.x_y_mapping['dev'] = numpy.load(os.path.join(path, 'mapping_dev.npy'), mmap_mode='r')
            with open(os.path.join(path, 'reduce.p'), 'rb') as reduce_file:
                self.x_reduce = cPickle.load(reduce_file)
        except Exception:
            OutputLog().write('Failed loading mappings')
            self.generate_mapping()

            # Save the regenerated mappings to disk.
            numpy.save(os.path.join(path, 'mapping_test'), self.x_y_mapping['test'])
            numpy.save(os.path.join(path, 'mapping_dev'), self.x_y_mapping['dev'])
            with open(os.path.join(path, 'reduce.p'), 'wb') as reduce_file:
                cPickle.dump(self.x_reduce, reduce_file)

        with open(params, 'rb') as params_file:
            loaded_params = cPickle.load(params_file)
            OutputLog().write('Loaded dataset params: {0}'.format(loaded_params))

    except Exception as e:
        OutputLog().write('Failed loading from local cache with exception: {}'.format(e))
        self.build_dataset()

    self.preprocess()

    OutputLog().write('Dataset dimensions = %d, %d' % (self.trainset[0].shape[1], self.trainset[1].shape[1]))
    OutputLog().write('Training set size = %d' % self.trainset[0].shape[0])
    OutputLog().write('Test set size = %d' % self.testset[0].shape[0])
    OutputLog().write('Dataset params: {0}'.format(self.data_set_parameters))

    if self.tuning is not None:
        OutputLog().write('Tuning set size = %d' % self.tuning[0].shape[0])
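
# Sketch of the mapping-cache round trip performed by load(). The path and the
# mapping array are placeholders; the file names and modes match the code above.
import os

import cPickle
import numpy

cache_dir = '/path/to/dataset'  # hypothetical cache directory
mapping_test = numpy.zeros(1000, dtype=numpy.int32)  # placeholder mapping

# What load() writes after regenerating the mappings:
numpy.save(os.path.join(cache_dir, 'mapping_test'), mapping_test)  # -> mapping_test.npy
with open(os.path.join(cache_dir, 'reduce.p'), 'wb') as reduce_file:
    cPickle.dump({'train': None, 'dev': None, 'test': None}, reduce_file)

# What it reads back on the next run, memory-mapped read-only:
cached_mapping = numpy.load(os.path.join(cache_dir, 'mapping_test.npy'), mmap_mode='r')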
@classmethod
def print_params(cls):
    OutputLog().write('Params:\n')
    for key, value in cls.__dict__.iteritems():
        if not key.startswith('__'):
            OutputLog().write('{0}: {1}'.format(key, value))
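
# Illustration only: a minimal Params stand-in showing what print_params logs.
# The attribute names appear elsewhere in this code; the values are invented.
class ExampleParams(object):
    OUTPUT_LAYER = 3
    VALIDATION_BATCH_SIZE = 128

for key, value in ExampleParams.__dict__.iteritems():
    if not key.startswith('__'):
        print('{0}: {1}'.format(key, value))  # e.g. OUTPUT_LAYER: 3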
def test_model(model_x, model_y, dataset_x, dataset_y, preprocessors=None, reduce=0):
    test_x = dataset_x
    test_y = dataset_y

    x_total_value = None
    y_total_value = None

    if preprocessors is None:
        preprocessors = (None, None)

    # Run the X view through model_y and the Y view through model_x,
    # accumulating the per-batch outputs into two full matrices.
    for index, batch in enumerate(
            iterate_single_minibatch(test_x, Params.VALIDATION_BATCH_SIZE, False,
                                     preprocessor=preprocessors[0])):
        x_values = model_y(batch)[0]
        if x_total_value is None:
            x_total_value = x_values
        else:
            x_total_value = numpy.vstack((x_total_value, x_values))

    for index, batch in enumerate(
            iterate_single_minibatch(test_y, Params.VALIDATION_BATCH_SIZE, False,
                                     preprocessor=preprocessors[1])):
        y_values = model_x(batch)[0]
        if y_total_value is None:
            y_total_value = y_values
        else:
            y_total_value = numpy.vstack((y_total_value, y_values))

    # Save image pairs that stitch a reconstructed half to the ground-truth
    # other half. Each 392-dim vector is reshaped column-major to a 28x14
    # half image (MNIST digit halves).
    for index, (x_tilde, y_tilde) in enumerate(zip(y_total_value, x_total_value)):
        x_tilde_reshape = x_tilde.reshape((28, 14), order='F')
        y_tilde_reshape = y_tilde.reshape((28, 14), order='F')
        x_reshape = test_x[index].reshape((28, 14), order='F')
        y_reshape = test_y[index].reshape((28, 14), order='F')

        image_tilde_x = numpy.hstack((x_tilde_reshape, y_reshape))
        image_tilde_y = numpy.hstack((x_reshape, y_tilde_reshape))

        # Clamp negative pixel values to zero before saving.
        image_tilde_x = (image_tilde_x + abs(image_tilde_x)) / 2
        image_tilde_y = (image_tilde_y + abs(image_tilde_y)) / 2

        pyplot.imsave(os.path.join('/home/avive/theses/MNIST_results2/x/', '{0}.jpg'.format(index)),
                      image_tilde_x, cmap='Greys_r')
        pyplot.imsave(os.path.join('/home/avive/theses/MNIST_results2/y/', '{0}.jpg'.format(index)),
                      image_tilde_y, cmap='Greys_r')

    header = ['layer', 'loss', 'corr', 'search1', 'search5', 'desc1', 'desc5']
    rows = []

    search_recall, describe_recall = complete_rank(x_total_value, y_total_value, reduce)
    loss = euclidean_error(x_total_value, y_total_value)
    correlation = calculate_correlation(x_total_value, y_total_value)

    print_row = ["{0} ".format(Params.OUTPUT_LAYER), loss, correlation]
    print_row.extend(search_recall)
    print_row.extend(describe_recall)
    rows.append(print_row)

    OutputLog().write(tabulate(rows, headers=header))
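
# Standalone check of the clamping trick used before imsave above:
# (x + |x|) / 2 zeroes negative entries and keeps positive ones,
# i.e. an element-wise max(x, 0).
import numpy

image = numpy.array([[-0.5, 0.2], [0.7, -0.1]])
clamped = (image + numpy.abs(image)) / 2
assert numpy.allclose(clamped, numpy.maximum(image, 0))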
VALIDATE_ALL = False
MEMORY_LIMIT = 8000000.

if __name__ == '__main__':

    data_set_config = sys.argv[1]
    if len(sys.argv) > 2:
        top = int(sys.argv[2])
    else:
        top = 0

    model_results = {'train': [], 'validate': []}

    results_folder = os.path.join(os.getcwd(), 'results')
    OutputLog().set_path(results_folder)
    OutputLog().set_verbosity('info')

    data_config = ConfigParser.ConfigParser()
    data_config.read(data_set_config)
    data_parameters = ConfigSectionMap("dataset_parameters", data_config)

    # construct data set
    data_set = Container().create(data_parameters['name'], data_parameters)
    data_set.load()

    Params.print_params()

    # Export network
    path = OutputLog().output_path
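
# The script reads an INI-style config whose [dataset_parameters] section is
# flattened by ConfigSectionMap. A hypothetical config (values invented):
#
#     [dataset_parameters]
#     name = mnist_halves
#     path = /path/to/dataset
#     scale = 0
#     scale_samples = 0
#     whiten = 0
#     normalize = 1
#     pca = 50 50
#
# Invocation (script name hypothetical): python train.py dataset.ini [top]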
        if preprocessor is not None:
            yield preprocessor(numpy.copy(buffer[excerpt]))
        else:
            yield buffer[excerpt]


if __name__ == '__main__':

    data_set_config = sys.argv[1]

    model_results = {'train': [], 'validate': []}

    results_folder = os.path.join(os.getcwd(), 'results')
    OutputLog().set_path(results_folder)
    OutputLog().set_verbosity('info')

    data_config = ConfigParser.ConfigParser()
    data_config.read(data_set_config)
    data_parameters = ConfigSectionMap("dataset_parameters", data_config)

    # construct data set
    data_set = Container().create(data_parameters['name'], data_parameters)
    data_set.load()

    y_var = tensor.matrix()
    x_var = tensor.matrix()

    model = tied_dropout_iterative_model
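
# For context, a self-contained iterator with the same yield contract as the
# generator excerpt above. The slicing and argument handling are assumptions;
# only the preprocessor branch mirrors the original code.
import numpy

def example_iterate_single_minibatch(inputs, batch_size, shuffle=False, preprocessor=None):
    indices = numpy.arange(len(inputs))
    if shuffle:
        numpy.random.shuffle(indices)
    for start in range(0, len(inputs) - batch_size + 1, batch_size):
        excerpt = indices[start:start + batch_size]
        if preprocessor is not None:
            # Copy first so the preprocessor cannot mutate the cached buffer.
            yield preprocessor(numpy.copy(inputs[excerpt]))
        else:
            yield inputs[excerpt]

for batch in example_iterate_single_minibatch(numpy.random.rand(10, 4), 5):
    print(batch.shape)  # -> (5, 4), twice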