def test_get_line_cache_name(cachedir, cachefile, n_images, n_row, n_bands):
    cfg = dict(cache_line_dir=cachedir)
    assert cachefile == cache.get_line_cache_name(cfg, n_images, n_row,
                                                  n_bands)
def test_get_line_cache_name(self):
    name = cache.get_line_cache_name(self.config, self.n_images,
                                     self.n_row, self.n_bands)
    self.assertEqual(name, self.test_file)
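# The tests above exercise ``get_line_cache_name`` only through its call
# signature: a config dict with 'cache_line_dir', the number of images, the
# row/line, and the number of bands.  Below is a minimal sketch of what such
# a helper could look like.  The filename pattern used here
# ('yatsm_r{row}_n{n}_b{b}.npy.npz') is an illustrative assumption; the real
# yatsm.cache implementation may use a different naming scheme.
def _example_get_line_cache_name(dataset_config, n_images, row, nbands):
    """ Return a cache filename for one row (illustrative sketch only) """
    filename = 'yatsm_r{row}_n{n}_b{b}.npy.npz'.format(
        row=row, n=n_images, b=nbands)
    return os.path.join(dataset_config['cache_line_dir'], filename)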
def read_line(line, images, image_IDs, dataset_config,
              ncol, nband, dtype,
              read_cache=False, write_cache=False, validate_cache=False):
    """ Read one line of the dataset from cache, or from images if required

    Args:
      line (int): line to read in from images
      images (list): list of image filenames to read from
      image_IDs (iterable): list of image identifying strings
      dataset_config (dict): dictionary of dataset configuration options
      ncol (int): number of columns
      nband (int): number of bands
      dtype (type): NumPy datatype
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to write to cache directory
        (default: False)
      validate_cache (bool, optional): validate that cached data come from
        the same images specified in `images` (default: False)

    Returns:
      Y (np.ndarray): 3D array of image data (nband, n_images, ncol)

    """
    start_time = time.time()

    read_from_disk = True
    cache_filename = get_line_cache_name(dataset_config, len(images),
                                         line, nband)

    Y_shape = (nband, len(images), ncol)

    if read_cache:
        Y = read_cache_file(cache_filename,
                            image_IDs if validate_cache else None)
        if Y is not None and Y.shape == Y_shape:
            logger.debug('Read in Y from cache file')
            read_from_disk = False
        elif Y is not None and Y.shape != Y_shape:
            logger.warning('Data from cache file does not match the '
                           'requested size ({y} versus {r})'.format(
                               y=Y.shape, r=Y_shape))

    if read_from_disk:
        # Read in Y
        if dataset_config['use_bip_reader']:
            # Use BIP reader
            logger.debug('Reading in data from disk using BIP reader')
            Y = read_row_BIP(images, line, (ncol, nband), dtype)
        else:
            # Read in data just using GDAL
            logger.debug('Reading in data from disk using GDAL')
            Y = read_row_GDAL(images, line)

    logger.debug('Took {s}s to read in the data'.format(
        s=round(time.time() - start_time, 2)))

    if write_cache and read_from_disk:
        logger.debug('Writing Y data to cache file {f}'.format(
            f=cache_filename))
        write_cache_file(cache_filename, Y, image_IDs)

    return Y
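# A minimal usage sketch for ``read_line``, assuming ``get_image_attribute``
# from the same reader module and a ``dataset_config`` dict like the one
# parsed in ``main`` below.  The helper name and the row index (0) are
# placeholders for illustration.
def _example_read_line_usage(images, image_IDs, dataset_config):
    nrow, ncol, nband, dtype = get_image_attribute(images[0])
    # Prefer the cache; fall back to disk and write the cache for next time
    Y = read_line(0, images, image_IDs, dataset_config, ncol, nband, dtype,
                  read_cache=True, write_cache=True, validate_cache=True)
    assert Y.shape == (nband, len(images), ncol)
    return Y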
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'], args['total_jobs'],
                                      nrow, interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with '
                           'pattern {p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs, update, cache_filename, job_line,
                image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband),
                                        dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)

            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
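# ``main`` relies on ``cache.write_cache_file`` (and ``read_line`` on
# ``read_cache_file``) to persist each row of data.  Below is a minimal
# sketch of how such a pair could be implemented with NumPy '.npz' archives.
# The archive keys ('Y', 'image_IDs') and the validation behaviour are
# illustrative assumptions, not necessarily what yatsm.cache actually does.
import numpy as np

def _example_write_cache_file(cache_filename, Y, image_IDs):
    """ Save one row of data plus the image IDs it came from (sketch only) """
    np.savez_compressed(cache_filename, Y=Y, image_IDs=np.asarray(image_IDs))

def _example_read_cache_file(cache_filename, image_IDs=None):
    """ Return cached Y, or None if the image IDs do not match (sketch) """
    with np.load(cache_filename) as npz:
        if image_IDs is not None and \
                not np.array_equal(npz['image_IDs'], np.asarray(image_IDs)):
            return None
        return npz['Y']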
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with '
                           'pattern %s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)

            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
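# ``cache`` has the shape of a click command: ``ctx`` plus one parameter per
# CLI argument or option.  Below is a sketch of decorators that would produce
# this signature.  The command name, option names, and defaults are
# illustrative assumptions, not the actual YATSM CLI definition.
import click

@click.command(short_help='Create or update cached line data')
@click.argument('config', type=click.Path(exists=True, dir_okay=False))
@click.argument('job_number', type=click.INT)
@click.argument('total_jobs', type=click.INT)
@click.option('--update', 'update_pattern', default=None,
              help='Glob pattern of existing cache files to update from')
@click.option('--interlace', is_flag=True,
              help='Assign rows to jobs in an interlaced pattern')
@click.pass_context
def _example_cache_command(ctx, config, job_number, total_jobs,
                           update_pattern, interlace):
    # Delegate to the ``cache`` function above (sketch only)
    return cache(ctx, config, job_number, total_jobs, update_pattern,
                 interlace)

# Hypothetical invocation using the names assumed above:
#   yatsm cache config.yaml 1 10 --update 'yatsm_r*' --interlace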