def test_update_cache_file_add_obs(cachefile, example_cache, example_timeseries):
    """ Grab a subset of test data and see if we get more data back """
    stack_images = example_timeseries['images']
    stack_image_IDs = example_timeseries['image_IDs']

    # Presort and subset for comparison
    sort_idx = np.argsort(example_cache['image_IDs'])
    test_Y = example_cache['Y'][:, sort_idx, :]
    test_IDs = example_cache['image_IDs'][sort_idx]

    size_1 = 100
    size_2 = 200

    sort_idx = np.argsort(stack_image_IDs)[:size_2]
    stack_images = stack_images[sort_idx]
    stack_IDs = stack_image_IDs[sort_idx]

    # Create reduced dataset to add to
    np.savez_compressed('test.npz',
                        Y=test_Y[:, :size_1, :],
                        image_IDs=test_IDs[:size_1])

    # Write update and read back
    cache.update_cache_file(stack_images, stack_IDs,
                            'test.npz', 'test_new.npz',
                            0, io.gdal_reader)
    updated = np.load('test_new.npz')

    # Test and clean update
    np.testing.assert_equal(test_Y[:, :size_2, :], updated['Y'])
    np.testing.assert_equal(test_IDs[:size_2], updated['image_IDs'])

    os.remove('test.npz')
    os.remove('test_new.npz')
def test_update_cache_file_add_obs(self):
    """ Grab a subset of test data and see if we get more data back """
    # Presort and subset for comparison
    sort_idx = np.argsort(self.test_data['image_IDs'])
    test_Y = self.test_data['Y'][:, sort_idx, :]
    test_IDs = self.test_data['image_IDs'][sort_idx]

    size_1 = 100
    size_2 = 200

    sort_idx = np.argsort(self.stack_image_IDs)[:size_2]
    stack_images = self.stack_images[sort_idx]
    stack_IDs = self.stack_image_IDs[sort_idx]

    # Create reduced dataset to add to
    np.savez_compressed('test_write_3.npz',
                        Y=test_Y[:, :size_1, :],
                        image_IDs=test_IDs[:size_1])

    # Write update and read back
    cache.update_cache_file(stack_images, stack_IDs,
                            'test_write_3.npz', 'test_write_new_3.npz',
                            0, reader.read_row_GDAL)
    updated = np.load('test_write_new_3.npz')

    # Test and clean update
    np.testing.assert_equal(test_Y[:, :size_2, :], updated['Y'])
    np.testing.assert_equal(test_IDs[:size_2], updated['image_IDs'])

    os.remove('test_write_3.npz')
    os.remove('test_write_new_3.npz')
def test_update_cache_file_delete_obs(cachefile, example_cache):
    choice = np.random.choice(example_cache['image_IDs'].size,
                              size=100, replace=False)
    new_Y = example_cache['Y'][:, choice, :]
    new_image_IDs = example_cache['image_IDs'][choice]

    # For now, just use image_IDs as `images` since we won't be updating
    # from images
    cache.update_cache_file(new_image_IDs, new_image_IDs,
                            cachefile, 'test.npz',
                            0, io.gdal_reader)
    test = np.load('test.npz')
    Y, image_IDs = test['Y'], test['image_IDs']
    os.remove('test.npz')

    np.testing.assert_equal(new_Y, Y)
    np.testing.assert_equal(new_image_IDs, image_IDs)
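# The pytest-style tests above pull their inputs from fixtures (`cachefile`,
# `example_cache`, `example_timeseries`) defined elsewhere. Below is a
# minimal sketch of what a matching conftest.py could provide; the sizes,
# dtype, and ID format are illustrative assumptions, not the project's
# actual test data. The `[:, sort_idx, :]` indexing implies Y is laid out as
# (nband, nobs, ncol). An `example_timeseries` fixture would similarly
# supply 'images' (file paths) and 'image_IDs'.
import numpy as np
import pytest


@pytest.fixture
def example_cache():
    n_band, n_obs, n_col = 8, 447, 250
    return {
        # The tests sort and slice along axis 1, so axis 1 holds the
        # observations
        'Y': np.random.randint(0, 10000,
                               size=(n_band, n_obs, n_col)).astype(np.int16),
        'image_IDs': np.array(['ID_%04d' % i for i in range(n_obs)]),
    }


@pytest.fixture
def cachefile(tmpdir, example_cache):
    # A cache file on disk whose contents match `example_cache`
    path = str(tmpdir.join('cache.npz'))
    np.savez_compressed(path, **example_cache)
    return path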
def test_update_cache_file_delete_obs(self):
    choice = np.random.choice(self.test_data['image_IDs'].size,
                              size=100, replace=False)
    new_Y = self.test_data['Y'][:, choice, :]
    new_image_IDs = self.test_data['image_IDs'][choice]

    # For now, just use image_IDs as `images` since we won't be updating
    # from images
    cache.update_cache_file(new_image_IDs, new_image_IDs,
                            self.test_file, 'test_write_2.npz',
                            0, reader.read_row_GDAL)
    new_cache = np.load('test_write_2.npz')

    np.testing.assert_equal(new_Y, new_cache['Y'])
    np.testing.assert_equal(new_image_IDs, new_cache['image_IDs'])

    os.remove('test_write_2.npz')
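# Taken together, these tests pin down the contract for
# `cache.update_cache_file`: the output cache follows the order of the
# requested `image_IDs`, observations already present in the old cache are
# reused, and anything missing is read from `images` with the supplied
# reader. A rough sketch of that behavior follows; it is an assumption for
# illustration, not YATSM's actual implementation.
import numpy as np


def update_cache_file_sketch(images, image_IDs, old_cache, new_cache,
                             line, reader_fn, reader_kwargs=None):
    old = np.load(old_cache)
    old_ids = list(old['image_IDs'])
    image_IDs = np.asarray(image_IDs)

    # Allocate output in requested-ID order
    nband, _, ncol = old['Y'].shape
    Y = np.empty((nband, image_IDs.size, ncol), dtype=old['Y'].dtype)

    cached = np.array([_id in old_ids for _id in image_IDs])
    # Reuse observations already in the old cache
    Y[:, cached, :] = old['Y'][:, [old_ids.index(_id)
                                   for _id in image_IDs[cached]], :]
    if not cached.all():
        # Read the remaining observations from disk for this line
        Y[:, ~cached, :] = reader_fn(np.asarray(images)[~cached], line,
                                     **(reader_kwargs or {}))

    np.savez_compressed(new_cache, Y=Y, image_IDs=image_IDs)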
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'], args['total_jobs'],
                                      nrow, interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with '
                           'pattern {p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            # Log only when a matching cache file was actually found
            if update:
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs,
                update, cache_filename,
                job_line, image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
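# Both readers that `main` dispatches on are called as
# reader_fn(images, line, **kwargs) and are expected to return Y shaped
# (nband, nimages, ncol). Below is a rough GDAL-based sketch of that
# interface; it is an illustrative assumption, not the project's
# `read_row_GDAL`, and it assumes multiband inputs.
import numpy as np
from osgeo import gdal


def read_row_gdal_sketch(images, line):
    rows = []
    for fname in images:
        ds = gdal.Open(fname, gdal.GA_ReadOnly)
        # Read a one-row window; for a multiband image ReadAsArray returns
        # an array shaped (nband, 1, ncol)
        rows.append(ds.ReadAsArray(0, line, ds.RasterXSize, 1)[:, 0, :])
    # Stack observations along axis 1 to get (nband, nimages, ncol)
    return np.stack(rows, axis=1)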
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with '
                           'pattern %s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])

            # Log only when a matching cache file was actually found
            if update:
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
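# One way the click command might be exercised from a test, assuming
# `config`, `job_number`, and `total_jobs` are positional arguments and
# `--update-pattern` / `--interlace` are options. The click decorators are
# not shown in this excerpt, so the exact argument and flag names here are
# assumptions for illustration.
from click.testing import CliRunner

if __name__ == '__main__':
    runner = CliRunner()
    result = runner.invoke(cache, ['config.yaml', '1', '10',
                                   '--update-pattern', 'yatsm_r*.npz'])
    print(result.exit_code)
    print(result.output)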