Example #1
def test_update_cache_file_add_obs(cachefile, example_cache,
                                   example_timeseries):
    """ Grab a subset of test data and see if we get more data back """
    stack_images = example_timeseries['images']
    stack_image_IDs = example_timeseries['image_IDs']

    # Presort and subset for comparison
    sort_idx = np.argsort(example_cache['image_IDs'])
    test_Y = example_cache['Y'][:, sort_idx, :]
    test_IDs = example_cache['image_IDs'][sort_idx]

    size_1 = 100
    size_2 = 200

    sort_idx = np.argsort(stack_image_IDs)[:size_2]
    stack_images = stack_images[sort_idx]
    stack_IDs = stack_image_IDs[sort_idx]

    # Create reduced dataset to add to
    np.savez_compressed('test.npz',
                        Y=test_Y[:, :size_1, :],
                        image_IDs=test_IDs[:size_1])

    # Write update and read back
    cache.update_cache_file(stack_images, stack_IDs,
                            'test.npz', 'test_new.npz',
                            0, io.gdal_reader)
    updated = np.load('test_new.npz')

    # Test and clean update
    np.testing.assert_equal(test_Y[:, :size_2, :], updated['Y'])
    np.testing.assert_equal(test_IDs[:size_2], updated['image_IDs'])

    os.remove('test.npz')
    os.remove('test_new.npz')
Example #2
    def test_update_cache_file_add_obs(self):
        """ Grab a subset of test data and see if we get more data back """
        # Presort and subset for comparison
        sort_idx = np.argsort(self.test_data['image_IDs'])
        test_Y = self.test_data['Y'][:, sort_idx, :]
        test_IDs = self.test_data['image_IDs'][sort_idx]

        size_1 = 100
        size_2 = 200

        sort_idx = np.argsort(self.stack_image_IDs)[:size_2]
        stack_images = self.stack_images[sort_idx]
        stack_IDs = self.stack_image_IDs[sort_idx]

        # Create reduced dataset to add to
        np.savez_compressed('test_write_3.npz',
                            Y=test_Y[:, :size_1, :],
                            image_IDs=test_IDs[:size_1])

        # Write update and read back
        cache.update_cache_file(stack_images, stack_IDs, 'test_write_3.npz',
                                'test_write_new_3.npz', 0,
                                reader.read_row_GDAL)
        updated = np.load('test_write_new_3.npz')

        # Test and clean update
        np.testing.assert_equal(test_Y[:, :size_2, :], updated['Y'])
        np.testing.assert_equal(test_IDs[:size_2], updated['image_IDs'])

        os.remove('test_write_3.npz')
        os.remove('test_write_new_3.npz')
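
Both add-observation tests above build their input cache with np.savez_compressed and check the merged result with np.load: the cache file is just an .npz archive holding a Y array (whose second axis indexes observations in these tests) and a matching image_IDs array. A minimal, self-contained sketch of that round trip, with made-up shapes and IDs chosen only for illustration, looks like this:

import os

import numpy as np

# Hypothetical cache contents: 7 bands, 100 observations, 250 columns
Y = np.zeros((7, 100, 250), dtype=np.int16)
image_IDs = np.array(['ID_%03d' % i for i in range(100)])

# Write the cache the same way the tests do, then read it back
np.savez_compressed('cache_example.npz', Y=Y, image_IDs=image_IDs)
loaded = np.load('cache_example.npz')

np.testing.assert_equal(Y, loaded['Y'])
np.testing.assert_equal(image_IDs, loaded['image_IDs'])

os.remove('cache_example.npz')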
Example #3
def test_update_cache_file_delete_obs(cachefile, example_cache):
    choice = np.random.choice(example_cache['image_IDs'].size,
                              size=100,
                              replace=False)
    new_Y = example_cache['Y'][:, choice, :]
    new_image_IDs = example_cache['image_IDs'][choice]

    # For now, just use image_IDs as `images` since we won't be updating
    # from images
    cache.update_cache_file(new_image_IDs, new_image_IDs, cachefile,
                            'test.npz', 0, io.gdal_reader)
    test = np.load('test.npz')
    Y, image_IDs = test['Y'], test['image_IDs']
    os.remove('test.npz')

    np.testing.assert_equal(new_Y, Y)
    np.testing.assert_equal(new_image_IDs, image_IDs)
Example #4
    def test_update_cache_file_delete_obs(self):
        choice = np.random.choice(self.test_data['image_IDs'].size,
                                  size=100,
                                  replace=False)
        new_Y = self.test_data['Y'][:, choice, :]
        new_image_IDs = self.test_data['image_IDs'][choice]

        # For now, just use image_IDs as `images` since we won't be updating
        # from images
        cache.update_cache_file(new_image_IDs, new_image_IDs, self.test_file,
                                'test_write_2.npz', 0, reader.read_row_GDAL)

        new_cache = np.load('test_write_2.npz')

        np.testing.assert_equal(new_Y, new_cache['Y'])
        np.testing.assert_equal(new_image_IDs, new_cache['image_IDs'])

        os.remove('test_write_2.npz')
Example #5
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '{p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            if update:
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs,
                update, cache_filename,
                job_line, image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
Example #6
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '%s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])

            if update:
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
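
Across the examples, update_cache_file is called with the same argument pattern: the image filenames, their image_IDs, the path of the existing cache .npz to update, the path to write the updated cache to, the line (row) index, a per-row reader function, and (in the two caching scripts) an optional dict of keyword arguments for that reader. A minimal sketch of that call pattern is shown below; the file names are hypothetical and the 'from yatsm import cache, reader' import path is an assumption, since the examples only show the module names cache and reader.

import numpy as np

from yatsm import cache, reader  # assumed module locations

# Hypothetical inputs for illustration only
images = ['LT50120312001001_stack.gtif', 'LT50120312001017_stack.gtif']
image_IDs = ['LT50120312001001', 'LT50120312001017']
line = 0  # row to (re)cache

# Merge an existing per-line cache with observations from any new images,
# reading missing rows with the GDAL row reader used in the examples above
cache.update_cache_file(images, image_IDs,
                        'old_cache.npz', 'new_cache.npz',
                        line, reader.read_row_GDAL)

# The result is another .npz with 'Y' and 'image_IDs' arrays
updated = np.load('new_cache.npz')
print(updated['Y'].shape, updated['image_IDs'])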