Example 1
def train(ctx, config, classifier_config, model, n_fold, seed,
          plot, diagnostics, overwrite):
    """
    Train a classifier from `scikit-learn` on YATSM output and save result to
    file <model>. Dataset configuration is specified by <yatsm_config> and
    classifier and classifier parameters are specified by <classifier_config>.
    """
    if not model.endswith('.pkl'):
        model += '.pkl'
    if os.path.isfile(model) and not overwrite:
        logger.error('<model> exists and --overwrite was not specified')
        raise click.Abort()

    if seed:
        np.random.seed(seed)

    # Parse YATSM config
    dataset_config, yatsm_config = parse_config_file(config)

    if not dataset_config['training_image'] or \
            not os.path.isfile(dataset_config['training_image']):
        logger.error('Training data image {f} does not exist'.format(
            f=dataset_config['training_image']))
        raise click.Abort()

    # Parse classifier config
    algorithm_helper = classifiers.ini_to_algorthm(classifier_config)

    main(dataset_config, yatsm_config, algorithm_helper, model,
         diagnostics, n_fold, plot)
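
The command writes the fitted estimator to <model> as a joblib pickle. A minimal sketch of loading that pickle back for prediction, assuming only that the file was written with joblib as above (`load_and_predict` is a hypothetical helper, not part of YATSM):

import joblib  # older releases import this as sklearn.externals.joblib
import numpy as np

def load_and_predict(model_path, X):
    # Load the pickled scikit-learn estimator and classify the feature rows
    algo = joblib.load(model_path)
    return algo.predict(np.asarray(X))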
Example 2
def classify(ctx, config, algo, job_number, total_jobs, resume):
    cfg = parse_config_file(config)

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    nrow = get_image_attribute(df['filename'][0])[0]

    classifier = joblib.load(algo)

    # Split into lines and classify
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        filename = get_output_name(cfg['dataset'], job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            continue

        if resume and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
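
`distribute_jobs` is not shown in these examples; a plausible sketch, assuming a simple split of `nrow` lines across `total_jobs` workers (the `interlaced` keyword appears in the cache examples below), could look like this:

import numpy as np

def distribute_jobs(job_number, total_jobs, nrow, interlaced=True):
    # Hypothetical sketch: either interleave rows across jobs
    # (job_number, job_number + total_jobs, ...) or hand out contiguous chunks
    if interlaced:
        return np.arange(job_number, nrow, total_jobs)
    chunk = int(np.ceil(nrow / float(total_jobs)))
    return np.arange(job_number * chunk, min((job_number + 1) * chunk, nrow))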
Example 3
def main(args):
    """ Classify dataset """
    # Parse config and file data
    dataset_config, yatsm_config = parse_config_file(args['config_file'])

    # Get some attributes about the dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    nrow, _, _, _ = get_image_attribute(images[0])

    # Make sure the saved classification result can be opened
    try:
        with open(args['algo']):
            pass
    except IOError:
        logger.error('Could not open pickled classifier %s' % args['algo'])
        sys.exit(1)

    classifier = joblib.load(args['algo'])

    # Split into lines and classify
    job_lines = calculate_lines(args['job_number'] - 1, args['total_jobs'],
                                nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:

        filename = get_output_name(dataset_config, job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            continue

        if args['resume'] and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
Example 4
def train(ctx, config, classifier_config, model, n_fold, seed, plot, diagnostics, overwrite):
    """
    Train a classifier from `scikit-learn` on YATSM output and save result to
    file <model>. Dataset configuration is specified by <yatsm_config> and
    classifier and classifier parameters are specified by <classifier_config>.
    """
    # Setup
    if not model.endswith(".pkl"):
        model += ".pkl"
    if os.path.isfile(model) and not overwrite:
        logger.error("<model> exists and --overwrite was not specified")
        raise click.Abort()

    if seed:
        np.random.seed(seed)

    # Parse config & algorithm config
    cfg = parse_config_file(config)
    algo, algo_cfg = classifiers.cfg_to_algorithm(classifier_config)

    training_image = cfg["classification"]["training_image"]
    if not training_image or not os.path.isfile(training_image):
        logger.error("Training data image %s does not exist" % training_image)
        raise click.Abort()

    # Find information from results -- e.g., design info
    attrs = find_result_attributes(cfg)
    cfg["YATSM"].update(attrs)

    # Cache file for training data
    has_cache = False
    training_cache = cfg["classification"]["cache_training"]
    if training_cache:
        # If the cache file doesn't exist, the inputs will be read and cached below
        if not os.path.isfile(training_cache):
            logger.info("Could not retrieve cache file for Xy")
            logger.info("    file: %s" % training_cache)
        else:
            logger.info("Restoring X/y from cache file")
            has_cache = True

    # Check if we need to regenerate the cache file because training data is
    #   newer than the cache
    regenerate_cache = is_cache_old(training_cache, training_image)
    if regenerate_cache:
        logger.warning("Existing cache file older than training data ROI")
        logger.warning("Regenerating cache file")

    if not has_cache or regenerate_cache:
        logger.debug("Reading in X/y")
        X, y, row, col, labels = get_training_inputs(cfg)
        logger.debug("Done reading in X/y")
    else:
        logger.debug("Reading in X/y from cache file %s" % training_cache)
        with np.load(training_cache) as f:
            X = f["X"]
            y = f["y"]
            row = f["row"]
            col = f["col"]
            labels = f["labels"]
        logger.debug("Read in X/y from cache file %s" % training_cache)

    # If cache didn't exist but is specified, create it for first time
    if not has_cache and training_cache:
        logger.info("Saving X/y to cache file %s" % training_cache)
        try:
            np.savez(training_cache, X=X, y=y, row=row, col=col, labels=labels)
        except Exception:
            logger.error("Could not save X/y to cache file")
            raise

    # Do modeling
    logger.info("Training classifier")
    algo.fit(X, y, **algo_cfg.get("fit", {}))

    # Serialize algorithm to file
    logger.info("Pickling classifier with sklearn.externals.joblib")
    joblib.dump(algo, model, compress=3)

    # Diagnostics
    if diagnostics:
        algo_diagnostics(cfg, X, y, row, col, algo, n_fold, plot)
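
The X/y training cache used above is a plain NumPy `.npz` archive. A self-contained round trip of that pattern, with an illustrative file name and random data standing in for the real training inputs:

import numpy as np

# Save training inputs to an .npz cache, then restore them, mirroring the
# np.savez / np.load calls in `train` above
X = np.random.rand(20, 6)
y = np.random.randint(0, 4, 20)
np.savez('Xy_cache.npz', X=X, y=y)
with np.load('Xy_cache.npz') as f:
    X_cached, y_cached = f['X'], f['y']
assert np.array_equal(X, X_cached) and np.array_equal(y, y_cached)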
Example 5
def line(ctx, config, job_number, total_jobs,
         resume, check_cache, do_not_run, verbose_yatsm):
    if verbose_yatsm:
        logger_algo.setLevel(logging.DEBUG)

    # Parse config
    cfg = parse_config_file(config)

    if ('phenology' in cfg and cfg['phenology'].get('enable')) and not pheno:
        click.secho('Could not import yatsm.phenology but phenology metrics '
                    'are requested', fg='red')
        click.secho('Error: %s' % pheno_exception, fg='red')
        raise click.Abort()

    # Make sure output directory exists and is writable
    output_dir = cfg['dataset']['output']
    try:
        os.makedirs(output_dir)
    except OSError as e:
        if e.errno == 17:
            # Directory already exists
            pass
        elif e.errno == 13:
            # Permission denied
            click.secho('Cannot create output directory %s' % output_dir,
                        fg='red')
            raise click.Abort()
        else:
            raise

    if not os.access(output_dir, os.W_OK):
        click.secho('Cannot write to output directory %s' % output_dir,
                    fg='red')
        raise click.Abort()

    # Test existence of cache directory
    read_cache, write_cache = test_cache(cfg['dataset'])

    logger.info('Job {i} of {n} - using config file {f}'.format(i=job_number,
                                                                n=total_jobs,
                                                                f=config))
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(df['filename'][0])

    # Calculate the lines this job ID works on
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    dates = np.asarray(df['date'])
    kws = {'x': dates}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    cfg['YATSM']['design'] = X.design_info.column_name_indexes

    # Form YATSM class arguments
    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    if cfg['YATSM']['reverse']:
        X = np.flipud(X)

    # Create output metadata to save
    md = {
        'YATSM': cfg['YATSM'],
        cfg['YATSM']['algorithm']: cfg[cfg['YATSM']['algorithm']]
    }
    if cfg['phenology']['enable']:
        md.update({'phenology': cfg['phenology']})

    # Begin process
    start_time_all = time.time()
    for line in job_lines:
        out = get_output_name(cfg['dataset'], line)

        if resume:
            try:
                np.load(out)
            except (IOError, ValueError):
                # Previous result missing or unreadable -- run this line
                pass
            else:
                logger.debug('Already processed line %s' % line)
                continue

        logger.debug('Running line %s' % line)
        start_time = time.time()

        Y = read_line(line, df['filename'], df['image_ID'], cfg['dataset'],
                      ncol, nband, dtype,
                      read_cache=read_cache, write_cache=write_cache,
                      validate_cache=False)
        if do_not_run:
            continue
        if cfg['YATSM']['reverse']:
            Y = np.fliplr(Y)

        output = []
        for col in np.arange(Y.shape[-1]):
            _Y = Y.take(col, axis=2)
            # Mask
            idx_mask = cfg['dataset']['mask_band'] - 1
            valid = cyprep.get_valid_mask(
                _Y,
                cfg['dataset']['min_values'],
                cfg['dataset']['max_values']).astype(bool)

            valid *= np.in1d(_Y.take(idx_mask, axis=0),
                             cfg['dataset']['mask_values'],
                             invert=True).astype(bool)

            _Y = np.delete(_Y, idx_mask, axis=0)[:, valid]
            _X = X[valid, :]
            _dates = dates[valid]

            # Run model
            cls = cfg['YATSM']['algorithm_cls']
            algo_cfg = cfg[cfg['YATSM']['algorithm']]

            yatsm = cls(lm=cfg['YATSM']['prediction_object'],
                        **algo_cfg.get('init', {}))
            yatsm.px = col
            yatsm.py = line

            try:
                yatsm.fit(_X, _Y, _dates, **algo_cfg.get('fit', {}))
            except TSLengthException:
                continue

            if yatsm.record is None or len(yatsm.record) == 0:
                continue

            # Postprocess
            if cfg['YATSM'].get('commission_alpha'):
                yatsm.record = postprocess.commission_test(
                    yatsm, cfg['YATSM']['commission_alpha'])

            for prefix, lm in zip(cfg['YATSM']['refit']['prefix'],
                                  cfg['YATSM']['refit']['prediction_object']):
                yatsm.record = postprocess.refit_record(yatsm, prefix, lm,
                                                        keep_regularized=True)

            if cfg['phenology']['enable']:
                pcfg = cfg['phenology']
                ltm = pheno.LongTermMeanPhenology(**pcfg.get('init', {}))
                yatsm.record = ltm.fit(yatsm, **pcfg.get('fit', {}))

            output.extend(yatsm.record)

        logger.debug('    Saving YATSM output to %s' % out)
        np.savez(out,
                 record=np.array(output),
                 version=__version__,
                 metadata=md)

        run_time = time.time() - start_time
        logger.debug('Line %s took %ss to run' % (line, run_time))

    logger.info('Completed {n} lines in {m} minutes'.format(
                n=len(job_lines),
                m=round((time.time() - start_time_all) / 60.0, 2)))
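
The X feature input above comes from evaluating the configured design matrix with `patsy.dmatrix` against the image dates. A small, self-contained sketch of that pattern (the design string and dates are illustrative, not the project's defaults):

import numpy as np
import patsy

# Ordinal dates standing in for df['date']
dates = np.arange(726468, 726468 + 400, 8)
# Evaluate a design with an intercept, a trend, and one harmonic term
X = patsy.dmatrix('1 + x + np.cos(2 * np.pi * x / 365.25)', {'x': dates})
print(X.design_info.column_names)
print(X.shape)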
Example 6
    do_not_run = args['--do-not-run']

    # Setup logger
    if args['--verbose']:
        logger.setLevel(logging.DEBUG)

    if args['--verbose-yatsm']:
        loglevel_YATSM = logging.DEBUG

    if args['--quiet']:
        loglevel_YATSM = logging.WARNING
        logger.setLevel(logging.WARNING)

    # Parse and validate configuration file
    dataset_config, yatsm_config = parse_config_file(config_file)

    # Import phenology stuff only if necessary since it relies on rpy2 / R
    if yatsm_config['calc_pheno'] and not do_not_run:
        import yatsm.phenology as pheno

    # Make output directory
    try:
        os.makedirs(dataset_config['output'])
    except OSError as e:
        # File exists
        if e.errno == 17:
            pass
        elif e.errno == 13:
            print('Error - cannot create output directory {d}'.format(
                d=dataset_config['output']))
Example 7
def pixel(ctx, config, px, py, band, plot, ylim, style, cmap,
          embed, seed, algo_kw):
    # Set seed
    np.random.seed(seed)
    # Convert band to index
    band -= 1

    # Get colormap
    if hasattr(palettable.colorbrewer, cmap):
        mpl_cmap = getattr(palettable.colorbrewer, cmap).mpl_colormap
    elif hasattr(palettable.cubehelix, cmap):
        mpl_cmap = getattr(palettable.cubehelix, cmap).mpl_colormap
    elif hasattr(palettable.wesanderson, cmap):
        mpl_cmap = getattr(palettable.wesanderson, cmap).mpl_colormap
    else:
        click.secho('Cannot find specified colormap in `palettable`', fg='red')
        raise click.Abort()

    # Parse config
    cfg = parse_config_file(config)

    # Apply algorithm overrides
    revalidate = False
    for kw in algo_kw:
        for cfg_key in cfg:
            if kw in cfg[cfg_key]:
                # Parse as YAML for the same type conversions the config
                #   parser uses
                value = yaml.safe_load(algo_kw[kw])

                print('Overriding cfg[%s][%s]=%s with %s' %
                      (cfg_key, kw, cfg[cfg_key][kw], value))
                cfg[cfg_key][kw] = value
                revalidate = True

    if revalidate:
        cfg = convert_config(cfg)

    # Locate and fetch attributes from data
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              date_format=cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Setup X/Y
    kws = {'x': df['date']}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    design_info = X.design_info

    Y = read_pixel_timeseries(df['filename'], px, py)

    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    # Mask out of range data
    idx_mask = cfg['dataset']['mask_band'] - 1
    valid = cyprep.get_valid_mask(Y,
                                  cfg['dataset']['min_values'],
                                  cfg['dataset']['max_values']).astype(bool)
    valid *= np.in1d(Y[idx_mask, :], cfg['dataset']['mask_values'],
                     invert=True).astype(bool)

    # Apply mask
    Y = np.delete(Y, idx_mask, axis=0)[:, valid]
    X = X[valid, :]
    dates = np.array([dt.datetime.fromordinal(d) for d in df['date'][valid]])

    # Plot before fitting
    with plt.xkcd() if style == 'xkcd' else mpl.style.context(style):
        for _plot in plot:
            if _plot == 'TS':
                plot_TS(dates, Y[band, :])
            elif _plot == 'DOY':
                plot_DOY(dates, Y[band, :], mpl_cmap)
            elif _plot == 'VAL':
                plot_VAL(dates, Y[band, :], mpl_cmap)

            if ylim:
                plt.ylim(ylim)
            plt.title('Timeseries: px={px} py={py}'.format(px=px, py=py))
            plt.ylabel('Band {b}'.format(b=band + 1))

            if embed and has_embed:
                IPython_embed()

            plt.tight_layout()
            plt.show()

    # Initialize the algorithm with its config parameters and fit the model
    yatsm = cfg['YATSM']['algorithm_cls'](lm=cfg['YATSM']['prediction_object'],
                                          **cfg[cfg['YATSM']['algorithm']])
    yatsm.px = px
    yatsm.py = py
    yatsm.fit(X, Y, np.asarray(df['date'][valid]))

    # Plot after predictions
    with plt.xkcd() if style == 'xkcd' else mpl.style.context(style):
        for _plot in plot:
            if _plot == 'TS':
                plot_TS(dates, Y[band, :])
            elif _plot == 'DOY':
                plot_DOY(dates, Y[band, :], mpl_cmap)
            elif _plot == 'VAL':
                plot_VAL(dates, Y[band, :], mpl_cmap)

            if ylim:
                plt.ylim(ylim)
            plt.title('Timeseries: px={px} py={py}'.format(px=px, py=py))
            plt.ylabel('Band {b}'.format(b=band + 1))

            plot_results(band, cfg['YATSM'], yatsm, plot_type=_plot)

            if embed and has_embed:
                IPython_embed()

            plt.tight_layout()
            plt.show()
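
The masking in `pixel` (and in `line` above) combines a per-band min/max screen with a Fmask-style mask band. A pure-NumPy sketch of the same idea, with invented thresholds and mask classes standing in for the config values:

import numpy as np

def valid_observations(Y, mins, maxs, idx_mask, mask_values):
    # Keep observations whose bands all fall within [mins, maxs] and whose
    # mask-band value is not one of the masked classes
    in_range = np.all((Y >= mins[:, None]) & (Y <= maxs[:, None]), axis=0)
    unmasked = np.in1d(Y[idx_mask, :], mask_values, invert=True)
    return in_range & unmasked

Y = np.random.randint(0, 10000, size=(8, 50))
valid = valid_observations(Y, np.zeros(8), np.full(8, 10000), 7, [2, 3, 4])
Y_clean = np.delete(Y, 7, axis=0)[:, valid]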
Example 8
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '{p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            else:
                if len(potential) > 1:
                    logger.info('Found more than one previous cache file for '
                                'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs,
                update, cache_filename,
                job_line, image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
Example 9
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format'])

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '{p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(dataset_config, len(images),
                                                   job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(l=job_line,
                                                      f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(job_line,
                                                   nband,
                                                   regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            else:
                if len(potential) > 1:
                    logger.info('Found more than one previous cache file for '
                                'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(images, image_IDs, update, cache_filename,
                                    job_line, image_reader,
                                    image_reader_kwargs)
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
Example 10
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '%s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            else:
                if len(potential) > 1:
                    logger.info('Found more than one previous cache file for '
                                'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
                logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
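
The cache-update path above pairs each line with an existing cache file via `fnmatch`. A small sketch of that matching step, using hypothetical file names and a hypothetical pattern rather than the real `get_line_cache_pattern` output:

import fnmatch
import os

# Directory listing and per-line pattern are invented for illustration
previous_cache = ['yatsm_r100_n447_b8.npz', 'yatsm_r101_n447_b8.npz']
pattern = 'yatsm_r101_n*_b8.npz'

potential = fnmatch.filter(previous_cache, pattern)
update = os.path.join('cache', potential[0]) if potential else False
print(update)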