Example #1
0
def classify(ctx, config, algo, job_number, total_jobs, resume):
    cfg = parse_config_file(config)

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    nrow = get_image_attribute(df['filename'][0])[0]

    classifier = joblib.load(algo)

    # Split into lines and classify
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        filename = get_output_name(cfg['dataset'], job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            pass

        if resume and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
Example #2
0
def main(args):
    """ Classify dataset """
    # Parse config and file data
    dataset_config, yatsm_config = parse_config_file(args['config_file'])

    # Get some attributes about the dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    nrow, _, _, _ = get_image_attribute(images[0])

    # Read in the saved classification result
    try:
        _ = open(args['algo'])
    except:
        logger.error('Could not open pickled classifier')
        sys.exit(1)

    classifier = joblib.load(args['algo'])

    # Split into lines and classify
    job_lines = calculate_lines(args['job_number'] - 1, args['total_jobs'],
                                nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:

        filename = get_output_name(dataset_config, job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            pass

        if args['resume'] and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
Example #3
0
def line(ctx, config, job_number, total_jobs,
         resume, check_cache, do_not_run, verbose_yatsm):
    if verbose_yatsm:
        logger_algo.setLevel(logging.DEBUG)

    # Parse config
    cfg = parse_config_file(config)

    if ('phenology' in cfg and cfg['phenology'].get('enable')) and not pheno:
        click.secho('Could not import yatsm.phenology but phenology metrics '
                    'are requested', fg='red')
        click.secho('Error: %s' % pheno_exception, fg='red')
        raise click.Abort()

    # Make sure output directory exists and is writable
    output_dir = cfg['dataset']['output']
    try:
        os.makedirs(output_dir)
    except OSError as e:
        # File exists
        if e.errno == 17:
            pass
        elif e.errno == 13:
            click.secho('Cannot create output directory %s' % output_dir,
                        fg='red')
            raise click.Abort()

    if not os.access(output_dir, os.W_OK):
        click.secho('Cannot write to output directory %s' % output_dir,
                    fg='red')
        raise click.Abort()

    # Test existence of cache directory
    read_cache, write_cache = test_cache(cfg['dataset'])

    logger.info('Job {i} of {n} - using config file {f}'.format(i=job_number,
                                                                n=total_jobs,
                                                                f=config))
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(df['filename'][0])

    # Calculate the lines this job ID works on
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    dates = np.asarray(df['date'])
    kws = {'x': dates}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    cfg['YATSM']['design'] = X.design_info.column_name_indexes

    # Form YATSM class arguments
    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    if cfg['YATSM']['reverse']:
        X = np.flipud(X)

    # Create output metadata to save
    md = {
        'YATSM': cfg['YATSM'],
        cfg['YATSM']['algorithm']: cfg[cfg['YATSM']['algorithm']]
    }
    if cfg['phenology']['enable']:
        md.update({'phenology': cfg['phenology']})

    # Begin process
    start_time_all = time.time()
    for line in job_lines:
        out = get_output_name(cfg['dataset'], line)

        if resume:
            try:
                np.load(out)
            except:
                pass
            else:
                logger.debug('Already processed line %s' % line)
                continue

        logger.debug('Running line %s' % line)
        start_time = time.time()

        Y = read_line(line, df['filename'], df['image_ID'], cfg['dataset'],
                      ncol, nband, dtype,
                      read_cache=read_cache, write_cache=write_cache,
                      validate_cache=False)
        if do_not_run:
            continue
        if cfg['YATSM']['reverse']:
            Y = np.fliplr(Y)

        output = []
        for col in np.arange(Y.shape[-1]):
            _Y = Y.take(col, axis=2)
            # Mask
            idx_mask = cfg['dataset']['mask_band'] - 1
            valid = cyprep.get_valid_mask(
                _Y,
                cfg['dataset']['min_values'],
                cfg['dataset']['max_values']).astype(bool)

            valid *= np.in1d(_Y.take(idx_mask, axis=0),
                             cfg['dataset']['mask_values'],
                             invert=True).astype(np.bool)

            _Y = np.delete(_Y, idx_mask, axis=0)[:, valid]
            _X = X[valid, :]
            _dates = dates[valid]

            # Run model
            cls = cfg['YATSM']['algorithm_cls']
            algo_cfg = cfg[cfg['YATSM']['algorithm']]

            yatsm = cls(lm=cfg['YATSM']['prediction_object'],
                        **algo_cfg.get('init', {}))
            yatsm.px = col
            yatsm.py = line

            try:
                yatsm.fit(_X, _Y, _dates, **algo_cfg.get('fit', {}))
            except TSLengthException:
                continue

            if yatsm.record is None or len(yatsm.record) == 0:
                continue

            # Postprocess
            if cfg['YATSM'].get('commission_alpha'):
                yatsm.record = postprocess.commission_test(
                    yatsm, cfg['YATSM']['commission_alpha'])

            for prefix, lm in zip(cfg['YATSM']['refit']['prefix'],
                                  cfg['YATSM']['refit']['prediction_object']):
                yatsm.record = postprocess.refit_record(yatsm, prefix, lm,
                                                        keep_regularized=True)

            if cfg['phenology']['enable']:
                pcfg = cfg['phenology']
                ltm = pheno.LongTermMeanPhenology(**pcfg.get('init', {}))
                yatsm.record = ltm.fit(yatsm, **pcfg.get('fit', {}))

            output.extend(yatsm.record)

        logger.debug('    Saving YATSM output to %s' % out)
        np.savez(out,
                 record=np.array(output),
                 version=__version__,
                 metadata=md)

        run_time = time.time() - start_time
        logger.debug('Line %s took %ss to run' % (line, run_time))

    logger.info('Completed {n} lines in {m} minutes'.format(
                n=len(job_lines),
                m=round((time.time() - start_time_all) / 60.0, 2)))
Example #4
0
def main(dataset_config, yatsm_config,
         check=False, resume=False,
         do_not_run=False,
         read_cache=False, write_cache=False,
         validate_cache=False):
    """ Read in dataset and YATSM for a complete line

    Args:
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      check (bool, optional): check to make sure images are readible
      resume (bool, optional): do not overwrite existing results, instead
        continue from first non-existing result file
      do_not_run (bool, optional): Don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)

    """
    # Read in dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = get_image_IDs(images)

    # Check for existence of files and remove missing
    if check:
        to_delete = []
        for i, img in enumerate(images):
            if not os.path.isfile(img):
                logger.warning('Could not find file {f} -- removing'.
                               format(f=img))
                to_delete.append(i)

        if len(to_delete) == 0:
            logger.debug('Checked and found all input images')
        else:
            logger.warning('Removing {n} images'.format(n=len(to_delete)))
            dates = np.delete(dates, np.array(to_delete))
            images = np.delete(images, np.array(to_delete))

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(images[0])

    # Calculate the lines this job ID works on
    job_lines = calculate_lines(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    X = patsy.dmatrix(yatsm_config['design_matrix'],
                      {'x': dates, 'sensor': sensors})

    # Start running YATSM
    start_time_all = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        if resume:
            try:
                z = np.load(get_output_name(dataset_config, job_line))
            except:
                pass
            else:
                del z
                logger.debug('Already processed line {l}'.format(l=job_line))
                continue

        logger.debug('Running line {l}'.format(l=job_line))
        start_time = time.time()

        try:
            run_line(job_line, X, images, image_IDs,
                     dataset_config, yatsm_config,
                     nrow, ncol, nband, dtype,
                     do_not_run=do_not_run,
                     read_cache=read_cache, write_cache=write_cache,
                     validate_cache=validate_cache)
        except Exception as e:
            logger.error('Could not process line {l}'.format(l=job_line))
            logger.error(type(e))
            logger.error(str(e))

        logger.debug('Took {s}s to run'.format(
            s=round(time.time() - start_time, 2)))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time_all) / 60.0, 2)
    ))
Example #5
0
def main(dataset_config,
         yatsm_config,
         check=False,
         resume=False,
         do_not_run=False,
         read_cache=False,
         write_cache=False,
         validate_cache=False):
    """ Read in dataset and YATSM for a complete line

    Args:
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      check (bool, optional): check to make sure images are readible
      resume (bool, optional): do not overwrite existing results, instead
        continue from first non-existing result file
      do_not_run (bool, optional): Don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)

    """
    # Read in dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format'])

    image_IDs = get_image_IDs(images)

    # Check for existence of files and remove missing
    if check:
        to_delete = []
        for i, img in enumerate(images):
            if not os.path.isfile(img):
                logger.warning(
                    'Could not find file {f} -- removing'.format(f=img))
                to_delete.append(i)

        if len(to_delete) == 0:
            logger.debug('Checked and found all input images')
        else:
            logger.warning('Removing {n} images'.format(n=len(to_delete)))
            dates = np.delete(dates, np.array(to_delete))
            images = np.delete(images, np.array(to_delete))

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(images[0])

    # Calculate the lines this job ID works on
    job_lines = calculate_lines(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    X = patsy.dmatrix(yatsm_config['design_matrix'], {
        'x': dates,
        'sensor': sensors
    })

    # Start running YATSM
    start_time_all = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        if resume:
            try:
                z = np.load(get_output_name(dataset_config, job_line))
            except:
                pass
            else:
                del z
                logger.debug('Already processed line {l}'.format(l=job_line))
                continue

        logger.debug('Running line {l}'.format(l=job_line))
        start_time = time.time()

        try:
            run_line(job_line,
                     X,
                     images,
                     image_IDs,
                     dataset_config,
                     yatsm_config,
                     nrow,
                     ncol,
                     nband,
                     dtype,
                     do_not_run=do_not_run,
                     read_cache=read_cache,
                     write_cache=write_cache,
                     validate_cache=validate_cache)
        except Exception as e:
            logger.error('Could not process line {l}'.format(l=job_line))
            logger.error(type(e))
            logger.error(str(e))

        logger.debug(
            'Took {s}s to run'.format(s=round(time.time() - start_time, 2)))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines), m=round((time.time() - start_time_all) / 60.0, 2)))
Example #6
0
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern'
                           '{p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(
                images, image_IDs,
                update, cache_filename,
                job_line, image_reader, image_reader_kwargs
            )
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
Example #7
0
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format'])

    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'],
                                      args['total_jobs'],
                                      nrow,
                                      interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern'
                           '{p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(dataset_config, len(images),
                                                   job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(l=job_line,
                                                      f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(job_line,
                                                   nband,
                                                   regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(images, image_IDs, update, cache_filename,
                                    job_line, image_reader,
                                    image_reader_kwargs)
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
Example #8
0
def cache(ctx, config, job_number, total_jobs, update_pattern, interlace):
    cfg = parse_config_file(config)

    if not os.path.isdir(cfg['dataset']['cache_line_dir']):
        os.makedirs(cfg['dataset']['cache_line_dir'])

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_IDs'] = get_image_IDs(df['filename'])

    nrow, ncol, nband, dtype = reader.get_image_attribute(df['filename'][0])

    # Determine lines to work on
    job_lines = distribute_jobs(job_number, total_jobs, nrow,
                                interlaced=interlace)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if cfg['dataset']['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband),
                               'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if update_pattern:
        previous_cache = fnmatch.filter(
            os.listdir(cfg['dataset']['cache_line_dir']), update_pattern)

        if not previous_cache:
            logger.warning('Could not find cache files to update with pattern '
                           '%s' % update_pattern)
        else:
            logger.debug('Found %s previously cached files to update' %
                         len(previous_cache))

    for job_line in job_lines:
        cache_filename = get_line_cache_name(cfg['dataset'], len(df),
                                             job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = get_line_cache_pattern(job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(cfg['dataset']['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            update_cache_file(df['filename'], df['image_IDs'],
                              update, cache_filename,
                              job_line, image_reader, image_reader_kwargs)
        else:
            if cfg['dataset']['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(df['filename'], job_line,
                                        (ncol, nband), dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(df['filename'], job_line)
            write_cache_file(cache_filename, Y, df['image_IDs'])

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))