    def _fetch_results_saved(self):
        """ Read YATSM results and return """
        self.yatsm_model = MockResult()
        row, col = self.series[0].py, self.series[0].px

        data_cfg = {
            'output': os.path.join(self.location,
                                   self.config['results_folder'].value),
            'output_prefix': (self.config['results_pattern'].value
                              .replace('*', ''))
        }
        result_filename = get_output_name(data_cfg, row)
        logger.info('Attempting to open: {f}'.format(f=result_filename))

        if not os.path.isfile(result_filename):
            qgis_log('Could not find result for row {r} ({fn})'.format(
                r=row, fn=result_filename))
            return

        z = np.load(result_filename)
        if 'record' not in z.files:
            raise KeyError('Cannot find "record" within saved result ({})'
                           .format(result_filename))
        if 'metadata' not in z.files:
            raise KeyError('Cannot find "metadata" within saved result ({})'
                           .format(result_filename))
        metadata = z['metadata'].item()
        if 'design' not in metadata['YATSM']:
            raise KeyError('Cannot find "design" within saved result metadata '
                           '({})'.format(result_filename))
        self._design_info = metadata['YATSM']['design']

        rec = z['record']
        idx = np.where((rec['px'] == col) & (rec['py'] == row))[0]
        self.yatsm_model.record = rec[idx]
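The saved result this method opens is a plain NumPy ``.npz`` archive, so it can be inspected directly. A minimal sketch, assuming a hypothetical result filename (newer NumPy needs ``allow_pickle=True`` to unpickle the metadata dict):

import numpy as np

z = np.load('yatsm_r123.npz')            # hypothetical result file for one row
print(z.files)                           # expect 'record' and 'metadata' among the keys
rec = z['record']                        # structured array of fitted segments
print(rec.dtype.names)                   # includes 'px', 'py', 'start', 'end', 'coef', 'rmse'
metadata = z['metadata'].item()          # 0-d object array holding a dict
print(metadata['YATSM']['design'])       # design column name -> index mapping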
Example #2
def classify(ctx, config, algo, job_number, total_jobs, resume):
    cfg = parse_config_file(config)

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    nrow = get_image_attribute(df['filename'][0])[0]

    classifier = joblib.load(algo)

    # Split into lines and classify
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        filename = get_output_name(cfg['dataset'], job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            # Skip missing results instead of falling through to classify_line
            continue

        if resume and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
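``distribute_jobs`` (and ``calculate_lines`` in the older examples below) splits the image rows among parallel jobs; its implementation is not shown here. As a hedged sketch of one common scheme (interleaved striping; the real function may assign contiguous blocks or use different indexing):

import numpy as np

def interleaved_rows(job_index, total_jobs, nrow):
    """Rows assigned to one job (0-indexed), striped across all jobs."""
    return np.arange(job_index, nrow, total_jobs)

print(interleaved_rows(1, 4, 10))  # -> [1 5 9]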
Example #3
def main(args):
    """ Classify dataset """
    # Parse config and file data
    dataset_config, yatsm_config = parse_config_file(args['config_file'])

    # Get some attributes about the dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    nrow, _, _, _ = get_image_attribute(images[0])

    # Read in the saved classification result
    try:
        open(args['algo']).close()
    except IOError:
        logger.error('Could not open pickled classifier')
        sys.exit(1)

    classifier = joblib.load(args['algo'])

    # Split into lines and classify
    job_lines = calculate_lines(args['job_number'] - 1, args['total_jobs'],
                                nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:

        filename = get_output_name(dataset_config, job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            # Skip missing results instead of falling through to classify_line
            continue

        if args['resume'] and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )
Example #4
    def _fetch_results_saved(self):
        """ Read YATSM results and return """
        self.yatsm_model = MockResult()
        row, col = self.series[0].py, self.series[0].px

        data_cfg = {
            'output':
            os.path.join(self.location, self.config['results_folder'].value),
            'output_prefix':
            (self.config['results_pattern'].value.replace('*', ''))
        }
        result_filename = get_output_name(data_cfg, row)
        logger.info('Attempting to open: {f}'.format(f=result_filename))

        if not os.path.isfile(result_filename):
            qgis_log('Could not find result for row {r} ({fn})'.format(
                r=row, fn=result_filename))
            return

        z = np.load(result_filename)
        if 'record' not in z.files:
            raise KeyError(
                'Cannot find "record" within saved result ({})'.format(
                    result_filename))
        if 'metadata' not in z.files:
            raise KeyError(
                'Cannot find "metadata" within saved result ({})'.format(
                    result_filename))
        metadata = z['metadata'].item()
        if 'design' not in metadata['YATSM']:
            raise KeyError('Cannot find "design" within saved result metadata '
                           '({})'.format(result_filename))
        self._design = metadata['YATSM']['design_matrix']
        self._design_info = metadata['YATSM']['design']

        rec = z['record']
        idx = np.where((rec['px'] == col) & (rec['py'] == row))[0]
        self.yatsm_model.record = rec[idx]
Example #5
def get_training_inputs(cfg, exit_on_missing=False):
    """ Returns X features and y labels specified in config file

    Args:
        cfg (dict): YATSM configuration dictionary
        exit_on_missing (bool, optional): exit if input feature cannot be found

    Returns:
        X (np.ndarray): matrix of feature inputs for each training data sample
        y (np.ndarray): array of labeled training data samples
        row (np.ndarray): row pixel locations of `y`
        col (np.ndarray): column pixel locations of `y`
        labels (np.ndarray): label of `y` if found, else None

    """
    # Find and parse training data
    roi = reader.read_image(cfg["classification"]["training_image"])
    logger.debug("Read in training data")
    if len(roi) == 2:
        logger.info("Found labels for ROIs -- including in output")
        labels = roi[1]
    else:
        roi = roi[0]
        labels = None

    # Determine start and end dates of training sample relevance
    try:
        training_start = dt.strptime(
            cfg["classification"]["training_start"], cfg["classification"]["training_date_format"]
        ).toordinal()
        training_end = dt.strptime(
            cfg["classification"]["training_end"], cfg["classification"]["training_date_format"]
        ).toordinal()
    except ValueError:
        logger.error("Failed to parse training data start or end dates")
        raise

    # Loop through samples in ROI extracting features
    mask_values = cfg["classification"]["roi_mask_values"]
    mask = ~np.in1d(roi, mask_values).reshape(roi.shape)
    row, col = np.where(mask)
    y = roi[row, col]

    X = []
    out_y = []
    out_row = []
    out_col = []

    _row_previous = None
    for _row, _col, _y in izip(row, col, y):
        # Load result
        if _row != _row_previous:
            output_name = utils.get_output_name(cfg["dataset"], _row)
            try:
                rec = np.load(output_name)["record"]
                _row_previous = _row
            except (IOError, KeyError):
                logger.error("Could not open saved result file %s" % output_name)
                if exit_on_missing:
                    raise
                else:
                    continue

        # Find intersecting time segment
        i = np.where((rec["start"] < training_start) & (rec["end"] > training_end) & (rec["px"] == _col))[0]

        if i.size == 0:
            logger.debug("Could not find model for label %i at x/y %i/%i" % (_y, _col, _row))
            continue
        elif i.size > 1:
            raise TrainingDataException("Found more than one valid model for label %i at x/y %i/%i" % (_y, _col, _row))

        # Extract coefficients with intercept term rescaled
        coef = rec[i]["coef"][0, :]
        coef[0, :] = coef[0, :] + coef[1, :] * (rec[i]["start"] + rec[i]["end"]) / 2.0

        X.append(np.concatenate((coef.reshape(coef.size), rec[i]["rmse"][0])))
        out_y.append(_y)
        out_row.append(_row)
        out_col.append(_col)

    out_row = np.array(out_row)
    out_col = np.array(out_col)

    if labels is not None:
        labels = labels[out_row, out_col]

    return np.array(X), np.array(out_y), out_row, out_col, labels
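The X and y returned here are intended to train the scikit-learn style estimator that classify() later loads with joblib.load. A hypothetical training step (the estimator choice and output filename are assumptions, not part of YATSM) could look like:

from sklearn.ensemble import RandomForestClassifier
import joblib  # older installs: from sklearn.externals import joblib

X, y, row, col, labels = get_training_inputs(cfg)
algo = RandomForestClassifier(n_estimators=200)   # assumed estimator choice
algo.fit(X, y)
joblib.dump(algo, 'trained_classifier.pkl')       # hypothetical path, later passed as `algo` to classify()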
Example #6
def line(ctx, config, job_number, total_jobs,
         resume, check_cache, do_not_run, verbose_yatsm):
    if verbose_yatsm:
        logger_algo.setLevel(logging.DEBUG)

    # Parse config
    cfg = parse_config_file(config)

    if ('phenology' in cfg and cfg['phenology'].get('enable')) and not pheno:
        click.secho('Could not import yatsm.phenology but phenology metrics '
                    'are requested', fg='red')
        click.secho('Error: %s' % pheno_exception, fg='red')
        raise click.Abort()

    # Make sure output directory exists and is writable
    output_dir = cfg['dataset']['output']
    try:
        os.makedirs(output_dir)
    except OSError as e:
        if e.errno == 17:  # EEXIST -- output directory already exists
            pass
        elif e.errno == 13:  # EACCES -- permission denied
            click.secho('Cannot create output directory %s' % output_dir,
                        fg='red')
            raise click.Abort()

    if not os.access(output_dir, os.W_OK):
        click.secho('Cannot write to output directory %s' % output_dir,
                    fg='red')
        raise click.Abort()

    # Test existence of cache directory
    read_cache, write_cache = test_cache(cfg['dataset'])

    logger.info('Job {i} of {n} - using config file {f}'.format(i=job_number,
                                                                n=total_jobs,
                                                                f=config))
    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    df['image_ID'] = get_image_IDs(df['filename'])

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(df['filename'][0])

    # Calculate the lines this job ID works on
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    dates = np.asarray(df['date'])
    kws = {'x': dates}
    kws.update(df.to_dict())
    X = patsy.dmatrix(cfg['YATSM']['design_matrix'], kws)
    cfg['YATSM']['design'] = X.design_info.column_name_indexes

    # Form YATSM class arguments
    fit_indices = np.arange(cfg['dataset']['n_bands'])
    if cfg['dataset']['mask_band'] is not None:
        fit_indices = fit_indices[:-1]

    if cfg['YATSM']['reverse']:
        X = np.flipud(X)

    # Create output metadata to save
    md = {
        'YATSM': cfg['YATSM'],
        cfg['YATSM']['algorithm']: cfg[cfg['YATSM']['algorithm']]
    }
    if cfg['phenology']['enable']:
        md.update({'phenology': cfg['phenology']})

    # Begin process
    start_time_all = time.time()
    for line in job_lines:
        out = get_output_name(cfg['dataset'], line)

        if resume:
            try:
                np.load(out)
            except (IOError, OSError):
                pass  # no existing result file -- run this line
            else:
                logger.debug('Already processed line %s' % line)
                continue

        logger.debug('Running line %s' % line)
        start_time = time.time()

        Y = read_line(line, df['filename'], df['image_ID'], cfg['dataset'],
                      ncol, nband, dtype,
                      read_cache=read_cache, write_cache=write_cache,
                      validate_cache=False)
        if do_not_run:
            continue
        if cfg['YATSM']['reverse']:
            Y = np.fliplr(Y)

        output = []
        for col in np.arange(Y.shape[-1]):
            _Y = Y.take(col, axis=2)
            # Mask
            idx_mask = cfg['dataset']['mask_band'] - 1
            valid = cyprep.get_valid_mask(
                _Y,
                cfg['dataset']['min_values'],
                cfg['dataset']['max_values']).astype(bool)

            valid *= np.in1d(_Y.take(idx_mask, axis=0),
                             cfg['dataset']['mask_values'],
                             invert=True).astype(bool)

            _Y = np.delete(_Y, idx_mask, axis=0)[:, valid]
            _X = X[valid, :]
            _dates = dates[valid]

            # Run model
            cls = cfg['YATSM']['algorithm_cls']
            algo_cfg = cfg[cfg['YATSM']['algorithm']]

            yatsm = cls(lm=cfg['YATSM']['prediction_object'],
                        **algo_cfg.get('init', {}))
            yatsm.px = col
            yatsm.py = line

            try:
                yatsm.fit(_X, _Y, _dates, **algo_cfg.get('fit', {}))
            except TSLengthException:
                continue

            if yatsm.record is None or len(yatsm.record) == 0:
                continue

            # Postprocess
            if cfg['YATSM'].get('commission_alpha'):
                yatsm.record = postprocess.commission_test(
                    yatsm, cfg['YATSM']['commission_alpha'])

            for prefix, lm in zip(cfg['YATSM']['refit']['prefix'],
                                  cfg['YATSM']['refit']['prediction_object']):
                yatsm.record = postprocess.refit_record(yatsm, prefix, lm,
                                                        keep_regularized=True)

            if cfg['phenology']['enable']:
                pcfg = cfg['phenology']
                ltm = pheno.LongTermMeanPhenology(**pcfg.get('init', {}))
                yatsm.record = ltm.fit(yatsm, **pcfg.get('fit', {}))

            output.extend(yatsm.record)

        logger.debug('    Saving YATSM output to %s' % out)
        np.savez(out,
                 record=np.array(output),
                 version=__version__,
                 metadata=md)

        run_time = time.time() - start_time
        logger.debug('Line %s took %ss to run' % (line, run_time))

    logger.info('Completed {n} lines in {m} minutes'.format(
                n=len(job_lines),
                m=round((time.time() - start_time_all) / 60.0, 2)))
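Both the design matrix X and the 'design' metadata stored with each result come from patsy. A small sketch with an example formula (the project's actual design_matrix string comes from the config file and may differ):

import numpy as np
import patsy

dates = np.array([730000, 730100, 730200])            # ordinal dates used for 'x'
sensors = ['LT5', 'LE7', 'LT5']
X = patsy.dmatrix('1 + x + C(sensor)', {'x': dates, 'sensor': sensors})
print(np.asarray(X))                                   # one row per date, one column per term
print(X.design_info.column_name_indexes)               # column name -> index, saved as cfg['YATSM']['design']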
Example #7
def main(dataset_config, yatsm_config,
         check=False, resume=False,
         do_not_run=False,
         read_cache=False, write_cache=False,
         validate_cache=False):
    """ Read in dataset and YATSM for a complete line

    Args:
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      check (bool, optional): check to make sure images are readable
      resume (bool, optional): do not overwrite existing results, instead
        continue from first non-existing result file
      do_not_run (bool, optional): Don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)

    """
    # Read in dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )

    image_IDs = get_image_IDs(images)

    # Check for existence of files and remove missing
    if check:
        to_delete = []
        for i, img in enumerate(images):
            if not os.path.isfile(img):
                logger.warning('Could not find file {f} -- removing'.
                               format(f=img))
                to_delete.append(i)

        if len(to_delete) == 0:
            logger.debug('Checked and found all input images')
        else:
            logger.warning('Removing {n} images'.format(n=len(to_delete)))
            dates = np.delete(dates, np.array(to_delete))
            images = np.delete(images, np.array(to_delete))

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(images[0])

    # Calculate the lines this job ID works on
    # Note: job_number and total_jobs are assumed to be defined at module
    # level (e.g. parsed from the command line) in this older version
    job_lines = calculate_lines(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    X = patsy.dmatrix(yatsm_config['design_matrix'],
                      {'x': dates, 'sensor': sensors})

    # Start running YATSM
    start_time_all = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        if resume:
            try:
                z = np.load(get_output_name(dataset_config, job_line))
            except (IOError, OSError):
                pass  # no existing result file -- run this line
            else:
                del z
                logger.debug('Already processed line {l}'.format(l=job_line))
                continue

        logger.debug('Running line {l}'.format(l=job_line))
        start_time = time.time()

        try:
            run_line(job_line, X, images, image_IDs,
                     dataset_config, yatsm_config,
                     nrow, ncol, nband, dtype,
                     do_not_run=do_not_run,
                     read_cache=read_cache, write_cache=write_cache,
                     validate_cache=validate_cache)
        except Exception as e:
            logger.error('Could not process line {l}'.format(l=job_line))
            logger.error(type(e))
            logger.error(str(e))

        logger.debug('Took {s}s to run'.format(
            s=round(time.time() - start_time, 2)))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time_all) / 60.0, 2)
    ))
Example #8
def run_line(line, X, images, image_IDs,
             dataset_config, yatsm_config,
             nrow, ncol, nband, dtype,
             do_not_run=False,
             read_cache=False, write_cache=False,
             validate_cache=False):
    """ Runs YATSM for a line

    Args:
      line (int): line to be run from image
      X (ndarray): design matrix of features built from the ordinal dates
      images (ndarray): np.array of image filenames
      image_IDs (iterable): list image identifying strings
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      nrow (int): number of rows
      ncol (int): number of columns
      nband (int): number of bands
      dtype (type): NumPy datatype
      do_not_run (bool, optional): don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)


    """
    # Setup output
    output = []

    Y = read_line(line, images, image_IDs, dataset_config,
                  ncol, nband, dtype,
                  read_cache=read_cache, write_cache=write_cache,
                  validate_cache=validate_cache)

    if do_not_run:
        return

    # About to run YATSM
    logger.debug('    running YATSM')
    # Raise or lower logging level for YATSM
    _level = logger.level
    logger.setLevel(loglevel_YATSM)

    for c in xrange(Y.shape[-1]):
        try:
            result = run_pixel(X, Y[..., c], dataset_config, yatsm_config,
                               px=c, py=line)
        except TSLengthException:
            continue

        output.extend(result)

    # Return logging level
    logger.setLevel(_level)

    # Save output
    outfile = get_output_name(dataset_config, line)
    logger.debug('    saving YATSM output to {f}'.format(f=outfile))

    np.savez(outfile,
             version=__version__,
             consecutive=yatsm_config['consecutive'],
             threshold=yatsm_config['threshold'],
             min_obs=yatsm_config['min_obs'],
             min_rmse=yatsm_config['min_rmse'],
             test_indices=yatsm_config['test_indices'],
             design=yatsm_config['design_matrix'],
             design_matrix=X.design_info.column_name_indexes,
             retrain_time=yatsm_config['retrain_time'],
             screening=yatsm_config['screening'],
             screening_crit=yatsm_config['screening_crit'],
             remove_noise=yatsm_config['remove_noise'],
             dynamic_rmse=yatsm_config['dynamic_rmse'],
             commission_alpha=yatsm_config['commission_alpha'],
             reverse=yatsm_config['reverse'],
             robust=yatsm_config['robust'],
             lassocv=yatsm_config['lassocv'],
             record=np.array(output))
Example #9
def main(dataset_config,
         yatsm_config,
         check=False,
         resume=False,
         do_not_run=False,
         read_cache=False,
         write_cache=False,
         validate_cache=False):
    """ Read in dataset and YATSM for a complete line

    Args:
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      check (bool, optional): check to make sure images are readable
      resume (bool, optional): do not overwrite existing results, instead
        continue from first non-existing result file
      do_not_run (bool, optional): Don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)

    """
    # Read in dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format'])

    image_IDs = get_image_IDs(images)

    # Check for existence of files and remove missing
    if check:
        to_delete = []
        for i, img in enumerate(images):
            if not os.path.isfile(img):
                logger.warning(
                    'Could not find file {f} -- removing'.format(f=img))
                to_delete.append(i)

        if len(to_delete) == 0:
            logger.debug('Checked and found all input images')
        else:
            logger.warning('Removing {n} images'.format(n=len(to_delete)))
            dates = np.delete(dates, np.array(to_delete))
            images = np.delete(images, np.array(to_delete))

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(images[0])

    # Calculate the lines this job ID works on
    # Note: job_number and total_jobs are assumed to be defined at module
    # level (e.g. parsed from the command line) in this older version
    job_lines = calculate_lines(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    X = patsy.dmatrix(yatsm_config['design_matrix'], {
        'x': dates,
        'sensor': sensors
    })

    # Start running YATSM
    start_time_all = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        if resume:
            try:
                z = np.load(get_output_name(dataset_config, job_line))
            except (IOError, OSError):
                pass  # no existing result file -- run this line
            else:
                del z
                logger.debug('Already processed line {l}'.format(l=job_line))
                continue

        logger.debug('Running line {l}'.format(l=job_line))
        start_time = time.time()

        try:
            run_line(job_line,
                     X,
                     images,
                     image_IDs,
                     dataset_config,
                     yatsm_config,
                     nrow,
                     ncol,
                     nband,
                     dtype,
                     do_not_run=do_not_run,
                     read_cache=read_cache,
                     write_cache=write_cache,
                     validate_cache=validate_cache)
        except Exception as e:
            logger.error('Could not process line {l}'.format(l=job_line))
            logger.error(type(e))
            logger.error(str(e))

        logger.debug(
            'Took {s}s to run'.format(s=round(time.time() - start_time, 2)))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines), m=round((time.time() - start_time_all) / 60.0, 2)))
Example #10
def run_line(line,
             X,
             images,
             image_IDs,
             dataset_config,
             yatsm_config,
             nrow,
             ncol,
             nband,
             dtype,
             do_not_run=False,
             read_cache=False,
             write_cache=False,
             validate_cache=False):
    """ Runs YATSM for a line

    Args:
      line (int): line to be run from image
      X (ndarray): design matrix of features built from the ordinal dates
      images (ndarray): np.array of image filenames
      image_IDs (iterable): list image identifying strings
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      nrow (int): number of rows
      ncol (int): number of columns
      nband (int): number of bands
      dtype (type): NumPy datatype
      do_not_run (bool, optional): don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)


    """
    # Setup output
    output = []

    Y = read_line(line,
                  images,
                  image_IDs,
                  dataset_config,
                  ncol,
                  nband,
                  dtype,
                  read_cache=read_cache,
                  write_cache=write_cache,
                  validate_cache=validate_cache)

    if do_not_run:
        return

    # About to run YATSM
    logger.debug('    running YATSM')
    # Raise or lower logging level for YATSM
    _level = logger.level
    logger.setLevel(loglevel_YATSM)

    for c in xrange(Y.shape[-1]):
        try:
            result = run_pixel(X,
                               Y[..., c],
                               dataset_config,
                               yatsm_config,
                               px=c,
                               py=line)
        except TSLengthException:
            continue

        output.extend(result)

    # Return logging level
    logger.setLevel(_level)

    # Save output
    outfile = get_output_name(dataset_config, line)
    logger.debug('    saving YATSM output to {f}'.format(f=outfile))

    np.savez(outfile,
             version=__version__,
             consecutive=yatsm_config['consecutive'],
             threshold=yatsm_config['threshold'],
             min_obs=yatsm_config['min_obs'],
             min_rmse=yatsm_config['min_rmse'],
             test_indices=yatsm_config['test_indices'],
             design=yatsm_config['design_matrix'],
             design_matrix=X.design_info.column_name_indexes,
             retrain_time=yatsm_config['retrain_time'],
             screening=yatsm_config['screening'],
             screening_crit=yatsm_config['screening_crit'],
             remove_noise=yatsm_config['remove_noise'],
             dynamic_rmse=yatsm_config['dynamic_rmse'],
             commission_alpha=yatsm_config['commission_alpha'],
             reverse=yatsm_config['reverse'],
             robust=yatsm_config['robust'],
             lassocv=yatsm_config['lassocv'],
             record=np.array(output))
Example #11
def get_training_inputs(dataset_config, exit_on_missing=False):
    """ Returns X features and y labels specified in config file

    Args:
      dataset_config (dict): dataset configuration
      exit_on_missing (bool, optional): exit if input feature cannot be found

    Returns:
      X (np.ndarray): matrix of feature inputs for each training data sample
      y (np.ndarray): array of labeled training data samples
      row (np.ndarray): row pixel locations of `y`
      col (np.ndarray): column pixel locations of `y`
      labels (np.ndarray): label of `y` if found, else None

    """
    # Find and parse training data
    roi = reader.read_image(dataset_config['training_image'])
    logger.debug('Read in training data')
    if len(roi) == 2:
        logger.info('Found labels for ROIs -- including in output')
        labels = roi[1]
    else:
        roi = roi[0]
        labels = None

    # Determine start and end dates of training sample relevance
    try:
        training_start = dt.strptime(
            dataset_config['training_start'],
            dataset_config['training_date_format']).toordinal()
        training_end = dt.strptime(
            dataset_config['training_end'],
            dataset_config['training_date_format']).toordinal()
    except ValueError:
        logger.error('Failed to parse training data start or end dates')
        raise

    # Loop through samples in ROI extracting features
    mask = ~np.in1d(roi, dataset_config['roi_mask_values']).reshape(roi.shape)
    row, col = np.where(mask)
    y = roi[row, col]

    X = []
    out_y = []
    out_row = []
    out_col = []

    rec = None
    _row_previous = None
    for _row, _col, _y in izip(row, col, y):
        # Load result
        if _row != _row_previous:
            try:
                rec = np.load(utils.get_output_name(
                    dataset_config, _row))['record']
                _row_previous = _row
            except (IOError, KeyError):
                logger.error('Could not open saved result file {f}'.format(
                    f=utils.get_output_name(dataset_config, _row)))
                if exit_on_missing:
                    raise
                else:
                    continue
        # Find intersecting time segment
        i = np.where((rec['start'] < training_start) &
                     (rec['end'] > training_end) &
                     (rec['px'] == _col))[0]

        if i.size == 0:
            logger.debug(
                'Could not find model for label {l} at x/y {c}/{r}'.format(
                    l=_y, c=_col, r=_row))
            continue
        elif i.size > 1:
            raise TrainingDataException(
                'Found more than one valid model for label {l} at '
                'x/y {x}/{y}'.format(l=_y, x=_col, y=_row))

        # Extract coefficients with intercept term rescaled
        coef = rec[i]['coef'][0, :]
        coef[0, :] = (coef[0, :] +
                      coef[1, :] * (rec[i]['start'] + rec[i]['end']) / 2.0)

        X.append(np.concatenate(
            (coef.reshape(coef.size), rec[i]['rmse'][0])))
        out_y.append(_y)
        out_row.append(_row)
        out_col.append(_col)

    if not out_y:
        logger.error('Could not find any matching timeseries segments')
        raise click.Abort()
    logger.info('Found matching time segments for {m} out of {n} labels'.
                format(m=len(out_y), n=y.size))

    out_row = np.array(out_row)
    out_col = np.array(out_col)

    if labels is not None:
        labels = labels[out_row, out_col]

    return (np.array(X), np.array(out_y),
            out_row, out_col, labels)
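The coefficient rescaling above moves the intercept from ordinal date zero to the midpoint of the segment, so values are comparable across segments fitted over different date ranges. A tiny sketch with made-up numbers:

# Made-up values for the first two design terms (intercept, slope on ordinal date)
intercept, slope = 0.05, 2.0e-5
start, end = 729000, 730000                 # segment start/end as ordinal dates
t_mid = (start + end) / 2.0
rescaled = intercept + slope * t_mid        # same operation as coef[0, :] + coef[1, :] * mid
print(rescaled)                             # ~14.64: the predicted value at the segment midpoint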
Example #12
def get_training_inputs(dataset_config, exit_on_missing=False):
    """ Returns X features and y labels specified in config file

    Args:
      dataset_config (dict): dataset configuration
      exit_on_missing (bool, optional): exit if input feature cannot be found

    Returns:
      X (np.ndarray): matrix of feature inputs for each training data sample
      y (np.ndarray): array of labeled training data samples
      row (np.ndarray): row pixel locations of `y`
      col (np.ndarray): column pixel locations of `y`
      labels (np.ndarray): label of `y` if found, else None

    """
    # Find and parse training data
    # gdal.Open returns None on failure unless GDAL exceptions are enabled
    roi_ds = gdal.Open(dataset_config['training_image'], gdal.GA_ReadOnly)
    if roi_ds is None:
        logger.error('Could not read in training image')
        raise IOError('Could not read training image {f}'.format(
            f=dataset_config['training_image']))
    logger.info('Reading in training data')
    roi = roi_ds.GetRasterBand(1).ReadAsArray()
    if roi_ds.RasterCount == 2:
        logger.info('Found labels for ROIs -- including in output')
        labels = roi_ds.GetRasterBand(2).ReadAsArray()
    else:
        labels = None

    # Determine start and end dates of training sample relevance
    try:
        training_start = dt.strptime(
            dataset_config['training_start'],
            dataset_config['training_date_format']).toordinal()
        training_end = dt.strptime(
            dataset_config['training_end'],
            dataset_config['training_date_format']).toordinal()
    except ValueError:
        logger.error('Failed to parse training data start or end dates')
        raise

    # Loop through samples in ROI extracting features
    mask = ~np.in1d(roi, dataset_config['roi_mask_values']).reshape(roi.shape)
    row, col = np.where(mask)
    y = roi[row, col]

    X = []
    out_y = []
    out_row = []
    out_col = []

    rec = None
    _row_previous = None
    for _row, _col, _y in izip(row, col, y):
        # Load result
        if _row != _row_previous:
            try:
                rec = np.load(utils.get_output_name(dataset_config,
                                                    _row))['record']
                _row_previous = _row
            except (IOError, KeyError):
                logger.error('Could not open saved result file {f}'.format(
                    f=utils.get_output_name(dataset_config, _row)))
                if exit_on_missing:
                    raise
                else:
                    continue
        # Find intersecting time segment
        i = np.where((rec['start'] < training_start)
                     & (rec['end'] > training_end) & (rec['px'] == _col))[0]

        if i.size == 0:
            logger.debug(
                'Could not find model for label {l} at x/y {c}/{r}'.format(
                    l=_y, c=_col, r=_row))
            continue
        elif i.size > 1:
            raise TrainingDataException(
                'Found more than one valid model for label {l} at '
                'x/y {x}/{y}'.format(l=_y, x=_col, y=_row))

        # Extract coefficients with intercept term rescaled
        coef = rec[i]['coef'][0, :]
        coef[0, :] = (coef[0, :] + coef[1, :] *
                      (rec[i]['start'] + rec[i]['end']) / 2.0)

        X.append(np.concatenate((coef.reshape(coef.size), rec[i]['rmse'][0])))
        out_y.append(_y)
        out_row.append(_row)
        out_col.append(_col)

    if not out_y:
        logger.error('Could not find any matching timeseries segments')
        sys.exit(1)
    logger.info(
        'Found matching time segments for {m} out of {n} labels'.format(
            m=len(out_y), n=y.size))

    out_row = np.array(out_row)
    out_col = np.array(out_col)

    if labels is not None:
        labels = labels[out_row, out_col]

    return (np.array(X), np.array(out_y), out_row, out_col, labels)
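For reference, the ROI masking shared by both versions reduces to dropping unwanted label values and keeping the surviving pixel coordinates; a toy sketch with made-up labels and mask values:

import numpy as np

roi = np.array([[0, 3, 3],
                [5, 0, 4]])                  # toy label image
roi_mask_values = [0]                        # labels to ignore
mask = ~np.in1d(roi, roi_mask_values).reshape(roi.shape)
row, col = np.where(mask)
y = roi[row, col]
print(np.column_stack([row, col, y]))
# [[0 1 3]
#  [0 2 3]
#  [1 0 5]
#  [1 2 4]]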