Code example #1
def buffered_tile_inds(n_tiles, xsize, ysize, tx, tile_buffer, mask):
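    '''
    Return DataFrames of tile row/col bounds and of the same bounds expanded
    by tile_buffer pixels (clipped to the raster edges), dropping tiles with
    no valid pixels in mask
    '''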

    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)

    total_tiles = len(df_tiles)
    empty_tiles = find_empty_tiles(df_tiles, mask, tx)
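    # Keep the row/col version of only the non-empty tiles; the buffering
    #   below operates on these row/col bounds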
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)

    # Add buffer around each tile
    df_buf = df_tiles.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)

    return df_tiles, df_buf
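
A minimal usage sketch (the raster path is hypothetical; it assumes the get_tiles and find_empty_tiles helpers called above are importable alongside this function):

# Usage sketch: 4 x 4 tile grid with a 50-pixel buffer around each tile
from osgeo import gdal

ds = gdal.Open('/path/to/raster.tif')          # hypothetical path
mask = ds.GetRasterBand(1).ReadAsArray() != 0  # True wherever pixels are valid
df_tiles, df_buf = buffered_tile_inds((4, 4), ds.RasterXSize, ds.RasterYSize,
                                      ds.GetGeoTransform(), 50, mask)
ds = None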
Code example #2
def get_stratified_sample_by_tile(raster_path,
                                  col_name,
                                  data_band,
                                  n_samples,
                                  bins,
                                  min_sample=None,
                                  max_sample=None,
                                  pct_train=1,
                                  nodata=None,
                                  sampling_scheme='equal',
                                  zero_inflation=None,
                                  data_type='continuous',
                                  kernel=False,
                                  n_tiles=(1, 1),
                                  boundary_shp=None,
                                  bin_scale=1,
                                  n_per_tile=None):
    '''
    Return a dataframe of stratified randomly sampled pixels from raster_path
    '''
    print 'Reading %s... %s\n' % (raster_path, datetime.now())
    ds = gdal.Open(raster_path)
    tx = ds.GetGeoTransform()
    band = ds.GetRasterBand(data_band)
    if nodata is None:
        nodata = band.GetNoDataValue()
        if nodata is None:
            sys.exit('Could not obtain nodata value from dataset and none '
                     'specified in parameters file. Try re-running with '
                     'nodata specified.')

    # Split up the raster into tiles and figure out how many samples to draw
    #   per tile
    print 'Calculating sample size per tile...'
    t1 = time.time()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      tx)
    total_tiles = len(df_tiles)
    # If a boundary shapefile is given, calculate the proportional area within
    #   each tile
    if boundary_shp:
        boundary_ds = ogr.Open(boundary_shp)
        boundary_lyr = boundary_ds.GetLayer()
        empty_tiles = stem.find_empty_tiles(df_tiles,
                                            boundary_lyr,
                                            tx,
                                            nodata=0)
        df_tiles.drop(empty_tiles, inplace=True)
        df_tiles_rc.drop(empty_tiles, inplace=True)
        total_tiles = len(df_tiles)
        calc_proportional_area(df_tiles, boundary_shp)  # Calcs area in place
        if n_per_tile:
            pct_area = df_tiles.pct_area
            df_tiles['pct_max_sample'] = pct_area / (pct_area.max() -
                                                     pct_area.min())
            df_tiles_rc['n_samples'] = (n_per_tile *
                                        df_tiles.pct_max_sample).astype(int)
        else:
            df_tiles_rc['n_samples'] = n_samples * df_tiles.pct_area
    else:
        if n_per_tile:
            df_tiles_rc['n_samples'] = n_per_tile
        else:
            df_tiles_rc['n_samples'] = float(n_samples) / total_tiles
    df_tiles['n_samples'] = df_tiles_rc.n_samples
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # For each tile, get random sample for each bin
    train_rows = []
    train_cols = []
    test_rows = []
    test_cols = []
    empty_tiles = []
    classes = ['_%s' % b[1] for b in bins]
    df_tiles = df_tiles.reindex(columns=df_tiles.columns.tolist() + classes,
                                fill_value=0)
    for c, (i, tile_coords) in enumerate(df_tiles_rc.iterrows()):
        t1 = time.time()
        print 'Sampling for %d pixels for tile %s of %s...' % (
            tile_coords.n_samples, c + 1, total_tiles)
        if tile_coords.n_samples == 0:
            print '\tSkipping this tile because its sample size == 0...\n'
            empty_tiles.append(i)
            continue
        ul_r, lr_r, ul_c, lr_c = tile_coords[['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        tile_ysize = lr_r - ul_r
        tile_xsize = lr_c - ul_c

        ar = band.ReadAsArray(ul_c, ul_r, tile_xsize, tile_ysize)
        if ar is None:
            raise IOError('Could not read array for tile %s (rows %s-%s, '
                          'cols %s-%s)' % (i, ul_r, lr_r, ul_c, lr_c))
        nodata_mask = ar != nodata
        if not nodata_mask.any():
            print '\tSkipping this tile because all pixels == nodata...\n'
            empty_tiles.append(i)
            continue
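        # Get row/col indices for every pixel in the tile, then shift them to
        #   full-raster coordinates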
        ar_rows, ar_cols = np.indices(ar.shape)
        ar_rows = ar_rows + ul_r
        ar_cols = ar_cols + ul_c

        n_per_bin, scaled_pcts = calc_strata_sizes(ar, nodata_mask,
                                                   tile_coords.n_samples, bins,
                                                   sampling_scheme, bin_scale,
                                                   zero_inflation)
        df_tiles.ix[i, classes] = n_per_bin  #record sample size per bin

        for b, (this_min, this_max) in enumerate(bins):
            # Use b, not i, so the tile index above isn't shadowed
            this_sample_size = n_per_bin[b]
            if min_sample:
                this_sample_size = int(max(n_per_bin[b], min_sample))
            if max_sample:
                this_sample_size = min(this_sample_size, max_sample)
            print 'Sampling between %s and %s: %s pixels (%.1f%% of sample for this tile)' % (
                this_min, this_max, this_sample_size, scaled_pcts[b] * 100)
            mask = (ar > this_min) & (ar <= this_max) & nodata_mask
            these_rows = ar_rows[mask]
            these_cols = ar_cols[mask]

            # If there aren't enough pixels to generate samples for this bin,
            #   return all pixels in the bin
            if these_rows.size < this_sample_size:
                print ('Not enough pixels between {0} and {1} to generate {2}'
                       ' random samples. Returning all {3} pixels for this '
                       'bin.').format(this_min, this_max, this_sample_size,
                                      these_rows.size)
                tr_rows = these_rows
                tr_cols = these_cols
            else:
                samples = random.sample(xrange(len(these_rows)),
                                        this_sample_size)
                tr_rows = these_rows[samples]
                tr_cols = these_cols[samples]

            # If pct_train is specified, split the sample indices into train/test sets
            te_rows = []
            te_cols = []
            if pct_train:
                split_ind = int(len(tr_rows) * pct_train)
                te_rows = tr_rows[split_ind:]
                te_cols = tr_cols[split_ind:]
                tr_rows = tr_rows[:split_ind]
                tr_cols = tr_cols[:split_ind]

            train_rows.extend(tr_rows)
            train_cols.extend(tr_cols)
            test_rows.extend(te_rows)
            test_cols.extend(te_cols)
        print 'Time for this tile: %.1f minutes\n' % ((time.time() - t1) / 60)
    del tr_rows, tr_cols, te_rows, te_cols, ar

    # Read the whole raster in to extract stuff
    ar = band.ReadAsArray()

    # If True, extract with 3x3 kernel. Otherwise, just get the vals (row,col)
    if kernel:
        train_vals = extract_by_kernel(ar, train_rows, train_cols, data_type,
                                       col_name, nodata)
    else:
        train_vals = ar[train_rows, train_cols]

    # Calculate x and y for later extractions
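    # tx is a GDAL geotransform: (upper-left x, pixel width, row rotation,
    #   upper-left y, column rotation, pixel height)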
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx
    train_x = [int(ul_x + c * x_res) for c in train_cols]
    train_y = [int(ul_y + r * y_res) for r in train_rows]
    df_train = pd.DataFrame({
        'x': train_x,
        'y': train_y,
        'row': train_rows,
        'col': train_cols,
        col_name: train_vals
    })

    # If training and testing samples were split, get test vals
    df_test = None
    if pct_train < 1:
        if kernel:
            test_vals = extract_by_kernel(ar, test_rows, test_cols, data_type,
                                          col_name, nodata)
        else:
            test_vals = ar[test_rows, test_cols]
        test_x = [int(ul_x + c * x_res) for c in test_cols]
        test_y = [int(ul_y + r * y_res) for r in test_rows]
        df_test = pd.DataFrame({
            'x': test_x,
            'y': test_y,
            'row': test_rows,
            'col': test_cols,
            col_name: test_vals
        })
    ds = None

    # In case empty tiles weren't filtered out already
    df_tiles = df_tiles.ix[~df_tiles.index.isin(empty_tiles)]

    return df_train, df_test, df_tiles
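
calc_strata_sizes is called above but not shown. This is a minimal sketch of the behavior its call site implies (per-bin sample counts plus each bin's share of the tile's sample), covering only an 'equal' and a proportional scheme and omitting whatever the real helper does with bin_scale and zero_inflation:

import numpy as np

def calc_strata_sizes(ar, nodata_mask, n_samples, bins, sampling_scheme='equal',
                      bin_scale=1, zero_inflation=None):
    # Sketch only: bin_scale and zero_inflation adjustments are omitted
    if sampling_scheme == 'equal':
        # Every bin gets the same share of this tile's sample
        pcts = np.ones(len(bins)) / len(bins)
    else:
        # Proportional: weight each bin by its share of valid pixels
        counts = np.array([((ar > this_min) & (ar <= this_max) & nodata_mask).sum()
                           for this_min, this_max in bins], dtype=float)
        pcts = counts / counts.sum()
    n_per_bin = (pcts * n_samples).astype(int)
    return n_per_bin, pcts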
Code example #3
def main(model_dir, n_tiles, **kwargs):

    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]
    if not os.path.isdir(model_dir):
        message = ('model directory given does not exist or is not a '
                   'directory: %s' % model_dir)
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir,
                                        'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir,
                                         'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(variable_id, variable_name)
                     for variable_name, variable_id in kwargs.iteritems()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)
    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    support_size = [
        int(s)
        for s in train_params['support_size'].replace('"', '').split(',')
    ]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]
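    # mask_tx[1] is the pixel size in map units, so set_size is the support
    #   window size in pixels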

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):

        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1,
                                                           len(variables))

        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)

        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge where the
            #   tile size will be slightly different
            this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x -
                                                          t_row.ul_x)
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row,
                                                      this_size, support_size)

            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]

            print '%s overlapping sets' % n_sets
            importance_bands = []

            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():

                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y),
                                          mask_tx)
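                # offset is the pixel (row, col) shift of this set's upper-left
                #   corner relative to the tile's upper-left corner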

                tile_inds, a_inds = mosaic.get_offset_array_indices(
                    tile_size, set_size, offset)

                # Get feature with maximum importance and fill tile with that val
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(
                        dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1],
                                tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue

            print 'Average importance for this tile: %.1f' % np.mean(
                importance_values)
            #Aggregate
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
            tile_mask = mask[rc.ul_r:rc.lr_r,
                             rc.ul_c:rc.lr_c] | np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r,
               rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % (
                (time.time() - t2) / 60)
        out_path = os.path.join(out_dir,
                                '%s_importance_%s.tif' % (model, v_name))
        try:
            array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'),
                            out_path, gdal.GDT_Byte, nodata)
        except Exception as e:
            print 'Error writing %s: %s' % (out_path, e)
        print 'Time for this variable: %.1f minutes\n' % (
            (time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % (len(variables), (
        (time.time() - t0) / 3600))
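
A hypothetical invocation (the directory is a placeholder; it is expected to contain the decisiontree_models/ subdirectory, the *_support_sets.txt table, and the predict/train parameter files read above):

# Usage sketch: map the importance of every variable over a 10 x 10 tile grid
main('/path/to/stem_model_dir', '10,10')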