# Beispiel #1
# 0
def main():
    """Plot a hexbin 2D histogram of predicted vs. truth impervious-cover
    samples for one hard-coded STEM model run.

    Reads the prediction and truth rasters, pulls paired samples at the
    pixel locations listed in the test-sample file, and writes the plot
    as a PNG into the model's evaluation directory.
    """
    p_path = '/vol/v2/stem/imperv/models/imperv_20161012_0958/imperv_20161012_0958_vote.bsq'
    t_path = '/vol/v2/stem/imperv/truth_map/imperv2001_CAORWA.bsq'

    # 255 marks nodata in both the prediction and the truth raster
    nodata_p = 255
    nodata_t = 255
    out_dir = '/vol/v2/stem/imperv/models/imperv_20161012_0958/evaluation_vote/'

    ds_p = gdal.Open(p_path)
    ar_p = ds_p.ReadAsArray()
    ds_p = None  # release the GDAL dataset handle

    ds_t = gdal.Open(t_path)
    ar_t = ds_t.ReadAsArray()
    ds_t = None

    sample_txt = '/vol/v2/stem/imperv/samples/imperv_sample1454542_20161007_0843/imperv_sample1454542_20161007_0843_test.txt'
    df = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    p_samples, t_samples = get_samples(ar_p, ar_t, df, nodata_p, nodata_t, match=False)

    out_png = os.path.join(out_dir, 'imperv_20161012_0958_2dhistogram_average_hex_gray.png')
    # Create the output dir if needed so histogram_2d can write the PNG
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    sns.set_context(context='paper', font_scale=1.4)
    histogram_2d(t_samples, p_samples, out_png, bins=50, hexplot=True, vmax=4000)
    print(out_png)  # parenthesized: valid under both Python 2 and 3

    # Drop references to the large arrays so they can be collected
    ar_p = None
    ar_t = None
    p_samples = None
    t_samples = None
def main(p_path, t_path, nodata_p, nodata_t, sample_txt, out_png):
    """Plot a hexbin 2D histogram of predicted vs. truth samples.

    Parameters
    ----------
    p_path, t_path : str
        Paths to the prediction and truth rasters (opened with GDAL).
    nodata_p, nodata_t : int
        Nodata values for the prediction and truth rasters, passed
        through to get_samples().
    sample_txt : str
        Tab-delimited sample file indexed by 'obs_id'.
    out_png : str
        Destination path for the histogram PNG; its directory is
        created if missing.
    """
    def _load_raster(path):
        # Read a raster fully into memory and release the GDAL handle.
        ds = gdal.Open(path)
        arr = ds.ReadAsArray()
        ds = None
        return arr

    ar_p = _load_raster(p_path)
    ar_t = _load_raster(t_path)

    df = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    p_samples, t_samples = get_samples(
        ar_p, ar_t, df, nodata_p, nodata_t, match=False)

    out_dir = os.path.dirname(out_png)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    sns.set_context(context='paper', font_scale=1.4)
    histogram_2d(
        t_samples, p_samples, out_png, bins=50, hexplot=True, vmax=4000)
    print(out_png)

    # Release the big arrays explicitly
    ar_p = None
    ar_t = None
    p_samples = None
    t_samples = None
def par_get_match(args):
    """Worker for matching samples over one tile in parallel.

    All parameters arrive packed in a single tuple so this function can
    be used with multiprocessing map-style APIs:
        (tile_ind, this_in, this_match, in_nodata, match_nodata,
         count, total_tiles)

    Returns
    -------
    tuple
        (tile_ind, matched_vals) so the caller can reassemble results
        in tile order.
    """
    t0 = time.time()
    (tile_ind, this_in, this_match, in_nodata, match_nodata,
     count, total_tiles) = args
    matched_vals, _ = get_samples(this_in,
                                  this_match,
                                  in_nodata,
                                  match_nodata,
                                  match=True)
    # Parenthesized print is valid under both Python 2 and 3
    print('Time for getting array %s of %s: %.1f seconds' % (
        count, total_tiles, time.time() - t0))
    return tile_ind, matched_vals
 def get_samples(self, model_name, zero_inflated=True, num_samples=1000):
     """Delegate to the module-level get_samples() for the model/guide
     pair registered under *model_name*, forwarding this object's cached
     training data (p/t/s/r data plus post types, stories, subreddits).

     The seventh positional argument is passed as None, matching the
     module-level function's signature at that position.
     """
     model = self.model_dict[model_name]
     guide = self.guide_dict[model_name]
     return get_samples(
         model,
         guide,
         self.p_data_train,
         self.t_data,
         self.s_data,
         self.r_data,
         None,
         self.p_types_train,
         self.p_stories_train,
         self.p_subreddits_train,
         zero_inflated,
         num_samples=num_samples,
     )
# Beispiel #5
# 0
def main(sample_txt, ref_raster, pred_raster, p_nodata, t_nodata, target_col, bins, out_txt, match=None, predict_col=None):
    """Compute area-weighted RMSE between a prediction raster and a
    reference raster at sampled pixel locations.

    Parameters
    ----------
    sample_txt : str
        Tab-delimited sample table; must contain `target_col` and, unless
        `match`/`predict_col` is given, 'row' and 'col' columns.
    ref_raster, pred_raster : str
        Paths to the reference (truth) and prediction rasters.
    p_nodata, t_nodata : int or str
        Nodata values for prediction and reference (coerced to int).
    target_col : str
        Column of `sample_txt` holding the reference values.
    bins : str
        Bin specification, parsed by parse_bins().
    out_txt : str
        Path passed through to area_weighted_rmse() for its output.
    match : str, optional
        If given, sample pairs come from get_samples(..., match=match).
    predict_col : str, optional
        If given (and `match` is not), predictions come from this column.

    Returns
    -------
    float
        The area-weighted RMSE.

    Raises
    ------
    IndexError
        If `target_col` is not a column of the sample table.
    """
    p_nodata = int(p_nodata)
    t_nodata = int(t_nodata)

    ds_p = gdal.Open(pred_raster)
    ar_p = ds_p.ReadAsArray()

    ds_r = gdal.Open(ref_raster)
    ar_r = ds_r.ReadAsArray()

    r_xsize = ds_r.RasterXSize
    r_ysize = ds_r.RasterYSize
    p_xsize = ds_p.RasterXSize
    p_ysize = ds_p.RasterYSize
    tx_r = ds_r.GetGeoTransform()
    tx_p = ds_p.GetGeoTransform()
    # If the two arrays differ in size or georeference, snap the
    # prediction array onto the reference grid.
    # BUGFIX: the original condition was
    #   `not r_xsize == p_xsize or r_ysize == p_ysize or tx_r != tx_p`
    # which, by operator precedence, triggered snapping whenever the
    # y-sizes MATCHED. Compare each dimension explicitly instead.
    if r_xsize != p_xsize or r_ysize != p_ysize or tx_r != tx_p:
        warnings.warn('Prediction and reference rasters do not share the same extent. Snapping prediction raster to reference....')
        offset = mosaic.calc_offset((tx_r[0], tx_r[3]), tx_p)
        t_inds, p_inds = mosaic.get_offset_array_indices((r_ysize, r_xsize), (p_ysize, p_xsize), offset)
        # Buffer filled with nodata, same shape as reference; copy the
        # overlapping window of the prediction into it
        ar_buf = np.full(ar_r.shape, p_nodata, dtype=ar_p.dtype)
        ar_buf[t_inds[0]:t_inds[1], t_inds[2]:t_inds[3]] = ar_p[p_inds[0]:p_inds[1], p_inds[2]:p_inds[3]]
        ar_p = ar_buf.copy()
        del ar_buf

    bins = parse_bins(bins)

    sample = pd.read_csv(sample_txt, sep='\t')
    if target_col in sample.columns:
        t_sample = sample[target_col]
    else:
        raise IndexError('target_col "%s" not in sample' % target_col)

    if match:
        # get_samples may replace t_sample with matched reference values
        t_sample, p_sample = get_samples(ar_p, ar_r, p_nodata, t_nodata, sample, match=match)
    elif predict_col:
        p_sample = sample[predict_col]
    else:
        # Plain pixel lookup at the sampled row/col locations
        p_sample = ar_p[sample.row, sample.col]
        t_sample = ar_r[sample.row, sample.col]

    rmse = area_weighted_rmse(ar_p, ar_r, p_sample, t_sample, bins, p_nodata, out_txt=out_txt)

    return rmse
def main(search_dir, models, t_path, inventory_txt, t_nodata=255):
    """Compute random-sample RMSE for each model's 'vote' and 'mean'
    aggregate rasters and record the results in the model inventory.

    NOTE(review): this is legacy Python 2 / old-pandas code (print
    statements, dict.iteritems, DataFrame.ix, integer '/' division at
    the row computation) -- it will not run under Python 3 unported.

    Parameters
    ----------
    search_dir : str
        Directory containing one sub-directory per model stamp.
    models : sequence of str
        Model stamps (row labels of the inventory) to evaluate.
    t_path : str
        Path to the truth raster (read with GDAL).
    inventory_txt : str
        Tab-delimited inventory table indexed by 'stamp'; a copy with
        the RMSE columns filled is written alongside it.
    t_nodata : int
        Nodata value in the truth raster (default 255).
    """
    df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
    columns = df_inv.columns
    # Ensure the result columns exist before they are assigned below
    if 'vote_rmse' not in columns:
        df_inv['vote_rmse'] = None
    if 'mean_rmse' not in columns:
        df_inv['mean_rmse'] = None
    df_inv = df_inv.ix[models]  # restrict to the requested model stamps

    # Read the truth raster once; it is shared across all models
    ds = gdal.Open(t_path)
    ar_t = ds.ReadAsArray()
    nodata_mask = ar_t == t_nodata
    ds = None

    for model in models:
        print '\nCalculating RMSE for ', model
        model_dir = os.path.join(search_dir, model)
        if not os.path.exists(model_dir):
            print 'Model dir does not exist: %s. Skipping...\n' % model_dir
            continue

        # Recover p_nodata and the train-sample path, preferring the
        # confusion-matrix params file and falling back to the
        # predict/train params files when it is missing
        confusion_params = os.path.join(model_dir,
                                        'confusion_matrix_params.txt')
        if not os.path.exists(confusion_params):
            print 'Could not find confusion params: ', confusion_params
            predict_params = os.path.join(model_dir, 'predict_stem_params.txt')
            inputs, _ = stem.read_params(predict_params)
            p_nodata = int(inputs['nodata'].replace('"', ''))
            this_srch_str = os.path.join(model_dir, 'train_stem*_params.txt')
            train_params = glob.glob(this_srch_str)
            if len(train_params) == 0:
                print 'Can not find test data for ', model, '\n'
                continue
            train_params = train_params[0]
            inputs, _ = stem.read_params(train_params)
            test_txt = inputs['sample_txt'].replace('predictors',
                                                    'test').replace('"', '')
            train_txt = inputs['sample_txt'].replace('"', '')
        else:
            inputs = read_params(confusion_params)
            # Strip the quoting that read_params leaves on values
            for k, v in inputs.iteritems():
                inputs[k] = v.replace('"', '')
            test_txt = inputs['sample_txt']
            p_nodata = int(inputs['p_nodata'])
            train_txt = inputs['sample_txt'].replace('_test',
                                                     '').replace('"', '')
        #df = pd.read_csv(test_txt, sep='\t', index_col='obs_id')
        train_sample = pd.read_csv(train_txt, sep='\t', index_col='obs_id')

        # Set any pixels used for training to -1 so they can be avoided for testing
        n_rows, n_cols = ar_t.shape
        n_pixels = ar_t.size
        # Give every pixel a unique flat id, then invalidate training and
        # nodata pixels by setting them to n_pixels (one past the last id)
        pixel_ids = np.arange(n_pixels,
                              dtype=np.uint32).reshape(n_rows, n_cols)
        pixel_ids[
            train_sample.row,
            train_sample.col] = n_pixels  #will always be 1 more than last id
        pixel_ids[nodata_mask] = n_pixels

        # Test-sample size: 20% of the training-sample count embedded in
        # the train file name (e.g. '..._sample1454542_...')
        n_samples = int(
            int(
                os.path.basename(train_txt).split('_')[1].replace(
                    'sample', '')) * 0.2)
        # Randomly draw n_samples still-valid pixel ids
        test_ids = np.array(random.sample(pixel_ids[pixel_ids != n_pixels],
                                          n_samples),
                            dtype=np.uint32)
        # Convert flat ids back to row/col (relies on Python 2 integer '/')
        test_rows = test_ids / n_cols
        test_cols = test_ids % n_cols
        #test_cols = random.sample(ar_col[ar_col != -1], n_samples)
        df = pd.DataFrame({'row': test_rows, 'col': test_cols})

        # Score both aggregation rasters against the truth raster
        for agg_method in ['vote', 'mean']:
            p_path = os.path.join(model_dir, '%s_%s.bsq' % (model, agg_method))
            ds = gdal.Open(p_path)
            ar_p = ds.ReadAsArray()
            t_samples, p_samples = get_samples(ar_p,
                                               ar_t,
                                               p_nodata,
                                               255,
                                               samples=df,
                                               match='best')

            rmse = np.round(calc_rmse(t_samples, p_samples), 1)
            print agg_method, ': ', rmse
            df_inv.ix[model, '%s_rmse' % agg_method] = rmse
        # Record which random pixels were used for this model's test sample
        out_txt = os.path.join(
            model_dir, '%s_random_test_sample%s.txt' % (model, n_samples))
        df.to_csv(out_txt, sep='\t', index=False)

    # Write the inventory with RMSE columns next to the original
    out_txt = inventory_txt.replace('.txt', '_randomRMSE.txt')
    df_inv.to_csv(out_txt, sep='\t')