def main(set_txt, train_params, plot_dims, out_png):

    sns.set_context(context='paper', font_scale=.3)
    sns.set_style('white', rc={'axes.linewidth': .5})

    inputs, df_var = stem.read_params(train_params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    predict_cols = sorted(df_var.index)
    df = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    df = df[df.oob_rate < 5]

    fig, axes = plt.subplots(*plot_dims)
    n_plots = axes.size
    set_ids = random.sample(df.index, n_plots)

    sample = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    for si, ax in zip(set_ids, axes.ravel()):
        support_set = df.ix[si]

        oob_ind_txt = support_set.dt_file.replace('.pkl', '_oob_inds.txt')
        with open(oob_ind_txt) as txt:
            oob_inds = [int(l) for l in txt]
        with open(support_set.dt_file, 'rb') as f:
            dt_model = pickle.load(f)

        oob_sample = sample.ix[oob_inds]
        oob_predictions = dt_model.predict(oob_sample[predict_cols])
        oob_reference = oob_sample[target_col]

        rmse = sf.rmse(oob_reference, oob_predictions)
        ac, ac_s, ac_u, ssd, spod = sf.agree_coef(oob_reference,
                                                  oob_predictions)
        ax.plot(oob_reference,
                oob_predictions,
                'o',
                alpha=0.05,
                markeredgecolor='none',
                markersize=2.5)
        #ax.xticks([0, 50, 100])
        #ax.yticks([0, 50, 100])
        title = 'Set ID: %s, RMSE: %.1f, ac: %.3f' % (si, rmse, ac)
        ax.set_title(title)
        sns.despine()

    fig.subplots_adjust(hspace=0.1)
    plt.savefig(out_png, dpi=300)
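
# A minimal usage sketch for the function above; the file names and plot grid
# are hypothetical placeholders, not values from the original code:
# main('support_sets.txt', 'train_stem_params.txt', (3, 4), 'oob_scatterplots.png')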
Example #2
def main(params, inventory_txt=None, constant_vars=None):
    
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])    
    df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    # If constants were given, make a dict and make sure they match the training
    #  constants
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        pred_constants = sorted(constant_vars.keys())
        train_constants = [i.replace(' ', '') for i in train_inputs['constant_vars'].strip('"').split(',')]
        train_constants = sorted(train_constants)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if 'constant_vars' in inputs: 
        unmatched_vars += [v for v in pred_constants if v not in train_constants]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in train_constants if v not in pred_constants]
        pred_vars += pred_constants # Add here because it would screw with stuff upstream
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
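    # GDAL geotransform layout: (ul_x, pixel width, row rotation, ul_y,
    # column rotation, pixel height); y_res is negative for north-up rasters.
    # For an unrotated mosaic a point (x, y) maps to
    #   col = int((x - m_ulx) / x_res),  row = int((y - m_uly) / y_res)
    # which is what stem.calc_offset is assumed to compute further down.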
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)
    
    t0 = time.time()
    if 'n_jobs' in inputs:

        # Predict in parallel
        n_jobs = int(n_jobs)
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of TSA arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            
            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic_ds
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                            xsize, ysize, nodata=nodata)
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            dtype_code = mosaic_ds.GetRasterBand(1).DataType
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster, stem.get_gdal_dtype(dtype_code), silent=True)
            
            # Build list of args to pass to the Pool
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            ds = gdal.Open(tsa_raster)
            tsa_tx = ds.GetGeoTransform()
            ds = None
            tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tsa_tx[0], tsa_tx[3]), tsa_tx)
            args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords, 
                         mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, 
                         constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem.par_predict, args, 1)
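        # Pool.map's third argument is the chunksize; 1 hands one support set
        # to a worker at a time so a slow set doesn't hold up a whole chunk.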
            
    
    else:
        # Loop through each set and generate predictions
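        # NOTE: the .ix[1043:] slice below resumes partway through df_sets
        # (set 1043 onward) rather than predicting for every set.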
        for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, 
                                     mosaic_tx, xsize, ysize, dt_model, nodata,
                                     np.int16, constant_vars)        
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    
        #mosaic_ds = None                  
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''
    
    #Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    ar_vote, pct_importance, df_sets = stem.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')'''
    mosaic_ds = None
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''
    
    '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp))
    ar_vote = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp))
    ar_mean = ds.ReadAsArray()
    ds = None#'''
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')
        
        print '\nComputing confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        print confusion_params
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e
        
        '''print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e#'''
        
        vote_acc = df_v.ix['producer', 'user']
        vote_kap = df_v.ix['producer', 'kappa']
        #mean_acc = df_m.ix['user','producer']
        #mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        #print 'Mean accuracy .............. ', mean_acc
        #print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''
    
    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0)/60)
Example #3
def main():
    srch_dir = '/vol/v2/stem/canopy/models'
    stamps = fnmatch.filter(os.listdir(srch_dir), 'canopy*')

    info = []
    for stamp in stamps:
        print stamp
        this_dir = os.path.join(srch_dir, stamp)
        this_srch_str = os.path.join(this_dir, 'train_stem*_params.txt')
        matched = glob.glob(this_srch_str)
        if len(matched) == 0:
            print 'No param file for ', stamp
            info.append(
                [stamp, '', 0, 0, 0, 0, False, 0, 0, False, 0, 0, '', 0, ''])
            continue

        this_param_text = matched[0]
        if 'regressor' in this_param_text:
            model_type = 'Regressor'
        else:
            model_type = 'Classifier'
        inputs, df_var = stem.read_params(this_param_text)
        for var in inputs:
            exec("{0} = str({1})").format(var, inputs[var])

        vote_mask = False
        vote_accuracy = None
        vote_kappa = None
        vote_dir = os.path.join(this_dir, 'evaluation_vote')
        if os.path.exists(vote_dir):
            for root, dis, files in os.walk(vote_dir):
                for f in files:
                    if f.endswith('txt'):
                        vote_txt = os.path.join(root, f)
                        df_vote = pd.read_csv(vote_txt,
                                              sep='\t',
                                              index_col='bin')
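                        # The confusion matrices appear to have been written
                        # with two row/column orientations over time, so fall
                        # back to the transposed lookup if the first fails.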
                        try:
                            vote_accuracy = int(df_vote.ix['producer', 'user'])
                            vote_kappa = round(df_vote.ix['kappa', 'kappa'], 2)
                        except:
                            vote_accuracy = int(df_vote.ix['user', 'producer'])
                            vote_kappa = round(df_vote.ix['user', 'kappa'], 2)
                        if 'mask' in vote_txt: vote_mask = True

        mean_mask = False
        mean_accuracy = None
        mean_kappa = None
        mean_dir = os.path.join(this_dir, 'evaluation_mean')
        if os.path.exists(mean_dir):
            for root, dis, files in os.walk(mean_dir):
                for f in files:
                    if f.endswith('txt'):
                        mean_txt = os.path.join(root, f)
                        df_mean = pd.read_csv(mean_txt,
                                              sep='\t',
                                              index_col='bin')
                        try:
                            mean_accuracy = int(df_mean.ix['producer', 'user'])
                            mean_kappa = round(df_mean.ix['kappa', 'kappa'], 2)
                        except:
                            mean_accuracy = int(df_mean.ix['user', 'producer'])
                            mean_kappa = round(df_mean.ix['user', 'kappa'], 2)
                        if 'mask' in mean_txt: mean_mask = True

        dt_dir = os.path.join(this_dir, 'decisiontree_models')
        try:
            n_sets = len(os.listdir(dt_dir)) - 1
        except:
            n_sets = None

        n_samples = int(sample_txt.split('_')[1].replace('sample', ''))

        if not 'max_features' in inputs: max_features = None

        avg_count = None
        cnt_path = os.path.join(this_dir, '%s_count.bsq' % stamp)
        if os.path.exists(cnt_path):
            ds = gdal.Open(cnt_path)
            ar = ds.ReadAsArray()
            cnt_nodata = ds.GetRasterBand(1).GetNoDataValue()
            ds = None
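            # If the stated NoData value never actually occurs in the count
            # raster, infer it from the extremes: a non-positive minimum is
            # treated as NoData, otherwise the maximum is.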
            if len(ar[ar == cnt_nodata]) == 0:
                cnt_min = ar.min()
                cnt_max = ar.max()
                if cnt_min <= 0:
                    cnt_nodata = cnt_min
                else:
                    cnt_nodata = cnt_max
            avg_count = int(round(np.mean(ar[ar != cnt_nodata]), 0))

        avg_oob = None
        oob_path = os.path.join(this_dir, '%s_oob.bsq' % stamp)
        if os.path.exists(oob_path):
            ds = gdal.Open(oob_path)
            ar = ds.ReadAsArray()
            ds = None
            oob_min = ar.min()
            oob_max = ar.max()
            if oob_min <= 0:
                oob_nodata = oob_min
            else:
                oob_nodata = oob_max
            avg_oob = round(np.mean(ar[ar != oob_nodata]), 1)

        info.append([
            stamp, model_type, avg_oob, avg_count, vote_accuracy, vote_kappa,
            vote_mask, mean_accuracy, mean_kappa, mean_mask, n_sets, n_samples,
            '[%s]' % support_size, sets_per_cell, max_features
        ])

    df = pd.DataFrame(info,
                      columns=[
                          'stamp', 'model_type', 'avg_oob', 'avg_count',
                          'vote_accuracy', 'vote_kappa', 'vote_mask',
                          'mean_accuracy', 'mean_kappa', 'mean_mask', 'n_sets',
                          'n_samples', 'support_size', 'sets_per_cell',
                          'max_features'
                      ])

    out_txt = os.path.join(srch_dir, 'model_info.txt')
    df.to_csv(out_txt, sep='\t', index=False)

    print 'Text written to ', out_txt
Example #4
def main(params,
         pct_train=None,
         min_oob=0,
         gsrd_shp=None,
         resolution=30,
         make_oob_map=False,
         snap_coord=None,
         oob_map_metric='oob_rate',
         n_jobs=1,
         oob_drop=None):
    t0 = time.time()

    inputs = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'max_features' not in locals(): max_features = None
        if 'min_oob' in inputs: min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)
    print(var_info)
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb
        pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' %
                        (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals(): out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  # Copy the params for reference

    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
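    # predict_cols keeps only the sample columns whose names contain one of
    # the variable names in df_var, so suffixed column names still match.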
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    # If there are variables that should remain constant across the modeling
    #   region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path,
                            cell_size,
                            support_size,
                            sets_per_cell,
                            df_train,
                            min_obs,
                            target_col,
                            predict_cols,
                            out_txt,
                            gsrd_shp,
                            pct_train,
                            snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    '''print 'Dumping train_txt to database...'
    t1 = time.time()#'''
    db_path = os.path.join(out_dir, stamp + '.db')
    '''engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    #df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)#'''

    # Split x and y train
    t1 = time.time()
    print "'{0}'".format(model_type.lower())
    if model_type.lower().strip() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    elif model_type.lower().strip() == 'zeroinflated':
        print 'Training STEM with zeroinflated regression algorithm...'
        model_func = stem.fit_tree_zeroinflated
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    #oob_rates = [0]
    n_jobs = int(n_jobs)

    sets = _par_train_stem(n_jobs, n_sets, df_train, predict_cols, target_col,
                           min_obs, df_sets, model_func, model_type,
                           max_features, dt_path_template, db_path,
                           max_target_val)
    support_sets, samples = zip(*sets)
    df_sets = pd.DataFrame(list(support_sets))\
                .dropna(subset=['dt_file'])\
                .rename_axis('set_id')
    #print('the cols in the df at this point are: ', df_sets.columns)
    df_sets.to_csv(os.path.join(out_dir, 'support_sets.txt'), sep='\t')

    # Consider moving this back to train function by switching to DBMS with multithread support
    '''print '\n\nMaking relationship table for samples and sets...'
    t1 = time.time()
    set_samples = pd.concat(list(samples), ignore_index=True)
    set_samples.to_sql('set_samples', engine, chunksize=100000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)'''

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates and dropping sets with high OOB error...'
    t1 = time.time()
    try:
        df_sets, low_oob, oob_metric = stem.get_oob_rates(
            df_sets,
            df_train,
            db_path,
            target_col,
            predict_cols,
            min_oob,
            model_type,
            drop_expression=oob_drop)
    except Exception as e:
        import pdb
        pdb.set_trace()
    if oob_drop and len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets.drop('dt_model', axis=1), gsrd_shp, set_shp)
    except Exception as e:
        import pdb
        pdb.set_trace()
        print e.message
    print 'Min OOB rate after dropping: ', df_sets[oob_metric].min()
    print 'Estimated average OOB score: ', int(df_sets[oob_metric].mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving support set info...'
    #set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')
    df_sets['set_id'] = df_sets.index
    df_sets = df_sets.drop('dt_model',
                           axis=1)  #.to_csv(set_txt, sep='\t', index=False)
    #df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    t1 = time.time()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #"""
    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness','delta_nbr','delta_wetness', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
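
# A minimal usage sketch for the training function above; the param file path
# and n_jobs value are hypothetical placeholders:
# main('train_stem_params.txt', n_jobs=20)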
Example #5
def main(params, inventory_txt=None):

    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  #sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        _, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    shutil.copy2(params, out_dir)  #Copy the params for reference

    if 'confusion_params' in inputs:
        #shutil.copy2(confusion_params, out_dir)
        conf_bn = os.path.basename(confusion_params)
        confusion_params = os.path.join(out_dir, conf_bn)

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    set_txt = glob.glob(
        os.path.join('/vol/v2/stem/imperv/imperv_bdt',
                     'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)
    '''# Loop through each set and generate predictions
    t0 = time.time()
    for c, (set_id, row) in enumerate(df_sets.iterrows()):
        t1 = time.time()
        with open(row.dt_file, 'rb') as f: 
            dt_model = pickle.load(f)
        print '\nPredicting for set %s of %s' % (c + 1, total_sets)
        ar_coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
        ar_predict = stem.predict_set_in_pieces(set_id, df_var, mosaic_ds, ar_coords, 
                                 mosaic_tx, xsize, ysize, dt_model, nodata)        
        tx = ar_coords.ul_x, x_res, x_rot, ar_coords.ul_y, y_rot, y_res
        out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
        array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
        print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)

    #mosaic_ds = None                  
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''

    #Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    ar_mean, ar_vote, pct_importance, df_sets = stem.aggregate_predictions(
        ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir,
        df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')'''
    mosaic_ds = None
    ds = gdal.Open('/vol/v2/stem/canopy/canopy_bdt/canopy_bdt_vote.bsq')
    ar_vote = ds.ReadAsArray()
    ds = None

    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')

        print '\nGetting confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')

        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e

        print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e

        vote_acc = df_v.ix['user', 'producer']
        vote_kap = df_v.ix['user', 'kappa']
        mean_acc = df_m.ix['user', 'producer']
        mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = [
                'vote_accuracy', 'vote_kappa', 'vote_mask', 'mean_accuracy',
                'mean_kappa', 'vote_mask'
            ]
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap, False, \
            mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'

        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        print 'Mean accuracy .............. ', mean_acc
        print 'Mean kappa ................. ', mean_kap

    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''
Example #6
def main(params,
         inventory_txt=None,
         constant_vars=None,
         mosaic_shp=None,
         resolution=30,
         n_jobs=0,
         n_jobs_agg=0,
         mosaic_nodata=0,
         snap_coord=None,
         overwrite_tiles=False,
         tile_id_field='name'):
    inputs = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  #sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = pd.read_csv(train_inputs['var_info'].replace('"', ''),
                             sep='\t',
                             index_col='var_name')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  #Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if str(overwrite_tiles).lower() == 'false':  # may be a bool default or a string from the param file
        overwrite_tiles = False

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, os.path.basename(model_dir) + '.db')
    if os.path.exists(db_path):
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets',
                                        con,
                                        index_col='set_id')  #'''
    else:
        set_txt = stem.find_file(model_dir, '*support_sets.txt')
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace(
                '.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(
                clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)  #'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path, 1)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            i = 0
            for feature in mosaic_ds:
                g = feature.GetGeometryRef()
                # Check that the feature is valid. Clipping can produce a feature
                #  with an area of 0
                if g.GetArea() > 1:
                    mosaic_geom.AddGeometry(g)
                else:
                    fid = feature.GetFID()
                    feature.Destroy()
                    mosaic_ds.DeleteFeature(fid)
            #import pdb; pdb.set_trace()
            df_sets = stem.get_overlapping_sets(df_sets,
                                                mosaic_geom.UnionCascaded())
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]  #'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path,
                                             x_res,
                                             y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path)  # Change to accept arbitrary geometry

    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(out_dir, '_temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
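    # Two-stage template: '{tile_id}' is filled in per tile here, while
    # '%(stat)s' is left for each aggregate statistic (e.g. mean, stdv) to fill later.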
    n_tiles = len(tiles)

    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats,
                                  index=tiles[tile_id_field])
        for stat in agg_stats:
            pattern = re.compile(r'tile_\d+_%s\.tif' % stat)
            stat_match = [f.split('_')[1] for f in files if pattern.match(f)]
            try:
                tile_files[stat] = pd.Series(np.ones(len(stat_match)),
                                             index=stat_match)
            except:
                pass  #import pdb; pdb.set_trace()
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)
        tiles.set_index(index_field, inplace=True)  #'''
    tiles['ul_x'] = [
        stem.get_ul_coord(xmin, xmax, x_res)
        for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()
    ]
    tiles['ul_y'] = [
        stem.get_ul_coord(ymin, ymax, y_res)
        for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()
    ]
    tiles['lr_x'] = [
        xmax if ulx == xmin else xmin
        for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmax']].iterrows()
    ]
    tiles['lr_y'] = [
        ymax if uly == ymin else ymin
        for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymax']].iterrows()
    ]

    support_nrows = int(support_size[0] / abs(y_res))
    support_ncols = int(support_size[1] / abs(x_res))
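    # support_size is given in map units, so dividing by the pixel resolution
    # converts it to the number of rows/columns each support set spans.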
    t1 = time.time()

    # Patch for unknown Landcover screwup
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind,
                    tile_info) in enumerate(tiles.loc[tiles['name'].isin([
                        '1931', '2810', '0705', '0954', '2814', '1986', '2552',
                        '2019', '2355', '3354', '2278', '2559'
                    ])].iterrows())]
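    # NOTE: the args list built above is immediately overridden below,
    # restricting the run to the single tile named '0705'.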

    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[
                tiles['name'].isin(['0705'])].iterrows())]

    # Patch for the GEE subset 2 outside-of-buffer 'slice'
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(['0639','0718','0797','0876','0955','1034'])].iterrows())]

    # Original line
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tile_files.isnull().any(axis=1).values].iterrows())]

    limits = []

    for arg in args:
        print arg[3][tile_id_field]  # tile_info is the 4th element of each args tuple
        limits.append(stem.par_predict_tile(arg))  #'''

    ###

    return
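    # NOTE: everything after this early return (stitching, importance, and
    # cleanup) is left in place but unreachable.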
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % (
        (time.time() - t1) / 3600)

    try:
        limits = pd.concat(limits)
    except:
        # They're all None
        pass

    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        #dtype = mosaic.get_min_numpy_dtype(limits[stat])
        dtype = np.int16
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)  #dtype)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=dtype)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(
                tile_dir,
                'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            try:
                ds = gdal.Open(tile_file)
            except:
                print 'Tile not found'
                continue
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
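            # row_off/col_off locate this tile's upper-left corner within the
            # full-mosaic array so the tile can be pasted into ar below.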
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off:row_off + tile_rows,
                   col_off:col_off + tile_cols] = ar_tile
            except Exception as e:
                pass  #import pdb; pdb.set_trace()

        out_path = os.path.join(out_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar,
                               mosaic_tx,
                               prj,
                               driver,
                               out_path,
                               gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    #shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values,
                                              axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Save the importance values
    importance = pd.DataFrame({
        'variable': pred_vars,
        'pct_importance': pct_importance,
        'index': range(len(pred_vars))
    })
    importance.set_index('index', inplace=True)
    importance['rank'] = [
        int(r) for r in importance.pct_importance.rank(method='first',
                                                       ascending=False)
    ]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')  #'''

    print '\nTotal prediction runtime: %.1f hours\n' % (
        (time.time() - t0) / 3600)
def main(params, min_oob=0, err_threshold=10):
    t0 = time.time()

    #read_params(params)
    inputs, df_var = stem.read_params(params)

    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'err_threshold' in inputs: err_threshold = int(err_threshold)
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, out_dir
        n_sets = int(n_sets)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)
    '''now = datetime.now()
    date_str = str(now.date()).replace('-','')
    time_str = str(now.time()).replace(':','')[:4]
    stamp = '{0}_{1}_{2}'.format(target_col, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir) # With a timestamp in dir, no need to check if it already exists'''
    #out_dir = '/vol/v2/stem/imperv/imperv_bdt'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    stamp = os.path.basename(out_dir)
    shutil.copy2(params, out_dir)  #Copy the params for reference

    df_train = pd.read_csv(sample_txt, sep='\t')
    # Check that df_train has exactly the same columns as variables specified in df_vars
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    # Make dataframe of set coords
    min_x, min_y, max_x, max_y, x_res, y_res, tx = stem.get_raster_bounds(
        mosaic_path)
    if x_res < 0:
        ul_x = max_x
        lr_x = min_x
    else:
        ul_x = min_x
        lr_x = max_x
    if y_res < 0:
        ul_y = max_y
        lr_y = min_y
    else:
        ul_y = min_y
        lr_y = max_y
    ar_sets = np.tile([ul_x, ul_y, lr_x, lr_y], n_sets).reshape(n_sets, 4)
    df_sets = pd.DataFrame(ar_sets, columns=['ul_x', 'ul_y', 'lr_x', 'lr_y'])
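    # Every one of the n_sets support sets spans the full mosaic extent here:
    # the same UL/LR corners are repeated n_sets times.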

    # Train a tree for each support set
    print 'Training models...'
    t1 = time.time()
    #set_txt = os.path.join(out_dir, 'decisiontree_models/%s_support_sets.txt' % stamp)
    #df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    df_sets['dt_model'] = ''
    df_sets['oob_rate'] = 0
    df_sets[['dt_model', 'oob_rate']] = [
        stem.fit_bdt_tree_regressor(x_train, y_train) for s in df_sets.index
    ]
    del df_train
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving models...'
    t1 = time.time()
    df_sets, set_txt = stem.write_model(out_dir, df_sets)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #'''
    '''out_dir = '/vol/v2/stem/canopy/models/canopy_20161016_0910'
    stamp = os.path.basename(out_dir)
    set_txt = '/vol/v2/stem/{0}/models/{1}/decisiontree_models/{1}_support_sets.txt'.format(target_col, stamp)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    oob_txt = os.path.join(out_dir, '%s_oob.txt' % stamp)
    df_oob = pd.read_csv(oob_txt, sep='\t')
    ds = gdal.Open(os.path.join(out_dir, '%s_oob.bsq' % stamp))
    ar_oob = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(out_dir, '%s_count.bsq' % stamp))
    ar_cnt = ds.ReadAsArray()
    ds = None 
    
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_bright','delta_green','delta_nbr','delta_wet', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    # Record params in inventory text file
    if 'inventory_txt' in locals():
        t1 = time.time()
        print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if 'regressor' in params:
            model_type = 'Regressor'
        else:
            model_type = 'Classifier'
        n_sets = len(df_sets)
        n_samples = int(sample_txt.split('_')[1].replace('sample', ''))
        info = [
            model_type, None, None, None, None, None, None, None, None, n_sets,
            n_samples,
            str(support_size), sets_per_cell, max_features
        ]
        df_inv.ix[stamp] = info
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(info_dir)),
                                         '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]

        # Check if oob_map params were specified. If not, set to defaults
        if 'err_threshold' not in locals():
            print 'err_threshold not specified. Using default: 10 ...\n'
            err_threshold = 10
        else:
            err_threshold = int(err_threshold)
        if 'n_tiles' not in locals():
            print 'n_tiles not specified. Using default: 25 x 15 ...\n'
            n_tiles = 25, 15
        else:
            n_tiles = tuple(int(n) for n in n_tiles.split(','))

        #t1 = time.time()
        print 'Calculating OOB score and making OOB score map...'
        ds = gdal.Open(mosaic_path)
        ar = ds.ReadAsArray()
        mask = ar != 0
        del ar
        xsize = ds.RasterXSize
        ysize = ds.RasterYSize
        tx = ds.GetGeoTransform()
        prj = ds.GetProjection()
        driver = ds.GetDriver()
        ds = None

        #import get_oob_map as oob
        ar_oob, ar_cnt, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles,
                                               tx, support_size, df_oob,
                                               df_sets, target_col,
                                               predict_cols, err_threshold,
                                               out_dir, stamp, prj, driver)
        df_sets.to_csv(set_txt, sep='\t')  #'''

        #if 'inventory_txt' in locals() :
        avg_oob = round(np.mean(ar_oob[mask]), 1)
        avg_cnt = int(round(np.mean(ar_cnt[mask]), 0))
        df_inv.ix[stamp, 'avg_oob'] = avg_oob
        df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n'

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt

        print 'Time to make OOB score map: %.1f hours\n' % (
            (time.time() - t1) / 3600)

        #except Exception as e:
        #    print 'Problem getting oob map: ', e

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
def main(params, pct_train=None, aggregate_presence=False):
    t0 = time.time()

    # Read params. Make variables from each line of the 1-line variables
    inputs, df_vars = stem.read_params(params)
    for var in inputs:
        exec("{0} = str({1})").format(var, inputs[var])
    try:
        if 'years' in inputs:
            years = np.array([int(yr) for yr in years.split(',')])
        else:
            year_start = int(year_start)
            year_end = int(year_end)
            years = np.arange(year_start, year_end + 1)
        '''tsa_mosaic = inputs['tsa_mosaic']
        search_dir = inputs['search_dir']
        search_str = inputs['search_str']
        obs_txt = inputs['obs_txt']
        index_col = inputs['index_col']
        year_col = inputs['year_col']
        target_col = inputs['target_col']
        out_txt = inputs['out_txt']'''
        add_file_tag = int(add_file_tag)
        #count_type = inputs['count_type']

    except (KeyError, NameError) as e:
        missing_var = str(e).split("'")[1]
        if missing_var in ['year_start', 'year_end', 'years']:
            msg = ('No list of years or year_start/year_end specified in' +\
            ' param file:\n%s\nRe-run the script with either of these' +\
            ' parameters given.') % params
        else:
            msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                      params)
        raise NameError(msg)

    out_dir, original_bn = os.path.split(out_txt)
    # Add informative tags to output dir and basename
    if add_file_tag:
        res = years[1] - years[0]
        #out_dir = os.path.basename(out_dir)
        now = datetime.datetime.now()
        date_str = str(now.date()).replace('-', '')
        time_str = str(now.time()).replace(':', '')[:4]
        out_dirname = '{0}_res{1}yr_{2}_{3}'.format(target_col, res, date_str,
                                                    time_str)
        out_dir = os.path.join(out_dir, out_dirname)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        out_bn = '{0}_{1}'.format(
            os.path.basename(obs_txt).replace('.txt', ''), original_bn)
        out_txt = os.path.join(out_dir, out_bn)

    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        print 'Copying params to output dir: %s\n' % out_dir
        shutil.copy2(params, out_dir)

    print 'Getting predictors... '
    t1 = time.time()
    df_obs = pd.read_csv(obs_txt, sep='\t', index_col=index_col)
    original_columns = df_obs.columns
    df = get_predictors(years, search_dir, search_str, df_obs, index_col,
                        year_col, df_vars)
    print '%.1f seconds\n' % (time.time() - t1)

    # Select count type and date range
    if 'count_type' in inputs:
        count_type = [t.strip() for t in count_type.split(',')]
        df = df[df.COUNT_TYPE.isin(count_type)]
        #df.drop(['COUNT_TYPE'], axis=1, inplace=True)
        if 'P21' in count_type:
            df = df[df.EFFORT_DISTANCE_KM < .1]
    if 'day_minmax' in inputs:
        day_min, day_max = [int(d) for d in day_minmax.split(',')]
        df = df[(df.DAY >= day_min) & (df.DAY <= day_max)]
    if 'time_minmax' in inputs:
        time_min, time_max = [int(t) for t in time_minmax.split(',')]
        df = df[(df.TIME >= time_min) & (df.TIME <= time_max)]
    if 'max_effort_time' in inputs:
        max_effort_time = int(max_effort_time)
        df = df[df.EFFORT_HRS < max_effort_time]
    if 'max_effort_dist' in inputs:
        max_effort_dist = int(max_effort_dist)
        df = df[df.EFFORT_DISTANCE_KM < max_effort_dist]

    #df = df[(df.YEAR >= min(years)) & (df.YEAR <= max(years))]
    #df[target_col] *= 100 # To be able to keep stuff as 8 bit ints

    # Calc row and col from x, y
    ds = gdal.Open(tsa_mosaic)
    tx = ds.GetGeoTransform()
    ds = None
    ul_xy = tx[0], tx[3]
    df['row'], df['col'] = zip(*[
        stem.calc_offset(ul_xy, xy, tx) for i, xy in df[['x', 'y']].iterrows()
    ])
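    # calc_offset presumably converts a map coordinate (x, y) into array
    # (row, col) indices relative to the mosaic upper-left corner using the
    # geotransform, i.e. roughly col = int((x - ul_x) / tx[1]) and
    # row = int((y - ul_y) / tx[5]).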

    if 'kernel_dist' in inputs:
        t1 = time.time()
        print 'Calculating kernel density...'
        kernel_dist = int(kernel_dist)
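        # Weights are computed one year at a time so that observations only
        # smooth against other observations from the same year.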
        for yr in df.YEAR.unique():
            yr_mask = df.YEAR == yr
            df_w = gaussain_weights(df[yr_mask], target_col, kernel_dist)
            df.ix[yr_mask, target_col] = df_w.weighted
        '''
        df_w = gaussain_weights(df, target_col, kernel_dist)
        df[target_col] = df_w.weighted
        #df = df.drop_duplicates(subset=[target_col, 'row', 'col'])'''
        print '%.1f seconds\n' % (time.time() - t1)  #"""

    if aggregate_presence:
        t1 = time.time()
        print 'Aggregating presence records...'
        df.ix[df[target_col] > 0, target_col] = 1
        for yr in df.YEAR.unique():
            yr_mask = df.YEAR == yr
            df_yr = df[yr_mask]
            # Get unique locations for this year
            unique = df_yr[['row', 'col']].drop_duplicates().values
            for row, col in unique:
                this_loc = df_yr[(df_yr.row == row) & (df_yr.col == col)]
                #If there are ones and 0s, drop the 0s
                if this_loc[target_col].min(
                ) == 0 and this_loc[target_col].max() == 1:
                    df.drop(this_loc[this_loc[target_col] == 0].index,
                            inplace=True)
        print '%.1f seconds\n' % (time.time() - t1)

    if pct_train:
        print 'Splitting training and test sets...'
        pct_train = float(pct_train)
        #n_test = int(len(df) * (1 - pct_train))
        unique = df[['row', 'col']].drop_duplicates().values
        n_test = int(len(unique) * (1 - pct_train))
        random_idx = random.sample(xrange(len(unique)), n_test)
        random_row, random_col = zip(*unique[random_idx])
        df_test = df[df.row.isin(random_row) & df.col.isin(random_col)]
        test_idx = df_test.index
        test_txt = out_txt.replace('.txt', '_test.txt')
        df = df[~df.index.isin(test_idx)]
        df_test.to_csv(test_txt, sep='\t')

    df.to_csv(out_txt, sep='\t')
    obs_out_txt = out_txt.replace('_' + original_bn[:-4], '')
    df[original_columns].to_csv(obs_out_txt, sep='\t')

    print '\nLength of output df:', len(df)
    print 'Text file written to: ', out_txt
    print '\nTotal time: %.1f minutes' % ((time.time() - t0) / 60)
Exemple #9
def main(predict_params,
         start_year,
         end_year,
         out_dir,
         txt_out_dir,
         n_jobs_pred=0,
         n_jobs_agg=0,
         confusion=False,
         subset_shp=None):

    param_dict, df_var_orig = read_params(predict_params)
    for k, v in param_dict.iteritems():
        param_dict[k] = v.replace('"', '')

    param_basename = os.path.basename(predict_params)
    out_txt = os.path.join(txt_out_dir, param_basename)
    if not os.path.isdir(txt_out_dir):
        os.mkdir(txt_out_dir)

    sub_tag = os.path.basename(subset_shp.replace('.shp', '')) if subset_shp else ''
    for year in range(int(start_year), int(end_year) + 1):
        print 'Making params for year %s...' % year

        #################################################################################
        # jdb added 6/2/2017 - make a fresh copy of the original df_var otherwise line 32 won't work right
        df_var = df_var_orig.copy()
        #################################################################################

        # Write the variable param table first
        band = year - 1983  # 1983 because gdal bands start at 1
        df_var.ix[df_var.data_band != 1, 'data_band'] = band
        df_var.data_band = df_var.data_band.astype(int)

        this_txt = out_txt.replace('.txt', '_%s_%s.txt' % (sub_tag, year))
        df_var.to_csv(this_txt, sep='\t')

        # Adjust a couple of variable values
        file_stamp = os.path.basename(param_dict['model_dir'].split('_')
                                      [0]) + '_' + sub_tag + '_' + str(year)
        param_dict['file_stamp'] = file_stamp
        param_dict['out_dir'] = os.path.join(out_dir, str(year))
        #if 'confusion_params' in param_dict and not confusion:
        #del param_dict['confusion_params']

        with open(this_txt, 'a') as txt:
            txt.write('\n')
            txt.write('model_dir; %s\n' % param_dict['model_dir'])
            txt.write('train_params; %s\n' % param_dict['train_params'])
            txt.write('mosaic_path; %s\n' % param_dict['mosaic_path'])
            txt.write('support_size; %s\n' % param_dict['support_size'])
            txt.write('n_tiles; %s\n' % '3,3')
            txt.write('nodata; %s\n' % param_dict['nodata'])
            txt.write('out_dir; %s\n' % param_dict['out_dir'])
            txt.write('agg_stats; vote, mean\n')
            txt.write('\nOptional Parameters\n')
            txt.write('file_stamp; %s\n' % param_dict['file_stamp'])
            if int(n_jobs_pred) != 0:
                n_jobs_pred = int(n_jobs_pred)
                txt.write('n_jobs_pred; %s\n' % n_jobs_pred)
            if int(n_jobs_agg) != 0:
                n_jobs_agg = int(n_jobs_agg)
                txt.write('n_jobs_agg; %s\n' % n_jobs_agg)
            if subset_shp:
                txt.write('subset_shp; %s' % subset_shp)

        print 'Params written to %s\n' % this_txt
Exemple #10
def main(predict_params,
         start_year,
         end_year,
         out_dir,
         txt_out_dir,
         n_jobs=0,
         agg_stats='mean, vote, median, stdv',
         confusion=False,
         subset_shp=None,
         n_tiles=None):

    param_dict, df_var_orig = read_params(predict_params)
    for k, v in param_dict.iteritems():
        param_dict[k] = v.replace('"', '')

    param_basename = os.path.basename(predict_params)
    out_txt = os.path.join(txt_out_dir, param_basename)
    if not os.path.isdir(txt_out_dir):
        os.mkdir(txt_out_dir)

    for year in range(int(start_year), int(end_year) + 1):
        print 'Making params for year %s...' % year

        df_var = df_var_orig.copy()

        # Write the variable param table first
        band = year - 1983  # 1983 because gdal bands start at 1
        df_var.ix[df_var.data_band != 1, 'data_band'] = band
        df_var.data_band = df_var.data_band.astype(int)

        this_txt = out_txt.replace('.txt', '_%s.txt' % year)
        df_var.to_csv(this_txt, sep='\t')

        # Adjust a couple of variable values
        file_stamp = os.path.basename(
            param_dict['model_dir'].split('_')[0]) + '_' + str(year)
        param_dict['file_stamp'] = file_stamp
        param_dict['out_dir'] = os.path.abspath(
            os.path.join(out_dir, str(year)))
        param_dict['agg_stats'] = agg_stats
        #if 'confusion_params' in param_dict and not confusion:
        #del param_dict['confusion_params']

        with open(this_txt, 'a') as txt:
            txt.write('\n')
            txt.write('model_dir; %s\n' % param_dict['model_dir'])
            txt.write('train_params; %s\n' % param_dict['train_params'])
            txt.write('mosaic_path; %s\n' % param_dict['mosaic_path'])
            txt.write('support_size; %s\n' % param_dict['support_size'])
            txt.write('nodata; %s\n' % param_dict['nodata'])
            txt.write('out_dir; %s\n' % param_dict['out_dir'])
            txt.write('agg_stats; %s\n' % param_dict['agg_stats'])
            txt.write('\nOptional Parameters\n')
            txt.write('file_stamp; %s\n' % param_dict['file_stamp'])
            if int(n_jobs) != 0:
                n_jobs = int(n_jobs)
                txt.write('n_jobs; %s\n' % n_jobs)
            if subset_shp:
                txt.write('subset_shp; %s\n' % subset_shp)
            if n_tiles:
                txt.write('n_tiles; %s\n' % n_tiles)

        print 'Params written to %s\n' % this_txt
Exemple #11
def main(params,
         pct_train=None,
         min_oob=0,
         gsrd_shp=None,
         resolution=30,
         make_oob_map=False,
         snap_coord=None,
         oob_map_metric='oob_rate'):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'max_features' not in locals(): max_features = None
        if 'min_oob' in inputs: min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb
        pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' %
                        (target_col, sample_txt))

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals(): out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(
        out_dir
    )  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  #Copy the params for reference

    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values(
    ))  # Make sure predict_cols and df_var are in the same order

    # If there are variables that should remain constant across the modeling
    #   region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path,
                            cell_size,
                            support_size,
                            sets_per_cell,
                            df_train,
                            min_obs,
                            target_col,
                            predict_cols,
                            out_txt,
                            gsrd_shp,
                            pct_train,
                            snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    print 'Dumping train_txt to database...'
    t1 = time.time()
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Train a tree for each support set
    t1 = time.time()
    if model_type.lower() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    # establish DB connection and create empty relationship table for sample inds
    cmd = (
        'CREATE TABLE set_samples (set_id INTEGER, sample_id INTEGER, in_bag INTEGER);'
    )
    with sqlite3.connect(db_path) as connection:
        connection.executescript(cmd)
        connection.commit()
    insert_cmd = 'INSERT INTO set_samples (set_id, sample_id, in_bag) VALUES (?,?,?);'
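    # Each record in set_samples links a sample to a support set; in_bag=1
    # marks bootstrap (training) draws and in_bag=0 marks out-of-bag samples
    # used later for OOB scoring.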

    oob_rates = [0]
    for i, (set_id, ss) in enumerate(df_sets.iterrows()):
        format_tuple = i + 1, n_sets, float(i) / n_sets * 100, (
            time.time() - t1) / 60, np.mean(oob_rates)
        sys.stdout.write(
            '\rTraining %s/%s DTs (%.1f%%) || %.1f minutes || Avg OOB: %d' %
            format_tuple)
        sys.stdout.flush()

        # Get all samples within support set
        sample_inds = df_train.index[
            (df_train['x'] > ss[['ul_x', 'lr_x']].min())
            & (df_train['x'] < ss[['ul_x', 'lr_x']].max()) &
            (df_train['y'] > ss[['ul_y', 'lr_y']].min()) &
            (df_train['y'] < ss[['ul_y', 'lr_y']].max())]

        n_samples = int(len(sample_inds) * .63)
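        # 0.63 approximates 1 - 1/e, the expected fraction of unique
        # observations drawn in a bootstrap sample of the same size.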
        if n_samples < min_obs:
            df_sets.drop(set_id, inplace=True)
            continue

        this_x = x_train.ix[sample_inds]
        this_y = y_train.ix[sample_inds]
        support_set = df_sets.ix[set_id]
        dt_path = dt_path_template % set_id
        dt_model, train_inds, oob_inds, importance, oob_metrics = stem.train_estimator(
            support_set, n_samples, this_x, this_y, model_func, model_type,
            max_features, dt_path)
        oob_rates.append(oob_metrics['oob_rate'])
        df_sets.ix[set_id, importance_cols] = importance
        df_sets.ix[set_id, 'dt_model'] = dt_model
        df_sets.ix[set_id, 'dt_file'] = dt_path
        df_sets.ix[set_id, 'n_samples'] = n_samples
        for metric in oob_metrics:
            df_sets.ix[set_id, metric] = oob_metrics[metric]

        # Save oob and train inds
        n_train = len(train_inds)
        n_oob = len(oob_inds)
        train_records = zip(np.full(n_train, set_id, dtype=int), train_inds,
                            np.ones(n_train, dtype=int))
        oob_records = zip(np.full(n_oob, set_id, dtype=int), oob_inds,
                          np.zeros(n_oob, dtype=int))

        #try:
        with sqlite3.connect(db_path) as connection:
            connection.executemany(insert_cmd, train_records + oob_records)
            connection.commit()

    print '\n%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_train, db_path,
                                          target_col, predict_cols, min_oob)
    if len(low_oob) > 0:
        #df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets, gsrd_shp, set_shp)
    except Exception as e:
        print e.message
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving support set info...'
    t1 = time.time()
    set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')
    df_sets['set_id'] = df_sets.index
    #df_sets = df_sets.drop('dt_model', axis=1)#.to_csv(set_txt, sep='\t', index=False)
    df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #"""
    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness','delta_nbr','delta_wetness', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''
    if make_oob_map or oob_map_metric in inputs:
        # Check if oob_map params were specified. If not, set to defaults
        if 'n_tiles' not in inputs:
            n_tiles = 40, 90
            print 'n_tiles not specified. Using default: %s x %s ...\n' % (
                n_tiles)

        else:
            n_tiles = tuple(int(i) for i in n_tiles.split(','))

        print 'Calculating OOB score and making OOB score map...'
        try:
            ds = gdal.Open(mosaic_path)
            ar = ds.ReadAsArray()
            mask = ar != 0
            del ar
            xsize = ds.RasterXSize
            ysize = ds.RasterYSize
            tx = ds.GetGeoTransform()
            prj = ds.GetProjection()
            driver = ds.GetDriver()
            ds = None
        except:
            mosaic_ds = ogr.Open(mosaic_path)
            if 'resolution' not in inputs:
                warnings.warn(
                    'Resolution not specified. Assuming default of 30...\n')
            mask = mosaic_ds.GetLayer()
            min_x, max_x, min_y, max_y = mask.GetExtent()
            ul_x = min_x - ((min_x - snap_coord[0]) % resolution)
            ul_y = max_y - ((max_y - snap_coord[1]) % resolution)
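            # Snap the output upper-left corner onto the grid defined by
            # snap_coord and resolution so pixels align with other layers.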
            xsize = int((max_x - ul_x) / resolution)
            ysize = int((ul_y - min_y) / resolution)
            prj = mask.GetSpatialRef().ExportToWkt()
            driver = gdal.GetDriverByName('gtiff')
            x_res = resolution
            y_res = -resolution
            tx = ul_x, x_res, 0, ul_y, 0, y_res

        avg_dict, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles, tx,
                                         support_size, db_path, df_sets,
                                         df_train, target_col, predict_cols,
                                         out_dir, stamp, prj, driver,
                                         oob_map_metric)
        df_sets.to_csv(set_txt, sep='\t')  #'''

        avg_oob = round(avg_dict[oob_map_metric], 1)
        avg_cnt = int(round(avg_dict['count'], 0))

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt

        print 'Time to make OOB score map: %.1f hours\n' % (
            (time.time() - t1) / 3600)

    # Record params in inventory text file
    if 'inventory_txt' in inputs:
        t1 = time.time()
        print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        n_sets = len(df_sets)
        '''if 'sample' in sample_txt:
            n_samples = int(sample_txt.split('_')[1].replace('sample',''))
        inv_columns = df_inv.columns
        if 'n_sets' in inv_columns: df_inv.ix[stamp, 'n_sets'] = n_sets
        if 'n_samples' in inv_columns: df_inv.ix[stamp, 'n_samples'] = n_samples
        if 'support_size' in inv_columns: df_inv.ix[stamp, 'support_size'] = str(support_size)
        if 'sets_per_cell' in inv_columns: df_inv.ix[stamp, 'sets_per_cell'] = sets_per_cell
        if 'max_features' in inv_columns: df_inv.ix[stamp, 'max_features'] = max_features
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(info_dir), '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]#'''

        inv_columns = df_inv.columns
        if 'avg_oob' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_oob'] = avg_oob
        if 'avg_count' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n'  #'''

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
Exemple #12
def main(model_dir, n_tiles, **kwargs):

    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]
    if not os.path.isdir(model_dir):
        message = 'model directory given does not exist or is not a directory: %s' % model_dir
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir,
                                        'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir,
                                         'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(int(variable_id), variable_name)
                     for variable_name, variable_id in kwargs.iteritems()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)
    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    #some_set = df_sets.iloc[0]
    support_size = [
        int(s)
        for s in train_params['support_size'].replace('"', '').split(',')
    ]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]
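    # Convert the support-set size from map units to pixels using the mosaic
    # pixel size from the geotransform.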

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):

        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1,
                                                           len(variables))

        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)

        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge where the
            #   tile size will be slightly different
            this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x -
                                                          t_row.ul_x)
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row,
                                                      this_size, support_size)

            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]

            print n_sets, ' Overlapping sets'
            importance_bands = []

            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():

                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y),
                                          mask_tx)
                #if abs(offset[0]) > this_size[0] or abs(offset[1] > this_size[1]):

                tile_inds, a_inds = mosaic.get_offset_array_indices(
                    tile_size, set_size, offset)

                # Get feature with maximum importance and fill tile with that val
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(
                        dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    #filled = np.full((nrows, ncols), importance_value, dtype=np.uint8)
                    #import_band = stem.fill_tile_band(this_size, filled, tile_inds, nodata)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1],
                                tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue  #'''

            print 'Average importance for this tile: %.1f' % np.mean(
                importance_values)
            #Aggregate
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
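            # nanmean ignores the NaN fill outside each set's footprint, so
            # every pixel is averaged only over the sets that actually cover it.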
            tile_mask = mask[rc.ul_r:rc.lr_r,
                             rc.ul_c:rc.lr_c] | np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r,
               rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % (
                (time.time() - t2) / 60)
            '''temp_dir = os.path.join(out_dir, 'delete')
            if not os.path.isdir(temp_dir):
                os.mkdir(temp_dir)
            t_tx = tile_ul[0], 30, 0, tile_ul[1], 0, -30
            array_to_raster(np.round(importance_tile).astype(np.uint8), t_tx, prj, gdal.GetDriverByName('gtiff'), os.path.join(temp_dir, 'delete_%s.tif' % t_ind), gdal.GDT_Byte, 255, True)'''
        out_path = os.path.join(out_dir,
                                '%s_importance_%s.tif' % (model, v_name))
        try:
            array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'),
                            out_path, gdal.GDT_Byte, nodata)
        except Exception as e:
            print e
            import pdb
            pdb.set_trace()
        print 'Time for this variable: %.1f minutes\n' % (
            (time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % (len(variables), (
        (time.time() - t0) / 3600))
def main(params, pct_train=None, min_oob=0, err_threshold=10):
    t0 = time.time()

    #read_params(params)
    inputs, df_var = stem.read_params(params)

    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'max_features' not in locals(): max_features = None
        if 'err_threshold' in inputs: err_threshold = float(err_threshold)
        if 'min_oob' in inputs: min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals(): out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    #import pdb; pdb.set_trace()
    os.makedirs(
        out_dir
    )  # With a timestamp in dir, no need to check if it already exists
    #stamp = os.path.dirname(out_dir)
    shutil.copy2(params, out_dir)  #Copy the params for reference

    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    # Check that df_train has exactly the same columns as variables specified in df_vars
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    #import pdb; pdb.set_trace()
    df_var = df_var.reindex(df_var.index.sort_values(
    ))  # Make sure predict_cols and df_var are in the same order

    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_train, df_sets, df_oob = stem.get_gsrd(mosaic_path, cell_size,
                                              support_size, sets_per_cell,
                                              df_train, min_obs, target_col,
                                              predict_cols, out_txt, gsrd_shp,
                                              pct_train)

    # Train a tree for each support set
    print 'Training models...'
    t1 = time.time()
    x_train = df_train.reindex(columns=predict_cols + ['set_id'])
    y_train = df_train[[target_col, 'set_id']]
    df_sets['dt_model'] = [stem.fit_tree_regressor(x_train.ix[x_train.set_id == s, predict_cols],
                                                   y_train.ix[y_train.set_id == s, target_col],
                                                   max_features)
                           for s in df_sets.index]
    del df_train
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_oob, err_threshold,
                                          target_col, predict_cols, min_oob)
    if len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'gsrd_low_oob.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decison tree to disk
    print 'Saving models...'
    t1 = time.time()
    df_sets, set_txt = stem.write_model(out_dir, df_sets)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #'''

    #stamp = os.path.basename(out_dir)
    #set_txt = '/vol/v2/stem/{0}/models/{1}/decisiontree_models/{1}_support_sets.txt'.format(target_col, stamp)

    #predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_bright','delta_green','delta_nbr','delta_wet', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    # Record params in inventory text file
    if 'inventory_txt' in locals():
        t1 = time.time()
        '''print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if 'regressor' in params: 
            model_type = 'Regressor'
        else: 
            model_type = 'Classifier'
        n_sets = len(df_sets)
        if 'sample' in sample_txt:
            n_samples = int(sample_txt.split('_')[1].replace('sample',''))
        info = [model_type, None, None, None, None, None, None, None, None, n_sets, n_samples, str(support_size), sets_per_cell, max_features]
        df_inv.ix[stamp] = info
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(info_dir)), '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]'''

        # Check if oob_map params were specified. If not, set to defaults
        if 'err_threshold' not in locals():
            print 'err_threshold not specified. Using default: 10 ...\n'
            err_threshold = 10
        else:
            err_threshold = int(err_threshold)
        if 'n_tiles' not in locals():
            print 'n_tiles not specified. Using default: 25 x 15 ...\n'
            n_tiles = 25, 15
        else:
            n_tiles = tuple(int(i) for i in n_tiles.split(','))

        #t1 = time.time()
        print 'Calculating OOB score and making OOB score map...'
        ds = gdal.Open(mosaic_path)
        ar = ds.ReadAsArray()
        mask = ar != 0
        del ar
        xsize = ds.RasterXSize
        ysize = ds.RasterYSize
        tx = ds.GetGeoTransform()
        prj = ds.GetProjection()
        driver = ds.GetDriver()
        ds = None

        #import get_oob_map as oob
        ar_oob, ar_cnt, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles,
                                               tx, support_size, df_oob,
                                               df_sets, target_col,
                                               predict_cols, err_threshold,
                                               out_dir, stamp, prj, driver)
        df_sets.to_csv(set_txt, sep='\t')  #'''

        #if 'inventory_txt' in locals() :
        avg_oob = round(np.mean(ar_oob[mask]), 1)
        avg_cnt = int(round(np.mean(ar_cnt[mask]), 0))
        '''df_inv.ix[stamp, 'avg_oob'] = avg_oob
        #df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n' '''

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt

        print 'Time to make OOB score map: %.1f hours\n' % (
            (time.time() - t1) / 3600)

        #except Exception as e:
        #    print 'Problem getting oob map: ', e

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
def main(search_dir, models, t_path, inventory_txt, t_nodata=255):

    df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
    columns = df_inv.columns
    if 'vote_rmse' not in columns:
        df_inv['vote_rmse'] = None
    if 'mean_rmse' not in columns:
        df_inv['mean_rmse'] = None
    df_inv = df_inv.ix[models]

    ds = gdal.Open(t_path)
    ar_t = ds.ReadAsArray()
    nodata_mask = ar_t == t_nodata
    ds = None

    for model in models:
        print '\nCalculating RMSE for ', model
        model_dir = os.path.join(search_dir, model)
        if not os.path.exists(model_dir):
            print 'Model dir does not exist: %s. Skipping...\n' % model_dir
            continue

        confusion_params = os.path.join(model_dir,
                                        'confusion_matrix_params.txt')
        if not os.path.exists(confusion_params):
            print 'Could not find confusion params: ', confusion_params
            predict_params = os.path.join(model_dir, 'predict_stem_params.txt')
            inputs, _ = stem.read_params(predict_params)
            p_nodata = int(inputs['nodata'].replace('"', ''))
            this_srch_str = os.path.join(model_dir, 'train_stem*_params.txt')
            train_params = glob.glob(this_srch_str)
            if len(train_params) == 0:
                print 'Can not find test data for ', model, '\n'
                continue
            train_params = train_params[0]
            inputs, _ = stem.read_params(train_params)
            test_txt = inputs['sample_txt'].replace('predictors',
                                                    'test').replace('"', '')
            train_txt = inputs['sample_txt'].replace('"', '')
        else:
            inputs = read_params(confusion_params)
            for k, v in inputs.iteritems():
                inputs[k] = v.replace('"', '')
            test_txt = inputs['sample_txt']
            p_nodata = int(inputs['p_nodata'])
            train_txt = inputs['sample_txt'].replace('_test',
                                                     '').replace('"', '')
        #df = pd.read_csv(test_txt, sep='\t', index_col='obs_id')
        train_sample = pd.read_csv(train_txt, sep='\t', index_col='obs_id')

        # Set any pixels used for training to -1 so they can be avoided for testing
        n_rows, n_cols = ar_t.shape
        n_pixels = ar_t.size
        pixel_ids = np.arange(n_pixels,
                              dtype=np.uint32).reshape(n_rows, n_cols)
        pixel_ids[
            train_sample.row,
            train_sample.col] = n_pixels  #will always be 1 more than last id
        pixel_ids[nodata_mask] = n_pixels
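        # n_pixels is one past the largest valid flat index, so it acts as a
        # sentinel: training and nodata pixels are excluded when test pixels
        # are drawn below, and row/col are recovered from the flat index by
        # dividing by and taking the remainder with n_cols.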

        n_samples = int(
            int(
                os.path.basename(train_txt).split('_')[1].replace(
                    'sample', '')) * 0.2)
        test_ids = np.array(random.sample(pixel_ids[pixel_ids != n_pixels],
                                          n_samples),
                            dtype=np.uint32)
        test_rows = test_ids / n_cols
        test_cols = test_ids % n_cols
        #test_cols = random.sample(ar_col[ar_col != -1], n_samples)
        df = pd.DataFrame({'row': test_rows, 'col': test_cols})

        for agg_method in ['vote', 'mean']:
            p_path = os.path.join(model_dir, '%s_%s.bsq' % (model, agg_method))
            ds = gdal.Open(p_path)
            ar_p = ds.ReadAsArray()
            t_samples, p_samples = get_samples(ar_p,
                                               ar_t,
                                               p_nodata,
                                               255,
                                               samples=df,
                                               match='best')

            rmse = np.round(calc_rmse(t_samples, p_samples), 1)
            print agg_method, ': ', rmse
            df_inv.ix[model, '%s_rmse' % agg_method] = rmse
        out_txt = os.path.join(
            model_dir, '%s_random_test_sample%s.txt' % (model, n_samples))
        df.to_csv(out_txt, sep='\t', index=False)

    out_txt = inventory_txt.replace('.txt', '_randomRMSE.txt')
    df_inv.to_csv(out_txt, sep='\t')
Exemple #15
def main(txt, n_sample, out_txt, bins, train_params, by_psu=True, extract_predictors=True):
    
    n_sample = int(n_sample) 
    bins = parse_bins(bins)
    
    df = pd.read_csv(txt, sep='\t', dtype={'tile_id': object})
    sample = pd.DataFrame(columns=df.columns)
    n_bins = len(bins)
    psu_ids = df.tile_id.unique()
    
    train_inputs, _ = stem.read_params(train_params)
    for var in train_inputs:
        exec ("{0} = str({1})").format(var, train_inputs[var])
    tiles = attributes_to_df(MOSAIC_SHP)
    
    if extract_predictors:
        var_info = pd.read_csv(var_info, sep='\t', index_col='var_name')
        for i, tile in enumerate(psu_ids):
            print("extracting %s of %s" % (i, len(psu_ids)))
            sample_mask = df.tile_id == tile
            this_sample = df.loc[sample_mask]
            tile_ul = tiles.loc[tiles['name'] == tile, ['xmin', 'ymax']].values[0]
            #point_dict = get_point_dict(df, psu_ids)
            mosaic_tx, extent = stem.tx_from_shp(MOSAIC_SHP, 30, -30)
            
            row_off, col_off = stem.calc_offset([mosaic_tx[0], mosaic_tx[3]], tile_ul, mosaic_tx)
            this_sample['local_row'] = this_sample.row - row_off
            this_sample['local_col'] = this_sample.col - col_off
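            # Subtracting the tile's offset from the mosaic-wide row/col gives
            # row/col within this tile's raster, used to index the extracted array.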
    
            for var_name, var_row in var_info.iterrows():
                #tiles = pd.DataFrame({'tile_id': psu_ids, 'tile_str': psu_ids})
                file_path = stem.find_file(var_row.basepath, var_row.search_str, tile)
                ds = gdal.Open(file_path)
                ar = ds.GetRasterBand(var_row.data_band).ReadAsArray()
                try:
                    if len(this_sample) == ar.size:
                        df.loc[sample_mask, var_name] = ar.ravel()
                    else:
                        df.loc[sample_mask, var_name] = ar[this_sample.local_row, this_sample.local_col]
                except Exception as e:
                    print(e)
                    import pdb; pdb.set_trace()
                ds = None
        df.to_csv(txt.replace('.txt', '_predictors.txt'))
    #df[var_name], _ = extract.extract_var('', var_name, var_row.by_tile, var_row.data_band, var_row.data_type, tiles, df, point_dict, var_row.basepath, var_row.search_str, var_row.path_filter, mosaic_tx, 0, 0, silent=True)
                
    if by_psu: 
        
        n_per_psu = n_sample/len(psu_ids)
        n_per_bin = n_per_psu/n_bins
        
        for i, pid in enumerate(psu_ids):
            psu_pixels = df.loc[df.tile_id == pid]
            print("Sampling for %s of %s PSUs" % (i + 1, len(psu_ids)))
            for l, u in bins:
                this_bin = psu_pixels.loc[(l < psu_pixels.value) & (psu_pixels.value <= u)]
                if len(this_bin) > 0:
                    bin_sample_size = min(n_per_bin, len(this_bin))
                    sample = pd.concat([sample, this_bin.sample(bin_sample_size)])
                    print("Sampled %s for bin %s-%s" % (n_per_bin, l, u))
                else:
                    print("No pixels between %s and %s found" % (l, u))
            print("")
    
    else:
        n_per_bin = n_sample/n_bins
        for l, u in bins:
            this_bin = df.loc[(l < df.value) & (df.value <= u)]
            sample = pd.concat([sample, this_bin.sample(min(n_per_bin, len(this_bin)))])
    
    sample.to_csv(out_txt, index=False)
    
    print 'Sample written to ', out_txt
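
# A minimal sketch (not part of the original example) of the per-bin sampling
# idea used above, assuming a DataFrame with a 'value' column and a list of
# (lower, upper] bin edges:
#
#   import pandas as pd
#   def stratified_sample(df, bins, n_per_bin):
#       pieces = []
#       for lower, upper in bins:
#           this_bin = df[(df.value > lower) & (df.value <= upper)]
#           pieces.append(this_bin.sample(min(n_per_bin, len(this_bin))))
#       return pd.concat(pieces)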
def main(params,
         snap_coord=None,
         resolution=30,
         n_sizes=5,
         max_features=None,
         n_jobs=1):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])

    try:
        sets_per_cell = int(sets_per_cell)
        cell_size = [int(s) for s in cell_size.split(',')]
        min_size = int(min_size)
        max_size = int(max_size)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb
        pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' %
                        (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()
    if 'n_jobs' in inputs:
        n_jobs = int(n_jobs)

    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values(
    ))  # Make sure predict_cols and df_var are in the same order

    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]

    t1 = time.time()
    if model_type.lower() == 'classifier':
        model_func = stem.fit_tree_classifier
    else:
        model_func = stem.fit_tree_regressor

    # Make grid
    x_res = resolution
    y_res = -resolution
    tx, extent = stem.tx_from_shp(mosaic_path,
                                  x_res,
                                  y_res,
                                  snap_coord=snap_coord)
    min_x, max_x, min_y, max_y = [int(i) for i in extent]
    cells = stem.generate_gsrd_grid(cell_size, min_x, min_y, max_x, max_y,
                                    x_res, y_res)
    grid = pd.DataFrame(cells, columns=['ul_x', 'ul_y', 'lr_x', 'lr_y'])
    grid.to_csv(out_txt.replace('.txt', '_grid.txt'))
    #import pdb; pdb.set_trace()
    grid = intersecting_cells(grid, mosaic_path)
    stem.coords_to_shp(grid, '/vol/v2/stem/extent_shp/CAORWA.shp',
                       out_txt.replace('.txt', '_grid.shp'))

    if 'set_sizes' in inputs:
        set_sizes = np.sort([int(s) for s in set_sizes.split(',')])
    else:
        if 'n_sizes' in inputs:
            n_sizes = int(n_sizes)
        set_sizes = np.arange(min_size, max_size + 1,
                              (max_size - min_size) / n_sizes)

    # Sample grid
    dfs = []
    for i, cell in grid.iterrows():
        ul_x, ul_y, lr_x, lr_y = cell
        min_x, max_x = min(ul_x, lr_x), max(ul_x, lr_x)
        min_y, max_y = min(ul_y, lr_y), max(ul_y, lr_y)

        # Calculate support set centers
        x_centers = [
            int(stem.snap_coordinate(x, snap_coord[0], x_res))
            for x in random.sample(xrange(min_x, max_x + 1), sets_per_cell)
        ]
        y_centers = [
            int(stem.snap_coordinate(y, snap_coord[1], y_res))
            for y in random.sample(xrange(min_y, max_y + 1), sets_per_cell)
        ]
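        # Support-set centers are drawn at random within the cell, then snapped
        # to the resolution grid so every candidate set aligns with the mosaic.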

        for size in set_sizes:
            df = stem.sample_gsrd_cell(sets_per_cell,
                                       cell,
                                       size,
                                       size,
                                       x_res,
                                       y_res,
                                       tx,
                                       snap_coord,
                                       center_coords=(zip(
                                           x_centers, y_centers)))
            df['set_size'] = size
            df['cell_id'] = i
            dfs.append(df)

    support_sets = pd.concat(dfs, ignore_index=True)
    n_sets = len(support_sets)
    #import pdb; pdb.set_trace()
    print 'Testing set sizes with %s jobs...\n' % n_jobs
    oob_metrics = _par_train_estimator(n_jobs, n_sets, df_train, predict_cols,
                                       target_col, support_sets, model_func,
                                       model_type, max_features,
                                       max_target_val)
    '''args = [[i, n_sets, start_time, df_train, predict_cols, target_col, support_set, model_func, model_type, max_features, max_target_val] for i, (si, support_set) in enumerate(support_sets.ix[:100].iterrows())]
    oob_metrics = []
    for arg in args:
        oob_metrics.append(par_train_estimator(arg))'''

    oob_metrics = pd.DataFrame(oob_metrics)
    oob_metrics.set_index('set_id', inplace=True)
    support_sets = pd.merge(support_sets,
                            oob_metrics,
                            left_index=True,
                            right_index=True)
    #import pdb; pdb.set_trace()
    support_sets.to_csv(out_txt)
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])    
    df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \
    'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, file_stamp + '.db')
    try:
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#'''
    except:
        set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    
    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)#'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                mosaic_geom.AddGeometry(feature.GetGeometryRef())
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom)
        xsize = int((max_x - min_x)/resolution)
        ysize = int((max_y - min_y)/resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"','')
            snap_coord = [float(c) for c in snap_coord.split(',')]#'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path) # Change to accept arbitrary geometry
        
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')
        
    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)
    
    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(model_dir, 'temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)
    
    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)]
            tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)]
        tiles.set_index(index_field, inplace=True)
    
    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res) 
                    for i, (xmin, xmax) in tiles[['xmin','xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res) 
                    for i, (ymin, ymax) in tiles[['ymin','ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax)
                    in tiles[['ul_x', 'xmin','xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin for i, (uly, ymin, ymax) 
                    in tiles[['ul_y', 'ymin','ymax']].iterrows()]
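    # Whichever of xmin/xmax was chosen as the upper-left x becomes ul_x and the
    # other becomes lr_x (likewise for y), so the corners respect the sign of
    # the pixel size.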
    
    support_nrows = int(support_size[0]/abs(y_res))
    support_ncols = int(support_size[1]/abs(x_res))
    t1 = time.time()
    args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())]    
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())]
    
    if n_jobs > 1:
        print 'Predicting with %s jobs...\n' % n_jobs
        pool = Pool(n_jobs)
        pool.map(stem.predict_tile, args, 1)
        pool.close()
        pool.join()
    else:
        print 'Predicting with 1 job ...\n'
        for arg in args:
            stem.predict_tile(*arg)
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1)/3600)
    
    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
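    # Stitch the temporary per-tile rasters for each aggregate stat into a single full-extent array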
    for stat in agg_stats:
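        # stdv is written as int16 with a -9999 nodata value; the other stats are stored as uint8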
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) 
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8)
        
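        # Drop each tile into the full array at the row/column offset of its upper-left corner,
        # clipping at the array edges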
        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off : row_off + tile_rows, col_off : col_off + tile_cols] = ar_tile
            except Exception as e:
                raise RuntimeError('Could not place tile %s in the output array: %s'
                                   % (tile_coords[tile_id_field], e))
        
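        # Write the stitched array for this stat out as a GeoTIFF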
        out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata)
    
    # Clean up the tiles
    shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60)
    
    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
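        # Per-variable importance columns already exist in df_sets; take the max per set
        # and the mean across sets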
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']

                
        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user','producer']
            mean_kap = df_m.ix['user', 'kappa']


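        # Record the evaluation scores in the model inventory, if one was given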
        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...'
    
    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)