Example 1
0
def get_zone_inds(ar_size, zone_size, tx, feat):
    '''
    Return the array offset indices for pixels overlapping a feature from a 
    vector dataset. Array indices are returned as (upper_row, lower_row, left_col, right_col)
    to be used to index an array as [upper_row : lower_row, left_col : right_col]
    '''
    geom = feat.GetGeometryRef()
    x1, x2, y1, y2 = geom.GetEnvelope()
    
    # Get the feature ul x and y, and calculate the pixel offset
    ar_ulx, x_res, x_rot, ar_uly, y_rot, y_res = tx
    x_sign = x_res/abs(x_res)
    y_sign = y_res/abs(y_res)
    f_ulx = min([x0/x_sign for x0 in [x1, x2]])/x_sign
    f_uly = min([y0/y_sign for y0 in [y1, y2]])/y_sign
    offset = stem.calc_offset((ar_ulx, ar_uly), (f_ulx, f_uly), tx)

    # Get the inds for the overlapping portions of each array
    a_inds, m_inds = mosaic.get_offset_array_indices(ar_size, zone_size, offset)
    
    return a_inds, m_inds
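
# A hedged, NumPy-only sketch (not part of the original modules) of the
# offset-indexing idea that mosaic.get_offset_array_indices presumably
# implements for get_zone_inds above. The helper name and example values are
# hypothetical; it only illustrates how the returned index tuples are used.
import numpy as np

def _overlap_indices(ar_size, zone_size, offset):
    '''Return (upper_row, lower_row, left_col, right_col) tuples for the
    overlapping portions of a full array and a smaller zone array, where
    offset = (row_off, col_off) of the zone's upper-left corner within the
    full array. Negative offsets mean the zone starts above/left of the array.'''
    ar_rows, ar_cols = ar_size
    zone_rows, zone_cols = zone_size
    row_off, col_off = offset
    # Clip the zone window to the bounds of the full array
    a_ulr, a_ulc = max(row_off, 0), max(col_off, 0)
    a_lrr = min(row_off + zone_rows, ar_rows)
    a_lrc = min(col_off + zone_cols, ar_cols)
    # Corresponding window within the zone array
    z_ulr, z_ulc = a_ulr - row_off, a_ulc - col_off
    z_lrr, z_lrc = a_lrr - row_off, a_lrc - col_off
    return (a_ulr, a_lrr, a_ulc, a_lrc), (z_ulr, z_lrr, z_ulc, z_lrc)

# Usage mirrors the docstring above: the two tuples slice the two arrays so
# full[a0:a1, a2:a3] and zone[z0:z1, z2:z3] line up pixel for pixel.
full = np.zeros((10, 10))
zone = np.ones((4, 4))
a_inds, z_inds = _overlap_indices(full.shape, zone.shape, (-2, 8))
full[a_inds[0]:a_inds[1], a_inds[2]:a_inds[3]] = \
    zone[z_inds[0]:z_inds[1], z_inds[2]:z_inds[3]]
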
Example 2
0
def main(params, inventory_txt=None, constant_vars=None):
    
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])    
    df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    # If constants were given, make a dict and make sure they match the training
    #  constants
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        pred_constants = sorted(constant_vars.keys())
        train_constants = [i.replace(' ', '') for i in train_inputs['constant_vars'].strip('"').split(',')]
        train_constants = sorted(train_constants)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if 'constant_vars' in inputs: 
        unmatched_vars += [v for v in pred_constants if v not in train_constants]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in train_constants if v not in pred_constants]
        pred_vars += pred_constants # Add here because it would screw with stuff upstream
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \
    'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)
    
    t0 = time.time()
    if 'n_jobs' in inputs:

        # Predict in parallel
        n_jobs = int(n_jobs)
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of TSA arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            
            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic_ds
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                            xsize, ysize, nodata=nodata)
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            dtype_code = mosaic_ds.GetRasterBand(1).DataType
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster, stem.get_gdal_dtype(dtype_code), silent=True)
            
            # Build list of args to pass to the Pool
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            ds = gdal.Open(tsa_raster)
            tsa_tx = ds.GetGeoTransform()
            ds = None
            tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tsa_tx[0], tsa_tx[3]), tsa_tx)
            args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords, 
                         mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, 
                         constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem.par_predict, args, 1)
            
    
    else:
        # Loop through each set and generate predictions
        for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, 
                                     mosaic_tx, xsize, ysize, dt_model, nodata,
                                     np.int16, constant_vars)        
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    
        #mosaic_ds = None                  
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''
    
    #Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    ar_vote, pct_importance, df_sets = stem.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')'''
    mosaic_ds = None
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''
    
    '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp))
    ar_vote = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp))
    ar_mean = ds.ReadAsArray()
    ds = None#'''
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')
        
        print '\nComputing confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        print confusion_params
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e
        
        '''print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e#'''
        
        vote_acc = df_v.ix['producer', 'user']
        vote_kap = df_v.ix['producer', 'kappa']
        #mean_acc = df_m.ix['user','producer']
        #mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        #print 'Mean accuracy .............. ', mean_acc
        #print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''
    
    print '\nTotal prediction runtime: %.1f\n' % ((time.time() - t0)/60)
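
# A hedged sketch (hypothetical helper, not stem's code) of the geotransform
# arithmetic that stem.calc_offset appears to perform throughout these examples:
# converting the upper-left map coordinate of a sub-window into row/column
# offsets within the mosaic. Rotation terms are assumed to be zero.
def _calc_offset(mosaic_ul, window_ul, tx):
    '''mosaic_ul and window_ul are (x, y) map coordinates; tx is a GDAL-style
    geotransform (ulx, x_res, x_rot, uly, y_rot, y_res).'''
    col_off = int(round((window_ul[0] - mosaic_ul[0]) / tx[1]))
    row_off = int(round((window_ul[1] - mosaic_ul[1]) / tx[5]))
    return row_off, col_off

# e.g. a window 300 m east and 600 m south of a 30 m mosaic origin:
# _calc_offset((0, 0), (300, -600), (0, 30, 0, 0, 0, -30)) -> (20, 10)
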
Example 3
0
def main(params,
         inventory_txt=None,
         constant_vars=None,
         mosaic_shp=None,
         resolution=30,
         n_jobs=0,
         n_jobs_agg=0,
         mosaic_nodata=0,
         snap_coord=None,
         overwrite_tiles=False,
         tile_id_field='name'):
    inputs = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  #sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = pd.read_csv(train_inputs['var_info'].replace('"', ''),
                             sep='\t',
                             index_col='var_name')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  #Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    # Params are read in as strings; only the default is a bool
    if isinstance(overwrite_tiles, str) and overwrite_tiles.lower() == 'false':
        overwrite_tiles = False

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, os.path.basename(model_dir) + '.db')
    if os.path.exists(db_path):
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets',
                                        con,
                                        index_col='set_id')  #'''
    else:
        set_txt = stem.find_file(model_dir, '*support_sets.txt')
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace(
                '.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(
                clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)  #'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path, 1)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            i = 0
            for feature in mosaic_ds:
                g = feature.GetGeometryRef()
                # Check that the feature is valid. Clipping can produce a feature
                #  w/ an area of 0
                if g.GetArea() > 1:
                    mosaic_geom.AddGeometry(g)
                else:
                    fid = feature.GetFID()
                    feature.Destroy()
                    mosaic_ds.DeleteFeature(fid)
            #import pdb; pdb.set_trace()
            df_sets = stem.get_overlapping_sets(df_sets,
                                                mosaic_geom.UnionCascaded())
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]  #'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path,
                                             x_res,
                                             y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(
            mosaic_path)  # Change to accept arbitrary geometry

    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(out_dir, '_temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)

    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats,
                                  index=tiles[tile_id_field])
        for stat in agg_stats:
            pattern = re.compile('tile_\d+_%s.tif' % stat)
            stat_match = [f.split('_')[1] for f in files if pattern.match(f)]
            try:
                tile_files[stat] = pd.Series(np.ones(len(stat_match)),
                                             index=stat_match)
            except:
                pass  #import pdb; pdb.set_trace()
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)
        tiles.set_index(index_field, inplace=True)  #'''
    tiles['ul_x'] = [
        stem.get_ul_coord(xmin, xmax, x_res)
        for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()
    ]
    tiles['ul_y'] = [
        stem.get_ul_coord(ymin, ymax, y_res)
        for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()
    ]
    tiles['lr_x'] = [
        xmax if ulx == xmin else xmin
        for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmax']].iterrows()
    ]
    tiles['lr_y'] = [
        ymax if uly == ymin else ymin
        for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymax']].iterrows()
    ]

    support_nrows = int(support_size[0] / abs(y_res))
    support_ncols = int(support_size[1] / abs(x_res))
    t1 = time.time()

    # Patch for unknown Landcover screwup
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind,
                    tile_info) in enumerate(tiles.loc[tiles['name'].isin([
                        '1931', '2810', '0705', '0954', '2814', '1986', '2552',
                        '2019', '2355', '3354', '2278', '2559'
                    ])].iterrows())]

    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[
                tiles['name'].isin(['0705'])].iterrows())]

    # Patch for the GEE subset 2 outside-of-buffer 'slice'
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(['0639','0718','0797','0876','0955','1034'])].iterrows())]

    # Original line
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tile_files.isnull().any(axis=1).values].iterrows())]

    limits = []

    for arg in args:
        print arg[3][tile_id_field]
        limits.append(stem.par_predict_tile(arg))  #'''

    ###

    return
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % (
        (time.time() - t1) / 3600)

    try:
        limits = pd.concat(limits)
    except:
        # They're all None
        pass

    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        #dtype = mosaic.get_min_numpy_dtype(limits[stat])
        dtype = np.int16
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)  #dtype)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=dtype)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(
                tile_dir,
                'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            try:
                ds = gdal.Open(tile_file)
            except:
                print 'Tile not found'
                continue
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off:row_off + tile_rows,
                   col_off:col_off + tile_cols] = ar_tile
            except Exception as e:
                pass  #import pdb; pdb.set_trace()

        out_path = os.path.join(out_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar,
                               mosaic_tx,
                               prj,
                               driver,
                               out_path,
                               gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    #shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values,
                                              axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Save the importance values
    importance = pd.DataFrame({
        'variable': pred_vars,
        'pct_importance': pct_importance,
        'index': range(len(pred_vars))
    })
    importance.set_index('index', inplace=True)
    importance['rank'] = [
        int(r) for r in importance.pct_importance.rank(method='first',
                                                       ascending=False)
    ]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')  #'''

    print '\nTotal prediction runtime: %.1f hours\n' % (
        (time.time() - t0) / 3600)
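
# A hedged sketch (toy values, hypothetical helper) of what stem.get_ul_coord
# appears to do in the tile-corner code above: pick the upper-left coordinate
# from a min/max pair using the sign of the resolution, the lower-right
# coordinate then being whichever extreme was not chosen.
def _get_ul_coord(cmin, cmax, res):
    # positive resolution (x) -> UL is the minimum; negative (y) -> the maximum
    return cmin if res > 0 else cmax

ul_x = _get_ul_coord(0, 300, 30)       # 0   (xmin)
ul_y = _get_ul_coord(-300, 0, -30)     # 0   (ymax)
lr_x = 300 if ul_x == 0 else 0         # opposite corner from ul_x
lr_y = -300 if ul_y == 0 else 0        # opposite corner from ul_y
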
Example 4
0
def main(txt, n_sample, out_txt, bins, train_params, by_psu=True, extract_predictors=True):
    
    n_sample = int(n_sample) 
    bins = parse_bins(bins)
    
    df = pd.read_csv(txt, sep='\t', dtype={'tile_id': object})
    sample = pd.DataFrame(columns=df.columns)
    n_bins = len(bins)
    psu_ids = df.tile_id.unique()
    
    train_params = stem.read_params(train_params)
    for var in train_params:
        exec ("{0} = str({1})").format(var, train_params[var])
    tiles = attributes_to_df(MOSAIC_SHP)
    
    if extract_predictors:
        var_info = pd.read_csv(var_info, sep='\t', index_col='var_name')
        for i, tile in enumerate(psu_ids):
            print("extracting %s of %s" % (i, len(psu_ids)))
            sample_mask = df.tile_id == tile
            this_sample = df.loc[sample_mask]
            tile_ul = tiles.loc[tiles['name'] == tile, ['xmin', 'ymax']].values[0]
            #point_dict = get_point_dict(df, psu_ids)
            mosaic_tx, extent = stem.tx_from_shp(MOSAIC_SHP, 30, -30)
            
            row_off, col_off = stem.calc_offset([mosaic_tx[0], mosaic_tx[3]], tile_ul, mosaic_tx)
            this_sample['local_row'] = this_sample.row - row_off
            this_sample['local_col'] = this_sample.col - col_off
    
            for var_name, var_row in var_info.iterrows():
                #tiles = pd.DataFrame({'tile_id': psu_ids, 'tile_str': psu_ids})
                file_path = stem.find_file(var_row.basepath, var_row.search_str, tile)
                ds = gdal.Open(file_path)
                ar = ds.GetRasterBand(var_row.data_band).ReadAsArray()
                try:
                    if len(this_sample) == ar.size:
                        df.loc[sample_mask, var_name] = ar.ravel()
                    else:
                        df.loc[sample_mask, var_name] = ar[this_sample.local_row, this_sample.local_col]
                except Exception as e:
                    print(e)
                    import pdb; pdb.set_trace()
                ds = None
        df.to_csv(txt.replace('.txt', '_predictors.txt'))
    #df[var_name], _ = extract.extract_var('', var_name, var_row.by_tile, var_row.data_band, var_row.data_type, tiles, df, point_dict, var_row.basepath, var_row.search_str, var_row.path_filter, mosaic_tx, 0, 0, silent=True)
                
    if by_psu: 
        
        n_per_psu = n_sample/len(psu_ids)
        n_per_bin = n_per_psu/n_bins
        
        for i, pid in enumerate(psu_ids):
            psu_pixels = df.loc[df.tile_id == pid]
            print("Sampling for %s of %s PSUs" % (i + 1, len(psu_ids)))
            for l, u in bins:
                this_bin = psu_pixels.loc[(l < psu_pixels.value) & (psu_pixels.value <= u)]
                if len(this_bin) > 0:
                    bin_sample_size = min(n_per_bin, len(this_bin))
                    sample = pd.concat([sample, this_bin.sample(bin_sample_size)])
                    print("Sampled %s for bin %s-%s" % (n_per_bin, l, u))
                else:
                    print("No pixels between %s and %s found" % (l, u))
            print("")
    
    else:
        n_per_bin = n_sample/n_bins
        for l, u in bins:
            sample = pd.concat([sample, df.sample(n_per_bin)])
    
    sample.to_csv(out_txt, index=False)
    
    print 'Sample written to ', out_txt
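
# A hedged sketch (toy data) of the per-bin sampling loop above: each
# (lower, upper] bin contributes up to n_per_bin rows, fewer when the bin
# holds fewer pixels than requested.
import pandas as pd

toy = pd.DataFrame({'value': range(100)})
toy_bins = [(0, 25), (25, 50), (50, 99)]
n_per_bin = 10
toy_sample = pd.DataFrame(columns=toy.columns)
for l, u in toy_bins:
    this_bin = toy.loc[(l < toy.value) & (toy.value <= u)]
    if len(this_bin) > 0:
        bin_sample_size = min(n_per_bin, len(this_bin))
        toy_sample = pd.concat([toy_sample, this_bin.sample(bin_sample_size)])
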
Example 5
0
def main(raster, nodata, psu_shp, out_dir):

    nodata = int(nodata)
    psus = attributes_to_df(psu_shp)
    ds = gdal.Open(raster)
    ar = ds.GetVirtualMemArray()  #ReadAsArray()
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')

    # Just extract the test sample first
    test_sample_dfs = []
    print '\nGetting test samples for PSUs...'
    for i, psu in psus.iterrows():
        # Calc offsets
        row_off, col_off = calc_offset((tx[0], tx[3]), psu[['ul_x', 'ul_y']],
                                       tx)
        n_rows = abs(int((psu.ymax - psu.ymin) / tx[5]))
        n_cols = abs(int((psu.xmax - psu.xmin) / tx[1]))

        # Get values
        test_data = ar[row_off:row_off + n_rows,
                       col_off:col_off + n_cols].ravel()
        mask = test_data != nodata

        # Get row/col and x/y vals
        test_data = test_data[mask]
        row_inds, col_inds = np.indices((n_rows, n_cols), dtype=np.uint32)
        row_inds = row_inds.ravel()[mask]
        col_inds = col_inds.ravel()[mask]
        these_row_inds = row_inds + row_off
        these_col_inds = col_inds + col_off
        y_coords = row_inds * tx[5] + psu.ul_y
        x_coords = col_inds * tx[1] + psu.ul_x

        df = pd.DataFrame({
            'row': these_row_inds,
            'col': these_col_inds,
            'y': y_coords,
            'x': x_coords,
            'value': test_data,
            'tile_id': psu['name']
        })
        #import pdb; pdb.set_trace()
        test_sample_dfs.append(df)

    test_sample = pd.concat(test_sample_dfs, ignore_index=True)
    basename = os.path.basename(raster)
    out_txt = os.path.join(out_dir, basename.replace(basename[-4:],
                                                     '_test.txt'))
    test_sample.to_csv(out_txt, sep='\t', index=False)

    # Read the raster as a write-able array and set all test samples to nodata
    print '\nAssigning nodata val to PSUs in training raster...\n'
    ar = ds.ReadAsArray()
    ar[test_sample.row, test_sample.col] = nodata
    out_raster = out_txt.replace('_test.txt', '_train.tif')
    array_to_raster(ar, tx, prj, driver, out_raster, nodata=nodata)

    desc = 'Training raster and test sample (text file with the same name but "_test" at the end) for making and evaluating STEM CONUS maps. Primary sampling units (PSUs) reserved for testing are assigned nodata.'
    desc += '\n\tInput raster: %s' % os.path.abspath(raster)
    desc += '\n\tNodata value: %s' % nodata
    desc += '\n\tPSU shapefile: %s' % os.path.abspath(psu_shp)
    desc += '\n\tOutput directory: %s\n' % os.path.abspath(out_dir)
    createMetadata(sys.argv, out_raster, description=desc)  #'''

    ds = None
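
# A hedged sketch (toy window) of the index bookkeeping in the PSU loop above:
# np.indices plus a nodata mask recovers the row/col position of every valid
# pixel inside an extracted window.
import numpy as np

window = np.array([[5, -99],
                   [7, 9]])
nodata_val = -99
valid = window.ravel() != nodata_val
rows, cols = np.indices(window.shape)
rows = rows.ravel()[valid]          # [0, 1, 1]
cols = cols.ravel()[valid]          # [0, 0, 1]
values = window.ravel()[valid]      # [5, 7, 9]
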
Example 6
0
def main(model_dir, n_tiles, **kwargs):

    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]
    if not os.path.isdir(model_dir):
        message = 'model directory given does not exist or is not a directory: %s' % model_dir
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir,
                                        'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir,
                                         'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(variable_id, variable_name)
                     for variable_name, variable_id in kwargs.items()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)
    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    #some_set = df_sets.iloc[0]
    support_size = [
        int(s)
        for s in train_params['support_size'].replace('"', '').split(',')
    ]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):

        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1,
                                                           len(variables))

        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)

        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge where the
            #   tile size will be slightly different
            this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x -
                                                          t_row.ul_x)
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row,
                                                      this_size, support_size)

            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]

            print n_sets, ' Overlapping sets'
            importance_bands = []

            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():

                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y),
                                          mask_tx)
                #if abs(offset[0]) > this_size[0] or abs(offset[1] > this_size[1]):

                tile_inds, a_inds = mosaic.get_offset_array_indices(
                    tile_size, set_size, offset)

                # Get feature with maximum importance and fill tile with that val
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(
                        dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    #filled = np.full((nrows, ncols), importance_value, dtype=np.uint8)
                    #import_band = stem.fill_tile_band(this_size, filled, tile_inds, nodata)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1],
                                tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue  #'''

            print 'Average importance for this tile: %.1f' % np.mean(
                importance_values)
            #Aggregate
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
            tile_mask = mask[rc.ul_r:rc.lr_r,
                             rc.ul_c:rc.lr_c] | np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r,
               rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % (
                (time.time() - t2) / 60)
            '''temp_dir = os.path.join(out_dir, 'delete')
            if not os.path.isdir(temp_dir):
                os.mkdir(temp_dir)
            t_tx = tile_ul[0], 30, 0, tile_ul[1], 0, -30
            array_to_raster(np.round(importance_tile).astype(np.uint8), t_tx, prj, gdal.GetDriverByName('gtiff'), os.path.join(temp_dir, 'delete_%s.tif' % t_ind), gdal.GDT_Byte, 255, True)'''
        out_path = os.path.join(out_dir,
                                '%s_importance_%s.tif' % (model, v_name))
        try:
            array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'),
                            out_path, gdal.GDT_Byte, nodata)
        except Exception as e:
            print e
            import pdb
            pdb.set_trace()
        print 'Time for this variable: %.1f minutes\n' % (
            (time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % (len(variables), (
        (time.time() - t0) / 3600))
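
# A hedged sketch (toy shapes) of the aggregation step above: overlapping
# per-set importance bands are stacked with NaN padding and reduced with
# np.nanmean, so each pixel averages only the sets that actually cover it.
import numpy as np

tile_shape = (5, 5)
bands = []
for value, (r0, r1, c0, c1) in [(40, (0, 3, 0, 5)), (60, (2, 5, 0, 5))]:
    band = np.full(tile_shape, np.nan, dtype=np.float32)
    band[r0:r1, c0:c1] = value
    bands.append(band)
stack = np.dstack(bands)
mean_tile = np.nanmean(stack, axis=2)   # rows covered by both sets average to 50
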
Example 7
0
def main(params, pct_train=None, aggregate_presence=False):
    t0 = time.time()

    # Read params. Make variables from each line of the 1-line variables
    inputs, df_vars = stem.read_params(params)
    for var in inputs:
        exec("{0} = str({1})").format(var, inputs[var])
    try:
        if 'years' in inputs:
            years = np.array([int(yr) for yr in years.split(',')])
        else:
            year_start = int(year_start)
            year_end = int(year_end)
            years = np.arange(year_start, year_end + 1)
        '''tsa_mosaic = inputs['tsa_mosaic']
        search_dir = inputs['search_dir']
        search_str = inputs['search_str']
        obs_txt = inputs['obs_txt']
        index_col = inputs['index_col']
        year_col = inputs['year_col']
        target_col = inputs['target_col']
        out_txt = inputs['out_txt']'''
        add_file_tag = int(add_file_tag)
        #count_type = inputs['count_type']

    except KeyError as e:
        missing_var = str(e).split("'")[1]
        if missing_var in ['year_start', 'year_end', 'years']:
            msg = ('No list of years or year_start/year_end specified in' +\
            ' param file:\n%s\n. Re-run script with either of these' +\
            ' parameters given.') % params
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    out_dir, original_bn = os.path.split(out_txt)
    # Add informative tags to output dir and basename
    if add_file_tag:
        res = years[1] - years[0]
        #out_dir = os.path.basename(out_dir)
        now = datetime.datetime.now()
        date_str = str(now.date()).replace('-', '')
        time_str = str(now.time()).replace(':', '')[:4]
        out_dirname = '{0}_res{1}yr_{2}_{3}'.format(target_col, res, date_str,
                                                    time_str)
        out_dir = os.path.join(out_dir, out_dirname)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        out_bn = '{0}_{1}'.format(
            os.path.basename(obs_txt).replace('.txt', ''), original_bn)
        out_txt = os.path.join(out_dir, out_bn)

    if not os.path.exists(os.path.join(out_dir,
                                       os.path.basename(params))):
        print 'Copying params to output dir: %s\n' % out_dir
        shutil.copy2(params, out_dir)

    print 'Getting predictors... '
    t1 = time.time()
    df_obs = pd.read_csv(obs_txt, sep='\t', index_col=index_col)
    original_columns = df_obs.columns
    df = get_predictors(years, search_dir, search_str, df_obs, index_col,
                        year_col, df_vars)
    print '%.1f seconds\n' % (time.time() - t1)

    # Select count type and date range
    if 'count_type' in inputs:
        count_type = [t.strip() for t in count_type.split(',')]
        df = df[df.COUNT_TYPE.isin(count_type)]
        #df.drop(['COUNT_TYPE'], axis=1, inplace=True)
        if 'P21' in count_type:
            df = df[df.EFFORT_DISTANCE_KM < .1]
    if 'day_minmax' in inputs:
        day_min, day_max = [int(d) for d in day_minmax.split(',')]
        df = df[(df.DAY >= day_min) & (df.DAY <= day_max)]
    if 'time_minmax' in inputs:
        time_min, time_max = [int(t) for t in time_minmax.split(',')]
        df = df[(df.TIME >= time_min) & (df.TIME <= time_max)]
    if 'max_effort_time' in inputs:
        max_effort_time = int(max_effort_time)
        df = df[df.EFFORT_HRS < max_effort_time]
    if 'max_effort_dist' in inputs:
        max_effort_dist = int(max_effort_dist)
        df = df[df.EFFORT_DISTANCE_KM < max_effort_dist]

    #df = df[(df.YEAR >= min(years)) & (df.YEAR <= max(years))]
    #df[target_col] *= 100 # To be able to keep stuff as 8 bit ints

    # Calc row and col from x, y
    ds = gdal.Open(tsa_mosaic)
    tx = ds.GetGeoTransform()
    ds = None
    ul_xy = tx[0], tx[3]
    df['row'], df['col'] = zip(*[
        stem.calc_offset(ul_xy, xy, tx) for i, xy in df[['x', 'y']].iterrows()
    ])

    if 'kernel_dist' in inputs:
        t1 = time.time()
        print 'Calculating kernel density...'
        kernel_dist = int(kernel_dist)
        for yr in df.YEAR.unique():
            yr_mask = df.YEAR == yr
            df_w = gaussain_weights(df[yr_mask], target_col, kernel_dist)
            df.ix[yr_mask, target_col] = df_w.weighted
        '''
        df_w = gaussain_weights(df, target_col, kernel_dist)
        df[target_col] = df_w.weighted
        #df = df.drop_duplicates(subset=[target_col, 'row', 'col'])'''
        print '%.1f seconds\n' % (time.time() - t1)  #"""

    if aggregate_presence:
        t1 = time.time()
        print 'Aggregating presence records...'
        df.ix[df[target_col] > 0, target_col] = 1
        for yr in df.YEAR.unique():
            yr_mask = df.YEAR == yr
            df_yr = df[yr_mask]
            # Get unique locations for this year
            unique = df_yr[['row', 'col']].drop_duplicates().values
            for row, col in unique:
                this_loc = df_yr[(df_yr.row == row) & (df_yr.col == col)]
                #If there are ones and 0s, drop the 0s
                if this_loc[target_col].min() == 0 and this_loc[target_col].max() == 1:
                    df.drop(this_loc[this_loc[target_col] == 0].index,
                            inplace=True)
        print '%.1f seconds\n' % (time.time() - t1)

    if pct_train:
        print 'Splitting training and test sets...'
        pct_train = float(pct_train)
        #n_test = int(len(df) * (1 - pct_train))
        unique = df[['row', 'col']].drop_duplicates().values
        n_test = int(len(unique) * (1 - pct_train))
        random_idx = random.sample(xrange(len(unique)), n_test)
        random_row, random_col = zip(*unique[random_idx])
        df_test = df[df.row.isin(random_row) & df.col.isin(random_col)]
        test_idx = df_test.index
        test_txt = out_txt.replace('.txt', '_test.txt')
        df = df[~df.index.isin(test_idx)]
        df_test.to_csv(test_txt, sep='\t')

    df.to_csv(out_txt, sep='\t')
    obs_out_txt = out_txt.replace('_' + original_bn[:-4], '')
    df[original_columns].to_csv(obs_out_txt, sep='\t')

    print '\nLength of output df:', len(df)
    print 'Text file written to: ', out_txt
    print '\nTotal time: %.1f minutes' % ((time.time() - t0) / 60)
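
# A hedged sketch (toy data) of the location-level train/test split used above:
# unique (row, col) pairs are sampled, so repeat observations at one location
# never straddle the split. Note that pairing row and col together, as done
# here, is slightly stricter than the separate row.isin()/col.isin() filters
# above, which can also match unsampled row/col combinations.
import random
import pandas as pd

toy = pd.DataFrame({'row': [1, 1, 2, 3, 3, 4],
                    'col': [5, 5, 6, 7, 7, 8],
                    'value': range(6)})
unique_locs = toy[['row', 'col']].drop_duplicates().values
n_test = int(len(unique_locs) * (1 - 0.75))            # pct_train = 0.75
test_locs = {tuple(unique_locs[i])
             for i in random.sample(range(len(unique_locs)), n_test)}
is_test = pd.Series([tuple(rc) in test_locs for rc in toy[['row', 'col']].values],
                    index=toy.index)
toy_test, toy_train = toy[is_test], toy[~is_test]
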
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])    
    df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \
    'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, file_stamp + '.db')
    try:
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#'''
    except:
        set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    
    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)#'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                mosaic_geom.AddGeometry(feature.GetGeometryRef())
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom)
        xsize = int((max_x - min_x)/resolution)
        ysize = int((max_y - min_y)/resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"','')
            snap_coord = [float(c) for c in snap_coord.split(',')]#'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord)
    tiles = stem.attributes_to_df(mosaic_path) # Change to accept arbitrary geometry
        
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')
        
    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)
    
    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(model_dir, 'temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)
    
    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)]
            tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)]
        tiles.set_index(index_field, inplace=True)
    
    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res) 
                    for i, (xmin, xmax) in tiles[['xmin','xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res) 
                    for i, (ymin, ymax) in tiles[['ymin','ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax)
                    in tiles[['ul_x', 'xmin','xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin for i, (uly, ymin, ymax)
                    in tiles[['ul_y', 'ymin','ymax']].iterrows()]
    
    support_nrows = int(support_size[0]/abs(y_res))
    support_ncols = int(support_size[1]/abs(x_res))
    t1 = time.time()
    args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())]    
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())]
    
    if n_jobs > 1:
        print 'Predicting with %s jobs...\n' % n_jobs
        pool = Pool(n_jobs)
        pool.map(stem.predict_tile, args, 1)
        pool.close()
        pool.join()
    else:
        for arg in args:
            print 'Predicting with 1 job ...\n'
            stem.predict_tile(*arg)#'''
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1)/3600)
    
    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) 
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8)
        
        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off : row_off + tile_rows, col_off : col_off + tile_cols] = ar_tile
            except Exception as e:
                import pdb; pdb.set_trace()
        
        out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata)
    
    # Clean up the tiles
    shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60)
    
    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

                
        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user','producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''


        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''
    
    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)
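
# A hedged sketch (toy numbers) of the importance table written above:
# per-variable importances normalized to percentages and ranked with
# rank(method='first'), so ties get distinct, order-of-appearance ranks.
import pandas as pd

imp = pd.DataFrame({'variable': ['b', 'a', 'c'],
                    'pct_importance': [0.2, 0.5, 0.3]})
imp['rank'] = [int(r) for r in
               imp.pct_importance.rank(method='first', ascending=False)]
# 'a' -> rank 1, 'c' -> rank 2, 'b' -> rank 3
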