def main(top_left, bottom_left, bottom_right, top_right, config_id):

    # ----- #
    # SETUP #
    # ----- #
    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    # connect to the DB and read the config table
    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(config_id), engine)

    raster = config["satellite_grid"][0]
    nightlights_date = config.get("nightlights_date")[0]
    base_raster = "../tmp/local_raster.tif"
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop raster too fine: aggregate #
    # ----------------------------------- #
    aggregate(raster, base_raster, 1)

    # -------------------- #
    # CLIP RASTER TO SCOPE #
    # -------------------- #
    geoms = [{'type': 'Polygon', 'coordinates': [[top_left, bottom_left, bottom_right, top_right]]}]

    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, geoms, crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    with rasterio.open(base_raster, "w", **out_meta) as dest:
        dest.write(out_image)

    # load the new clipped raster into the image library
    GRID = RasterGrid(base_raster)
    with rasterio.open(base_raster) as src:
        list_j, list_i = np.where(src.read()[0] != src.nodata)

    print("INFO: downloading images in scope ...")
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and extract features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(config_id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')

            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline='scoring')
            # save the features
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, config_id, 'scoring'), index=False)

        g_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Google", config_id, 'scoring'))
        s_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Sentinel", config_id, 'scoring'))

        data = pd.merge(g_features, s_features, on=['i', 'j', 'index'])
        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(config_id), index=False)
        print('INFO: features extracted.')
    else:
        data = pd.DataFrame({'gpsLongitude': coords_x, 'gpsLatitude': coords_y, 'j': list_j, 'i': list_i})

    # --------------- #
    # add nightlights #
    # --------------- #
    from geojson import Polygon
    from nightlights import Nightlights

    area = Polygon([[top_left, bottom_left, bottom_right, top_right]])

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date)
    data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            # key the downloads by the tag value, not the literal string "value"
            osm_gdf[value] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf[value])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            # density = data.apply(OSM.density, args=(osm_gdf[value],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            # data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            # osm_features.append('density_{}'.format(value))

    # ---------------------- #
    # LOAD MODEL AND PREDICT #
    # ---------------------- #
    print("INFO: load model and predict ...")
    try:
        X = data.drop(['index', 'i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1)
    except (KeyError, ValueError):  # no 'index' column when satellite features were skipped
        X = data.drop(['i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1)

    # load the trained models; abort if they are missing
    try:
        RmSense = joblib.load('../Models/RmSense_model_config_id_{}.pkl'.format(config_id))
        kNN = joblib.load('../Models/kNN_model_config_id_{}.pkl'.format(config_id))
    except FileNotFoundError:
        print('ERROR: model not found')
        raise

    # simple average of the remote-sensing model and the spatial (kNN) model
    yhat = (RmSense.predict(X) + kNN.predict(data[['i', 'j']])) / 2.

    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': yhat})

    outfile = "../Data/Results/scalerout_{}.tif".format(config_id)
    tifgenerator(outfile=outfile, raster_path=base_raster, df=results)
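# A minimal invocation sketch (an assumption, not part of the pipeline): the
# four corners are (lon, lat) pairs tracing the scoring scope, and the config
# id must exist in the config_new table. The coordinates below are illustrative.
if __name__ == "__main__":
    main(top_left=(-17.6, 15.0),
         bottom_left=(-17.6, 12.2),
         bottom_right=(-11.3, 12.2),
         top_right=(-11.3, 15.0),
         config_id=1)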
def run(id):

    # ----- #
    # SETUP #
    # ----- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["base_raster_aggregation"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop raster too fine: aggregate #
    # ----------------------------------- #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster {}'.format(raster))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # OPTIONAL: replace the cluster coordinates with the corresponding grid-centre coordinates
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data["i"], data["j"] = list_i, list_j

    # get a GeoJSON polygon of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print("Number of clusters: {}".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and extract features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # save the features
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf[value])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # ------------- #
    # save features #
    # ------------- #
    # features to be used in the linear model
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # standardize features (0 mean and 1 std)
    # data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    print("Normalizing: max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)

    # if set in the config, take the log of the indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    X, y = data[features_list + ["gpsLatitude", "gpsLongitude"]], data[indicator]
    modeller = Modeller(X, rs_features=features_list, spatial_features=["gpsLatitude", "gpsLongitude"], scoring='r2', cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", kNN_R2_mean, "kNN_R2_std: ", kNN_R2_std)

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", Ridge_R2_mean, "Ridge_R2_std: ", Ridge_R2_std)

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", Ensemble_R2_mean, "Ensemble_R2_std: ", Ensemble_R2_std)

    # ------------------ #
    # write scores to DB #
    # ------------------ #
    query = """
    insert into results_new (run_date, config_id, r2, r2_sd, r2_knn, r2_sd_knn, r2_features, r2_sd_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {})
    """.format(config['id'][0], Ensemble_R2_mean, Ensemble_R2_std, kNN_R2_mean, kNN_R2_std, Ridge_R2_mean, Ridge_R2_std, 0)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({
        'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)

    # save the model for production
    Ensemble_pipeline.fit(X.values, y)

    # best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ', Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # best alpha (Ridge)
    print('INFO: regularization param chosen: ', Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    from sklearn.externals import joblib
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
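# The feature scaling in run() above de-means each column and then divides by
# the column *maximum* of the raw values, not by the standard deviation.
# A standalone toy sketch of what that does (illustrative values):
import pandas as pd

col = pd.Series([1.0, 2.0, 4.0])
normalized = (col - col.mean()) / col.max()   # mean = 2.333..., max = 4
print(normalized.tolist())                    # [-0.333..., -0.083..., 0.416...]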
def main(id, aggregate_factor, min_pop, minlat, maxlat, minlon, maxlon, shapefile):

    # ----- #
    # SETUP #
    # ----- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["satellite_grid"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop raster too fine: aggregate #
    # ----------------------------------- #
    if aggregate_factor is None:
        aggregate_factor = config["base_raster_aggregation"][0]

    if aggregate_factor > 1:
        print('INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    dataset_df = pd.read_csv(dataset)
    data_cols = dataset_df.columns.values

    # create the geometry: default to the dataset boundaries unless a bounding box was passed
    if (minlat is None) and (maxlat is None) and (minlon is None) and (maxlon is None):
        minlat, maxlat, minlon, maxlon = df_boundaries(dataset_df, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")

    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # crop the raster to the area of interest
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    final_raster = "../tmp/final_raster.tif"
    print('INFO: removing tiles with population under {}'.format(min_pop))
    # only score tiles where at least min_pop people live
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate the grid
    GRID = RasterGrid(final_raster)
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    data = pd.DataFrame({"i": list_i, "j": list_j})
    data["gpsLatitude"] = coords_y
    data["gpsLongitude"] = coords_x

    print("Number of tiles to score: {}".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'scoring'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and extract features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')

            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
            # save the features
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf[value])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # ------------- #
    # save features #
    # ------------- #
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # standardize features
    # TODO: use the mean and max from training
    print("INFO: normalizing by the max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)

    # open the trained model
    ensemble_pipeline = joblib.load('../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data[features_list + ["gpsLatitude", "gpsLongitude"]]

    ensemble_predictions = ensemble_pipeline.predict(X.values)

    # if the indicator was trained on a log scale, transform back
    if config['log'][0]:
        ensemble_predictions = np.exp(ensemble_predictions)

    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': ensemble_predictions})

    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results)

    # also write the individual model surfaces
    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"
        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)

        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
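# A hypothetical call (illustrative values, not from the pipeline): score
# config 1 over the dataset's own bounding box, drop tiles with fewer than
# 5 inhabitants, and aggregate the result over an admin-boundary shapefile.
if __name__ == "__main__":
    main(id=1, aggregate_factor=None, min_pop=5,
         minlat=None, maxlat=None, minlon=None, maxlon=None,
         shapefile='../Data/Geofiles/admin_boundaries.shp')  # path is an assumption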
def run(id):

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config where id = {}".format(id), engine)
    dataset = config["dataset_filename"][0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    indicator_log = config['indicator_log'][0]

    # load data
    GRID = RasterGrid(raster)
    list_i, list_j = GRID.get_gridcoordinates(dataset)

    hh_data = pd.read_csv(dataset)
    data = hh_data
    data["i"] = list_i
    data["j"] = list_j

    # aggregate clusters sharing a tile, weighting by cluster size when available
    cluster_N = 'countbyEA'
    try:
        data = data.groupby(["i", "j"]).apply(
            lambda x: np.average(x[indicator], weights=x[cluster_N])).to_frame(
            name=indicator).reset_index()
    except KeyError:  # no weights column: plain average per i and j
        data = data.groupby(["i", "j"]).mean().reset_index()

    X = pd.DataFrame({"i": data["i"], "j": data["j"]})
    y = data[indicator].values

    # log-normal distribution
    if indicator_log == True:
        y = np.log(y)

    # TRAIN MODEL: nested cross-validation (inner loop tunes, outer loop scores)
    outer_cv = KFold(5, shuffle=True, random_state=75788)
    inner_cv = KFold(5, shuffle=True, random_state=1673)

    print(str(np.datetime64('now')), " INFO: training model ...")
    from sklearn.neighbors import KNeighborsRegressor
    k = np.arange(20) + 1
    parameters = {'n_neighbors': k}
    model = KNeighborsRegressor(weights='distance')
    clf = GridSearchCV(estimator=model, param_grid=parameters, cv=inner_cv, scoring=r2_pearson)

    score = cross_val_score(clf, X, y, scoring=r2_pearson, cv=outer_cv)
    score_r2 = cross_val_score(clf, X, y, scoring=r2, cv=outer_cv)
    score_MAPE = cross_val_score(clf, X, y, scoring=MAPE, cv=outer_cv)

    print('INFO: Pearson score: ', score.mean())

    clf.fit(X, y)
    print('INFO: best parameter: ', clf.best_params_)

    # create the list of i, j to score
    src = rasterio.open(raster)
    list_j, list_i = np.where(src.read()[0] != src.nodata)
    src.close()

    # score all grid points
    X = pd.DataFrame({"i": list_i, "j": list_j})
    y_hat = clf.predict(X)

    # write the predictions into a GTiff aligned with the input raster
    outfile = "../Data/Outputs/config_id_{}_KNN.tif".format(id)
    ds = gdal.Open(raster)
    band = ds.GetRasterBand(1)
    arr = band.ReadAsArray()
    [cols, rows] = arr.shape
    arr_out = np.zeros(arr.shape) - 99
    arr_out[list_j, list_i] = y_hat

    driver = gdal.GetDriverByName("GTiff")
    outdata = driver.Create(outfile, rows, cols, 1, gdal.GDT_Float32)
    outdata.SetGeoTransform(ds.GetGeoTransform())  # same geotransform as the input
    outdata.SetProjection(ds.GetProjection())      # same projection as the input
    outdata.GetRasterBand(1).SetNoDataValue(-99)
    outdata.GetRasterBand(1).WriteArray(arr_out)
    outdata.FlushCache()  # write to disk
    outdata = None
    band = None
    ds = None
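# The function above nests a grid search for n_neighbors (inner KFold) inside
# an outer KFold, so the reported score comes from folds the tuner never saw.
# A self-contained sketch of that pattern on synthetic data (the data and the
# plain 'r2' scorer are assumptions; the repo uses its own r2_pearson scorer):
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X_demo = rng.uniform(0, 100, size=(200, 2))            # stand-in for grid coords i, j
y_demo = X_demo[:, 0] * 0.1 + rng.normal(size=200)     # synthetic indicator

inner_cv = KFold(5, shuffle=True, random_state=1673)
outer_cv = KFold(5, shuffle=True, random_state=75788)

demo_clf = GridSearchCV(KNeighborsRegressor(weights='distance'),
                        param_grid={'n_neighbors': np.arange(20) + 1},
                        cv=inner_cv, scoring='r2')
# the outer loop scores the tuned model on held-out folds
print(cross_val_score(demo_clf, X_demo, y_demo, scoring='r2', cv=outer_cv).mean())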
def run(id):

    # ----- #
    # SETUP #
    # ----- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["aggregation"][0]

    # ----------------------------------- #
    # WorldPop raster too fine: aggregate #
    # ----------------------------------- #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster ...')
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # OPTIONAL: replace the cluster coordinates with the corresponding grid-centre coordinates
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data["i"], data["j"] = list_i, list_j

    # get a GeoJSON polygon of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # --------------------------- #
    # GROUP CLUSTERS IN SAME TILE #
    # --------------------------- #
    # TODO: this aggregation is clunky and should be refactored
    cluster_N = 'n'
    print("Number of clusters: {}".format(len(data)))

    def wavg(g, df, weight_series):
        w = df.loc[g.index, weight_series]
        return (g * w).sum() / w.sum()

    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)
    try:
        data = data.groupby(["i", "j"]).agg({indicator: fnc, 'gpsLatitude': fnc, 'gpsLongitude': fnc}).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'n', 'gpsLatitude', 'gpsLongitude', indicator]].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {}".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and extract features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # save the features
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf[value])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            # density = data.apply(OSM.density, args=(osm_gdf[value],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            # data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            # osm_features.append('density_{}'.format(value))

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    # TODO: use maxNDBImaxNDVImaxNDWI_sum_todf efficiently
    print('INFO: getting NDBI, NDVI, NDWI ...')
    start_date = "2017-01-01"  # TODO: add to config; careful, no imagery before 2015
    end_date = "2018-01-01"

    for i in date_range(start_date, end_date, 3):
        print('INFO: getting max NDVI between dates: {}'.format(i))
        gee_ndvi_max_raster = gee_sentinel_raster(i[0], i[1], area, ind="NDVI")
        data["max_NDVI_{}_{}".format(i[0], i[1])] = \
            data.apply(gee_raster_mean, args=(gee_ndvi_max_raster, "gpsLatitude", "gpsLongitude", "NDVI"), axis=1)

    print('INFO: getting max NDBI')
    gee_ndbi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDBI")
    data["max_NDBI"] = data.apply(gee_raster_mean, args=(gee_ndbi_max_raster, "gpsLatitude", "gpsLongitude", "NDBI"), axis=1)

    print('INFO: getting max NDWI')
    gee_ndwi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDWI")
    data["max_NDWI"] = data.apply(gee_raster_mean, args=(gee_ndwi_max_raster, "gpsLatitude", "gpsLongitude", "NDWI"), axis=1)

    # ------------- #
    # save features #
    # ------------- #
    features_list = list(set(data.columns) - set(data_cols) - set(['i', 'j']))

    # standardize features (0 mean and 1 std)
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data
    data_features = data[features_list]

    # if set in the config, take the log of the indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    md = Modeller(['kNN', 'Kriging', 'RmSense', 'Ensamble'], data_features)

    cv_loops = 20
    md.compute(data[['i', 'j']], data[indicator].values, cv_loops)

    # save the models for production
    md.save_models(id)
    print(str(np.datetime64('now')), 'INFO: model saved.')

    # ------------------ #
    # write scores to DB #
    # ------------------ #
    r2, r2_var = np.mean(md.scores['Ensamble']), np.var(md.scores['Ensamble'])
    r2_knn, r2_var_knn = np.mean(md.scores['kNN']), np.var(md.scores['kNN'])
    r2_rmsense, r2_var_rmsense = np.mean(md.scores['RmSense']), np.var(md.scores['RmSense'])

    # MAPE of the remote-sensing model over the repeated CV predictions
    y_duplicated = np.repeat(data[indicator], cv_loops)
    mape_rmsense = np.mean(np.abs([item for sublist in md.results['RmSense'] for item in sublist] - y_duplicated) / y_duplicated)
    if mape_rmsense == float("inf") or mape_rmsense == float("-inf"):
        mape_rmsense = 0

    query = """
    insert into results_new (run_date, config_id, r2, r2_var, r2_knn, r2_var_knn, r2_features, r2_var_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {})
    """.format(config['id'][0], r2, r2_var, r2_knn, r2_var_knn, r2_rmsense, r2_var_rmsense, mape_rmsense)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    results = pd.DataFrame({
        # 'yhat': [item for sublist in md.results['kNN'] for item in sublist],
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)
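# The MAPE computed above is the mean absolute deviation relative to the true
# value; it goes to inf whenever some y is 0, which is why run() zeroes it out.
# A standalone toy example with made-up numbers:
import numpy as np

y_toy = np.array([2.0, 4.0, 5.0])
yhat_toy = np.array([2.5, 3.0, 5.5])
print(np.mean(np.abs(yhat_toy - y_toy) / y_toy))  # (0.25 + 0.25 + 0.1) / 3 = 0.2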
def downscale(config, request):

    country = request.form['country']
    algorithm = request.form['algorithm']

    # country raster --------------------------------------
    # use the country-to-raster app to generate new ones: https://countrytoraster.herokuapp.com/
    raster = '{}_0.01_4326_1.tif'.format(country)
    local_raster = 'temp/' + raster
    print('-> getting raster ', raster)

    # download from AWS S3
    import boto3
    bucket_name = config['rasters_bucket']
    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(raster, local_raster)
    print('-> raster loaded.')

    # load dataset from input -------------------------------
    print('-> loading dataset from input form...')
    data = pd.read_csv(request.files['file'])

    # load the matching raster
    print('-> loading raster ', local_raster)
    GRID = RasterGrid(local_raster)
    try:
        data['i'], data['j'] = GRID.get_gridcoordinates(data)
    except IndexError:
        print('ERROR: raster and data are not from the same country!')
        raise

    # ------------------------------------
    # group clusters that belong to the same tile
    cluster_N = 'countbyEA'
    print("Number of clusters: {}".format(len(data)))

    def wavg(g, df, weight_series):
        w = df.loc[g.index, weight_series]
        return (g * w).sum() / w.sum()

    import functools
    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)
    try:
        data = data.groupby(["i", "j"]).agg({
            'Indicator': fnc,
            'gpsLatitude': fnc,
            'gpsLongitude': fnc
        }).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'gpsLatitude', 'gpsLongitude', 'Indicator']].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {}".format(len(data)))

    # train model ------------------------------------
    X = pd.DataFrame({"i": data["i"], "j": data["j"]})
    y = data.Indicator.values

    from model import IndicatorScaler
    model = IndicatorScaler(algorithm, X, y)

    # all-country predictions ------------
    print('-> loading all grid points in the country')
    import rasterio
    src = rasterio.open(local_raster)
    list_j, list_i = np.where(src.read()[0] > 0)
    src.close()

    # also add the GPS coordinates to the data for later use
    coords_x, coords_y = GRID.get_gpscoordinates(list_i, list_j)
    res = pd.DataFrame({
        "i": list_i,
        "j": list_j,
        "gpsLongitude": coords_x,
        "gpsLatitude": coords_y
    })
    # ------------------------------------

    # filter on built areas -------------
    # use the WorldPop layer to keep only inhabited locations
    pop_raster = '{}_worldpop.tif'.format(country)
    local_pop_raster = 'temp/' + pop_raster
    print('-> getting population from WorldPop ({})'.format(local_pop_raster))
    if not os.path.exists(local_pop_raster):
        s3.Bucket(bucket_name).download_file(pop_raster, local_pop_raster)

    from img_utils import getRastervalue
    res = getRastervalue(res, local_pop_raster)
    # ------------------------------------

    # predictions for all remaining tiles -------
    print('-> running predictions...')
    res['yhat'] = model.model.predict(res[['i', 'j']])
    # ------------------------------------

    # save to disk ---------------------
    from exporter import tifgenerator
    outfile = "temp/scalerout_{}_{}.tif".format(country, algorithm)
    tifgenerator(outfile=outfile, raster_path=local_raster, df=res)
    # -------------------------------------

    print('-> return file to client.')
    return send_file('../' + outfile,
                     mimetype='image/tiff',
                     as_attachment=True,
                     attachment_filename=country + "_" + algorithm + ".tif")
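# The wavg pattern used above collapses survey clusters that fall in the same
# tile into one population-weighted average. A standalone toy sketch of the
# same groupby/agg mechanics (values are made up):
import functools
import pandas as pd

toy = pd.DataFrame({'i': [0, 0, 1], 'j': [0, 0, 2],
                    'Indicator': [10.0, 20.0, 30.0],
                    'countbyEA': [1, 3, 2]})

def wavg_demo(g, df, weight_series):
    w = df.loc[g.index, weight_series]          # weights of the rows in this group
    return (g * w).sum() / w.sum()              # weighted mean of the group values

fnc_demo = functools.partial(wavg_demo, df=toy, weight_series='countbyEA')
print(toy.groupby(['i', 'j']).agg({'Indicator': fnc_demo}).reset_index())
# tile (0, 0) -> (10*1 + 20*3) / 4 = 17.5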
def run(id):

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config where id = {}".format(id), engine)
    dataset = config["dataset_filename"][0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    step = config["satellite_step"][0]
    provider = config["satellite_source"][0]
    start_date = config["sentinel_start"][0]
    end_date = config["sentinel_end"][0]
    land_use_raster = config["land_use_raster"][0]
    network_model = config['network_model'][0]
    custom_weights = config['custom_weights'][0]
    indicator_log = config['indicator_log'][0]
    model_pca = config['model_pca'][0]
    output = config['output'][0]
    model_grid_parameters = config['model_grid_parameters'][0]

    # 1. rasterize the country shapefile
    country_shp = "../Data/Geofiles/Shapefiles/ADM0/sen_admbnda_adm0_1m_gov_ocha_04082017/sen_admbnda_adm0_1m_gov_ocha_04082017.shp"
    cell_size = 0.05
    no_data = -99
    output = "../Data/Geofiles/Rasters/Senegal_raster_nodata_lowres.tif"
    # gdal_rasterize -a_nodata -99 -burn 1 -tr 0.05 0.05 -l sen_admbnda_adm0_1m_gov_ocha_04082017 "/Users/pasquierjb/Google Drive/WFP_Shared/Projects/HRM/Data/Shapefiles/ADM0/sen_admbnda_adm0_1m_gov_ocha_04082017/sen_admbnda_adm0_1m_gov_ocha_04082017.shp" /Users/pasquierjb/Desktop/test6.tif

    # 2. create the list of i, j
    # raster = "../Data/Geofiles/Rasters/Senegal_raster_nodata.tif"
    src = rasterio.open(raster)
    list_j, list_i = np.where(src.read()[0] != src.nodata)
    src.close()

    # 3. download the images, score them and merge the features
    GRID = RasterGrid(raster)
    data = pd.DataFrame({"i": list_i, "j": list_j})
    for sat in provider.split(","):
        data = download_score_merge(id, data, GRID, list_i, list_j, raster, step, sat,
                                    start_date, end_date, network_model, custom_weights,
                                    pipeline="prediction")

    data.to_csv("../Data/Features/features_all_id_{}_prediction.csv".format(id), index=False)

    X = data[list(set(data.columns) - set(['index', 'index_x', 'index_y', 'i', 'j']))]
    # X = data.drop(['index', 'index_x', 'index_y', 'i', 'j'], axis=1)

    # load the trained model and predict
    clf = joblib.load('../Models/ridge_model_config_id_{}.pkl'.format(id))
    y_hat = clf.predict(X)

    # write the predictions into a GTiff aligned with the input raster
    outfile = "../Data/Outputs/{}.tif".format(id)
    ds = gdal.Open(raster)
    band = ds.GetRasterBand(1)
    arr = band.ReadAsArray()
    [cols, rows] = arr.shape
    arr_out = np.zeros(arr.shape) - 99
    arr_out[list_j, list_i] = y_hat

    driver = gdal.GetDriverByName("GTiff")
    outdata = driver.Create(outfile, rows, cols, 1, gdal.GDT_Float32)
    outdata.SetGeoTransform(ds.GetGeoTransform())  # same geotransform as the input
    outdata.SetProjection(ds.GetProjection())      # same projection as the input
    outdata.GetRasterBand(1).SetNoDataValue(-99)
    outdata.GetRasterBand(1).WriteArray(arr_out)
    outdata.FlushCache()  # write to disk
    outdata = None
    band = None
    ds = None
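# Both GTiff writers above scatter the flat prediction vector back onto the
# raster with arr_out[list_j, list_i] = y_hat: numpy fancy indexing puts
# value k at row list_j[k] and column list_i[k]. A tiny standalone sketch:
import numpy as np

arr_demo = np.zeros((3, 4)) - 99     # 3 rows x 4 cols, -99 as nodata
cols_i = np.array([0, 2, 3])         # i indexes columns (x axis)
rows_j = np.array([1, 0, 2])         # j indexes rows (y axis)
arr_demo[rows_j, cols_i] = [7.0, 8.0, 9.0]
print(arr_demo)                      # 7 at (1,0), 8 at (0,2), 9 at (2,3)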
feature_columns = list(set(features.columns.values) - set(non_feature_columns))
feature_matrix = features[feature_columns]
# keep a stable, sorted column order across runs
# (reindex_axis was removed in recent pandas; reindex is the equivalent)
feature_matrix = feature_matrix.reindex(sorted(feature_matrix.columns), axis=1)

# ----------------------------------------------------------------------
# apply PCA
from sklearn.decomposition import PCA

pc = PCA(n_components=2)
x = pc.fit_transform(features[feature_columns])
dfx = pd.DataFrame(x, columns=['x', 'y'])

# ----------------------------------------------------------------------
# get the raster and retrieve the relevant raster coordinates
GRID = RasterGrid(
    raster='../Data/Satellite/F182013.v4c_web.stable_lights.avg_vis.tif',
    image_dir='../Data/Satellite/Google')

dfx['i'], dfx['j'] = features['i'], features['j']  # for what clusters?
dfx['lon'], dfx['lat'] = GRID.get_gpscoordinates(dfx['i'], dfx['j'], step=0)
dfx['lonlat'] = dfx[['lon', 'lat']].round(4).astype(str).apply(lambda x: ','.join(x), axis=1)

# ----------------------------------------------------------------------
# add the indicator scores
hh_data = pd.read_csv(
    "../Data/datasets/VAM_ENSA_Nigeria_national_2017_indiv_reduced.csv")[[
        'FCS', 'i', 'j'
    ]]
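# Optional sanity check (an assumption, not in the original script): pc is the
# fitted PCA from above, so this reports how much of the feature variance the
# two retained components actually explain.
print('explained variance: {:.1%}'.format(pc.explained_variance_ratio_.sum()))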
preprocess_input = keras.applications.resnet50.preprocess_input

dataset = "../Data/datasets/WFP_ENSAN_Senegal_2013_individual.csv"
indicator = "FCS"
raster = "../Data/Geofiles/Rasters/Senegal_0005_4326_1.tif"
step = 0
provider = "Google"
start_date = None
end_date = None

data = pd.read_csv(dataset)
data = data.loc[data[indicator] > 0]
data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

GRID = RasterGrid(raster)
list_i, list_j = GRID.get_gridcoordinates(data)
data["i"] = list_i
data["j"] = list_j

print("Number of survey records: {}".format(len(data)))

# aggregate survey points at the grid level
data = data[['i', 'j', 'gpsLatitude', 'gpsLongitude', indicator]].groupby(["i", "j"]).mean()

print("Number of unique tiles: {}".format(len(data)))
print(data.head())
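# Note (an assumption about the downstream steps): after the groupby above,
# i and j live in the MultiIndex rather than in columns, while other scripts
# in this repo call .reset_index() to get them back as columns. If later code
# expects i and j as columns, restore them explicitly:
data = data.reset_index()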