import os

import numpy as np
import pandas as pd
import yaml
from sqlalchemy import create_engine

# Project-local helpers used throughout this module. The exact module paths are
# assumed from the surrounding repository layout.
from img_lib import RasterGrid
from nn_extractor import NNExtractor
from osm import OSM_extractor
from base_layer import BaseLayer
from utils import (boundaries, df_boundaries, get_config_db, get_config_file,
                   points_to_polygon, write_scores_to_db, write_scores_to_file)


def individual_r2(id, indicator, cv):

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    if indicator is None:
        indicator = config["indicator"][0]

    # read the survey data only to record its original columns, so that the
    # feature columns added later can be told apart
    data = pd.read_csv(dataset)
    data_cols = data.columns.values
    print(data_cols)

    data = pd.read_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id))
    data["noise"] = np.random.normal(0, 1, len(data))  # random-noise baseline feature
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    features_list = list(set(data.columns) - set(data_cols) -
                         set(['i', 'j', 'gpsLatitude', 'gpsLongitude', 'cluster', 'n',
                              indicator, "log_{}".format(indicator)]))
    print(features_list)

    nn_features_google = [i for i in features_list if i.endswith('_Google')]
    nn_features_sentinel = [i for i in features_list if i.endswith('_Sentinel')]
    nn_features = nn_features_google + nn_features_sentinel
    print(nn_features)
    no_nn_features = list(set(features_list) - set(nn_features))
    print(no_nn_features)

    # if set in the config, take the log of the indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    X = data
    print("indicator: ", indicator)
    y = data[indicator]
    cv_loops = cv

    from modeller import Modeller
    Modeller_all = Modeller(X, rs_features=features_list,
                            spatial_features=["gpsLatitude", "gpsLongitude"],
                            cv_loops=cv_loops)

    kNN_pipeline = Modeller_all.make_model_pipeline('kNN')
    kNN_scores = Modeller_all.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", round(kNN_R2_mean, 2), "kNN_R2_std: ", round(kNN_R2_std, 2))

    Ridge_pipeline = Modeller_all.make_model_pipeline('Ridge')
    Ridge_scores = Modeller_all.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", round(Ridge_R2_mean, 2), "Ridge_R2_std: ", round(Ridge_R2_std, 2))

    Ensemble_pipeline = Modeller_all.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = Modeller_all.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", round(Ensemble_R2_mean, 2), "Ensemble_R2_std: ", round(Ensemble_R2_std, 2))

    # score each feature block on its own: Google-only, Sentinel-only, all
    # network features, and everything but the network features
    Modeller_google = Modeller(X, rs_features=nn_features_google,
                               spatial_features=["gpsLatitude", "gpsLongitude"],
                               cv_loops=cv_loops)
    Ridge_pipeline = Modeller_google.make_model_pipeline('Ridge')
    Ridge_scores_google = Modeller_google.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_google = Ridge_scores_google.mean()
    Ridge_R2_std_google = Ridge_scores_google.std()
    print("Ridge_R2_google_mean: ", round(Ridge_R2_mean_google, 2),
          "Ridge_R2_google_std: ", round(Ridge_R2_std_google, 2))

    Modeller_sentinel = Modeller(X, rs_features=nn_features_sentinel,
                                 spatial_features=["gpsLatitude", "gpsLongitude"],
                                 cv_loops=cv_loops)
    Ridge_pipeline = Modeller_sentinel.make_model_pipeline('Ridge')
    Ridge_scores_sentinel = Modeller_sentinel.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_sentinel = Ridge_scores_sentinel.mean()
    Ridge_R2_std_sentinel = Ridge_scores_sentinel.std()
    print("Ridge_R2_sentinel_mean: ", round(Ridge_R2_mean_sentinel, 2),
          "Ridge_R2_sentinel_std: ", round(Ridge_R2_std_sentinel, 2))

    Modeller_nn = Modeller(X, rs_features=nn_features,
                           spatial_features=["gpsLatitude", "gpsLongitude"],
                           cv_loops=cv_loops)
    Ridge_pipeline = Modeller_nn.make_model_pipeline('Ridge')
    Ridge_scores_nn = Modeller_nn.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_nn = Ridge_scores_nn.mean()
    Ridge_R2_std_nn = Ridge_scores_nn.std()
    print("Ridge_R2_nn_mean: ", round(Ridge_R2_mean_nn, 2),
          "Ridge_R2_nn_std: ", round(Ridge_R2_std_nn, 2))

    Modeller_no_nn = Modeller(X, rs_features=no_nn_features,
                              spatial_features=["gpsLatitude", "gpsLongitude"],
                              cv_loops=cv_loops)
    Ridge_pipeline = Modeller_no_nn.make_model_pipeline('Ridge')
    Ridge_scores_no_nn = Modeller_no_nn.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_no_nn = Ridge_scores_no_nn.mean()
    Ridge_R2_std_no_nn = Ridge_scores_no_nn.std()
    print("Ridge_R2_no_nn_mean: ", round(Ridge_R2_mean_no_nn, 2),
          "Ridge_R2_no_nn_std: ", round(Ridge_R2_std_no_nn, 2))

    # leave-one-out importance: score each feature alone, then all features
    # except it; the added value is the full-model R2 minus the leave-one-out R2
    for feature in features_list:
        # rs_features takes a list, so wrap the single feature
        Modeller_feature = Modeller(X, rs_features=[feature], cv_loops=cv_loops)
        Ridge_pipeline = Modeller_feature.make_model_pipeline('Ridge')
        Ridge_scores_feature = Modeller_feature.compute_scores(Ridge_pipeline, y)
        Ridge_R2_mean_feature = Ridge_scores_feature.mean()
        Ridge_R2_std_feature = Ridge_scores_feature.std()

        all_but_feature = list(set(features_list) - set([feature]))
        Modeller_all_but_feature = Modeller(X, rs_features=all_but_feature, cv_loops=cv_loops)
        Ridge_pipeline2 = Modeller_all_but_feature.make_model_pipeline('Ridge')
        Ridge_scores_all_but_feature = Modeller_all_but_feature.compute_scores(Ridge_pipeline2, y)
        Ridge_R2_mean_all_but_feature = Ridge_scores_all_but_feature.mean()

        print(feature, " R2_mean: ", round(Ridge_R2_mean_feature, 2),
              " R2_mean_added_value: ", round(Ridge_R2_mean - Ridge_R2_mean_all_but_feature, 2),
              "R2_std: ", round(Ridge_R2_std_feature, 2))
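
# Example invocation of individual_r2 (an illustrative sketch, not part of the
# original script; the config id and cv value are hypothetical). It assumes a row
# with id 1 exists in the config_new table and that a previous evaluation run has
# produced ../Data/Features/features_all_id_1_evaluation.csv. Passing
# indicator=None falls back to the indicator named in the config row.
#
#     individual_r2(1, indicator=None, cv=20)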
def run(id):

    # ----------------- #
    # SETUP ############# 
    # ----------------- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["base_raster_aggregation"][0]
    scope = config["scope"][0]
    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop raster too fine, aggregate #
    # ----------------------------------- #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster {}'.format(raster))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # OPTIONAL: REPLACE THE CLUSTER COORDINATES WITH THE CORRESPONDING GRID-CENTRE COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data["i"], data["j"] = list_i, list_j

    # get a polygon GeoJSON of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05,
                                                   lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print("Number of clusters: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and extract features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date,
                                 zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')
                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # save the features for later reuse
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', axis=1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights
    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf[value])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # ------------- #
    # save features #
    # ------------- #
    # features to be used in the linear model
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # scale features: centre on the mean, divide by the max
    # (alternative: standardize to 0 mean and unit variance)
    # data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)

    # if set in the config, take the log of the indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    X, y = data[features_list + ["gpsLatitude", "gpsLongitude"]], data[indicator]
    modeller = Modeller(X, rs_features=features_list,
                        spatial_features=["gpsLatitude", "gpsLongitude"],
                        scoring='r2', cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", kNN_R2_mean, "kNN_R2_std: ", kNN_R2_std)

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", Ridge_R2_mean, "Ridge_R2_std: ", Ridge_R2_std)

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", Ensemble_R2_mean, "Ensemble_R2_std: ", Ensemble_R2_std)

    # ------------------ #
    # write scores to DB #
    # ------------------ #
    query = """
    insert into results_new (run_date, config_id, r2, r2_sd, r2_knn, r2_sd_knn, r2_features, r2_sd_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {})
    """.format(config['id'][0],
               Ensemble_R2_mean, Ensemble_R2_std,
               kNN_R2_mean, kNN_R2_std,
               Ridge_R2_mean, Ridge_R2_std,
               0)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({
        'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ',
          Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # best alpha (Ridge)
    print('INFO: regularization param chosen: ',
          Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
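
# Example invocation (illustrative, not part of the original script; the config id
# is hypothetical). It assumes ../private_config.yml holds the DB credentials and
# that a row with id 1 exists in the config_new table.
#
#     run(1)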
def run(file, id=None):

    # ----------------- #
    # SETUP ############# 
    # ----------------- #
    if id is None:
        config = get_config_file(file)
    else:
        config, engine = get_config_db(id)

    for d in ['../Data/Features',
              '../Data/Geofiles/OSM',
              '../Data/Geofiles/nightlights',
              '../Data/Results']:
        if not os.path.exists(d):
            os.makedirs(d)

    # --------------------- #
    # Setting up playground #
    # --------------------- #
    assert os.path.exists(config['dataset_filename']), \
        "oops, the dataset specified was not found: " + config['dataset_filename']
    data = pd.read_csv(config['dataset_filename'])
    print(str(np.datetime64('now')), 'INFO: original dataset length: ', data.shape[0])
    data['gpsLongitude'] = np.round(data['gpsLongitude'], 5)
    data['gpsLatitude'] = np.round(data['gpsLatitude'], 5)

    # avoid duplicates
    data = data[['gpsLongitude', 'gpsLatitude', config['indicator']]] \
        .groupby(['gpsLongitude', 'gpsLatitude']).mean()

    # base layer
    assert os.path.exists(config['base_raster']), \
        "oops, the raster specified was not found: " + config['base_raster']
    GRID = BaseLayer(config['base_raster'],
                     data.index.get_level_values('gpsLongitude'),
                     data.index.get_level_values('gpsLatitude'))
    # TODO: we should enforce the most accurate i and j when training, i.e. aggregate = 1?

    # get a polygon GeoJSON of the boundaries
    # TODO: maybe move into the BaseLayer class?
    minlat, maxlat, minlon, maxlon = boundaries(GRID.lat, GRID.lon, buffer=0.05)
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print(str(np.datetime64('now')), "INFO: Number of clusters: {} ".format(len(data)))

    pipeline = 'evaluation'

    # ------------------------------- #
    # get features from Google images #
    # ------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'G']:

        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(id, pipeline)
        data_path = "../Data/Satellite/"
        from google_images import GoogleImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path,
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            gimages = GoogleImages(data_path)
            # download the images from the relevant API
            gimages.download(GRID.lon, GRID.lat, step=config['satellite_config']['satellite_step'])
            # extract the features
            features = pd.DataFrame(gimages.featurize(GRID.lon, GRID.lat,
                                                      step=config['satellite_config']['satellite_step']),
                                    index=data.index)
            features.columns = [str(col) + '_Google' for col in features.columns]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Google satellite images')

    # --------------------------------- #
    # get features from Sentinel images #
    # --------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'S']:

        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"]["start_date"]
        end_date = config["satellite_config"]["end_date"]
        from sentinel_images import SentinelImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path,
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            simages = SentinelImages(data_path)
            # download the images from the relevant API
            simages.download(GRID.lon, GRID.lat, start_date, end_date)
            print('INFO: scoring ...')
            # extract the features
            print('INFO: extractor instantiated.')
            features = pd.DataFrame(simages.featurize(GRID.lon, GRID.lat, start_date, end_date),
                                    index=data.index)
            features.columns = [str(col) + '_Sentinel' for col in features.columns]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Sentinel images')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, config['nightlights_date']['start'], config['nightlights_date']['end'])
    features = pd.DataFrame(nlights.featurize(GRID.lon, GRID.lat),
                            columns=['nightlights'], index=data.index)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'], 5, labels=False, duplicates='drop')
    data = data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}

    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(GRID.lat, GRID.lon, osm_gdf[value])
            data['distance_{}'.format(value)] = [np.log(0.0001 + x) for x in dist]

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/',
                   config['NDs_date']['start'], config['NDs_date']['end'], config['scope'])
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(GRID.lon, GRID.lat)

    # --------- #
    # add ACLED #
    # --------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(config['iso3'], config['nightlights_date']['start'], config['nightlights_date']['end'])
    d = {}
    for prop in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[prop + "_" + str(k)] = acled.featurize(GRID.lon, GRID.lat,
                                                     property=prop, function='density', buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(GRID.lon, GRID.lat,
                                                           property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(GRID.lon, GRID.lat, function='distance')

    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # ------------- #
    # save features #
    # ------------- #
    # drop columns with only one value
    print('INFO: {} columns. Dropping single-valued features (if any) ...'.format(len(data.columns)))
    data = data[[col for col in data if data[col].nunique() != 1]]
    print('INFO: {} columns.'.format(len(data.columns)))

    # features to be used in the linear model
    features_list = list(sorted(set(data.columns) - set(['i', 'j', config['indicator']])))

    # save non-scaled features
    data.to_csv("../Data/Features/features_all_id_{}_evaluation_nonscaled.csv".format(config['id']))

    # scale features: centre on the mean, divide by the max
    # (the small epsilon avoids division by zero)
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) \
        / (data[features_list].max() + 0.001)
    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(config['id']))

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783)

    scores_dict = {}  # placeholder to save the scores

    from modeller import Modeller
    # reset_index restores gpsLongitude/gpsLatitude as columns for the spatial features
    X, y = data[features_list].reset_index(), data[config['indicator']]
    modeller = Modeller(X, rs_features=features_list,
                        spatial_features=["gpsLatitude", "gpsLongitude"],
                        scoring='r2', cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    scores_dict['kNN_R2_mean'] = round(kNN_scores.mean(), 2)
    scores_dict['kNN_R2_std'] = round(kNN_scores.std(), 2)
    print("kNN_R2_mean: ", scores_dict['kNN_R2_mean'], "kNN_R2_std: ", scores_dict['kNN_R2_std'])

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    scores_dict['ridge_R2_mean'] = round(Ridge_scores.mean(), 2)
    scores_dict['ridge_R2_std'] = round(Ridge_scores.std(), 2)
    print("Ridge_R2_mean: ", scores_dict['ridge_R2_mean'], "Ridge_R2_std: ", scores_dict['ridge_R2_std'])

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    scores_dict['ensemble_R2_mean'] = round(Ensemble_scores.mean(), 2)
    scores_dict['ensemble_R2_std'] = round(Ensemble_scores.std(), 2)
    print("Ensemble_R2_mean: ", scores_dict['ensemble_R2_mean'],
          "Ensemble_R2_std: ", scores_dict['ensemble_R2_std'])

    # save results
    if id is None:
        write_scores_to_file(scores_dict, config['id'])
    else:
        write_scores_to_db(scores_dict, config['id'], engine)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
                            'y': data[config['indicator']].values},
                           index=data.index)
    results.to_csv('../Data/Results/config_{}.csv'.format(config['id']))

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ',
          Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # best alpha (Ridge)
    print('INFO: regularization param chosen: ',
          Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
    # keyed by config['id'] so that file- and DB-driven runs are named consistently
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(config['id']))
    print(str(np.datetime64('now')), 'INFO: model saved.')
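
# Entry-point sketch (assumed, not from the original source): wire `run` to the
# command line so the pipeline can be launched either from a config file or from
# a config id stored in the database. The argument handling is hypothetical.
if __name__ == '__main__':
    import sys

    if len(sys.argv) > 2 and sys.argv[1] == '--id':
        run(None, id=int(sys.argv[2]))  # config row read from the database
    else:
        run(sys.argv[1])                # config read from a local file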