def __init__(self, input_dim, output_dim):
    '''
    :param input_dim: input dimension of layer
    :param output_dim: output dimension of layer
    '''
    BaseLayer.__init__(self, input_dim, output_dim,
                       activation_function=Leaky_ReLU)
def __init__(self, opt):
    self.cuda = True if torch.cuda.is_available() else False
    self.Tensor = torch.cuda.FloatTensor if self.cuda else torch.FloatTensor
    self.opt = opt

    self.generator = Generator(opt)
    self.generator_predict = Generator(opt)
    self.discriminator = Discriminator(opt)

    # Exponential moving average decay used for the inference generator.
    self.decay = 0.5 ** (opt.batch_size / (10 * 1000)) * opt.adjust_decay_param
    first_decay = 0

    if opt.is_restore_model:
        models = BaseLayer.restore_model(opt.model_path)
        if models is not None:
            self.generator, self.generator_predict, self.discriminator = models
            first_decay = self.decay

    BaseLayer.print_model_parameters(self.generator, 'generator')
    BaseLayer.print_model_parameters(self.discriminator, 'discriminator')

    self.generator.train()
    self.generator_predict.eval()
    Generator.apply_decay_parameters(self.generator,
                                     self.generator_predict,
                                     decay=first_decay)
    self.discriminator.train()

    self.generator_loss = GeneratorLoss()
    self.generator_loss_path_reg = GeneratorLossPathReg(opt=opt)
    self.discriminator_loss = DiscriminatorLoss()
    self.discriminator_loss_r1 = DiscriminatorLossR1(
        reg_interval=opt.d_reg_interval)

    self.dataloader = get_dataloader(opt.data_path, opt.resolution,
                                     opt.batch_size)
    self.fid = FrechetInceptionDistance(self.generator_predict,
                                        self.dataloader, opt)

    # Lazy regularization: Adam hyperparameters are rescaled by the
    # regularization interval for both networks.
    learning_rate, beta1, beta2 = self.get_adam_params_adjust_interval(
        opt.g_reg_interval, opt)
    self.optimizer_g = torch.optim.Adam(self.generator.parameters(),
                                        lr=learning_rate,
                                        betas=(beta1, beta2))
    learning_rate, beta1, beta2 = self.get_adam_params_adjust_interval(
        opt.d_reg_interval, opt)
    self.optimizer_d = torch.optim.Adam(self.discriminator.parameters(),
                                        lr=learning_rate,
                                        betas=(beta1, beta2))

    if not os.path.isdir(self.opt.cache_path):
        os.makedirs(self.opt.cache_path, exist_ok=True)
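# The helper `get_adam_params_adjust_interval` is not shown in this snippet.
# In StyleGAN2-style lazy regularization the regularization term is only
# evaluated every `reg_interval` minibatches, so the Adam hyperparameters are
# rescaled accordingly. A minimal sketch of such a helper, assuming `opt`
# exposes `learning_rate`, `beta1` and `beta2` (hypothetical attribute names);
# the repository's actual implementation may differ.
def get_adam_params_adjust_interval_sketch(reg_interval, opt):
    # Ratio of ordinary steps to (ordinary + regularization) steps.
    mb_ratio = reg_interval / (reg_interval + 1)
    learning_rate = opt.learning_rate * mb_ratio
    beta1 = opt.beta1 ** mb_ratio
    beta2 = opt.beta2 ** mb_ratio
    return learning_rate, beta1, beta2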
def main(opt):
    _, generator_predict, _ = BaseLayer.restore_model(opt.model_path)
    generator_predict.truncation_psi = opt.truncation_psi

    if not os.path.isdir(opt.output_path):
        os.makedirs(opt.output_path, exist_ok=True)

    for index, seed in enumerate(opt.seeds):
        rnd = np.random.RandomState(seed)
        z = rnd.randn(1, opt.latent_dim)
        z = Tensor(np.tile(z, (opt.batch_size, 1)))
        images, _ = generator_predict(z)
        images = images.to('cpu').detach().numpy()

        pil_img = convert_to_pil_image(images[0])
        now = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = os.path.join(opt.output_path,
                                '{}_{}.png'.format(now, seed))
        pil_img.save(filename)
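# `convert_to_pil_image` is defined elsewhere in the repository. A minimal
# sketch of the conversion, assuming the generator outputs CHW float arrays
# in the [-1, 1] range (the same dynamic range that `adjust_dynamic_range`
# maps to [0, 255] in the training loop); the actual helper may differ.
import numpy as np
from PIL import Image


def convert_to_pil_image_sketch(image_chw):
    img = (image_chw + 1.0) * (255.0 / 2.0)           # [-1, 1] -> [0, 255]
    img = np.rint(img).clip(0, 255).astype(np.uint8)  # quantize to uint8
    img = img.transpose(1, 2, 0)                      # CHW -> HWC for PIL
    return Image.fromarray(img, 'RGB')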
def run(file, id=None):
    # ----------------- #
    # SETUP             #
    # ----------------- #
    if id is None:
        config = get_config_file(file)
    else:
        config, engine = get_config_db(id)

    for d in ['../Data/Features', '../Data/Geofiles/OSM',
              '../Data/Geofiles/nightlights', '../Data/Results']:
        if not os.path.exists(d):
            os.makedirs(d)

    # --------------------- #
    # Setting up playground #
    # --------------------- #
    assert os.path.exists(config['dataset_filename']), \
        "oops, dataset specified not found: " + config['dataset_filename']
    data = pd.read_csv(config['dataset_filename'])
    print(str(np.datetime64('now')), 'INFO: original dataset length: ',
          data.shape[0])

    data['gpsLongitude'] = np.round(data['gpsLongitude'], 5)
    data['gpsLatitude'] = np.round(data['gpsLatitude'], 5)

    # avoid duplicates
    data = data[['gpsLongitude', 'gpsLatitude', config['indicator']]].groupby(
        ['gpsLongitude', 'gpsLatitude']).mean()

    # base layer
    assert os.path.exists(config['base_raster']), \
        "oops, raster specified not found: " + config['base_raster']
    GRID = BaseLayer(config['base_raster'],
                     data.index.get_level_values('gpsLongitude'),
                     data.index.get_level_values('gpsLatitude'))
    # TODO: we should enforce the most accurate i and j when training, i.e. aggregate = 1?

    # Get Polygon GeoJSON of the boundaries
    # TODO: maybe go into BaseLayer class?
    minlat, maxlat, minlon, maxlon = boundaries(GRID.lat, GRID.lon, buffer=0.05)
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print(str(np.datetime64('now')),
          "INFO: Number of clusters: {} ".format(len(data)))

    pipeline = 'evaluation'

    # ------------------------------- #
    # get features from Google images #
    # ------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'G']:

        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"

        from google_images import GoogleImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path.format(id, pipeline),
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            gimages = GoogleImages(data_path)
            # download the images from the relevant API
            gimages.download(GRID.lon, GRID.lat,
                             step=config['satellite_config']['satellite_step'])
            # extract the features
            features = pd.DataFrame(gimages.featurize(
                GRID.lon, GRID.lat,
                step=config['satellite_config']['satellite_step']),
                index=data.index)
            features.columns = [
                str(col) + '_Google' for col in features.columns
            ]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Google satellite images')

    # --------------------------------- #
    # get features from Sentinel images #
    # --------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'S']:

        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"]["start_date"]
        end_date = config["satellite_config"]["end_date"]

        from sentinel_images import SentinelImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path.format(id, pipeline),
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            simages = SentinelImages(data_path)
            # download the images from the relevant API
            simages.download(GRID.lon, GRID.lat, start_date, end_date)
            print('INFO: scoring ...')
            # extract the features
            print('INFO: extractor instantiated.')
            features = pd.DataFrame(simages.featurize(GRID.lon, GRID.lat,
                                                      start_date, end_date),
                                    index=data.index)
            features.columns = [
                str(col) + '_Sentinel' for col in features.columns
            ]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Sentinel images')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, config['nightlights_date']['start'],
                     config['nightlights_date']['end'])
    features = pd.DataFrame(nlights.featurize(GRID.lon, GRID.lat),
                            columns=['nightlights'],
                            index=data.index)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'], 5,
                                      labels=False, duplicates='drop')
    data = data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(GRID.lat, GRID.lon, osm_gdf[value])
            data['distance_{}'.format(value)] = [
                np.log(0.0001 + x) for x in dist
            ]

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', config['NDs_date']['start'],
                   config['NDs_date']['end'], config['scope'])
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(
        GRID.lon, GRID.lat)

    # --------- #
    # add ACLED #
    # --------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(config['iso3'], config['nightlights_date']['start'],
                   config['nightlights_date']['end'])
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(GRID.lon, GRID.lat,
                                                         property=property,
                                                         function='density',
                                                         buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(
        GRID.lon, GRID.lat, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(GRID.lon, GRID.lat,
                                                   function='distance')

    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # ------------- #
    # save features #
    # ------------- #
    # drop columns with only 1 value
    print('INFO: {} columns. Dropping constant features (if any) ...'.format(
        len(data.columns)))
    data = data[[col for col in data if not data[col].nunique() == 1]]
    print('INFO: {} columns.'.format(len(data.columns)))

    # features to be used in the linear model
    features_list = list(
        sorted(set(data.columns) - set(['i', 'j', config['indicator']])))

    # save non-scaled features
    data.to_csv(
        "../Data/Features/features_all_id_{}_evaluation_nonscaled.csv".format(
            config['id']))

    # scale features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / \
                          (data[features_list].max() + 0.001)
    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(
        config['id']))

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783)  # shuffle data
    scores_dict = {}  # placeholder to save the scores

    from modeller import Modeller
    X, y = data[features_list].reset_index(), data[config['indicator']]
    modeller = Modeller(X,
                        rs_features=features_list,
                        spatial_features=["gpsLatitude", "gpsLongitude"],
                        scoring='r2',
                        cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    scores_dict['kNN_R2_mean'] = round(kNN_scores.mean(), 2)
    scores_dict['kNN_R2_std'] = round(kNN_scores.std(), 2)
    print("kNN_R2_mean: ", scores_dict['kNN_R2_mean'],
          "kNN_R2_std: ", scores_dict['kNN_R2_std'])

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    scores_dict['ridge_R2_mean'] = round(Ridge_scores.mean(), 2)
    scores_dict['ridge_R2_std'] = round(Ridge_scores.std(), 2)
    print("Ridge_R2_mean: ", scores_dict['ridge_R2_mean'],
          "Ridge_R2_std: ", scores_dict['ridge_R2_std'])

    Ensemble_pipeline = modeller.make_ensemble_pipeline(
        [kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    scores_dict['ensemble_R2_mean'] = round(Ensemble_scores.mean(), 2)
    scores_dict['ensemble_R2_std'] = round(Ensemble_scores.std(), 2)
    print("Ensemble_R2_mean: ", scores_dict['ensemble_R2_mean'],
          "Ensemble_R2_std: ", scores_dict['ensemble_R2_std'])

    # save results
    if id is None:
        write_scores_to_file(scores_dict, config['id'])
    else:
        write_scores_to_db(scores_dict, config['id'], engine)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame(
        {
            'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
            'y': data[config['indicator']].values
        },
        index=data.index)
    results.to_csv('../Data/Results/config_{}.csv'.format(config['id']))

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ',
          Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # best alpha (Ridge)
    print('INFO: regularization param chosen: ',
          Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    import joblib
    joblib.dump(Ensemble_pipeline,
                '../Models/Ensemble_model_config_id_{}.pkl'.format(id))

    print(str(np.datetime64('now')), 'INFO: model saved.')
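# `boundaries` and `points_to_polygon` come from the project's utilities and
# are not shown here. A minimal sketch of what the AOI construction could look
# like with shapely, assuming a rectangular bounding box padded by an optional
# buffer in degrees (hypothetical re-implementations, not the project's code):
from shapely.geometry import Polygon


def boundaries_sketch(lats, lons, buffer=0.05):
    # Bounding box of the survey clusters, padded by `buffer` degrees.
    return (min(lats) - buffer, max(lats) + buffer,
            min(lons) - buffer, max(lons) + buffer)


def points_to_polygon_sketch(minlon, minlat, maxlon, maxlat):
    # Rectangular polygon used to clip rasters and downloads to the AOI.
    return Polygon([(minlon, minlat), (maxlon, minlat),
                    (maxlon, maxlat), (minlon, maxlat)])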
dataloader = get_dataloader(opt.data_path, opt.resolution, opt.batch_size)
# dataset = BoxDataset(
#     file_path=os.path.join(opt.data_path, '*.png'),
#     transform=transforms.Compose([
#         transforms.Resize(32),
#         transforms.ToTensor(),
#         TranformDynamicRange([0, 255], [-1, 1])
#     ]),
# )
#
# dataloader = torch.utils.data.DataLoader(
#     dataset=dataset,
#     batch_size=8,
#     shuffle=True,
# )

generator = Generator(opt)
# if opt.is_restore_model:
#     generator.restore()
models = BaseLayer.restore_model(opt.model_path)
if models is not None:
    generator, generator_predict, discriminator = models

fid = FrechetInceptionDistance(generator, dataloader=dataloader, opt=opt)
fid_score = fid.get_score()
print(fid_score)
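# `FrechetInceptionDistance` is implemented elsewhere in the repository. For
# reference, the FID metric is the Frechet distance between Gaussians fitted
# to Inception activations of real and generated images:
#     FID = ||mu_r - mu_g||^2 + Tr(C_r + C_g - 2 * (C_r C_g)^(1/2))
# A minimal numpy sketch of that closed-form distance, assuming the activation
# statistics have already been collected (not the repository's actual code):
import numpy as np
from scipy import linalg


def frechet_distance_sketch(mu_real, cov_real, mu_fake, cov_fake):
    covmean, _ = linalg.sqrtm(cov_real.dot(cov_fake), disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real  # discard tiny imaginary parts from sqrtm
    diff = mu_real - mu_fake
    return diff.dot(diff) + np.trace(cov_real + cov_fake - 2.0 * covmean)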
def train_generator(self, current_loop_num):
    BaseLayer.set_model_parameter_requires_grad_all(self.generator, True)
    BaseLayer.set_model_parameter_requires_grad_all(self.discriminator, False)

    # train generator
    # TensorboardLogger.print_parameter(generator)
    for index in range(0, self.opt.generator_train_num):
        train_z = self.Tensor(
            np.random.normal(loc=0,
                             scale=1,
                             size=(self.opt.batch_size, self.opt.latent_dim)))
        fake_imgs, fake_dlatents_out = self.generator(train_z)
        fake_validity = self.discriminator(fake_imgs)
        prob_fake = torch.sigmoid(fake_validity).mean()
        TensorboardLogger.write_scalar('prob_fake/generator', prob_fake)
        # print('{} prob_fake(generator): {}'.format(index, prob_fake))

        g_loss = self.generator_loss(fake_validity)
        self.optimizer_g.zero_grad()
        g_loss.backward()
        self.optimizer_g.step()

    run_g_reg = current_loop_num % self.opt.g_reg_interval == 0
    if run_g_reg:
        # path length regularization for the generator
        g_reg_maxcount = min(4, self.opt.generator_train_num)
        for _ in range(0, g_reg_maxcount):
            z = self.Tensor(
                np.random.normal(loc=0,
                                 scale=1,
                                 size=(self.opt.batch_size,
                                       self.opt.latent_dim)))
            pl_fake_imgs, pl_fake_dlatents_out = self.generator(z)
            g_reg, pl_length = self.generator_loss_path_reg(
                pl_fake_imgs, pl_fake_dlatents_out)
            self.optimizer_g.zero_grad()
            g_reg.backward()
            self.optimizer_g.step()

            TensorboardLogger.write_scalar('loss/g_reg', g_reg)
            TensorboardLogger.write_scalar('loss/path_length', pl_length)
            TensorboardLogger.write_scalar(
                'loss/pl_mean_var',
                self.generator_loss_path_reg.pl_mean_var.mean())

    # apply exponentially moving averaged weights to the inference generator
    Generator.apply_decay_parameters(self.generator,
                                     self.generator_predict,
                                     decay=self.decay)
    fake_imgs_predict, fake_dlatents_out_predict = self.generator_predict(
        train_z)
    fake_predict_validity = self.discriminator(fake_imgs_predict)
    prob_fake_predict = torch.sigmoid(fake_predict_validity).mean()
    TensorboardLogger.write_scalar('prob_fake_predict/generator',
                                   prob_fake_predict)
    # print('prob_fake_predict(generator): {}'.format(prob_fake_predict))

    Generator.apply_decay_parameters(self.generator_predict,
                                     self.generator,
                                     decay=self.opt.reverse_decay)

    if current_loop_num % self.opt.save_metrics_interval == 0:
        TensorboardLogger.write_scalar('score/g_score', fake_validity.mean())
        TensorboardLogger.write_scalar('loss/g_loss', g_loss)
        TensorboardLogger.write_histogram('generator/fake_imgs', fake_imgs)
        TensorboardLogger.write_histogram('generator/fake_dlatents_out',
                                          fake_dlatents_out)
        TensorboardLogger.write_histogram('generator/fake_imgs_predict',
                                          fake_imgs_predict)
        TensorboardLogger.write_histogram(
            'generator/fake_dlatents_out_predict', fake_dlatents_out_predict)

    if current_loop_num % self.opt.save_images_tensorboard_interval == 0:
        # for index in range(fake_imgs.shape[0]):
        #     img = adjust_dynamic_range(
        #         fake_imgs[index].to('cpu').detach().numpy(),
        #         drange_in=[-1, 1], drange_out=[0, 255])
        #     TensorboardLogger.write_image('images/fake/{}'.format(index), img)
        for index in range(fake_imgs_predict.shape[0]):
            img = adjust_dynamic_range(
                fake_imgs_predict[index].to('cpu').detach().numpy(),
                drange_in=[-1, 1],
                drange_out=[0, 255])
            TensorboardLogger.write_image(
                'images/fake_predict/{}'.format(index), img)

    if current_loop_num % self.opt.save_images_interval == 0:
        # save the generated images
        if not os.path.isdir(self.opt.results):
            os.makedirs(self.opt.results, exist_ok=True)

        # fake_imgs_val, fake_dlatents_out_val = generator(val_z)
        # save_image_grid(
        #     # fake_imgs_val.to('cpu').detach().numpy(),
        #     fake_imgs.to('cpu').detach().numpy(),
        #     os.path.join(self.opt.results,
        #                  '{}_fake.png'.format(TensorboardLogger.global_step)),
        #     batch_size=self.opt.batch_size,
        #     drange=[-1, 1])
        # fake_imgs_predict_val, fake_dlatents_out_predict_val = generator_predict(val_z)
        save_image_grid(fake_imgs_predict.to('cpu').detach().numpy(),
                        os.path.join(
                            self.opt.results,
                            '{}_fake_predict.png'.format(
                                TensorboardLogger.global_step)),
                        batch_size=self.opt.batch_size,
                        drange=[-1, 1])

    return g_loss
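# `Generator.apply_decay_parameters` is defined in the Generator class. The
# training loop uses it to keep `generator_predict` as an exponential moving
# average of the training generator's weights (decay=0 copies them outright,
# matching the initialization above). A minimal sketch of that kind of EMA
# update, under the assumption that the first argument is the source and the
# second the destination; the real method may differ.
import torch


@torch.no_grad()
def apply_decay_parameters_sketch(src_model, dst_model, decay):
    # dst <- decay * dst + (1 - decay) * src, parameter by parameter.
    src_params = dict(src_model.named_parameters())
    for name, dst_param in dst_model.named_parameters():
        dst_param.copy_(decay * dst_param + (1.0 - decay) * src_params[name])
    # buffers (e.g. running statistics) are typically copied directly
    src_buffers = dict(src_model.named_buffers())
    for name, dst_buf in dst_model.named_buffers():
        dst_buf.copy_(src_buffers[name])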
def save_model(self):
    BaseLayer.save_model(
        self.opt.model_path,
        (self.generator, self.generator_predict, self.discriminator))
def train_discriminator(self, current_loop_num):
    BaseLayer.set_model_parameter_requires_grad_all(self.generator, False)
    BaseLayer.set_model_parameter_requires_grad_all(self.discriminator, True)

    # train discriminator
    for index in range(0, self.opt.discriminator_train_num):
        data_iterator = iter(self.dataloader)
        imgs = next(data_iterator)
        # imgs = TranformDynamicRange.fade_lod(x=imgs, lod=0.0)
        # imgs = TranformDynamicRange.upscale_lod(x=imgs, lod=0.0)
        real_imgs = Variable(imgs.type(self.Tensor), requires_grad=False)

        z = self.Tensor(
            np.random.normal(loc=0,
                             scale=1,
                             size=(self.opt.batch_size, self.opt.latent_dim)))
        fake_imgs, fake_dlatents_out = self.generator(z)

        real_validity = self.discriminator(real_imgs)
        prob_real = torch.sigmoid(real_validity).mean()
        TensorboardLogger.write_scalar('prob_real/discriminator', prob_real)
        # print('{} prob_real(discriminator): {}'.format(index, prob_real))

        fake_validity = self.discriminator(fake_imgs)
        prob_fake = torch.sigmoid(fake_validity).mean()
        TensorboardLogger.write_scalar('prob_fake/discriminator', prob_fake)
        # print('{} prob_fake(discriminator): {}'.format(index, prob_fake))

        d_loss = self.discriminator_loss(fake_validity, real_validity)
        self.optimizer_d.zero_grad()
        d_loss.backward()
        self.optimizer_d.step()

    run_d_reg = current_loop_num % self.opt.d_reg_interval == 0
    if run_d_reg:
        d_reg_maxcount = min(4, self.opt.discriminator_train_num)
        for index in range(0, d_reg_maxcount):
            # R1 regularization for the discriminator
            # z = self.Tensor(np.random.normal(loc=0, scale=1, size=(self.opt.batch_size, self.opt.latent_dim)))
            # fake_imgs, fake_dlatents_out = self.generator(z)
            # fake_validity = self.discriminator(fake_imgs)
            real_imgs.requires_grad = True
            real_validity = self.discriminator(real_imgs)
            d_reg = self.discriminator_loss_r1(real_validity, real_imgs)
            self.optimizer_d.zero_grad()
            d_reg.backward()
            self.optimizer_d.step()

            TensorboardLogger.writer.add_scalar(
                '{}/reg/d_reg'.format(TensorboardLogger.now), d_reg,
                TensorboardLogger.global_step)

    if current_loop_num % self.opt.save_metrics_interval == 0:
        TensorboardLogger.write_scalar('score/d_score', real_validity.mean())
        TensorboardLogger.write_scalar('loss/d_loss', d_loss)
        TensorboardLogger.write_histogram('real_imgs', real_imgs)

    return d_loss
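# `DiscriminatorLossR1` is defined elsewhere in the repository. The R1
# regularizer penalizes the gradient of the discriminator output with respect
# to real images:
#     R1 = (gamma / 2) * E[ ||grad_x D(x)||^2 ],
# often scaled by the regularization interval when applied lazily. A minimal
# sketch of the core term; `gamma` and `reg_interval` here are hypothetical
# parameter names and the repository's loss class may differ.
import torch


def r1_penalty_sketch(real_validity, real_imgs, gamma=10.0, reg_interval=16):
    # real_imgs must have requires_grad=True, as set in the training loop above.
    grads = torch.autograd.grad(outputs=real_validity.sum(),
                                inputs=real_imgs,
                                create_graph=True)[0]
    penalty = grads.pow(2).reshape(grads.shape[0], -1).sum(1).mean()
    return (gamma * 0.5) * penalty * reg_interval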
def main(id, aggregate_factor, min_pop, bbox, shapefile):
    """
    makes predictions in areas where we have no survey.
    Args:
        id (int): the config id
        aggregate_factor (int): aggregate pixels to lower resolution by x much
        min_pop: minimum population in pixel to score
        bbox: bounding box <minlat> <minlon> <maxlat> <maxlon>, if omitted
            will use boundaries from dataset
        shapefile: aggregate within shapefile's geometries
    Example: id, aggregate_factor, min_pop = 3075, 15, 500
    """
    # read the configs for id
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query(
        "select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["base_raster"][0]
    scope = config["scope"][0]
    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")
    s2_date_start = config["NDs_date"][0].get("start")
    s2_date_end = config["NDs_date"][0].get("end")
    ISO = config["iso3"][0]

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        print('INFO: satellite images from Google and Sentinel-2')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'G':
        print('INFO: only Google satellite images.')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'N':
        print('INFO: no satellite images')

    # ------------------------------------------------------------ #
    # WorldPop raster too granular (lots of images), aggregate it   #
    # ------------------------------------------------------------ #
    if aggregate_factor > 1:
        print('INFO: aggregating raster with factor {}'.format(
            aggregate_factor))
        base_raster = "../local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    # dataset_df = pd.read_csv(dataset)
    # data_cols = dataset_df.columns.values
    if sum(bbox) != 0:  # dummy bbox
        print("INFO: using AOI from bbox")
        print(sum(bbox))
        # define AOI with manually defined bbox
        minlat, minlon, maxlat, maxlon = bbox[0], bbox[1], bbox[2], bbox[3]
        area = points_to_polygon(minlat=minlat, minlon=minlon,
                                 maxlat=maxlat, maxlon=maxlon)
    else:
        print("INFO: using AOI from dataset.")
        # use dataset's extent
        dataset_df = pd.read_csv(dataset)
        minlat, maxlat, minlon, maxlon = boundaries(dataset_df['gpsLatitude'],
                                                    dataset_df['gpsLongitude'])
        area = points_to_polygon(minlat=minlat, minlon=minlon,
                                 maxlat=maxlat, maxlon=maxlon)
        del dataset_df

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({
        "driver": "GTiff",
        "height": out_image.shape[1],
        "width": out_image.shape[2],
        "transform": out_transform
    })

    final_raster = "../final_raster.tif"
    print('INFO: Removing tiles with population under {}'.format(min_pop))
    # only score pixels with at least `min_pop` people at the chosen
    # aggregation factor
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = BaseLayer(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    ix = pd.MultiIndex.from_arrays(
        [list_i, list_j, coords_y, coords_x],
        names=('i', 'j', "gpsLatitude", "gpsLongitude"))

    print("Number of clusters: {} ".format(len(ix)))

    pipeline = 'scoring'

    # ------------------------------------------------ #
    # download images from Google and Extract Features #
    # ------------------------------------------------ #
    if config['satellite_config'][0].get('satellite_images') in ['Y', 'G']:

        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"

        gimages = GoogleImages(data_path)
        # download the images from the relevant API
        gimages.download(coords_x, coords_y, step=step)
        # extract the features
        features = pd.DataFrame(gimages.featurize(coords_x, coords_y,
                                                  step=step),
                                index=ix)
        features.columns = [str(col) + '_Google' for col in features.columns]
        features.to_csv(features_path)
        print('INFO: features extracted.')
        data = features.copy()

    # --------------------------------------------- #
    # download Sentinel images and Extract Features #
    # --------------------------------------------- #
    if config['satellite_config'][0].get('satellite_images') == 'Y':

        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        from sentinel_images import SentinelImages

        simages = SentinelImages(data_path)
        # download the images from the relevant API
        simages.download(coords_x, coords_y, start_date, end_date)
        print('INFO: scoring ...')
        # extract the features
        print('INFO: extractor instantiated.')
        features = pd.DataFrame(simages.featurize(coords_x, coords_y,
                                                  start_date, end_date),
                                index=ix)
        features.columns = [str(col) + '_Sentinel' for col in features.columns]
        features.to_csv(features_path)

        if data is not None:
            data = data.join(features)
        else:
            data = features.copy()
        print('INFO: features extracted')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, nightlights_date_start, nightlights_date_end)
    features = pd.DataFrame(nlights.featurize(coords_x, coords_y),
                            columns=['nightlights'],
                            index=ix)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'], 5,
                                      labels=False, duplicates='drop')
    data = data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    for key, values in tags.items():
        for value in values:
            osm_gdf[value] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(coords_y, coords_x, osm_gdf[value])
            data['distance_{}'.format(value)] = [
                np.log(0.0001 + x) for x in dist
            ]

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end,
                   scope)
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(
        coords_x, coords_y)

    # --------- #
    # add ACLED #
    # --------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(ISO, nightlights_date_start, nightlights_date_end)
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(coords_x, coords_y,
                                                         property=property,
                                                         function='density',
                                                         buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(
        coords_x, coords_y, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(coords_x, coords_y,
                                                   function='distance')

    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # ------------- #
    # save features #
    # ------------- #
    print('INFO: {} columns.'.format(len(data.columns)))

    # features to be used in the linear model
    features_list = list(sorted(data.columns))
    print(features_list)

    data.to_csv("../Data/Features/features_all_id_{}_{}_nonscaled.csv".format(
        id, pipeline))

    # scale features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / \
                          (data[features_list].max() + 0.001)
    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(
        id, pipeline))

    # ------- #
    # predict #
    # ------- #
    ensemble_pipeline = joblib.load(
        '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data.reset_index(level=[2, 3])
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    results = pd.DataFrame({
        'i': list_i,
        'j': list_j,
        'lat': coords_y,
        'lon': coords_x,
        'yhat': ensemble_predictions
    })

    results.to_csv('../Data/Results/config_{}.csv'.format(id))

    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results,
                 value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results,
                 value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"
        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)

        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
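# `aggregate` (the raster downsampling helper used above) lives in the
# project's utils and is not shown here. A minimal sketch of what such a step
# could look like, assuming the intent is to merge blocks of
# `aggregate_factor` x `aggregate_factor` pixels while preserving population
# counts (sum resampling, which needs a reasonably recent rasterio/GDAL);
# this is an illustration, not the project's actual implementation.
import rasterio
from rasterio.enums import Resampling


def aggregate_sketch(input_raster, output_raster, factor):
    with rasterio.open(input_raster) as src:
        # read at reduced resolution, summing the pixels in each block
        data = src.read(
            out_shape=(src.count, src.height // factor, src.width // factor),
            resampling=Resampling.sum)
        # rescale the affine transform to match the new pixel size
        transform = src.transform * src.transform.scale(
            src.width / data.shape[-1], src.height / data.shape[-2])
        meta = src.meta.copy()
        meta.update(height=data.shape[-2], width=data.shape[-1],
                    transform=transform)
    with rasterio.open(output_raster, 'w', **meta) as dst:
        dst.write(data)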