Example #1
    def __init__(self, input_dim, output_dim):
        '''
        :param input_dim: input dimension of layer
        :param output_dim: output dimension of layer
        '''
        BaseLayer.__init__(self,
                           input_dim,
                           output_dim,
                           activation_function=Leaky_ReLU)
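Leaky_ReLU is passed to BaseLayer as a callable but is not defined in this snippet. A minimal stand-in, assuming a PyTorch backend and the 0.2 negative slope commonly used in GAN layers (both assumptions, not taken from the code above):

import torch.nn.functional as F


def Leaky_ReLU(x, negative_slope=0.2):
    # Hypothetical stand-in for the activation handed to BaseLayer;
    # the 0.2 slope is an assumption, not taken from the snippet.
    return F.leaky_relu(x, negative_slope)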
Example #2
    def __init__(self, opt):
        self.cuda = torch.cuda.is_available()
        self.Tensor = torch.cuda.FloatTensor if self.cuda else torch.FloatTensor

        self.opt = opt
        self.generator = Generator(opt)
        self.generator_predict = Generator(opt)
        self.discriminator = Discriminator(opt)

        self.decay = 0.5**(opt.batch_size /
                           (10 * 1000)) * opt.adjust_decay_param
        first_decay = 0
        if opt.is_restore_model:
            models = BaseLayer.restore_model(opt.model_path)
            if models is not None:
                self.generator, self.generator_predict, self.discriminator = models
                first_decay = self.decay

        BaseLayer.print_model_parameters(self.generator, 'generator')
        BaseLayer.print_model_parameters(self.discriminator, 'discriminator')

        self.generator.train()
        self.generator_predict.eval()

        Generator.apply_decay_parameters(self.generator,
                                         self.generator_predict,
                                         decay=first_decay)
        self.discriminator.train()

        self.generator_loss = GeneratorLoss()
        self.generator_loss_path_reg = GeneratorLossPathReg(opt=opt)

        self.discriminator_loss = DiscriminatorLoss()
        self.discriminator_loss_r1 = DiscriminatorLossR1(
            reg_interval=opt.d_reg_interval)

        self.dataloader = get_dataloader(opt.data_path, opt.resolution,
                                         opt.batch_size)
        self.fid = FrechetInceptionDistance(self.generator_predict,
                                            self.dataloader, opt)

        learning_rate, beta1, beta2 = self.get_adam_params_adjust_interval(
            opt.g_reg_interval, opt)
        self.optimizer_g = torch.optim.Adam(self.generator.parameters(),
                                            lr=learning_rate,
                                            betas=(beta1, beta2))

        learning_rate, beta1, beta2 = self.get_adam_params_adjust_interval(
            opt.d_reg_interval, opt)
        self.optimizer_d = torch.optim.Adam(self.discriminator.parameters(),
                                            lr=learning_rate,
                                            betas=(beta1, beta2))

        if not os.path.isdir(self.opt.cache_path):
            os.makedirs(self.opt.cache_path, exist_ok=True)
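get_adam_params_adjust_interval is not shown in these snippets. In StyleGAN2-style training with lazy regularization, the usual convention is to rescale the Adam hyperparameters by reg_interval / (reg_interval + 1), compensating for the regularization pass running only every reg_interval steps. A minimal sketch under that assumption, written as a plain function (opt.learning_rate, opt.beta1 and opt.beta2 are guessed attribute names, not taken from the code):

def get_adam_params_adjust_interval(reg_interval, opt):
    # Hypothetical sketch following the StyleGAN2 lazy-regularization convention:
    # scale the learning rate and exponentiate the betas by reg_interval / (reg_interval + 1).
    ratio = reg_interval / (reg_interval + 1)
    learning_rate = opt.learning_rate * ratio
    beta1 = opt.beta1 ** ratio
    beta2 = opt.beta2 ** ratio
    return learning_rate, beta1, beta2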
Example #3
import os
from datetime import datetime

import numpy as np


def main(opt):
    _, generator_predict, _ = BaseLayer.restore_model(opt.model_path)
    generator_predict.truncation_psi = opt.truncation_psi

    if not os.path.isdir(opt.output_path):
        os.makedirs(opt.output_path, exist_ok=True)

    for index, seed in enumerate(opt.seeds):
        rnd = np.random.RandomState(seed)
        z = rnd.randn(1, opt.latent_dim)
        z = Tensor(np.tile(z, (opt.batch_size, 1)))
        images, _ = generator_predict(z)
        images = images.to('cpu').detach().numpy()
        pil_img = convert_to_pil_image(images[0])
        now = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = os.path.join(opt.output_path, '{}_{}.png'.format(now, seed))
        pil_img.save(filename)
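Tensor and convert_to_pil_image come from the surrounding project and are not defined here. Tensor is presumably the same device-aware alias that Example #2 builds; running this script standalone would require something like the following definition (an assumption mirroring that example):

import torch

# Device-aware tensor alias, mirroring the pattern used in Example #2 (assumed here).
Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor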
Example #4
def run(file, id=None):
    # ----------------- #
    # SETUP #############
    # ----------------- #
    if id is None:
        config = get_config_file(file)
    else:
        config, engine = get_config_db(id)

    for d in [
            '../Data/Features', '../Data/Geofiles/OSM',
            '../Data/Geofiles/nightlights', '../Data/Results'
    ]:
        if not os.path.exists(d):
            os.makedirs(d)

    # --------------------- #
    # Setting up playground #
    # --------------------- #
    assert os.path.exists(config['dataset_filename']), \
        "oops, dataset specified not found: " + config['dataset_filename']
    data = pd.read_csv(config['dataset_filename'])
    print(str(np.datetime64('now')), 'INFO: original dataset length: ',
          data.shape[0])
    data['gpsLongitude'] = np.round(data['gpsLongitude'], 5)
    data['gpsLatitude'] = np.round(data['gpsLatitude'], 5)

    # avoid duplicates
    data = data[['gpsLongitude', 'gpsLatitude',
                 config['indicator']]].groupby(['gpsLongitude',
                                                'gpsLatitude']).mean()

    # base layer
    assert os.path.exists(config['base_raster']), \
        "oops, raster specified not found: " + config['base_raster']
    GRID = BaseLayer(config['base_raster'],
                     data.index.get_level_values('gpsLongitude'),
                     data.index.get_level_values('gpsLatitude'))
    # TODO: we should enforce the most accurate i and j when training, i.e. aggregate = 1?

    # Get Polygon Geojson of the boundaries
    # TODO: maybe go into BaseLayer class?
    minlat, maxlat, minlon, maxlon = boundaries(GRID.lat,
                                                GRID.lon,
                                                buffer=0.05)
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print(str(np.datetime64('now')),
          "INFO: Number of clusters: {} ".format(len(data)))

    pipeline = 'evaluation'

    # ------------------------------- #
    # get features from Google images #
    # ------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'G']:
        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        from google_images import GoogleImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path,
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            gimages = GoogleImages(data_path)
            # download the images from the relevant API
            gimages.download(GRID.lon,
                             GRID.lat,
                             step=config['satellite_config']['satellite_step'])
            # extract the features
            features = pd.DataFrame(gimages.featurize(
                GRID.lon,
                GRID.lat,
                step=config['satellite_config']['satellite_step']),
                                    index=data.index)

            features.columns = [
                str(col) + '_Google' for col in features.columns
            ]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Google satellite images')

    # --------------------------------- #
    # get features from Sentinel images #
    # --------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'S']:
        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"]["start_date"]
        end_date = config["satellite_config"]["end_date"]

        from sentinel_images import SentinelImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path,
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            simages = SentinelImages(data_path)
            # download the images from the relevant API
            simages.download(GRID.lon, GRID.lat, start_date, end_date)
            print('INFO: scoring ...')
            # extract the features
            print('INFO: extractor instantiated.')
            features = pd.DataFrame(simages.featurize(GRID.lon, GRID.lat,
                                                      start_date, end_date),
                                    index=data.index)

            features.columns = [
                str(col) + '_Sentinel' for col in features.columns
            ]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Sentinel images')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, config['nightlights_date']['start'],
                     config['nightlights_date']['end'])
    features = pd.DataFrame(nlights.featurize(GRID.lon, GRID.lat),
                            columns=['nightlights'],
                            index=data.index)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'],
                                      5,
                                      labels=False,
                                      duplicates='drop')
    data = data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(GRID.lat, GRID.lon,
                                           osm_gdf["value"])
            data['distance_{}'.format(value)] = [
                np.log(0.0001 + x) for x in dist
            ]

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', config['NDs_date']['start'],
                   config['NDs_date']['end'], config['scope'])
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(
        GRID.lon, GRID.lat)

    # --------------- #
    # add ACLED #
    # --------------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(config['iso3'], config['nightlights_date']['start'],
                   config['nightlights_date']['end'])
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(GRID.lon,
                                                         GRID.lat,
                                                         property=property,
                                                         function='density',
                                                         buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(
        GRID.lon, GRID.lat, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(GRID.lon,
                                                   GRID.lat,
                                                   function='distance')
    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # --------------- #
    # save features   #
    # --------------- #
    # drop columns with only 1 value
    print('INFO: {} columns. Dropping features with a single unique value (if any) ...'.format(
        len(data.columns)))
    data = data[[col for col in data if not data[col].nunique() == 1]]
    print('INFO: {} columns.'.format(len(data.columns)))
    # features to be used in the linear model
    features_list = list(
        sorted(set(data.columns) - set(['i', 'j', config['indicator']])))

    # Save non-scaled features
    data.to_csv(
        "../Data/Features/features_all_id_{}_evaluation_nonscaled.csv".format(
            config['id']))

    # Scale Features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()
                           ) / (data[features_list].max() + 0.001)
    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(
        config['id']))

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783)  # shuffle data
    scores_dict = {}  # placeholder to save the scores
    from modeller import Modeller
    X, y = data[features_list].reset_index(), data[config['indicator']]
    modeller = Modeller(X,
                        rs_features=features_list,
                        spatial_features=["gpsLatitude", "gpsLongitude"],
                        scoring='r2',
                        cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    scores_dict['kNN_R2_mean'] = round(kNN_scores.mean(), 2)
    scores_dict['kNN_R2_std'] = round(kNN_scores.std(), 2)
    print("kNN_R2_mean: ", scores_dict['kNN_R2_mean'], "kNN_R2_std: ",
          scores_dict['kNN_R2_std'])

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    scores_dict['ridge_R2_mean'] = round(Ridge_scores.mean(), 2)
    scores_dict['ridge_R2_std'] = round(Ridge_scores.std(), 2)
    print("Ridge_R2_mean: ", scores_dict['ridge_R2_mean'], "Ridge_R2_std: ",
          scores_dict['ridge_R2_std'])

    Ensemble_pipeline = modeller.make_ensemble_pipeline(
        [kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    scores_dict['ensemble_R2_mean'] = round(Ensemble_scores.mean(), 2)
    scores_dict['ensemble_R2_std'] = round(Ensemble_scores.std(), 2)
    print("Ensemble_R2_mean: ", scores_dict['ensemble_R2_mean'],
          "Ensemble_R2_std: ", scores_dict['ensemble_R2_std'])

    # save results
    if id is None:
        write_scores_to_file(scores_dict, config['id'])
    else:
        write_scores_to_db(scores_dict, config['id'], engine)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')

    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame(
        {
            'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
            'y': data[config['indicator']].values
        },
        index=data.index)
    results.to_csv('../Data/Results/config_{}.csv'.format(config['id']))

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # Best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ',
          Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # Best alpha (Ridge)
    print('INFO: regularization param chosen: ',
          Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

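    # Note: sklearn.externals.joblib was removed in recent scikit-learn releases; use "import joblib" instead there.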
    from sklearn.externals import joblib
    joblib.dump(Ensemble_pipeline,
                '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
Example #5
    dataloader = get_dataloader(opt.data_path, opt.resolution, opt.batch_size)

    # dataset = BoxDataset(
    #     file_path=os.path.join(opt.data_path, '*.png'),
    #     transform=transforms.Compose(
    #         [
    #             transforms.Resize(32),
    #             transforms.ToTensor(),
    #             TranformDynamicRange([0, 255], [-1, 1])
    #         ]
    #     ),
    # )
    #
    # dataloader = torch.utils.data.DataLoader(
    #     dataset=dataset,
    #     batch_size=8,
    #     shuffle=True,
    # )

    generator = Generator(opt)
    # if opt.is_restore_model:
    #     generator.restore()

    models = BaseLayer.restore_model(opt.model_path)
    if models is not None:
        generator, generator_predict, discriminator = models

    fid = FrechetInceptionDistance(generator, dataloader=dataloader, opt=opt)
    fid_score = fid.get_score()
    print(fid_score)
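FrechetInceptionDistance itself is not shown in these snippets. For reference, the score fid.get_score() presumably reports is the standard FID between Inception features of real and generated images; a hypothetical helper that evaluates the formula from two pre-extracted feature matrices of shape [N, D] (feature extraction itself is omitted) could look like this:

import numpy as np
from scipy import linalg


def fid_from_features(real_feats, fake_feats):
    # FID = ||mu_r - mu_f||^2 + Tr(C_r + C_f - 2 * sqrtm(C_r @ C_f)),
    # where mu/C are the mean and covariance of the Inception features.
    mu_r, mu_f = real_feats.mean(axis=0), fake_feats.mean(axis=0)
    cov_r = np.cov(real_feats, rowvar=False)
    cov_f = np.cov(fake_feats, rowvar=False)
    covmean, _ = linalg.sqrtm(cov_r @ cov_f, disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    diff = mu_r - mu_f
    return float(diff @ diff + np.trace(cov_r + cov_f - 2.0 * covmean))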
Example #6
    def train_generator(self, current_loop_num):
        BaseLayer.set_model_parameter_requires_grad_all(self.generator, True)
        BaseLayer.set_model_parameter_requires_grad_all(
            self.discriminator, False)

        # train generator
        # TensorboardLogger.print_parameter(generator)
        for index in range(0, self.opt.generator_train_num):
            train_z = self.Tensor(
                np.random.normal(loc=0,
                                 scale=1,
                                 size=(self.opt.batch_size,
                                       self.opt.latent_dim)))
            fake_imgs, fake_dlatents_out = self.generator(train_z)
            fake_validity = self.discriminator(fake_imgs)

            prob_fake = torch.sigmoid(fake_validity).mean()
            TensorboardLogger.write_scalar('prob_fake/generator', prob_fake)
            # print('{} prob_fake(generator): {}'.format(index, prob_fake))

            g_loss = self.generator_loss(fake_validity)
            self.optimizer_g.zero_grad()
            g_loss.backward()
            self.optimizer_g.step()

        run_g_reg = current_loop_num % self.opt.g_reg_interval == 0
        if run_g_reg:
            # Path-length regularization for the generator
            g_reg_maxcount = min(4, self.opt.generator_train_num)
            for _ in range(0, g_reg_maxcount):
                z = self.Tensor(
                    np.random.normal(loc=0,
                                     scale=1,
                                     size=(self.opt.batch_size,
                                           self.opt.latent_dim)))
                pl_fake_imgs, pl_fake_dlatents_out = self.generator(z)
                g_reg, pl_length = self.generator_loss_path_reg(
                    pl_fake_imgs, pl_fake_dlatents_out)
                self.optimizer_g.zero_grad()
                g_reg.backward()
                self.optimizer_g.step()

            TensorboardLogger.write_scalar('loss/g_reg', g_reg)
            TensorboardLogger.write_scalar('loss/path_length', pl_length)
            TensorboardLogger.write_scalar(
                'loss/pl_mean_var',
                self.generator_loss_path_reg.pl_mean_var.mean())

        # Apply the exponentially moving-averaged weights to the generator used for inference
        Generator.apply_decay_parameters(self.generator,
                                         self.generator_predict,
                                         decay=self.decay)
        fake_imgs_predict, fake_dlatents_out_predict = self.generator_predict(
            train_z)
        fake_predict_validity = self.discriminator(fake_imgs_predict)
        prob_fake_predict = torch.sigmoid(fake_predict_validity).mean()
        TensorboardLogger.write_scalar('prob_fake_predict/generator',
                                       prob_fake_predict)
        # print('prob_fake_predict(generator): {}'.format(prob_fake_predict))

        Generator.apply_decay_parameters(self.generator_predict,
                                         self.generator,
                                         decay=self.opt.reverse_decay)

        if current_loop_num % self.opt.save_metrics_interval == 0:
            TensorboardLogger.write_scalar('score/g_score',
                                           fake_validity.mean())
            TensorboardLogger.write_scalar('loss/g_loss', g_loss)
            TensorboardLogger.write_histogram('generator/fake_imgs', fake_imgs)
            TensorboardLogger.write_histogram('generator/fake_dlatents_out',
                                              fake_dlatents_out)
            TensorboardLogger.write_histogram('generator/fake_imgs_predict',
                                              fake_imgs_predict)
            TensorboardLogger.write_histogram(
                'generator/fake_dlatents_out_predict',
                fake_dlatents_out_predict)

        if current_loop_num % self.opt.save_images_tensorboard_interval == 0:
            # for index in range(fake_imgs.shape[0]):
            #     img = adjust_dynamic_range(fake_imgs[index].to('cpu').detach().numpy(), drange_in=[-1, 1], drange_out=[0, 255])
            #     TensorboardLogger.write_image('images/fake/{}'.format(index), img)

            for index in range(fake_imgs_predict.shape[0]):
                img = adjust_dynamic_range(
                    fake_imgs_predict[index].to('cpu').detach().numpy(),
                    drange_in=[-1, 1],
                    drange_out=[0, 255])
                TensorboardLogger.write_image(
                    'images/fake_predict/{}'.format(index), img)

        if current_loop_num % self.opt.save_images_interval == 0:
            # Save the generated images
            if not os.path.isdir(self.opt.results):
                os.makedirs(self.opt.results, exist_ok=True)
            # fake_imgs_val, fake_dlatents_out_val = generator(val_z)
            # save_image_grid(
            #     # fake_imgs_val.to('cpu').detach().numpy(),
            #     fake_imgs.to('cpu').detach().numpy(),
            #     os.path.join(self.opt.results, '{}_fake.png'.format(TensorboardLogger.global_step)),
            #     batch_size=self.opt.batch_size,
            #     drange=[-1, 1])

            # fake_imgs_predict_val, fake_dlatents_out_predict_val = generator_predict(val_z)
            save_image_grid(fake_imgs_predict.to('cpu').detach().numpy(),
                            os.path.join(
                                self.opt.results, '{}_fake_predict.png'.format(
                                    TensorboardLogger.global_step)),
                            batch_size=self.opt.batch_size,
                            drange=[-1, 1])

        return g_loss
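Generator.apply_decay_parameters is not included in these snippets. Given how Examples #2 and #6 call it (source model, target model, decay, with decay=0 on a fresh start and a reverse_decay pass back into the training generator), a plausible reading is an in-place exponential moving average over the parameters. A minimal sketch under that assumption:

import torch


def apply_decay_parameters(src_model, dst_model, decay=0.999):
    # Hypothetical sketch of Generator.apply_decay_parameters as an in-place EMA:
    # dst <- decay * dst + (1 - decay) * src, so decay=0 amounts to a full copy.
    # The real method may also synchronize buffers; this is an assumption, not the project's code.
    with torch.no_grad():
        for dst_param, src_param in zip(dst_model.parameters(),
                                        src_model.parameters()):
            dst_param.mul_(decay).add_(src_param, alpha=1.0 - decay)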
Example #7
    def save_model(self):
        BaseLayer.save_model(
            self.opt.model_path,
            (self.generator, self.generator_predict, self.discriminator))
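BaseLayer.save_model and the BaseLayer.restore_model used in Examples #2, #3 and #5 are not shown. A minimal sketch, assuming they simply serialize the model tuple with torch.save / torch.load and return None when no checkpoint exists (which would match the `if models is not None` checks elsewhere):

import os

import torch


def save_model(model_path, models):
    # Hypothetical sketch: persist the (generator, generator_predict, discriminator) tuple.
    dirname = os.path.dirname(model_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    torch.save(models, model_path)


def restore_model(model_path):
    # Hypothetical sketch: return the saved tuple, or None when there is no checkpoint yet.
    if not os.path.isfile(model_path):
        return None
    return torch.load(model_path)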
Example #8
    def train_discriminator(self, current_loop_num):
        BaseLayer.set_model_parameter_requires_grad_all(self.generator, False)
        BaseLayer.set_model_parameter_requires_grad_all(
            self.discriminator, True)

        # train discriminator
        for index in range(0, self.opt.discriminator_train_num):
            data_iterator = iter(self.dataloader)
            imgs = next(data_iterator)
            # imgs = TranformDynamicRange.fade_lod(x=imgs, lod=0.0)
            # imgs = TranformDynamicRange.upscale_lod(x=imgs, lod=0.0)
            real_imgs = Variable(imgs.type(self.Tensor), requires_grad=False)

            z = self.Tensor(
                np.random.normal(loc=0,
                                 scale=1,
                                 size=(self.opt.batch_size,
                                       self.opt.latent_dim)))
            fake_imgs, fake_dlatents_out = self.generator(z)

            real_validity = self.discriminator(real_imgs)
            prob_real = torch.sigmoid(real_validity).mean()
            TensorboardLogger.write_scalar('prob_real/discriminator',
                                           prob_real)
            # print('{} prob_real(discriminator): {}'.format(index, prob_real))

            fake_validity = self.discriminator(fake_imgs)
            prob_fake = torch.sigmoid(fake_validity).mean()
            TensorboardLogger.write_scalar('prob_fake/discriminator',
                                           prob_fake)
            # print('{} prob_fake(discriminator): {}'.format(index, prob_fake))

            d_loss = self.discriminator_loss(fake_validity, real_validity)
            self.optimizer_d.zero_grad()
            d_loss.backward()
            self.optimizer_d.step()

        run_d_reg = current_loop_num % self.opt.d_reg_interval == 0
        if run_d_reg:
            d_reg_maxcount = min(4, self.opt.discriminator_train_num)
            for index in range(0, d_reg_maxcount):
                # R1 regularization for the discriminator
                # z = self.Tensor(np.random.normal(loc=0, scale=1, size=(self.opt.batch_size, self.opt.latent_dim)))
                # fake_imgs, fake_dlatents_out = self.generator(z)
                # fake_validity = self.discriminator(fake_imgs)

                real_imgs.requires_grad = True
                real_validity = self.discriminator(real_imgs)

                d_reg = self.discriminator_loss_r1(real_validity, real_imgs)
                self.optimizer_d.zero_grad()
                d_reg.backward()
                self.optimizer_d.step()
            TensorboardLogger.writer.add_scalar(
                '{}/reg/d_reg'.format(TensorboardLogger.now), d_reg,
                TensorboardLogger.global_step)

        if current_loop_num % self.opt.save_metrics_interval == 0:
            TensorboardLogger.write_scalar('score/d_score',
                                           real_validity.mean())
            TensorboardLogger.write_scalar('loss/d_loss', d_loss)
            TensorboardLogger.write_histogram('real_imgs', real_imgs)

        return d_loss
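DiscriminatorLossR1 is also not part of these snippets. Its call pattern (real_validity, real_imgs, with reg_interval passed at construction and real_imgs requiring gradients) matches the lazily applied R1 gradient penalty from StyleGAN2, gamma/2 * E[||grad_x D(x)||^2] scaled by the regularization interval. A sketch under that assumption (gamma=10 is a guessed default, not taken from the code):

import torch


class DiscriminatorLossR1(torch.nn.Module):
    # Hypothetical sketch of a lazy R1 penalty; the project's own class may differ.
    def __init__(self, reg_interval=16, gamma=10.0):
        super().__init__()
        self.reg_interval = reg_interval
        self.gamma = gamma

    def forward(self, real_validity, real_imgs):
        # Gradient of the real logits w.r.t. the real images (requires real_imgs.requires_grad=True).
        grads = torch.autograd.grad(outputs=real_validity.sum(),
                                    inputs=real_imgs,
                                    create_graph=True)[0]
        penalty = grads.pow(2).sum(dim=[1, 2, 3]).mean()
        # Scale by reg_interval because the penalty only runs every reg_interval steps.
        return 0.5 * self.gamma * penalty * self.reg_interval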
Example #9
def main(id, aggregate_factor, min_pop, bbox, shapefile):
    """ makes predictions is areas where we have no survey.
    Args:
        id (int): the config id
        aggregate_factor (int): aggregate pixels to lower resolution by x much
        min_pop: minimium population in pixel to score
        bbox: bounding box <minlat> <minlon> <maxlat> <maxlon>, if omitted will use boundaries from dataset
        shapefile: aggregate within shapefile's geometires

    Example:
        id, aggregate_factor, min_pop = 3075, 15, 500
    """
    # read the configs for id
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query(
        "select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["base_raster"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), \
                                                   config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get(
        "start"), config["NDs_date"][0].get("end")
    ISO = config["iso3"][0]
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        print('INFO: satellite images from Google and Sentinel-2')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'G':
        print('INFO: only Google satellite images.')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'N':
        print('INFO: no satellite images')

    # ----------------------------------- #
    # WorldPop Raster too granular (lots of images), aggregate #
    # ----------------------------------- #
    if aggregate_factor > 1:
        print(
            'INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    # dataset_df = pd.read_csv(dataset)
    # data_cols = dataset_df.columns.values

    if sum(bbox) != 0:  # dummy bbox
        print("INFO: using AOI from bbox")
        print(sum(bbox))
        # define AOI with manually defined bbox
        minlat, minlon, maxlat, maxlon = bbox[0], bbox[1], bbox[2], bbox[3]
        area = points_to_polygon(minlat=minlat,
                                 minlon=minlon,
                                 maxlat=maxlat,
                                 maxlon=maxlon)
    else:
        print("INFO: using AOI from dataset.")
        # use dataset's extent
        dataset_df = pd.read_csv(dataset)
        minlat, maxlat, minlon, maxlon = boundaries(dataset_df['gpsLatitude'],
                                                    dataset_df['gpsLongitude'])
        area = points_to_polygon(minlat=minlat,
                                 minlon=minlon,
                                 maxlat=maxlat,
                                 maxlon=maxlon)
        del dataset_df

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({
        "driver": "GTiff",
        "height": out_image.shape[1],
        "width": out_image.shape[2],
        "transform": out_transform
    })

    final_raster = "../final_raster.tif"
    print('INFO: Removing tiles with population under {}'.format(
        min_pop))  # only score areas where at least min_pop people live
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = BaseLayer(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    ix = pd.MultiIndex.from_arrays([list_i, list_j, coords_y, coords_x],
                                   names=('i', 'j', "gpsLatitude",
                                          "gpsLongitude"))

    print("Number of clusters: {} ".format(len(ix)))

    pipeline = 'scoring'

    # ------------------------------------------------ #
    # download images from Google and Extract Features #
    # ------------------------------------------------ #
    data = None  # collects the feature table; filled by whichever satellite branches run
    if config['satellite_config'][0].get('satellite_images') in ['Y', 'G']:
        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"

        gimages = GoogleImages(data_path)
        # download the images from the relevant API
        gimages.download(coords_x, coords_y, step=step)
        # extract the features
        features = pd.DataFrame(gimages.featurize(coords_x,
                                                  coords_y,
                                                  step=step),
                                index=ix)
        features.columns = [str(col) + '_Google' for col in features.columns]
        features.to_csv(features_path)
        print('INFO: features extracted.')
        data = features.copy()
    # ------------------------------------------------------------- #
    # download Sentinel images and Extract Features #
    # ------------------------------------------------------------- #
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        from sentinel_images import SentinelImages

        simages = SentinelImages(data_path)
        # download the images from the relevant API
        simages.download(coords_x, coords_y, start_date, end_date)
        print('INFO: scoring ...')
        # extract the features
        print('INFO: extractor instantiated.')
        features = pd.DataFrame(simages.featurize(coords_x, coords_y,
                                                  start_date, end_date),
                                index=ix)

        features.columns = [str(col) + '_Sentinel' for col in features.columns]
        features.to_csv(features_path)

        if data is not None:
            data = data.join(features)
        else:
            data = features.copy()
        print('INFO: features extracted')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, nightlights_date_start, nightlights_date_end)
    features = pd.DataFrame(nlights.featurize(coords_x, coords_y),
                            columns=['nightlights'],
                            index=ix)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'],
                                      5,
                                      labels=False,
                                      duplicates='drop')

    data = data.join(features) if data is not None else features

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(coords_y, coords_x,
                                           osm_gdf["value"])
            data['distance_{}'.format(value)] = [
                np.log(0.0001 + x) for x in dist
            ]

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end,
                   scope)
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(
        coords_x, coords_y)

    # --------------- #
    # add ACLED #
    # --------------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(ISO, nightlights_date_start, nightlights_date_end)
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(coords_x,
                                                         coords_y,
                                                         property=property,
                                                         function='density',
                                                         buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(
        coords_x, coords_y, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(coords_x,
                                                   coords_y,
                                                   function='distance')
    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # --------------- #
    # save features   #
    # --------------- #
    print('INFO: {} columns.'.format(len(data.columns)))
    # features to be used in the linear model
    features_list = list(sorted(data.columns))
    print(features_list)
    data.to_csv("../Data/Features/features_all_id_{}_{}_nonscaled.csv".format(
        id, pipeline))
    # Scale Features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()
                           ) / (data[features_list].max() + 0.001)

    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(
        id, pipeline))

    # ------- #
    # predict #
    # ------- #
    ensemble_pipeline = joblib.load(
        '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data.reset_index(level=[2, 3])
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    results = pd.DataFrame({
        'i': list_i,
        'j': list_j,
        'lat': coords_y,
        'lon': coords_x,
        'yhat': ensemble_predictions
    })
    results.to_csv('../Data/Results/config_{}.csv'.format(id))
    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results,
                 value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results,
                 value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"

        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)
        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
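For reference, a direct call using the docstring's example values could look like the following; how the repository actually invokes main (presumably a CLI wrapper) is not shown in the snippet:

if __name__ == '__main__':
    # Values from the docstring example; a bbox of zeros is treated as "no bbox supplied",
    # and shapefile=None skips the final polygon aggregation step.
    main(3075, 15, 500, bbox=[0, 0, 0, 0], shapefile=None)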