Code Example #1
class ModellerListener(RedisListener):
    def __init__(self, *args, **kwargs):
        super(ModellerListener, self).__init__(*args, **kwargs)
        self.modeller = Modeller()

    def process(self, key, channel, command):
        if key and channel and command:
            try:
                # keys arrive as three '::'-tagged fields joined by '||'
                session, type_val, hash_val = key.split('||')
                session = session.split('::')[1]
                type_val = type_val.split('::')[1]
                hash_val = hash_val.split('::')[1]

                command = command.lower()
                if type_val == 'transport' and command == 'set':
                    try:
                        self.modeller.model(key, session)
                    except Exception as e:
                        print(e)
                else:
                    # ignore the event
                    pass
            except ValueError:
                print("invalid key detected")
Code Example #2
File: window.py  Project: marabyank/Modelling
    def run(self):
        uniform_a = self.spin_a.value()
        uniform_b = self.spin_b.value()
        expo_lambd = self.spin_lambd.value()
        req_count = self.spin_req_count.value()
        reenter_prob = self.spin_reenter_prob.value()
        dt = self.spin_dt.value()
        method = self.comboBox_method.currentIndex()

        print(uniform_a, uniform_b, expo_lambd, req_count, reenter_prob, dt)
        modeller = Modeller(uniform_a, uniform_b, expo_lambd, reenter_prob,
                            req_count, dt)
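        # comboBox index 0 runs the event-based simulation; any other selection runs the time-based one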
        if method == 0:
            self.show_results(modeller.event_based_modelling())
        else:
            self.show_results(modeller.time_based_modelling())
Code Example #3
def update_output(list_of_contents, model_type, n_init, max_iter, n_clusters,
                  eps, min_samples, column1, column2, list_of_names,
                  list_of_dates):
    if list_of_contents:
        if column1 == column2:
            return html.P("Select different columns!")

        contents = list_of_contents[0]
        content_type, content_string = contents.split(',')
        decoded = base64.b64decode(content_string)
        try:
            df = parse_file(decoded, content_type, list_of_names)
        except Exception as e:
            print(e)
            return html.Div(['There was an error processing this file.'])

        if len(df.columns) > 8:
            return html.P("Dataframe has too many columns!")

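        # clustering hyperparameters forwarded to Modeller; judging by the names alone, n_init/max_iter/n_clusters
        # look like k-means-style settings and eps/min_samples like DBSCAN-style ones (an assumption, not
        # confirmed by this snippet)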
        model = Modeller(
            df, model_type,
            **dict(n_init=n_init,
                   max_iter=max_iter,
                   n_clusters=n_clusters,
                   eps=eps,
                   min_samples=min_samples))
        try:
            labels = model.set_up_model([column1, column2])
        except Exception as e:
            return html.P(str(e))

        if not labels:
            return html.P("Please, fill in all the parameters in green.")
        return dcc.Graph(id="chart",
                         figure=labels,
                         style={
                             'display': 'block',
                             'height': 600,
                             'width': 900,
                             'margin-left': 'auto',
                             'margin-right': 'auto'
                         })
    return html.P("")
Code Example #4
File: test.py  Project: giftig/hackathon
def main():
    data = []

    for f in os.listdir('data/tweets'):
        if not f.endswith('.json'):
            continue

        f = os.path.join('data/tweets', f)
        with open(f) as fp:
            data.append(json.load(fp))

    m = Modeller()
    data = [m.model_tweet(e, ['wendyiscool']) for e in data]

    es = ESClient()
    for e in data:
        if not e:
            continue

        es.insert_entry(e)

    print(json.dumps(data))
Code Example #5
File: main.py  Project: corticalstack/KDDCup2009
def main():
    filehandler = Filehandler()
    modeller = Modeller()

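    # each enabled ProcessDS flag runs process_dataset on one prepared dataset, reusing the same Filehandler and Modeller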
    if envparm['PrepEnabled']:
        logging.info("Executing preprocessor")
        Preprocessor(envparm)

    if envparm['ProcessDS01']:
        process_dataset(filehandler, modeller,
                        filehandler.dataset_prep_path_01)

    if envparm['ProcessDS02']:
        process_dataset(filehandler, modeller,
                        filehandler.dataset_prep_path_02)

    if envparm['ProcessDS03']:
        process_dataset(filehandler, modeller,
                        filehandler.dataset_prep_path_03)

    if modeller.scores:
        modeller.output_scores(filehandler)
Code Example #6
File: master.py  Project: WendyAnthony/HRM
def run(id):
    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["base_raster_aggregation"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster {}'.format(raster))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print("Number of clusters: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # normalize the features

                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', axis=1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')

    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)
    # --------------- #
    # save features   #
    # --------------- #
    # features to be use in the linear model
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Standardize Features (0 mean and 1 std)
    #data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    # if set in the config, take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    X, y = data[features_list + ["gpsLatitude", "gpsLongitude"]], data[indicator]
    modeller = Modeller(X, rs_features=features_list, spatial_features=["gpsLatitude", "gpsLongitude"], scoring='r2', cv_loops=20)

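    # evaluate a kNN model, a Ridge model, and their ensemble, each with repeated cross-validated R2 (cv_loops=20)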
    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", kNN_R2_mean, "kNN_R2_std: ", kNN_R2_std)

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", Ridge_R2_mean, "Ridge_R2_std: ", Ridge_R2_std)

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", Ensemble_R2_mean, "Ensemble_R2_std: ", Ensemble_R2_std)

    # ------------------ #
    # write scores to DB #
    # ------------------ #

    query = """
    insert into results_new (run_date, config_id, r2, r2_sd, r2_knn, r2_sd_knn, r2_features, r2_sd_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0],
        Ensemble_R2_mean, Ensemble_R2_std, kNN_R2_mean, kNN_R2_std, Ridge_R2_mean, Ridge_R2_std, 0)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #

    print('INFO: writing predictions to disk ...')

    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({
        'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # Best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ', Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # Best alpha (Ridge)
    print('INFO: regularization param chosen: ', Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    from sklearn.externals import joblib
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
Code Example #7
File: feature_score.py  Project: WFP-VAM/HRM
def individual_r2(id, indicator, cv):
    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    if indicator is None:
        indicator = config["indicator"][0]
    data = pd.read_csv(dataset)
    data_cols = data.columns.values
    print(data_cols)

    data = pd.read_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id))

    data["noise"] = np.random.normal(0, 1, len(data))

    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    features_list = list(set(data.columns) - set(data_cols) - set(['i', 'j', 'gpsLatitude', 'gpsLongitude', 'cluster', 'n', indicator, "log_{}".format(indicator)]))
    print(features_list)
    nn_features_google = [i for i in features_list if i.endswith('_Google')]
    nn_features_sentinel = [i for i in features_list if i.endswith('_Sentinel')]
    nn_features = nn_features_google + nn_features_sentinel
    print(nn_features)
    no_nn_features = list(set(features_list) - set(nn_features))
    print(no_nn_features)

    # if take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    X = data
    print("indicator: ", indicator)
    y = data[indicator]
    cv_loops = cv
    from modeller import Modeller
    Modeller_all = Modeller(X, rs_features=features_list, spatial_features=["gpsLatitude", "gpsLongitude"], cv_loops=cv_loops)

    kNN_pipeline = Modeller_all.make_model_pipeline('kNN')
    kNN_scores = Modeller_all.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", round(kNN_R2_mean, 2), "kNN_R2_std: ", round(kNN_R2_std, 2))

    Ridge_pipeline = Modeller_all.make_model_pipeline('Ridge')
    Ridge_scores = Modeller_all.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", round(Ridge_R2_mean, 2), "Ridge_R2_std: ", round(Ridge_R2_std, 2))

    Ensemble_pipeline = Modeller_all.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = Modeller_all.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", round(Ensemble_R2_mean, 2), "Ensemble_R2_std: ", round(Ensemble_R2_std, 2))

    Modeller_google = Modeller(X, rs_features=nn_features_google, spatial_features=["gpsLatitude", "gpsLongitude"], cv_loops=cv_loops)
    Ridge_pipeline = Modeller_google.make_model_pipeline('Ridge')
    Ridge_scores_google = Modeller_google.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_google = Ridge_scores_google.mean()
    Ridge_R2_std_google = Ridge_scores_google.std()
    print("Ridge_R2_google_mean: ", round(Ridge_R2_mean_google, 2), "Ridge_R2_google_std: ", round(Ridge_R2_std_google, 2))

    Modeller_sentinel = Modeller(X, rs_features=nn_features_sentinel, spatial_features=["gpsLatitude", "gpsLongitude"], cv_loops=cv_loops)
    Ridge_pipeline = Modeller_sentinel.make_model_pipeline('Ridge')
    Ridge_scores_sentinel = Modeller_sentinel.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_sentinel = Ridge_scores_sentinel.mean()
    Ridge_R2_std_sentinel = Ridge_scores_sentinel.std()
    print("Ridge_R2_sentinel_mean: ", round(Ridge_R2_mean_sentinel, 2), "Ridge_R2_sentinel_std: ", round(Ridge_R2_std_sentinel, 2))

    Modeller_nn = Modeller(X, rs_features=nn_features, spatial_features=["gpsLatitude", "gpsLongitude"], cv_loops=cv_loops)
    Ridge_pipeline = Modeller_nn.make_model_pipeline('Ridge')
    Ridge_scores_nn = Modeller_nn.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_nn = Ridge_scores_nn.mean()
    Ridge_R2_std_nn = Ridge_scores_nn.std()
    print("Ridge_R2_nn_mean: ", round(Ridge_R2_mean_nn, 2), "Ridge_R2_nn_std: ", round(Ridge_R2_std_nn, 2))

    Modeller_no_nn = Modeller(X, rs_features=no_nn_features, spatial_features=["gpsLatitude", "gpsLongitude"], cv_loops=cv_loops)
    Ridge_pipeline = Modeller_no_nn.make_model_pipeline('Ridge')
    Ridge_scores_no_nn = Modeller_no_nn.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean_no_nn = Ridge_scores_no_nn.mean()
    Ridge_R2_std_no_nn = Ridge_scores_no_nn.std()
    print("Ridge_R2_no_nn_mean: ", round(Ridge_R2_mean_no_nn, 2), "Ridge_R2_no_nn_std: ", round(Ridge_R2_std_no_nn, 2))

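    # per-feature diagnostics: R2 of each feature alone, and the R2 lost when that feature is removed from the full set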
    for feature in features_list:
        Modeller_feature = Modeller(X, rs_features=feature, cv_loops=cv_loops)
        Ridge_pipeline = Modeller_feature.make_model_pipeline('Ridge')
        Ridge_scores_feature = Modeller_feature.compute_scores(Ridge_pipeline, y)
        Ridge_R2_mean_feature = Ridge_scores_feature.mean()
        Ridge_R2_std_feature = Ridge_scores_feature.std()

        all_but_feature = list(set(features_list) - set([feature]))
        Modeller_all_but_feature = Modeller(X, rs_features=all_but_feature, cv_loops=cv_loops)
        Ridge_pipeline2 = Modeller_all_but_feature.make_model_pipeline('Ridge')
        Ridge_scores_all_but_feature = Modeller_all_but_feature.compute_scores(Ridge_pipeline2, y)
        Ridge_R2_mean_all_but_feature = Ridge_scores_all_but_feature.mean()
        print(feature, " R2_mean: ", round(Ridge_R2_mean_feature, 2), " R2_mean_added_value: ", round(Ridge_R2_mean - Ridge_R2_mean_all_but_feature, 2), "R2_std: ", round(Ridge_R2_std_feature, 2))
Code Example #8
def run(file, id=None):
    # ----------------- #
    # SETUP #############
    # ----------------- #
    if id is None:
        config = get_config_file(file)
    else:
        config, engine = get_config_db(id)

    for d in [
            '../Data/Features', '../Data/Geofiles/OSM',
            '../Data/Geofiles/nightlights', '../Data/Results'
    ]:
        if not os.path.exists(d):
            os.makedirs(d)

    # --------------------- #
    # Setting up playground #
    # --------------------- #
    assert (
        os.path.exists(config['dataset_filename'])
    ), "oops, dataset specified not found: " + config['dataset_filename']
    data = pd.read_csv(config['dataset_filename'])
    print(str(np.datetime64('now')), 'INFO: original dataset length: ',
          data.shape[0])
    data['gpsLongitude'] = np.round(data['gpsLongitude'], 5)
    data['gpsLatitude'] = np.round(data['gpsLatitude'], 5)

    # avoid duplicates
    data = data[['gpsLongitude', 'gpsLatitude',
                 config['indicator']]].groupby(['gpsLongitude',
                                                'gpsLatitude']).mean()

    # base layer
    assert (os.path.exists(config['base_raster'])
            ), "oops, raster specified not found: " + config['base_raster']
    GRID = BaseLayer(config['base_raster'],
                     data.index.get_level_values('gpsLongitude'),
                     data.index.get_level_values('gpsLatitude'))
    # TODO: we should enforce the most accurate i and j when training, i.e. aggregate = 1?

    # Get Polygon Geojson of the boundaries
    # TODO: maybe go into BaseLayer class?
    minlat, maxlat, minlon, maxlon = boundaries(GRID.lat,
                                                GRID.lon,
                                                buffer=0.05)
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print(str(np.datetime64('now')),
          "INFO: Number of clusters: {} ".format(len(data)))

    pipeline = 'evaluation'

    # ------------------------------- #
    # get features from Google images #
    # ------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'G']:
        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        from google_images import GoogleImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path.format(id, pipeline),
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            gimages = GoogleImages(data_path)
            # download the images from the relevant API
            gimages.download(GRID.lon,
                             GRID.lat,
                             step=config['satellite_config']['satellite_step'])
            # extract the features
            features = pd.DataFrame(gimages.featurize(
                GRID.lon,
                GRID.lat,
                step=config['satellite_config']['satellite_step']),
                                    index=data.index)

            features.columns = [
                str(col) + '_Google' for col in features.columns
            ]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Google satellite images')

    # --------------------------------- #
    # get features from Sentinel images #
    # --------------------------------- #
    if config['satellite_config']['satellite_images'] in ['Y', 'S']:
        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"]["start_date"]
        end_date = config["satellite_config"]["end_date"]

        from sentinel_images import SentinelImages

        if os.path.exists(features_path):
            print('INFO: already scored.')
            features = pd.read_csv(features_path.format(id, pipeline),
                                   index_col=['gpsLongitude', 'gpsLatitude'],
                                   float_precision='round_trip')
        else:
            simages = SentinelImages(data_path)
            # download the images from the relevant API
            simages.download(GRID.lon, GRID.lat, start_date, end_date)
            print('INFO: scoring ...')
            # extract the features
            print('INFO: extractor instantiated.')
            features = pd.DataFrame(simages.featurize(GRID.lon, GRID.lat,
                                                      start_date, end_date),
                                    index=data.index)

            features.columns = [
                str(col) + '_Sentinel' for col in features.columns
            ]
            features.to_csv(features_path)

        data = data.join(features)
        print('INFO: features extracted from Sentinel images')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, config['nightlights_date']['start'],
                     config['nightlights_date']['end'])
    features = pd.DataFrame(nlights.featurize(GRID.lon, GRID.lat),
                            columns=['nightlights'],
                            index=data.index)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'],
                                      5,
                                      labels=False,
                                      duplicates='drop')
    data = data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(GRID.lat, GRID.lon,
                                           osm_gdf["value"])
            data['distance_{}'.format(value)] = [
                np.log(0.0001 + x) for x in dist
            ]

    # ---------------- #
    # NDBI, NDVI, NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', config['NDs_date']['start'],
                   config['NDs_date']['end'], config['scope'])
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(
        GRID.lon, GRID.lat)

    # --------------- #
    # add ACLED #
    # --------------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(config['iso3'], config['nightlights_date']['start'],
                   config['nightlights_date']['end'])
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(GRID.lon,
                                                         GRID.lat,
                                                         property=property,
                                                         function='density',
                                                         buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(
        GRID.lon, GRID.lat, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(GRID.lon,
                                                   GRID.lat,
                                                   function='distance')
    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # --------------- #
    # save features   #
    # --------------- #
    # drop columns with only 1 value
    print(
        'INFO: {} columns. Dropping features with unique values (if any) ...'.
        format(len(data.columns)))
    data = data[[col for col in data if not data[col].nunique() == 1]]
    print('INFO: {} columns.'.format(len(data.columns)))
    # features to be use in the linear model
    features_list = list(
        sorted(set(data.columns) - set(['i', 'j', config['indicator']])))

    #Save non-scaled features
    data.to_csv(
        "../Data/Features/features_all_id_{}_evaluation_nonscaled.csv".format(
            config['id']))

    # Scale Features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()
                           ) / (data[features_list].max() + 0.001)
    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(
        config['id']))

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783)  # shuffle data
    scores_dict = {}  # placeholder to save the scores
    from modeller import Modeller
    X, y = data[features_list].reset_index(), data[config['indicator']]
    modeller = Modeller(X,
                        rs_features=features_list,
                        spatial_features=["gpsLatitude", "gpsLongitude"],
                        scoring='r2',
                        cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    scores_dict['kNN_R2_mean'] = round(kNN_scores.mean(), 2)
    scores_dict['kNN_R2_std'] = round(kNN_scores.std(), 2)
    print("kNN_R2_mean: ", scores_dict['kNN_R2_mean'], "kNN_R2_std: ",
          scores_dict['kNN_R2_std'])

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    scores_dict['ridge_R2_mean'] = round(Ridge_scores.mean(), 2)
    scores_dict['ridge_R2_std'] = round(Ridge_scores.std(), 2)
    print("Ridge_R2_mean: ", scores_dict['ridge_R2_mean'], "Ridge_R2_std: ",
          scores_dict['ridge_R2_std'])

    Ensemble_pipeline = modeller.make_ensemble_pipeline(
        [kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    scores_dict['ensemble_R2_mean'] = round(Ensemble_scores.mean(), 2)
    scores_dict['ensemble_R2_std'] = round(Ensemble_scores.std(), 2)
    print("Ensemble_R2_mean: ", scores_dict['ensemble_R2_mean'],
          "Ensemble_R2_std: ", scores_dict['ensemble_R2_std'])

    # save results
    if id is None:
        write_scores_to_file(scores_dict, config['id'])
    else:
        write_scores_to_db(scores_dict, config['id'], engine)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')

    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame(
        {
            'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
            'y': data[config['indicator']].values
        },
        index=data.index)
    results.to_csv('../Data/Results/config_{}.csv'.format(config['id']))

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # Best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ',
          Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # Best alpha (Ridge)
    print('INFO: regularization param chosen: ',
          Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    from sklearn.externals import joblib
    joblib.dump(Ensemble_pipeline,
                '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
Code Example #9
def run(id):
    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["aggregation"][0]

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster ...')
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # to use the centroid from the tile instead
    # coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    #data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # --------------------------- #
    # GROUP CLUSTERS IN SAME TILE #
    # --------------------------- #
    # TODO: looks like shit
    cluster_N = 'n'
    print("Number of clusters: {} ".format(len(data)))

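    # weighted average within each tile, using the cluster-size column 'n' as the weight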
    def wavg(g, df, weight_series):
        w = df.loc[g.index, weight_series]
        return (g * w).sum() / w.sum()

    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)

    try:
        data = data.groupby(["i", "j"]).agg({indicator: fnc, 'gpsLatitude': fnc, 'gpsLongitude': fnc}).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'n', 'gpsLatitude', 'gpsLongitude', indicator]].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # normalize the features

                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', axis=1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            #density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            #data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            #osm_features.append('density_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    # TODO: Use efficiently maxNDBImaxNDVImaxNDWI_sum_todf
    print('INFO: getting NDBI, NDVI, NDWI ...')

    start_date = "2017-01-01"  # TODO: Add to config, be careful no image before 2015
    end_date = "2018-01-01"
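    # extract max NDVI per sub-window between start_date and end_date (the third date_range argument presumably sets the window length)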
    for i in date_range(start_date, end_date, 3):
        print('INFO: getting max NDVI between dates: {}'.format(i))
        gee_ndvi_max_raster = gee_sentinel_raster(i[0], i[1], area, ind="NDVI")
        data["max_NDVI_{}_{}".format(i[0], i[1])] = data.apply(gee_raster_mean, args=(gee_ndvi_max_raster, "gpsLatitude", "gpsLongitude", "NDVI"), axis=1)

    print('INFO: getting max NDBI')
    gee_ndbi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDBI")
    data["max_NDBI"] = data.apply(gee_raster_mean, args=(gee_ndbi_max_raster, "gpsLatitude", "gpsLongitude", "NDBI"), axis=1)

    print('INFO: getting max NDWI')
    gee_ndwi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDWI")
    data["max_NDWI"] = data.apply(gee_raster_mean, args=(gee_ndwi_max_raster, "gpsLatitude", "gpsLongitude", "NDWI"), axis=1)

    # --------------- #
    # save features   #
    # --------------- #

    features_list = list(set(data.columns) - set(data_cols) - set(['i', 'j']))

    # Standardize Features (0 mean and 1 std)
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    data_features = data[features_list]

    # if take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])
    from modeller import Modeller
    md = Modeller(['kNN', 'Kriging', 'RmSense', 'Ensamble'], data_features)
    cv_loops = 20
    md.compute(data[['i', 'j']], data[indicator].values, cv_loops)

    # save model for production
    md.save_models(id)
    print(str(np.datetime64('now')), 'INFO: model saved.')

    # ------------------ #
    # write scores to DB #
    # ------------------ #

    r2, r2_var = np.mean(md.scores['Ensamble']), np.var(md.scores['Ensamble'])
    r2_knn, r2_var_knn = np.mean(md.scores['kNN']), np.var(md.scores['kNN'])
    r2_rmsense, r2_var_rmsense = np.mean(md.scores['RmSense']), np.var(md.scores['RmSense'])
    y_duplicated = np.repeat(data[indicator], cv_loops)
    mape_rmsense = np.mean(np.abs([item for sublist in md.results['RmSense'] for item in sublist] - y_duplicated) / y_duplicated)
    if mape_rmsense == float("inf") or mape_rmsense == float("-inf"):
        mape_rmsense = 0

    query = """
    insert into results_new (run_date, config_id, r2, r2_var, r2_knn, r2_var_knn, r2_features, r2_var_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0],
        r2, r2_var, r2_knn, r2_var_knn, r2_rmsense, r2_var_rmsense, mape_rmsense)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    results = pd.DataFrame({
        #'yhat': [item for sublist in md.results['kNN'] for item in sublist],
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)