Example #1
def compare_models(data, roi_density, radius, center, **gbrt_params):
    """ For a fixed sample density, ROI center, and ROI radius, splits the data
        set into a training and validation set and returns the measures of
        error (normalized rmse and r2) of GBRT, linear regression, and constant
        predictor.

        Args:
            data (pandas.DataFrame): entire data set to use.
            roi_density (float): required sample density in ROI.
            radius (float): ROI radius in km.
            center (tuple): longitude-latitude coordinates of ROI center.
            **gbrt_params: additional keyword arguments passed to `train_gbrt`.

        Returns:
            dict: keys are 'gbrt', 'linear', and 'constant', values are
                  (r2, rmse) pairs as produced by `error_summary`.
        """
    X_train, y_train, X_test, y_test = \
        split_with_circle(data, center, roi_density=roi_density, radius=radius)
    assert not X_test.empty

    X_train = X_train.drop(['lat', 'lon'], axis=1)
    X_test = X_test.drop(['lat', 'lon'], axis=1)

    # consider 3 predictors: GBRT, linear regression, and a constant predictor
    gbrt = train_gbrt(X_train, y_train, **gbrt_params)
    y_gbrt = gbrt.predict(X_test)

    lin_reg = train_linear(X_train, y_train)
    y_lin = lin_reg.predict(X_test)

    y_const = y_train.mean() + np.zeros(len(y_test))
    # error_summary returns (r2, rmse) pairs
    return {'gbrt': error_summary(y_test, y_gbrt),
            'linear':  error_summary(y_test, y_lin),
            'constant': error_summary(y_test, y_const)}
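A minimal usage sketch for the function above (hedged: the ROI parameters are illustrative, `data` stands for the loaded GHF DataFrame, and the extra keyword argument is forwarded to `train_gbrt`):

errors = compare_models(data, roi_density=20, radius=GREENLAND_RADIUS,
                        center=(28.67, 45.5), n_estimators=200)
for model, (r2, rmse) in errors.items():
    print '%s: r2 = %.3f, normalized RMSE = %.3f' % (model, r2, rmse)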
Example #2
def _predict(X_train, y_train, X_test, noise_amp):
    # If noise ~ N(0, s^2), then mean(|noise|) = s * sqrt(2/pi),
    # cf. https://en.wikipedia.org/wiki/Half-normal_distribution
    # To get noise with mean(|noise|) / mean(y) = noise_amp, we need to have
    # noise ~ N(0, s*^2) with s* = mean(y) * noise_amp * sqrt(pi/2).
    noise = np.mean(y_train) * noise_amp * np.sqrt(np.pi / 2) * \
            np.random.randn(len(y_train))
    gbrt = train_gbrt(X_train.drop(['lat', 'lon'], axis=1),
                      y_train + noise)
    lin_reg = train_linear(X_train.drop(['lat', 'lon'], axis=1),
                           y_train + noise)
    gbrt_pred = gbrt.predict(X_test.drop(['lat', 'lon'], axis=1))
    lin_pred = lin_reg.predict(X_test.drop(['lat', 'lon'], axis=1))
    return gbrt_pred, lin_pred
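A quick standalone check of the noise scaling above (a sketch only; `y_train` here is synthetic, not the repository's data):

import numpy as np

np.random.seed(0)
y_train = np.full(100000, 60.0)  # synthetic target with mean 60
noise_amp = 0.1
noise = np.mean(y_train) * noise_amp * np.sqrt(np.pi / 2) * \
        np.random.randn(len(y_train))
# the realized relative noise level should come out close to noise_amp (~0.1)
print np.mean(np.abs(noise)) / np.mean(y_train)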
Example #3
def plot_partial_dependence(X_train, y_train, include_features=None, n_ways=1):
    """ Plots one-way or two-way partial dependencies (cf. Friedman 2001 or
        ESL). If include_features is given, only those features will be
        considered, otherwise all non-categorical features will be included.
    """
    raw_features = list(X_train)
    features, feature_names = [], []
    for i in range(len(raw_features)):
        if raw_features[i] in FEATURE_NAMES: # everything but categoricals
            # indices in feature_names match the column indices of the full
            # training data
            feature_names.append(FEATURE_NAMES[raw_features[i]])
            if include_features is None or raw_features[i] in include_features:
                features.append(i)
        else:
            # never used for plotting since categoricals are excluded, but we
            # keep the entry so indices stay aligned with the training columns
            feature_names.append('Some categorical')
    assert len(feature_names) == len(raw_features)
    sys.stderr.write('Plotting %d-way partial dependence for %d features\n' %
                     (n_ways, len(features)))

    if n_ways == 1:
        target_features = features # one-way pdp
    elif n_ways == 2:
        target_features = list(combinations(features, 2)) # two-way pdp
    else:
        raise Exception('only one-way and two-way partial dependence plots '
                        'allowed, %d given' % int(n_ways))

    reg = train_gbrt(X_train, y_train)
    fig, axs = partial_dependence.plot_partial_dependence(
        reg, X_train, target_features, figsize=(22, 12),
        feature_names=feature_names, n_jobs=3, grid_resolution=50
    )
    for ax in axs:
        ax.yaxis.label.set_size(8)
        ax.grid(True)
        for tick in ax.xaxis.get_major_ticks():
            tick.label.set_fontsize(8)
    fig.tight_layout()
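A hedged call sketch (the column names passed to include_features are hypothetical stand-ins for real training columns, and the title string is illustrative; save_cur_fig is the figure-saving helper used in the other examples):

plot_partial_dependence(X_train, y_train,
                        include_features=['d2ridge', 'age'],  # hypothetical columns
                        n_ways=2)
save_cur_fig('partial_dependence_2way.png', title='Two-way partial dependence')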
Example #4
def plot_feature_importance_analysis(data, roi_density, radius, ncenters,
                                     dumpfile=None, replot=False, **gbrt_params):
    """ Plots feature importance results (cf. Friedman 2001 or ESL) averaged
        over ncenters rounds of cross validation for given ROI training density
        and radius.
    """
    raw_features = list(data)
    for f in ['lat', 'lon', 'GHF']:
        raw_features.pop(raw_features.index(f))

    # a map used to collapse categorical dummies when summing feature
    # importances: keys are indices into `raw_features`, values are indices
    # into `features`.
    decat_by_raw_idx = {}
    features = []
    for idx, f in enumerate(raw_features):
        match = [c for c in CATEGORICAL_FEATURES if c == f[:len(c)]]
        if match:
            assert len(match) == 1
            try:
                i = features.index(match[0])
            except ValueError:
                features.append(match[0])
                i = len(features) - 1
            decat_by_raw_idx[idx] = i
            continue
        features.append(f)
        decat_by_raw_idx[idx] = len(features) - 1

    if replot:
        res = pickle_load(dumpfile)
        gbrt_importances = res['gbrt_importances']
    else:
        # At this point `features` contains the original feature names while
        # `raw_features` contains the categorical dummies. In each round we map
        # feature_importances_, which has the same length as `raw_features`, to
        # importances for the original features by summing the importances of
        # the dummies belonging to each categorical feature.

        centers = [random_prediction_ctr(data, radius, min_density=roi_density) for _ in range(ncenters)]
        gbrt_importances = np.zeros([ncenters, len(features)])
        for center_idx, center in enumerate(centers):
            sys.stderr.write('%d / %d ' % (center_idx + 1, ncenters))
            X_train, y_train, X_test, y_test = \
                split_with_circle(data, center, roi_density=roi_density, radius=radius)
            X_train = X_train.drop(['lat', 'lon'], axis=1)
            X_test = X_test.drop(['lat', 'lon'], axis=1)
            assert not X_test.empty

            gbrt = train_gbrt(X_train, y_train, **gbrt_params)
            raw_importances = gbrt.feature_importances_
            for idx, value in enumerate(raw_importances):
                gbrt_importances[center_idx][decat_by_raw_idx[idx]] += value

        if dumpfile:
            res = {'gbrt_importances': gbrt_importances, 'features': features}
            pickle_dump(dumpfile, res, 'feature importances')

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    means = gbrt_importances.mean(axis=0)
    sds = np.sqrt(gbrt_importances.var(axis=0))
    sort_order = list(np.argsort(means))

    feature_names = [FEATURE_NAMES[features[i]] for i in sort_order]

    means, sds = [means[i] for i in sort_order], [sds[i] for i in sort_order]
    _yrange = [i - 0.4 for i in range(len(features))] # labels in the middle of bars
    # error bars must follow the same sort order as the (sorted) means
    ax.barh(_yrange, means, color='k', ecolor='k', alpha=.3, xerr=sds)
    ax.set_ylim(-1, len(features))
    ax.grid(True)
    ax.set_yticks(range(len(features)))
    ax.set_yticklabels(feature_names, rotation=0, fontsize=10)
    ax.set_title('GBRT feature importances')
    fig.subplots_adjust(left=0.3) # make room for long ytick labels
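A toy sketch of the dummy-collapsing step in isolation (the column names and the CATEGORICAL_FEATURES value below are made up for illustration; the repository's own constants differ):

import numpy as np

CATEGORICAL_FEATURES = ['rock_type']                         # hypothetical
raw_features = ['age', 'rock_type_1', 'rock_type_2', 'thk']  # dummy-encoded columns
features, decat_by_raw_idx = [], {}
for idx, f in enumerate(raw_features):
    match = [c for c in CATEGORICAL_FEATURES if c == f[:len(c)]]
    if match:
        if match[0] not in features:
            features.append(match[0])
        decat_by_raw_idx[idx] = features.index(match[0])
    else:
        features.append(f)
        decat_by_raw_idx[idx] = len(features) - 1

raw_importances = np.array([0.4, 0.1, 0.2, 0.3])
collapsed = np.zeros(len(features))
for idx, value in enumerate(raw_importances):
    collapsed[decat_by_raw_idx[idx]] += value
# features == ['age', 'rock_type', 'thk'], collapsed == [0.4, 0.3, 0.3]:
# the importances of the two dummies are summed into 'rock_type'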
Example #5
def plot_generalization_analysis(data, roi_density, radius, ncenters,
                                 ns_estimators, replot=False, dumpfile=None):
    """ For all given values for n_estimators (number of trees) for GBRT,
        perform cross-validation over ncenters ROIs with given radius and
        sample density. The average training and validation error for each
        number of trees is plotted. This is the standard plot used to detect
        overfitting, defined as the turning point beyond which validation error
        starts increasing while training error is driven down to zero. As
        expected, GBRT does not overfit (validation error plateaus).

        One standard deviation is indicated by a shaded region.
    """
    fig, ax = plt.subplots()

    if replot:
        res = pickle_load(dumpfile)
        roi_density = res['roi_density']
        radius = res['radius']
        ns_estimators = res['ns_estimators']
        train_rmses = res['train_rmses']
        test_rmses = res['test_rmses']
        assert len(train_rmses) == len(test_rmses), \
               'array length (# of centers) should be the same for training and test'
    else:
        sys.stderr.write('=> Experiment: Generalization '
                         '(roi_density: %.2f, radius: %.2f, no. centers: %d, '
                         'no. of n_estimators: %d)\n' %
                         (roi_density, radius, ncenters, len(ns_estimators)))
        centers = [random_prediction_ctr(data, radius, min_density=roi_density)
                   for _ in range(ncenters)]

        train_rmses = np.zeros([ncenters, len(ns_estimators)])
        test_rmses = np.zeros([ncenters, len(ns_estimators)])
        for center_idx, center in enumerate(centers):
            sys.stderr.write('# center %d/%d\n' % (center_idx + 1, ncenters))
            X_train, y_train, X_test, y_test = \
                split_with_circle(data, center, roi_density=roi_density, radius=radius)
            X_train = X_train.drop(['lat', 'lon'], axis=1)
            X_test = X_test.drop(['lat', 'lon'], axis=1)
            assert not X_test.empty

            for n_idx, n in enumerate(ns_estimators):
                sys.stderr.write('  # n_estimators: %d ' % n)
                gbrt = train_gbrt(X_train, y_train, n_estimators=n)
                _, train_rmse = error_summary(y_train, gbrt.predict(X_train))
                _, test_rmse  = error_summary(y_test, gbrt.predict(X_test))
                train_rmses[center_idx][n_idx] = train_rmse
                test_rmses[center_idx][n_idx] = test_rmse

        if dumpfile:
            res = {'roi_density': roi_density,
                   'radius': radius,
                   'ns_estimators': ns_estimators,
                   'train_rmses': train_rmses,
                   'test_rmses': test_rmses}
            pickle_dump(dumpfile, res, comment='generalization errors')

    num_sigma = 1

    mean_rmse = test_rmses.mean(axis=0)
    sd_rmse = np.sqrt(test_rmses.var(axis=0))
    lower_rmse = mean_rmse - num_sigma * sd_rmse
    higher_rmse = mean_rmse + num_sigma * sd_rmse
    ax.plot(ns_estimators, mean_rmse, 'r', marker='o', markersize=3, alpha=.9, label='validation')
    ax.fill_between(ns_estimators, lower_rmse, higher_rmse, facecolor='r', edgecolor='r', alpha=.3)

    mean_rmse = train_rmses.mean(axis=0)
    sd_rmse = np.sqrt(train_rmses.var(axis=0))
    lower_rmse = mean_rmse - num_sigma * sd_rmse
    higher_rmse = mean_rmse + num_sigma * sd_rmse
    ax.plot(ns_estimators, mean_rmse, 'g', marker='o', markersize=3, alpha=.9, label='training')
    ax.fill_between(ns_estimators, lower_rmse, higher_rmse, facecolor='g', edgecolor='g', alpha=.3)

    ax.grid(True)
    ax.set_xlim(ns_estimators[0] - 100, ns_estimators[-1] + 100)
    ax.set_ylim(0, .3)
    ax.set_yticks(np.arange(0, .31, .05))
    ax.set_xlabel('Number of trees')
    ax.set_ylabel('Normalized RMSE')
    ax.legend(prop={'size':12.5})
    fig.tight_layout()
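A hedged call sketch (the n_estimators grid, density, radius, and file names are illustrative, not necessarily the settings behind the published figures):

plot_generalization_analysis(data, roi_density=20, radius=GREENLAND_RADIUS,
                             ncenters=10,
                             ns_estimators=range(200, 2001, 200),
                             dumpfile='generalization.pickle')
save_cur_fig('generalization.png', title='GBRT generalization error')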
Example #6
    test_lons = X_test.lon.values
    test_lats = X_test.lat.values
    X_test = X_test.drop(['lat', 'lon'], axis=1)

    # -------------------- Plot training data  -------------------------
    plot_training_GHF(train_lons, train_lats, y_train)
    save_cur_fig('greenland_training_GHF.png', title='GHF at training set')

    plot_gaussian_prescribed_GHF(train_lons, train_lats, y_train)
    save_cur_fig('greenland_prescribed_GHF.png',
                 title='Points with prescribed GHF \n around GHF measurements (mW m$^{-2}$)')

    # -------------------- Plot predicted results ----------------------
    reg = train_gbrt(X_train, y_train)
    y_pred = reg.predict(X_test)

    plot_prediction_points(test_lons, test_lats, y_pred)
    save_cur_fig('greenland_prediction_points.png',
                 title='GHF predicted for Greenland (mW m$^{-2}$)')

    plot_prediction(test_lons, test_lats, y_pred)
    save_cur_fig('greenland_prediction.png',
                 title='GHF predicted for Greenland (mW m$^{-2}$)')

    lons = np.hstack([train_lons, test_lons])
    lats = np.hstack([train_lats, test_lats])
    ghfs = np.hstack([y_train, y_pred])

    plot_prediction_interpolated(lons, lats, ghfs)
Example #7
# Supplementary Figure 4
# If random_prediction_ctr is used to pick the center, errors may occasionally
# occur when the randomly chosen center does not reach the required density of
# 50; if so, simply re-run.
roi_densities = [50, 0, 20, 10, 5]
#center = random_prediction_ctr(data, GREENLAND_RADIUS)
center = (28.67, 45.5)

plt.clf()

for roi_density in roi_densities:
    print 'center: ', center
    X_train, y_train, X_test, y_test = split_with_circle(
        data, center, roi_density=roi_density, radius=GREENLAND_RADIUS)
    reg = train_gbrt(X_train.drop(['lat', 'lon'], axis=1), y_train)
    y_pred = reg.predict(X_test.drop(['lat', 'lon'], axis=1))

    r2, rmse = error_summary(y_test, y_pred)

    m = Basemap(projection='merc',
                lat_0=center[0],
                lon_0=center[1],
                resolution='l',
                area_thresh=1000.0,
                llcrnrlon=0,
                llcrnrlat=25,
                urcrnrlon=60,
                urcrnrlat=61)

    m.drawlsmask(land_color="#ffffff", ocean_color="#e8f4f8", resolution='l')