def display(set, i=0):
    """
    Show one sample of a data set as a 28 x 28 greyscale image.

    `set[0][i]` is reshaped to 28 x 28 and drawn with nearest-neighbour
    interpolation; the call blocks in `plt.show()`.

    set: indexable whose first element holds the samples (presumably an
         (images, labels) pair -- confirm against the caller).
    i:   index of the sample to draw (default 0).
    """
    pixels = np.reshape(set[0][i], (28, 28))
    axes = plt.figure().add_subplot(1, 1, 1)
    rendered = axes.imshow(pixels, cmap=mpl.cm.Greys)
    # no smoothing: each source pixel is drawn as a flat square
    rendered.set_interpolation('nearest')
    plt.show()
def plot_space_leakage(data, num_samples, normalize=False, features=None,
                       dumpfile=None, replot=False):
    """
    Scatter plots spatial distance vs euclidean distance in feature space for
    specified features. If features is None all columns excluding
    latitude/longitude and GHF are included. Since the total number of pairs
    of points is typically large, pairs are picked by sampling the data set
    randomly (with replacement, so a point may be paired with itself).

    data:        pandas DataFrame with at least 'lat' and 'lon' columns.
    num_samples: number of random pairs of rows to draw.
    normalize:   if True, min-max scale every column except lat/lon to
                 [0, 1] before measuring distances. Scaling is done on a
                 copy; the caller's frame is left untouched.
    features:    column names spanning the feature space, or None for the
                 default described above.
    dumpfile:    if given, sampled distances are saved there via
                 pickle_dump; with replot=True they are read back from it
                 instead of being resampled.
    replot:      if True, skip sampling and plot the distances in dumpfile.
    """
    if replot:
        distances = pickle_load(dumpfile)['distances']
    else:
        distance_features = ['lat', 'lon']
        if normalize:
            # Work on a copy: rescaling must not leak into the caller's frame.
            data = data.copy()
            # normalize all features to [0, 1]
            for f in list(data):
                if f in distance_features:
                    continue
                lowest = data[f].min()
                span = data[f].max() - lowest
                if span != 0:
                    data[f] = (data[f] - lowest) / span
                else:
                    # A constant column would give 0/0 = NaN and poison every
                    # feature distance; it carries no information, so use 0.
                    data[f] = 0.0

        if features is None:
            non_features = distance_features + ['GHF']
            features = [x for x in list(data) if x not in non_features]

        distances = []
        sys.stderr.write('Sampling %d pairs of points: \n' % num_samples)
        for i in range(num_samples):
            if (i+1) % 100 == 0:
                sys.stderr.write('%d...\n' % (i+1))
            p1, p2 = np.random.randint(0, len(data), 2)
            p1, p2 = data.iloc[p1], data.iloc[p2]
            feature_d = np.linalg.norm(p1[features] - p2[features])
            # plain euclidean distance in (lat, lon), not a great-circle distance
            spatial_d = np.linalg.norm([p1['lat'] - p2['lat'],
                                        p1['lon'] - p2['lon']])
            distances.append((spatial_d, feature_d))

        if dumpfile:
            res = {'distances': distances}
            pickle_dump(dumpfile, res, 'space leakage')

    fig = plt.figure(figsize=(8, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter([x[0] for x in distances], [x[1] for x in distances],
               edgecolor=None, facecolor='k', alpha=.5)
    ax.set_xlabel('Distance in latitude-longitude')
    ax.set_ylabel('Distance in feature space')
    ax.grid(True)
    ax.set_title('Opacity of selected features with respect to spatial coordinates')
    fig.tight_layout()
def plot_error_by_density(data, roi_densities, radius, ncenters, region='NA-WE',
                          replot=False, dumpfile=None, **gbrt_params):
    """
    Plot model error against the density of training points in the ROI.

    ncenters random centers are drawn once and reused for every ROI density.
    For each (density, center) pair compare_models is run and its r2 and
    rmse values for the GBRT, linear and constant predictors are recorded.
    The left panel shows normalized RMSE and the right panel r2, both
    averaged over centers; a shaded band marks one standard deviation for
    GBRT and linear regression.

    With replot=True nothing is computed: the results dictionary is read
    from dumpfile. Otherwise, if dumpfile is given, fresh results are saved
    there. Remaining keyword arguments are forwarded to compare_models.
    """
    sys.stderr.write('=> Experiment: Error by Density (region: %s, no. centers: %d, no. densities: %d)\n' %
                     (region, ncenters, len(roi_densities)))
    fig = plt.figure(figsize=(11,5))
    ax_rmse = fig.add_subplot(1, 2, 1)
    ax_r2 = fig.add_subplot(1, 2, 2)

    if replot:
        results = pickle_load(dumpfile)
    else:
        # every center must be able to support the densest ROI requested
        centers = []
        for _ in range(ncenters):
            centers.append(random_prediction_ctr(data, radius, region=region,
                                                 min_density=max(roi_densities)))

        # one (center, density) error matrix per model and per metric
        shape = (ncenters, len(roi_densities))
        results = {
            'ncenters': ncenters,
            'roi_densities': roi_densities,
            'errors': {
                model: {'rmse': np.zeros(shape), 'r2': np.zeros(shape)}
                for model in ('gbrt', 'linear', 'constant')
            },
        }
        for idx_density, roi_density in enumerate(roi_densities):
            for idx_ctr, center in enumerate(centers):
                sys.stderr.write('# density = %.2f, center %d/%d ' %
                                 (roi_density, idx_ctr + 1, ncenters))
                comp = compare_models(data, roi_density, radius, center,
                                      **gbrt_params)
                # comp[model] is indexed as (r2, rmse)
                for model, scores in results['errors'].items():
                    scores['r2'][idx_ctr][idx_density] = comp[model][0]
                    scores['rmse'][idx_ctr][idx_density] = comp[model][1]
        if dumpfile:
            pickle_dump(dumpfile, results, comment='GBRT performance results')

    errors = results['errors']
    roi_densities = results['roi_densities']
    ncenters = results['ncenters']

    num_sigma = 1

    # Models drawn with a +/- num_sigma band: (key, legend label, color, line style)
    banded = [
        ('gbrt', 'GBRT', 'b',
         dict(alpha=.9, lw=1, marker='o', markersize=4, color='b')),
        ('linear', 'linear regression', 'r',
         dict(alpha=.7, lw=1, marker='o', markersize=4, markeredgecolor='r',
              color='r')),
    ]
    for model, label, color, line_kw in banded:
        rmse_scores = errors[model]['rmse']
        rmse_mean = rmse_scores.mean(axis=0)
        rmse_sd = np.sqrt(rmse_scores.var(axis=0))
        ax_rmse.plot(roi_densities, rmse_mean, label=label, **line_kw)
        ax_rmse.fill_between(roi_densities,
                             rmse_mean - num_sigma * rmse_sd,
                             rmse_mean + num_sigma * rmse_sd,
                             facecolor=color, edgecolor=color, alpha=.3)

        r2_scores = errors[model]['r2']
        r2_mean = r2_scores.mean(axis=0)
        r2_sd = np.sqrt(r2_scores.var(axis=0))
        ax_r2.plot(roi_densities, r2_mean, **line_kw)
        ax_r2.fill_between(roi_densities,
                           r2_mean - num_sigma * r2_sd,
                           r2_mean + num_sigma * r2_sd,
                           facecolor=color, edgecolor=color, alpha=.2)

    # Constant predictor: mean only, no band
    baseline_kw = dict(alpha=.7, lw=1, ls='--', marker='o', markersize=4,
                       color='k', markeredgecolor='k')
    ax_rmse.plot(roi_densities, errors['constant']['rmse'].mean(axis=0),
                 label='constant predictor', **baseline_kw)
    ax_r2.plot(roi_densities, errors['constant']['r2'].mean(axis=0),
               **baseline_kw)

    # Style the r2 panel first; the RMSE panel copies its x-limits.
    ax_r2.set_ylabel('$r^2$', fontsize=16)
    ax_r2.set_ylim(-.05, 1)
    ax_r2.set_xlim(min(roi_densities) - 5, max(roi_densities) + 5)
    ax_r2.set_yticks(np.arange(0, 1.01, .1))

    ax_rmse.set_ylabel('Normalized RMSE', fontsize=14)
    ax_rmse.set_ylim(0, .5)
    ax_rmse.set_yticks(np.arange(0, .51, .05))
    ax_rmse.set_xlim(*ax_r2.get_xlim())

    for ax in [ax_rmse, ax_r2]:
        # FIXME force xlims to be the same
        ax.set_xlabel('density of training points in ROI ($10^{-6}$ km $^{-2}$)',
                      fontsize=14)
        ax.grid(True)
    ax_rmse.legend(prop={'size':15}, numpoints=1)
    fig.tight_layout()
def plot_feature_importance_analysis(data, roi_density, radius, ncenters,
                                     dumpfile=None, replot=False, **gbrt_params):
    """
    Plots feature importance results (cf. Friedman 2001 or ESL) averaged over
    ncenters rounds of cross validation for given ROI training density and
    radius.

    Importances of the dummy columns of a categorical feature (columns whose
    name starts with an entry of CATEGORICAL_FEATURES) are summed into a
    single entry for that feature. Bars are sorted by mean importance and
    carry a one standard deviation error bar.

    data:        DataFrame containing 'lat', 'lon', 'GHF' and feature columns.
    roi_density: training density passed to split_with_circle; also the
                 minimum density demanded of each random center.
    radius:      ROI radius passed to split_with_circle.
    ncenters:    number of random centers (rounds) to average over.
    dumpfile:    if given, results are saved there via pickle_dump; with
                 replot=True they are read back instead of being recomputed.
    replot:      if True, skip training and plot the contents of dumpfile.
    gbrt_params: forwarded to train_gbrt.

    Raises ValueError if data lacks any of 'lat', 'lon', 'GHF'.
    """
    raw_features = list(data)
    for f in ['lat', 'lon', 'GHF']:
        raw_features.remove(f)

    # a map to collapse categorical dummies for feature importances. The dict
    # has keys in `raw_features` indices, and values in `features` indices.
    decat_by_raw_idx = {}
    features = []
    for idx, f in enumerate(raw_features):
        match = [c for c in CATEGORICAL_FEATURES if c == f[:len(c)]]
        if match:
            # a dummy column must belong to exactly one categorical feature
            assert len(match) == 1
            try:
                i = features.index(match[0])
            except ValueError:
                features.append(match[0])
                i = len(features) - 1
            decat_by_raw_idx[idx] = i
            continue
        features.append(f)
        decat_by_raw_idx[idx] = len(features) - 1

    if replot:
        res = pickle_load(dumpfile)
        gbrt_importances = res['gbrt_importances']
        # Prefer the feature list saved with the importances: it is the one
        # the columns of gbrt_importances were actually computed for.
        features = res.get('features', features)
    else:
        # at this point features contains original feature names and
        # raw_features contains categorical dummies, in each round we map
        # feature_importances_, which has the same size as raw_features, to
        # feature importances for original features by adding the importances
        # of each categorical dummy.
        centers = [random_prediction_ctr(data, radius, min_density=roi_density)
                   for _ in range(ncenters)]
        gbrt_importances = np.zeros([ncenters, len(features)])
        for center_idx, center in enumerate(centers):
            sys.stderr.write('%d / %d ' % (center_idx + 1, ncenters))
            X_train, y_train, X_test, y_test = \
                split_with_circle(data, center, roi_density=roi_density,
                                  radius=radius)
            X_train = X_train.drop(['lat', 'lon'], axis=1)
            X_test = X_test.drop(['lat', 'lon'], axis=1)
            assert not X_test.empty

            gbrt = train_gbrt(X_train, y_train, **gbrt_params)
            raw_importances = gbrt.feature_importances_
            for idx, value in enumerate(raw_importances):
                gbrt_importances[center_idx][decat_by_raw_idx[idx]] += value

        if dumpfile:
            res = {'gbrt_importances': gbrt_importances, 'features': features}
            pickle_dump(dumpfile, res, 'feature importances')

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    means = gbrt_importances.mean(axis=0)
    sds = gbrt_importances.std(axis=0)
    sort_order = np.argsort(means)

    feature_names = [FEATURE_NAMES[features[i]] for i in sort_order]
    # Means and standard deviations are permuted identically so that each
    # error bar stays attached to its own feature.
    means, sds = means[sort_order], sds[sort_order]

    # Centre bars on the integer tick positions explicitly; relying on the
    # default alignment shifts bars off their labels on matplotlib >= 2.0.
    positions = np.arange(len(features))
    ax.barh(positions, means, align='center', color='k', ecolor='k',
            alpha=.3, xerr=sds)
    ax.set_ylim(-1, len(features))
    ax.grid(True)
    ax.set_yticks(range(len(features)))
    ax.set_yticklabels(feature_names, rotation=0, fontsize=10)
    ax.set_title('GBRT feature importances')
    fig.subplots_adjust(left=0.3)  # leave room for the long feature labels
def plot_sensitivity_analysis(data, roi_density, radius, noise_amps, ncenters,
                              replot=False, dumpfile=None):
    """
    Plot how much predictions move when noise is added to the training GHF.

    For each of ncenters random centers (given radius and density) the
    models are first trained with zero noise, then once per noise
    amplitude. The RMSE between the noise-free and the noisy predictions,
    divided by the mean noise-free prediction, is recorded for GBRT (left
    panel) and linear regression (right panel). The black curve is the
    average over centers with a one standard deviation band. The Greenland
    train/test split is handled as an extra case stored in row 0 and
    overlaid in green.

    With replot=True the recorded values and noise amplitudes are read from
    dumpfile; otherwise, if dumpfile is given, they are saved there.
    """
    fig = plt.figure(figsize=(10, 5))
    ax_gbrt = fig.add_subplot(1, 2, 1)
    ax_lin = fig.add_subplot(1, 2, 2)

    def _without_coords(frame):
        # models are trained and evaluated without the coordinates
        return frame.drop(['lat', 'lon'], axis=1)

    def _predict(X_train, y_train, X_test, noise_amp):
        # If noise ~ N(0, s^2), then mean(|noise|) = s * sqrt(2/pi),
        # cf. https://en.wikipedia.org/wiki/Half-normal_distribution
        # To get noise with mean(|noise|) / mean(y) = noise_amp, we need
        # noise ~ N(0, s*^2) with s* = mean(y) * noise_amp * sqrt(pi/2).
        sigma = np.mean(y_train) * noise_amp * np.sqrt(np.pi/ 2)
        noise = sigma * np.random.randn(len(y_train))
        gbrt = train_gbrt(_without_coords(X_train), y_train + noise)
        lin_reg = train_linear(_without_coords(X_train), y_train + noise)
        gbrt_pred = gbrt.predict(_without_coords(X_test))
        lin_pred = lin_reg.predict(_without_coords(X_test))
        return gbrt_pred, lin_pred

    if replot:
        res = pickle_load(dumpfile)
        rmses_gbrt, rmses_lin = res['rmses_gbrt'], res['rmses_lin']
        noise_amps = res['noise_amps']
    else:
        # slot 0 is the extra "center" standing for Greenland
        centers = [None] + [
            random_prediction_ctr(data, radius, min_density=roi_density)
            for _ in range(ncenters)
        ]
        table_shape = (len(centers), len(noise_amps))
        rmses_gbrt = np.zeros(table_shape)
        rmses_lin = np.zeros(table_shape)
        for idx_ctr, center in enumerate(centers):
            if center is None:
                # Greenland case
                X_train, y_train, X_test = greenland_train_test_sets()
            else:
                X_train, y_train, X_test, _ = \
                    split_with_circle(data, center, roi_density=roi_density,
                                      radius=radius)
            # noise-free reference predictions for this center
            sys.stderr.write('(ctr %d) noise_amp = 0.00 ' % (idx_ctr + 1))
            y0_gbrt, y0_lin = _predict(X_train, y_train, X_test, 0)
            for idx_noise, noise_amp in enumerate(noise_amps):
                sys.stderr.write('(ctr %d) noise_amp = %.2f ' %
                                 (idx_ctr + 1, noise_amp))
                y_gbrt, y_lin = _predict(X_train, y_train, X_test, noise_amp)
                rmse_gbrt = sqrt(mean_squared_error(y0_gbrt, y_gbrt)) / np.mean(y0_gbrt)
                rmse_lin = sqrt(mean_squared_error(y0_lin, y_lin)) / np.mean(y0_lin)
                rmses_gbrt[idx_ctr][idx_noise] = rmse_gbrt
                rmses_lin[idx_ctr][idx_noise] = rmse_lin

        if dumpfile:
            res = {'rmses_lin': rmses_lin,
                   'rmses_gbrt': rmses_gbrt,
                   'noise_amps': noise_amps}
            pickle_dump(dumpfile, res, 'sensitivity analysis')

    # Every curve is anchored at the origin: zero noise, zero perturbation.
    noise_amps = np.append([0], noise_amps)
    num_sigma = 1

    # Average over the random centers (rows 1 and up), linear panel first
    kw = dict(alpha=.6, lw=2, marker='o', color='k', label='global average')
    for ax, rmses in ((ax_lin, rmses_lin), (ax_gbrt, rmses_gbrt)):
        per_center = rmses[1:]
        mean_rmse = per_center.mean(axis=0)
        sd_rmse = np.sqrt(per_center.var(axis=0))
        lower_rmse = np.append([0], mean_rmse - num_sigma * sd_rmse)
        higher_rmse = np.append([0], mean_rmse + num_sigma * sd_rmse)
        ax.plot(noise_amps, np.append([0], mean_rmse), **kw)
        ax.fill_between(noise_amps, lower_rmse, higher_rmse,
                        facecolor='k', edgecolor='k', alpha=.2)

    # Greenland case (row 0)
    kw = dict(color='g', alpha=.5, lw=2.5, marker='o', markeredgewidth=0.0,
              label='Greenland')
    ax_lin.plot(noise_amps, np.append([0], rmses_lin[0]), **kw)
    ax_gbrt.plot(noise_amps, np.append([0], rmses_gbrt[0]), **kw)

    ticks = np.arange(0, .35, .05)
    for ax in [ax_gbrt, ax_lin]:
        ax.set_xlabel('Relative magnitude of noise in training GHF', fontsize=12)
        # superseded by the fixed limits set further down
        ax.set_xlim(0, max(noise_amps) * 1.1)
        ax.set_aspect('equal')
        ax.grid(True)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_xlim(-.025, .325)
        ax.set_ylim(-.025, .325)
        ax.legend(loc=1, fontsize=12)

    ax_gbrt.set_ylabel(r'Normalized RMSE difference in $\widehat{GHF}_{\mathrm{GBRT}}$', fontsize=12)
    ax_lin.set_ylabel(r'Normalized RMSE difference in $\widehat{GHF}_{\mathrm{lin}}$', fontsize=12)
    fig.tight_layout()
def plot_error_by_radius(data, roi_density, radii, ncenters, region='NA-WE',
                         replot=False, dumpfile=None, **gbrt_params):
    """
    Plot model error against the radius of the ROI.

    ncenters random centers are drawn once and reused for every radius. For
    each (radius, center) pair compare_models is run and its r2 and rmse
    values for the GBRT, linear and constant predictors are recorded. The
    left panel shows normalized RMSE and the right panel r2, both averaged
    over centers; a shaded band marks one standard deviation for GBRT and
    linear regression.

    With replot=True nothing is computed: the results dictionary is read
    from dumpfile. Otherwise, if dumpfile is given, fresh results are saved
    there. Remaining keyword arguments are forwarded to compare_models.
    """
    fig = plt.figure(figsize=(11,5))
    ax_rmse = fig.add_subplot(1, 2, 1)
    ax_r2 = fig.add_subplot(1, 2, 2)

    if replot:
        results = pickle_load(dumpfile)
    else:
        # HACK there's no easy way to check if for a given center the
        # demanded density is attainable for circles of all desired radii.
        # Ask for twice the density we need on the largest radius and hope
        # for the best!
        centers = []
        for _ in range(ncenters):
            centers.append(random_prediction_ctr(data, max(radii), region=region,
                                                 min_density=2*roi_density))

        # one error matrix per model and metric, keyed by center number and
        # radius index
        shape = (ncenters, len(radii))
        results = {
            'ncenters': ncenters,
            'radii': radii,
            'errors': {
                model: {'rmse': np.zeros(shape), 'r2': np.zeros(shape)}
                for model in ('gbrt', 'linear', 'constant')
            },
        }
        for idx_radius, radius in enumerate(radii):
            for idx_ctr, center in enumerate(centers):
                sys.stderr.write('# radius = %.0f, center %d/%d ' %
                                 (radius, idx_ctr + 1, ncenters))
                comp = compare_models(data, roi_density, radius, center,
                                      **gbrt_params)
                # comp[model] is indexed as (r2, rmse)
                for model, scores in results['errors'].items():
                    scores['r2'][idx_ctr][idx_radius] = comp[model][0]
                    scores['rmse'][idx_ctr][idx_radius] = comp[model][1]
        if dumpfile:
            pickle_dump(dumpfile, results, comment='GBRT performance results')

    errors = results['errors']
    radii = results['radii']
    ncenters = results['ncenters']

    num_sigma = 1

    def _mean_and_band(scores):
        # mean over centers plus the +/- num_sigma standard deviation envelope
        centre = scores.mean(axis=0)
        sd = np.sqrt(scores.var(axis=0))
        return centre, centre - num_sigma * sd, centre + num_sigma * sd

    # Models drawn with a band: (key, legend label, color, line style)
    banded = [
        ('gbrt', 'GBRT', 'b',
         dict(alpha=.9, lw=1, marker='o', markersize=4, color='b')),
        ('linear', 'linear regression', 'r',
         dict(alpha=.7, lw=1, marker='o', markersize=4, markeredgecolor='r',
              color='r')),
    ]
    for model, label, color, line_kw in banded:
        mean_rmse, lower_rmse, higher_rmse = _mean_and_band(errors[model]['rmse'])
        ax_rmse.plot(radii, mean_rmse, label=label, **line_kw)
        ax_rmse.fill_between(radii, lower_rmse, higher_rmse,
                             facecolor=color, edgecolor=color, alpha=.3)

        mean_r2, lower_r2, higher_r2 = _mean_and_band(errors[model]['r2'])
        ax_r2.plot(radii, mean_r2, **line_kw)
        ax_r2.fill_between(radii, lower_r2, higher_r2,
                           facecolor=color, edgecolor=color, alpha=.2)

    # Constant predictor: mean only, no band
    baseline_kw = dict(alpha=.7, lw=1, ls='--', marker='o', markersize=4,
                       color='k', markeredgecolor='k')
    ax_rmse.plot(radii, errors['constant']['rmse'].mean(axis=0),
                 label='constant predictor', **baseline_kw)
    ax_r2.plot(radii, errors['constant']['r2'].mean(axis=0), **baseline_kw)

    # Style the r2 panel first; the RMSE panel copies its x-limits.
    ax_r2.set_ylabel('$r^2$', fontsize=16)
    ax_r2.set_ylim(-.05, 1)
    ax_r2.set_xlim(min(radii) - 100, max(radii) + 100)
    ax_r2.set_yticks(np.arange(0, 1.01, .1))

    ax_rmse.set_ylabel('Normalized RMSE', fontsize=14)
    ax_rmse.set_ylim(0, .5)
    ax_rmse.set_yticks(np.arange(0, .51, .05))
    ax_rmse.set_xlim(*ax_r2.get_xlim())

    for ax in [ax_rmse, ax_r2]:
        # FIXME force xlims to be the same
        ax.set_xlabel('radius of ROI (km)', fontsize=14)
        ax.grid(True)
    ax_rmse.legend(prop={'size':15}, numpoints=1)
    fig.tight_layout()