def plot_annual_jacket_crossings(df, stations, temp): """ Plot mean annual temperature variations for all stations. Parameters ---------- df : dataframe Data for all stations. stations : list List of stations to include in plot. t_range : list, optional Values to clip temperatures to. Returns ------- f : figure Matplotlib figure. """ days = np.linspace(1, 365, num=365) f, axes = plt.subplots(len(stations), 1, sharex=True, figsize=(6, 2 * len(stations))) for ii, (st, ax) in enumerate(zip(stations, axes)): name = utils.short_name(st) data = utils.single_station_data(df, st) cross, years = utils.annual_jacket_crossing(data, temp) mean = np.nanmean(cross, axis=0) frac_cross = (mean > .5).sum() / float(mean.shape[0]) ax.fill_between(days, np.zeros_like(mean), mean, facecolor='blue', alpha=.5) ax.text(7.5, .65, '{}% of days\nP>.5\n@{} deg.'.format( np.rint(100 * frac_cross).astype(int), temp), bbox={ 'facecolor': 'white', 'alpha': 0.5, 'pad': 5 }) ax.axhline(.5, c='black') ax.set_ylim([0, 1]) ax.set_xlim([0, 366]) ax.set_title(name) ax.set_yticks(np.linspace(0, 1, 5)) ax.set_xticks([79, 172, 265, 344]) ax.set_xticklabels(['March 20', 'June 21', 'Sept. 22', 'Dec. 21']) ax.set_ylabel('P(jacket crossing)') ax.grid() return f
def plot_daily_fluctuations(df, stations): """ Plot daily fluctuations for TMAX and TMIN. Parameters ---------- df : dataframe Data for all stations. stations : list List of stations to include in plot. Returns ------- f : figure Matplotlib figure. """ f, axes = plt.subplots(len(stations), 1, sharex=True, figsize=(5, 2 * len(stations))) for ii, (st, ax) in enumerate(zip(stations, axes)): name = utils.short_name(st) data = utils.single_station_data(df, st) max_data, years = utils.annual_data(data, 'TMAX') max_data -= max_data.mean(axis=0, keepdims=True) hist, bins = np.histogram(max_data.flatten(), bins=60, range=[-30, 30], density=True) ax.step(bins[:-1], hist, 'r', where='mid', label='Daily max') min_data, years = utils.annual_data(data, 'TMIN') min_data -= min_data.mean(axis=0, keepdims=True) hist, bins = np.histogram(min_data.flatten(), bins=60, range=[-30, 30], density=True) ax.step(bins[:-1], hist, 'b', where='mid', label='Daily min') ax.set_title(name) ax.set_ylabel('prob. density') ax.set_ylim([0, .15]) ax.set_yticks(np.arange(0, .16, .05)) ax.grid() axes[0].legend(loc='best', ncol=2) ax.set_xlabel('Deviation from mean daily temperature') return f
def plot_annual_temperature(df, stations, t_range=None): """ Plot mean annual temperature variations for all stations. Parameters ---------- df : dataframe Data for all stations. stations : list List of stations to include in plot. t_range : list, optional Values to clip temperatures to. Returns ------- f : figure Matplotlib figure. """ if t_range is None: t_range = [0, 100] time = np.linspace(1, 365, num=365) f, axes = plt.subplots(len(stations), 1, sharex=True, figsize=(6, 2 * len(stations))) for ii, (st, ax) in enumerate(zip(stations, axes)): name = utils.short_name(st) data = utils.single_station_data(df, st) mean = np.zeros(365) delta = np.zeros(365) for day in range(365): temps = data[['TMIN', 'TMAX']].loc[data['day'] == day + 1] mean[day] = np.nanmean(temps.values) delta[day] = np.nanmean((temps['TMAX'] - temps['TMIN']).values) ax.fill_between(time, mean + delta / 2., mean - delta / 2., facecolor='red', alpha=.5) ax.plot(time, mean, c='black') ax.set_ylim(t_range) ax.set_xlim([0, 366]) ax.set_title(name) #ax.set_xlabel('Day of year') ax.set_xticks([79, 172, 265, 344]) ax.set_xticklabels(['March 20', 'June 21', 'Sept. 22', 'Dec. 21']) ax.grid() ax.set_ylabel('Temp.') return f
def plot_stations_all_time(df, stations, t_range=None): """ Plot all min and max temp data for all stations. Parameters ---------- df : dataframe Data for all stations. stations : list List of stations to include in plot. t_range : list, optional Values to clip temperatures to. Returns ------- f : figure Matplotlib figure. """ if t_range is None: t_range = [-20, 120] f, axes = plt.subplots(len(stations), 1, figsize=(12, 2 * len(stations))) for ii, (st, ax) in enumerate(zip(stations, axes)): name = utils.short_name(st) data = utils.single_station_data(df, st) if ii == 0: legend = True else: legend = False time = matplotlib.dates.date2num(data.index.date) for p, c in zip(['TMIN', 'TMAX'], ['blue', 'red']): ax.plot_date(time, data[p], c=c, fmt='-', alpha=.5) mean = data[p].mean() rolling = data[p].rolling(window=30, min_periods=10, center=True).median() y = mean * np.ones_like(time) ax.plot_date(time, y, c=c, zorder=10, fmt='-') ax.plot_date(time, rolling, c=c, zorder=10, fmt='-') ax.set_ylim(t_range) ax.set_title(name) ax.set_ylabel('Temp.') ax.set_xlabel('Year') return f
def plot_annual_power_spectrum(df, stations): """ Plot annual temperature powerspectrum. Parameters ---------- df : dataframe Data for all stations. stations : list List of stations to include in plot. Returns ------- f : figure Matplotlib figure. """ f, axes = plt.subplots(len(stations), 1, sharex=True, figsize=(6, 2 * len(stations))) for ii, (st, ax) in enumerate(zip(stations, axes)): name = utils.short_name(st) data = utils.single_station_data(df, st) freqs, tmin_power = utils.mean_annual_powerspectrum(data, 'TMIN') freqs, tmax_power = utils.mean_annual_powerspectrum(data, 'TMAX') ax.loglog(freqs, tmin_power, c='blue') ax.loglog(freqs, tmin_power, '.', c='blue', alpha=.5) ax.loglog(freqs, tmax_power, c='red') ax.loglog(freqs, tmax_power, '.', c='red', alpha=.5) ax.set_title(name) ax.set_ylabel('Temp.') ax.axvline(12, c='black', linestyle='--', label='Monthly fluctuations') ax.axvline(52, c='black', label='Weekly fluctuations') ax.set_ylim([1e1, 1e4]) ax.set_xlim([1e0, 2e2]) ax.grid() axes[0].plot(0, 0, 'r-', label='Daily max') axes[0].plot(0, 0, 'b-', label='Daily min') leg = axes[0].legend(loc='best', ncol=2) axes[-1].set_xlabel('Cycles/year') return f
def plot_annual_daily_comparison(df, stations): """ Plot annual vs. daily temperature variations for all stations. Parameters ---------- df : dataframe Data for all stations. stations : list List of stations to include in plot. Returns ------- f : figure Matplotlib figure. """ colors = matplotlib.cm.get_cmap('plasma') colors = [colors(v) for v in np.linspace(0, 1, len(stations))] f, ax = plt.subplots(1) x_max = 0. y_max = 0. x_min = np.inf y_min = np.inf for ii, st in enumerate(stations): name = utils.short_name(st) data = utils.single_station_data(df, st) daily_delta = data['TMAX'] - data['TMIN'] years = sorted(set(data.index.year)) days = data.index.dayofyear if days[0] > 1: years = years[1:] if days[-1] < 365: years = years[:-1] annual_delta = np.zeros(len(years)) for jj, year in enumerate(years): min_t = data['TMIN'].loc[data.index.year == year].min() min_t = min(min_t, data['TMAX'].loc[data.index.year == year].min()) max_t = data['TMIN'].loc[data.index.year == year].max() max_t = max(max_t, data['TMAX'].loc[data.index.year == year].max()) annual_delta[jj] = max_t - min_t e = Ellipse(xy=[annual_delta.mean(), np.nanmean(daily_delta)], height=2 * np.nanstd(daily_delta), width=2 * annual_delta.std()) ax.plot(annual_delta.mean(), np.nanmean(daily_delta), 'o', c=colors[ii]) x_max = max(x_max, annual_delta.mean() + 1.5 * annual_delta.std()) y_max = max(y_max, np.nanmean(daily_delta) + 1.5 * np.nanstd(daily_delta)) x_min = min(x_min, annual_delta.mean() - 1.5 * annual_delta.std()) y_min = min(y_min, np.nanmean(daily_delta) - 1.5 * np.nanstd(daily_delta)) ax.add_artist(e) e.set_facecolor(colors[ii]) e.set_alpha(.5) e.set_clip_box(ax.bbox) ax.plot(0, 0, c=colors[ii], label=name) ax.set_xlim([x_min, x_max]) ax.set_ylim([y_min, y_max]) leg = ax.legend(loc='best', prop={'size': 12}, ncol=2) leg.get_frame().set_alpha(0.5) ax.set_xlabel('Annual temp. swing') ax.set_ylabel('Daily temp. swing') f.set_size_inches(8., 8. * (y_max - y_min) / (x_max - x_min)) plt.grid() return f
pipeline.fit(X_train, y_train) if args.submission: fname_spec = '_submission_' else: # gridsearch? log all results + call out the winner if hasattr(pipeline, 'best_params_'): logr.info('best gridsearch score={}'.format(pipeline.best_score_)) logr.info('best set of pipeline params={}'.format(pipeline.best_params_)) logr.info('now displaying all pipeline param scores...') for params, mean_score, scores in pipeline.grid_scores_: logr.info("{:0.3f} (+/-{:0.03f}) for {}".format(mean_score, scores.std()*2, params)) fname_spec = '_expt_' # build proper file name so we can reference it in the logs model_name = utils.short_name(pipeline) + \ fname_spec + \ datetime.utcnow().strftime('%Y-%m-%d_%H%M%S') logr.info('writing fit {} pipeline to disk as {}'.format(job, model_name)) try: joblib.dump(pipeline, os.path.join('saved_models', model_name) + '.pkl', compress=3) except OverflowError, e: # this is annoying; look into it later logr.warn('joblib write failed with error={}'.format(e)) logr.info('proceeding with predictions without writing model to disk') # do something useful with the fit model if args.submission: # make predictions for a leaderboard submission logr.info('writing predictions to formatted submission file')
def run(args): task = args['model'] submit = args['submission'] # 1.)Load data for training model X_train_full, y_train_full = utils.load_train_data(task) if submit: # making a submission; train on all given data print('fitting models to entire training set') X_train, y_train = X_train_full, y_train_full X_test = utils.load_test_data(task) else: # running an experiment - cross validate with train/test split test_size = args['test_size'] print('fitting models to cv train/test split with train% = {}'.format(1-test_size)) X_train, X_val, y_train, y_val = train_test_split(X_train_full,y_train_full, test_size=test_size, random_state=args['random_state']) # 2.) Get pipeline if task == 'Visit': pipeline_detail = visit[args['expt']] X_train, y_train = utils.sample_negatives(X_train, y_train, 2) if not submit: X_val, y_val = utils.sample_negatives(X_val, y_val, 1) else: pipeline_detail = rating[args['expt']] pipeline = pipeline_detail['pl'] # Fit model to training data print('fitting model to array sizes (xtrain, ytrain)={}'.format([i.shape for i in [X_train, y_train]])) print('fitting experiment pipeline with signature={}'.format(pipeline)) pipeline.fit(X_train, y_train) # 3.) For non-submission experiments, get the best parameters from grid search if submit: fname_spec = '_submission_' else: # log all results + call out the winner if hasattr(pipeline, 'best_params_'): print('best gridsearch score={}'.format(pipeline.best_score_)) print('best set of pipeline params={}'.format(pipeline.best_params_)) print('now displaying all pipeline param scores...') cv_results = pipeline.cv_results_ for params, mean_score, scores in list(zip(cv_results['params'], cv_results['mean_test_score'], cv_results['std_test_score'])): print("{:0.3f} (+/-{:0.03f}) for {}".format(mean_score, scores.std() * 2, params)) fname_spec = '_expt_' model_name = utils.short_name(pipeline) + fname_spec + datetime.utcnow().strftime('%Y-%m-%d_%H%M%S') # 4.) Prepare submission if submit: print('writing predictions to formatted submission file') predictions = pipeline.predict(X_test) if hasattr(pipeline, 'best_params_'): print('predicting test values with best-choice gridsearch params') utils.create_submission(predictions, pipeline_detail['name'], X_test) else: cv = args['k-fold'] print('cross validating model predictions with cv={}'.format(cv)) predictions = cross_val_predict(pipeline, X_val, y_val, cv=cv) # print("cross val prediction", accuracy_score(y_val, predictions)) print("cross val prediction", mean_squared_error(y_val, predictions)) predictions_train = pipeline.predict(X_train) predictions_test = pipeline.predict(X_val) if task == 'Visit': print('obtained train accuracy = {:.2f}, test accuracy = {:.2f} pipeline={} '.format( accuracy_score(y_train, predictions_train), accuracy_score(y_val, predictions_test), pipeline)) print('calculating confusion matrix') try: cf = confusion_matrix(y_val, predictions) print("confusion matrix: ", cf) sb.heatmap(cf) except RuntimeError as e: print('plotting error. matplotlib backend may need to be changed (see readme). error={}'.format(e)) print('plot may still have been saved, and model has already been saved to disk.') else: print('obtained train mse = {:.2f} test mse={}, pipeline={} '.format( mean_squared_error(y_train, predictions_train), mean_squared_error(y_val, predictions_test), pipeline)) if args['cross_val_score']: # this gives a better idea of uncertainty, but it adds 'cv' more print('cross validating model accuracy with cv={}'.format(cv)) scores = cross_val_score(pipeline, X_val, y_val, cv=cv) print('obtained accuracy={:0.2f}% +/- {:0.2f} with cv={}, \ pipeline={} '.format(scores.mean() * 100, scores.std() * 100 * 2, cv, pipeline)) print('completed with pipeline {}'.format(pipeline_detail['name']))