def run_setting(*, setting: Setting, start_date: pd.Timestamp, end_date: pd.Timestamp, vprint=print) -> pd.DataFrame:
    sub = pd.read_csv('../input/sample_submission.csv')
    get_new_data = end_date > pd.to_datetime('2018-03-28')
    test_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=False, get_new_data=get_new_data)
    if setting.lgbm_impute:
        train_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=setting.lgbm_impute,
                                   get_new_data=get_new_data)
    else:
        train_data = test_data.copy()
    impute_methods = ['fwbw_day', 'mean'] if setting.fwbw_mean_impute else []
    scores_df = fit_predict_score(tr=train_data, ts=test_data, sub=sub, city=setting.city, windows=setting.windows,
                                  start_date=start_date, end_date=end_date, impute_methods=impute_methods,
                                  method=setting.method, n_thread=setting.n_thread, vprint=vprint)
    outfile_name = 'rolling_summary_experiment_{}_{}.csv'.format(setting.num, setting.city)
    outfile_path = '../summaries/{}'.format(outfile_name)
    if os.path.exists(outfile_path):
        df = pd.read_csv(outfile_path)
        scores_df = pd.concat([scores_df, df], axis=0)
        scores_df.drop_duplicates(inplace=True)
        scores_df.sort_values(by='date', inplace=True)
    scores_df.to_csv(outfile_path, index=False)
    with open('../summaries/settings.txt', 'a') as f:
        f.write(str(setting) + ' Summary File: ' + outfile_name + '\n')
    vprint(1, '# ---- mean {} '.format(scores_df['smape'].mean()))
    return scores_df
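# Example sweep over one rolling-summary setting. A minimal sketch: the Setting
# fields mirror the attributes read above (num, city, windows, method,
# lgbm_impute, fwbw_mean_impute, n_thread), but the constructor signature and
# the dates are illustrative assumptions. 'golden_8' is a MEDIAN_WINDOWS key
# used elsewhere in this project.
def example_rolling_summary_experiment():
    setting = Setting(num=0, city='bj', windows=MEDIAN_WINDOWS['golden_8'], method='median',
                      lgbm_impute=True, fwbw_mean_impute=True, n_thread=8)
    scores = run_setting(setting=setting,
                         start_date=pd.to_datetime('2018-04-01'),
                         end_date=pd.to_datetime('2018-04-15'))
    print(scores['smape'].mean())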
def predict(*, pred_date: str, bj_windows: str = 'golden_8', ld_windows: str = 'golden_8',
            bj_method: str = 'median', ld_method: str = 'median', bj_lgbm: bool = True, ld_lgbm: bool = True,
            bj_fwbw: bool = True, ld_fwbw: bool = True, n_thread: int = 8, save: bool = True,
            dosubmit: bool = False, suffix: str = 'dummy', verbose: int = 2):
    vprint = get_verbose_print(verbose_level=verbose)
    pred_date = pd.to_datetime(pred_date)
    get_new_data = pred_date > pd.to_datetime('2018-03-28')
    sub = pd.read_csv('../input/sample_submission.csv')
    OUTDIR = '../submission/sub_{}-{}-{}'.format(pred_date.year, pred_date.month, pred_date.day)
    os.system('mkdir -p {}'.format(OUTDIR))
    predict_start_day = pred_date + pd.Timedelta(1, unit='D')
    predict_start = pd.to_datetime(get_date(predict_start_day))
    bj_data = get_city_data(city='bj', vprint=vprint, impute_with_lgbm=bj_lgbm, get_new_data=get_new_data)
    ld_data = get_city_data(city='ld', vprint=vprint, impute_with_lgbm=ld_lgbm, get_new_data=get_new_data)
    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())
    bj_fwbw_impute_methods = ['day', 'mean'] if bj_fwbw else []
    ld_fwbw_impute_methods = ['day', 'mean'] if ld_fwbw else []
    bj_pred = rolling_summary(sub=sub, data=bj_data, predict_start=predict_start,
                              windows=MEDIAN_WINDOWS[bj_windows], n_thread=n_thread, method=bj_method,
                              impute_methods=bj_fwbw_impute_methods, vprint=vprint)
    ld_pred = rolling_summary(sub=sub, data=ld_data, predict_start=predict_start,
                              windows=MEDIAN_WINDOWS[ld_windows], n_thread=n_thread, method=ld_method,
                              impute_methods=ld_fwbw_impute_methods, vprint=vprint)
    submissions = sub.copy()
    bj_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in BEIJING_STATIONS)
    ld_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in LONDON_STATIONS)
    submissions.loc[bj_cond] = bj_pred.loc[bj_cond].values
    submissions.loc[ld_cond] = ld_pred.loc[ld_cond].values
    submissions['PM2.5'] = submissions['PM2.5'].map(lambda x: max(0, x))
    submissions['PM10'] = submissions['PM10'].map(lambda x: max(0, x))
    submissions['O3'] = submissions['O3'].map(lambda x: max(0, x))
    # build the file path unconditionally so that dosubmit works even when save is False
    if not suffix:
        filepath = '{}/model_{}_sub.csv'.format(OUTDIR, 3)
    else:
        filepath = '{}/model_{}_sub_{}.csv'.format(OUTDIR, 3, suffix)
    if save:
        submissions.to_csv(filepath, index=False)
    if dosubmit:
        submit(subfile=filepath,
               description='model_{}_{}'.format(3, str(predict_start).split()[0]),
               filename='model_{}_sub_{}.csv'.format(3, str(predict_start).split()[0]))
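# Example invocation of the rolling-median predictor (model 3). All keywords
# come from the signature above; the date and suffix are illustrative, and
# dosubmit stays False so nothing is uploaded.
def example_predict_model3():
    predict(pred_date='2018-05-01', bj_windows='golden_8', ld_windows='golden_8',
            bj_method='median', ld_method='median', n_thread=8,
            save=True, dosubmit=False, suffix='median_demo', verbose=2)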
def predict(*, pred_date: str, bj_his_length: int, ld_his_length: int, bj_npoints: int, ld_npoints: int,
            bj_scale: float, ld_scale: float, n_thread: int = 8, save: bool = True, dosubmit: bool = False,
            suffix: str = 'dummy', verbose: int = 2):
    vprint = get_verbose_print(verbose_level=verbose)
    pred_date = pd.to_datetime(pred_date)
    get_new_data = pred_date > pd.to_datetime('2018-03-28')
    sub = pd.read_csv('../input/sample_submission.csv')
    OUTDIR = '../submission/sub_{}-{}-{}'.format(pred_date.year, pred_date.month, pred_date.day)
    os.system('mkdir -p {}'.format(OUTDIR))
    predict_start_day = pred_date + pd.Timedelta(1, unit='D')
    predict_start = pd.to_datetime(get_date(predict_start_day))
    bj_data = get_city_data(city='bj', vprint=vprint, impute_with_lgbm=True, partial_data=True,
                            get_new_data=get_new_data)
    ld_data = get_city_data(city='ld', vprint=vprint, impute_with_lgbm=True, partial_data=True,
                            get_new_data=get_new_data)
    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())
    bj_pred = fbprophet(sub=sub, data=bj_data, current_date=predict_start, history_length=bj_his_length,
                        changepoint_scale=bj_scale, num_changepoints=bj_npoints, n_thread=n_thread, vprint=vprint)
    ld_pred = fbprophet(sub=sub, data=ld_data, current_date=predict_start, history_length=ld_his_length,
                        changepoint_scale=ld_scale, num_changepoints=ld_npoints, n_thread=n_thread, vprint=vprint)
    submissions = sub.copy()
    bj_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in BEIJING_STATIONS)
    ld_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in LONDON_STATIONS)
    submissions.loc[bj_cond, ['PM2.5', 'PM10', 'O3']] = bj_pred.loc[bj_cond, ['PM2.5', 'PM10', 'O3']].values
    submissions.loc[ld_cond, ['PM2.5', 'PM10']] = ld_pred.loc[ld_cond, ['PM2.5', 'PM10']].values
    submissions['PM2.5'] = submissions['PM2.5'].map(lambda x: max(0, x))
    submissions['PM10'] = submissions['PM10'].map(lambda x: max(0, x))
    submissions['O3'] = submissions['O3'].map(lambda x: max(0, x))
    submissions = submissions[['test_id', 'PM2.5', 'PM10', 'O3']]
    # build the file path unconditionally so that dosubmit works even when save is False
    if not suffix:
        filepath = '{}/model_{}_sub.csv'.format(OUTDIR, 4)
    else:
        filepath = '{}/model_{}_sub_{}.csv'.format(OUTDIR, 4, suffix)
    if save:
        submissions.to_csv(filepath, index=False)
    if dosubmit:
        submit(subfile=filepath,
               description='model_{}_{}'.format(4, str(predict_start).split()[0]),
               filename='model_{}_sub_{}.csv'.format(4, str(predict_start).split()[0]))
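# Example invocation of the fbprophet predictor (model 4). The history lengths,
# changepoint counts, and changepoint scales below are illustrative assumptions,
# not tuned values from the experiments.
def example_predict_model4():
    predict(pred_date='2018-05-01',
            bj_his_length=360, ld_his_length=420,
            bj_npoints=25, ld_npoints=25,
            bj_scale=0.05, ld_scale=0.05,
            n_thread=8, save=True, dosubmit=False, suffix='prophet_demo')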
def run_setting(*, setting: Setting, start_date: pd.Timestamp, end_date: pd.Timestamp, vprint=print) -> pd.DataFrame:
    sub = pd.read_csv('../input/sample_submission.csv')
    get_new_data = end_date > pd.to_datetime('2018-03-28')
    train_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=True, get_new_data=get_new_data)
    test_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=False, get_new_data=get_new_data)
    scores_df = fit_predict_score(sub=sub, tr=train_data, ts=test_data, start_date=start_date, end_date=end_date,
                                  city=setting.city, history_length=setting.history_length,
                                  changepoint_scale=setting.changepoint_scale,
                                  num_changepoints=setting.num_changepoints,
                                  n_thread=setting.n_thread, vprint=vprint)
    outfile_name = 'fbprophet_experiment_{}_{}.csv'.format(setting.num, setting.city)
    outfile_path = '../summaries/{}'.format(outfile_name)
    if os.path.exists(outfile_path):
        df = pd.read_csv(outfile_path)
        scores_df = pd.concat([scores_df, df], axis=0)
        scores_df.drop_duplicates(inplace=True)
        scores_df.sort_values(by='date', inplace=True)
    scores_df.to_csv(outfile_path, index=False)
    with open('../summaries/fb_settings.txt', 'a') as f:
        f.write(str(setting) + ' Summary File: ' + outfile_name + '\n')
    vprint(1, '# ---- mean {} '.format(scores_df['smape'].mean()))
    return scores_df
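# Example fbprophet experiment sweep. The Setting fields mirror the attributes
# read above (num, city, history_length, changepoint_scale, num_changepoints,
# n_thread); the constructor signature and all values are assumptions.
def example_fbprophet_experiment():
    setting = Setting(num=0, city='ld', history_length=420,
                      changepoint_scale=0.05, num_changepoints=25, n_thread=8)
    run_setting(setting=setting,
                start_date=pd.to_datetime('2018-04-01'),
                end_date=pd.to_datetime('2018-04-15'))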
def find_city(self, pattern, max_results):
    pattern = pattern.decode(self.locale.codeset).lower()
    MODULE.info('pattern: %s' % pattern)
    if not pattern:
        return []
    # for the given pattern, find matching cities
    city_data = util.get_city_data()
    matches = []
    for icity in city_data:
        match = None
        for jlabel in icity.get('label', {}).itervalues():
            label = jlabel.decode(self.locale.codeset).lower()
            if pattern in label:
                # matching score is the overlap of the search pattern and the matched text
                # (as fraction between 0 and 1)
                match_score = len(pattern) / float(len(label))
                if match and match_score < match['match_score']:
                    # just keep the best match of a city
                    continue
                if match_score > 0.1:
                    # found a match with more than 10% overlap :)
                    match = icity.copy()
                    match['match'] = jlabel
                    match['match_score'] = match_score
        if match:
            matches.append(match)
    MODULE.info('Search for pattern "%s" with %s matches' % (pattern, len(matches)))
    if not matches:
        return None
    # add additional score w.r.t. the population size of the city
    # such that the largest city gains an additional 0.6 on top
    max_population = max([imatch['population'] for imatch in matches])
    weighted_inv_max_population = 0.6 / float(max_population)
    for imatch in matches:
        imatch['final_score'] = imatch['match_score'] + weighted_inv_max_population * imatch['population']
    # sort matches...
    matches.sort(key=lambda x: x['final_score'], reverse=True)
    MODULE.info('Top 5 matches: %s' % json.dumps(matches[:5], indent=2))
    matches = matches[:max_results]
    # add additional information about keyboard layout, time zone etc. and
    # get the correct localized labels
    country_data = util.get_country_data()
    for imatch in matches:
        match_country = country_data.get(imatch.get('country'))
        if match_country:
            imatch.update(util.get_random_nameserver(match_country))
            imatch.update(dict(
                default_lang=match_country.get('default_lang'),
                country_label=self._get_localized_label(match_country.get('label', {})),
                label=self._get_localized_label(imatch.get('label')) or imatch.get('match'),
            ))
    return matches
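# Worked example of the scoring above (a standalone sketch, not part of the
# module API): a 6-character pattern matching a 12-character label scores
# 6/12 = 0.5, and the population term then contributes up to 0.6 for the
# largest candidate city, giving 1.1 here.
def _example_city_score(pattern='berlin', label='berlin, city', population=3600000, max_population=3600000):
    match_score = len(pattern) / float(len(label))
    final_score = match_score + (0.6 / float(max_population)) * population
    return final_score  # 0.5 + 0.6 = 1.1 for this example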
def _preload_city_data(self): util.get_city_data() util.get_country_data()
def run_setting(setting, start_date, end_date, verbose_level=2, n_folds=5, skip=1):
    # non-default arguments must precede default ones, so start_date/end_date come first
    num, city, history_length, num_shifts, median_shifts, median_windows, dropout_rate, l2_strength, units, batch_size = setting
    median_windows = MEDIAN_WINDOWS[median_windows]
    vprint = get_verbose_print(verbose_level)
    sub = pd.read_csv('../input/sample_submission.csv')
    get_new_data = end_date > pd.to_datetime('2018-03-28')
    test_data = get_city_data(city=city, vprint=vprint, impute_with_lgbm=False, get_new_data=get_new_data)
    train_data = test_data.copy()  # type: pd.DataFrame
    vprint(2, train_data.head())
    vprint(2, train_data.loc[train_data['stationId'] != 'zhiwuyuan_aq'].tail())
    train_data = impute(train_data, lgbm=True, hour=True, mean=True)
    vprint(2, train_data.head())
    vprint(2, train_data.loc[train_data['stationId'] != 'zhiwuyuan_aq'].tail())
    w_train_data = long_to_wide(train_data)
    current_date = start_date
    scores_df = []
    STATIONS = BEIJING_STATIONS if city == 'bj' else LONDON_STATIONS
    while current_date < end_date:
        vprint(1, "running experiment for {} at {}".format(current_date, datetime.now()))
        train_split_date = current_date - pd.Timedelta(3, unit='D')
        x_train, y_train = wide_make_fw_x_y(
            wdata=w_train_data, ldata=train_data, split_date=train_split_date, history_length=history_length,
            num_shifts=num_shifts, use_medians=True, median_shifts=median_shifts, window_sizes=median_windows,
            for_prediction=False, n_thread=8, vprint=vprint, save_feature=True, use_cache=True,
            window_name=setting.median_windows)
        x_test, _ = wide_make_fw_x_y(
            wdata=w_train_data, ldata=train_data, split_date=current_date, history_length=history_length,
            num_shifts=1, use_medians=True, median_shifts=median_shifts, window_sizes=median_windows,
            for_prediction=False, n_thread=8, vprint=vprint, save_feature=True, use_cache=True,
            window_name=setting.median_windows)
        x_train = x_train.loc[x_train['stationId'].map(lambda x: x.split('#')[0] in [s for s in STATIONS if s != 'zhiwuyuan_aq'])]
        y_train = y_train.loc[y_train['stationId'].map(lambda x: x.split('#')[0] in [s for s in STATIONS if s != 'zhiwuyuan_aq'])]
        x_test = x_test.loc[x_test['stationId'].map(lambda x: x.split('#')[0] in STATIONS)]
        subs = []
        min_valid_smape = []
        groups = x_train['stationId'].map(lambda x: x.split('#')[0])
        group_kfold = GroupKFold(n_splits=n_folds)
        splits = list(group_kfold.split(X=x_train, groups=groups))
        for it, (train_idx, val_idx) in enumerate(splits):
            vprint(2, '# ---- fold {} ----'.format(it + 1))
            model = get_keras_model(input_dim=x_train.shape[1] - 1, dropout_rate=dropout_rate,
                                    l2_strength=l2_strength, units=units)
            if it == 0:
                vprint(1, model.summary())
            history = model.fit(
                x=x_train.iloc[train_idx, 1:].values,
                y=y_train.iloc[train_idx, 1:].values,
                validation_data=(x_train.iloc[val_idx, 1:].values, y_train.iloc[val_idx, 1:].values),
                batch_size=batch_size, epochs=65535, verbose=0,
                callbacks=[
                    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=15, verbose=verbose_level),
                    EarlyStopping(monitor='val_loss', patience=30, verbose=verbose_level),
                    ModelCheckpoint(filepath='./model_checkpoint_{}.hdf5'.format(num), monitor='val_loss',
                                    save_best_only=True, save_weights_only=True, mode='min')
                ])
            min_valid_smape.append(np.min(history.history['val_loss']))
            predictions = model.predict(x_test.iloc[:, 1:].values, verbose=verbose_level)
            predictions = pd.DataFrame(predictions)
            predictions['stationId'] = x_test['stationId'].map(lambda x: x.split('#')[0]).tolist()
            predictions['measure'] = x_test['stationId'].map(lambda x: x.split('#')[1]).tolist()
            vprint(2, '# ---- formatting submission df ----')
            for idx, row in tqdm(predictions.iterrows()):
                values = row[:48].values
                sub.loc[sub.test_id.isin([row['stationId'] + '#' + str(i) for i in range(48)]), row['measure']] = values
            subs.append(sub[SUB_COLS])
        vprint(2, 'mean {}, std {}'.format(np.mean(min_valid_smape), np.std(min_valid_smape)))
        # average the per-fold predictions (loop variable renamed so it no longer shadows `sub`)
        submissions = subs[0]
        for fold_sub in subs[1:]:
            submissions[['PM2.5', 'PM10', 'O3']] += fold_sub[['PM2.5', 'PM10', 'O3']]
        submissions[['PM2.5', 'PM10', 'O3']] /= n_folds
        truth = get_truth(city=city, data=test_data, start_date=current_date + pd.Timedelta(1, unit='D'))
        scores = evaluate(city=city, truth=truth, predictions=submissions)
        # zhiwuyuan_aq is excluded from scoring
        for key in ('zhiwuyuan_aq-O3', 'zhiwuyuan_aq-PM2.5', 'zhiwuyuan_aq-PM10'):
            if key in scores:
                scores[key] = np.nan
        scores['smape'] = pd.Series(scores).mean()
        scores['date'] = get_date(current_date)
        vprint(1, scores['smape'])
        current_date += pd.Timedelta(value=skip, unit='D')
        scores_df.append(scores)
    scores_df = pd.DataFrame(scores_df)
    scores_df = scores_df[['date', 'smape'] + [col for col in scores_df.columns if col not in ['date', 'smape']]]
    outfile_name = 'shortcut_mlp_experiment_{}_{}.csv'.format(num, city)
    outfile_path = '../summaries/{}'.format(outfile_name)
    if os.path.exists(outfile_path):
        df = pd.read_csv(outfile_path)
        scores_df = pd.concat([scores_df, df], axis=0)
        scores_df.drop_duplicates(inplace=True)
        scores_df.sort_values(by='date', inplace=True)
    scores_df.to_csv(outfile_path, index=False)
    with open('../summaries/shortcut_mlp_settings.txt', 'a') as f:
        f.write(str(setting) + ' Summary File: ' + outfile_name + '\n')
    vprint(1, '# ---- mean {} '.format(scores_df['smape'].mean()))
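# Example shortcut-MLP experiment. The tuple order follows the unpacking at the
# top of run_setting; every value below is an illustrative assumption. A Setting
# namedtuple (rather than a plain tuple) is assumed so that the
# setting.median_windows attribute access above also resolves.
def example_shortcut_mlp_experiment():
    setting = Setting(num=0, city='bj', history_length=360, num_shifts=10,
                      median_shifts=5, median_windows='golden_8',
                      dropout_rate=0.5, l2_strength=0.0001,
                      units=(48, 48, 48, 48), batch_size=84)
    run_setting(setting,
                start_date=pd.to_datetime('2018-04-01'),
                end_date=pd.to_datetime('2018-04-10'),
                verbose_level=1, n_folds=5, skip=1)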
def predict(*, pred_date: str, bj_his_length=360, ld_his_length=420, bj_windows='golden_8', ld_windows='fib_8',
            bj_dropout=0.6, ld_dropout=0.2, bj_units=(48, 48, 48, 48), ld_units=(24, 24, 24, 24),
            bj_batchsize=84, ld_batchsize=22, verbose: int = 2, save=True, dosubmit=False, suffix='alt_lgb_split'):
    vprint = get_verbose_print(verbose_level=verbose)
    pred_date = pd.to_datetime(pred_date)
    get_new_data = pred_date > pd.to_datetime('2018-03-28')
    sub = pd.read_csv('../input/sample_submission.csv')
    OUTDIR = '../submission/sub_{}-{}-{}'.format(pred_date.year, pred_date.month, pred_date.day)
    os.system('mkdir -p {}'.format(OUTDIR))
    predict_start_day = pred_date + pd.Timedelta(1, unit='D')
    predict_start = pd.to_datetime(get_date(predict_start_day))
    bj_data = get_city_data(city='bj', vprint=vprint, impute_with_lgbm=False, partial_data=False,
                            get_new_data=get_new_data)
    ld_data = get_city_data(city='ld', vprint=vprint, impute_with_lgbm=False, partial_data=False,
                            get_new_data=get_new_data)
    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())
    bj_data = impute(bj_data, lgbm=True, hour=True, mean=True)
    ld_data = impute(ld_data, lgbm=True, hour=True, mean=True)
    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())
    bj_w_train_data = long_to_wide(bj_data)
    ld_w_train_data = long_to_wide(ld_data)
    train_split_date = pred_date - pd.Timedelta(3, unit='D')
    bj_pred = fit_predict(city='bj', sub=sub, w_train_data=bj_w_train_data, train_data=bj_data,
                          train_split_date=train_split_date, history_length=bj_his_length, pred_date=pred_date,
                          windows=MEDIAN_WINDOWS[bj_windows], dropout_rate=bj_dropout, units=bj_units,
                          batch_size=bj_batchsize, l2_strength=0.0001, n_folds=5, vprint=vprint)
    ld_pred = fit_predict(city='ld', sub=sub, w_train_data=ld_w_train_data, train_data=ld_data,
                          train_split_date=train_split_date, history_length=ld_his_length, pred_date=pred_date,
                          windows=MEDIAN_WINDOWS[ld_windows], dropout_rate=ld_dropout, units=ld_units,
                          batch_size=ld_batchsize, l2_strength=0.0001, n_folds=5, vprint=vprint)
    submissions = sub.copy()
    bj_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in BEIJING_STATIONS)
    ld_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in LONDON_STATIONS)
    submissions.loc[bj_cond, ['PM2.5', 'PM10', 'O3']] = bj_pred.loc[bj_cond, ['PM2.5', 'PM10', 'O3']].values
    submissions.loc[ld_cond, ['PM2.5', 'PM10']] = ld_pred.loc[ld_cond, ['PM2.5', 'PM10']].values
    submissions['PM2.5'] = submissions['PM2.5'].map(lambda x: max(0, x))
    submissions['PM10'] = submissions['PM10'].map(lambda x: max(0, x))
    submissions['O3'] = submissions['O3'].map(lambda x: max(0, x))
    submissions = submissions[['test_id', 'PM2.5', 'PM10', 'O3']]
    # build the file path unconditionally so that dosubmit works even when save is False
    if not suffix:
        filepath = '{}/model_{}_sub.csv'.format(OUTDIR, 6)
    else:
        filepath = '{}/model_{}_sub_{}.csv'.format(OUTDIR, 6, suffix)
    if save:
        submissions.to_csv(filepath, index=False)
    if dosubmit:
        submit(subfile=filepath,
               description='model_{}_{}'.format(6, str(predict_start).split()[0]),
               filename='model_{}_sub_{}.csv'.format(6, str(predict_start).split()[0]))
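# Example invocation of the MLP predictor (model 6), relying on the tuned
# defaults in the signature above; only the date and suffix are supplied, and
# dosubmit is left off so the run stays local.
def example_predict_model6():
    predict(pred_date='2018-05-01', save=True, dosubmit=False, suffix='mlp_demo')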