Example #1
def run_setting(*, setting: Setting, start_date: pd.Timestamp, end_date: pd.Timestamp, vprint=print) -> pd.DataFrame:
    sub = pd.read_csv('../input/sample_submission.csv')
    get_new_data = end_date > pd.to_datetime('2018-03-28')
    test_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=False, get_new_data=get_new_data)
    if setting.lgbm_impute:
        train_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=setting.lgbm_impute,
                                   get_new_data=get_new_data)
    else:
        train_data = test_data.copy()
    impute_methods = ['fwbw_day', 'mean'] if setting.fwbw_mean_impute else []
    scores_df = fit_predict_score(tr=train_data, ts=test_data, sub=sub, city=setting.city, windows=setting.windows,
                                  start_date=start_date, end_date=end_date, impute_methods=impute_methods,
                                  method=setting.method, n_thread=setting.n_thread, vprint=vprint)
    outfile_name = 'rolling_summary_experiment_{}_{}.csv'.format(setting.num, setting.city)
    outfile_path = '../summaries/{}'.format(outfile_name)
    if os.path.exists(outfile_path):
        df = pd.read_csv(outfile_path)
        scores_df = pd.concat([scores_df, df], axis=0)
        scores_df.drop_duplicates(inplace=True)
        scores_df.sort_values(by='date', inplace=True)
    scores_df.to_csv(outfile_path, index=False)
    with open('../summaries/settings.txt', 'a') as f:
        f.write(str(setting) + ' Summary File: ' + outfile_name + '\n')
    vprint(1, '# ---- mean {} '.format(scores_df['smape'].mean()))
    return scores_df
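
A minimal driver sketch for the rolling-median run_setting above. The Setting fields are inferred from the attribute accesses in the function body; the field list, window sizes, and date range here are assumptions, not values from the original project.

# Hypothetical driver; Setting fields inferred from run_setting's attribute
# accesses -- the field order, window sizes, and dates are assumptions.
from collections import namedtuple
import pandas as pd

Setting = namedtuple('Setting', ['num', 'city', 'windows', 'method',
                                 'lgbm_impute', 'fwbw_mean_impute', 'n_thread'])
setting = Setting(num=1, city='bj', windows=[3, 5, 8], method='median',
                  lgbm_impute=True, fwbw_mean_impute=True, n_thread=8)
scores = run_setting(setting=setting,
                     start_date=pd.to_datetime('2018-03-01'),
                     end_date=pd.to_datetime('2018-03-28'))
print(scores['smape'].mean())
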
Example #2
def predict(*, pred_date: str, bj_windows: str='golden_8', ld_windows: str='golden_8', bj_method: str='median',
            ld_method: str='median', bj_lgbm: bool=True, ld_lgbm: bool=True, bj_fwbw: bool=True, ld_fwbw: bool=True,
            n_thread: int=8, save: bool=True, dosubmit: bool=False, suffix: str='dummy', verbose: int=2):
    vprint = get_verbose_print(verbose_level=verbose)
    pred_date = pd.to_datetime(pred_date)
    get_new_data = pred_date > pd.to_datetime('2018-03-28')
    sub = pd.read_csv("../input/sample_submission.csv")
    OUTDIR = '../submission/sub_{}-{}-{}'.format(pred_date.year, pred_date.month, pred_date.day)
    os.makedirs(OUTDIR, exist_ok=True)
    predict_start_day = pred_date + pd.Timedelta(1, unit='D')
    predict_start = pd.to_datetime(get_date(predict_start_day))
    bj_data = get_city_data(city='bj', vprint=vprint, impute_with_lgbm=bj_lgbm, get_new_data=get_new_data)
    ld_data = get_city_data(city='ld', vprint=vprint, impute_with_lgbm=ld_lgbm, get_new_data=get_new_data)
    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())
    bj_fwbw_impute_methods = ['day', 'mean'] if bj_fwbw else []
    ld_fwbw_impute_methods = ['day', 'mean'] if ld_fwbw else []
    bj_pred = rolling_summary(sub=sub, data=bj_data, predict_start=predict_start, windows=MEDIAN_WINDOWS[bj_windows],
                              n_thread=n_thread, method=bj_method, impute_methods=bj_fwbw_impute_methods, vprint=vprint)
    ld_pred = rolling_summary(sub=sub, data=ld_data, predict_start=predict_start, windows=MEDIAN_WINDOWS[ld_windows],
                              n_thread=n_thread, method=ld_method, impute_methods=ld_fwbw_impute_methods, vprint=vprint)
    submissions = sub.copy()
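    # test_id has the form '<stationId>#<hour>'; split on '#' to route each
    # row to its city's predictions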
    bj_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in BEIJING_STATIONS)
    ld_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in LONDON_STATIONS)
    submissions.loc[bj_cond] = bj_pred.loc[bj_cond].values
    submissions.loc[ld_cond] = ld_pred.loc[ld_cond].values
    submissions['PM2.5'] = submissions['PM2.5'].map(lambda x: max(0, x))
    submissions['PM10'] = submissions['PM10'].map(lambda x: max(0, x))
    submissions['O3'] = submissions['O3'].map(lambda x: max(0, x))

    if save:
        if not suffix:
            filepath = '{}/model_{}_sub.csv'.format(OUTDIR, 3)
        else:
            filepath = '{}/model_{}_sub_{}.csv'.format(OUTDIR, 3, suffix)
        submissions.to_csv(filepath, index=False)

        if dosubmit:
            submit(subfile=filepath,
                   description='model_{}_{}'.format(3, str(predict_start).split()[0]),
                   filename='model_{}_sub_{}.csv'.format(3, str(predict_start).split()[0])
            )
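
An illustrative invocation of the median-ensemble predict above; the date and keyword values are assumptions rather than project defaults.

# Illustrative call only -- the argument values are made up.
predict(pred_date='2018-04-30', bj_windows='golden_8', ld_windows='golden_8',
        bj_method='median', ld_method='median', n_thread=8,
        save=True, dosubmit=False, suffix='median_ensemble', verbose=2)
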
Example #3
def predict(*, pred_date: str, bj_his_length: int, ld_his_length: int, bj_npoints: int, ld_npoints: int, bj_scale: float,
            ld_scale: float, n_thread: int=8, save: bool=True, dosubmit: bool=False, suffix: str='dummy',
            verbose: int=2):
    vprint = get_verbose_print(verbose_level=verbose)
    pred_date = pd.to_datetime(pred_date)
    get_new_data = pred_date > pd.to_datetime('2018-03-28')
    sub = pd.read_csv("../input/sample_submission.csv")
    OUTDIR = '../submission/sub_{}-{}-{}'.format(pred_date.year, pred_date.month, pred_date.day)
    os.makedirs(OUTDIR, exist_ok=True)
    predict_start_day = pred_date + pd.Timedelta(1, unit='D')
    predict_start = pd.to_datetime(get_date(predict_start_day))
    bj_data = get_city_data(city='bj', vprint=vprint, impute_with_lgbm=True, partial_data=True, get_new_data=get_new_data)
    ld_data = get_city_data(city='ld', vprint=vprint, impute_with_lgbm=True, partial_data=True, get_new_data=get_new_data)
    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())
    bj_pred = fbprophet(sub=sub, data=bj_data, current_date=predict_start, history_length=bj_his_length,
                        changepoint_scale=bj_scale, num_changepoints=bj_npoints, n_thread=n_thread, vprint=vprint)
    ld_pred = fbprophet(sub=sub, data=ld_data, current_date=predict_start, history_length=ld_his_length,
                        changepoint_scale=ld_scale, num_changepoints=ld_npoints, n_thread=n_thread, vprint=vprint)
    submissions = sub.copy()
    bj_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in BEIJING_STATIONS)
    ld_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in LONDON_STATIONS)
    submissions.loc[bj_cond, ['PM2.5', 'PM10', 'O3']] = bj_pred.loc[bj_cond, ['PM2.5', 'PM10', 'O3']].values
    submissions.loc[ld_cond, ['PM2.5', 'PM10']] = ld_pred.loc[ld_cond, ['PM2.5', 'PM10']].values
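    # forecasts can dip below zero; clamp pollutant concentrations at 0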
    submissions['PM2.5'] = submissions['PM2.5'].map(lambda x: max(0, x))
    submissions['PM10'] = submissions['PM10'].map(lambda x: max(0, x))
    submissions['O3'] = submissions['O3'].map(lambda x: max(0, x))
    submissions = submissions[['test_id', 'PM2.5', 'PM10', 'O3']]
    if save:
        if not suffix:
            filepath = '{}/model_{}_sub.csv'.format(OUTDIR, 4)
        else:
            filepath = '{}/model_{}_sub_{}.csv'.format(OUTDIR, 4, suffix)
        submissions.to_csv(filepath, index=False)

        if dosubmit:
            submit(subfile=filepath,
                   description='model_{}_{}'.format(4, str(predict_start).split()[0]),
                   filename='model_{}_sub_{}.csv'.format(4, str(predict_start).split()[0]))
Example #4
def run_setting(*, setting: Setting, start_date: pd.Timestamp, end_date: pd.Timestamp, vprint=print) -> pd.DataFrame:
    sub = pd.read_csv('../input/sample_submission.csv')
    get_new_data = end_date > pd.to_datetime('2018-03-28')
    train_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=True, get_new_data=get_new_data)
    test_data = get_city_data(city=setting.city, vprint=vprint, impute_with_lgbm=False, get_new_data=get_new_data)
    scores_df = fit_predict_score(
        sub=sub, tr=train_data, ts=test_data, start_date=start_date, end_date=end_date, city=setting.city,
        history_length=setting.history_length, changepoint_scale=setting.changepoint_scale,
        num_changepoints=setting.num_changepoints, n_thread=setting.n_thread, vprint=vprint)
    outfile_name = 'fbprophet_experiment_{}_{}.csv'.format(setting.num, setting.city)
    outfile_path = '../summaries/{}'.format(outfile_name)
    if os.path.exists(outfile_path):
        df = pd.read_csv(outfile_path)
        scores_df = pd.concat([scores_df, df], axis=0)
        scores_df.drop_duplicates(inplace=True)
        scores_df.sort_values(by='date', inplace=True)
    scores_df.to_csv(outfile_path, index=False)
    with open('../summaries/fb_settings.txt', 'a') as f:
        f.write(str(setting) + ' Summary File: ' + outfile_name + '\n')
    vprint(1, '# ---- mean {} '.format(scores_df['smape'].mean()))
    return scores_df
Example #5
    def find_city(self, pattern, max_results):
        pattern = pattern.decode(self.locale.codeset).lower()
        MODULE.info('pattern: %s' % pattern)
        if not pattern:
            return []

        # for the given pattern, find matching cities
        city_data = util.get_city_data()
        matches = []
        for icity in city_data:
            match = None
            for jlabel in icity.get('label', {}).values():
                label = jlabel.decode(self.locale.codeset).lower()
                if pattern in label:
                    # matching score is the overlap of the search pattern and the matched text
                    # (as a fraction between 0 and 1)
                    match_score = len(pattern) / float(len(label))
                    if match and match_score < match['match_score']:
                        # just keep the best match of a city
                        continue
                    if match_score > 0.1:
                        # found a match with more than 10% overlap :)
                        match = icity.copy()
                        match['match'] = jlabel
                        match['match_score'] = match_score
            if match:
                matches.append(match)
        MODULE.info('Search for pattern "%s" with %s matches' %
                    (pattern, len(matches)))
        if not matches:
            return None

        # add an additional score w.r.t. the population size of the city
        # such that the largest city gains an additional 0.6 on top
        max_population = max(imatch['population'] for imatch in matches)
        weighted_inv_max_population = 0.6 / float(max_population)
        for imatch in matches:
            imatch['final_score'] = imatch['match_score'] + weighted_inv_max_population * imatch['population']

        # sort matches...
        matches.sort(key=lambda x: x['final_score'], reverse=True)
        MODULE.info('Top 5 matches: %s' % json.dumps(matches[:5], indent=2))
        matches = matches[:max_results]

        # add additional information about keyboard layout, time zone etc. and
        # get the correct localized labels
        country_data = util.get_country_data()
        for imatch in matches:
            match_country = country_data.get(imatch.get('country'))
            if match_country:
                imatch.update(util.get_random_nameserver(match_country))
                imatch.update(
                    dict(
                        default_lang=match_country.get('default_lang'),
                        country_label=self._get_localized_label(
                            match_country.get('label', {})),
                        label=self._get_localized_label(imatch.get('label'))
                        or imatch.get('match'),
                    ))

        return matches
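
To make find_city's scoring concrete, here is a standalone sketch of the same formula: the pattern/label overlap plus a population bonus scaled so that the largest candidate gains an extra 0.6. The city figures below are invented.

# Standalone illustration of find_city's scoring; the data is made up.
matches = [
    {'label': 'berlin', 'population': 3600000, 'match_score': len('ber') / float(len('berlin'))},
    {'label': 'bern', 'population': 130000, 'match_score': len('ber') / float(len('bern'))},
]
max_population = max(m['population'] for m in matches)
for m in matches:
    m['final_score'] = m['match_score'] + 0.6 * m['population'] / float(max_population)
matches.sort(key=lambda m: m['final_score'], reverse=True)
# berlin: 0.50 overlap + 0.60 population bonus = 1.10
# bern:   0.75 overlap + 0.02 population bonus = 0.77
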
Example #6
    def _preload_city_data(self):
        util.get_city_data()
        util.get_country_data()
Example #7
def run_setting(setting, start_date, end_date, verbose_level=2, n_folds=5, skip=1):
    num, city, history_length, num_shifts, median_shifts, median_windows, dropout_rate, l2_strength, units, batch_size = setting
    window_name = median_windows  # keep the window-set key before it is replaced below
    median_windows = MEDIAN_WINDOWS[median_windows]
    vprint = get_verbose_print(verbose_level)
    sub = pd.read_csv('../input/sample_submission.csv')
    get_new_data = end_date > pd.to_datetime('2018-03-28')
    test_data = get_city_data(city=city, vprint=vprint, impute_with_lgbm=False, get_new_data=get_new_data)
    train_data = test_data.copy()  # type: pd.DataFrame
    vprint(2, train_data.head())
    vprint(2, train_data.loc[train_data['stationId'] != 'zhiwuyuan_aq'].tail())
    train_data = impute(train_data, lgbm=True, hour=True, mean=True)
    vprint(2, train_data.head())
    vprint(2, train_data.loc[train_data['stationId'] != 'zhiwuyuan_aq'].tail())
    w_train_data = long_to_wide(train_data)
    current_date = start_date
    scores_df = []
    STATIONS = BEIJING_STATIONS if city == 'bj' else LONDON_STATIONS
    while current_date < end_date:
        vprint(1, "running experiment for {} at {}".format(current_date, datetime.now()))

        train_split_date = current_date - pd.Timedelta(3, unit='D')

        x_train, y_train = wide_make_fw_x_y(
            wdata=w_train_data, ldata=train_data, split_date=train_split_date, history_length=history_length,
            num_shifts=num_shifts, use_medians=True, median_shifts=median_shifts, window_sizes=median_windows,
            for_prediction=False, n_thread=8, vprint=vprint, save_feature=True, use_cache=True,
            window_name=window_name)

        x_test, _ = wide_make_fw_x_y(
            wdata=w_train_data, ldata=train_data, split_date=current_date, history_length=history_length,
            num_shifts=1, use_medians=True, median_shifts=median_shifts, window_sizes=median_windows,
            for_prediction=False, n_thread=8, vprint=vprint, save_feature=True, use_cache=True,
            window_name=window_name)

        x_train = x_train.loc[x_train['stationId'].map(lambda x: x.split('#')[0] in [s for s in STATIONS if s != 'zhiwuyuan_aq'])]
        y_train = y_train.loc[y_train['stationId'].map(lambda x: x.split('#')[0] in [s for s in STATIONS if s != 'zhiwuyuan_aq'])]
        x_test = x_test.loc[x_test['stationId'].map(lambda x: x.split('#')[0] in STATIONS)]

        subs = []
        min_valid_smape = []

        groups = x_train['stationId'].map(lambda x: x.split('#')[0])
        group_kfold = GroupKFold(n_splits=n_folds)
        splits = list(group_kfold.split(X=x_train, groups=groups))

        for it, (train_idx, val_idx) in enumerate(splits):
            vprint(2, '# ---- fold {} ----'.format(it + 1))
            model = get_keras_model(input_dim=x_train.shape[1] - 1, dropout_rate=dropout_rate, l2_strength=l2_strength, units=units)
            if it == 0:
                vprint(1, model.summary())
            history = model.fit(
                x=x_train.iloc[train_idx, 1:].values,
                y=y_train.iloc[train_idx, 1:].values,
                validation_data=(x_train.iloc[val_idx, 1:].values, y_train.iloc[val_idx, 1:].values),
                batch_size=batch_size,
                epochs=65535,
                verbose=0,
                callbacks=[
                    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=15, verbose=verbose_level),
                    EarlyStopping(monitor='val_loss', patience=30, verbose=verbose_level),
                    ModelCheckpoint(filepath='./model_checkpoint_{}.hdf5'.format(num), monitor='val_loss',
                                    save_best_only=True, save_weights_only=True, mode='min')
                ])
            min_valid_smape.append(np.min(history.history['val_loss']))
            predictions = model.predict(x_test.iloc[:, 1:], verbose=verbose_level)
            predictions = pd.DataFrame(predictions)
            predictions['stationId'] = x_test['stationId'].map(lambda x: x.split('#')[0]).tolist()
            predictions['measure'] = x_test['stationId'].map(lambda x: x.split('#')[1]).tolist()
            vprint(2, '# ---- formatting submission df ----')
            for idx, row in tqdm(predictions.iterrows()):
                values = row[:48].values
                sub.loc[sub.test_id.isin([row['stationId'] + '#' + str(i) for i in range(48)]), row['measure']] = values
            subs.append(sub[SUB_COLS])
        vprint(2, 'mean {}, std {}'.format(np.mean(min_valid_smape), np.std(min_valid_smape)))
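        # average the per-fold predictions (simple bagging across the folds)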
        submissions = subs[0]
        for fold_sub in subs[1:]:
            submissions[['PM2.5', 'PM10', 'O3']] += fold_sub[['PM2.5', 'PM10', 'O3']]
        submissions[['PM2.5', 'PM10', 'O3']] /= n_folds

        truth = get_truth(city=city, data=test_data, start_date=current_date + pd.Timedelta(1, unit='D'))
        scores = evaluate(city=city, truth=truth, predictions=submissions)
        if 'zhiwuyuan_aq-O3' in scores:
            scores['zhiwuyuan_aq-O3'] = np.nan
        if 'zhiwuyuan_aq-PM2.5' in scores:
            scores['zhiwuyuan_aq-PM2.5'] = np.nan
        if 'zhiwuyuan_aq-PM10' in scores:
            scores['zhiwuyuan_aq-PM10'] = np.nan
        scores['smape'] = pd.Series(scores).mean()
        scores['date'] = get_date(current_date)
        vprint(1, scores['smape'])
        current_date += pd.Timedelta(value=skip, unit='D')
        scores_df.append(scores)
    scores_df = pd.DataFrame(scores_df)
    scores_df = scores_df[['date', 'smape'] + [col for col in scores_df.columns if col not in ['date', 'smape']]]
    outfile_name = 'shortcut_mlp_experiment_{}_{}.csv'.format(num, city)
    outfile_path = '../summaries/{}'.format(outfile_name)
    if os.path.exists(outfile_path):
        df = pd.read_csv(outfile_path)
        scores_df = pd.concat([scores_df, df], axis=0)
        scores_df.drop_duplicates(inplace=True)
        scores_df.sort_values(by='date', inplace=True)
    scores_df.to_csv(outfile_path, index=False)
    with open('../summaries/shortcut_mlp_settings.txt', 'a') as f:
        f.write(str(setting) + ' Summary File: ' + outfile_name + '\n')
    vprint(1, '# ---- mean {} '.format(scores_df['smape'].mean()))
    return scores_df
Example #8
def predict(*, pred_date: str, bj_his_length=360, ld_his_length=420, bj_windows='golden_8', ld_windows='fib_8',
            bj_dropout=0.6, ld_dropout=0.2, bj_units=(48, 48, 48, 48), ld_units=(24, 24, 24, 24), bj_batchsize=84,
            ld_batchsize=22, verbose: int=2, save=True, dosubmit=False, suffix='alt_lgb_split'):
    vprint = get_verbose_print(verbose_level=verbose)
    pred_date = pd.to_datetime(pred_date)
    get_new_data = pred_date > pd.to_datetime('2018-03-28')
    sub = pd.read_csv("../input/sample_submission.csv")
    OUTDIR = '../submission/sub_{}-{}-{}'.format(pred_date.year, pred_date.month, pred_date.day)
    os.makedirs(OUTDIR, exist_ok=True)
    predict_start_day = pred_date + pd.Timedelta(1, unit='D')
    predict_start = pd.to_datetime(get_date(predict_start_day))

    bj_data = get_city_data(city='bj', vprint=vprint, impute_with_lgbm=False, partial_data=False, get_new_data=get_new_data)
    ld_data = get_city_data(city='ld', vprint=vprint, impute_with_lgbm=False, partial_data=False, get_new_data=get_new_data)

    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())

    bj_data = impute(bj_data, lgbm=True, hour=True, mean=True)
    ld_data = impute(ld_data, lgbm=True, hour=True, mean=True)

    vprint(2, bj_data.head())
    vprint(2, bj_data.loc[bj_data['stationId'] != 'zhiwuyuan_aq'].tail())
    vprint(2, ld_data.head())
    vprint(2, ld_data.tail())

    bj_w_train_data = long_to_wide(bj_data)
    ld_w_train_data = long_to_wide(ld_data)

    train_split_date = pred_date - pd.Timedelta(3, unit='D')
    bj_pred = fit_predict(city='bj', sub=sub, w_train_data=bj_w_train_data, train_data=bj_data,
                          train_split_date=train_split_date, history_length=bj_his_length, pred_date=pred_date,
                          windows=MEDIAN_WINDOWS[bj_windows], dropout_rate=bj_dropout, units=bj_units,
                          batch_size=bj_batchsize, l2_strength=0.0001, n_folds=5, vprint=vprint
    )

    ld_pred = fit_predict(city='ld', sub=sub, w_train_data=ld_w_train_data, train_data=ld_data,
                          train_split_date=train_split_date, history_length=ld_his_length, pred_date=pred_date,
                          windows=MEDIAN_WINDOWS[ld_windows], dropout_rate=ld_dropout, units=ld_units,
                          batch_size=ld_batchsize, l2_strength=0.0001, n_folds=5, vprint=vprint
    )

    submissions = sub.copy()
    bj_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in BEIJING_STATIONS)
    ld_cond = submissions['test_id'].map(lambda x: x.split('#')[0] in LONDON_STATIONS)
    submissions.loc[bj_cond, ['PM2.5', 'PM10', 'O3']] = bj_pred.loc[bj_cond, ['PM2.5', 'PM10', 'O3']].values
    submissions.loc[ld_cond, ['PM2.5', 'PM10']] = ld_pred.loc[ld_cond, ['PM2.5', 'PM10']].values
    submissions['PM2.5'] = submissions['PM2.5'].map(lambda x: max(0, x))
    submissions['PM10'] = submissions['PM10'].map(lambda x: max(0, x))
    submissions['O3'] = submissions['O3'].map(lambda x: max(0, x))
    submissions = submissions[['test_id', 'PM2.5', 'PM10', 'O3']]

    if save:
        if not suffix:
            filepath = '{}/model_{}_sub.csv'.format(OUTDIR, 6)
        else:
            filepath = '{}/model_{}_sub_{}.csv'.format(OUTDIR, 6, suffix)
        submissions.to_csv(filepath, index=False)

        if dosubmit:
            submit(subfile=filepath,
                   description='model_{}_{}'.format(6, str(predict_start).split()[0]),
                   filename='model_{}_sub_{}.csv'.format(6, str(predict_start).split()[0]))
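
As with the earlier models, an illustrative call for this predict variant; the date and suffix are assumptions.

# Illustrative call only; the argument values are made up.
predict(pred_date='2018-04-30', bj_his_length=360, ld_his_length=420,
        bj_windows='golden_8', ld_windows='fib_8',
        save=True, dosubmit=False, suffix='alt_lgb_split', verbose=2)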