Example #1
0
def main(debug=False, use_pkl=False):
    """Assemble the full feature table and run the XGBoost k-fold pipeline.

    With use_pkl=True the previously cached table is loaded; otherwise all
    data sources are loaded, outer-merged together and cached as pkl.
    debug=True restricts every loader to 10000 rows for a quick run.
    """
    # None means "load everything".
    num_rows = 10000 if debug else None

    if use_pkl:
        df = loadpkl('../output/df.pkl')
    else:
        with timer("train & test"):
            df = train_test(num_rows)

        # (timer label, loader function, merge keys) for each data source.
        merge_steps = [
            ("nightley", nightley, ['datetime', 'park']),
            ("hotlink", hotlink, 'datetime'),
            ("colopl", colopl, ['park', 'year', 'month']),
            ("weather", weather, ['datetime', 'park']),
            ("nied_oyama", nied_oyama, ['datetime', 'park']),
            ("agoop", agoop, ['park', 'year', 'month']),
            ("jorudan", jorudan, ['datetime', 'park']),
        ]
        for label, loader, keys in merge_steps:
            with timer(label):
                df = pd.merge(df, loader(num_rows), on=keys, how='outer')

        with timer("save pkl"):
            save2pkl('../output/df.pkl', df)

    with timer("Run XGBoost with kfold"):
        print("df shape:", df.shape)
        feat_importance = kfold_xgboost(df, num_folds=NUM_FOLDS, stratified=True, debug=debug)
        display_importances(feat_importance ,'../output/xgb_importances.png', '../output/feature_importance_xgb.csv')
Example #2
0
def output(train_df, test_df, models, model_params, feature_importance_df,
           train_preds, test_preds, scores, now, model_name):
    """Persist models, parameters, scores, importances and predictions.

    Saves one pkl per fold model, dumps hyperparameters and per-fold
    validation scores as json, writes the importance plot/csv, then emits
    the submission csv and the out-of-fold prediction csv.
    """
    # CV score = mean of the per-fold validation scores.
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)

    # One pkl per fold model.
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)

    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    # Per-fold validation scores keyed by fold index.
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)

    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
                     '{}/importance.csv'.format(folder_path))

    # NOTE: the section below is competition-specific and must be adjusted
    # for each competition.
    test_df.loc[:, 'target'] = test_preds
    test_df = test_df.reset_index()
    # CONSISTENCY FIX: timestamp format unified with the other output()
    # in this file (was the inconsistent variant '%Y-%m%d-%H%M-%S').
    test_df[['card_id', 'target']].to_csv(
        '{0}/submit_{1:%Y-%m-%d-%H-%M-%S}_{2}.csv'.format(
            folder_path, now, score),
        index=False)

    # Out-of-fold predictions for stacking / error analysis.
    train_df.loc[:, 'OOF_PRED'] = train_preds
    train_df = train_df.reset_index()
    train_df[['card_id',
              'OOF_PRED']].to_csv('{0}/oof.csv'.format(folder_path))
Example #3
0
def main(is_eval=False):
    """Build price-based features from sell_prices.csv and save them as pkl.

    Note: ``is_eval`` is currently unused but kept for interface parity
    with the other feature scripts.
    """
    # load csv
    df = pd.read_csv('../input/sell_prices.csv')

    # release week ref https://www.kaggle.com/kyakovlev/m5-simple-fe
    release_df = (df.groupby(['store_id', 'item_id'])['wm_yr_wk']
                    .agg(['min'])
                    .reset_index())
    release_df.columns = ['store_id', 'item_id', 'release']

    # merge release week
    df = df.merge(release_df, on=['store_id', 'item_id'], how='left')

    # days from release
    df['days_from_release'] = df['wm_yr_wk'] - df['release']

    # basic per-(store, item) price aggregations
    for stat in ('max', 'min', 'std', 'mean'):
        df['price_{}'.format(stat)] = df.groupby(
            ['store_id', 'item_id'])['sell_price'].transform(stat)

    # normalized price
    df['price_norm'] = df['sell_price'] / df['price_max']

    # distinct price points per item / distinct items per price point
    df['price_nunique'] = df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('nunique')
    df['item_nunique'] = df.groupby(
        ['store_id', 'sell_price'])['item_id'].transform('nunique')

    # momentum: ratio of current price to the previous week's price
    prev_price = df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))
    df['price_momentum'] = df['sell_price'] / prev_price

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/sell_prices.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
Example #4
0
def main(num_rows=None):
    """Build profile features (row stats, SVD, NMF, k-means) and save as pkl.

    Note: ``num_rows`` is currently unused but kept for interface parity
    with the other feature scripts.
    """
    # load csv & pkl
    profiles = pd.read_csv('../input/data_set_phase2/profiles.csv')

    # change columns name
    profiles.columns = ['pid']+['profile_{}'.format(i) for i in range(0,66)]

    # feature engineering
    feats = [f for f in profiles.columns.to_list() if f not in ['pid']]

    # Row-wise statistics over the 66 profile flags.
    # BUGFIX: sum and mean were swapped -- 'profile_sum' held the row mean
    # and 'profile_mean' held the row sum.
    profiles['profile_sum'] = profiles[feats].sum(axis=1)
    profiles['profile_mean'] = profiles[feats].mean(axis=1)
    profiles['profile_std'] = profiles[feats].std(axis=1)

    # Frequency encoding of the row sum.
    profiles['profile_sum_count'] = profiles['profile_sum'].map(profiles['profile_sum'].value_counts())

    # svd features
    svd = TruncatedSVD(n_components=20, n_iter=20, random_state=326)
    svd_x = svd.fit_transform(profiles[feats].values)
    svd_x = pd.DataFrame(svd_x)
    svd_x.columns = ['profile_svd_{}'.format(i) for i in range(20)]
    svd_x['pid'] = profiles['pid']

    # merge
    profiles = profiles.merge(svd_x, on='pid', how='left')

    # NMF features
    nmf = NMF(n_components=20, init='random', random_state=326)
    nmf_x = nmf.fit_transform(profiles[feats].values)
    nmf_x = pd.DataFrame(nmf_x)
    nmf_x.columns = ['profile_nmf_{}'.format(i) for i in range(20)]
    nmf_x['pid'] = profiles['pid']

    # merge
    profiles = profiles.merge(nmf_x, on='pid', how='left')

    # k-means clustering
    kmeans_model = KMeans(n_clusters=10, random_state=326)
    kmeans_model.fit(profiles[feats].values)
    profiles['profile_k_means'] = kmeans_model.labels_

    # save as pkl
    save2pkl('../features/profiles.pkl', profiles)

    line_notify('{} finished.'.format(sys.argv[0]))
Example #5
0
def output(train_df, test_df, models, model_params, feature_importance_df,
           train_preds, test_preds, scores, now, model_name):
    """Persist models, params, scores and predictions, then submit via API.

    NOTE(review): relies on module-level names `test_preds_bin`, `is_debug`,
    `competition_name` and `submit` that are not parameters -- confirm they
    are defined in this module.
    """
    # CV score = mean of the per-fold validation scores.
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)
    # One pkl per fold model.
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)
    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    # Per-fold validation scores keyed by fold index.
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)
    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
                     '{}/importance.csv'.format(folder_path))

    # The section below is competition-specific and must be adjusted for
    # each competition.
    submission_file_name = '{0}/submit_{1:%Y-%m-%d-%H-%M-%S}_{2}.csv'.format(
        folder_path, now, score)

    test_df.loc[:, 'target'] = test_preds
    # NOTE(review): `test_preds_bin` is presumably a module-level array of
    # outlier-classifier predictions -- verify.
    test_df.loc[:, 'Outlier_Likelyhood'] = test_preds_bin
    q = test_df['Outlier_Likelyhood'].quantile(.9999)  # 1.0930%
    # NOTE(review): this overwrites the regression predictions assigned two
    # lines above with likelihood-derived values (1 above the quantile,
    # raw likelihood otherwise) -- looks suspicious; confirm intended.
    test_df.loc[:, 'target'] = test_df['Outlier_Likelyhood'].apply(
        lambda x: 1 if x > q else x)
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv(submission_file_name, index=False)

    # Out-of-fold predictions for stacking / error analysis.
    train_df.loc[:, 'OOF_PRED'] = train_preds
    train_df = train_df.reset_index()
    train_df[['card_id',
              'OOF_PRED']].to_csv('{0}/oof.csv'.format(folder_path), )

    # Submit via the competition API.
    if not is_debug:
        submit(competition_name,
               submission_file_name,
               comment='user02 cv: %.6f' % score)
def kfold_lightgbm(train_df,
                   test_df,
                   num_folds,
                   stratified=False,
                   debug=False):
    """Train a 12-class LightGBM classifier on `click_mode` with k-fold CV.

    For each fold: trains a booster, saves it to ../output/, accumulates
    out-of-fold class probabilities and averaged test predictions, and
    prints the fold's weighted F1. Afterwards reports the full OOF F1 via
    LINE, plots importances, and (unless debug) saves the stacked OOF+test
    probabilities as a pkl feature file.

    NOTE(review): relies on module-level names `cat_cols` and
    `FEATS_EXCLUDED` -- confirm they are defined in this module.
    """

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    # (one row per sample, one column per class probability).
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                categorical_feature=cat_cols,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               categorical_feature=cat_cols,
                               free_raw_data=False)

        # params
        # Seeds vary per fold (2**n_fold) so folds are not identically seeded.
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.1,
            'num_class': 12,
            'colsample_bytree': 0.723387165617351,
            'max_depth': 8,
            'min_child_weight': 42.6805833563236,
            'min_data_in_leaf': 34,
            'min_split_gain': 0.010945157429729,
            'num_leaves': 48,
            'reg_alpha': 1.87287994755334,
            'reg_lambda': 4.8093341415383,
            'subsample': 0.483962708535824,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        # NOTE(review): `early_stopping_rounds`/`verbose_eval` kwargs were
        # removed in lightgbm 4 (replaced by callbacks) -- confirm the
        # pinned lightgbm version is < 4.
        clf = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/lgbm_queries_{}.txt'.format(n_fold))

        # OOF probabilities for this fold; test predictions averaged over folds.
        oof_preds[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration)
        sub_preds += clf.predict(
            test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        # log1p-compressed gain importances per fold.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain',
                                   iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_queries_profiles.png',
                        '../imp/feature_importance_lgbm_queries_profiles.csv')

    if not debug:
        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_queries_profiles{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df.index

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_queries_profiles{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df.index

        # merge (stack OOF rows on top of test rows as one feature table)
        df = oof_preds.append(sub_preds)

        # save as pkl
        save2pkl('../features/queries_profiles_pred.pkl', df)

        line_notify('{} finished.'.format(sys.argv[0]))
Example #7
0
def main(num_rows=None):
    """Build query-level features for train+test and save as pkl.

    Merges phase-1/phase-2 queries with their click labels, derives
    coordinate, datetime and frequency-count features, target-encodes the
    rounded coordinates, and writes ../features/queries.pkl plus the
    resulting feature list into the lgbm config json.
    """
    # load csv
    train_queries = pd.read_csv(
        '../input/data_set_phase2/train_queries_phase2.csv', nrows=num_rows)
    test_queries = pd.read_csv('../input/data_set_phase2/test_queries.csv',
                               nrows=num_rows)
    train_clicks = pd.read_csv(
        '../input/data_set_phase2/train_clicks_phase2.csv')

    # phase 1 csv
    train_queries1 = pd.read_csv(
        '../input/data_set_phase2/train_queries_phase1.csv')
    train_clicks1 = pd.read_csv(
        '../input/data_set_phase2/train_clicks_phase1.csv')

    # merge click
    train_queries = pd.merge(train_queries,
                             train_clicks[['sid', 'click_mode']],
                             on='sid',
                             how='left')
    train_queries1 = pd.merge(train_queries1,
                              train_clicks1[['sid', 'click_mode']],
                              on='sid',
                              how='left')

    # merge phase 1 data
    train_queries = train_queries1.append(train_queries)

    # fill na (queries without a click get mode 0)
    train_queries['click_mode'].fillna(0, inplace=True)

    # set test target as nan
    test_queries['click_mode'] = np.nan

    # merge train & test
    queries_df = train_queries.append(test_queries)

    del train_queries, test_queries, train_queries1, train_clicks, train_clicks1
    gc.collect()

    # to datetime
    queries_df['req_time'] = pd.to_datetime(queries_df['req_time'])

    # origin/destination coordinates ('o' and 'd' are "x,y" strings)
    queries_df['x_o'] = queries_df['o'].apply(
        lambda x: x.split(',')[0]).astype(float)
    queries_df['y_o'] = queries_df['o'].apply(
        lambda x: x.split(',')[1]).astype(float)
    queries_df['x_d'] = queries_df['d'].apply(
        lambda x: x.split(',')[0]).astype(float)
    queries_df['y_d'] = queries_df['d'].apply(
        lambda x: x.split(',')[1]).astype(float)

    # count features
    queries_df['queries_o_count'] = queries_df['o'].map(
        queries_df['o'].value_counts())
    queries_df['queries_d_count'] = queries_df['d'].map(
        queries_df['d'].value_counts())

    queries_df['queries_x_o_count'] = queries_df['x_o'].map(
        queries_df['x_o'].value_counts())
    queries_df['queries_y_o_count'] = queries_df['y_o'].map(
        queries_df['y_o'].value_counts())
    queries_df['queries_x_d_count'] = queries_df['x_d'].map(
        queries_df['x_d'].value_counts())
    queries_df['queries_y_d_count'] = queries_df['y_d'].map(
        queries_df['y_d'].value_counts())

    # Euclidean distance between origin and destination.
    queries_df['queries_distance'] = np.sqrt(
        (queries_df['x_o'] - queries_df['x_d'])**2 +
        (queries_df['y_o'] - queries_df['y_d'])**2)

    queries_df['o_d'] = queries_df['o'].astype(
        str) + '_' + queries_df['d'].astype(str)
    queries_df['queries_o_d_count'] = queries_df['o_d'].map(
        queries_df['o_d'].value_counts())

    # datetime features
    queries_df['queries_weekday'] = queries_df['req_time'].dt.weekday
    queries_df['queries_hour'] = queries_df['req_time'].dt.hour
    queries_df['queries_is_holiday'] = queries_df['req_time'].apply(
        lambda x: is_holiday(x)).astype(int)

    queries_df['queries_weekday_count'] = queries_df['queries_weekday'].map(
        queries_df['queries_weekday'].value_counts())
    queries_df['queries_hour_count'] = queries_df['queries_hour'].map(
        queries_df['queries_hour'].value_counts())

    # coordinate & datetime combination keys
    queries_df['o_d_is_holiday'] = queries_df['queries_is_holiday'].astype(
        str) + '_' + queries_df['o_d']
    queries_df['o_d_weekday'] = queries_df['queries_weekday'].astype(
        str) + '_' + queries_df['o_d']
    queries_df['o_d_hour'] = queries_df['queries_hour'].astype(
        str) + '_' + queries_df['o_d']

    queries_df['o_is_holiday'] = queries_df['queries_is_holiday'].astype(
        str) + '_' + queries_df['o']
    queries_df['o_weekday'] = queries_df['queries_weekday'].astype(
        str) + '_' + queries_df['o']
    queries_df['o_hour'] = queries_df['queries_hour'].astype(
        str) + '_' + queries_df['o']

    queries_df['d_is_holiday'] = queries_df['queries_is_holiday'].astype(
        str) + '_' + queries_df['d']
    queries_df['d_weekday'] = queries_df['queries_weekday'].astype(
        str) + '_' + queries_df['d']
    queries_df['d_hour'] = queries_df['queries_hour'].astype(
        str) + '_' + queries_df['d']

    # frequency counts for the combination keys
    queries_df['queries_o_d_is_holiday_count'] = queries_df[
        'o_d_is_holiday'].map(queries_df['o_d_is_holiday'].value_counts())
    queries_df['queries_o_d_weekday_count'] = queries_df['o_d_weekday'].map(
        queries_df['o_d_weekday'].value_counts())
    queries_df['queries_o_d_hour_count'] = queries_df['o_d_hour'].map(
        queries_df['o_d_hour'].value_counts())

    # BUGFIX: these previously re-used the o_d_* columns (copy-paste), so
    # the origin-only counts merely duplicated the o_d counts and the
    # o_is_holiday/o_weekday/o_hour columns were never used.
    queries_df['queries_o_is_holiday_count'] = queries_df[
        'o_is_holiday'].map(queries_df['o_is_holiday'].value_counts())
    queries_df['queries_o_weekday_count'] = queries_df['o_weekday'].map(
        queries_df['o_weekday'].value_counts())
    queries_df['queries_o_hour_count'] = queries_df['o_hour'].map(
        queries_df['o_hour'].value_counts())

    # BUGFIX: this third group recomputed the o_d counts a second time
    # (a no-op); it now produces the destination-only counts from the
    # otherwise-unused d_* columns.
    queries_df['queries_d_is_holiday_count'] = queries_df[
        'd_is_holiday'].map(queries_df['d_is_holiday'].value_counts())
    queries_df['queries_d_weekday_count'] = queries_df['d_weekday'].map(
        queries_df['d_weekday'].value_counts())
    queries_df['queries_d_hour_count'] = queries_df['d_hour'].map(
        queries_df['d_hour'].value_counts())

    # rounded value features
    queries_df['x_o_round'] = queries_df['x_o'].round(1)
    queries_df['y_o_round'] = queries_df['y_o'].round(1)
    queries_df['x_d_round'] = queries_df['x_d'].round(1)
    queries_df['y_d_round'] = queries_df['y_d'].round(1)
    queries_df['queries_distance_round'] = queries_df[
        'queries_distance'].round(1)

    queries_df['o_round'] = queries_df['x_o_round'].astype(
        str) + '_' + queries_df['y_o_round'].astype(str)
    queries_df['d_round'] = queries_df['x_d_round'].astype(
        str) + '_' + queries_df['y_d_round'].astype(str)
    queries_df['o_d_round'] = queries_df['o_round'].astype(
        str) + '_' + queries_df['d_round'].astype(str)

    queries_df['queries_x_o_round_count'] = queries_df['x_o_round'].map(
        queries_df['x_o_round'].value_counts())
    queries_df['queries_y_o_round_count'] = queries_df['y_o_round'].map(
        queries_df['y_o_round'].value_counts())
    queries_df['queries_x_d_round_count'] = queries_df['x_d_round'].map(
        queries_df['x_d_round'].value_counts())
    queries_df['queries_y_d_round_count'] = queries_df['y_d_round'].map(
        queries_df['y_d_round'].value_counts())
    queries_df['queries_distance_round_count'] = queries_df[
        'queries_distance_round'].map(
            queries_df['queries_distance_round'].value_counts())
    queries_df['queries_o_round_count'] = queries_df['o_round'].map(
        queries_df['o_round'].value_counts())
    queries_df['queries_d_round_count'] = queries_df['d_round'].map(
        queries_df['d_round'].value_counts())
    queries_df['queries_o_d_round_count'] = queries_df['o_d_round'].map(
        queries_df['o_d_round'].value_counts())

    # factorize (label-encode) the rounded values
    queries_df['x_o_round'], _ = pd.factorize(queries_df['x_o_round'])
    queries_df['y_o_round'], _ = pd.factorize(queries_df['y_o_round'])
    queries_df['x_d_round'], _ = pd.factorize(queries_df['x_d_round'])
    queries_df['y_d_round'], _ = pd.factorize(queries_df['y_d_round'])
    queries_df['queries_distance_round'], _ = pd.factorize(
        queries_df['queries_distance_round'])

    # target encoding
    cols_encoding = [
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]
    queries_df = targetEncodingMultiClass(queries_df, 'click_mode',
                                          cols_encoding)

    # drop string features
    cols_drop = [
        'o', 'd', 'o_d', 'o_d_is_holiday', 'o_d_weekday', 'o_d_hour',
        'o_is_holiday', 'o_weekday', 'o_hour', 'd_is_holiday', 'd_weekday',
        'd_hour', 'o_round', 'd_round', 'o_d_round'
    ]

    queries_df.drop(cols_drop, axis=1, inplace=True)

    # reduce memory usage
    queries_df = reduce_mem_usage(queries_df)

    # save as pkl
    save2pkl('../features/queries.pkl', queries_df)

    # save configs (record the generated feature list)
    configs = json.load(open('../configs/101_lgbm_queries.json'))
    configs['features'] = queries_df.columns.to_list()
    to_json(configs, '../configs/101_lgbm_queries.json')

    line_notify('{} finished.'.format(sys.argv[0]))
def main(is_eval=False):
    """Build calendar features (date parts, holidays, events) and save as pkl.

    Note: ``is_eval`` is currently unused but kept for interface parity
    with the other feature scripts.
    """

    def _us_holidays(state=None):
        """Collect US (optionally state-specific) holiday dates for 2011-2016."""
        dates = []
        for y in range(2011, 2017):
            for d, _ in holidays.UnitedStates(state=state, years=y).items():
                dates.append(d)
        return dates

    def _span_flag(events, start_name, end_name):
        """Return 0/1 flags marking rows between a start and an end event
        in `events` (both endpoint rows inclusive)."""
        flags = []
        active = 0
        for e in events:
            if e == start_name:
                active = 1
            flags.append(active)
            if e == end_name:
                active = 0
        return flags

    # load csv
    df = pd.read_csv('../input/calendar.csv')

    # to datetime
    df['date'] = pd.to_datetime(df['date'])

    # yearly seasonality as a cosine over day-of-year
    df['seasonality'] = np.cos(np.pi * (df['date'].dt.dayofyear / 366 * 2 - 1))

    # drop string columns
    df.drop('weekday', axis=1, inplace=True)

    # calendar parts
    df['day'] = df['date'].dt.day
    df['week'] = df['date'].dt.weekofyear
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['year'] = (df['year'] - df['year'].min())
    df['weekofmonth'] = df['day'].apply(lambda x: ceil(x / 7))

    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # holiday features (compare as datetime.date)
    df['date'] = df['date'].apply(lambda x: x.date())  # to date

    # REFACTOR: the four hand-rolled collection loops are deduplicated
    # into the _us_holidays helper above (behavior unchanged).
    holidays_us = _us_holidays()
    holidays_ca = _us_holidays('CA')
    holidays_tx = _us_holidays('TX')
    holidays_wi = _us_holidays('WI')

    df['is_holiday_us'] = df['date'].apply(lambda x: 1
                                           if x in holidays_us else 0)
    df['is_holiday_ca'] = df['date'].apply(lambda x: 1
                                           if x in holidays_ca else 0)
    df['is_holiday_tx'] = df['date'].apply(lambda x: 1
                                           if x in holidays_tx else 0)
    df['is_holiday_wi'] = df['date'].apply(lambda x: 1
                                           if x in holidays_wi else 0)

    # preprocess event_name_1
    # to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Moon Phase
    df['moon'] = df['date'].apply(get_moon_phase)

    # add missing start/end markers for multi-day events
    event_dates = {
        'Ramadan ends': ['2011-8-29', '2012-8-18', '2013-8-7', '2014-7-27',
                         '2015-7-16', '2016-7-5'],
        'Pesach Start': ['2011-4-18', '2012-4-6', '2013-3-25', '2014-4-14',
                         '2015-4-3', '2016-4-22'],
        'Purim Start': ['2011-3-19', '2012-3-7', '2013-2-23', '2014-3-15',
                        '2015-3-4', '2016-3-23'],
        'Chanukah Start': ['2011-12-21', '2012-12-9', '2013-11-28',
                           '2014-12-17', '2015-12-7', '2016-12-25'],
    }
    for name, dates in event_dates.items():
        for d in dates:
            df.loc[df['date'] == d, 'event_name_1'] = name

    # in-event period flags derived from start/end markers in event_name_1
    # REFACTOR: the six identical flag state machines are deduplicated
    # into the _span_flag helper above (behavior unchanged).
    span_specs = [
        ('is_NBA_final', 'NBAFinalsStart', 'NBAFinalsEnd'),
        ('is_lent', 'LentStart', 'Easter'),
        ('is_ramadan', 'Ramadan starts', 'Ramadan ends'),
        ('is_pesach', 'Pesach Start', 'Pesach End'),
        ('is_purim', 'Purim Start', 'Purim End'),
        ('is_chanukah', 'Chanukah Start', 'Chanukah End'),
    ]
    for col, start_name, end_name in span_specs:
        df[col] = _span_flag(df['event_name_1'], start_name, end_name)

    # add blackfriday flag
    blackfriday_dates = [
        '2011-11-25', '2012-11-23', '2013-11-29', '2014-11-28', '2015-11-27'
    ]
    df['is_blackfriday'] = 0
    for d in blackfriday_dates:
        df.loc[df['date'] == d, 'is_blackfriday'] = 1

    # factorize string columns (factorize encodes NaN as -1; restore NaN)
    cols_string = [
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
    ]
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/calendar.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
Example #9
0
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """Train LightGBM regressors on log1p(visitors) with k-fold CV.

    Splits `df` into train (visitors present) / test (visitors NaN), trains
    one GOSS booster per fold, reports per-fold and overall MAE, and
    (unless debug) writes the submission and out-of-fold prediction files.

    Returns the concatenated per-fold feature importance DataFrame.

    NOTE(review): relies on module-level names `submission_file_name`,
    `oof_file_name` and `FEATS_EXCLUDED` -- confirm they exist in this
    module.
    """
    # Divide in training/validation and test data.
    # BUGFIX: take explicit copies -- the later `.loc[:, col] = ...`
    # assignments were writing into views of `df`, which raises
    # SettingWithCopyWarning and can silently fail to set values.
    train_df = df[df['visitors'].notnull()].copy()
    test_df = df[df['visitors'].isnull()].copy()

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold (target is log1p-transformed; predictions are expm1'd back)
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['visitors'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # hyperparameters (roughly tuned; seeds vary per fold via 2**n_fold)
        params = {
            'device': 'gpu',
            'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'goss',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 64,
            'colsample_bytree': 0.977334338875847,
            'subsample': 0.027687793278932,
            'max_depth': 20,
            'reg_alpha': 9.72886163508719,
            'reg_lambda': 9.9935502633216,
            'min_split_gain': 0.178508066955524,
            'min_child_weight': 43.4750700383884,
            'min_data_in_leaf': 18,
            'other_rate': 0.925113620582013,
            'top_rate': 0.006970683025472,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/lgbm_' + str(n_fold) + '.txt')

        # OOF predictions for this fold; test predictions averaged over folds.
        oof_preds[valid_idx] = np.expm1(
            reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds += np.expm1(
            reg.predict(test_df[feats],
                        num_iteration=reg.best_iteration)) / folds.n_splits

        # log1p-compressed gain importances per fold.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' %
              (n_fold + 1,
               mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full MAE score & LINE notify (typo fixed: was 'LigntGBM')
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('LightGBM Full MAE score %.6f' % full_mae)

    if not debug:
        # save test predictions for submission (tab-separated, no header)
        test_df.loc[:, 'visitors'] = sub_preds
        test_df[['index',
                 'visitors']].sort_values('index').to_csv(submission_file_name,
                                                          index=False,
                                                          header=False,
                                                          sep='\t')

        # save out-of-fold predictions
        train_df.loc[:, 'OOF_PRED'] = oof_preds
        train_df[['index',
                  'OOF_PRED']].sort_values('index').to_csv(oof_file_name,
                                                           index=False)

    return feature_importance_df
Example #10
0
def kfold_lightgbm(train_df,
                   test_df,
                   num_folds,
                   stratified=False,
                   debug=False):
    """Train LightGBM for the 12-class `click_mode` task with k-fold CV.

    Trains one GPU model per fold (saved to ../output/), reports per-fold
    and overall weighted F1 (with a LINE notification), renders feature
    importances, and — unless `debug` — writes the submission file, the
    out-of-fold predictions, and a stacked oof/test prediction pkl for
    later ensembling.
    """

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross-validation splitter (fixed random_state for reproducibility)
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Per-class probability accumulators: oof is filled fold by fold,
    # sub is averaged over folds.
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                categorical_feature=CAT_COLS,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               categorical_feature=CAT_COLS,
                               free_raw_data=False)

        # params (seeds vary per fold so fold models are decorrelated)
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.01,
            'num_class': 12,
            'num_leaves': 52,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'reg_alpha': 4.762312990232561,
            'reg_lambda': 9.98131082276387,
            'min_split_gain': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'min_data_in_leaf': 17,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        clf = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/lgbm_3_{}.txt'.format(n_fold))

        oof_preds[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration)
        sub_preds += clf.predict(
            test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        # per-fold gain importance (log1p-compressed for nicer plots)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain',
                                   iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df, '../imp/lgbm_importances_3.png',
                        '../imp/feature_importance_lgbm_3.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: when only one plan was offered, a non-zero
        # recommendation must be that plan's transport mode.
        # Use .loc with a mask — the previous chained indexing
        # (df[col][mask] = ...) raises SettingWithCopyWarning and can
        # silently fail to write back.
        mask = (test_df['plan_num_plans'] == 1) & (test_df['recommend_mode']
                                                   != 0)
        test_df.loc[mask,
                    'recommend_mode'] = test_df.loc[mask,
                                                    'plan_0_transport_mode']

        # save csv
        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                                  index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                                 index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_lgbm_plans{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_lgbm_plans{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge oof (train) and test predictions for stacking.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # drop-in equivalent (same default index behavior).
        df = pd.concat([oof_preds, sub_preds])

        # save as pkl
        save2pkl('../features/lgbm_pred_3.pkl', df)

        line_notify('{} finished.'.format(sys.argv[0]))
Example #11
0
def kfold_xgboost(df, num_folds, stratified=False, debug=False):
    """Train XGBoost regressors for `visitors` with k-fold CV.

    `df` carries train and test rows together: rows with a non-null
    `visitors` are training data, null rows are test data. The target is
    modeled as log1p(visitors) and predictions are expm1-ed back.

    Returns the concatenated per-fold feature-importance DataFrame and,
    unless `debug`, writes the submission and out-of-fold predictions.
    """

    # Divide in training/validation and test data
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross-validation splitter (fixed random_state for reproducibility)
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # Pre-build the test DMatrix once for the per-fold predictions.
    # BUG FIX: the previous `label=train_df['visitors']` supplied a label
    # whose length does not match the test rows; a prediction-only
    # DMatrix needs no label at all.
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold (folds stratified on 'park_japanese_holiday' when enabled)
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        # log1p-transform the target for training
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(train_df['visitors'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x,
                                label=train_y)
        xgb_test = xgb.DMatrix(valid_x,
                               label=valid_y)

        # params (seed varies per fold so fold models are decorrelated)
        params = {
                'objective':'gpu:reg:linear', # GPU parameter
                'booster': 'gbtree',
                'eval_metric':'rmse',
                'silent':1,
                'eta': 0.01,
                'max_depth': 8,
                'min_child_weight': 19,
                'gamma': 0.089444100759612,
                'subsample': 0.91842954303314,
                'colsample_bytree': 0.870658058238432,
                'colsample_bylevel': 0.995353255250289,
                'alpha':19.9615600411437,
                'lambda': 2.53962270252528,
                'tree_method': 'gpu_hist', # GPU parameter
                'predictor': 'gpu_predictor', # GPU parameter
                'seed':int(2**n_fold)
                }

        reg = xgb.train(
                        params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train,'train'),(xgb_test,'test')],
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        # save model
        reg.save_model('../output/xgb_'+str(n_fold)+'.txt')

        # invert the log1p transform; test preds are averaged over folds
        oof_preds[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        # per-fold gain importance keyed by feature name
        fold_importance_df = pd.DataFrame.from_dict(reg.get_score(importance_type='gain'), orient='index', columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' % (n_fold + 1, mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # Full MAEスコアの表示&LINE通知
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('XGBoost Full MAE score %.6f' % full_mae)

    if not debug:
        # 提出データの予測値を保存
        test_df.loc[:,'visitors'] = sub_preds
        test_df[['index', 'visitors']].sort_values('index').to_csv(submission_file_name, index=False, header=False, sep='\t')

        # out of foldの予測値を保存
        train_df.loc[:,'OOF_PRED'] = oof_preds
        train_df[['index', 'OOF_PRED']].sort_values('index').to_csv(oof_file_name, index= False)

    return feature_importance_df
def main():
    """CLI entry point: parse arguments, build the requested quantum
    system (1 or 2 subsystems, chosen by --regime), run a quantum state
    diffusion (QSD) simulation, and save downsampled results as .mat
    and/or .pkl files."""
    parser = get_parser()
    try:
        args = parser.parse_args()
    except:
        # NOTE(review): bare except also swallows SystemExit raised by
        # argparse for --help / bad args; exits quietly either way.
        sys.exit(0)

    # Set up commands from parser
    # Each CLI value is stored in `params` (saved with the results) and
    # bound to a local name for use below.
    params = dict()
    ntraj = params['Ntraj'] = args.ntraj
    seed = params['seed'] = args.seed
    duration = params['duration'] = args.duration
    delta_t = params['delta_t'] = args.deltat
    Nfock_a = params['Nfock_a'] = args.nfocka
    Nfock_j = params['Nfock_j'] = args.nfockj
    downsample = params['downsample'] = args.downsample
    Regime = params['regime'] = args.regime
    num_systems = params['num_systems'] = args.num_systems
    drive_second_system = params[
        'drive_second_system'] = args.drive_second_system

    if args.sdeint_method_name == "":
        logging.info(
            "sdeint_method_name not set. Using itoEuler as a default.")
        sdeint_method_name = params['sdeint_method_name'] = "itoEuler"
    else:
        sdeint_method_name = params[
            'sdeint_method_name'] = args.sdeint_method_name

    R = params['R'] = args.R
    eps = params['eps'] = args.eps
    noise_amp = params['noise_amp'] = args.noise_amp
    trans_phase = params['trans_phase'] = args.trans_phase

    # Does the user want to print verbose output?
    quiet = args.quiet

    if not quiet:
        print_params(params=params)

    # How much to downsample results
    logging.info("Downsample set to %s", downsample)

    ## Names of files and output
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = args.outdir

    # Ensure the output directory exists.
    # NOTE(review): stat-then-mkdir is race-prone and the bare except
    # hides unrelated errors; os.makedirs(outdir, exist_ok=True) is the
    # usual form — confirm before changing.
    try:
        os.stat(outdir)
    except:
        os.mkdir(outdir)

    # Encode all 14 run parameters into the output file name.
    param_str = ("%s-" * 14)[:-1] % (seed, ntraj, delta_t, Nfock_a, Nfock_j,
                                     duration, downsample, sdeint_method_name,
                                     num_systems, R, eps, noise_amp,
                                     trans_phase, drive_second_system)
    file_name = '%s/QSD_%s_%s' % (outdir, Regime, param_str)

    # Saving options
    save_mat = args.save2mat
    save_pkl = args.save2pkl

    if save_mat == False and save_pkl == False:
        logging.warning(
            "Both pickle and mat save are disabled, no data will be saved.")
        logging.warning(
            "You can modify this with args --save2pkl and --save2mat")

    implicit_type = None

    # Resolve the SDE integrator by name from the registry.
    if sdeint_method_name in SDEINT_METHODS:
        sdeint_method = SDEINT_METHODS[sdeint_method_name]

        ## For now let's use the full implicit method for implicit methods.
        ## The value implicit_type can be made one of:
        ## "implicit", "semi_implicit_drift", or "semi_implicit_diffusion".
        if sdeint_method_name in IMPLICIT_METHODS:
            implicit_type = "implicit"
    else:
        logging.error(
            "Unknown sdeint_method_name, %s, or not implemented yet.",
            sdeint_method_name)
        raise ValueError("Unknown sdeint_method_name, or not implemented yet.")

    tspan = np.arange(0, duration, delta_t)

    obsq_data = None
    if num_systems == 1:

        # Dispatch on the regime string to the matching system factory.
        if Regime == "absorptive_bistable":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_JC(
                Nfock_a, Nfock_j)
        elif Regime == "kerr_bistable":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable(
                Nfock_a)
        elif Regime == "kerr_bistable2":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime2(
                Nfock_a)
        elif Regime == "kerr_bistable3":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime3(
                Nfock_a)
        elif Regime == "kerr_bistable4":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime4(
                Nfock_a)
        elif Regime == "kerr_bistable5":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime5(
                Nfock_a)
        elif Regime == "kerr_bistable6":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime6(
                Nfock_a)
        elif Regime == "kerr_bistable7":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime7(
                Nfock_a)
        elif Regime[:len(
                "kerr_bistable"
        )] == "kerr_bistable":  ##inputs in this case are e.g. kerr_bistableA33.25_...
            # Parse "kerr_bistable<letter><drive>" into its two pieces.
            which_kerr = Regime[len(
                "kerr_bistable")]  ## e.g. A in kerr_bistableA33.25_
            custom_drive = float(Regime[len("kerr_bistableA"):]
                                 )  ## e.g. 33.25 in kerr_bistableA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime_chose_drive(
                Nfock_a, which_kerr, custom_drive)
        elif Regime == "kerr_qubit":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_qubit(Nfock_a)
        else:
            logging.error("Unknown regime, %s, or not implemented yet.",
                          Regime)
            raise ValueError("Unknown regime, or not implemented yet.")

        ### Run simulation for one system
        D = qsd_solve(
            H=H,
            psi0=psi0,
            tspan=tspan,
            Ls=Ls,
            sdeint_method=sdeint_method,
            obsq=obsq_data,
            ntraj=ntraj,
            seed=seed,
            normalize_state=True,
            downsample=downsample,
            implicit_type=implicit_type,
        )
    elif num_systems == 2:

        if Regime == "absorptive_bistable":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_JC_two_systems(
                Nfock_a, Nfock_j, drive_second_system)
        elif Regime == "kerr_bistable":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_bistable_two_systems(
                Nfock_a, drive_second_system)
        elif Regime == "kerr_qubit":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_qubit_two_systems(
                Nfock_a, drive_second_system)
        elif Regime[:len("empty_then_kerr"
                         )] == 'empty_then_kerr':  ##e.g. empty_then_kerrA33.25
            which_kerr = Regime[len(
                "empty_then_kerr")]  ## e.g. A in empty_then_kerrA33.25_
            custom_drive = float(Regime[len("empty_then_kerrA"):]
                                 )  ## e.g. 33.25 in empty_then_kerrA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_empty_then_kerr(
                Nfock_a, which_kerr, custom_drive)
        elif Regime[:len(
                "kerr_bistable"
        )] == "kerr_bistable":  ##inputs in this case are e.g. kerr_bistableA33.25_...
            which_kerr = Regime[len(
                "kerr_bistable")]  ## e.g. A in kerr_bistableA33.25_
            custom_drive = float(Regime[len("kerr_bistableA"):]
                                 )  ## e.g. 33.25 in kerr_bistableA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_bistable_regime_chose_drive_two_systems(
                Nfock_a, which_kerr, custom_drive)

        else:
            logging.error("Unknown regime, %s, or not implemented yet.",
                          Regime)
            raise ValueError("Unknown regime, or not implemented yet.")

        ### Run simulation for one system
        D = qsd_solve_two_systems(
            H1,
            H2,
            psi0,
            tspan,
            L1s,
            L2s,
            R=R,
            eps=eps,
            n=noise_amp,
            sdeint_method=sdeint_method,
            trans_phase=trans_phase,
            obsq=obsq_data,
            normalize_state=True,
            downsample=downsample,
            ops_on_whole_space=
            False,  ## assume the given operators only operate on their own subspace
            multiprocessing=False,  ## disable multiprocessing for now
            ntraj=ntraj,
            seed=seed,
            implicit_type=implicit_type,
        )
    else:
        logging.error("Unknown num_systems, %s, or not implemented yet.",
                      num_systems)
        raise ValueError("Unknown num_systems, or not implemented yet.")

    ### include time in results
    D.update({'tspan': tspan})

    ### downsample
    # NOTE(review): only 'tspan' is sliced here; 'psis' and
    # 'obsq_expects' are copied as-is (the solvers above already take a
    # `downsample` argument) — confirm this is intended.
    D_downsampled = {
        'psis': D['psis'],
        'obsq_expects': D['obsq_expects'],
        'seeds': D['seeds'],
        'tspan': D['tspan'][::downsample]
    }

    ### Save results
    if save_mat:
        logging.info("Saving mat file...")
        save2mat(data=D_downsampled,
                 file_name=file_name,
                 obs=obs_names,
                 params=params)
    if save_pkl:
        logging.info("Saving pickle file...")
        save2pkl(data=D_downsampled,
                 file_name=file_name,
                 obs=obs_names,
                 params=params)
                                lambd=lambd,
                                n=noise_amp,
                                sdeint_method=sdeint_method,
                                trans_phase=trans_phase,
                                obsq=obsq_data,
                                normalize_state=True,
                                downsample=downsample,
                                multiprocessing=False,
                                ntraj=ntraj,
                                processes=8,
                                seed=1,
                                implicit_type=None)

    ## include time in results, and unfiltered behavior of the generated
    ## trajectory of system 1.
    D.update({'tspan': tspan, 'sys1_expects': obs})

    ### Save results
    if save_mat:
        logging.info("Saving mat file...")
        save2mat(data=D,
                 file_name=output_file_path,
                 obs=obs_names,
                 params=params)
    if save_pkl:
        logging.info("Saving pickle file...")
        save2pkl(data=D,
                 file_name=output_file_path,
                 obs=obs_names,
                 params=params)
def main():
    """CLI entry point for the hard-coded "absorptive_bistable" regime:
    build the Jaynes–Cummings-style SLH model symbolically, convert it
    to numerical (qutip) operators, run a QSD simulation with the
    itoEuler integrator, and save downsampled results as .mat and/or
    .pkl files."""
    parser = get_parser()
    try:
        args = parser.parse_args()
    except:
        # NOTE(review): bare except also swallows SystemExit raised by
        # argparse for --help / bad args; exits quietly either way.
        sys.exit(0)

    # Set up commands from parser
    # Each CLI value is stored in `params` (saved with the results) and
    # bound to a local name for use below.
    params = dict()
    ntraj = params['Ntraj'] = args.ntraj
    seed = params['seed'] = args.seed
    duration = params['duration'] = args.duration
    delta_t = params['delta_t'] = args.deltat
    Nfock_a = params['Nfock_a'] = args.nfocka
    Nfock_j = params['Nfock_j'] = args.nfockj
    downsample = params['downsample'] = args.downsample

    # Does the user want to print verbose output?
    quiet = args.quiet

    if not quiet:
        print_params(params=params)

    # How much to downsample results
    logging.info("Downsample set to %s", downsample)

    ## Names of files and output
    Regime = "absorptive_bistable"
    param_str = "%s-%s-%s-%s-%s-%s" % (seed, ntraj, delta_t, Nfock_a, Nfock_j,
                                       duration)
    outdir = ""
    if args.outdir != None:
        outdir = args.outdir
    file_name = '%s/QSD_%s_%s' % (outdir, Regime, param_str)

    # Saving options
    save_mat = args.save2mat
    save_pkl = args.save2pkl

    if save_mat == False and save_pkl == False:
        logging.warning(
            "Both pickle and mat save are disabled, no data will be saved.")
        logging.warning(
            "You can modify this with args --save2pkl and --save2mat")

    # ## Make Operators
    # Cavity mode annihilation/creation operators.
    a = Destroy(1)
    ad = a.dag()

    # Two-level lowering/raising and inversion operators.
    sm = LocalSigma(2, 1, 0) / sqrt(2)
    sp = sm.dag()
    sz = sp * sm - sm * sp

    # Collective angular-momentum operators for the atomic ensemble.
    j = Jminus(2)
    jp = j.dag()
    jz = Jz(2)

    jx = (jp + j) / 2.
    jy = (jp - j) / 2.

    # ## Make SLH Model
    # Symbolic model parameters (substituted numerically below).
    k, g0, g = symbols("kappa, g0,gamma", positive=True)
    DD, TT = symbols("Delta, Theta", real=True)
    W = symbols("Omega")

    L = [sqrt(k) * a, sqrt(g) * j]
    H = -I * g0 * (a * jp - ad * j) + DD * jz + TT * ad * a
    S = identity_matrix(2)

    # Compose the SLH triple with a coherent drive on the first input.
    slh = SLH(S, L, H).coherent_input(W, 0)
    slh

    ## Numerical parameters
    a.space.dimension = Nfock_a
    j.space.dimension = Nfock_j

    if Regime == "absorptive_bistable":
        logging.info("Regime is set to %s", Regime)
        nparams = make_nparams(W=W, k=k, g=g, g0=g0, DD=DD, TT=TT)
    else:
        # Unreachable while Regime is hard-coded above; kept as a guard.
        logging.error("Unknown regime, %s, or not implemented yet.", Regime)
        raise ValueError("Unknown regime, or not implemented yet.")

    # Substitute numeric values and convert to qutip operators.
    Hq, Lqs = slh.substitute(nparams).HL_to_qutip()

    ## Observables
    obs = (a, j, jz, a * a, a.dag() * a, a * jp, jp, jx, jy)
    obsq = [o.to_qutip(full_space=slh.space) for o in obs]

    tspan = np.arange(0, duration, delta_t)
    # Initial state: both subsystems in their ground Fock state.
    psi0 = qutip.tensor(qutip.basis(Nfock_a, 0), qutip.basis(Nfock_j, 0)).data
    # Extract raw sparse-matrix data for the solver.
    H = Hq.data
    Ls = [Lq.data for Lq in Lqs]
    obsq = [ob.data for ob in obsq]

    ### Run simulation
    D = qsd_solve(H=H,
                  psi0=psi0,
                  tspan=tspan,
                  Ls=Ls,
                  sdeint_method=sdeint.itoEuler,
                  obsq=obsq,
                  ntraj=ntraj,
                  seed=seed,
                  normalize_state=True)

    ### include time in results
    D.update({'tspan': tspan})

    ### downsample
    # Keep every `downsample`-th time point (axis 1 is time).
    D_downsampled = {
        'psis': D['psis'][:, ::downsample],
        'obsq_expects': D['obsq_expects'][:, ::downsample],
        'seeds': D['seeds'],
        'tspan': D['tspan'][::downsample]
    }

    ### Save results
    if save_mat:
        logging.info("Saving mat file...")
        save2mat(data=D_downsampled,
                 file_name=file_name,
                 obs=obs,
                 params=params)
    if save_pkl:
        logging.info("Saving pickle file...")
        save2pkl(data=D_downsampled,
                 file_name=file_name,
                 obs=obs,
                 params=params)
Example #15
0
def kfold_xgboost(train_df, test_df, num_folds, stratified=False, debug=False):
    """Train XGBoost for the 12-class `click_mode` task with k-fold CV.

    Trains one GPU model per fold (saved to ../output/), reports per-fold
    and overall weighted F1 (with a LINE notification), renders feature
    importances, and — unless `debug` — writes the submission file, the
    out-of-fold predictions, and a stacked oof/test prediction pkl for
    later ensembling.
    """

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross-validation splitter (fixed random_state for reproducibility)
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Per-class margin accumulators: oof is filled fold by fold,
    # sub is averaged over folds.
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # dmatrix for test_df (built once, reused every fold)
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params (seed varies per fold so fold models are decorrelated)
        params = {
            'device': 'gpu',
            'objective': 'multi:softmax',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'mlogloss',
            'num_class': 12,
            'eta': 0.05,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'alpha': 4.762312990232561,
            'lambda': 9.98131082276387,
            'gamma': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'silent': 1,
            'seed': int(2**n_fold)
        }

        # train model
        clf = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/xgb_' + str(n_fold) + '.txt')

        # keep raw per-class margins so argmax picks the predicted class
        oof_preds[valid_idx] = clf.predict(xgb_test, output_margin=True)
        sub_preds += clf.predict(test_df_dmtrx,
                                 output_margin=True) / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame.from_dict(
            clf.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y, xgb_train, xgb_test
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df, '../imp/xgb_importances.png',
                        '../imp/feature_importance_xgb.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: when only one plan was offered, a non-zero
        # recommendation must be that plan's transport mode.
        # Use .loc with a mask — the previous chained indexing
        # (df[col][mask] = ...) raises SettingWithCopyWarning and can
        # silently fail to write back.
        mask = (test_df['plan_num_plans'] == 1) & (test_df['recommend_mode']
                                                   != 0)
        test_df.loc[mask,
                    'recommend_mode'] = test_df.loc[mask,
                                                    'plan_0_transport_mode']

        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                                  index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                                 index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_xgb_plans{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_xgb_plans{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge oof (train) and test predictions for stacking.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # drop-in equivalent (same default index behavior).
        df = pd.concat([oof_preds, sub_preds])

        # save as pkl
        save2pkl('../features/xgb_pred.pkl', df)

        line_notify('{} finished.'.format(sys.argv[0]))