Code Example #1
def run_ridge_on_cat(cat):
    if not is_in_cache('cat_ridges_blend_l3_' + cat):
        print_step(cat + ' > Subsetting')
        # Boolean masks built on the raw frames; pandas aligns them to the
        # feature frames (train_/test_) by shared index.
        train_c = train_[train['parent_category_name'] == cat].copy()
        test_c = test_[test['parent_category_name'] == cat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop('item_id', axis=1, inplace=True)

        print_step(cat + ' > Modeling')
        results = run_cv_model(train_c, test_c, target, runRidge, params, rmse,
                               cat + '-ridge-blend')
        train_c['cat_ridge'] = results['train']
        test_c['cat_ridge'] = results['test']
        print_step(cat + ' > RMSE: ' + str(rmse(target, train_c['cat_ridge'])))

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat + ' > Saving in Cache')
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id
        save_in_cache('cat_ridges_blend_l3_' + cat,
                      train_c[['item_id',
                               'cat_ridge']], test_c[['item_id', 'cat_ridge']])
        return True
    else:
        print_step('Already have ' + cat + '...')
        return True
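These snippets all lean on a small cache layer (is_in_cache, save_in_cache, load_cache) whose implementation is not shown. A minimal sketch of the contract they appear to follow, assuming one pickle file per key in a cache/ directory (the real helpers may well use feather or CSV instead):

import os
import pickle

CACHE_DIR = 'cache'  # assumed location; not part of the original code

def is_in_cache(key):
    # One pickle per cache key, holding the (train, test) pair.
    return os.path.exists(os.path.join(CACHE_DIR, key + '.pkl'))

def save_in_cache(key, train, test):
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(os.path.join(CACHE_DIR, key + '.pkl'), 'wb') as f:
        pickle.dump((train, test), f)

def load_cache(key):
    with open(os.path.join(CACHE_DIR, key + '.pkl'), 'rb') as f:
        return pickle.load(f)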
Code Example #2
    train_fe.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                   'identity_hate'], axis=1, inplace=True)
    print_step('Saving')
    save_in_cache('fe_lgb_data', train_fe, test_fe)
    del train_fe
    del test_fe
    gc.collect()

print('~~~~~~~~~~~~')
print_step('Run LGB')
train, test = run_cv_model(label='sparse_fe_lgb',
                           data_key='tfidf_char_union',
                           model_fn=runSparseLGB,
                           train=train,
                           test=test,
                           kf=kf)
# toxic CV scores : [0.9826496062603199, 0.9830212932736853, 0.9815062563553301, 0.98022789149499, 0.981731541721145]
# toxic mean CV : 0.9818273178210941
# severe_toxic CV scores : [0.9907376375169112, 0.9888719942493184, 0.9903119467039991, 0.9922809301301098, 0.9887765464258907]
# severe_toxic mean CV : 0.9901958110052458
# obscene CV scores : [0.9933673973796135, 0.993919978856799, 0.9926754787135739, 0.9927263904855579, 0.9933408309332551]
# obscene mean CV : 0.9932060152737598
# threat CV scores : [0.9893472977361535, 0.9912972922362948, 0.9904282818441812, 0.99134220616599, 0.9881882482937496]
# threat mean CV : 0.9901206652552738
# insult CV scores : [0.9832124677272037, 0.9835326755212629, 0.9839356436291075, 0.986883748038697, 0.9858095196238779]
# insult mean CV : 0.9846748109080299
# identity_hate CV scores : [0.9843095304682539, 0.9885634545571751, 0.981675404744786, 0.9885268357417188, 0.988652610966542]
# identity_hate mean CV : 0.9863455672956952
# ('sparse_fe_lgb overall : ', 0.987728364593183)
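print_step appears in every example but is never defined. The timestamped log lines quoted in Code Example #5 ('[2018-06-06 05:26:11.306486] tffm2 cv scores ...') suggest it is simply a wall-clock-stamped print; a sketch consistent with that output format:

import datetime

def print_step(step):
    # Produces the '[YYYY-MM-DD HH:MM:SS.ffffff] message' lines seen in the logs.
    print('[{}] {}'.format(datetime.datetime.now(), step))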
Code Example #3
    del train_
    del test_
    del test_ohe
    del test_ohe2
    del train_ohe
    del train_ohe2
    gc.collect()

    print_step('Caching')
    save_in_cache('deep_text_feats2', train, test)
else:
    train, test = load_cache('deep_text_feats2')

print('~~~~~~~~~~~~')
print_step('Run LGB')
results = run_cv_model(train, test, target, runLGB, params, rmse, 'deep_lgb2')
# Manual checkpoint: drop into the debugger to inspect CV results before caching.
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('deep_lgb2', pd.DataFrame({'deep_lgb2': results['train']}),
              pd.DataFrame({'deep_lgb2': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_deep_lgb2.csv', index=False)
print_step('Done!')
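The Avito examples call a positional run_cv_model(train, test, target, model_fn, params, eval_fn, label) and read back results['train'] (out-of-fold predictions) and results['test'] (fold-averaged test predictions); the toxic-comment examples use a keyword variant that returns updated train/test frames instead. A minimal sketch of the positional contract, together with the rmse metric it is handed. The fold-wrapper signature model_fn(train_X, train_y, val_X, val_y, test_X, params) is an assumption carried through the wrapper sketches below:

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def rmse(actual, predicted):
    # Root mean squared error, the Avito deal_probability metric.
    return np.sqrt(mean_squared_error(actual, predicted))

def run_cv_model(train, test, target, model_fn, params, eval_fn, label):
    # Assumes numpy arrays or CSR matrices; DataFrame input would need .iloc.
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    oof = np.zeros(train.shape[0])
    test_pred = np.zeros(test.shape[0])
    for fold, (trn_idx, val_idx) in enumerate(kf.split(train)):
        val_pred, fold_test = model_fn(train[trn_idx], target[trn_idx],
                                       train[val_idx], target[val_idx],
                                       test, params)
        oof[val_idx] = val_pred
        test_pred += fold_test / kf.n_splits
        print('{} fold {}: {}'.format(label, fold + 1,
                                      eval_fn(target[val_idx], val_pred)))
    print('{} overall: {}'.format(label, eval_fn(target, oof)))
    return {'train': oof, 'test': test_pred}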
Code Example #4
    del train_fe
    del test_fe
    gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train.columns.values)

train, test = get_data()
train, test = run_cv_model(label='fe_lgb',
                           data_key='lgb_fe_with_embeddings_and_svd',
                           model_fn=runLGB,
                           train=train,
                           test=test,
                           kf=kf)

import pdb
pdb.set_trace()
print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('lvl1_fe_lgb', train, test)

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test['id']
submission['toxic'] = test['fe_lgb_toxic']
submission['severe_toxic'] = test['fe_lgb_severe_toxic']
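The toxic-comment pipelines (this example and several below) build the splitter once with a fixed seed and hand it to run_cv_model. Note that StratifiedKFold.split requires a label array, so a multi-label problem has to stratify on one target column (or a derived stratum). A toy illustration of how the kf object is consumed:

import numpy as np
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
X = np.random.rand(100, 5)              # stand-in feature matrix
y = np.random.randint(0, 2, size=100)   # stand-in for a single label column
for train_idx, valid_idx in kf.split(X, y):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    # fit on the train fold, score the valid fold, accumulate OOF predictions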
Code Example #5
print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/2')
train, test = load_cache('complete_fm_data')

print('~~~~~~~~~~~~~~')
print_step('Run TFFM2')
results = run_cv_model(train, test, target, runTFFM, params2, rmse, 'tffm2')

print_step('Cache')
save_in_cache('tffm2', pd.DataFrame({'tffm2': results['train']}),
              pd.DataFrame({'tffm2': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_tffm2.csv', index=False)
print_step('Done!')
# [2018-06-06 05:26:11.306486] tffm2 cv scores : [0.22541019023816908, 0.22463933727489538, 0.22452885067937228, 0.2245032642720666, 0.22523463698732962]
# [2018-06-06 05:26:11.306561] tffm2 mean cv score : 0.22486325589036663
# [2018-06-06 05:26:11.306664] tffm2 std cv score : 0.0003817385117403105
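runTFFM and its params2 are external; the name points at the tffm package (TensorFlow factorization machines). A hedged sketch of such a wrapper under the fold signature assumed earlier; every hyperparameter below is a placeholder, not the author's actual params2:

import tensorflow as tf
from tffm import TFFMRegressor  # TF1-era tffm package

def runTFFM(train_X, train_y, val_X, val_y, test_X, params):
    # Order-2 FM over sparse inputs; constructor values are illustrative only.
    model = TFFMRegressor(order=params.get('order', 2),
                          rank=params.get('rank', 10),
                          optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                          n_epochs=params.get('n_epochs', 10),
                          batch_size=params.get('batch_size', 1024),
                          input_type='sparse')
    model.fit(train_X, train_y, show_progress=True)
    return model.predict(val_X), model.predict(test_X)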
Code Example #6
def run_ridge_on_regioncat(regioncat):
    if not is_in_cache('regioncat_ridges_' + regioncat):
        print_step(regioncat + ' > Subsetting')
        train_c = train[train['region_X_cat'] == regioncat].copy()
        test_c = test[test['region_X_cat'] == regioncat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop(['item_id'], axis=1, inplace=True)

        print_step(regioncat + ' > Titlecat TFIDF 1/3')
        train_c['titlecat'] = (train_c['category_name'].fillna('') + ' ' +
                               train_c['param_1'].fillna('') + ' ' +
                               train_c['param_2'].fillna('') + ' ' +
                               train_c['param_3'].fillna('') + ' ' +
                               train_c['title'].fillna(''))
        test_c['titlecat'] = (test_c['category_name'].fillna('') + ' ' +
                              test_c['param_1'].fillna('') + ' ' +
                              test_c['param_2'].fillna('') + ' ' +
                              test_c['param_3'].fillna('') + ' ' +
                              test_c['title'].fillna(''))
        print_step(regioncat + ' > Titlecat TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train_c['titlecat'])
        print(tfidf_train.shape)
        print_step(regioncat + ' > Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test_c['titlecat'])
        print(tfidf_test.shape)

        print_step(regioncat + ' > Titlecat TFIDF Ridge')
        results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               regioncat + '-titlecat-ridge')
        train_c['regioncat_title_ridge'] = results['train']
        test_c['regioncat_title_ridge'] = results['test']

        print_step(regioncat + ' > Description TFIDF 1/3')
        train_c['desc'] = (train_c['title'].fillna('') + ' ' +
                           train_c['description'].fillna(''))
        test_c['desc'] = (test_c['title'].fillna('') + ' ' +
                          test_c['description'].fillna(''))
        print_step(regioncat + ' > Description TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train2 = tfidf.fit_transform(train_c['desc'].fillna(''))
        print(tfidf_train2.shape)
        print_step(regioncat + ' > Description TFIDF 3/3')
        tfidf_test2 = tfidf.transform(test_c['desc'].fillna(''))
        print(tfidf_test2.shape)
        results = run_cv_model(tfidf_train2, tfidf_test2, target, runRidge,
                               {'alpha': 5.0}, rmse, regioncat + '-desc-ridge')
        train_c['regioncat_desc_ridge'] = results['train']
        test_c['regioncat_desc_ridge'] = results['test']

        print_step(regioncat + ' > Text Char TFIDF 1/2')
        # Using char n-grams ends up being surprisingly good, HT https://www.kaggle.com/c/avito-demand-prediction/discussion/56061#325063
        tfidf = TfidfVectorizer(ngram_range=(2, 5),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                analyzer='char',
                                encoding='KOI8-R')
        tfidf_train3 = tfidf.fit_transform(train_c['desc'])
        print(tfidf_train3.shape)
        print_step(regioncat + ' > Text Char TFIDF 2/2')
        tfidf_test3 = tfidf.transform(test_c['desc'])
        print(tfidf_test3.shape)

        results = run_cv_model(tfidf_train3, tfidf_test3, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               regioncat + '-desc-char-ridge')
        train_c['regioncat_desc_char_ridge'] = results['train']
        test_c['regioncat_desc_char_ridge'] = results['test']

        print_step('Merging 1/2')
        train_c2 = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
        print_step('Merging 2/2')
        test_c2 = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
        print(train_c2.shape)
        print(test_c2.shape)

        print('~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Run Full Text Ridge')
        results = run_cv_model(train_c2, test_c2, target, runRidge,
                               {'alpha': 8.0}, rmse, regioncat + '-text-ridge')
        train_c['regioncat_all_text_ridge'] = results['train']
        test_c['regioncat_all_text_ridge'] = results['test']

        print('~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Dropping')
        train_c.drop([c for c in train_c.columns if 'ridge' not in c],
                     axis=1,
                     inplace=True)
        test_c.drop([c for c in test_c.columns if 'ridge' not in c],
                    axis=1,
                    inplace=True)
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Saving in Cache')
        save_in_cache('regioncat_ridges_' + regioncat, train_c, test_c)
    else:
        print(regioncat + ' already in cache! Skipping...')
    return True
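runRidge is invoked throughout with a bare {'alpha': ...} dict, which fits a thin wrapper over scikit-learn's Ridge (which accepts sparse input directly). A minimal sketch under the same fold signature assumed earlier:

from sklearn.linear_model import Ridge

def runRidge(train_X, train_y, val_X, val_y, test_X, params):
    # Plain ridge regression on the sparse TF-IDF blocks built above.
    model = Ridge(alpha=params['alpha'])
    model.fit(train_X, train_y)
    return model.predict(val_X), model.predict(test_X)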
Code Example #7
else:
    train_, test_ = load_cache('convai_with_fe')


print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)


print('~~~~~~~~~~~~~~~~~~~')
print_step('Run ConvAI LGB')
print(train_.columns.values)
train, test = get_data()
train_, test_ = run_cv_model(label='convai_lgb',
                             data_key='convai_with_fe',
                             model_fn=runLGB,
                             train=train,
                             test=test,
                             kf=kf)

import pdb
pdb.set_trace()
print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 2')
save_in_cache('lvl2_convai_lgb', train_, test_)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test['id']
submission['toxic'] = test_['convai_lgb_toxic']
Code Example #8
          test_features.shape)
    print_step('Saving')
    save_in_cache('fm_data', train_features, test_features)
    del train_features
    del test_features
    gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~')
print_step('Run Ridge')
train, test = run_cv_model(label='ridge',
                           data_key='fm_data',
                           model_fn=runRidge,
                           train=train,
                           test=test,
                           kf=kf)
import pdb
pdb.set_trace()
print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 1')
save_in_cache('lvl1_ridge', train, test)
# toxic CV scores : [0.9809843104555062, 0.9818160662139189, 0.9810818473334081, 0.9785535369240607, 0.9805031449391929]
# toxic mean CV : 0.9805877811732173
# severe_toxic CV scores : [0.9910152906145414, 0.989781576288062, 0.9905538900693087, 0.9910741898469113, 0.9895167135389562]
# severe_toxic mean CV : 0.990388332071556
# obscene CV scores : [0.9928806730585695, 0.99347239882342, 0.9933801187817354, 0.9926410905084246, 0.9931233899573142]
# obscene mean CV : 0.9930995342258928
# threat CV scores : [0.9898491598311281, 0.9926748758603351, 0.9905821469352692, 0.9904258099519968, 0.9854784282977856]
# threat mean CV : 0.9898020841753029
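runLGB and runSparseLGB are likewise defined elsewhere. A plausible LightGBM wrapper under the same fold signature; the early_stopping_rounds/verbose_eval keywords match the 2018-era LightGBM API (current releases moved these to callbacks):

import lightgbm as lgb

def runLGB(train_X, train_y, val_X, val_y, test_X, params):
    # Boost against the held-out fold and predict at the best iteration.
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params,
                      d_train,
                      num_boost_round=2000,
                      valid_sets=[d_train, d_valid],
                      early_stopping_rounds=100,
                      verbose_eval=100)
    return (model.predict(val_X, num_iteration=model.best_iteration),
            model.predict(test_X, num_iteration=model.best_iteration))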
Code Example #9
    print_step('Saving')
    save_in_cache('lvl2_all', train_, test_)
else:
    train_, test_ = load_cache('lvl2_all')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Level 2 LGB')
print(train_.columns.values)
train, test = get_data()
train_, test_ = run_cv_model(label='lvl2_all_lgb',
                             data_key='lvl2_all',
                             model_fn=runLGB,
                             train=train,
                             test=test,
                             kf=kf)
import pdb
pdb.set_trace()
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_['id']
submission['toxic'] = test_['lvl2_all_lgb_toxic']
submission['severe_toxic'] = test_['lvl2_all_lgb_severe_toxic']
submission['obscene'] = test_['lvl2_all_lgb_obscene']
submission['threat'] = test_['lvl2_all_lgb_threat']
submission['insult'] = test_['lvl2_all_lgb_insult']
submission['identity_hate'] = test_['lvl2_all_lgb_identity_hate']
Code Example #10
    print_step('Titlecat SVD 1/4')
    svd = TruncatedSVD(n_components=NCOMP, algorithm='arpack')
    svd.fit(tfidf_train)
    print_step('Titlecat SVD 2/4')
    train_svd = pd.DataFrame(svd.transform(tfidf_train))
    print_step('Titlecat SVD 3/4')
    test_svd = pd.DataFrame(svd.transform(tfidf_test))
    print_step('Titlecat SVD 4/4')
    train_svd.columns = ['svd_titlecat_' + str(i + 1) for i in range(NCOMP)]
    test_svd.columns = ['svd_titlecat_' + str(i + 1) for i in range(NCOMP)]
    train = pd.concat([train, train_svd], axis=1)
    test = pd.concat([test, test_svd], axis=1)

    print_step('Titlecat TFIDF Ridge')
    results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                           {'alpha': 5.0}, rmse, 'titlecat-ridge')
    train['title_ridge'] = results['train']
    test['title_ridge'] = results['test']

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Description TFIDF 1/3')
    train['desc'] = (train['title'].fillna('') + ' ' +
                     train['description'].fillna(''))
    test['desc'] = (test['title'].fillna('') + ' ' +
                    test['description'].fillna(''))
    print_step('Description TFIDF 2/3')
    tfidf = TfidfVectorizer(ngram_range=(1, 2),
                            max_features=300000,
                            min_df=2,
                            max_df=0.8,
                            binary=True,
Code Example #11
          test_features.shape)
    print_step('Saving')
    save_in_cache('fm_data', train_features, test_features)
    del train_features
    del test_features
    gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~')
print_step('Run FM')
train, test = run_cv_model(label='fm',
                           data_key='fm_data',
                           model_fn=runFM,
                           train=train,
                           test=test,
                           kf=kf)
import pdb
pdb.set_trace()
print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 1')
save_in_cache('lvl1_fm', train, test)
# toxic CV scores : [0.9809843104555062, 0.9818160662139189, 0.9810818473334081, 0.9785535369240607, 0.9805031449391929]
# toxic mean CV : 0.9805877811732173
# severe_toxic CV scores : [0.9910152906145414, 0.989781576288062, 0.9905538900693087, 0.9910741898469113, 0.9895167135389562]
# severe_toxic mean CV : 0.990388332071556
# obscene CV scores : [0.9928806730585695, 0.99347239882342, 0.9933801187817354, 0.9926410905084246, 0.9931233899573142]
# obscene mean CV : 0.9930995342258928
# threat CV scores : [0.9898491598311281, 0.9926748758603351, 0.9905821469352692, 0.9904258099519968, 0.9854784282977856]
# threat mean CV : 0.9898020841753029
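runFM is another unshown wrapper; in sparse-text pipelines of this vintage it commonly wrapped wordbatch's FM_FTRL, so the sketch below assumes that library. Treat every constructor argument as an illustrative placeholder rather than the author's settings:

from wordbatch.models import FM_FTRL  # assumed backend for runFM

def runFM(train_X, train_y, val_X, val_y, test_X, params):
    # Factorization machine trained with FTRL; sigmoid link for binary targets.
    model = FM_FTRL(alpha=0.02, beta=0.01, L1=0.0, L2=0.1,
                    D=train_X.shape[1], D_fm=200, iters=3,
                    inv_link='sigmoid', threads=4)
    model.fit(train_X, train_y)
    return model.predict(val_X), model.predict(test_X)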
Code Example #12
    del post_trainc
    gc.collect()
    post_test = csr_matrix(hstack([post_testw, post_testc]))
    del post_testw
    del post_testc
    gc.collect()
    save_in_cache('tfidf_char_union_extra_data_toxic', post_train, post_test)

print('~~~~~~~~~~~~~~~~~~')
print_step('Run Attack LR')
train, test = run_cv_model(label='extra_data_attack_lr',
                           data_key='tfidf_char_union_extra_data_attack',
                           train_key='extra_data_attack',
                           model_fn=runSagLR,
                           train=attack,
                           test=test,
                           targets=[
                               'attack', 'quoting_attack', 'recipient_attack',
                               'third_party_attack', 'other_attack'
                           ],
                           kf=kf)

print('~~~~~~~~~~~~~~~~~~~~~')
print_step('Run Attack Ridge')
train, test = run_cv_model(label='extra_data_attack_ridge',
                           data_key='tfidf_char_union_extra_data_attack',
                           train_key='extra_data_attack',
                           model_fn=runRidge,
                           train=attack,
                           test=test,
                           targets=['attack_score'],
Code Example #13
                                         numeric_cols=train_.columns.values.tolist(),
                                         dummy_cols=[])
        print_step('Importing Data 10/10 3/4')
        train = hstack((train, train_)).tocsr()
        print(train.shape)
        print_step('Importing Data 10/10 4/4')
        test = hstack((test, test_)).tocsr()
        print(test.shape)

        print_step('Caching')
        save_in_cache('complete_fm_data', train, test)
    else:
        train, test = load_cache('complete_fm_data')

    print_step('Run Complete FM')
    results = run_cv_model(train, test, target, runFM, params, rmse, 'complete-fm')
    import pdb
    pdb.set_trace()

    print('~~~~~~~~~~')
    print_step('Cache')
    save_in_cache('complete_fm',
                  pd.DataFrame({'complete_fm': results['train']}),
                  pd.DataFrame({'complete_fm': results['test']}))

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Prepping submission file')
    submission = pd.DataFrame()
    submission['item_id'] = test_id
    submission['deal_probability'] = results['test'].clip(0.0, 1.0)
    submission.to_csv('submit/submit_fm.csv', index=False)
    print_step('Done!')
Code Example #14
svd.fit(train_embeddings_df)
print_step('Embedding SVD 2/4')
train_svd = pd.DataFrame(svd.transform(train_embeddings_df))
print_step('Embedding SVD 3/4')
test_svd = pd.DataFrame(svd.transform(test_embeddings_df))
print_step('Embedding SVD 4/4')
train_svd.columns = ['svd_embed_' + str(i + 1) for i in range(NCOMP)]
test_svd.columns = ['svd_embed_' + str(i + 1) for i in range(NCOMP)]
train_fe = pd.concat([train_fe, train_svd], axis=1)
test_fe = pd.concat([test_fe, test_svd], axis=1)

print('~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Run Flat Blender LGB')
print(train_fe.shape)
print(test_fe.shape)
results = run_cv_model(train_fe, test_fe, target, runLGB, params, rmse,
                       'flat_blender_lgb')
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('flat_blender_lgb',
              pd.DataFrame({'flat_blender_lgb': results['train']}),
              pd.DataFrame({'flat_blender_lgb': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_flat_blender_lgb.csv', index=False)
Code Example #15
if not is_in_cache('tfidf_word_cleaned'):
    TFIDF_PARAMS_WORD.update({'train': train_cleaned, 'test': test_cleaned})
    post_train_cleaned, post_test_cleaned = run_tfidf(**TFIDF_PARAMS_WORD)
    save_in_cache('tfidf_word_cleaned', post_train_cleaned, post_test_cleaned)
    del post_train_cleaned
    del post_test_cleaned
    gc.collect()
del train_cleaned
del test_cleaned
gc.collect()

print('~~~~~~~~~~~~')
print_step('Run LGB')
train, test = run_cv_model(label='tfidf_word_sparse_lgb',
                           data_key='tfidf_word',
                           model_fn=runSparseLGB,
                           train=train,
                           test=test,
                           kf=kf)
# toxic CV scores : [0.9702901662371838, 0.9696223100754018, 0.9678153536674818, 0.9676149003746513, 0.9711870679257228]
# toxic mean CV : 0.9693059596560882
# severe_toxic CV scores : [0.9801895978261603, 0.9726377779905455, 0.982170654159893, 0.9874727212204224, 0.9781001815195353]
# severe_toxic mean CV : 0.9801141865433113
# obscene CV scores : [0.9830528922626651, 0.9837422487164804, 0.9814396979867874, 0.9815393581964723, 0.9841164068501664]
# obscene mean CV : 0.9827781208025144
# threat CV scores : [0.9818704364268729, 0.9649259614276585, 0.9764352339273181, 0.9867757740570546, 0.9802860678000866]
# threat mean CV : 0.9780586947277982
# insult CV scores : [0.9750866289607637, 0.9725628946207349, 0.9733409509578796, 0.9770282683977928, 0.9761217897403539]
# insult mean CV : 0.974828106535505
# identity_hate CV scores : [0.9657562463199535, 0.9649726686386453, 0.9607666236203398, 0.968004717808433, 0.9670525723564731]
# identity_hate mean CV : 0.965310565748769
# ('tfidf_word_sparse_lgb overall : ', 0.9750659390023312)
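run_tfidf and the TFIDF_PARAMS_WORD dict live elsewhere; the calling convention (update the dict with train/test, then splat it) implies a keyword wrapper around TfidfVectorizer. A hypothetical sketch; the parameter names here are invented, not the real TFIDF_PARAMS_WORD keys:

from sklearn.feature_extraction.text import TfidfVectorizer

def run_tfidf(train, test, text_col='comment_text', ngram_min=1, ngram_max=2,
              max_features=200000, **vectorizer_kwargs):
    # Fit the vocabulary on train text only, then transform both sets.
    tfidf = TfidfVectorizer(ngram_range=(ngram_min, ngram_max),
                            max_features=max_features,
                            **vectorizer_kwargs)
    post_train = tfidf.fit_transform(train[text_col].fillna(''))
    post_test = tfidf.transform(test[text_col].fillna(''))
    return post_train, post_test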
Code Example #16
print_step('Pre-flight checks')
# Eyeball every feature column in train and test before modeling.
for col in train_fe.columns:
    print('##')
    print(col)
    print('-')
    print(train_fe[col].values)
    print('-')
    print(test_fe[col].values)
    print('-')
print('-')

print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train_fe.shape)
print(test_fe.shape)
results = run_cv_model(train_fe, test_fe, target, runLGB, params, rmse,
                       'base_lgb')
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('base_lgb', pd.DataFrame({'base_lgb': results['train']}),
              pd.DataFrame({'base_lgb': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_base_lgb.csv', index=False)
Code Example #17
    del post_trainc
    gc.collect()
    post_test = csr_matrix(hstack([post_testw, post_testc]))
    del post_testw
    del post_testc
    gc.collect()
    save_in_cache('tfidf_char_union', post_train, post_test)

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Word LR Sag')
train, test = run_cv_model(label='tfidf_word_lr_sag',
                           data_key='tfidf_word',
                           model_fn=runSagLR,
                           train=train,
                           test=test,
                           train_key='extra_label',
                           targets=[
                               'toxic', 'severe_toxic', 'obscene', 'insult',
                               'threat', 'identity_hate', 'non_toxic'
                           ],
                           kf=kf)
# toxic CV scores : [0.9757770727127603, 0.9754469511129109, 0.9748022104865504, 0.9727014869411932, 0.9753668774625703]
# toxic mean CV : 0.9748189197431969
# severe_toxic CV scores : [0.9822782217978469, 0.9809759688772627, 0.982837995178992, 0.9888689680969123, 0.9832500976058173]
# severe_toxic mean CV : 0.9836422503113663
# obscene CV scores : [0.9863031895889313, 0.9859709183099142, 0.986069037576627, 0.984788656923766, 0.9868893669717265]
# obscene mean CV : 0.986004233874193
# threat CV scores : [0.9892926265229371, 0.9904405583142148, 0.986893640592099, 0.9938866116828939, 0.982301808641914]
# threat mean CV : 0.9885630491508117
# insult CV scores : [0.9778670021983397, 0.9786535248142688, 0.9773924913032992, 0.9796894980895773, 0.9802901280493739]
# insult mean CV : 0.9787785288909717
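runSagLR presumably wraps scikit-learn's LogisticRegression with the sag solver (hence the name), returning positive-class probabilities for the AUC-scored toxic targets. A sketch under the fold signature assumed earlier:

from sklearn.linear_model import LogisticRegression

def runSagLR(train_X, train_y, val_X, val_y, test_X, params):
    # 'sag' converges quickly on the large sparse TF-IDF unions used above.
    model = LogisticRegression(solver='sag', C=params.get('C', 1.0),
                               max_iter=200)
    model.fit(train_X, train_y)
    return (model.predict_proba(val_X)[:, 1],
            model.predict_proba(test_X)[:, 1])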
Code Example #18
cat_cols = ['region', 'city', 'parent_category_name', 'category_name', 'cat_bin',
            'param_1', 'param_2', 'param_3', 'user_type', 'image_top_1', 'day_of_week']
for col in train_.columns:
    print(col)
    if col in cat_cols:
        train_[col] = train_[col].astype('category')
        test_[col] = test_[col].astype('category')
    else:
        train_[col] = train_[col].astype(np.float64)
        test_[col] = test_[col].astype(np.float64)

print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train_.shape)
print(test_.shape)
results = run_cv_model(train_, test_, target, runLGB, params, rmse, 'lgb_blender')
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('lgb_blender',
              pd.DataFrame({'lgb_blender': results['train']}),
              pd.DataFrame({'lgb_blender': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_lgb_blender.csv', index=False)
print_step('Done!')
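The dtype pass at the top of this example matters because LightGBM consumes pandas category columns natively: with the default categorical_feature='auto', any column with category dtype is treated as categorical when the Dataset is built, while the float columns stay numeric. A short illustration:

import numpy as np
import pandas as pd
import lightgbm as lgb

df = pd.DataFrame({'region': pd.Categorical(['a', 'b', 'a', 'c']),
                   'price': [1.0, 2.0, 3.0, 4.0]})
y = np.array([0.1, 0.4, 0.2, 0.9])
# 'region' is picked up as a categorical feature automatically.
d_train = lgb.Dataset(df, label=y)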