Example #1
        pprint(list(zip(classes, cv_score)))  # list() so the pairs print under Python 3
        cv_scores.append(cv_score)
        i += 1
    print_step('All folds done!')
    print('CV scores')
    pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
    mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
    print('mean cv score : ' + str(mean_cv_score))
    pred_full_test = pred_full_test / 5.  # average the summed test predictions over the 5 folds
    for k, classx in enumerate(classes):
        train_df['gru128_' + classx] = pred_train[:, k]
        test_df['gru128_' + classx] = pred_full_test[:, k]

    print('~~~~~~~~~~~~~~~~~~')
    print_step('Cache Level 1')
    save_in_cache('lvl1_gru128', train_df, test_df)
    print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['gru128_toxic']
submission['severe_toxic'] = test_df['gru128_severe_toxic']
submission['obscene'] = test_df['gru128_obscene']
submission['threat'] = test_df['gru128_threat']
submission['insult'] = test_df['gru128_insult']
submission['identity_hate'] = test_df['gru128_identity_hate']
submission.to_csv('submit/submit_lvl1_gru128.csv', index=False)
print_step('Done')
# ('toxic', 0.9833006646207911),
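
The fold loop that produces pred_train, pred_full_test, and the per-fold cv_score is truncated above. A minimal sketch of the usual out-of-fold pattern, assuming a hypothetical build_model factory and a dense label matrix train_y:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def run_oof(train_X, train_y, test_X, build_model, classes):
    # Out-of-fold scheme: each train row is predicted by the fold that held
    # it out; test predictions are summed here and divided by 5 afterwards.
    pred_train = np.zeros((train_X.shape[0], len(classes)))
    pred_full_test = np.zeros((test_X.shape[0], len(classes)))
    cv_scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(train_X):
        model = build_model()  # hypothetical factory returning a model with fit/predict
        model.fit(train_X[dev_index], train_y[dev_index])
        pred_val = model.predict(train_X[val_index])
        pred_train[val_index, :] = pred_val
        pred_full_test += model.predict(test_X)
        cv_score = [roc_auc_score(train_y[val_index, k], pred_val[:, k])
                    for k in range(len(classes))]
        cv_scores.append(cv_score)
    return pred_train, pred_full_test, cv_scores
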
Example #2
        pprint(list(zip(classes, cv_score)))
        cv_scores.append(cv_score)
        i += 1
    print_step('All folds done!')
    print('CV scores')
    pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
    mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
    print('mean cv score : ' + str(mean_cv_score))
    pred_full_test = pred_full_test / 5.
    for k, classx in enumerate(classes):
        train_df['2dconv_' + classx] = pred_train[:, k]
        test_df['2dconv_' + classx] = pred_full_test[:, k]

    print('~~~~~~~~~~~~~~~~~~')
    print_step('Cache Level 1')
    save_in_cache('lvl1_2dconv', train_df, test_df)
    print_step('Done!')


print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['2dconv_toxic']
submission['severe_toxic'] = test_df['2dconv_severe_toxic']
submission['obscene'] = test_df['2dconv_obscene']
submission['threat'] = test_df['2dconv_threat']
submission['insult'] = test_df['2dconv_insult']
submission['identity_hate'] = test_df['2dconv_identity_hate']
submission.to_csv('submit/submit_lvl1_2dconv.csv', index=False)
print_step('Done')
    del train_fe
    del test_fe
    del train_img
    del test_img
    del train_active
    del test_active
    del train_
    del test_
    del test_ohe
    del test_ohe2
    del train_ohe
    del train_ohe2
    gc.collect()

    print_step('Caching')
    save_in_cache('deep_text_feats4', train, test)
else:
    train, test = load_cache('deep_text_feats4')


print('~~~~~~~~~~~~')
print_step('Run LGB')
results = run_cv_model(train, test, target, runLGB, params, rmse, 'deep_lgb4')
import pdb
pdb.set_trace()  # debug checkpoint left in to inspect CV results before caching

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('deep_lgb4', pd.DataFrame({'deep_lgb4': results['train']}),
                           pd.DataFrame({'deep_lgb4': results['test']}))
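
run_cv_model and runLGB are project helpers that these snippets never show. A rough sketch of what the runLGB fold callback plausibly looks like, assuming it receives a dev/val split plus the real test set and returns predictions for both:

import lightgbm as lgb

def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    # Fit on the dev fold, monitor the held-out fold, and also predict
    # the real test set (test_X2) so the caller can average across folds.
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    model = lgb.train(params, d_train, num_boost_round=1000,
                      valid_sets=[d_train, d_valid],
                      valid_names=['train', 'valid'])
    return model.predict(test_X), model.predict(test_X2)
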
            '') + ' ' + test['title'].fillna('')
    if not is_in_cache('titlecat_tfidf'):
        print_step('Titlecat TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=300000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train['titlecat'])
        print(tfidf_train.shape)
        print_step('Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test['titlecat'])
        print(tfidf_test.shape)
        print_step('Saving to cache...')
        save_in_cache('titlecat_tfidf', tfidf_train, tfidf_test)
    else:
        print_step('Loading from cache...')
        tfidf_train, tfidf_test = load_cache('titlecat_tfidf')

    print_step('Titlecat Stats 1/6')
    train['titlecat_tfidf_sum'] = tfidf_train.sum(axis=1)
    print_step('Titlecat Stats 2/6')
    train['titlecat_tfidf_mean'] = tfidf_train.mean(axis=1)
    print_step('Titlecat Stats 3/6')
    train['titlecat_tfidf_nnz'] = tfidf_train.getnnz(axis=1)
    print_step('Titlecat Stats 4/6')
    test['titlecat_tfidf_sum'] = tfidf_test.sum(axis=1)
    print_step('Titlecat Stats 5/6')
    test['titlecat_tfidf_mean'] = tfidf_test.mean(axis=1)
    print_step('Titlecat Stats 6/6')
    test['titlecat_tfidf_nnz'] = tfidf_test.getnnz(axis=1)
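
For reference, the three row statistics used here behave as follows on a toy sparse matrix; note that mean divides by all columns, zeros included:

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[0.0, 0.5, 0.0],
                         [0.2, 0.0, 0.7]]))
print(X.sum(axis=1))     # matrix([[0.5], [0.9]])
print(X.mean(axis=1))    # matrix([[0.1667], [0.3]])
print(X.getnnz(axis=1))  # array([1, 2])
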
Example #5
        pprint(list(zip(classes, cv_score)))
        cv_scores.append(cv_score)
        i += 1
    print_step('All folds done!')
    print('CV scores')
    pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
    mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
    print('mean cv score : ' + str(mean_cv_score))
    pred_full_test = pred_full_test / 5.
    for k, classx in enumerate(classes):
        train_df['cudnngru_' + classx] = pred_train[:, k]
        test_df['cudnngru_' + classx] = pred_full_test[:, k]

    print('~~~~~~~~~~~~~~~~~~')
    print_step('Cache Level 1')
    save_in_cache('lvl1_cudnngru', train_df, test_df)
    print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['cudnngru_toxic']
submission['severe_toxic'] = test_df['cudnngru_severe_toxic']
submission['obscene'] = test_df['cudnngru_obscene']
submission['threat'] = test_df['cudnngru_threat']
submission['insult'] = test_df['cudnngru_insult']
submission['identity_hate'] = test_df['cudnngru_identity_hate']
submission.to_csv('submit/submit_lvl1_cudnngru.csv', index=False)
print_step('Done')
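
Since the six per-class columns share one naming scheme, each of these submission blocks can be collapsed into a loop; a sketch for this example:

classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame({'id': test_df['id']})
for classx in classes:
    submission[classx] = test_df['cudnngru_' + classx]
submission.to_csv('submit/submit_lvl1_cudnngru.csv', index=False)
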
    lambda x: x.replace('test_jpg/', ''))
print_step('Merging 6/9')
merge['img_path'] = merge['img_path'].apply(
    lambda x: x.replace('train_jpg/', ''))
print_step('Merging 7/9')
merge['img_path'] = merge['img_path'].apply(lambda x: x.replace('.jpg', ''))
print_step('Merging 8/9')
train2 = train.merge(merge, left_on='image', right_on='img_path', how='left')
print_step('Merging 9/9')
test2 = test.merge(merge, left_on='image', right_on='img_path', how='left')

print_step('Dropping 1/2')
drops = list(
    set(train2.columns.values) - set(merge.columns.values) -
    {'deal_probability', 'item_id'})
drops += [
    'img_aspect_ratio', 'img_moment_m11', 'img_moment_mu21', 'img_moment_mu30',
    'img_moment_nu02', 'img_moment_nu12', 'img_moment_m00', 'img_moment_m10',
    'img_moment_mu03', 'img_moment_mu12', 'img_moment_m03', 'img_moment_m12',
    'img_moment_mu11', 'img_moment_nu11', 'img_moment_mu20', 'img_moment_m20',
    'img_moment_nu30', 'img_moment_m01', 'img_moment_m30', 'img_moment_nu03',
    'img_moment_mu02', 'img_moment_m21', 'img_moment_nu21', 'img_moment_m02',
    'img_moment_nu20'
]
train2.drop(drops, axis=1, inplace=True)
print_step('Dropping 2/2')
test2.drop(drops, axis=1, inplace=True)

print_step('Saving...')
save_in_cache('img_data2', train2, test2)
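
The three img_path cleanup passes above could also be collapsed into one vectorized regex replace; a sketch equivalent on the same inputs:

merge['img_path'] = merge['img_path'].str.replace(
    r'test_jpg/|train_jpg/|\.jpg', '', regex=True)
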
Example #7
        test_[col] = test_[col].astype('category')
    else:
        train_[col] = train_[col].astype(np.float64)
        test_[col] = test_[col].astype(np.float64)

print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train_.shape)
print(test_.shape)
results = run_cv_model(train_, test_, target, runLGB, params, rmse, 'lgb_blender')
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('lgb_blender', pd.DataFrame({'lgb_blender': results['train']}),
                             pd.DataFrame({'lgb_blender': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_lgb_blender.csv', index=False)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Poisson LGB')
print(train_.shape)
print(test_.shape)
poisson_results = run_cv_model(train_, test_, target, runLGB, poisson_params, rmse, 'poisson_lgb_blender')
import pdb
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('title_countvec'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title CountVec 1/2')
    cv = CountVectorizer(stop_words=stopwords.words('russian'),
                         lowercase=True,
                         min_df=2)
    tfidf_train = cv.fit_transform(train['title'])
    print(tfidf_train.shape)
    print_step('Title CountVec 2/2')
    tfidf_test = cv.transform(test['title'])
    print(tfidf_test.shape)
    print_step('Saving to cache...')
    save_in_cache('title_countvec', tfidf_train, tfidf_test)
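
stopwords.words('russian') requires the NLTK stopwords corpus to be installed; a one-time setup sketch:

import nltk
nltk.download('stopwords')  # fetches the corpus behind stopwords.words('russian')
from nltk.corpus import stopwords
print(stopwords.words('russian')[:5])
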

if not is_in_cache('deep_text_feats3'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data 2/13')
    tfidf_train, tfidf_test = load_cache('title_countvec')

    print_step('Importing Data 3/13')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

    print_step('Importing Data 4/13')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')

    print_step('Importing Data 5/13')
    train = hstack((tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/13')
Example #9
    test_embeddings = (test['title'].str.cat(
        [
            test['description'],
        ], sep=' ', na_rep='').astype(str).fillna('missing').apply(
            clean_text).apply(text_to_embedding))

    print_step('Embedding 4/5')
    train_embeddings_df = pd.DataFrame(
        train_embeddings.values.tolist(),
        columns=['embed' + str(i) for i in range(EMBED_SIZE)])
    print_step('Embedding 5/5')
    test_embeddings_df = pd.DataFrame(
        test_embeddings.values.tolist(),
        columns=['embed' + str(i) for i in range(EMBED_SIZE)])
    print_step('Caching...')
    save_in_cache('avito_fasttext_300d', train_embeddings_df,
                  test_embeddings_df)
else:
    train_embeddings_df, test_embeddings_df = load_cache('avito_fasttext_300d')

train_fe['embedding_mean'] = train_embeddings_df.mean(axis=1)
train_fe['embedding_std'] = train_embeddings_df.std(axis=1)
train_fe['embedding_skew'] = skew(train_embeddings_df, axis=1)
train_fe['embedding_kurtosis'] = kurtosis(train_embeddings_df, axis=1)
test_fe['embedding_mean'] = test_embeddings_df.mean(axis=1)
test_fe['embedding_std'] = test_embeddings_df.std(axis=1)
test_fe['embedding_skew'] = skew(test_embeddings_df, axis=1)
test_fe['embedding_kurtosis'] = kurtosis(test_embeddings_df, axis=1)
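
A self-contained check of the four row-wise pooling statistics, with random data standing in for the 300-dimension fastText embeddings:

import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis

embed_df = pd.DataFrame(np.random.rand(5, 300))  # stand-in for train_embeddings_df
pooled = pd.DataFrame({
    'embedding_mean': embed_df.mean(axis=1),
    'embedding_std': embed_df.std(axis=1),
    'embedding_skew': skew(embed_df, axis=1),
    'embedding_kurtosis': kurtosis(embed_df, axis=1),
})
print(pooled.shape)  # (5, 4): one summary row per document
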

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 12/19 1/7')
cat_cols = [
pool.join()
pool.terminate()
pool.restart()
print_step('Merging 2/5')
train_dfs = [x[0] for x in dfs]
test_dfs = [x[1] for x in dfs]
print_step('Merging 3/5')
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)
print_step('Merging 4/5')
train_ridge = train.merge(train_df, on='item_id')
print_step('Merging 5/5')
test_ridge = test.merge(test_df, on='item_id')

print_step('RMSEs')
print(rmse(train_ridge['deal_probability'],
           train_ridge['cat_bin_title_ridge']))
print(rmse(train_ridge['deal_probability'], train_ridge['cat_bin_desc_ridge']))
print(
    rmse(train_ridge['deal_probability'],
         train_ridge['cat_bin_desc_char_ridge']))
print(
    rmse(train_ridge['deal_probability'],
         train_ridge['cat_bin_all_text_ridge']))
import pdb
pdb.set_trace()

print('~~~~~~~~~~~~~~~')
print_step('Caching...')
save_in_cache('cat_bin_ridges', train_ridge, test_ridge)
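
rmse is another project helper; assuming it is plain root-mean-squared error, it reduces to:

import numpy as np

def rmse(actual, predicted):
    # standard RMSE between ground truth and predictions
    return np.sqrt(np.mean((np.asarray(actual) - np.asarray(predicted)) ** 2))
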
def run_ridge_on_cat_bin(cat_bin):
    if not is_in_cache('cat_bin_ridges_' + cat_bin):
        print_step(cat_bin + ' > Subsetting')
        train_c = train[train['cat_bin'] == cat_bin].copy()
        test_c = test[test['cat_bin'] == cat_bin].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop(['item_id'], axis=1, inplace=True)

        print_step(cat_bin + ' > Titlecat TFIDF 1/3')
        train_c['titlecat'] = train_c['category_name'].fillna(
            '') + ' ' + train_c['param_1'].fillna('') + ' ' + train_c[
                'param_2'].fillna('') + ' ' + train_c['param_3'].fillna(
                    '') + ' ' + train_c['title'].fillna('')
        test_c['titlecat'] = test_c['category_name'].fillna('') + ' ' + test_c[
            'param_1'].fillna('') + ' ' + test_c['param_2'].fillna(
                '') + ' ' + test_c['param_3'].fillna(
                    '') + ' ' + test_c['title'].fillna('')
        print_step(cat_bin + ' > Titlecat TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=50000,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train_c['titlecat'])
        print(tfidf_train.shape)
        print_step(cat_bin + ' > Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test_c['titlecat'])
        print(tfidf_test.shape)

        print_step(cat_bin + ' > Titlecat TFIDF Ridge')
        results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               cat_bin + '-titlecat-ridge')
        train_c['cat_bin_title_ridge'] = results['train']
        test_c['cat_bin_title_ridge'] = results['test']

        print_step(cat_bin + ' > Description TFIDF 1/3')
        train_c['desc'] = train_c['title'].fillna(
            '') + ' ' + train_c['description'].fillna('')
        test_c['desc'] = test_c['title'].fillna(
            '') + ' ' + test_c['description'].fillna('')
        print_step(cat_bin + ' > Description TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=50000,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train2 = tfidf.fit_transform(train_c['desc'].fillna(''))
        print(tfidf_train2.shape)
        print_step(cat_bin + ' > Description TFIDF 3/3')
        tfidf_test2 = tfidf.transform(test_c['desc'].fillna(''))
        print(tfidf_test2.shape)
        results = run_cv_model(tfidf_train2, tfidf_test2, target, runRidge,
                               {'alpha': 5.0}, rmse, cat_bin + '-desc-ridge')
        train_c['cat_bin_desc_ridge'] = results['train']
        test_c['cat_bin_desc_ridge'] = results['test']

        print_step(cat_bin + ' > Text Char TFIDF 1/2')
        # Using char n-grams ends up being surprisingly good, HT https://www.kaggle.com/c/avito-demand-prediction/discussion/56061#325063
        tfidf = TfidfVectorizer(ngram_range=(2, 5),
                                max_features=50000,
                                binary=True,
                                analyzer='char',
                                encoding='KOI8-R')
        tfidf_train3 = tfidf.fit_transform(train_c['desc'])
        print(tfidf_train3.shape)
        print_step(cat_bin + ' > Text Char TFIDF 2/2')
        tfidf_test3 = tfidf.transform(test_c['desc'])
        print(tfidf_test3.shape)

        results = run_cv_model(tfidf_train3, tfidf_test3, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               cat_bin + '-desc-char-ridge')
        train_c['cat_bin_desc_char_ridge'] = results['train']
        test_c['cat_bin_desc_char_ridge'] = results['test']

        print_step('Merging 1/2')
        train_c2 = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
        print_step('Merging 2/2')
        test_c2 = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
        print(train_c2.shape)
        print(test_c2.shape)

        print('~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Run Full Text Ridge')
        results = run_cv_model(train_c2, test_c2, target, runRidge,
                               {'alpha': 8.0}, rmse, cat_bin + '-text-ridge')
        train_c['cat_bin_all_text_ridge'] = results['train']
        test_c['cat_bin_all_text_ridge'] = results['test']

        print('~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat_bin + ' > Dropping')
        train_c.drop([c for c in train_c.columns if 'ridge' not in c],
                     axis=1,
                     inplace=True)
        test_c.drop([c for c in test_c.columns if 'ridge' not in c],
                    axis=1,
                    inplace=True)
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat_bin + ' > Saving in Cache')
        save_in_cache('cat_bin_ridges_' + cat_bin, train_c, test_c)
    else:
        print(cat_bin + ' already in cache! Skipping...')
    return True
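
The pool.join() / pool.restart() calls elsewhere in these snippets point to pathos; a sketch of driving run_ridge_on_cat_bin over every bin in parallel, assuming train carries a 'cat_bin' column:

from pathos.multiprocessing import ProcessingPool as Pool

cat_bins = train['cat_bin'].unique()
pool = Pool(4)  # worker count is arbitrary here
pool.map(run_ridge_on_cat_bin, cat_bins)
pool.close()
pool.join()
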
Example #12
]
all_models = [
    'count_logreg', 'bad_word_logreg', 'tfidf_logreg', 'char_vdcnn',
    'glove_gru', 'glove_lstm', 'glove_scnn', 'fasttext_dpcnn', 'fasttext_gru',
    'fasttext_scnn', 'glove_dpcnn', 'word2vec_scnn', 'fasttext_lstm',
    'word2vec_gru', 'word2vec_lstm', 'word2vec_dpcnn'
]

train, test = get_data()
train.drop(['comment_text'], axis=1, inplace=True)
test.drop(['comment_text'], axis=1, inplace=True)
for model in all_models:
    train_ = pd.read_csv(base + model + train_tail).drop(['fold_id'], axis=1)
    test_ = (pd.read_csv(base + model + test_tail).groupby('id').mean().drop(
        ['fold_id'], axis=1).reset_index())
    for label in labels:
        train_['neptune_' + model + '_' + label] = train_[label]
        train_.drop(label, axis=1, inplace=True)
        test_['neptune_' + model + '_' + label] = test_[label]
        test_.drop(label, axis=1, inplace=True)
    train = pd.merge(train, train_, on='id')
    test = pd.merge(test, test_, on='id')
    for label in labels:
        print(model + ' ' + label + ' AUC: ' + str(
            roc_auc_score(train[label], train['neptune_' + model + '_' +
                                              label])))

print('Saving...')
save_in_cache('neptune_models', train, test)
print('Done')
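
The copy-then-drop pair inside the label loop above is equivalent to a single rename; a sketch of the same relabeling:

rename_map = {label: 'neptune_' + model + '_' + label for label in labels}
train_ = train_.rename(columns=rename_map)
test_ = test_.rename(columns=rename_map)
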
    ]).tocsr()
    del train_word_features, train_num_features, train_char_features, train_subword_features
    gc.collect()

    print_step('Merging 2/2')
    test_features = hstack([
        test_char_features, test_word_features, test_num_features,
        test_subword_features
    ]).tocsr()
    del test_word_features, test_num_features, test_char_features, test_subword_features
    gc.collect()

    print("Shapes just to be sure : ", train_features.shape,
          test_features.shape)
    print_step('Saving')
    save_in_cache('fm_data', train_features, test_features)
    del train_features
    del test_features
    gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~')
print_step('Run Ridge')
train, test = run_cv_model(label='ridge',
                           data_key='fm_data',
                           model_fn=runRidge,
                           train=train,
                           test=test,
Example #14
        pprint(list(zip(classes, cv_score)))
        cv_scores.append(cv_score)
        i += 1
    print_step('All folds done!')
    print('CV scores')
    pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
    mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
    print('mean cv score : ' + str(mean_cv_score))
    pred_full_test = pred_full_test / 5.
    for k, classx in enumerate(classes):
        train_df['rnncnn_' + classx] = pred_train[:, k]
        test_df['rnncnn_' + classx] = pred_full_test[:, k]

    print('~~~~~~~~~~~~~~~~~~')
    print_step('Cache Level 1')
    save_in_cache('lvl1_rnncnn', train_df, test_df)
    print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['rnncnn_toxic']
submission['severe_toxic'] = test_df['rnncnn_severe_toxic']
submission['obscene'] = test_df['rnncnn_obscene']
submission['threat'] = test_df['rnncnn_threat']
submission['insult'] = test_df['rnncnn_insult']
submission['identity_hate'] = test_df['rnncnn_identity_hate']
submission.to_csv('submit/submit_lvl1_rnncnn.csv', index=False)
print_step('Done')
# [('toxic', 0.9817250985709102),
Example #15
    ]).tocsr()
    del train_word_features, train_num_features, train_char_features, train_subword_features
    gc.collect()

    print_step('Merging 2/2')
    test_features = hstack([
        test_char_features, test_word_features, test_num_features,
        test_subword_features
    ]).tocsr()
    del test_word_features, test_num_features, test_char_features, test_subword_features
    gc.collect()

    print("Shapes just to be sure : ", train_features.shape,
          test_features.shape)
    print_step('Saving')
    save_in_cache('fm_data', train_features, test_features)
    del train_features
    del test_features
    gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~')
print_step('Run FM')
train, test = run_cv_model(label='fm',
                           data_key='fm_data',
                           model_fn=runFM,
                           train=train,
                           test=test,
Example #16
        wordbatch_train = wb.fit_transform(train['titlecat'])
        print(wordbatch_train.shape)
        print_step('Titlecat Wordbatch 3/5')
        wordbatch_test = wb.transform(test['titlecat'])
        print(wordbatch_test.shape)
        del wb
        gc.collect()
        print_step('Titlecat Wordbatch 4/5')
        mask = np.where(wordbatch_train.getnnz(axis=0) > 3)[0]
        wordbatch_train = wordbatch_train[:, mask]
        print(wordbatch_train.shape)
        print_step('Titlecat Wordbatch 5/5')
        wordbatch_test = wordbatch_test[:, mask]
        print(wordbatch_test.shape)
        print_step('Saving to cache...')
        save_in_cache('titlecat_wordbatch', wordbatch_train, wordbatch_test)
    else:
        print_step('Loading from cache...')
        wordbatch_train, wordbatch_test = load_cache('titlecat_wordbatch')

    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Text Wordbatch 1/5')
    train['desc'] = train['title'].fillna('') + ' ' + train['description'].fillna('')
    test['desc'] = test['title'].fillna('') + ' ' + test['description'].fillna('')
    if not is_in_cache('text_wordbatch'):
        print_step('Text Wordbatch 2/5')
        wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2,
                                                                      "hash_ngrams_weights": [1.0, 1.0],
                                                                      "hash_size": 2 ** 28,
                                                                      "norm": "l2",
                                                                      "tf": 1.0,
Example #17
    del dpcnn_test
    del rnncnn_train
    del rnncnn_test
    del rnncnn2_train
    del rnncnn2_test
    del capsule_net_train
    del capsule_net_test
    del attention_train
    del attention_test
    del neptune_train
    del neptune_test
    gc.collect()
    print('Train shape: {}'.format(train_.shape))
    print('Test shape: {}'.format(test_.shape))
    print_step('Saving')
    save_in_cache('lvl2_all', train_, test_)
else:
    train_, test_ = load_cache('lvl2_all')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Level 2 LGB')
print(train_.columns.values)
train, test = get_data()
train_, test_ = run_cv_model(label='lvl2_all_lgb',
                             data_key='lvl2_all',
                             model_fn=runLGB,
                             train=train,
    print_step('Processing 4/9')
    attack['attack_score'] = attack['attack']
    attack['quoting_attack'] = attack['quoting_attack'].apply(
        lambda x: 1 if x > 0.1 else 0)
    attack['recipient_attack'] = attack['recipient_attack'].apply(
        lambda x: 1 if x > 0.1 else 0)
    attack['third_party_attack'] = attack['third_party_attack'].apply(
        lambda x: 1 if x > 0.1 else 0)
    attack['other_attack'] = attack['other_attack'].apply(lambda x: 1
                                                          if x > 0.1 else 0)
    attack['attack'] = attack['attack_score'].apply(lambda x: 1
                                                    if x > 0.1 else 0)
    attack['comment_text'] = attack['comment']
    attack.drop('comment', axis=1, inplace=True)
    print_step('Processing 5/9')
    save_in_cache('extra_data_attack', attack, test)

    print_step('Processing 6/9')
    toxic = toxic.drop('worker_id',
                       axis=1).groupby('rev_id').mean().reset_index()
    print_step('Processing 7/9')
    toxic = toxic_comments[['rev_id',
                            'comment']].merge(toxic,
                                              on='rev_id').drop('rev_id',
                                                                axis=1)
    print_step('Processing 8/9')
    toxic['toxicity_label'] = toxic['toxicity'].apply(lambda x: 1
                                                      if x > 0.1 else 0)
    toxic['comment_text'] = toxic['comment']
    toxic.drop('comment', axis=1, inplace=True)
    print_step('Processing 9/9')
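
Each of the 0.1-threshold apply calls above is a vectorized comparison in disguise; a sketch of the same labeling:

toxic['toxicity_label'] = (toxic['toxicity'] > 0.1).astype(int)
attack['attack'] = (attack['attack_score'] > 0.1).astype(int)
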
Example #19
pool.restart()
print_step('Merging 2/5')
train_dfs = [x[0] for x in dfs]
test_dfs = [x[1] for x in dfs]
print_step('Merging 3/5')
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)
print_step('Merging 4/5')
train_lasso = train.merge(train_df, on='item_id')
print_step('Merging 5/5')
test_lasso = test.merge(test_df, on='item_id')
print_step(
    'RMSE: ' +
    str(rmse(train_lasso['deal_probability'], train_lasso['cat_ridge'])))
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('pc_lasso_l3',
              pd.DataFrame({'pc_lasso_l3': train_lasso['cat_ridge']}),
              pd.DataFrame({'pc_lasso_l3': test_lasso['cat_ridge']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = test_lasso['cat_ridge'].clip(0.0, 1.0)
submission.to_csv('submit/submit_pc_lasso_l3_blender.csv', index=False)
print_step('Done!')
    gp = all_periods.groupby(['user_id'])[['days_up_sum', 'times_put_up'
                                           ]].agg(['min', 'max', 'mean'])
    gp = pd.DataFrame(gp.to_records())
    gp.columns = [
        'user_id', 'days_up_sum_min', 'days_up_sum_max', 'days_up_sum_mean',
        'times_put_up_min', 'times_put_up_max', 'times_put_up_mean'
    ]
    print_step('Grouping 4/4 1/3')
    n_user_items = all_samples.groupby(['user_id'])[['item_id']].count().reset_index() \
        .rename(index=str, columns={
            'item_id': 'n_user_items'
        })
    print_step('Grouping 4/4 2/3')
    gp = gp.merge(n_user_items, on='user_id', how='outer')
    print_step('Grouping 4/4 3/3')
    gp.fillna(0, inplace=True)

    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Merging 2/2 1/4')
    train = train.merge(gp, on='user_id', how='left')
    print_step('Merging 2/2 2/4')
    test = test.merge(gp, on='user_id', how='left')
    print_step('Merging 2/2 3/4')
    train = train[gp.columns.values]
    print_step('Merging 2/2 4/4')
    test = test[gp.columns.values]

    print('~~~~~~~~~~~~')
    print_step('Caching')
    save_in_cache('active_feats', train, test)
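
The hand-written column list after the min/max/mean aggregation can also be derived by flattening the MultiIndex; a sketch that produces the same names:

gp = all_periods.groupby('user_id')[['days_up_sum', 'times_put_up']].agg(
    ['min', 'max', 'mean'])
gp.columns = ['_'.join(col) for col in gp.columns]  # e.g. 'days_up_sum_min'
gp = gp.reset_index()  # brings user_id back as a column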