Code Example #1
def run_ridge_on_cat(cat):
    if not is_in_cache('cat_ridges_blend_l3_' + cat):
        print_step(cat + ' > Subsetting')
        train_c = train_[train['parent_category_name'] == cat].copy()
        test_c = test_[test['parent_category_name'] == cat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop('item_id', axis=1, inplace=True)

        print_step(cat + ' > Modeling')
        results = run_cv_model(train_c, test_c, target, runRidge, params, rmse,
                               cat + '-ridge-blend')
        train_c['cat_ridge'] = results['train']
        test_c['cat_ridge'] = results['test']
        print_step(cat + ' > RMSE: ' + str(rmse(target, train_c['cat_ridge'])))

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat + ' > Saving in Cache')
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id
        save_in_cache('cat_ridges_blend_l3_' + cat,
                      train_c[['item_id',
                               'cat_ridge']], test_c[['item_id', 'cat_ridge']])
        return True
    else:
        print_step('Already have ' + cat + '...')
        return True
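Every snippet on this page calls the same small caching helpers (is_in_cache, save_in_cache, load_cache, alongside get_data and print_step) without defining them. As a rough guide to what they do, here is a minimal sketch assuming one pickle file per cache key under a cache/ directory; the names CACHE_DIR and _path are illustrative, not taken from the source project.

import os
import pickle

CACHE_DIR = 'cache'  # assumed location of the cache directory


def _path(key):
    return os.path.join(CACHE_DIR, key + '.pkl')


def is_in_cache(key):
    # A key counts as cached once its pickle file exists on disk.
    return os.path.exists(_path(key))


def save_in_cache(key, train, test):
    # Store the train/test pair as a single pickle; `test` may be None.
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(_path(key), 'wb') as f:
        pickle.dump((train, test), f)


def load_cache(key):
    # Return the (train, test) pair that was saved for this key.
    with open(_path(key), 'rb') as f:
        return pickle.load(f)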
Code Example #2
def run_query_in_batches(df, label=''):
    responses = []
    total = len(df.comment_text.values)
    i = 0
    while i <= total:
        skip = False
        if i % 500 == 0 or i == total:
            batch_num = str(i // 500 + 1)
            if is_in_cache('convai-batches-' + label + batch_num):
                print_step('BATCH ' + label + batch_num + ' ALREADY DONE...')
                i += 500
                skip = True
            elif len(responses) > 100:
                batch_num = str(i // 500)
                if i == total:
                    batch_num = str(int(batch_num) + 1)
                    skip = True
                print_step('COLLECTING BATCH ' + label + batch_num + ' / ' +
                           str(round(total / 500) + 1))
                batch_df = pd.DataFrame([dict(x) for x in responses])
                save_in_cache('convai-batches-' + label + batch_num, batch_df,
                              None)
                batch_num = str(i // 500 + 1)
                print_step('SLEEPING 60s')
                time.sleep(60)
                responses = []
                print_step('STARTING BATCH ' + label + batch_num)
            else:
                print_step('STARTING BATCH ' + label + batch_num)
        if not skip:
            print_step(str(i + 1) + ' / ' + str(total))
            responses.append(
                run_query(df.comment_text.values[i], df.id.values[i]))
            i += 1
Code Example #3
def run_with_target(label,
                    target,
                    data_key,
                    model_fn,
                    kf,
                    train_key=None,
                    eval_fn=None):
    if is_in_cache(label + '_' + target):
        return load_cache(label + '_' + target)[0]
    else:
        print('-')
        print_step('Training ' + target)
        if train_key is None:
            train, test = get_data()
        else:
            train, test = load_cache(train_key)
        post_train, post_test = load_cache(data_key)
        if isinstance(post_train, pd.DataFrame):
            post_train = post_train.values
            post_test = post_test.values

        train_y = train[target]
        cv_scores = []
        pred_full_test = 0
        pred_train = np.zeros(train.shape[0])
        i = 1

        if isinstance(kf, StratifiedKFold):
            fold_splits = kf.split(post_train, train_y)
        else:
            fold_splits = kf.split(post_train)

        for dev_index, val_index in fold_splits:
            print_step('Started ' + label + ' ' + target + ' fold ' + str(i))
            dev_X, val_X = post_train[dev_index], post_train[val_index]
            dev_y, val_y = train_y[dev_index], train_y[val_index]
            pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y,
                                               post_test, target, dev_index,
                                               val_index)
            pred_full_test = pred_full_test + pred_test_y
            pred_train[val_index] = pred_val_y
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print_step(label + ' ' + target + ' cv score ' + str(i) + ' : ' +
                       str(cv_score))
            i += 1
        print_step(label + ' ' + target + ' cv scores : ' + str(cv_scores))
        mean_cv_score = np.mean(cv_scores)
        print_step(label + ' ' + target + ' mean cv score : ' +
                   str(mean_cv_score))
        pred_full_test = pred_full_test / 5.  # note: assumes kf has exactly 5 folds
        results = {
            'label': label,
            'target': target,
            'train': pred_train,
            'test': pred_full_test,
            'cv': cv_scores
        }
        save_in_cache(label + '_' + target, results, None)
        return results
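A hypothetical driver for run_with_target, looping the six toxicity targets over a 5-fold StratifiedKFold (n_splits=5 matters because the function divides the accumulated test predictions by 5). The label 'word_tfidf_ridge', the 'tfidf_word' cache key, and the pairing with runRidge and roc_auc_score are illustrative assumptions, not taken from the source project.

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for target in classes:
    # Out-of-fold train predictions and fold-averaged test predictions per
    # target, cached under 'word_tfidf_ridge_<target>'.
    results = run_with_target(label='word_tfidf_ridge',
                              target=target,
                              data_key='tfidf_word',
                              model_fn=runRidge,
                              kf=kf,
                              eval_fn=roc_auc_score)
    print(target, np.mean(results['cv']))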
Code Example #4
def get_img_data(index, image_files):
    print_step('[Core %d] Start' % index)
    if not is_in_cache('img_data_' + str(index)):
        data = []
        i = 0
        for image_file in image_files:
            dat = get_image(image_file)
            if dat:
                data += [get_data_from_image(dat, core=index, i=i)]
            i += 1
            if i % 50 == 0:
                print_step('[Core %d] Completed %d / %d...' %
                           (index, i, len(image_files)))
        print_step('[Core %d] Done. Saving...' % index)
        save_in_cache('img_data_' + str(index), data_to_df(data), None)
    else:
        print(str(index) + ' already in cache! Skipping...')
    return True
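get_img_data takes a core index and its own slice of the image list, so it is presumably fanned out with multiprocessing. A sketch of such a driver, assuming the files are split into one chunk per worker (the function name run_img_data_in_parallel, the chunking scheme, and n_cores=8 are guesses, not from the source project):

import multiprocessing as mp
import numpy as np


def run_img_data_in_parallel(image_files, n_cores=8):
    # One chunk per core; each worker writes its own 'img_data_<index>' cache
    # entry, so an interrupted run can resume without redoing finished chunks.
    chunks = [list(c) for c in np.array_split(image_files, n_cores)]
    with mp.Pool(processes=n_cores) as pool:
        return pool.starmap(get_img_data, enumerate(chunks))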
Code Example #5
File: main.py Project: yubshang/PiResultsWebServer
def display_select_result_html():
    """
    Displays the html to select the results file that you want to view. This is the main page
    """
    html_string = ""
    html_string += """<HTML>
   <HEAD>
      <TITLE>
         Python Server
      </TITLE>
      <link rel="stylesheet" type="text/css" href="static/file_list.css" />
   </HEAD>
<BODY>
   <div style="text-align:center">
   <font size="6">Available Results</font>
   <br><br>
   <table align="center">
  <tr>
    <th>Result File</th>
    <th>Size</th>
    <th>Date Modified</th>
    <th>Date Created</th>
    <th>Cached</th>
  </tr>
   """
    for result_file in os.listdir(result_directory):
        absolute_result_file_path = os.path.join(result_directory, result_file)
        html_string += "<tr>"
        html_string += "<td><div><a href=results/" + result_file + ">" + result_file + "</a></div></td>"
        html_string += "<td>" + str(os.path.getsize(absolute_result_file_path) / float(1000)) + "kb </td>"
        html_string += "<td>" + time.ctime(os.path.getmtime(absolute_result_file_path)) + "</td>"
        html_string += "<td>" + time.ctime(os.path.getctime(absolute_result_file_path)) + "</td>"
        cached = cache.is_in_cache(absolute_result_file_path, cache_directory)
        if cached:
            html_string += '<td><img src="static/checkmark.png" style="width:15px;height:15px;">'
        else:
            html_string += '<td><img src="static/x.png" style="width:15px;height:15px;">'
        html_string += "</tr>\n"
    html_string += """
    </table>
    </div>
    </BODY>
    </HTML>"""
    return html_string
Code Example #6
        'nthread': min(mp.cpu_count() - 1, 6),
        'lambda_l1': 1,
        'lambda_l2': 1
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=100)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if not is_in_cache('convai_with_fe'):
    print_step('Importing base data')
    train_base, test_base = get_data()

    print_step('Importing ConvAI data')
    train, test = load_cache('convai_data')

    print_step('Importing FE')
    train_fe, test_fe = load_cache('fe_lgb_data')

    print_step('Merging')
    train_fe['id'] = train_base['id']
    test_fe['id'] = test_base['id']
    train_ = pd.merge(train_fe, train, on='id')
    test_ = pd.merge(test_fe, test, on='id')
    del train_base
Code Example #7
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        # print(weighted_input.shape)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0], self.features_dim


if not is_in_cache('lvl1_attention-lstm'):
    train_df, test_df = get_data()

    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
Code Example #8
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" %
                  (epoch + 1, score))


EMBEDDING_FILE = 'cache/glove/glove.840B.300d.txt'

max_features = 100000
maxlen = 150
embed_size = 300
epochs = 4
batch_size = 128
predict_batch_size = 1024

if not is_in_cache('lvl1_gru-conv'):
    print_step('Loading data')
    train_df, test_df = get_data()
    print_step('Preprocessing 1/3')
    train_df['comment_text'] = train_df['comment_text'].apply(
        glove_preprocess).apply(normalize_text)
    print_step('Preprocessing 2/3')
    test_df['comment_text'] = test_df['comment_text'].apply(
        glove_preprocess).apply(normalize_text)

    print_step('Preprocessing 3/3')
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
Code Example #9
    x_nb = train_X.multiply(r)
    model.fit(x_nb, train_y)
    pred_test_y = model.predict_proba(test_X.multiply(r))[:, 1]
    pred_test_y2 = model.predict_proba(test_X2.multiply(r))[:, 1]
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()
train['non_toxic'] = train[[
    'toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate'
]].sum(axis=1).apply(lambda x: 0 if x > 0 else 1)
save_in_cache('extra_label', train, test)

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

if not is_in_cache('tfidf_word'):
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Run TFIDF WORD')
    TFIDF_PARAMS_WORD.update({'train': train, 'test': test})
Code Example #10
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/10')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('titlecat_wordbatch') or not is_in_cache('text_wordbatch'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Titlecat Wordbatch 1/5')
    train['titlecat'] = train['parent_category_name'].fillna('') + ' ' + train['category_name'].fillna('') + ' ' + train['param_1'].fillna('') + ' ' + train['param_2'].fillna('') + ' ' + train['param_3'].fillna('') + ' ' + train['title'].fillna('')
    test['titlecat'] = test['parent_category_name'].fillna('') + ' ' + test['category_name'].fillna('') + ' ' + test['param_1'].fillna('') + ' ' + test['param_2'].fillna('') + ' ' + test['param_3'].fillna('') + ' ' + test['title'].fillna('')
    if not is_in_cache('titlecat_wordbatch'):
        print_step('Titlecat Wordbatch 2/5')
        wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2,
                                                                      "hash_ngrams_weights": [1.5, 1.0],
                                                                      "hash_size": 2 ** 29,
                                                                      "norm": None,
                                                                      "tf": 'binary',
                                                                      "idf": None,
                                                                      }), procs=8)
        wb.dictionary_freeze = True
        wordbatch_train = wb.fit_transform(train['titlecat'])
Code Example #11
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/13')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('title_countvec'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title CountVec 1/2')
    cv = CountVectorizer(stop_words=stopwords.words('russian'),
                         lowercase=True,
                         min_df=2)
    tfidf_train = cv.fit_transform(train['title'])
    print(tfidf_train.shape)
    print_step('Title CountVec 2/2')
    tfidf_test = cv.transform(test['title'])
    print(tfidf_test.shape)
    print_step('Saving to cache...')
    save_in_cache('title_countvec', tfidf_train, tfidf_test)

if not is_in_cache('deep_text_feats2'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
Code Example #12
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'


max_features = 30000
maxlen = 100
embed_size = 300
epochs = 4
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_double-gru'):
    train_df, test_df = get_data()

    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
Code Example #13
EMBED_SIZE = 300
NCOMP = 20


def text_to_embedding(text):
    mean = np.mean(
        [embeddings_index.get(w, np.zeros(EMBED_SIZE)) for w in text.split()],
        axis=0)
    if mean.shape == ():
        return np.zeros(EMBED_SIZE)
    else:
        return mean


print_step('Importing Data 11/19 1/3')
if not is_in_cache('avito_fasttext_300d'):
    print_step('Embedding 1/5')
    train, test = get_data()

    print_step('Embedding 1/5')

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = dict(
        get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

    print_step('Embedding 2/5')
    train_embeddings = (train['title'].str.cat(
        [
            train['description'],
Code Example #14
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('tfidf_ridges') or not is_in_cache(
        'titlecat_tfidf') or not is_in_cache('text_tfidf') or not is_in_cache(
            'text_char_tfidf'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title TFIDF 1/2')
    tfidf = TfidfVectorizer(ngram_range=(1, 1),
                            max_features=100000,
                            min_df=2,
                            max_df=0.8,
                            binary=True,
                            encoding='KOI8-R')
    tfidf_train = tfidf.fit_transform(train['title'])
    print(tfidf_train.shape)
    print_step('Title TFIDF 2/2')
    tfidf_test = tfidf.transform(test['title'])
    print(tfidf_test.shape)
Code Example #15
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" %
                  (epoch + 1, score))


EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'

max_features = 100000
maxlen = 500
embed_size = 300
epochs = 20
batch_size = 256
predict_batch_size = 1024

if not is_in_cache('lvl1_cudnngru'):
    train_df, test_df = get_data()

    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
Code Example #16
        'class_weight': 'balanced_subsample',
        'n_jobs': min(mp.cpu_count() - 1, 6),
        'random_state': 16,
        'verbose': 2
    }
    model = RandomForestClassifier(**params)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    pred_test_y = minmax_scale(
        pd.Series(pred_test_y).rank().values)  # Rank transform
    pred_test_y2 = minmax_scale(pd.Series(pred_test_y2).rank().values)
    return pred_test_y, pred_test_y2


if not is_in_cache('lvl2_all'):
    print_step('Importing 1/21: LRs')
    lr_train, lr_test = load_cache('lvl1_lr')
    print_step('Importing 2/21: FE')
    train_fe, test_fe = load_cache('fe_lgb_data')
    print_step('Importing 3/21: Sparse LGBs')
    lgb_train, lgb_test = load_cache('lvl1_sparse_lgb')
    print_step('Importing 4/21: FE LGB')
    fe_lgb_train, fe_lgb_test = load_cache('lvl1_fe_lgb')
    print_step('Importing 5/21: Sparse FE LGB')
    sfe_lgb_train, sfe_lgb_test = load_cache('lvl1_sparse_fe_lgb')
    print_step('Importing 6/21: FM')
    fm_train, fm_test = load_cache('lvl1_fm')
    print_step('Importing 7/21: Ridge')
    ridge_train, ridge_test = load_cache('lvl1_ridge')
    print_step('Importing 8/21: GRU')
Code Example #17
    """
    ohe = OneHotEncoder()
    full_csr = ohe.fit_transform(np.vstack((trn.values, sub.values)))
    csr_trn = full_csr[:trn.shape[0]]
    csr_sub = full_csr[trn.shape[0]:]
    del full_csr
    gc.collect()
    # Now remove features that don't have enough samples either in train or test
    return clean_csr(csr_trn, csr_sub, 3)


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

if not is_in_cache('fm_data'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning 1/2')
    train_cleaned = get_indicators_and_clean_comments(train)
    print_step('Cleaning 2/2')
    test_cleaned = get_indicators_and_clean_comments(test)
    train_text = train['clean_comment'].fillna('')
    test_text = test['clean_comment'].fillna('')
    all_text = pd.concat([train_text, test_text])

    class_names = [
        'toxic', 'severe_toxic', 'insult', 'threat', 'obscene', 'identity_hate'
    ]
    num_features = [
        f_ for f_ in train.columns if f_ not in [
            'comment_text', 'clean_comment', 'id', 'remaining_chars',
Code Example #18
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


EMBEDDING_FILE = 'cache/twitter/glove.twitter.27B.200d.txt'


max_features = 30000
maxlen = 100
embed_size = 200
epochs = 3
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_gru80'):
    print_step('Loading data')
    train_df, test_df = get_data()

    print_step('Preprocessing 1/3')
    train_df['comment_text'] = train_df['comment_text'].apply(glove_preprocess).apply(normalize_text)
    print_step('Preprocessing 2/3')
    test_df['comment_text'] = test_df['comment_text'].apply(glove_preprocess).apply(normalize_text)

    print_step('Preprocessing 3/3')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
Code Example #19

EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'


max_features = 100000
maxlen = 200
embed_size = 300
epochs = 3
batch_size = 256
predict_batch_size = 1024
filter_sizes = [1, 2, 3, 5]
num_filters = 32


if not is_in_cache('lvl1_2dconv'):
    print_step('Loading data')
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
Code Example #20
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if is_in_cache('lgb_fe_with_embeddings_and_svd'):
    train, test = load_cache('lgb_fe_with_embeddings_and_svd')
else:
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data')
    train, test = get_data()
    if is_in_cache('fe_lgb_data'):
        train_fe, test_fe = load_cache('fe_lgb_data')
    else:
        print_step('Adding Features')
        train_fe, test_fe = add_features(train, test)
        print_step('Dropping')
        train_fe.drop(['id', 'comment_text'], axis=1, inplace=True)
        test_fe.drop(['id', 'comment_text'], axis=1, inplace=True)
        train_fe.drop([
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
Code Example #21

def runRidge(train_X, train_y, test_X, test_y, test_X2, label, dev_index,
             val_index):
    model = Ridge(solver="sag",
                  fit_intercept=True,
                  random_state=205,
                  alpha=3.3)
    model.fit(train_X, train_y)
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
if not is_in_cache('extra_data_attack') and not is_in_cache(
        'extra_data_toxic'):
    print_step('Importing Data 1/5')
    attack = pd.read_csv('data/attack_annotations.tsv', sep='\t')
    print_step('Importing Data 2/5')
    attack_comments = pd.read_csv('data/attack_annotated_comments.tsv',
                                  sep='\t')
    print_step('Importing Data 3/5')
    toxic = pd.read_csv('data/toxicity_annotations.tsv', sep='\t')
    print_step('Importing Data 4/5')
    toxic_comments = pd.read_csv('data/toxicity_annotated_comments.tsv',
                                 sep='\t')
    print_step('Importing Data 5/5')
    train, test = get_data()

    print_step('Processing 1/9')
Code Example #22
def run_ridge_on_regioncat(regioncat):
    if not is_in_cache('regioncat_ridges_' + regioncat):
        print_step(regioncat + ' > Subsetting')
        train_c = train[train['region_X_cat'] == regioncat].copy()
        test_c = test[test['region_X_cat'] == regioncat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop(['item_id'], axis=1, inplace=True)

        print_step(regioncat + ' > Titlecat TFIDF 1/3')
        train_c['titlecat'] = (train_c['category_name'].fillna('') + ' ' +
                               train_c['param_1'].fillna('') + ' ' +
                               train_c['param_2'].fillna('') + ' ' +
                               train_c['param_3'].fillna('') + ' ' +
                               train_c['title'].fillna(''))
        test_c['titlecat'] = (test_c['category_name'].fillna('') + ' ' +
                              test_c['param_1'].fillna('') + ' ' +
                              test_c['param_2'].fillna('') + ' ' +
                              test_c['param_3'].fillna('') + ' ' +
                              test_c['title'].fillna(''))
        print_step(regioncat + ' > Titlecat TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train_c['titlecat'])
        print(tfidf_train.shape)
        print_step(regioncat + ' > Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test_c['titlecat'])
        print(tfidf_test.shape)

        print_step(regioncat + ' > Titlecat TFIDF Ridge')
        results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               regioncat + '-titlecat-ridge')
        train_c['regioncat_title_ridge'] = results['train']
        test_c['regioncat_title_ridge'] = results['test']

        print_step(regioncat + ' > Description TFIDF 1/3')
        train_c['desc'] = train_c['title'].fillna(
            '') + ' ' + train_c['description'].fillna('')
        test_c['desc'] = test_c['title'].fillna(
            '') + ' ' + test_c['description'].fillna('')
        print_step(regioncat + ' > Description TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train2 = tfidf.fit_transform(train_c['desc'].fillna(''))
        print(tfidf_train2.shape)
        print_step(regioncat + ' > Description TFIDF 3/3')
        tfidf_test2 = tfidf.transform(test_c['desc'].fillna(''))
        print(tfidf_test2.shape)
        results = run_cv_model(tfidf_train2, tfidf_test2, target, runRidge,
                               {'alpha': 5.0}, rmse, regioncat + '-desc-ridge')
        train_c['regioncat_desc_ridge'] = results['train']
        test_c['regioncat_desc_ridge'] = results['test']

        print_step(regioncat + ' > Text Char TFIDF 1/2')
        # Using char n-grams ends up being surprisingly good, HT https://www.kaggle.com/c/avito-demand-prediction/discussion/56061#325063
        tfidf = TfidfVectorizer(ngram_range=(2, 5),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                analyzer='char',
                                encoding='KOI8-R')
        tfidf_train3 = tfidf.fit_transform(train_c['desc'])
        print(tfidf_train3.shape)
        print_step(regioncat + ' > Text Char TFIDF 2/2')
        tfidf_test3 = tfidf.transform(test_c['desc'])
        print(tfidf_test3.shape)

        results = run_cv_model(tfidf_train3, tfidf_test3, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               regioncat + '-desc-char-ridge')
        train_c['regioncat_desc_char_ridge'] = results['train']
        test_c['regioncat_desc_char_ridge'] = results['test']

        print_step('Merging 1/2')
        train_c2 = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
        print_step('Merging 2/2')
        test_c2 = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
        print(train_c2.shape)
        print(test_c2.shape)

        print('~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Run Full Text Ridge')
        results = run_cv_model(train_c2, test_c2, target, runRidge,
                               {'alpha': 8.0}, rmse, regioncat + '-text-ridge')
        train_c['regioncat_all_text_ridge'] = results['train']
        test_c['regioncat_all_text_ridge'] = results['test']

        print('~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Dropping')
        train_c.drop([c for c in train_c.columns if 'ridge' not in c],
                     axis=1,
                     inplace=True)
        test_c.drop([c for c in test_c.columns if 'ridge' not in c],
                    axis=1,
                    inplace=True)
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Saving in Cache')
        save_in_cache('regioncat_ridges_' + regioncat, train_c, test_c)
    else:
        print(regioncat + ' already in cache! Skipping...')
    return True
Code Example #23
filter_nr = 64
filter_size = 3
max_pool_size = 3
max_pool_strides = 2
dense_nr = 256
spatial_dropout = 0.2
dense_dropout = 0.5
train_embed = False


def schedule(ind):
    a = [0.001, 0.0005, 0.0001, 0.0001]
    return a[ind]


if not is_in_cache('lvl1_dpcnn'):
    print_step('Loading data')
    train_df, test_df = get_data()
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
Code Example #24
def run_nn_model(label, model, max_features, maxlen, epochs, batch_size,
                 predict_batch_size):
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    for embedding_name, embedding_file in EMBEDDING_FILES.items():
        if is_in_cache(label + '_' + embedding_name):
            print_step('Already trained ' + label + '_' + embedding_name +
                       '! Skipping...')
        else:
            train_df, test_df = get_data()

            print_step('Loading embed ' + embedding_name + '...')
            embed_size = EMBED_SIZE_LOOKUP[embedding_name]
            x_train, x_test, embedding_matrix = tokenize_and_embed(
                train_df, test_df, embedding_file, max_features, maxlen,
                embed_size, embedding_name)
            y_train = train_df[classes].values

            print_step('Build model...')
            model = model(max_features, maxlen, embed_size, embedding_matrix)
            model.save_weights('cache/' + label + '_' + embedding_name +
                               '-model-weights.h5')

            print_step('Making KFold for CV')
            kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

            i = 1
            cv_scores = []
            pred_train = np.zeros((train_df.shape[0], 6))
            pred_full_test = np.zeros((test_df.shape[0], 6))
            for dev_index, val_index in kf.split(x_train, y_train[:, 0]):
                print_step('Started fold ' + str(i))
                model.load_weights('cache/' + label + '_' + embedding_name +
                                   '-model-weights.h5')
                dev_X, val_X = x_train[dev_index], x_train[val_index]
                dev_y, val_y = y_train[dev_index, :], y_train[val_index, :]
                RocAuc = RocAucEvaluation(validation_data=(val_X, val_y),
                                          interval=1)
                model.fit(dev_X,
                          dev_y,
                          batch_size=batch_size,
                          epochs=epochs,
                          validation_data=(val_X, val_y),
                          callbacks=[RocAuc])
                val_pred = model.predict(val_X,
                                         batch_size=predict_batch_size,
                                         verbose=1)
                pred_train[val_index, :] = val_pred
                test_pred = model.predict(x_test,
                                          batch_size=predict_batch_size,
                                          verbose=1)
                pred_full_test = pred_full_test + test_pred
                cv_score = [
                    roc_auc_score(val_y[:, j], val_pred[:, j])
                    for j in range(6)
                ]
                print_step('Fold ' + str(i) + ' done')
                pprint(list(zip(classes, cv_score)))
                cv_scores.append(cv_score)
                i += 1
            print_step('All folds done!')
            print('CV scores')
            pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
            mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
            print('mean cv score : ' + str(mean_cv_score))
            pred_full_test = pred_full_test / 5.
            for k, classx in enumerate(classes):
                col = label + '_' + embedding_name + '_' + classx
                train_df[col] = pred_train[:, k]
                test_df[col] = pred_full_test[:, k]

            print('~~~~~~~~~~~~~~~~~~')
            print_step('Cache Level 1')
            save_in_cache('lvl1_' + label + '_' + embedding_name, train_df,
                          test_df)
            print_step('Done!')

            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            print_step('Prepping submission file')
            submission = pd.DataFrame()
            submission['id'] = test_df['id']
            submission['toxic'] = test_df[label + '_' + embedding_name +
                                          '_toxic']
            submission['severe_toxic'] = test_df[label + '_' + embedding_name +
                                                 '_severe_toxic']
            submission['obscene'] = test_df[label + '_' + embedding_name +
                                            '_obscene']
            submission['threat'] = test_df[label + '_' + embedding_name +
                                           '_threat']
            submission['insult'] = test_df[label + '_' + embedding_name +
                                           '_insult']
            submission['identity_hate'] = test_df[label + '_' +
                                                  embedding_name +
                                                  '_identity_hate']
            submission.to_csv('submit/submit_lvl1_' + label + '_' +
                              embedding_name + '.csv',
                              index=False)
            print_step('Done')
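A hypothetical call to run_nn_model: EMBEDDING_FILES, EMBED_SIZE_LOOKUP and tokenize_and_embed are assumed to be defined elsewhere in the project, build_gru is an assumed model builder with the signature (max_features, maxlen, embed_size, embedding_matrix), and the hyperparameters simply mirror the GRU snippets above.

# Illustrative only: build_gru is a hypothetical compiled-Keras-model builder.
run_nn_model(label='gru',
             model=build_gru,
             max_features=100000,
             maxlen=200,
             epochs=4,
             batch_size=128,
             predict_batch_size=1024)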
Code Example #25
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


EMBEDDING_FILE = 'cache/twitter/glove.twitter.27B.200d.txt'


max_features = 200000
maxlen = 200
embed_size = 200
epochs = 2
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_rnncnn2'):
    train_df, test_df = get_data()

    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
Code Example #26
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" %
                  (epoch + 1, score))


EMBEDDING_FILE = 'cache/twitter/glove.twitter.27B.200d.txt'

max_features = 100000
maxlen = 200
embed_size = 200
epochs = 12
batch_size = 1024
predict_batch_size = 1024

if not is_in_cache('lvl1_gru128-2'):
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')

    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
Code Example #27
            b = K.permute_dimensions(
                b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)


if not is_in_cache('lvl1_capsule_net'):

    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')

    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
Code Example #28
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')
    print_step('Filling missing')
    train_cleaned['comment_text'].fillna('missing', inplace=True)
    test_cleaned['comment_text'].fillna('missing', inplace=True)
    print('Train shape: {}'.format(train_cleaned.shape))
    print('Test shape: {}'.format(test_cleaned.shape))

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
Code Example #29
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/11')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('deep_text_feats3'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data 2/11')
    tfidf_train, tfidf_test = load_cache('titlecat_tfidf')

    print_step('Importing Data 3/11')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

    print_step('Importing Data 4/11')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')


    print_step('Importing Data 5/11')
    train = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/11')
    test = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
Code Example #30
from cache import get_data, is_in_cache, load_cache, save_in_cache


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('data_with_fe'):
    print('~~~~~~~~~~~~')
    print_step('Merging')
    merge = pd.concat([train, test])

    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Imputation 1/7')
    merge['param_1'].fillna('missing', inplace=True)
    print_step('Imputation 2/7')
    merge['param_2'].fillna('missing', inplace=True)
    print_step('Imputation 3/7')
    merge['param_3'].fillna('missing', inplace=True)
    print_step('Imputation 4/7')
    merge['price_missing'] = merge['price'].isna().astype(int)
    merge['price'].fillna(0, inplace=True)
    print_step('Imputation 5/7')
Code Example #31
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" %
                  (epoch + 1, score))


EMBEDDING_FILE = 'cache/glove/glove.840B.300d.txt'

max_features = 100000
maxlen = 150
embed_size = 300
epochs = 1
batch_size = 32
predict_batch_size = 32

if not is_in_cache('lvl1_lstm-conv'):
    print_step('Loading data')
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)