def run_query(comment_text, idx):
    value = normalize_text(comment_text)
    value = value[:2999] if len(value) >= 3000 else value  # truncate overly long comments
    value = 'empty' if len(value) == 0 else value  # the API errors on empty text
    try:
        rr = requests.post(
            'https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze',
            params={'key': CONVAI_KEY},
            data=json.dumps({
                'comment': {
                    'text': value
                },
                'languages': ['en'],
                'requestedAttributes': {
                    'TOXICITY': {},
                    'ATTACK_ON_AUTHOR': {},
                    'ATTACK_ON_COMMENTER': {},
                    'INCOHERENT': {},
                    'INFLAMMATORY': {},
                    'LIKELY_TO_REJECT': {},
                    'OBSCENE': {},
                    'SEVERE_TOXICITY': {},
                    'SPAM': {},
                    'UNSUBSTANTIAL': {}
                }
            }))
        return [(k, v['summaryScore']['value'])
                for k, v in rr.json()['attributeScores'].items()
                ] + [('id', idx)]
    except Exception as error:
        # drop into the debugger on any request or parsing failure
        print_step('FATAL ABORT: ' + str(error))
        import pdb
        pdb.set_trace()
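
Every example on this page leans on a print_step helper imported from a shared utils module. A minimal sketch of what it might look like (an assumption for readability, not the project's actual implementation):

from datetime import datetime

def print_step(*args):
    # timestamped progress logging; *args covers both the single-string calls
    # and the multi-argument calls seen in the examples below
    print('[{}]'.format(datetime.now()), *args)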
Example #2
def run_ridge_on_cat(cat):
    if not is_in_cache('cat_ridges_blend_l3_' + cat):
        print_step(cat + ' > Subsetting')
        train_c = train_[train['parent_category_name'] == cat].copy()
        test_c = test_[test['parent_category_name'] == cat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop('item_id', axis=1, inplace=True)

        print_step(cat + ' > Modeling')
        results = run_cv_model(train_c, test_c, target, runLasso, params, rmse,
                               cat + '-ridge-blend')
        train_c['cat_ridge'] = results['train']
        test_c['cat_ridge'] = results['test']
        print_step(cat + ' > RMSE: ' + str(rmse(target, train_c['cat_ridge'])))

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat + ' > Saving in Cache')
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id
        save_in_cache('cat_ridges_blend_l3_' + cat,
                      train_c[['item_id',
                               'cat_ridge']], test_c[['item_id', 'cat_ridge']])
        return True
    else:
        print_step('Already have ' + cat + '...')
        return True
Example #3
def run_with_target(label,
                    target,
                    data_key,
                    model_fn,
                    kf,
                    train_key=None,
                    eval_fn=None):
    if is_in_cache(label + '_' + target):
        return load_cache(label + '_' + target)[0]
    else:
        print('-')
        print_step('Training ' + target)
        if train_key is None:
            train, test = get_data()
        else:
            train, test = load_cache(train_key)
        post_train, post_test = load_cache(data_key)
        if isinstance(post_train, pd.DataFrame):
            post_train = post_train.values
            post_test = post_test.values

        train_y = train[target]
        cv_scores = []
        pred_full_test = 0
        pred_train = np.zeros(train.shape[0])
        i = 1

        if isinstance(kf, StratifiedKFold):
            fold_splits = kf.split(post_train, train_y)
        else:
            fold_splits = kf.split(post_train)

        for dev_index, val_index in fold_splits:
            print_step('Started ' + label + ' ' + target + ' fold ' + str(i))
            dev_X, val_X = post_train[dev_index], post_train[val_index]
            dev_y, val_y = train_y[dev_index], train_y[val_index]
            pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y,
                                               post_test, target, dev_index,
                                               val_index)
            pred_full_test = pred_full_test + pred_test_y
            pred_train[val_index] = pred_val_y
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(eval_fn(val_y, pred_val_y))
            print_step(label + ' ' + target + ' cv score ' + str(i) + ' : ' +
                       str(cv_score))
            i += 1
        print_step(label + ' ' + target + ' cv scores : ' + str(cv_scores))
        mean_cv_score = np.mean(cv_scores)
        print_step(label + ' ' + target + ' mean cv score : ' +
                   str(mean_cv_score))
        pred_full_test = pred_full_test / 5.  # average over folds (assumes kf is 5-fold)
        results = {
            'label': label,
            'target': target,
            'train': pred_train,
            'test': pred_full_test,
            'cv': cv_scores
        }
        save_in_cache(label + '_' + target, results, None)
        return results
Example #4
def runLasso(train_X, train_y, val_X, val_y, test_X, params):
    model = Lasso(**params)
    model.fit(train_X, train_y)
    for i in zip(train_X.columns, model.coef_):
        print(i)
    print_step('Predict Val 1/2')
    pred_val_y = model.predict(val_X)
    print_step('Predict Test 2/2')
    pred_test_y = model.predict(test_X)
    return pred_val_y, pred_test_y
Example #5
def get_data():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    print('Train shape: {}'.format(train.shape))
    print('Test shape: {}'.format(test.shape))

    print_step('Filling missing')
    train['comment_text'].fillna('missing', inplace=True)
    test['comment_text'].fillna('missing', inplace=True)
    print('Train shape: {}'.format(train.shape))
    print('Test shape: {}'.format(test.shape))
    return train, test
Example #6
def runSparseLGB(train_X, train_y, test_X, test_y, test_X2, label, dev_index,
                 val_index):
    print_step('Get K Best')
    model = LogisticRegression(solver='sag')
    sfm = SelectFromModel(model, threshold=0.2)
    print(train_X.shape)
    train_sparse_matrix = sfm.fit_transform(train_X, train_y)
    print(train_sparse_matrix.shape)
    test_sparse_matrix = sfm.transform(test_X)
    test_sparse_matrix2 = sfm.transform(test_X2)
    del train_X
    del test_X
    del test_X2
    gc.collect()
    d_train = lgb.Dataset(train_sparse_matrix, label=train_y)
    d_valid = lgb.Dataset(test_sparse_matrix, label=test_y)
    watchlist = [d_train, d_valid]
    params = {
        'boosting': 'dart',
        'learning_rate': 0.1,
        'application': 'binary',
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'auc',
        'data_random_seed': 2,
        'bagging_fraction': 0.8,
        'feature_fraction': 0.1,
        'nthread': min(mp.cpu_count() - 1, 6),
        'lambda_l1': 1,
        'lambda_l2': 1,
        'min_data_in_leaf': 40
    }
    rounds_lookup = {
        'toxic': 1400,
        'severe_toxic': 500,
        'obscene': 550,
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    pred_test_y = model.predict(test_sparse_matrix)
    pred_test_y2 = model.predict(test_sparse_matrix2)
    return pred_test_y, pred_test_y2
Example #7
    def random_inference(self):
        steps = 0
        total_reward = 0

        done = False
        self.env.reset()
        # one game round of randomness
        while not done:
            action = self.env.action_space.sample()
            state, reward, done, info = self.env.step(action)

            steps += 1
            total_reward += reward
            self.env.render()
            print_step(steps, state, action, reward, total_reward)
            time.sleep(0.25)
            os.system('clear')
Example #8
    def validation(self, sleep_timer=0):
        state = self.env.reset()
        steps, cum_reward = 0, 0
        done = False

        while not done:
            action = np.argmax(self.q_table[state])
            state, reward, done, info = self.env.step(action)
            if sleep_timer > 0:
                os.system('clear')
                self.env.render()
                print_step(steps, state, action, reward, cum_reward)
                time.sleep(sleep_timer)

            cum_reward += reward
            steps += 1

        return steps, cum_reward
Example #9
def run_cv_model(train, test, target, model_fn, params, eval_fn, label):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    fold_splits = kf.split(train)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train.shape[0])
    i = 1
    for dev_index, val_index in fold_splits:
        print_step('Started ' + label + ' fold ' + str(i) + '/5')
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.values[dev_index], train.values[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
            dev_X = pd.DataFrame(dev_X, columns=train.columns)
            val_X = pd.DataFrame(val_X, columns=train.columns)
            for (column, dtype) in list(zip(train.columns,
                                            list(train.dtypes))):
                dev_X[column] = dev_X[column].astype(dtype)
                val_X[column] = val_X[column].astype(dtype)
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]

        params2 = params.copy()
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test,
                                           params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_score = eval_fn(val_y, pred_val_y)
        cv_scores.append(eval_fn(val_y, pred_val_y))
        print_step(label + ' cv score ' + str(i) + ' : ' + str(cv_score))
        i += 1
    print_step(label + ' cv scores : ' + str(cv_scores))
    print_step(label + ' mean cv score : ' + str(np.mean(cv_scores)))
    print_step(label + ' std cv score : ' + str(np.std(cv_scores)))
    pred_full_test = pred_full_test / 5.0
    results = {
        'label': label,
        'train': pred_train,
        'test': pred_full_test,
        'cv': cv_scores
    }
    return results
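
A hypothetical call showing how the model_fn plug-ins on this page (runRidge, runLasso, runLGB) slot into this harness; the feature frames, params, and the rmse metric are placeholders, not the project's actual setup:

ridge_params = {'alpha': 5.0}  # illustrative value only
results = run_cv_model(train_features, test_features, target,
                       runRidge, ridge_params, rmse, 'ridge-baseline')
oof_preds, test_preds = results['train'], results['test']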
Example #10
def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    print_step('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    print_step('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval)
    print_step('Feature importance')
    pprint(sorted(list(zip(model.feature_importance(), train_X.columns)), reverse=True))
    print_step('Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
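
An illustrative params dict for runLGB: num_rounds and verbose_eval are popped inside the function and everything else is passed straight to lgb.train (values are assumptions, not the project's tuned settings):

lgb_params = {
    'application': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'num_rounds': 1000,
    'verbose_eval': 50
}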
Example #11
def save_in_cache(key, train, test):
    if isinstance(train, dict):
        train = np.array(train)
        train_path = 'cache/model_' + key
        np.save(train_path, train)
    elif is_csr_matrix(train):
        train_path = 'cache/train_' + key + '.npcsr'
        save_sparse_csr(train_path, train)
        if test is not None:
            test_path = 'cache/test_' + key + '.npcsr'
            save_sparse_csr(test_path, test)
    else:
        train_path = 'cache/train_' + key + '.csv'
        train.to_csv(train_path, index=False)
        if test is not None:
            test_path = 'cache/test_' + key + '.csv'
            test.to_csv(test_path, index=False)
    if test is None:
        print_step('Saved ' + train_path + ' to cache!')
    else:
        print_step('Saved ' + train_path + ' and ' + test_path + ' to cache!')
Example #12
def runRidge(train_X, train_y, test_X, test_y, test_X2, params):
    model = Ridge(**params)
    print_step('Fit Ridge')
    model.fit(train_X, train_y)
    print_step('Ridge Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Ridge Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
Example #13
def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    print_step('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    print_step('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval)
    print_step('Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
Example #14
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    # rounds and iters were undefined in the snippet as given; pop them from
    # params (mirroring runFM in Example #15) so the loop below runs
    rounds = params.pop('rounds')
    iters = params.pop('iters')
    model = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for i in range(rounds):
        model.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('TFFM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
Example #15
def runFM(train_X, train_y, test_X, test_y, test_X2, params):
    params['D'] = train_X.shape[1]
    rounds = params.pop('rounds')
    model = FM_FTRL(**params)
    print_step('Fit FM')
    for i in range(rounds):
        model.fit(train_X, train_y, reset=False)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('FM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
Example #16
def get_img_data(index, image_files):
    print_step('[Core %d] Start' % index)
    if not is_in_cache('img_data_' + str(index)):
        data = []
        i = 0
        for image_file in image_files:
            dat = get_image(image_file)
            if dat:
                data += [get_data_from_image(dat, core=index, i=i)]
            i += 1
            if i % 50 == 0:
                print_step('[Core %d] Completed %d / %d...' %
                           (index, i, len(image_files)))
        print_step('[Core %d] Done. Saving...' % index)
        save_in_cache('img_data_' + str(index), data_to_df(data), None)
    else:
        print(str(index) + ' already in cache! Skipping...')
    return True
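
The index argument suggests get_img_data is meant to be fanned out across cores; a hypothetical driver (n_cores and image_files are assumptions, not from the original script):

import multiprocessing as mp
import numpy as np

n_cores = 8
chunks = np.array_split(image_files, n_cores)
pool = mp.Pool(n_cores)
pool.starmap(get_img_data, enumerate(chunks))  # calls get_img_data(index, chunk)
pool.close()
pool.join()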
Example #17
def run_tfidf(train,
              test,
              ngram_min=1,
              ngram_max=2,
              min_df=5,
              max_features=20000,
              rm_stopwords=True,
              analyzer='word',
              sublinear_tf=False,
              token_pattern=r'(?u)\b\w\w+\b',
              binary=False,
              tokenize=False,
              tokenizer=None):
    rm_stopwords = 'english' if rm_stopwords else None
    strip_accents = 'unicode' if tokenize else None
    tfidf_vec = TfidfVectorizer(ngram_range=(ngram_min, ngram_max),
                                analyzer=analyzer,
                                stop_words=rm_stopwords,
                                strip_accents=strip_accents,
                                token_pattern=token_pattern,
                                tokenizer=tokenizer,
                                min_df=min_df,
                                max_features=max_features,
                                sublinear_tf=sublinear_tf,
                                binary=binary)
    print_step('TFIDF ngrams ' + str(ngram_min) + ' to ' + str(ngram_max) +
               ' on ' + str(analyzer) + ' with strip accents = ' +
               str(strip_accents) + ', token_pattern = ' + str(token_pattern) +
               ', tokenizer = ' + str(tokenizer) + ', rm_stopwords = ' +
               str(rm_stopwords) + ', min_df = ' + str(min_df) +
               ', max_features = ' + str(max_features) + ', sublinear_tf = ' +
               str(sublinear_tf) + ', binary = ' + str(binary))
    train_tfidf = tfidf_vec.fit_transform(train['comment_text'])
    if test is not None:
        print_step('TFIDF 1/2')
        test_tfidf = tfidf_vec.transform(test['comment_text'])
        print_step('TFIDF 2/2')
        print('TFIDF train shape: {}'.format(train_tfidf.shape))
        print('TFIDF test shape: {}'.format(test_tfidf.shape))
    else:
        print('TFIDF train shape: {}'.format(train_tfidf.shape))
        test_tfidf = None
    return train_tfidf, test_tfidf
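
A hypothetical pair of calls showing the word-level and char-level configurations this helper supports (settings are illustrative, not the project's tuned values):

train_word, test_word = run_tfidf(train, test, ngram_min=1, ngram_max=2,
                                  max_features=200000, sublinear_tf=True)
train_char, test_char = run_tfidf(train, test, ngram_min=2, ngram_max=6,
                                  analyzer='char', max_features=100000,
                                  sublinear_tf=True)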
Example #18
def load_cache(key):
    if is_in_cache(key):
        if is_in_cache(key) == 'dict':
            train_path = 'cache/model_' + key + '.npy'
            train = np.load(train_path).tolist()
            test = None
        elif is_in_cache(key) == 'csr':
            train_path = 'cache/train_' + key + '.npcsr.npz'
            train = load_sparse_csr(train_path)
            try:
                test_path = 'cache/test_' + key + '.npcsr.npz'
                test = load_sparse_csr(test_path)
                print('Train shape: {}'.format(train.shape))
                print('Test shape: {}'.format(test.shape))
            except IOError:
                test = None
                print('Train shape: {}'.format(train.shape))
        else:
            train_path = 'cache/train_' + key + '.csv'
            test_path = 'cache/test_' + key + '.csv'
            train = pd.read_csv(train_path)
            try:
                test = pd.read_csv(test_path)
            except IOError:
                test = None
            if 'comment_text' in train.columns:
                print_step('Filling missing')
                train['comment_text'].fillna('missing', inplace=True)
                if test is not None:
                    test['comment_text'].fillna('missing', inplace=True)
            if test is not None:
                print('Test shape: {}'.format(test.shape))
            print('Train shape: {}'.format(train.shape))

        if test is None:
            print_step('Skipped... Loaded ' + train_path + ' from cache!')
        else:
            print_step('Skipped... Loaded ' + train_path + ' and ' +
                       test_path + ' from cache!')
        return train, test
    else:
        raise ValueError(key + ' is not in cache')
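
load_cache and save_in_cache assume an is_in_cache helper that reports which format a key was cached in; a possible sketch consistent with the paths they use (an assumption, not the project's actual cache module):

import os

def is_in_cache(key):
    # return a format marker matching the branches in load_cache above
    if os.path.exists('cache/model_' + key + '.npy'):
        return 'dict'
    if os.path.exists('cache/train_' + key + '.npcsr.npz'):
        return 'csr'
    if os.path.exists('cache/train_' + key + '.csv'):
        return 'csv'
    return False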
Example #19
import re
import string

import pandas as pd
import numpy as np

from nltk.corpus import stopwords

from utils import print_step, bin_and_ohe_data
from cache import get_data, is_in_cache, load_cache, save_in_cache


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('data_with_fe'):
    print('~~~~~~~~~~~~')
    print_step('Merging')
    merge = pd.concat([train, test])

    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Imputation 1/7')
Example #20

def runNBLR(train_X, train_y, test_X, test_y, test_X2, label, dev_index,
            val_index):
    train_y = train_y.values
    r = csr_matrix(np.log(pr(train_X, 1, train_y) / pr(train_X, 0, train_y)))
    model = LogisticRegression(C=4, dual=True)
    x_nb = train_X.multiply(r)
    model.fit(x_nb, train_y)
    pred_test_y = model.predict_proba(test_X.multiply(r))[:, 1]
    pred_test_y2 = model.predict_proba(test_X2.multiply(r))[:, 1]
    return pred_test_y, pred_test_y2
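
runNBLR relies on a pr helper for the naive Bayes log-count ratios; a sketch following the usual NB-SVM formulation (an assumption about the original helper):

def pr(x, y_i, y):
    # smoothed sum of each feature over the rows whose label equals y_i
    p = x[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)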


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()
train['non_toxic'] = train[[
    'toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate'
]].sum(axis=1).apply(lambda x: 0 if x > 1 else 1)
save_in_cache('extra_label', train, test)

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
Example #21
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    pred_test_y = model.predict(test_sparse_matrix)
    pred_test_y2 = model.predict(test_sparse_matrix2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')
    print_step('Filling missing')
    train_cleaned['comment_text'].fillna('missing', inplace=True)
    test_cleaned['comment_text'].fillna('missing', inplace=True)
    print('Train shape: {}'.format(train_cleaned.shape))
    print('Test shape: {}'.format(test_cleaned.shape))
Example #22
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')
    print_step('Filling missing')
    train_cleaned['comment_text'].fillna('missing', inplace=True)
    test_cleaned['comment_text'].fillna('missing', inplace=True)
    print('Train shape: {}'.format(train_cleaned.shape))
    print('Test shape: {}'.format(test_cleaned.shape))
Example #23
maxlen = 100
embed_size = 300
epochs = 4
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_double-gru'):
    train_df, test_df = get_data()

    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    print_step('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
Example #24
PRINT_LGB_FEATURE_IMPORTANCE = False
N_FOLDS = 100

import string

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score as auc

from utils import print_step, ohe, run_cv_model, runLGB, runFFLGB, runLR, runTarget

print_step('Loading')
train = pd.read_csv('train.csv')
print(train.shape)
test = pd.read_csv('test.csv')
print(test.shape)

print_step('Subsetting')
target = train['target']
train_id = train['id']
test_id = test['id']
train.drop(['target', 'id'], axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

print_step('Combine')
traintest = pd.concat([train, test])
Example #25
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval)
    print_step('Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/13')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('title_countvec'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title CountVec 1/2')
    cv = CountVectorizer(stop_words=stopwords.words('russian'),
                         lowercase=True,
Example #26
def run_nn_model(label, model, max_features, maxlen, epochs, batch_size,
                 predict_batch_size):
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    for embedding_name, embedding_file in EMBEDDING_FILES.items():
        if is_in_cache(label + '_' + embedding_name):
            print_step('Already trained ' + label + '_' + embedding_name +
                       '! Skipping...')
        else:
            train_df, test_df = get_data()

            print_step('Loading embed ' + embedding_name + '...')
            embed_size = EMBED_SIZE_LOOKUP[embedding_name]
            x_train, x_test, embedding_matrix = tokenize_and_embed(
                train_df, test_df, embedding_file, max_features, maxlen,
                embed_size, embedding_name)
            y_train = train_df[classes].values

            print_step('Build model...')
            model = model(max_features, maxlen, embed_size, embedding_matrix)
            # save the fresh weights under the same path the CV loop reloads
            # from below (the original saved to a mismatched filename)
            model.save_weights('cache/' + label + '_' + embedding_name +
                               '-model-weights.h5')

            print_step('Making KFold for CV')
            kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

            i = 1
            cv_scores = []
            pred_train = np.zeros((train_df.shape[0], 6))
            pred_full_test = np.zeros((test_df.shape[0], 6))
            for dev_index, val_index in kf.split(x_train, y_train[:, 0]):
                print_step('Started fold ' + str(i))
                model.load_weights('cache/' + label + '_' + embedding_name +
                                   '-model-weights.h5')
                dev_X, val_X = x_train[dev_index], x_train[val_index]
                dev_y, val_y = y_train[dev_index, :], y_train[val_index, :]
                RocAuc = RocAucEvaluation(validation_data=(val_X, val_y),
                                          interval=1)
                model.fit(dev_X,
                          dev_y,
                          batch_size=batch_size,
                          epochs=epochs,
                          validation_data=(val_X, val_y),
                          callbacks=[RocAuc])
                val_pred = model.predict(val_X,
                                         batch_size=predict_batch_size,
                                         verbose=1)
                pred_train[val_index, :] = val_pred
                test_pred = model.predict(x_test,
                                          batch_size=predict_batch_size,
                                          verbose=1)
                pred_full_test = pred_full_test + test_pred
                cv_score = [
                    roc_auc_score(val_y[:, j], val_pred[:, j])
                    for j in range(6)
                ]
                print_step('Fold ' + str(i) + ' done')
                pprint(zip(classes, cv_score))
                cv_scores.append(cv_score)
                i += 1
            print_step('All folds done!')
            print('CV scores')
            pprint(zip(classes, np.mean(cv_scores, axis=0)))
            mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
            print('mean cv score : ' + str(mean_cv_score))
            pred_full_test = pred_full_test / 5.
            for k, classx in enumerate(classes):
                # use the label/embedding prefix so the submission block below
                # can find these columns (the original hardcoded 'gru_')
                train_df[label + '_' + embedding_name + '_' + classx] = pred_train[:, k]
                test_df[label + '_' + embedding_name + '_' + classx] = pred_full_test[:, k]

            print('~~~~~~~~~~~~~~~~~~')
            print_step('Cache Level 1')
            save_in_cache('lvl1_' + label + '_' + embedding_name, train_df,
                          test_df)
            print_step('Done!')

            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            print_step('Prepping submission file')
            submission = pd.DataFrame()
            submission['id'] = test_df['id']
            submission['toxic'] = test_df[label + '_' + embedding_name +
                                          '_toxic']
            submission['severe_toxic'] = test_df[label + '_' + embedding_name +
                                                 '_severe_toxic']
            submission['obscene'] = test_df[label + '_' + embedding_name +
                                            '_obscene']
            submission['threat'] = test_df[label + '_' + embedding_name +
                                           '_threat']
            submission['insult'] = test_df[label + '_' + embedding_name +
                                           '_insult']
            submission['identity_hate'] = test_df[label + '_' +
                                                  embedding_name +
                                                  '_identity_hate']
            submission.to_csv('submit/submit_lvl1_' + label + '_' +
                              embedding_name + '.csv',
                              index=False)
            print_step('Done')
Example #27
def tokenize_and_embed(train_df, test_df, embedding_file, max_features, maxlen,
                       embed_size, label):
    X_train = train_df['comment_text'].fillna('peterhurford').values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))

    print_step('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    print_step('Defining pre-trained embedding (' + label + ')')

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = dict(
        get_coefs(*o.rstrip().rsplit(' ')) for o in open(embedding_file))
    if label != 'local':
        print_step('Defining local embedding')
        local_embeddings_index = dict(
            get_coefs(*o.rstrip().rsplit(' '))
            for o in open('cache/local_fasttext_model.vec'))

    print_step('Defining tokenization - embedding scheme')
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))
    if label != 'local':
        local_embedding_matrix = np.zeros((nb_words, 100))
        print_step('Calculating pre-trained <-> local shared words')
        # wrap the key views in list() so np.intersect1d works under Python 3
        shared_words = np.intersect1d(list(embeddings_index.keys()),
                                      list(local_embeddings_index.keys()))
        reference_matrix = np.array(
            [local_embeddings_index.get(w) for w in shared_words])
        reference_matrix = normalize(reference_matrix).T

    non_alphas = re.compile(u'[^A-Za-z]+')
    stemmer = PorterStemmer()

    print_step('Beginning embedding')
    for word, i in word_index.items():
        if i >= max_features: continue
        # First try to find the embedding vector as-is
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Second, try to replace in' -> ing
            print("in' -> ing")
            word = word.replace("in'", "ing")
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is None:
                # Third, remove all non-letters
                print('remove punct')
                new_word = non_alphas.sub('', word)
                if new_word == '':
                    # If the word is now blank, replace with null embedding
                    print('blank')
                    embedding_vector = np.zeros(embed_size)
                else:
                    try:
                        embedding_vector = embeddings_index.get(new_word)
                        if embedding_vector is None:
                            # Otherwise, try Porter stemming
                            print('stem')
                            stemmed_word = stemmer.stem(new_word)
                            embedding_vector = embeddings_index.get(
                                stemmed_word)
                            if embedding_vector is None:
                                # Fifth, impute from local FastText
                                print('local impute: ' + word + '(' +
                                      new_word + ')')
                                if local_embeddings_index.get(
                                        new_word) is not None:
                                    lookup_matrix = normalize(
                                        [local_embeddings_index.get(new_word)])
                                    similarity = np.matmul(
                                        lookup_matrix, reference_matrix)
                                    similar_word = shared_words[np.argmax(
                                        similarity)]
                                    embedding_vector = embeddings_index.get(
                                        similar_word)
                                    print(word + ' -> ' + similar_word)
                                else:
                                    # Sixth, try to correct a contraction
                                    if "'s" in word:
                                        new_word = word.replace("'s", "")
                                        print('local impute: ' + word + '(' +
                                              new_word + ')')
                                        if local_embeddings_index.get(
                                                new_word) is not None:
                                            lookup_matrix = normalize([
                                                local_embeddings_index.get(
                                                    new_word)
                                            ])
                                            similarity = np.matmul(
                                                lookup_matrix,
                                                reference_matrix)
                                            similar_word = shared_words[
                                                np.argmax(similarity)]
                                            embedding_vector = embeddings_index.get(
                                                similar_word)
                                            print(word + ' -> ' + similar_word)
                                    if embedding_vector is None:
                                        print('normalize text')
                                        new_words = normalize_text(word).split()
                                        if (len(new_words) == 2 and
                                                embeddings_index.get(new_words[0]) is not None and
                                                embeddings_index.get(new_words[1]) is not None):
                                            # average the two embeddings (the original
                                            # only divided the second vector by 2)
                                            embedding_vector = (embeddings_index.get(new_words[0]) +
                                                                embeddings_index.get(new_words[1])) / 2
                                            print(word + ' -> ' + ' '.join(new_words))
                                        else:
                                            print('spell correct')
                                            # Seventh, try to spell correct
                                            try:
                                                new_word = str(
                                                    TextBlob(word).correct())
                                            except:
                                                new_word = word
                                            embedding_vector = embeddings_index.get(
                                                new_word)
                                            if embedding_vector is not None:
                                                print(word + ' -> ' +
                                                      str(new_word))
                                            else:
                                                # Eighth, give up
                                                print('Giving up on ' +
                                                      str(word))
                                                import pdb
                                                pdb.set_trace()
                                                embedding_vector = np.zeros(
                                                    embed_size)
                    except Exception as e:
                        print('error')
                        import pdb
                        pdb.set_trace()

        embedding_matrix[i] = embedding_vector

    return x_train, x_test, embedding_matrix
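
A hypothetical call mirroring how run_nn_model (Example #26) uses this helper; the embedding file path and sizes are placeholders:

x_train, x_test, embedding_matrix = tokenize_and_embed(
    train_df, test_df, 'embeddings/crawl-300d-2M.vec',
    max_features=100000, maxlen=100, embed_size=300, label='fasttext')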
Example #28
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if is_in_cache('lgb_fe_with_embeddings_and_svd'):
    train, test = load_cache('lgb_fe_with_embeddings_and_svd')
else:
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data')
    train, test = get_data()
    if is_in_cache('fe_lgb_data'):
        train_fe, test_fe = load_cache('fe_lgb_data')
    else:
        print_step('Adding Features')
        train_fe, test_fe = add_features(train, test)
        print_step('Dropping')
        train_fe.drop(['id', 'comment_text'], axis=1, inplace=True)
        test_fe.drop(['id', 'comment_text'], axis=1, inplace=True)
        train_fe.drop([
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ],
                      axis=1,
                      inplace=True)
Example #29
from utils import CHARACTERS, print_step

HOLDOUT = False

lines = pd.read_csv('scooby_doo_lines.csv')

if HOLDOUT:
    train_lines, test_lines, train_character, test_character = train_test_split(
        lines['line'], lines['character'], test_size=0.1, random_state=42)
else:
    train_lines = lines['line']
    test_lines = None
    train_character = lines['character']
    test_character = None

print_step('TFIDF')
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        max_features=10000,
                        min_df=2,
                        max_df=0.8,
                        binary=True)
tfidf_train = tfidf.fit_transform(train_lines)
print(tfidf_train.shape)
if HOLDOUT:
    tfidf_test = tfidf.transform(test_lines)
    print(tfidf_test.shape)

all_test_preds = defaultdict(lambda: '')
all_train_preds = defaultdict(lambda: '')
models = defaultdict(lambda: '')
Example #30
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    # rounds and iters were undefined in the snippet as given; pop them from
    # params (mirroring runFM in Example #15) so the loop below runs
    rounds = params.pop('rounds')
    iters = params.pop('iters')
    model = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for i in range(rounds):
        model.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('TFFM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/2')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/2')
train, test = load_cache('complete_fm_data')

print('~~~~~~~~~~~~~~')