def run_query(comment_text, idx):
    """Request attribute scores for one comment from the Comment Analyzer endpoint.

    The text is passed through ``normalize_text``, limited to 2999 characters,
    and replaced by the literal ``'empty'`` when nothing is left.

    Args:
        comment_text: Raw comment text to score.
        idx: Identifier appended to the result as ``('id', idx)``.

    Returns:
        A list of ``(attribute_name, summary_score)`` tuples, one per entry of
        the response's ``attributeScores``, followed by ``('id', idx)``.

    Raises:
        Exception: Any failure while posting the request or reading the
            response is reported through ``print_step`` and re-raised.
    """
    value = normalize_text(comment_text)
    # Slicing is a no-op for strings shorter than the limit.
    value = value[:2999]
    if len(value) == 0:
        value = 'empty'

    requested_attributes = {
        'TOXICITY': {},
        'ATTACK_ON_AUTHOR': {},
        'ATTACK_ON_COMMENTER': {},
        'INCOHERENT': {},
        'INFLAMMATORY': {},
        'LIKELY_TO_REJECT': {},
        'OBSCENE': {},
        'SEVERE_TOXICITY': {},
        'SPAM': {},
        'UNSUBSTANTIAL': {}
    }
    payload = {
        'comment': {
            'text': value
        },
        'languages': ['en'],
        'requestedAttributes': requested_attributes
    }
    try:
        rr = requests.post(
            'https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze',
            params={'key': CONVAI_KEY},
            data=json.dumps(payload))
        scores = rr.json()['attributeScores']
        return [(k, v['summaryScore']['value'])
                for k, v in scores.items()] + [('id', idx)]
    except Exception:
        # Previously this opened an interactive debugger and then fell through
        # to an implicit ``None``; surface the failure to the caller instead.
        print_step('FATAL ABORT:')
        raise
def run_ridge_on_cat(cat):
    """Fit the per-category blend model for ``cat`` and cache its predictions.

    Rows are selected from the module-level ``train_`` / ``test_`` frames using
    the ``parent_category_name`` column of the module-level ``train`` /
    ``test`` frames. The out-of-fold and test predictions are stored under the
    cache key ``'cat_ridges_blend_l3_' + cat`` together with ``item_id``.

    Args:
        cat: Category value to subset on.

    Returns:
        ``True``, whether the category was just modelled or was already cached.
    """
    cache_key = 'cat_ridges_blend_l3_' + cat
    if is_in_cache(cache_key):
        print_step('Already have ' + cat + '...')
        return True

    print_step(cat + ' > Subsetting')
    cat_train = train_[train['parent_category_name'] == cat].copy()
    cat_test = test_[test['parent_category_name'] == cat].copy()
    print(cat_train.shape)
    print(cat_test.shape)

    # Keep the target and ids aside; they must not be fed to the model.
    y = cat_train['deal_probability'].values
    ids_train = cat_train['item_id']
    ids_test = cat_test['item_id']
    cat_train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
    cat_test.drop('item_id', axis=1, inplace=True)

    print_step(cat + ' > Modeling')
    cv_out = run_cv_model(cat_train, cat_test, y, runLasso, params, rmse,
                          cat + '-ridge-blend')
    cat_train['cat_ridge'] = cv_out['train']
    cat_test['cat_ridge'] = cv_out['test']
    score = rmse(y, cat_train['cat_ridge'])
    print_step(cat + ' > RMSE: ' + str(score))

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print_step(cat + ' > Saving in Cache')
    cat_train['item_id'] = ids_train
    cat_test['item_id'] = ids_test
    keep = ['item_id', 'cat_ridge']
    save_in_cache(cache_key, cat_train[keep], cat_test[keep])
    return True
def run_with_target(label, target, data_key, model_fn, kf, train_key=None, eval_fn=None):
    """Cross-validate ``model_fn`` for one target column and cache the result.

    If the cache already holds ``label + '_' + target`` the stored result is
    returned and nothing is trained.

    Args:
        label: Name of the model; used in log lines and in the cache key.
        target: Name of the target column in the training frame.
        data_key: Cache key of the (train, test) feature matrices.
        model_fn: Callable invoked per fold as
            ``model_fn(dev_X, dev_y, val_X, val_y, test_X, target, dev_index,
            val_index)`` and returning ``(val_predictions, test_predictions)``.
        kf: Splitter exposing ``split``; a ``StratifiedKFold`` is given the
            target so that it can stratify.
        train_key: Optional cache key for the frame that holds the target
            column; ``get_data()`` is used when omitted.
        eval_fn: Metric called as ``eval_fn(val_y, val_predictions)``.
            Required whenever training actually runs.

    Returns:
        A dict with keys ``label``, ``target``, ``train`` (out-of-fold
        predictions), ``test`` (test predictions averaged over the folds) and
        ``cv`` (per-fold scores).

    Raises:
        TypeError: If training is needed and ``eval_fn`` is ``None``.
    """
    cache_key = label + '_' + target
    if is_in_cache(cache_key):
        return load_cache(cache_key)[0]

    if eval_fn is None:
        # Fail before any fold is trained instead of after the first one.
        raise TypeError('eval_fn must be a callable when training is required')

    print('-')
    print_step('Training ' + target)
    if train_key is None:
        train, test = get_data()
    else:
        train, test = load_cache(train_key)
    post_train, post_test = load_cache(data_key)
    if isinstance(post_train, pd.DataFrame):
        post_train = post_train.values
        post_test = post_test.values

    train_y = train[target]
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train.shape[0])
    if isinstance(kf, StratifiedKFold):
        fold_splits = kf.split(post_train, train_y)
    else:
        fold_splits = kf.split(post_train)

    for i, (dev_index, val_index) in enumerate(fold_splits, start=1):
        print_step('Started ' + label + ' ' + target + ' fold ' + str(i))
        dev_X, val_X = post_train[dev_index], post_train[val_index]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y,
                                           post_test, target, dev_index,
                                           val_index)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_score = eval_fn(val_y, pred_val_y)
        cv_scores.append(cv_score)
        print_step(label + ' ' + target + ' cv score ' + str(i) + ' : ' +
                   str(cv_score))

    print_step(label + ' ' + target + ' cv scores : ' + str(cv_scores))
    mean_cv_score = np.mean(cv_scores)
    print_step(label + ' ' + target + ' mean cv score : ' + str(mean_cv_score))
    # Average over the folds that actually ran rather than assuming five.
    pred_full_test = pred_full_test / float(len(cv_scores))
    results = {
        'label': label,
        'target': target,
        'train': pred_train,
        'test': pred_full_test,
        'cv': cv_scores
    }
    save_in_cache(cache_key, results, None)
    return results
def runLasso(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``Lasso`` model and predict the validation and test sets.

    Each ``(column, coefficient)`` pair of the fitted model is printed.

    Args:
        train_X: Training features; must expose ``columns``.
        train_y: Training target.
        val_X: Validation features.
        val_y: Validation target (not used here).
        test_X: Test features.
        params: Keyword arguments forwarded to ``Lasso``.

    Returns:
        ``(validation_predictions, test_predictions)``.
    """
    lasso = Lasso(**params)
    lasso.fit(train_X, train_y)
    for column_and_coef in zip(train_X.columns, lasso.coef_):
        print(column_and_coef)

    print_step('Predict Val 1/2')
    val_predictions = lasso.predict(val_X)
    print_step('Predict Test 2/2')
    test_predictions = lasso.predict(test_X)
    return val_predictions, test_predictions
def get_data():
    """Load the train and test CSVs and fill missing comment text.

    Reads ``data/train.csv`` and ``data/test.csv``, replaces missing values in
    the ``comment_text`` column with the string ``'missing'`` and prints the
    frame shapes before and after filling.

    Returns:
        ``(train, test)`` as pandas DataFrames.
    """
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    print('Train shape: {}'.format(train.shape))
    print('Test shape: {}'.format(test.shape))

    print_step('Filling missing')
    # Assign the filled column back: ``df[col].fillna(..., inplace=True)``
    # operates on a temporary and leaves the frame untouched under pandas
    # Copy-on-Write.
    train['comment_text'] = train['comment_text'].fillna('missing')
    test['comment_text'] = test['comment_text'].fillna('missing')
    print('Train shape: {}'.format(train.shape))
    print('Test shape: {}'.format(test.shape))
    return train, test
def runSparseLGB(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    """Select features with a logistic model, then train LightGBM on them.

    A ``SelectFromModel`` wrapper around ``LogisticRegression(solver='sag')``
    with threshold 0.2 is fitted on the training data and applied to all three
    matrices. The number of boosting rounds is looked up by ``label``.

    Args:
        train_X: Training features.
        train_y: Training labels.
        test_X: Validation features.
        test_y: Validation labels.
        test_X2: Test features.
        label: Target name; must be a key of the rounds table below.
        dev_index: Unused here.
        val_index: Unused here.

    Returns:
        ``(validation_predictions, test_predictions)``.
    """
    print_step('Get K Best')
    selector = SelectFromModel(LogisticRegression(solver='sag'), threshold=0.2)
    print(train_X.shape)
    train_selected = selector.fit_transform(train_X, train_y)
    print(train_selected.shape)
    val_selected = selector.transform(test_X)
    test_selected = selector.transform(test_X2)

    # Drop the local references to the full matrices before building datasets.
    del train_X, test_X, test_X2
    gc.collect()

    d_train = lgb.Dataset(train_selected, label=train_y)
    d_valid = lgb.Dataset(val_selected, label=test_y)
    lgb_params = {
        'boosting': 'dart',
        'learning_rate': 0.1,
        'application': 'binary',
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'auc',
        'data_random_seed': 2,
        'bagging_fraction': 0.8,
        'feature_fraction': 0.1,
        'nthread': min(mp.cpu_count() - 1, 6),
        'lambda_l1': 1,
        'lambda_l2': 1,
        'min_data_in_leaf': 40
    }
    rounds_by_label = {
        'toxic': 1400,
        'severe_toxic': 500,
        'obscene': 550,
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    booster = lgb.train(lgb_params,
                        train_set=d_train,
                        num_boost_round=rounds_by_label[label],
                        valid_sets=[d_train, d_valid],
                        verbose_eval=10)
    val_predictions = booster.predict(val_selected)
    test_predictions = booster.predict(test_selected)
    return val_predictions, test_predictions
def random_inference(self):
    """Play one episode with actions sampled from the action space.

    After every step the environment is rendered, the step is reported through
    ``print_step``, execution pauses for 0.25 seconds and the terminal is
    cleared.
    """
    self.env.reset()
    step_count = 0
    reward_sum = 0
    finished = False
    # One game round of randomness.
    while not finished:
        action = self.env.action_space.sample()
        state, reward, finished, _ = self.env.step(action)
        step_count += 1
        reward_sum += reward
        self.env.render()
        print_step(step_count, state, action, reward, reward_sum)
        time.sleep(0.25)
        os.system('clear')
def validation(self, sleep_timer=0):
    """Play one episode, always taking the highest-valued action in ``q_table``.

    Args:
        sleep_timer: When greater than zero, the terminal is cleared, the
            environment rendered and the step printed after every action,
            followed by a pause of this many seconds.

    Returns:
        ``(steps, cum_reward)``: number of steps taken and the summed reward.
    """
    state = self.env.reset()
    steps = 0
    cum_reward = 0
    finished = False
    show = sleep_timer > 0
    while not finished:
        action = np.argmax(self.q_table[state])
        state, reward, finished, _ = self.env.step(action)
        if show:
            os.system('clear')
            self.env.render()
            # Reported before the counters below are advanced for this step.
            print_step(steps, state, action, reward, cum_reward)
            time.sleep(sleep_timer)
        cum_reward += reward
        steps += 1
    return steps, cum_reward
def run_cv_model(train, test, target, model_fn, params, eval_fn, label):
    """Run 5-fold cross-validation of ``model_fn`` and average test predictions.

    Folds come from ``KFold(shuffle=True, random_state=2017)``. Each fold gets
    its own copy of ``params``.

    Args:
        train: Training features, either a DataFrame or something indexable by
            an integer array.
        test: Test features, passed to ``model_fn`` unchanged.
        target: Training target, indexable by an integer array.
        model_fn: Callable invoked as
            ``model_fn(dev_X, dev_y, val_X, val_y, test, params)`` and
            returning ``(val_predictions, test_predictions)``.
        params: Dict of model parameters; copied per fold.
        eval_fn: Metric called as ``eval_fn(val_y, val_predictions)``.
        label: Name used in log lines and stored in the result.

    Returns:
        A dict with keys ``label``, ``train`` (out-of-fold predictions),
        ``test`` (test predictions averaged over the folds) and ``cv``
        (per-fold scores).
    """
    # Single source of truth for the fold count used by the splitter, the
    # progress message and the test-prediction average.
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2017)
    is_frame = isinstance(train, pd.DataFrame)

    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train.shape[0])
    for i, (dev_index, val_index) in enumerate(kf.split(train), start=1):
        print_step('Started ' + label + ' fold ' + str(i) + '/' + str(n_splits))
        if is_frame:
            # Positional row selection keeps column dtypes and yields a fresh
            # 0..n-1 index, without materialising the whole frame as an array.
            dev_X = train.iloc[dev_index].reset_index(drop=True)
            val_X = train.iloc[val_index].reset_index(drop=True)
        else:
            dev_X, val_X = train[dev_index], train[val_index]
        dev_y, val_y = target[dev_index], target[val_index]

        # model_fn may pop entries from its params, so hand it a copy.
        params2 = params.copy()
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test,
                                           params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_score = eval_fn(val_y, pred_val_y)
        cv_scores.append(cv_score)
        print_step(label + ' cv score ' + str(i) + ' : ' + str(cv_score))

    print_step(label + ' cv scores : ' + str(cv_scores))
    print_step(label + ' mean cv score : ' + str(np.mean(cv_scores)))
    print_step(label + ' std cv score : ' + str(np.std(cv_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {
        'label': label,
        'train': pred_train,
        'test': pred_full_test,
        'cv': cv_scores
    }
    return results
def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    """Train LightGBM, print feature importances and predict two sets.

    Args:
        train_X: Training features; must expose ``columns``.
        train_y: Training labels.
        test_X: Validation features.
        test_y: Validation labels.
        test_X2: Test features.
        params: LightGBM parameters plus the keys ``num_rounds`` and
            ``verbose_eval``, which are taken out before training. The
            caller's dict is left unmodified.

    Returns:
        ``(validation_predictions, test_predictions)``.

    Raises:
        KeyError: If ``num_rounds`` or ``verbose_eval`` is missing.
    """
    # Work on a copy so that popping does not strip keys from the caller's
    # dict (which would make a second call with the same dict fail).
    params = dict(params)
    print_step('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]

    print_step('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval)

    print_step('Feature importance')
    pprint(sorted(zip(model.feature_importance(), train_X.columns),
                  reverse=True))

    print_step('Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
def save_in_cache(key, train, test):
    """Persist ``train`` (and optionally ``test``) under ``cache/``.

    The storage format depends on ``train``:

    * dict: saved with ``np.save`` to ``cache/model_<key>``; ``test`` is not
      written in this case.
    * CSR matrix (per ``is_csr_matrix``): saved with ``save_sparse_csr`` to
      ``cache/train_<key>.npcsr`` and ``cache/test_<key>.npcsr``.
    * anything else: written with ``to_csv`` to ``cache/train_<key>.csv`` and
      ``cache/test_<key>.csv``.

    Args:
        key: Cache key used to build the file names.
        train: Object to store.
        test: Optional companion object; skipped when ``None``.
    """
    # Stays None unless a test file is really written, so the final message
    # never refers to a path that was not produced (the dict branch used to
    # hit an unbound name when ``test`` was given).
    test_path = None
    if isinstance(train, dict):
        train_path = 'cache/model_' + key
        np.save(train_path, np.array(train))
    elif is_csr_matrix(train):
        train_path = 'cache/train_' + key + '.npcsr'
        save_sparse_csr(train_path, train)
        if test is not None:
            test_path = 'cache/test_' + key + '.npcsr'
            save_sparse_csr(test_path, test)
    else:
        train_path = 'cache/train_' + key + '.csv'
        train.to_csv(train_path, index=False)
        if test is not None:
            test_path = 'cache/test_' + key + '.csv'
            test.to_csv(test_path, index=False)

    if test_path is None:
        print_step('Saved ' + train_path + ' to cache!')
    else:
        print_step('Saved ' + train_path + ' and ' + test_path + ' to cache!')
def runRidge(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a ``Ridge`` model and predict the validation and test sets.

    Args:
        train_X: Training features.
        train_y: Training target.
        test_X: Validation features.
        test_y: Validation target (not used here).
        test_X2: Test features.
        params: Keyword arguments forwarded to ``Ridge``.

    Returns:
        ``(validation_predictions, test_predictions)``.
    """
    ridge = Ridge(**params)

    print_step('Fit Ridge')
    ridge.fit(train_X, train_y)

    print_step('Ridge Predict 1/2')
    val_predictions = ridge.predict(test_X)

    print_step('Ridge Predict 2/2')
    test_predictions = ridge.predict(test_X2)

    return val_predictions, test_predictions
def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    """Train LightGBM and predict the validation and test sets.

    Args:
        train_X: Training features.
        train_y: Training labels.
        test_X: Validation features.
        test_y: Validation labels.
        test_X2: Test features.
        params: LightGBM parameters plus the keys ``num_rounds`` and
            ``verbose_eval``, which are taken out before training. The
            caller's dict is left unmodified.

    Returns:
        ``(validation_predictions, test_predictions)``.

    Raises:
        KeyError: If ``num_rounds`` or ``verbose_eval`` is missing.
    """
    # Work on a copy so that popping does not strip keys from the caller's
    # dict (which would make a second call with the same dict fail).
    params = dict(params)
    print_step('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]

    print_step('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval)

    print_step('Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a ``TFFMRegressor`` over several rounds and predict two sets.

    After every round the validation set is predicted and its RMSE reported.

    NOTE(review): ``rounds`` and ``iters`` are not parameters and are not
    assigned in this function, so they must be supplied by the enclosing
    module scope; confirm where they are defined.

    Args:
        train_X: Training features.
        train_y: Training target; its ``.values`` are passed to ``fit``.
        test_X: Validation features.
        test_y: Validation target, used for the per-round RMSE.
        test_X2: Test features.
        params: Keyword arguments forwarded to ``TFFMRegressor``.

    Returns:
        ``(validation_predictions, test_predictions)``; the validation
        predictions are those of the last round.
    """
    regressor = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for round_idx in range(rounds):
        regressor.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = regressor.predict(test_X)
        round_rmse = rmse(pred_test_y, test_y)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            round_idx + 1, rounds, round_rmse))

    print_step('TFFM Predict 2/2')
    pred_test_y2 = regressor.predict(test_X2)
    return pred_test_y, pred_test_y2
def runFM(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit an ``FM_FTRL`` model over several rounds and predict two sets.

    After every round the validation set is predicted and its RMSE reported.

    Args:
        train_X: Training features; ``shape[1]`` is passed to the model as
            ``D``.
        train_y: Training target.
        test_X: Validation features.
        test_y: Validation target, used for the per-round RMSE.
        test_X2: Test features.
        params: Keyword arguments for ``FM_FTRL`` plus the key ``rounds``
            (number of ``fit`` calls). The caller's dict is left unmodified.

    Returns:
        ``(validation_predictions, test_predictions)``; the validation
        predictions are those of the last round.

    Raises:
        KeyError: If ``rounds`` is missing from ``params``.
    """
    # Copy first: ``D`` is injected and ``rounds`` removed below, and neither
    # change should leak into the dict owned by the caller.
    params = dict(params)
    params['D'] = train_X.shape[1]
    rounds = params.pop('rounds')
    model = FM_FTRL(**params)

    print_step('Fit FM')
    for i in range(rounds):
        # reset=False is passed on every call so that rounds build on each
        # other rather than starting over.
        model.fit(train_X, train_y, reset=False)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))

    print_step('FM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
def get_img_data(index, image_files):
    """Extract data from a batch of image files and cache it as one frame.

    Nothing is computed when the cache already holds
    ``'img_data_' + str(index)``.

    Args:
        index: Worker/core number; used in log lines and in the cache key.
        image_files: Image files to process.

    Returns:
        ``True`` in every case.
    """
    print_step('[Core %d] Start' % index)
    cache_key = 'img_data_' + str(index)
    if is_in_cache(cache_key):
        print(str(index) + ' already in cache! Skipping...')
        return True

    rows = []
    for position, image_file in enumerate(image_files):
        image = get_image(image_file)
        # Files for which get_image returns something falsy are skipped.
        if image:
            rows.append(get_data_from_image(image, core=index, i=position))
        processed = position + 1
        if processed % 50 == 0:
            print_step('[Core %d] Completed %d / %d...' %
                       (index, processed, len(image_files)))

    print_step('[Core %d] Done. Saving...' % index)
    save_in_cache(cache_key, data_to_df(rows), None)
    return True
def run_tfidf(train, test, ngram_min=1, ngram_max=2, min_df=5, max_features=20000,
              rm_stopwords=True, analyzer='word', sublinear_tf=False,
              token_pattern=r'(?u)\b\w\w+\b', binary=False, tokenize=False,
              tokenizer=None):
    """Fit a TF-IDF vectorizer on ``train['comment_text']`` and transform both sets.

    Args:
        train: Frame with a ``comment_text`` column; the vectorizer is fitted
            on it.
        test: Frame with a ``comment_text`` column, or ``None`` to skip it.
        ngram_min: Lower bound of the n-gram range.
        ngram_max: Upper bound of the n-gram range.
        min_df: Forwarded to ``TfidfVectorizer``.
        max_features: Forwarded to ``TfidfVectorizer``.
        rm_stopwords: When truthy, ``stop_words='english'`` is used.
        analyzer: Forwarded to ``TfidfVectorizer``.
        sublinear_tf: Forwarded to ``TfidfVectorizer``.
        token_pattern: Forwarded to ``TfidfVectorizer``.
        binary: Forwarded to ``TfidfVectorizer``.
        tokenize: When truthy, ``strip_accents='unicode'`` is used.
        tokenizer: Forwarded to ``TfidfVectorizer``.

    Returns:
        ``(train_tfidf, test_tfidf)``; ``test_tfidf`` is ``None`` when
        ``test`` is ``None``.
    """
    if rm_stopwords:
        rm_stopwords = 'english'
    else:
        rm_stopwords = None
    if tokenize:
        strip_accents = 'unicode'
    else:
        strip_accents = None

    vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max),
                                 analyzer=analyzer,
                                 stop_words=rm_stopwords,
                                 strip_accents=strip_accents,
                                 token_pattern=token_pattern,
                                 tokenizer=tokenizer,
                                 min_df=min_df,
                                 max_features=max_features,
                                 sublinear_tf=sublinear_tf,
                                 binary=binary)

    summary = (
        'TFIDF ngrams {} to {} on {} with strip accents = {}'
        ', token_pattern = {}, tokenizer = {}, rm_stopwords = {}'
        ', min_df = {}, max_features = {}, sublinear_tf = {}, binary = {}'
    ).format(ngram_min, ngram_max, analyzer, strip_accents, token_pattern,
             tokenizer, rm_stopwords, min_df, max_features, sublinear_tf,
             binary)
    print_step(summary)

    train_tfidf = vectorizer.fit_transform(train['comment_text'])
    if test is None:
        print('TFIDF train shape: {}'.format(train_tfidf.shape))
        return train_tfidf, None

    print_step('TFIDF 1/2')
    test_tfidf = vectorizer.transform(test['comment_text'])
    print_step('TFIDF 2/2')
    print('TFIDF train shape: {}'.format(train_tfidf.shape))
    print('TFIDF test shape: {}'.format(test_tfidf.shape))
    return train_tfidf, test_tfidf
def load_cache(key):
    """Load the objects stored under ``key`` by ``save_in_cache``.

    The value returned by ``is_in_cache(key)`` selects the format: ``'dict'``
    reads ``cache/model_<key>.npy``, ``'csr'`` reads the ``.npcsr.npz`` pair,
    and any other truthy value reads the CSV pair. For CSVs, missing
    ``comment_text`` values are replaced with ``'missing'``.

    Args:
        key: Cache key.

    Returns:
        ``(train, test)``; ``test`` is ``None`` for dict entries and whenever
        the test file cannot be read.

    Raises:
        ValueError: If ``key`` is not in the cache.
    """
    # Query the cache once; the answer doubles as the storage format.
    cache_kind = is_in_cache(key)
    if not cache_kind:
        raise ValueError('No cache entry found for key: ' + str(key))

    if cache_kind == 'dict':
        train_path = 'cache/model_' + key + '.npy'
        # The dict was stored as a 0-d object array, which NumPy only reads
        # back when pickles are explicitly allowed. Only use this on cache
        # files this project wrote itself: unpickling untrusted files can
        # execute arbitrary code.
        train = np.load(train_path, allow_pickle=True).tolist()
        test = None
    elif cache_kind == 'csr':
        train_path = 'cache/train_' + key + '.npcsr.npz'
        train = load_sparse_csr(train_path)
        try:
            test_path = 'cache/test_' + key + '.npcsr.npz'
            test = load_sparse_csr(test_path)
            print('Train shape: {}'.format(train.shape))
            print('Test shape: {}'.format(test.shape))
        except IOError:
            test = None
            print('Train shape: {}'.format(train.shape))
    else:
        train_path = 'cache/train_' + key + '.csv'
        test_path = 'cache/test_' + key + '.csv'
        train = pd.read_csv(train_path)
        try:
            test = pd.read_csv(test_path)
        except IOError:
            test = None
        if 'comment_text' in train.columns:
            print_step('Filling missing')
            # Assign back instead of a chained in-place fill, which does not
            # reach the frame under pandas Copy-on-Write.
            train['comment_text'] = train['comment_text'].fillna('missing')
            if test is not None:
                test['comment_text'] = test['comment_text'].fillna('missing')
        if test is not None:
            print('Test shape: {}'.format(test.shape))
        print('Train shape: {}'.format(train.shape))

    if test is None:
        print_step('Skipped... Loaded ' + train_path + ' from cache!')
    else:
        print_step('Skipped... Loaded ' + train_path + ' and ' + test_path +
                   ' from cache!')
    return train, test
# Script excerpt: load the data, split off target/ids and start building the
# 'data_with_fe' cache entry. The excerpt ends inside the cache-miss branch.
import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from utils import print_step, bin_and_ohe_data
from cache import get_data, is_in_cache, load_cache, save_in_cache

print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
# Keep the target and the ids aside, then remove them from the feature frames.
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

# Only runs when 'data_with_fe' has not been cached yet.
if not is_in_cache('data_with_fe'):
    print('~~~~~~~~~~~~')
    print_step('Merging')
    # Stack train and test rows into one frame.
    merge = pd.concat([train, test])
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Imputation 1/7')
def runNBLR(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    """Fit logistic regression on features scaled by a log-ratio vector.

    The ratio is ``log(pr(train_X, 1, y) / pr(train_X, 0, y))`` where ``pr``
    is defined elsewhere; every feature matrix is multiplied element-wise by
    it before being given to the classifier.

    Args:
        train_X: Training features; must support ``.multiply``.
        train_y: Training labels; its ``.values`` are used.
        test_X: Validation features.
        test_y: Validation labels (not used here).
        test_X2: Test features.
        label: Unused here.
        dev_index: Unused here.
        val_index: Unused here.

    Returns:
        ``(validation_probabilities, test_probabilities)`` for class 1.
    """
    train_y = train_y.values
    r = csr_matrix(np.log(pr(train_X, 1, train_y) / pr(train_X, 0, train_y)))
    # The dual formulation is only implemented by the liblinear solver, so
    # name it explicitly rather than relying on the library default.
    model = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = train_X.multiply(r)
    model.fit(x_nb, train_y)
    pred_test_y = model.predict_proba(test_X.multiply(r))[:, 1]
    pred_test_y2 = model.predict_proba(test_X2.multiply(r))[:, 1]
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

# NOTE(review): with ``x > 1`` a row carrying exactly one label is still
# marked non_toxic = 1; confirm whether ``x > 0`` was intended.
train['non_toxic'] = train[[
    'toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate'
]].sum(axis=1).apply(lambda x: 0 if x > 1 else 1)
save_in_cache('extra_label', train, test)

# Clean the text once and reuse the cached result on later runs.
if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
'threat': 380, 'insult': 500, 'identity_hate': 480 } model = lgb.train(params, train_set=d_train, num_boost_round=rounds_lookup[label], valid_sets=watchlist, verbose_eval=10) pred_test_y = model.predict(test_sparse_matrix) pred_test_y2 = model.predict(test_sparse_matrix2) return pred_test_y, pred_test_y2 print('~~~~~~~~~~~~~~~~~~~') print_step('Importing Data') train, test = get_data() if not is_in_cache('cleaned'): print('~~~~~~~~~~~~~') print_step('Cleaning') train_cleaned, test_cleaned = clean_text(train, test) save_in_cache('cleaned', train_cleaned, test_cleaned) else: train_cleaned, test_cleaned = load_cache('cleaned') print_step('Filling missing') train_cleaned['comment_text'].fillna('missing', inplace=True) test_cleaned['comment_text'].fillna('missing', inplace=True) print('Train shape: {}'.format(train_cleaned.shape)) print('Test shape: {}'.format(test_cleaned.shape))
'threat': 380, 'insult': 500, 'identity_hate': 480 } model = lgb.train(params, train_set=d_train, num_boost_round=rounds_lookup[label], valid_sets=watchlist, verbose_eval=10) pred_test_y = model.predict(test_X) pred_test_y2 = model.predict(test_X2) return pred_test_y, pred_test_y2 print('~~~~~~~~~~~~~~~~~~~') print_step('Importing Data') train, test = get_data() if not is_in_cache('cleaned'): print('~~~~~~~~~~~~~') print_step('Cleaning') train_cleaned, test_cleaned = clean_text(train, test) save_in_cache('cleaned', train_cleaned, test_cleaned) else: train_cleaned, test_cleaned = load_cache('cleaned') print_step('Filling missing') train_cleaned['comment_text'].fillna('missing', inplace=True) test_cleaned['comment_text'].fillna('missing', inplace=True) print('Train shape: {}'.format(train_cleaned.shape)) print('Test shape: {}'.format(test_cleaned.shape))
# Script excerpt: hyper-parameters and text preparation for the
# 'lvl1_double-gru' model. ``max_features`` is not assigned in this excerpt.
maxlen = 100  # passed to pad_sequences below
embed_size = 300
epochs = 4
batch_size = 32
predict_batch_size = 1024

# Only runs when 'lvl1_double-gru' has not been cached yet.
if not is_in_cache('lvl1_double-gru'):
    train_df, test_df = get_data()
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    # Missing comments are replaced by a fixed placeholder token.
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    # The vocabulary is built from train and test text together.
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    print_step('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
# Script excerpt: configuration flags, imports, data loading and the
# train/test concatenation.
PRINT_LGB_FEATURE_IMPORTANCE = False
N_FOLDS = 100

import string
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score as auc
from utils import print_step, ohe, run_cv_model, runLGB, runFFLGB, runLR, runTarget

print_step('Loading')
train = pd.read_csv('train.csv')
print(train.shape)
test = pd.read_csv('test.csv')
print(test.shape)

print_step('Subsetting')
# Keep the target and the ids aside, then remove them from the feature frames.
target = train['target']
train_id = train['id']
test_id = test['id']
train.drop(['target', 'id'], axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

print_step('Combine')
# Stack train and test rows into one frame.
traintest = pd.concat([train, test])
num_rounds = params.pop('num_rounds') verbose_eval = params.pop('verbose_eval') model = lgb.train(params, train_set=d_train, num_boost_round=num_rounds, valid_sets=watchlist, verbose_eval=verbose_eval) print_step('Predict 1/2') pred_test_y = model.predict(test_X) print_step('Predict 2/2') pred_test_y2 = model.predict(test_X2) return pred_test_y, pred_test_y2 print('~~~~~~~~~~~~~~~~~~~~~~~') print_step('Importing Data 1/13') train, test = get_data() print('~~~~~~~~~~~~~~~') print_step('Subsetting') target = train['deal_probability'] train_id = train['item_id'] test_id = test['item_id'] train.drop(['deal_probability', 'item_id'], axis=1, inplace=True) test.drop(['item_id'], axis=1, inplace=True) if not is_in_cache('title_countvec'): print('~~~~~~~~~~~~~~~~~~~~') print_step('Title CountVec 1/2') cv = CountVectorizer(stop_words=stopwords.words('russian'), lowercase=True,
def run_nn_model(label, model, max_features, maxlen, epochs, batch_size, predict_batch_size):
    """Train a network with 5-fold CV once per embedding file and save outputs.

    For every ``(embedding_name, embedding_file)`` in ``EMBEDDING_FILES`` that
    is not cached under ``label + '_' + embedding_name``: the data is loaded,
    tokenized and embedded, a fresh network is built, and it is trained from
    the same initial weights on each stratified fold. Out-of-fold and averaged
    test predictions are added to the frames as
    ``<label>_<embedding_name>_<class>`` columns, cached, and written to a
    submission CSV.

    Args:
        label: Model name used in cache keys, column names and file names.
        model: Builder called as
            ``model(max_features, maxlen, embed_size, embedding_matrix)`` and
            returning an object with ``save_weights``, ``load_weights``,
            ``fit`` and ``predict``.
        max_features: Vocabulary size passed to the tokenizer and the builder.
        maxlen: Padded sequence length.
        epochs: Training epochs per fold.
        batch_size: Training batch size.
        predict_batch_size: Batch size used for prediction.
    """
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    n_classes = len(classes)
    n_splits = 5

    for embedding_name, embedding_file in EMBEDDING_FILES.items():
        run_name = label + '_' + embedding_name
        # NOTE(review): the skip check uses ``run_name`` while the result is
        # saved under ``'lvl1_' + run_name``; confirm the keys are meant to
        # differ.
        if is_in_cache(run_name):
            print_step('Already trained ' + run_name + '! Skipping...')
            continue

        train_df, test_df = get_data()
        print_step('Loading embed ' + embedding_name + '...')
        embed_size = EMBED_SIZE_LOOKUP[embedding_name]
        x_train, x_test, embedding_matrix = tokenize_and_embed(
            train_df, test_df, embedding_file, max_features, maxlen,
            embed_size, embedding_name)
        y_train = train_df[classes].values

        print_step('Build model...')
        # Keep the builder in ``model`` so it can be called again for the next
        # embedding; the built network gets its own name.
        network = model(max_features, maxlen, embed_size, embedding_matrix)
        # Snapshot the initial weights and restore them before every fold.
        # Saving and loading must use the same file.
        weights_path = 'cache/' + run_name + '-model-weights.h5'
        network.save_weights(weights_path)

        print_step('Making KFold for CV')
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2017)
        cv_scores = []
        pred_train = np.zeros((train_df.shape[0], n_classes))
        pred_full_test = np.zeros((test_df.shape[0], n_classes))
        # Folds are stratified on the first class column only.
        fold_splits = kf.split(x_train, y_train[:, 0])
        for i, (dev_index, val_index) in enumerate(fold_splits, start=1):
            print_step('Started fold ' + str(i))
            network.load_weights(weights_path)
            dev_X, val_X = x_train[dev_index], x_train[val_index]
            dev_y, val_y = y_train[dev_index, :], y_train[val_index, :]
            RocAuc = RocAucEvaluation(validation_data=(val_X, val_y),
                                      interval=1)
            network.fit(dev_X,
                        dev_y,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(val_X, val_y),
                        callbacks=[RocAuc])
            val_pred = network.predict(val_X,
                                       batch_size=predict_batch_size,
                                       verbose=1)
            pred_train[val_index, :] = val_pred
            test_pred = network.predict(x_test,
                                        batch_size=predict_batch_size,
                                        verbose=1)
            pred_full_test = pred_full_test + test_pred
            cv_score = [
                roc_auc_score(val_y[:, j], val_pred[:, j])
                for j in range(n_classes)
            ]
            print_step('Fold ' + str(i) + ' done')
            # Materialise the pairs; a bare zip object prints only its repr.
            pprint(list(zip(classes, cv_score)))
            cv_scores.append(cv_score)

        print_step('All folds done!')
        print('CV scores')
        pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
        mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
        print('mean cv score : ' + str(mean_cv_score))
        pred_full_test = pred_full_test / float(n_splits)

        # Use one column prefix for writing and for the submission below.
        for k, classx in enumerate(classes):
            train_df[run_name + '_' + classx] = pred_train[:, k]
            test_df[run_name + '_' + classx] = pred_full_test[:, k]

        print('~~~~~~~~~~~~~~~~~~')
        print_step('Cache Level 1')
        save_in_cache('lvl1_' + run_name, train_df, test_df)
        print_step('Done!')

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Prepping submission file')
        submission = pd.DataFrame()
        submission['id'] = test_df['id']
        for classx in classes:
            submission[classx] = test_df[run_name + '_' + classx]
        submission.to_csv('submit/submit_lvl1_' + run_name + '.csv',
                          index=False)
        print_step('Done')
def _load_embedding_index(path):
    """Read a space-separated embedding text file into ``{word: float32 vector}``."""
    index = {}
    with open(path) as handle:
        for line in handle:
            parts = line.rstrip().rsplit(' ')
            index[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return index


def tokenize_and_embed(train_df, test_df, embedding_file, max_features, maxlen, embed_size, label):
    """Tokenize the comments and build an embedding matrix for the vocabulary.

    Words missing from the pre-trained embedding go through a chain of
    fallbacks, each tried only if the previous one found nothing:

    1. replace ``in'`` with ``ing``;
    2. strip every non-letter (an empty result gets a zero vector);
    3. Porter-stem the stripped word;
    4. find the word in the local embedding and take the pre-trained vector of
       the most similar word known to both embeddings;
    5. the same after removing ``'s``;
    6. if ``normalize_text`` yields exactly two known words, the mean of their
       vectors;
    7. spelling correction with ``TextBlob``;
    8. a zero vector.

    Steps 4 and 5 are skipped when ``label == 'local'`` because no separate
    local embedding is loaded in that case.

    Args:
        train_df: Frame with a ``comment_text`` column.
        test_df: Frame with a ``comment_text`` column.
        embedding_file: Path of the pre-trained embedding text file.
        max_features: Vocabulary size limit for the tokenizer.
        maxlen: Length sequences are padded to.
        embed_size: Width of the embedding matrix.
        label: Name of the embedding; ``'local'`` disables local imputation.

    Returns:
        ``(x_train, x_test, embedding_matrix)``.

    Raises:
        Exception: An error raised while resolving a word is reported with
            ``print('error')`` and re-raised.
    """
    X_train = train_df['comment_text'].fillna('peterhurford').values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))

    print_step('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    print_step('Defining pre-trained embedding (' + label + ')')
    embeddings_index = _load_embedding_index(embedding_file)

    use_local = label != 'local'
    local_embeddings_index = None
    shared_words = None
    reference_matrix = None
    if use_local:
        print_step('Defining local embedding')
        local_embeddings_index = _load_embedding_index(
            'cache/local_fasttext_model.vec')

    print_step('Defining tokenization - embedding scheme')
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))

    if use_local:
        print_step('Calculating pre-trained <-> local shared words')
        # Materialise the key views so NumPy receives plain sequences.
        shared_words = np.intersect1d(list(embeddings_index.keys()),
                                      list(local_embeddings_index.keys()))
        reference_matrix = np.array(
            [local_embeddings_index.get(w) for w in shared_words])
        reference_matrix = normalize(reference_matrix).T

    non_alphas = re.compile(u'[^A-Za-z]+')
    stemmer = PorterStemmer()

    def impute_from_local(word, candidate):
        """Return the pre-trained vector of the shared word nearest to ``candidate``, or None."""
        if local_embeddings_index is None:
            return None
        local_vector = local_embeddings_index.get(candidate)
        if local_vector is None:
            return None
        lookup_matrix = normalize([local_vector])
        similarity = np.matmul(lookup_matrix, reference_matrix)
        similar_word = shared_words[np.argmax(similarity)]
        print(word + ' -> ' + similar_word)
        return embeddings_index.get(similar_word)

    def resolve(word):
        """Return an embedding vector for ``word`` using the fallback chain."""
        # First try to find the embedding vector as-is
        vector = embeddings_index.get(word)
        if vector is not None:
            return vector

        # Second, try to replace in' -> ing
        print("in' -> ing")
        word = word.replace("in'", "ing")
        vector = embeddings_index.get(word)
        if vector is not None:
            return vector

        # Third, remove all non-letters
        print('remove punct')
        new_word = non_alphas.sub('', word)
        if new_word == '':
            # If the word is now blank, replace with null embedding
            print('blank')
            return np.zeros(embed_size)
        vector = embeddings_index.get(new_word)
        if vector is not None:
            return vector

        # Otherwise, try Porter stemming
        print('stem')
        vector = embeddings_index.get(stemmer.stem(new_word))
        if vector is not None:
            return vector

        # Fifth, impute from local FastText
        print('local impute: ' + word + '(' + new_word + ')')
        vector = impute_from_local(word, new_word)
        if vector is not None:
            return vector

        # Sixth, try to correct a contraction
        if "'s" in word:
            new_word = word.replace("'s", "")
            print('local impute: ' + word + '(' + new_word + ')')
            vector = impute_from_local(word, new_word)
            if vector is not None:
                return vector

        print('normalize text')
        new_words = normalize_text(word).split()
        if len(new_words) == 2:
            first = embeddings_index.get(new_words[0])
            second = embeddings_index.get(new_words[1])
            if first is not None and second is not None:
                print(word + ' -> ' + ' '.join(new_words))
                # Mean of both vectors; the sum must be parenthesised or only
                # the second vector is halved.
                return (first + second) / 2

        # Seventh, try to spell correct
        print('spell correct')
        try:
            new_word = str(TextBlob(word).correct())
        except Exception:
            new_word = word
        vector = embeddings_index.get(new_word)
        if vector is not None:
            print(word + ' -> ' + str(new_word))
            return vector

        # Eighth, give up
        print('Giving up on ' + str(word))
        return np.zeros(embed_size)

    print_step('Beginning embedding')
    for word, i in word_index.items():
        # Indices at or beyond the matrix height have no row to fill.
        if i >= nb_words:
            continue
        try:
            embedding_matrix[i] = resolve(word)
        except Exception:
            print('error')
            raise
    return x_train, x_test, embedding_matrix
model = lgb.train(params, train_set=d_train, num_boost_round=rounds_lookup[label], valid_sets=watchlist, verbose_eval=10) print(model.feature_importance()) pred_test_y = model.predict(test_X) pred_test_y2 = model.predict(test_X2) return pred_test_y, pred_test_y2 if is_in_cache('lgb_fe_with_embeddings_and_svd'): train, test = load_cache('lgb_fe_with_embeddings_and_svd') else: print('~~~~~~~~~~~~~~~~~~~') print_step('Importing Data') train, test = get_data() if is_in_cache('fe_lgb_data'): train_fe, test_fe = load_cache('fe_lgb_data') else: print_step('Adding Features') train_fe, test_fe = add_features(train, test) print_step('Dropping') train_fe.drop(['id', 'comment_text'], axis=1, inplace=True) test_fe.drop(['id', 'comment_text'], axis=1, inplace=True) train_fe.drop([ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate' ], axis=1, inplace=True)
# Script excerpt: load the lines CSV, optionally hold out 10% of it, and build
# TF-IDF features for the 'line' column.
from utils import CHARACTERS, print_step

# When True, part of the data is split off and transformed as a test set.
HOLDOUT = False

lines = pd.read_csv('scooby_doo_lines.csv')
if HOLDOUT:
    train_lines, test_lines, train_character, test_character = train_test_split(
        lines['line'], lines['character'], test_size=0.1, random_state=42)
else:
    # No hold-out: every line is training data and the test slots stay empty.
    train_lines = lines['line']
    test_lines = None
    train_character = lines['character']
    test_character = None

print_step('TFIDF')
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, min_df=2, max_df=0.8, binary=True)
tfidf_train = tfidf.fit_transform(train_lines)
print(tfidf_train.shape)
if HOLDOUT:
    tfidf_test = tfidf.transform(test_lines)
    print(tfidf_test.shape)

# Containers whose missing keys default to the empty string.
all_test_preds = defaultdict(lambda: '')
all_train_preds = defaultdict(lambda: '')
models = defaultdict(lambda: '')
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a ``TFFMRegressor`` over several rounds and predict two sets.

    After every round the validation set (``test_X``) is predicted and its
    RMSE against ``test_y`` is reported.

    NOTE(review): ``rounds`` and ``iters`` are neither parameters nor assigned
    in this function, so they must come from module scope; confirm where they
    are defined.

    Returns:
        ``(pred_test_y, pred_test_y2)``: predictions for ``test_X`` (from the
        last round) and for ``test_X2``.
    """
    model = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for i in range(rounds):
        model.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('TFFM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/2')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
# Keep the target and the ids aside, then remove them from the frames.
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/2')
# Rebinds train/test to the cached 'complete_fm_data' pair; target and the ids
# captured above are kept.
train, test = load_cache('complete_fm_data')
print('~~~~~~~~~~~~~~')