def run_ridge_on_cat(cat):
    if not is_in_cache('cat_ridges_blend_l3_' + cat):
        print_step(cat + ' > Subsetting')
        train_c = train_[train['parent_category_name'] == cat].copy()
        test_c = test_[test['parent_category_name'] == cat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop('item_id', axis=1, inplace=True)

        print_step(cat + ' > Modeling')
        results = run_cv_model(train_c, test_c, target, runLasso, params, rmse,
                               cat + '-ridge-blend')
        train_c['cat_ridge'] = results['train']
        test_c['cat_ridge'] = results['test']
        print_step(cat + ' > RMSE: ' + str(rmse(target, train_c['cat_ridge'])))

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat + ' > Saving in Cache')
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id
        save_in_cache('cat_ridges_blend_l3_' + cat,
                      train_c[['item_id', 'cat_ridge']],
                      test_c[['item_id', 'cat_ridge']])
        return True
    else:
        print_step('Already have ' + cat + '...')
        return True
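# A hedged usage sketch (not in the original file): run_ridge_on_cat can be mapped
# over every parent category, serially or with a multiprocessing pool. The pool size
# and the source of the category list are assumptions.
#
# import multiprocessing as mp
# categories = list(train['parent_category_name'].unique())
# with mp.Pool(processes=4) as pool:
#     pool.map(run_ridge_on_cat, categories)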
def run_query_in_batches(df, label=''):
    responses = []
    total = len(df.comment_text.values)
    i = 0
    while i <= total:
        skip = False
        if i % 500 == 0 or i == total:
            # Integer division so cache keys don't pick up a '.0' suffix under Python 3
            batch_num = str(i // 500 + 1)
            if is_in_cache('convai-batches-' + label + batch_num):
                print_step('BATCH ' + label + batch_num + ' ALREADY DONE...')
                i += 500
                skip = True
            elif len(responses) > 100:
                batch_num = str(i // 500)
                if i == total:
                    batch_num = str(int(batch_num) + 1)
                    skip = True
                print_step('COLLECTING BATCH ' + label + batch_num + ' / ' +
                           str(round(total / 500) + 1))
                batch_df = pd.DataFrame([dict(x) for x in responses])
                save_in_cache('convai-batches-' + label + batch_num, batch_df, None)
                batch_num = str(i // 500 + 1)
                print_step('SLEEPING 60s')
                time.sleep(60)
                responses = []
                print_step('STARTING BATCH ' + label + batch_num)
            else:
                print_step('STARTING BATCH ' + label + batch_num)
        if not skip:
            print_step(str(i + 1) + ' / ' + str(total))
            responses.append(run_query(df.comment_text.values[i], df.id.values[i]))
            i += 1
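# A minimal usage sketch (assumed, not from the original file): the batcher walks a
# dataframe one comment at a time, flushing every 500 responses to the cache and
# sleeping 60s between batches, presumably to respect the remote API's rate limit.
#
# train, test = get_data()
# run_query_in_batches(train, label='train-')
# run_query_in_batches(test, label='test-')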
def run_with_target(label, target, data_key, model_fn, kf,
                    train_key=None, eval_fn=None):
    if is_in_cache(label + '_' + target):
        return load_cache(label + '_' + target)[0]
    else:
        print('-')
        print_step('Training ' + target)
        if train_key is None:
            train, test = get_data()
        else:
            train, test = load_cache(train_key)
        post_train, post_test = load_cache(data_key)
        if isinstance(post_train, pd.DataFrame):
            post_train = post_train.values
            post_test = post_test.values

        train_y = train[target]
        cv_scores = []
        pred_full_test = 0
        pred_train = np.zeros(train.shape[0])
        i = 1
        if isinstance(kf, StratifiedKFold):
            fold_splits = kf.split(post_train, train_y)
        else:
            fold_splits = kf.split(post_train)
        for dev_index, val_index in fold_splits:
            print_step('Started ' + label + ' ' + target + ' fold ' + str(i))
            dev_X, val_X = post_train[dev_index], post_train[val_index]
            dev_y, val_y = train_y[dev_index], train_y[val_index]
            pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y,
                                               post_test, target, dev_index, val_index)
            pred_full_test = pred_full_test + pred_test_y
            pred_train[val_index] = pred_val_y
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print_step(label + ' ' + target + ' cv score ' + str(i) + ' : ' + str(cv_score))
            i += 1

        print_step(label + ' ' + target + ' cv scores : ' + str(cv_scores))
        mean_cv_score = np.mean(cv_scores)
        print_step(label + ' ' + target + ' mean cv score : ' + str(mean_cv_score))
        pred_full_test = pred_full_test / 5.  # average test predictions; assumes 5 folds

        results = {'label': label,
                   'target': target,
                   'train': pred_train,
                   'test': pred_full_test,
                   'cv': cv_scores}
        save_in_cache(label + '_' + target, results, None)
        return results
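# A hedged usage sketch (the model function name 'runLR' and the data key
# 'tfidf_word' are illustrative assumptions): run_with_target is designed to be
# called once per toxicity label, with results cached per (label, target) pair.
#
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
# classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# preds = [run_with_target('lvl1_lr', c, 'tfidf_word', runLR, kf, eval_fn=roc_auc_score)
#          for c in classes]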
def get_img_data(index, image_files):
    print_step('[Core %d] Start' % index)
    if not is_in_cache('img_data_' + str(index)):
        data = []
        i = 0
        for image_file in image_files:
            dat = get_image(image_file)
            if dat:
                data += [get_data_from_image(dat, core=index, i=i)]
            i += 1
            if i % 50 == 0:
                print_step('[Core %d] Completed %d / %d...' % (index, i, len(image_files)))
        print_step('[Core %d] Done. Saving...' % index)
        save_in_cache('img_data_' + str(index), data_to_df(data), None)
    else:
        print(str(index) + ' already in cache! Skipping...')
    return True
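# A minimal parallelization sketch (assumed, not from the original file): each core
# gets an index plus a slice of the image file list, matching get_img_data's
# (index, image_files) signature. 'image_files' here is a hypothetical full list.
#
# import multiprocessing as mp
# import numpy as np
# n_cores = mp.cpu_count() - 1
# chunks = np.array_split(image_files, n_cores)
# with mp.Pool(processes=n_cores) as pool:
#     pool.starmap(get_img_data, enumerate(chunks))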
def display_select_result_html():
    """ Displays the html to select the results file that you want to view.
        This is the main page. """
    html_string = ""
    html_string += """<HTML>
    <HEAD>
        <TITLE> Python Server </TITLE>
        <link rel="stylesheet" type="text/css" href="static/file_list.css" />
    </HEAD>
    <BODY>
    <div style="text-align:center">
    <font size="6">Available Results</font>
    <br><br>
    <table align="center">
        <tr>
            <th>Result File</th>
            <th>Size</th>
            <th>Date Modified</th>
            <th>Date Created</th>
            <th>Cached</th>
        </tr>
    """
    for result_file in os.listdir(result_directory):
        absolute_result_file_path = os.path.join(result_directory, result_file)
        html_string += "<tr>"
        html_string += "<td><div><a href=results/" + result_file + ">" + result_file + "</a></div></td>"
        html_string += "<td>" + str(os.path.getsize(absolute_result_file_path) / float(1000)) + "kb</td>"
        html_string += "<td>" + time.ctime(os.path.getmtime(absolute_result_file_path)) + "</td>"
        html_string += "<td>" + time.ctime(os.path.getctime(absolute_result_file_path)) + "</td>"
        cached = cache.is_in_cache(absolute_result_file_path, cache_directory)
        if cached:
            html_string += '<td><img src="static/checkmark.png" style="width:15px;height:15px;"></td>'
        else:
            html_string += '<td><img src="static/x.png" style="width:15px;height:15px;"></td>'
        html_string += "</tr>\n"
    html_string += """
    </table>
    </div>
    </BODY>
    </HTML>"""
    return html_string
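# A hedged serving sketch: only display_select_result_html comes from this file; the
# framework wiring below is an assumption for illustration. With Flask it might look like:
#
# from flask import Flask
# app = Flask(__name__)
#
# @app.route('/')
# def index():
#     return display_select_result_html()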
        'nthread': min(mp.cpu_count() - 1, 6),
        'lambda_l1': 1,
        'lambda_l2': 1
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=100)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if not is_in_cache('convai_with_fe'):
    print_step('Importing base data')
    train_base, test_base = get_data()
    print_step('Importing ConvAI data')
    train, test = load_cache('convai_data')
    print_step('Importing FE')
    train_fe, test_fe = load_cache('fe_lgb_data')
    print_step('Merging')
    train_fe['id'] = train_base['id']
    test_fe['id'] = test_base['id']
    train_ = pd.merge(train_fe, train, on='id')
    test_ = pd.merge(test_fe, test, on='id')
    del train_base
        a *= K.cast(mask, K.floatx())
        # In some cases, especially early in training, the sum may be almost zero,
        # so add epsilon before normalizing
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim


if not is_in_cache('lvl1_attention-lstm'):
    train_df, test_df = get_data()
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
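# A hedged sketch of how an attention layer like the one above is typically wired
# into a Keras LSTM model (the class name 'Attention', its constructor argument, and
# the layer sizes are illustrative assumptions, not from this file):
#
# inp = Input(shape=(maxlen,))
# x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
# x = Bidirectional(LSTM(128, return_sequences=True))(x)
# x = Attention(maxlen)(x)  # collapses the timestep axis to a weighted sum
# out = Dense(6, activation='sigmoid')(x)
# model = Model(inputs=inp, outputs=out)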
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/glove/glove.840B.300d.txt'
max_features = 100000
maxlen = 150
embed_size = 300
epochs = 4
batch_size = 128
predict_batch_size = 1024


if not is_in_cache('lvl1_gru-conv'):
    print_step('Loading data')
    train_df, test_df = get_data()

    print_step('Preprocessing 1/3')
    train_df['comment_text'] = train_df['comment_text'].apply(glove_preprocess).apply(normalize_text)
    print_step('Preprocessing 2/3')
    test_df['comment_text'] = test_df['comment_text'].apply(glove_preprocess).apply(normalize_text)
    print_step('Preprocessing 3/3')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    x_nb = train_X.multiply(r)
    model.fit(x_nb, train_y)
    pred_test_y = model.predict_proba(test_X.multiply(r))[:, 1]
    pred_test_y2 = model.predict_proba(test_X2.multiply(r))[:, 1]
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()
train['non_toxic'] = train[['toxic', 'severe_toxic', 'obscene', 'insult',
                            'threat', 'identity_hate']].sum(axis=1).apply(lambda x: 0 if x > 1 else 1)
save_in_cache('extra_label', train, test)

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

if not is_in_cache('tfidf_word'):
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Run TFIDF WORD')
    TFIDF_PARAMS_WORD.update({'train': train, 'test': test})
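# For context, a hedged sketch of how the NB log-count ratio r (used by the model
# function at the top of this snippet) is commonly computed, following the NB-SVM
# recipe of Wang & Manning (2012); this repo's exact implementation may differ:
#
# p = (train_X[train_y == 1].sum(0) + 1) / ((train_y == 1).sum() + 1)
# q = (train_X[train_y == 0].sum(0) + 1) / ((train_y == 0).sum() + 1)
# r = np.log(p / q)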
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/10')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('titlecat_wordbatch') or not is_in_cache('text_wordbatch'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Titlecat Wordbatch 1/5')
    train['titlecat'] = (train['parent_category_name'].fillna('') + ' ' +
                         train['category_name'].fillna('') + ' ' +
                         train['param_1'].fillna('') + ' ' +
                         train['param_2'].fillna('') + ' ' +
                         train['param_3'].fillna('') + ' ' +
                         train['title'].fillna(''))
    test['titlecat'] = (test['parent_category_name'].fillna('') + ' ' +
                        test['category_name'].fillna('') + ' ' +
                        test['param_1'].fillna('') + ' ' +
                        test['param_2'].fillna('') + ' ' +
                        test['param_3'].fillna('') + ' ' +
                        test['title'].fillna(''))
    if not is_in_cache('titlecat_wordbatch'):
        print_step('Titlecat Wordbatch 2/5')
        wb = wordbatch.WordBatch(normalize_text,
                                 extractor=(WordBag, {"hash_ngrams": 2,
                                                      "hash_ngrams_weights": [1.5, 1.0],
                                                      "hash_size": 2 ** 29,
                                                      "norm": None,
                                                      "tf": 'binary',
                                                      "idf": None}),
                                 procs=8)
        wb.dictionary_freeze = True
        wordbatch_train = wb.fit_transform(train['titlecat'])
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/13')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('title_countvec'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title CountVec 1/2')
    cv = CountVectorizer(stop_words=stopwords.words('russian'),
                         lowercase=True,
                         min_df=2)
    # Renamed from tfidf_train/tfidf_test: these are raw counts, not TFIDF weights
    count_train = cv.fit_transform(train['title'])
    print(count_train.shape)
    print_step('Title CountVec 2/2')
    count_test = cv.transform(test['title'])
    print(count_test.shape)
    print_step('Saving to cache...')
    save_in_cache('title_countvec', count_train, count_test)

if not is_in_cache('deep_text_feats2'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'
max_features = 30000
maxlen = 100
embed_size = 300
epochs = 4
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_double-gru'):
    train_df, test_df = get_data()
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
EMBED_SIZE = 300
NCOMP = 20


def text_to_embedding(text):
    # Mean of the word vectors for all tokens in the text; OOV tokens contribute zeros
    mean = np.mean([embeddings_index.get(w, np.zeros(EMBED_SIZE))
                    for w in text.split()], axis=0)
    if mean.shape == ():
        # Empty text: np.mean of an empty list is a scalar, so fall back to zeros
        return np.zeros(EMBED_SIZE)
    else:
        return mean


print_step('Importing Data 11/19 1/3')
if not is_in_cache('avito_fasttext_300d'):
    print_step('Embedding 1/5')
    train, test = get_data()

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' '))
                            for o in open(EMBEDDING_FILE))
    print_step('Embedding 2/5')
    train_embeddings = (train['title'].str.cat(
        [
            train['description'],
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if (not is_in_cache('tfidf_ridges') or not is_in_cache('titlecat_tfidf')
        or not is_in_cache('text_tfidf') or not is_in_cache('text_char_tfidf')):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title TFIDF 1/2')
    tfidf = TfidfVectorizer(ngram_range=(1, 1),
                            max_features=100000,
                            min_df=2,
                            max_df=0.8,
                            binary=True,
                            encoding='KOI8-R')
    tfidf_train = tfidf.fit_transform(train['title'])
    print(tfidf_train.shape)
    print_step('Title TFIDF 2/2')
    tfidf_test = tfidf.transform(test['title'])
    print(tfidf_test.shape)
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'
max_features = 100000
maxlen = 500
embed_size = 300
epochs = 20
batch_size = 256
predict_batch_size = 1024


if not is_in_cache('lvl1_cudnngru'):
    train_df, test_df = get_data()
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
        'class_weight': 'balanced_subsample',
        'n_jobs': min(mp.cpu_count() - 1, 6),
        'random_state': 16,
        'verbose': 2
    }
    model = RandomForestClassifier(**params)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    # Rank transform so the predictions blend on a common scale
    pred_test_y = minmax_scale(pd.Series(pred_test_y).rank().values)
    pred_test_y2 = minmax_scale(pd.Series(pred_test_y2).rank().values)
    return pred_test_y, pred_test_y2


if not is_in_cache('lvl2_all'):
    print_step('Importing 1/21: LRs')
    lr_train, lr_test = load_cache('lvl1_lr')
    print_step('Importing 2/21: FE')
    train_fe, test_fe = load_cache('fe_lgb_data')
    print_step('Importing 3/21: Sparse LGBs')
    lgb_train, lgb_test = load_cache('lvl1_sparse_lgb')
    print_step('Importing 4/21: FE LGB')
    fe_lgb_train, fe_lgb_test = load_cache('lvl1_fe_lgb')
    print_step('Importing 5/21: Sparse FE LGB')
    sfe_lgb_train, sfe_lgb_test = load_cache('lvl1_sparse_fe_lgb')
    print_step('Importing 6/21: FM')
    fm_train, fm_test = load_cache('lvl1_fm')
    print_step('Importing 7/21: Ridge')
    ridge_train, ridge_test = load_cache('lvl1_ridge')
    print_step('Importing 8/21: GRU')
""" ohe = OneHotEncoder() full_csr = ohe.fit_transform(np.vstack((trn.values, sub.values))) csr_trn = full_csr[:trn.shape[0]] csr_sub = full_csr[trn.shape[0]:] del full_csr gc.collect() # Now remove features that don't have enough samples either in train or test return clean_csr(csr_trn, csr_sub, 3) print('~~~~~~~~~~~~~~~~~~~') print_step('Importing Data') train, test = get_data() if not is_in_cache('fm_data'): print('~~~~~~~~~~~~~') print_step('Cleaning 1/2') train_cleaned = get_indicators_and_clean_comments(train) print_step('Cleaning 2/2') test_cleaned = get_indicators_and_clean_comments(test) train_text = train['clean_comment'].fillna('') test_text = test['clean_comment'].fillna('') all_text = pd.concat([train_text, test_text]) class_names = [ 'toxic', 'severe_toxic', 'insult', 'threat', 'obscene', 'identity_hate' ] num_features = [ f_ for f_ in train.columns if f_ not in [ 'comment_text', 'clean_comment', 'id', 'remaining_chars',
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/twitter/glove.twitter.27B.200d.txt'
max_features = 30000
maxlen = 100
embed_size = 200
epochs = 3
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_gru80'):
    print_step('Loading data')
    train_df, test_df = get_data()

    print_step('Preprocessing 1/3')
    train_df['comment_text'] = train_df['comment_text'].apply(glove_preprocess).apply(normalize_text)
    print_step('Preprocessing 2/3')
    test_df['comment_text'] = test_df['comment_text'].apply(glove_preprocess).apply(normalize_text)
    print_step('Preprocessing 3/3')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'
max_features = 100000
maxlen = 200
embed_size = 300
epochs = 3
batch_size = 256
predict_batch_size = 1024
filter_sizes = [1, 2, 3, 5]
num_filters = 32


if not is_in_cache('lvl1_2dconv'):
    print_step('Loading data')
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if is_in_cache('lgb_fe_with_embeddings_and_svd'):
    train, test = load_cache('lgb_fe_with_embeddings_and_svd')
else:
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data')
    train, test = get_data()
    if is_in_cache('fe_lgb_data'):
        train_fe, test_fe = load_cache('fe_lgb_data')
    else:
        print_step('Adding Features')
        train_fe, test_fe = add_features(train, test)
        print_step('Dropping')
        train_fe.drop(['id', 'comment_text'], axis=1, inplace=True)
        test_fe.drop(['id', 'comment_text'], axis=1, inplace=True)
        train_fe.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
def runRidge(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    model = Ridge(solver='sag', fit_intercept=True, random_state=205, alpha=3.3)
    model.fit(train_X, train_y)
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
if not is_in_cache('extra_data_attack') and not is_in_cache('extra_data_toxic'):
    print_step('Importing Data 1/5')
    attack = pd.read_csv('data/attack_annotations.tsv', sep='\t')
    print_step('Importing Data 2/5')
    attack_comments = pd.read_csv('data/attack_annotated_comments.tsv', sep='\t')
    print_step('Importing Data 3/5')
    toxic = pd.read_csv('data/toxicity_annotations.tsv', sep='\t')
    print_step('Importing Data 4/5')
    toxic_comments = pd.read_csv('data/toxicity_annotated_comments.tsv', sep='\t')
    print_step('Importing Data 5/5')
    train, test = get_data()

    print_step('Processing 1/9')
def run_ridge_on_regioncat(regioncat):
    if not is_in_cache('regioncat_ridges_' + regioncat):
        print_step(regioncat + ' > Subsetting')
        train_c = train[train['region_X_cat'] == regioncat].copy()
        test_c = test[test['region_X_cat'] == regioncat].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop(['item_id'], axis=1, inplace=True)

        print_step(regioncat + ' > Titlecat TFIDF 1/3')
        train_c['titlecat'] = (train_c['category_name'].fillna('') + ' ' +
                               train_c['param_1'].fillna('') + ' ' +
                               train_c['param_2'].fillna('') + ' ' +
                               train_c['param_3'].fillna('') + ' ' +
                               train_c['title'].fillna(''))
        test_c['titlecat'] = (test_c['category_name'].fillna('') + ' ' +
                              test_c['param_1'].fillna('') + ' ' +
                              test_c['param_2'].fillna('') + ' ' +
                              test_c['param_3'].fillna('') + ' ' +
                              test_c['title'].fillna(''))
        print_step(regioncat + ' > Titlecat TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train_c['titlecat'])
        print(tfidf_train.shape)
        print_step(regioncat + ' > Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test_c['titlecat'])
        print(tfidf_test.shape)
        print_step(regioncat + ' > Titlecat TFIDF Ridge')
        results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                               {'alpha': 5.0}, rmse, regioncat + '-titlecat-ridge')
        train_c['regioncat_title_ridge'] = results['train']
        test_c['regioncat_title_ridge'] = results['test']

        print_step(regioncat + ' > Description TFIDF 1/3')
        train_c['desc'] = train_c['title'].fillna('') + ' ' + train_c['description'].fillna('')
        test_c['desc'] = test_c['title'].fillna('') + ' ' + test_c['description'].fillna('')
        print_step(regioncat + ' > Description TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train2 = tfidf.fit_transform(train_c['desc'].fillna(''))
        print(tfidf_train2.shape)
        print_step(regioncat + ' > Description TFIDF 3/3')
        tfidf_test2 = tfidf.transform(test_c['desc'].fillna(''))
        print(tfidf_test2.shape)
        results = run_cv_model(tfidf_train2, tfidf_test2, target, runRidge,
                               {'alpha': 5.0}, rmse, regioncat + '-desc-ridge')
        train_c['regioncat_desc_ridge'] = results['train']
        test_c['regioncat_desc_ridge'] = results['test']

        print_step(regioncat + ' > Text Char TFIDF 1/2')
        # Using char n-grams ends up being surprisingly good,
        # HT https://www.kaggle.com/c/avito-demand-prediction/discussion/56061#325063
        tfidf = TfidfVectorizer(ngram_range=(2, 5),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                analyzer='char',
                                encoding='KOI8-R')
        tfidf_train3 = tfidf.fit_transform(train_c['desc'])
        print(tfidf_train3.shape)
        print_step(regioncat + ' > Text Char TFIDF 2/2')
        tfidf_test3 = tfidf.transform(test_c['desc'])
        print(tfidf_test3.shape)
        results = run_cv_model(tfidf_train3, tfidf_test3, target, runRidge,
                               {'alpha': 5.0}, rmse, regioncat + '-desc-char-ridge')
        train_c['regioncat_desc_char_ridge'] = results['train']
        test_c['regioncat_desc_char_ridge'] = results['test']

        print_step('Merging 1/2')
        train_c2 = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
        print_step('Merging 2/2')
        test_c2 = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
        print(train_c2.shape)
        print(test_c2.shape)

        print('~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Run Full Text Ridge')
        results = run_cv_model(train_c2, test_c2, target, runRidge,
                               {'alpha': 8.0}, rmse, regioncat + '-text-ridge')
        train_c['regioncat_all_text_ridge'] = results['train']
        test_c['regioncat_all_text_ridge'] = results['test']

        print('~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Dropping')
        train_c.drop([c for c in train_c.columns if 'ridge' not in c], axis=1, inplace=True)
        test_c.drop([c for c in test_c.columns if 'ridge' not in c], axis=1, inplace=True)
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Saving in Cache')
        save_in_cache('regioncat_ridges_' + regioncat, train_c, test_c)
    else:
        print(regioncat + ' already in cache! Skipping...')
    return True
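# A hedged usage sketch (not from the original file): run_ridge_on_regioncat would be
# mapped over every region-by-category value, serially or with a multiprocessing pool.
#
# import multiprocessing as mp
# regioncats = list(train['region_X_cat'].unique())
# with mp.Pool(processes=4) as pool:
#     pool.map(run_ridge_on_regioncat, regioncats)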
filter_nr = 64
filter_size = 3
max_pool_size = 3
max_pool_strides = 2
dense_nr = 256
spatial_dropout = 0.2
dense_dropout = 0.5
train_embed = False


def schedule(ind):
    # Learning rate schedule, indexed by epoch
    a = [0.001, 0.0005, 0.0001, 0.0001]
    return a[ind]


if not is_in_cache('lvl1_dpcnn'):
    print_step('Loading data')
    train_df, test_df = get_data()
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
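# The schedule function above is presumably consumed by Keras's LearningRateScheduler
# callback; a minimal sketch under that assumption (fit arguments are illustrative):
#
# from keras.callbacks import LearningRateScheduler
# lr_schedule = LearningRateScheduler(schedule)
# model.fit(dev_X, dev_y, epochs=epochs, batch_size=batch_size,
#           callbacks=[lr_schedule])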
def run_nn_model(label, model_fn, max_features, maxlen, epochs, batch_size,
                 predict_batch_size):
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    for embedding_name, embedding_file in EMBEDDING_FILES.items():
        if is_in_cache(label + '_' + embedding_name):
            print_step('Already trained ' + label + '_' + embedding_name + '! Skipping...')
        else:
            train_df, test_df = get_data()

            print_step('Loading embed ' + embedding_name + '...')
            embed_size = EMBED_SIZE_LOOKUP[embedding_name]
            x_train, x_test, embedding_matrix = tokenize_and_embed(
                train_df, test_df, embedding_file, max_features, maxlen,
                embed_size, embedding_name)
            y_train = train_df[classes].values

            print_step('Build model...')
            # Keep the builder (model_fn) distinct from the built model so the next
            # embedding iteration can construct a fresh model
            model = model_fn(max_features, maxlen, embed_size, embedding_matrix)
            # Save the untrained weights under the same path used to reset each fold
            model.save_weights('cache/' + label + '_' + embedding_name + '-model-weights.h5')

            print_step('Making KFold for CV')
            kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
            i = 1
            cv_scores = []
            pred_train = np.zeros((train_df.shape[0], 6))
            pred_full_test = np.zeros((test_df.shape[0], 6))
            for dev_index, val_index in kf.split(x_train, y_train[:, 0]):
                print_step('Started fold ' + str(i))
                # Reset to the untrained weights at the start of every fold
                model.load_weights('cache/' + label + '_' + embedding_name + '-model-weights.h5')
                dev_X, val_X = x_train[dev_index], x_train[val_index]
                dev_y, val_y = y_train[dev_index, :], y_train[val_index, :]
                RocAuc = RocAucEvaluation(validation_data=(val_X, val_y), interval=1)
                model.fit(dev_X, dev_y, batch_size=batch_size, epochs=epochs,
                          validation_data=(val_X, val_y), callbacks=[RocAuc])
                val_pred = model.predict(val_X, batch_size=predict_batch_size, verbose=1)
                pred_train[val_index, :] = val_pred
                test_pred = model.predict(x_test, batch_size=predict_batch_size, verbose=1)
                pred_full_test = pred_full_test + test_pred
                cv_score = [roc_auc_score(val_y[:, j], val_pred[:, j]) for j in range(6)]
                print_step('Fold ' + str(i) + ' done')
                pprint(list(zip(classes, cv_score)))
                cv_scores.append(cv_score)
                i += 1
            print_step('All folds done!')
            print('CV scores')
            pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
            mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
            print('mean cv score : ' + str(mean_cv_score))
            pred_full_test = pred_full_test / 5.

            # Store predictions under the label/embedding prefix so the submission
            # block below can read them back
            for k, classx in enumerate(classes):
                train_df[label + '_' + embedding_name + '_' + classx] = pred_train[:, k]
                test_df[label + '_' + embedding_name + '_' + classx] = pred_full_test[:, k]

            print('~~~~~~~~~~~~~~~~~~')
            print_step('Cache Level 1')
            save_in_cache('lvl1_' + label + '_' + embedding_name, train_df, test_df)
            print_step('Done!')

            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            print_step('Prepping submission file')
            submission = pd.DataFrame()
            submission['id'] = test_df['id']
            for classx in classes:
                submission[classx] = test_df[label + '_' + embedding_name + '_' + classx]
            submission.to_csv('submit/submit_lvl1_' + label + '_' + embedding_name + '.csv',
                              index=False)
            print_step('Done')
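# A hedged usage sketch ('build_gru' is a hypothetical builder name): run_nn_model
# expects a function that constructs a fresh compiled Keras model; the embedding
# loop, CV, caching, and submission file are all handled inside.
#
# run_nn_model('gru', build_gru, max_features=100000, maxlen=200,
#              epochs=4, batch_size=128, predict_batch_size=1024)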
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/twitter/glove.twitter.27B.200d.txt'
max_features = 200000
maxlen = 200
embed_size = 200
epochs = 2
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_rnncnn2'):
    train_df, test_df = get_data()
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/twitter/glove.twitter.27B.200d.txt'
max_features = 100000
maxlen = 200
embed_size = 200
epochs = 12
batch_size = 1024
predict_batch_size = 1024


if not is_in_cache('lvl1_gru128-2'):
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])
        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)


if not is_in_cache('lvl1_capsule_net'):
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print_step('Filling missing')
train_cleaned['comment_text'].fillna('missing', inplace=True)
test_cleaned['comment_text'].fillna('missing', inplace=True)
print('Train shape: {}'.format(train_cleaned.shape))
print('Test shape: {}'.format(test_cleaned.shape))

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/11')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('deep_text_feats3'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data 2/11')
    tfidf_train, tfidf_test = load_cache('titlecat_tfidf')
    print_step('Importing Data 3/11')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')
    print_step('Importing Data 4/11')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')
    print_step('Importing Data 5/11')
    train = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/11')
    test = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
from cache import get_data, is_in_cache, load_cache, save_in_cache


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('data_with_fe'):
    print('~~~~~~~~~~~~')
    print_step('Merging')
    merge = pd.concat([train, test])

    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Imputation 1/7')
    merge['param_1'].fillna('missing', inplace=True)
    print_step('Imputation 2/7')
    merge['param_2'].fillna('missing', inplace=True)
    print_step('Imputation 3/7')
    merge['param_3'].fillna('missing', inplace=True)
    print_step('Imputation 4/7')
    merge['price_missing'] = merge['price'].isna().astype(int)
    merge['price'].fillna(0, inplace=True)
    print_step('Imputation 5/7')
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))


EMBEDDING_FILE = 'cache/glove/glove.840B.300d.txt'
max_features = 100000
maxlen = 150
embed_size = 300
epochs = 1
batch_size = 32
predict_batch_size = 32


if not is_in_cache('lvl1_lstm-conv'):
    print_step('Loading data')
    train_df = pd.read_csv('data/train_zafar_cleaned.csv')
    test_df = pd.read_csv('data/test_zafar_cleaned.csv')
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)