def run(self):
    self.output().makedirs()
    train_data = rf_dataset.Dataset().load_all('train', as_df=True)
    test_data = rf_dataset.Dataset().load_all('test', as_df=True)
    all_questions = pandas.concat([
        train_data.question1_clean,
        train_data.question2_clean,
        test_data.question1_clean,
        test_data.question2_clean,
    ])
    question_freq = all_questions.value_counts().to_dict()

    train_feat = pandas.DataFrame({
        'freq1': train_data.question1_clean.map(question_freq),
        'freq2': train_data.question2_clean.map(question_freq),
    })
    train_feat['freq_diff'] = np.abs(train_feat.freq1 - train_feat.freq2)

    test_feat = pandas.DataFrame({
        'freq1': test_data.question1_clean.map(question_freq),
        'freq2': test_data.question2_clean.map(question_freq),
    })
    test_feat['freq_diff'] = np.abs(test_feat.freq1 - test_feat.freq2)

    train_feat.to_msgpack(self.make_path('train.msg'))
    test_feat.to_msgpack(self.make_path('test.msg'))
    with self.output().open('w'):
        pass
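# `question_freq` above maps each cleaned question text to the number of times
# it appears anywhere in train or test. A quick illustration of the mapping
# being built (the example strings are made up):
import pandas

qs = pandas.Series(['how do magnets work', 'why is sky blue',
                    'how do magnets work'])
assert qs.value_counts().to_dict() == {'how do magnets work': 2,
                                       'why is sky blue': 1}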
def run(self):
    self.output().makedirs()
    wc_data = rf_word_count_features.WordCountMatrix()
    X = wc_data.load('train', self.fold).astype(np.float32)
    y = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    cls = self.make_cls()
    cls.fit(X, y)

    X_val = wc_data.load('valid', self.fold).astype(np.float32)
    y_val = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate
    y_pred = cls.predict_proba(X_val)[:, 1]
    np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
    score = core.score_data(y_val, y_pred)

    del X, y, X_val, y_val
    X_test = wc_data.load('test', None).astype(np.float32)
    y_test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

    print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))
    with self.output().open('w') as f:
        f.write('Score: {:s}: {:f}'.format(repr(self), score))
    return score
def run(self):
    self.output().makedirs()
    data = self.xdataset()
    X = data.load('train', self.fold)
    y = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    cls = self.make_cls()
    print('Training classifier {:s} on data of size: {}'.format(repr(cls), X.shape))
    cls.fit(X, y)
    self.post_fit(cls)

    X_val = data.load('valid', self.fold)
    y_val = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate
    y_pred = cls.predict_proba(X_val)[:, 1]
    np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
    score = core.score_data(y_val, y_pred)

    del X, y, X_val, y_val
    X_test = data.load('test', None)
    y_test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

    print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))
    with self.output().open('w') as f:
        f.write('Score: {:s}: {:f}'.format(repr(self), score))
    return score
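# `core.score_data` is defined elsewhere in the repo. A minimal sketch of what
# it is assumed to compute -- log loss, optionally reweighted toward the test
# set's class balance (the usual Quora-competition correction). The name
# `score_data_sketch` and the weight constants are hypothetical, not the repo's.
import numpy as np

def score_data_sketch(y_true, y_pred, weighted=True):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), 1e-15, 1 - 1e-15)
    # Hypothetical per-class weights; uniform when weighted=False.
    w = np.where(y_true == 1, 0.472, 1.309) if weighted else np.ones_like(y_true)
    losses = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return float((w * losses).sum() / w.sum())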
def run(self):
    self.English = spacy.en.English()
    train_data = rf_dataset.Dataset().load_all('train')
    test_data = rf_dataset.Dataset().load_all('test')
    train_q12 = zip(train_data.question1_clean, train_data.question2_clean)
    test_q12 = zip(test_data.question1_clean, test_data.question2_clean)
    all_ent_train = [
        self.entity_diffs(v)
        for v in tqdm(train_q12, total=train_data.shape[0], desc='ents - train')
    ]
    all_ent_test = [
        self.entity_diffs(v)
        for v in tqdm(test_q12, total=test_data.shape[0], desc='ents - test')
    ]
    all_ent_train = np.asarray(all_ent_train)
    all_ent_test = np.asarray(all_ent_test)
    nose.tools.assert_equal(all_ent_train.shape[1], all_ent_test.shape[1])
    nose.tools.assert_equal(all_ent_train.shape[0], train_data.shape[0])
    nose.tools.assert_equal(all_ent_test.shape[0], test_data.shape[0])

    self.output().makedirs()
    # Write to a temporary file first, then rename so the output only appears
    # once it is complete.
    np.savez_compressed(self.make_path('done_tmp.npz'),
                        train=all_ent_train, test=all_ent_test)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
def run(self):
    self.output().makedirs()
    batch_size = 128
    normalizer = preprocessing.StandardScaler()
    train_q1, train_q2, train_other = rf_seq_data.RFWordSequenceDataset().load(
        'train', fold=self.fold)
    train_other = normalizer.fit_transform(train_other)
    train_labels = rf_dataset.Dataset().load(
        'train', fold=self.fold, as_df=True).is_duplicate
    print(train_q1.shape, train_q2.shape, train_other.shape)

    embedding = rf_seq_data.RFWordSequenceDataset().load_embedding_mat()
    np.random.seed(self.fold)
    model = self.model(embedding, train_q2.shape[1], train_other.shape[1])
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
    slow_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3)
    model_path = self.make_path('model.h5')
    model_checkpointer = keras.callbacks.ModelCheckpoint(
        model_path, save_best_only=True, save_weights_only=True)

    if self.include_distances():
        train_data = [train_q1, train_q2, train_other]
    else:
        train_data = [train_q1, train_q2]
    model.fit(train_data, train_labels,
              validation_split=0.05,
              epochs=20,
              batch_size=batch_size,
              shuffle=True,
              class_weight=dictweights,
              callbacks=[early_stopping, slow_plateau, model_checkpointer])
    # Restore the best weights seen during training.
    model.load_weights(model_path)

    valid_q1, valid_q2, valid_other = rf_seq_data.RFWordSequenceDataset().load(
        'valid', fold=self.fold)
    valid_other = normalizer.transform(valid_other)
    valid_labels = rf_dataset.Dataset().load(
        'valid', fold=self.fold, as_df=True).is_duplicate
    if self.include_distances():
        valid_data = [valid_q1, valid_q2, valid_other]
    else:
        valid_data = [valid_q1, valid_q2]
    valid_preds = model.predict(valid_data, verbose=1, batch_size=batch_size)
    valid_preds = np.clip(valid_preds, 1e-7, 1 - 1e-7)
    score = score_data(valid_labels.values, valid_preds)
    print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

    test_q1, test_q2, test_other = rf_seq_data.RFWordSequenceDataset().load(
        'test', None)
    test_other = normalizer.transform(test_other)
    if self.include_distances():
        test_data = [test_q1, test_q2, test_other]
    else:
        test_data = [test_q1, test_q2]
    test_preds = model.predict(test_data, verbose=1, batch_size=batch_size)

    np.savez_compressed(self.make_path('done_tmp.npz'),
                        valid=valid_preds, test=test_preds)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
    return score
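# `dictweights` above is not defined in this task; other tasks reference it as
# `core.dictweights`, so it is assumed to be imported from there. A sketch of
# its assumed shape -- per-class weights that shift the training positive rate
# toward the lower rate assumed for the test set; the numbers are illustrative:
dictweights_sketch = {0: 1.309, 1: 0.472}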
def run(self):
    self.output().makedirs()
    X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
    y_train = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
    y_valid = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate

    # Rebalance the training set: duplicate the negatives and keep only 80% of
    # the positives, lowering the positive rate the model sees.
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array(
        [0] * neg_train.shape[0]
        + [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0]
        + [0] * neg_train.shape[0])
    del pos_train, neg_train
    #pos_valid = X_valid[y_valid == 1]
    #neg_valid = X_valid[y_valid == 0]
    #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    #y_valid = np.array(
    #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
    #del pos_valid, neg_valid

    X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
        X_train, y_train, test_size=0.05)
    d_train = xgb.DMatrix(X_tr_tr, label=y_tr_tr)
    d_es = xgb.DMatrix(X_tr_es, label=y_tr_es)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(d_train, 'train'), (d_es, 'd_es')]

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.02,
        'max_depth': 7,
        'subsample': 0.6,
        'base_score': 0.2,
    }
    #bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
    bst = xgb.train(params, d_train, 1000, watchlist,
                    early_stopping_rounds=50, verbose_eval=50)
    p_valid = bst.predict(d_valid)
    print(score_data(y_valid, p_valid, weighted=False))

    X_test = RF_LeakyXGB_Dataset().load('test', None, as_df=True)
    d_test = xgb.DMatrix(X_test)
    p_test = bst.predict(d_test)
    np.savez_compressed(self.make_path('done_tmp.npz'), valid=p_valid, test=p_test)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
def run(self):
    self.output().makedirs()
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.stemmer = snowball.SnowballStemmer('english')
    self.vectorizer = CountVectorizer(ngram_range=(1, self.ngram_max),
                                      min_df=self.ngram_min_df)
    train_data = rf_dataset.Dataset().load('train', fold=None, as_df=True)
    test_data = rf_dataset.Dataset().load('test', fold=None, as_df=True)
    # Stack all question-1 texts (train then test) followed by all question-2
    # texts, so the first half of the fitted matrix is q1 and the second is q2.
    all_questions = np.concatenate([
        train_data.question1_clean.values,
        test_data.question1_clean.values,
        train_data.question2_clean.values,
        test_data.question2_clean.values,
    ])

    print(colors.lightblue | 'Tokenizing')
    all_tokens = multiprocessing.Pool(4).map(self.vectorize_question, all_questions)
    print(colors.lightblue | 'Finished tokenizing, now fitting')
    transformed_tokens = self.vectorizer.fit_transform(all_tokens)
    print(colors.lightblue | colors.bold | 'Gosh that takes a long time')

    transformed_tokens = transformed_tokens.tocsr()
    halfpt = transformed_tokens.shape[0] // 2
    assert halfpt == train_data.shape[0] + test_data.shape[0]
    q1s = transformed_tokens[:halfpt]
    q2s = transformed_tokens[halfpt:]
    train_q1s = q1s[:train_data.shape[0]]
    train_q2s = q2s[:train_data.shape[0]]
    test_q1s = q1s[train_data.shape[0]:]
    test_q2s = q2s[train_data.shape[0]:]
    nose.tools.assert_equal(test_q1s.shape[0], test_data.shape[0])
    nose.tools.assert_equal(test_q2s.shape[0], test_data.shape[0])
    nose.tools.assert_equal(train_q1s.shape[0], train_data.shape[0])
    nose.tools.assert_equal(train_q2s.shape[0], train_data.shape[0])

    self.write_mat_to(self.make_path('train_q1.pkl'), train_q1s)
    self.write_mat_to(self.make_path('train_q2.pkl'), train_q2s)
    self.write_mat_to(self.make_path('test_q1.pkl'), test_q1s)
    self.write_mat_to(self.make_path('test_q2.pkl'), test_q2s)

    # Pairwise features: elementwise |q1 - q2| alongside elementwise q1 * q2.
    diffs = sp.hstack([np.abs(q1s - q2s), q1s.multiply(q2s)]).tocsr()
    train_vecs = diffs[:train_data.shape[0]]
    test_vecs = diffs[train_data.shape[0]:]
    nose.tools.assert_equal(train_vecs.shape[0], train_data.shape[0])
    nose.tools.assert_equal(test_vecs.shape[0], test_data.shape[0])
    self.write_mat_to(self.make_path('train_mat.pkl'), train_vecs)
    self.write_mat_to(self.make_path('test_mat.pkl'), test_vecs)
    with self.output().open('w'):
        pass
def run(self):
    self.output().makedirs()
    X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
    y_train = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
    y_valid = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate

    # Same rebalancing as the XGBoost task: duplicate the negatives, keep 80%
    # of the positives.
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array(
        [0] * neg_train.shape[0]
        + [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0]
        + [0] * neg_train.shape[0])
    del pos_train, neg_train
    #pos_valid = X_valid[y_valid == 1]
    #neg_valid = X_valid[y_valid == 0]
    #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    #y_valid = np.array(
    #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
    #del pos_valid, neg_valid

    cls = lightgbm.sklearn.LGBMClassifier(n_estimators=2048, num_leaves=1024,
                                          learning_rate=0.03, subsample=0.75)
    X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
        X_train, y_train, test_size=0.05)
    cls.fit(X_tr_tr, y_tr_tr, eval_set=[(X_tr_es, y_tr_es)],
            early_stopping_rounds=50)
    valid_pred = cls.predict_proba(X_valid)[:, 1]
    print(colors.green | '{:s} == {:f}'.format(
        repr(self), score_data(y_valid, valid_pred, weighted=False)))
    print(colors.yellow | str(
        pandas.Series(cls.feature_importances_, index=X_train.columns).sort_values()))

    X_test = RF_LeakyXGB_Dataset().load(
        'test', None, as_df=True).fillna(-999).clip(-1000, 1000)
    test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('done_tmp.npz'),
                        valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
def score(self):
    self.output().makedirs()
    train_Xs = []
    train_ys = []
    # Folds 1..fold_max-1 supply the stacker's training data; fold 0 is held
    # out below for scoring.
    for fold in range(1, fold_max):
        y = rf_dataset.Dataset().load('valid', fold, as_df=True).is_duplicate.values.squeeze()
        x = self.fold_x(fold, 'valid')
        nose.tools.assert_equal(x.shape[0], y.shape[0])
        train_Xs.append(x)
        train_ys.append(y)

    sns.clustermap(pandas.concat(train_Xs, 0).corr())
    plt.yticks(rotation=90)
    plt.savefig('./corr.png')
    plt.close()

    train_X = pandas.concat(train_Xs, 0).values
    train_y = np.concatenate(train_ys, 0).squeeze()
    cls = AutoExitingGBMLike(XGBClassifier(
        n_estimators=1024,
        learning_rate=0.05,
        max_depth=8,
        gamma=1,
        subsample=0.5
    ), additional_fit_args={'verbose': False})
    #cls = AutoExitingGBMLike(lightgbm.sklearn.LGBMClassifier(
    #    n_estimators=1024,
    #    learning_rate=0.01,
    #    subsample=0.5,
    #    num_leaves=2048
    #), additional_fit_args={'verbose': False})
    #cls = pipeline.Pipeline([
    #    ('poly', preprocessing.PolynomialFeatures(2)),
    #    ('anova', feature_selection.SelectPercentile(feature_selection.f_classif)),
    #    ('lin', linear_model.LogisticRegression(C=1, class_weight=core.dictweights))
    #])
    #cls = keras.wrappers.scikit_learn.KerasClassifier(build_fn=self.simple_nn)
    cls.fit(train_X, train_y)
    if hasattr(cls, 'feature_importances_'):
        ds_names = [repr(d) for d in self.classifiers(0)]
        print(colors.yellow | str(
            pandas.Series(cls.feature_importances_, index=ds_names).sort_values()))

    test_x = self.fold_x(0, 'valid').values
    test_y = rf_dataset.Dataset().load('valid', 0, as_df=True).is_duplicate.values.squeeze()
    score = core.score_data(test_y, cls.predict_proba(test_x)[:, 1])
    return score, cls
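# `self.fold_x` is defined outside this listing. Given how it is used above, it
# is assumed to gather each base classifier's saved predictions for one fold
# into a DataFrame, one column per classifier. A hypothetical sketch reusing
# the repo's own save format (np.savez_compressed(..., data=y_pred)):
import numpy as np
import pandas

def fold_x_sketch(classifiers, dataset_name):
    cols = {c.__class__.__name__:
                np.load(c.make_path(dataset_name + '.npz'))['data'].squeeze()
            for c in classifiers}
    return pandas.DataFrame(cols)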
def _load(self, as_df):
    res = np.load(self.output().path)['train']
    if as_df:
        res = pandas.DataFrame(
            res, columns=['ent', 'nent', 'ent_diff', 'ent_ratio'])
    folds = rf_dataset.Dataset().load_dataset_folds()
    return res, folds
def load(self, name, fold, as_df=False, include_smallfeat=True):
    assert self.complete()
    assert not as_df, 'Dataframe mode not supported'
    assert include_smallfeat, 'implement in load_all then remove assert.'
    assert name in {'train', 'test', 'valid'}
    if name == 'test':
        res = np.load(self.make_path('test.npz'))
        if include_smallfeat:
            smallfeat = rf_small_features.SmallFeaturesTask()._load_test(False)
            return res['q1'], res['q2'], smallfeat
        else:
            return res['q1'], res['q2']
    else:
        res = np.load(self.make_path('train.npz'))
        smallfeat, sm_folds = rf_small_features.SmallFeaturesTask()._load(False)
        folds = (rf_dataset.Dataset().load_dataset_folds() + fold) % fold_max
        if name == 'valid':
            selection = folds == 0
        else:
            selection = folds != 0
        if include_smallfeat:
            return res['q1'][selection], res['q2'][selection], smallfeat[selection]
        else:
            return res['q1'][selection], res['q2'][selection]
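# The fold arithmetic above -- (load_dataset_folds() + fold) % fold_max, with
# validation being the rows whose rotated fold id is 0 -- is the pattern the
# tasks in this repo share. A standalone sketch, assuming fold ids in
# 0..fold_max-1 (the helper name is illustrative):
import numpy as np

def fold_masks_sketch(fold_ids, fold, fold_max):
    rotated = (np.asarray(fold_ids) + fold) % fold_max
    return rotated != 0, rotated == 0  # (train_mask, valid_mask)

# Rotating by a different `fold` selects a different slice as validation,
# giving fold_max disjoint validation sets from one stored fold assignment.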
def _load(self, as_df):
    res = np.load(self.output().path)['train_distances']
    if as_df:
        res = pandas.DataFrame(
            res, columns=['cosine', 'dice', 'hamming', 'kulsinski'])
    folds = rf_dataset.Dataset().load_dataset_folds()
    return res, folds
def _load(self, as_df):
    folds = rf_dataset.Dataset().load_dataset_folds()
    features = pandas.read_msgpack(_train_loc).fillna(9999).clip(-10000, 10000)
    if not as_df:
        features = features.values
    return features, folds
def run(self):
    self.output().makedirs()
    X_train = RF_LeakyXGB_Dataset().load(
        'train', self.fold, as_df=True).fillna(-999).clip(-1000, 1000)
    y_train = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    X_valid = RF_LeakyXGB_Dataset().load(
        'valid', self.fold, as_df=True).fillna(-999).clip(-1000, 1000)
    y_valid = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate

    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array(
        [0] * neg_train.shape[0]
        + [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0]
        + [0] * neg_train.shape[0])
    del pos_train, neg_train
    #pos_valid = X_valid[y_valid == 1]
    #neg_valid = X_valid[y_valid == 0]
    #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    #y_valid = np.array(
    #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
    #del pos_valid, neg_valid

    cls = ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=1024)
    # y_train is a plain ndarray after the rebalancing above, so it has no
    # `.values` attribute; pass it directly.
    cls.fit(X_train.values, y_train)
    valid_pred = cls.predict_proba(X_valid)[:, 1]
    print(colors.green | '{:s} == {:f}'.format(
        repr(self), score_data(y_valid, valid_pred)))
    print(colors.yellow | str(
        pandas.Series(cls.feature_importances_, index=X_train.columns).sort_values()))

    X_test = RF_LeakyXGB_Dataset().load(
        'test', None, as_df=True).fillna(-999).clip(-1000, 1000)
    test_pred = cls.predict_proba(X_test.values)[:, 1]
    np.savez_compressed(self.make_path('done_tmp.npz'),
                        valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)
def run(self):
    self.output().makedirs()
    train_data = rf_dataset.Dataset().load_all('train', as_df=True)
    test_data = rf_dataset.Dataset().load_all('test', as_df=True)
    # Question-1 texts (train then test) first, question-2 texts second.
    all_questions = (list(train_data.question1_clean) +
                     list(test_data.question1_clean) +
                     list(train_data.question2_clean) +
                     list(test_data.question2_clean))
    allq = multiprocessing.Pool().imap(self.hash_question, all_questions,
                                       chunksize=10000)
    hashes = list(tqdm(allq, total=len(all_questions)))

    train_size = train_data.shape[0]
    test_size = test_data.shape[0]
    q1s = hashes[:(train_size + test_size)]
    q2s = hashes[(train_size + test_size):]
    hashes = {
        'train_q1': q1s[:train_size],
        'train_q2': q2s[:train_size],
        'test_q1': q1s[train_size:],
        'test_q2': q2s[train_size:],
    }

    # Build a graph whose nodes are question hashes and whose edges are the
    # question pairs from both train and test.
    self.hashgraph = networkx.Graph()
    self.hashgraph.add_edges_from(zip(hashes['train_q1'], hashes['train_q2']))
    self.hashgraph.add_edges_from(zip(hashes['test_q1'], hashes['test_q2']))

    train_iter = tqdm(zip(hashes['train_q1'], hashes['train_q2']),
                      total=train_size, desc='train_feat')
    train_feat = [self.rehash(h1, h2) for h1, h2 in train_iter]
    np.savez_compressed(self.make_path('train.npz'), data=np.asarray(train_feat))

    test_iter = tqdm(zip(hashes['test_q1'], hashes['test_q2']),
                     total=test_size, desc='test_feat')
    test_feat = [self.rehash(h1, h2) for h1, h2 in test_iter]
    np.savez_compressed(self.make_path('test.npz'), data=np.asarray(test_feat))
    with self.output().open('w'):
        pass
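# `self.hash_question` is defined elsewhere. It is assumed to canonicalise a
# question string and hash it, so near-identical questions collapse onto one
# graph node. A hypothetical stand-in:
import hashlib

def hash_question_sketch(question):
    canonical = ' '.join(question.lower().split())
    return hashlib.md5(canonical.encode('utf-8')).hexdigest()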
def _load(self, as_df):
    res = np.load(self.make_path('train.npz'))['data']
    if as_df:
        res = pandas.DataFrame(
            res, columns=['l1_neighbours', 'graph_neighbours', 'l2_neighbours'])
    folds = rf_dataset.Dataset().load_dataset_folds()
    return res, folds
def _load(self, as_df):
    folds = rf_dataset.Dataset().load_dataset_folds()
    feat = kq.core.fillna(
        np.load(self.make_path('train.npz'))['data'], 9999).clip(-10000, 10000)
    if as_df:
        return pandas.DataFrame(feat, columns=self.columns()), folds
    else:
        return feat, folds
def run(self):
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
    train_data = rf_dataset.Dataset().load_all(
        'train', as_df=True)[['question1_clean', 'question2_clean']]
    test_data = rf_dataset.Dataset().load_all(
        'test', as_df=True)[['question1_clean', 'question2_clean']]
    all_data = pandas.concat([train_data, test_data], 0)
    distances = list(
        tqdm(multiprocessing.Pool().imap(
                 self.vectorize,
                 zip(all_data['question1_clean'], all_data['question2_clean']),
                 chunksize=50_000),
             total=all_data.shape[0],
             desc='vectorizing the words'))
def requires(self):
    yield rf_dataset.Dataset()
    xs = []
    for fold in range(fold_max):
        for cls in self.classifiers(fold):
            xs.append(cls)
    for v in sorted(xs, key=lambda c: c.__class__.__name__):
        yield v
def run(self):
    self.output().makedirs()
    kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
    train_dataset = rf_dataset.Dataset().load_all('train', as_df=True)
    test_dataset = rf_dataset.Dataset().load_all('test', as_df=True)
    self.tokenzier = treebank.TreebankWordTokenizer()
    all_words = pandas.concat([
        train_dataset.question1_clean.str.lower(),
        train_dataset.question2_clean.str.lower(),
        test_dataset.question1_clean.str.lower(),
        test_dataset.question2_clean.str.lower(),
    ])
    tokenizer = Tokenizer(num_words=250_000)
    tokenizer.fit_on_texts(all_words)
    all_seqs = tokenizer.texts_to_sequences(all_words)
    all_padded_seqs = pad_sequences(all_seqs, 32)

    train_seqs = all_padded_seqs[:train_dataset.shape[0] * 2]
    test_seqs = all_padded_seqs[train_dataset.shape[0] * 2:]
    nose.tools.assert_equal(test_seqs.shape[0], test_dataset.shape[0] * 2)
    train_q1 = train_seqs[:train_dataset.shape[0]]
    train_q2 = train_seqs[train_dataset.shape[0]:]
    test_q1 = test_seqs[:test_dataset.shape[0]]
    test_q2 = test_seqs[test_dataset.shape[0]:]
    np.savez_compressed(self.make_path('train.npz'), q1=train_q1, q2=train_q2)
    np.savez_compressed(self.make_path('test.npz'), q1=test_q1, q2=test_q2)

    embedding_matrix = np.zeros((250_000, 300))
    for word, ix in tokenizer.word_index.items():
        # word_index covers the full vocabulary, not just the num_words kept by
        # the tokenizer, so guard the row index before writing.
        if ix < 250_000 and word in kvecs:
            embedding_matrix[ix, :] = kvecs[word]
    np.savez_compressed(self.make_path('embedding.npz'), data=embedding_matrix)
    with self.output().open('w'):
        pass
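# The saved matrix is presumably consumed by a Keras Embedding layer in the
# model-building code. A sketch of the assumed wiring (the function name and
# the trainable=False choice are illustrative, not taken from the repo):
from keras.layers import Embedding

def embedding_layer_sketch(embedding_matrix, seq_len=32):
    vocab_size, dim = embedding_matrix.shape
    return Embedding(vocab_size, dim, weights=[embedding_matrix],
                     input_length=seq_len, trainable=False)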
def _load(self, as_df):
    decomps = [r._load(as_df)[0] for r in self.requires()]
    if as_df:
        for d, r in zip(decomps, self.requires()):
            self.wangjangle_columns(d, r)
        decomps = pandas.concat(decomps, 1)
    else:
        decomps = np.concatenate(decomps, 1)
    folds = rf_dataset.Dataset().load_dataset_folds()
    return decomps, folds
def run(self):
    self.output().makedirs()
    train = rf_dataset.Dataset().load_all('train', as_df=True)
    test = rf_dataset.Dataset().load_all('test', as_df=True)
    true_qid = {q: id for q, id in zip(train.question1, train.qid1)}
    true_qid.update({q: id for q, id in zip(train.question2, train.qid2)})
    train_max_id = max(true_qid.values())
    step_size = 507000 / 2345806
    current_id = train_max_id + step_size
    for q1, q2 in tqdm(zip(test.question1, test.question2), total=test.shape[0]):
        if q1 not in true_qid:
            true_qid[q1] = current_id
            current_id += step_size
        if q2 not in true_qid:
            true_qid[q2] = current_id
            current_id += step_size

    # The feature is the smaller (i.e. earlier) of the pair's two question ids.
    train_feature = [
        min(true_qid[q1], true_qid[q2])
        for q1, q2 in tqdm(zip(train.question1, train.question2),
                           total=train.shape[0])
    ]
    test_feature = [
        min(true_qid[q1], true_qid[q2])
        for q1, q2 in tqdm(zip(test.question1, test.question2),
                           total=test.shape[0])
    ]
    np.savez_compressed(self.make_path('train.npz'),
                        data=np.asarray(train_feature)[:, None])
    np.savez_compressed(self.make_path('test.npz'),
                        data=np.asarray(test_feature)[:, None])
    with self.output().open('w'):
        pass
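# Note on step_size: 507000 / 2345806 ~= 0.216 presumably encodes an estimate
# that about 507k of the ~2.35M test rows introduce a previously unseen
# question, so each test row advances the synthetic qid counter by that
# fraction on average. Both constants are taken as given from the code above.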
def _load(self, as_df):
    res = kq.core.fillna(
        np.load(self.make_path('train.npz'))['data'], 9999).clip(-10000, 10000)
    if not self.include_space:
        res = res[:, :len(distances)]
        cols = self.colnames()[:len(distances)]
    else:
        cols = self.colnames()
    if as_df:
        res = pandas.DataFrame(res, columns=cols)
    folds = rf_dataset.Dataset().load_dataset_folds()
    return res, folds
def run(self):
    self.output().makedirs()
    m1, m2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('train')
    # Binarise the count matrices; their elementwise product marks the words
    # that occur in both questions of a pair.
    m1 = m1 > 0
    m2 = m2 > 0
    X = m1.multiply(m2)
    folds = (rf_dataset.Dataset().load_dataset_folds() + self.fold) % fold_max
    train_X = X[folds != 0]
    train_y = rf_dataset.Dataset().load('train', fold=self.fold,
                                        as_df=True).is_duplicate.values
    cls = naive_bayes.BernoulliNB()
    cls.fit(train_X, train_y)

    valid_X = X[folds == 0]
    valid_y = rf_dataset.Dataset().load('valid', fold=self.fold,
                                        as_df=True).is_duplicate.values
    valid_pred = cls.predict_proba(valid_X)[:, 1]
    score = score_data(valid_y, valid_pred)
    print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

    t1, t2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('test')
    t1 = t1 > 0
    t2 = t2 > 0
    test_X = t1.multiply(t2)
    test_pred = cls.predict_proba(test_X)[:, 1]
    np.savez_compressed(self.make_path('done_tmp.npz'),
                        valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.make_path('done.npz'))
    return score
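# On boolean sparse matrices, .multiply() is an elementwise AND, so each
# feature fed to BernoulliNB above is a "this word occurs in both questions"
# indicator. A tiny self-contained check of that behaviour:
import numpy as np
import scipy.sparse as sp

a = sp.csr_matrix(np.array([[2, 0, 1]])) > 0   # words in question 1
b = sp.csr_matrix(np.array([[1, 3, 0]])) > 0   # words in question 2
assert (a.multiply(b).toarray() == np.array([[True, False, False]])).all()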
def _load(self, as_df):
    assert not as_df, 'Pandas mode not supported'
    Xs = []
    n_vs = None
    for r in self.requires():
        x = r.load_all('train', False)
        nose.tools.assert_equal(len(x.shape), 2, repr(r))
        if isinstance(x, np.ndarray):
            x[np.isnan(x)] = 999
            x = np.clip(x, -1000, 1000)
        if n_vs is None:
            n_vs = x.shape[0]
        else:
            nose.tools.assert_equal(n_vs, x.shape[0], repr(r))
        Xs.append(x)
    folds = rf_dataset.Dataset().load_dataset_folds()
    res = sp.hstack(Xs)
    return res.tocsr(), folds
def _load(self, as_df):
    Xs = []
    n_vs = None
    for r in self.requires():
        x = r.load_all('train', as_df)
        nose.tools.assert_equal(len(x.shape), 2, repr(r))
        if n_vs is None:
            n_vs = x.shape[0]
        else:
            nose.tools.assert_equal(n_vs, x.shape[0], repr(r))
        if as_df:
            nose.tools.assert_is_instance(x, pandas.DataFrame, repr(r))
            x.columns = [r.__class__.__name__ + '_' + c for c in x.columns]
            x = x.fillna(999).clip(-1000, 1000)
        Xs.append(x)
    folds = rf_dataset.Dataset().load_dataset_folds()
    if as_df:
        res = pandas.concat(Xs, 1).reset_index(drop=True)
        return res, folds
    else:
        res = np.concatenate(Xs, 1)
        res[np.isnan(res)] = 999
        return np.clip(res, -1000, 1000), folds
def requires(self):
    yield rf_dataset.Dataset()
    yield rf_word_count_features.WordCountMatrix()

def requires(self):
    yield rf_dataset.Dataset()
    yield rf_small_features.SmallFeaturesTask()

def requires(self):
    yield rf_seq_data.RFWordSequenceDataset()
    yield rf_dataset.Dataset()

def requires(self):
    yield rf_dataset.Dataset()
    yield rf_ab.ABDataset()