def run(self):
    self.output().makedirs()
    # load_raw_vectors returns a (q1, q2) pair of sparse word-count matrices
    wc_mat_train = rf_word_count_features.WordCountMatrix().load_raw_vectors('train')
    wc_mat_test = rf_word_count_features.WordCountMatrix().load_raw_vectors('test')
    # Stack [train_q1, train_q2, test_q1, test_q2] so the decomposition is
    # fitted on every question vector at once.
    all_vecs = sp.vstack(list(wc_mat_train) + list(wc_mat_test))
    decomp = self.decomposition()
    decomposed = decomp.fit_transform(all_vecs)
    train_size = wc_mat_train[0].shape[0]
    test_size = wc_mat_test[0].shape[0]
    # The first 2 * train_size rows are the train q1/q2 vectors; the rest are test.
    train_decomp = decomposed[:train_size * 2]
    test_decomp = decomposed[train_size * 2:]
    assert test_decomp.shape[0] == test_size * 2
    decomp_train = {
        'q1': train_decomp[:train_size],
        'q2': train_decomp[train_size:]
    }
    decomp_test = {
        'q1': test_decomp[:test_size],
        'q2': test_decomp[test_size:]
    }
    train_dists = list(
        tqdm(multiprocessing.Pool().imap(
                 self.decomp_dist,
                 zip(decomp_train['q1'], decomp_train['q2']),
                 chunksize=50_000),
             total=train_size,
             desc='vectorizing the training data'))

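# A minimal sketch of what `self.decomp_dist` might compute; the helper is
# not shown in this excerpt, so the exact set of distances is an assumption.
# It takes one (q1, q2) pair of decomposed row vectors and returns a small
# vector of distances between them.
import numpy as np
from scipy.spatial import distance

def decomp_dist(pair):
    v1, v2 = pair
    return np.array([
        distance.euclidean(v1, v2),   # straight-line (L2) distance
        distance.cityblock(v1, v2),   # L1 distance
        distance.cosine(v1, v2),      # 1 - cosine similarity
    ])
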
def run(self):
    self.output().makedirs()
    train_q1, train_q2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('train')
    train_distances = distances_from_mats(train_q1, train_q2)
    test_q1, test_q2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('test')
    test_distances = distances_from_mats(test_q1, test_q2)
    # Write to a temp file first, then rename: the output only appears once
    # it is complete, so a killed job cannot leave a half-written artifact.
    np.savez_compressed(self.make_path('done_tmp.npz'),
                        train_distances=train_distances,
                        test_distances=test_distances)
    os.rename(self.make_path('done_tmp.npz'), self.output().path)

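# Hypothetical sketch of `distances_from_mats`, which this excerpt calls but
# does not define. Assumed behaviour: pair up the rows of the q1 and q2
# sparse matrices and compute per-pair distances, one feature row per
# question pair.
import numpy as np
from scipy.spatial import distance

def distances_from_mats(q1, q2):
    rows = []
    for v1, v2 in zip(q1, q2):  # iterating a CSR matrix yields 1 x n rows
        a = np.asarray(v1.todense()).ravel()
        b = np.asarray(v2.todense()).ravel()
        rows.append([distance.euclidean(a, b),
                     distance.cityblock(a, b),
                     distance.cosine(a, b) if a.any() and b.any() else 1.0])
    return np.array(rows)
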
def run(self):
    self.output().makedirs()
    wc_data = rf_word_count_features.WordCountMatrix()
    X = wc_data.load('train', self.fold).astype(np.float32)
    y = rf_dataset.Dataset().load('train', self.fold, as_df=True).is_duplicate
    cls = self.make_cls()
    cls.fit(X, y)
    X_val = wc_data.load('valid', self.fold).astype(np.float32)
    y_val = rf_dataset.Dataset().load('valid', self.fold, as_df=True).is_duplicate
    y_pred = cls.predict_proba(X_val)[:, 1]
    np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
    score = core.score_data(y_val, y_pred)
    # Free the train/validation matrices before loading the (larger) test set.
    del X, y, X_val, y_val
    X_test = wc_data.load('test', None).astype(np.float32)
    y_test_pred = cls.predict_proba(X_test)[:, 1]
    np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)
    print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))
    with self.output().open('w') as f:
        f.write('Score: {:s}: {:f}'.format(repr(self), score))
    return score

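# `self.make_cls()` is defined elsewhere on the task; below is a sketch of a
# plausible implementation. The model choice and its parameters are
# assumptions, not taken from the original source.
from sklearn import ensemble

def make_cls(self):
    return ensemble.RandomForestClassifier(n_estimators=500, n_jobs=-1)
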
def requires(self):
    yield rf_decomposition.AllDecompositions()
    yield rf_word_count_distances.WordCountDistances()
    yield rf_distances.RFDistanceCalculator()
    yield rf_vectorspaces.VectorSpaceTask(include_space=False)
    yield rf_magic_features.QuestionFrequency()
    yield rf_magic_features.NeighbourhoodFeature()
    yield rf_magic_features.QuestionOrderFeature()
    yield rf_leaky.RF_LeakyXGB_Dataset()
    yield rf_pos_distances.RF_POS_Distance()
    yield rf_word_count_features.WordCountMatrix()

def run(self):
    self.output().makedirs()
    m1, m2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('train')
    # Binarize the word counts and take the elementwise product, so each
    # feature marks a word that occurs in both questions.
    m1 = m1 > 0
    m2 = m2 > 0
    X = m1.multiply(m2)
    # Rotate the fold assignments by self.fold; fold 0 of the rotated labels
    # is this task's validation split.
    folds = (rf_dataset.Dataset().load_dataset_folds() + self.fold) % fold_max
    train_X = X[folds != 0]
    train_y = rf_dataset.Dataset().load('train', fold=self.fold, as_df=True).is_duplicate.values
    cls = naive_bayes.BernoulliNB()
    cls.fit(train_X, train_y)
    valid_X = X[folds == 0]
    valid_y = rf_dataset.Dataset().load('valid', fold=self.fold, as_df=True).is_duplicate.values
    valid_pred = cls.predict_proba(valid_X)[:, 1]
    score = score_data(valid_y, valid_pred)
    print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))
    t1, t2 = rf_word_count_features.WordCountMatrix().load_raw_vectors('test')
    t1 = t1 > 0
    t2 = t2 > 0
    test_X = t1.multiply(t2)
    test_pred = cls.predict_proba(test_X)[:, 1]
    # Write-then-rename so the output only appears once it is fully written.
    np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_pred, test=test_pred)
    os.rename(self.make_path('done_tmp.npz'), self.make_path('done.npz'))
    return score

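# `score_data` is imported from elsewhere in the repo. Log loss is a
# reasonable guess for the metric, since `is_duplicate` suggests the Quora
# question-pairs task and that competition scored on log loss, but the real
# helper may differ (e.g. it might rebalance class frequencies first).
from sklearn import metrics

def score_data(y_true, y_pred):
    return metrics.log_loss(y_true, y_pred)
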
def requires(self):
    yield rf_dataset.Dataset()
    yield rf_word_count_features.WordCountMatrix()

def requires(self):
    yield rf_word_count_features.WordCountMatrix()