Example 1
    def run(self):
        self.output().makedirs()
        train_data = rf_dataset.Dataset().load_all('train', as_df=True)
        test_data = rf_dataset.Dataset().load_all('test', as_df=True)

        all_questions = pandas.concat([
            train_data.question1_clean,
            train_data.question2_clean,
            test_data.question1_clean,
            test_data.question2_clean,
        ])
        question_freq = all_questions.value_counts().to_dict()

        train_feat = pandas.DataFrame({
            'freq1':
            train_data.question1_clean.map(question_freq),
            'freq2':
            train_data.question2_clean.map(question_freq)
        })
        train_feat['freq_diff'] = np.abs(train_feat.freq1 - train_feat.freq2)
        test_feat = pandas.DataFrame({
            'freq1':
            test_data.question1_clean.map(question_freq),
            'freq2':
            test_data.question2_clean.map(question_freq)
        })
        test_feat['freq_diff'] = np.abs(test_feat.freq1 - test_feat.freq2)

        train_feat.to_msgpack(self.make_path('train.msg'))
        test_feat.to_msgpack(self.make_path('test.msg'))

        with self.output().open('w'):
            pass
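
Example 1 builds the question-frequency feature: how often each question string occurs anywhere in train or test, plus the absolute difference between the two pair frequencies. A minimal sketch of the same value_counts/map pattern on toy data (the column names here are illustrative, not the ones used above):

    import numpy as np
    import pandas as pd

    # Toy stand-ins for the cleaned question columns.
    train = pd.DataFrame({'q1': ['a', 'b', 'a'], 'q2': ['b', 'c', 'a']})
    test = pd.DataFrame({'q1': ['a'], 'q2': ['c']})

    # Frequency of every question string across all four columns.
    freq = pd.concat([train.q1, train.q2, test.q1, test.q2]).value_counts().to_dict()

    feat = pd.DataFrame({'freq1': train.q1.map(freq),
                         'freq2': train.q2.map(freq)})
    feat['freq_diff'] = np.abs(feat.freq1 - feat.freq2)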
Example 2
    def run(self):
        self.output().makedirs()
        wc_data = rf_word_count_features.WordCountMatrix()

        X = wc_data.load('train', self.fold).astype(np.float32)
        y = rf_dataset.Dataset().load('train', self.fold,
                                      as_df=True).is_duplicate

        cls = self.make_cls()
        cls.fit(X, y)

        X_val = wc_data.load('valid', self.fold).astype(np.float32)
        y_val = rf_dataset.Dataset().load('valid', self.fold,
                                          as_df=True).is_duplicate

        y_pred = cls.predict_proba(X_val)[:, 1]
        np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
        score = core.score_data(y_val, y_pred)

        del X, y, X_val, y_val
        X_test = wc_data.load('test', None).astype(np.float32)
        y_test_pred = cls.predict_proba(X_test)[:, 1]
        np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

        print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))

        with self.output().open('w') as f:
            f.write('Score: {:s}: {:f}'.format(repr(self), score))
        return score
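
Examples 2 and 3 share one per-fold protocol: fit on the fold's training split, save predictions for its held-out validation split, then predict the full test set with the same fitted model. A self-contained sketch of that protocol with a stock scikit-learn classifier on synthetic arrays (all names here are illustrative):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import log_loss

    rng = np.random.RandomState(0)
    X, y = rng.randn(100, 5), rng.randint(0, 2, 100)
    X_val, y_val = rng.randn(20, 5), rng.randint(0, 2, 20)
    X_test = rng.randn(30, 5)

    cls = LogisticRegression().fit(X, y)
    y_pred = cls.predict_proba(X_val)[:, 1]         # held-out fold predictions
    np.savez_compressed('valid.npz', data=y_pred)   # kept for later stacking
    np.savez_compressed('test.npz', data=cls.predict_proba(X_test)[:, 1])
    print('fold score (log loss):', log_loss(y_val, y_pred))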
Example 3
    def run(self):
        self.output().makedirs()
        data = self.xdataset()

        X = data.load('train', self.fold)
        y = rf_dataset.Dataset().load('train', self.fold,
                                      as_df=True).is_duplicate

        cls = self.make_cls()
        print('Training classifier {:s} on data of size: {}'.format(
            repr(cls), X.shape))
        cls.fit(X, y)

        self.post_fit(cls)

        X_val = data.load('valid', self.fold)
        y_val = rf_dataset.Dataset().load('valid', self.fold,
                                          as_df=True).is_duplicate

        y_pred = cls.predict_proba(X_val)[:, 1]
        np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
        score = core.score_data(y_val, y_pred)

        del X, y, X_val, y_val
        X_test = data.load('test', None)
        y_test_pred = cls.predict_proba(X_test)[:, 1]
        np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

        print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))

        with self.output().open('w') as f:
            f.write('Score: {:s}: {:f}'.format(repr(self), score))
        return score
Example 4
    def run(self):
        self.English = spacy.en.English()
        train_data = rf_dataset.Dataset().load_all('train')
        test_data = rf_dataset.Dataset().load_all('test')

        train_q12 = zip(train_data.question1_clean, train_data.question2_clean)
        test_q12 = zip(test_data.question1_clean, test_data.question2_clean)
        all_ent_train = [
            self.entity_diffs(v) for v in tqdm(
                train_q12, total=train_data.shape[0], desc='ents - train')
        ]
        all_ent_test = [
            self.entity_diffs(v) for v in tqdm(
                test_q12, total=test_data.shape[0], desc='ents - test')
        ]

        all_ent_train = np.asarray(all_ent_train)
        all_ent_test = np.asarray(all_ent_test)
        nose.tools.assert_equal(all_ent_train.shape[1], all_ent_test.shape[1])
        nose.tools.assert_equal(all_ent_train.shape[0], train_data.shape[0])
        nose.tools.assert_equal(all_ent_test.shape[0], test_data.shape[0])

        self.output().makedirs()
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            train=all_ent_train,
                            test=all_ent_test)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
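
Example 4 writes its arrays to a temporary file and only then os.rename()s it onto the task's output path, so downstream tasks never observe a half-written output. The pattern in isolation:

    import os
    import numpy as np

    tmp, final = 'done_tmp.npz', 'done.npz'
    np.savez_compressed(tmp, train=np.zeros((2, 4)), test=np.ones((3, 4)))
    # On POSIX, rename within one filesystem is atomic: 'done.npz' either
    # does not exist yet or is the complete file.
    os.rename(tmp, final)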
Example 5
    def run(self):
        self.output().makedirs()
        batch_size = 128
        normalizer = preprocessing.StandardScaler()

        train_q1, train_q2, train_other = rf_seq_data.RFWordSequenceDataset().load('train', fold=self.fold)
        train_other = normalizer.fit_transform(train_other)
        train_labels = rf_dataset.Dataset().load('train', fold=self.fold, as_df=True).is_duplicate
        print(train_q1.shape, train_q2.shape, train_other.shape)
        embedding = rf_seq_data.RFWordSequenceDataset().load_embedding_mat()

        np.random.seed(self.fold)
        model = self.model(embedding, train_q2.shape[1], train_other.shape[1])

        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
        slow_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3)
        model_path = self.make_path('model.h5')
        model_checkpointer = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)

        if self.include_distances():
            train_data = [train_q1, train_q2, train_other]
        else:
            train_data = [train_q1, train_q2]

        model.fit(
            train_data, train_labels,
            validation_split=0.05,
            epochs=20, batch_size=batch_size, shuffle=True,
            class_weight=dictweights,
            callbacks=[early_stopping, slow_plateau, model_checkpointer])
        model.load_weights(model_path)

        valid_q1, valid_q2, valid_other = rf_seq_data.RFWordSequenceDataset().load('valid', fold=self.fold)
        valid_other = normalizer.transform(valid_other)
        valid_labels = rf_dataset.Dataset().load('valid', fold=self.fold, as_df=True).is_duplicate
        if self.include_distances():
            valid_data = [valid_q1, valid_q2, valid_other]
        else:
            valid_data = [valid_q1, valid_q2]

        valid_preds = model.predict(valid_data, verbose=1, batch_size=batch_size)
        valid_preds = np.clip(valid_preds, 1e-7, 1 - 1e-7)

        score = score_data(valid_labels.values, valid_preds)
        print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

        test_q1, test_q2, test_other = rf_seq_data.RFWordSequenceDataset().load('test', None)
        test_other = normalizer.transform(test_other)
        if self.include_distances():
            test_data = [test_q1, test_q2, test_other]
        else:
            test_data = [test_q1, test_q2]

        test_preds = model.predict(test_data, verbose=1, batch_size=batch_size)

        np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_preds, test=test_preds)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
        return score
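
Example 5 clips the network's validation probabilities to [1e-7, 1 - 1e-7] before scoring. Log loss diverges as a prediction approaches exactly 0 or 1 on a row with the opposite label, so clipping bounds the penalty any single confident mistake can contribute. A small demonstration (logloss here is a hand-rolled stand-in for the snippet's score_data, which is not shown):

    import numpy as np

    def logloss(y, p):
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

    y = np.array([1.0, 1.0, 0.0])
    p = np.array([0.0, 0.9, 0.1])                  # first prediction is confidently wrong
    print(logloss(y, p))                           # inf (numpy warns about log(0))
    print(logloss(y, np.clip(p, 1e-7, 1 - 1e-7)))  # finite after clipping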
Example 6
    def run(self):
        self.output().makedirs()
        X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
        y_train = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate
        X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
        y_valid = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat(
            (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
        y_train = np.array(
            [0] * neg_train.shape[0] +
            [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
            [0] * neg_train.shape[0])
        del pos_train, neg_train

        #pos_valid = X_valid[y_valid == 1]
        #neg_valid = X_valid[y_valid == 0]
        #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
        #y_valid = np.array(
        #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
        #del pos_valid, neg_valid
        X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
            X_train, y_train, test_size=0.05)

        d_train = xgb.DMatrix(X_tr_tr, label=y_tr_tr)
        d_es = xgb.DMatrix(X_tr_es, label=y_tr_es)
        d_valid = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(d_train, 'train'), (d_es, 'd_es')]

        params = {}
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        params['eta'] = 0.02
        params['max_depth'] = 7
        params['subsample'] = 0.6
        params['base_score'] = 0.2

        #bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
        bst = xgb.train(params,
                        d_train,
                        1000,
                        watchlist,
                        early_stopping_rounds=50,
                        verbose_eval=50)
        p_valid = bst.predict(d_valid)
        print(score_data(y_valid, p_valid, weighted=False))
        X_test = RF_LeakyXGB_Dataset().load('test', None, as_df=True)
        d_test = xgb.DMatrix(X_test)
        p_test = bst.predict(d_test)

        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=p_valid,
                            test=p_test)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
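
Examples 6, 8, and 14 all rebuild the training set as (negatives, 80% of positives, negatives again): negatives are duplicated and positives trimmed, pushing the training class prior down toward the lower duplicate rate generally estimated for the test set. The arithmetic, using approximate class counts for the Quora training data:

    pos, neg = 149263, 255027    # roughly the train duplicate/non-duplicate counts
    kept_pos = int(0.8 * pos)
    print(pos / (pos + neg))                   # ~0.37 original positive rate
    print(kept_pos / (kept_pos + 2 * neg))     # ~0.19 after the resampling above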
Example 7
    def run(self):
        self.output().makedirs()
        self.tokenzier = treebank.TreebankWordTokenizer()
        self.stemmer = snowball.SnowballStemmer('english')
        self.vectorizer = CountVectorizer(ngram_range=(1, self.ngram_max),
                                          min_df=self.ngram_min_df)
        train_data = rf_dataset.Dataset().load('train', fold=None, as_df=True)
        test_data = rf_dataset.Dataset().load('test', fold=None, as_df=True)

        all_questions = np.concatenate([
            train_data.question1_clean.values,
            test_data.question1_clean.values,
            train_data.question2_clean.values, test_data.question2_clean.values
        ])

        print(colors.lightblue | 'Tokenizing')
        all_tokens = multiprocessing.Pool(4).map(self.vectorize_question,
                                                 all_questions)
        print(colors.lightblue | 'Finished tokenizing, now fitting')
        transformed_tokens = self.vectorizer.fit_transform(all_tokens)
        print(colors.lightblue | colors.bold | 'Gosh that takes a long time')
        transformed_tokens = transformed_tokens.tocsr()

        halfpt = transformed_tokens.shape[0] // 2
        assert halfpt == train_data.shape[0] + test_data.shape[0]
        q1s = transformed_tokens[:halfpt]
        q2s = transformed_tokens[halfpt:]

        train_q1s = q1s[:train_data.shape[0]]
        train_q2s = q2s[:train_data.shape[0]]
        test_q1s = q1s[train_data.shape[0]:]
        test_q2s = q2s[train_data.shape[0]:]
        nose.tools.assert_equal(test_q1s.shape[0], test_data.shape[0])
        nose.tools.assert_equal(test_q2s.shape[0], test_data.shape[0])
        nose.tools.assert_equal(train_q1s.shape[0], train_data.shape[0])
        nose.tools.assert_equal(train_q2s.shape[0], train_data.shape[0])

        self.write_mat_to(self.make_path('train_q1.pkl'), train_q1s)
        self.write_mat_to(self.make_path('train_q2.pkl'), train_q2s)
        self.write_mat_to(self.make_path('test_q1.pkl'), test_q1s)
        self.write_mat_to(self.make_path('test_q2.pkl'), test_q2s)

        diffs = sp.hstack([np.abs(q1s - q2s), q1s.multiply(q2s)]).tocsr()

        train_vecs = diffs[:train_data.shape[0]]
        test_vecs = diffs[train_data.shape[0]:]
        nose.tools.assert_equal(train_vecs.shape[0], train_data.shape[0])
        nose.tools.assert_equal(test_vecs.shape[0], test_data.shape[0])

        self.write_mat_to(self.make_path('train_mat.pkl'), train_vecs)
        self.write_mat_to(self.make_path('test_mat.pkl'), test_vecs)

        with self.output().open('w'):
            pass
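
Example 7's pair features are the absolute difference and the elementwise product of the two sparse bag-of-ngrams vectors: the difference highlights terms one question has and the other lacks, the product highlights shared terms. The same construction on toy matrices:

    import numpy as np
    import scipy.sparse as sp

    # Two question matrices over an aligned 3-term vocabulary.
    q1 = sp.csr_matrix(np.array([[1, 0, 2], [0, 1, 0]]))
    q2 = sp.csr_matrix(np.array([[1, 1, 0], [0, 1, 1]]))

    diffs = sp.hstack([np.abs(q1 - q2), q1.multiply(q2)]).tocsr()
    print(diffs.toarray())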
Example 8
    def run(self):
        self.output().makedirs()
        X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
        y_train = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate
        X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
        y_valid = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat(
            (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
        y_train = np.array(
            [0] * neg_train.shape[0] +
            [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
            [0] * neg_train.shape[0])
        del pos_train, neg_train

        #pos_valid = X_valid[y_valid == 1]
        #neg_valid = X_valid[y_valid == 0]
        #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
        #y_valid = np.array(
        #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
        #del pos_valid, neg_valid

        cls = lightgbm.sklearn.LGBMClassifier(n_estimators=2048,
                                              num_leaves=1024,
                                              learning_rate=0.03,
                                              subsample=0.75)
        X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
            X_train, y_train, test_size=0.05)
        cls.fit(X_tr_tr,
                y_tr_tr,
                eval_set=[(X_tr_es, y_tr_es)],
                early_stopping_rounds=50)
        valid_pred = cls.predict_proba(X_valid)[:, 1]
        print(colors.green | '{:s} == {:f}'.format(
            repr(self), score_data(y_valid, valid_pred, weighted=False)))
        print(colors.yellow | str(
            pandas.Series(cls.feature_importances_,
                          index=X_train.columns).sort_values()))

        X_test = RF_LeakyXGB_Dataset().load('test', None,
                                            as_df=True).fillna(-999).clip(
                                                -1000, 1000)
        test_pred = cls.predict_proba(X_test)[:, 1]

        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
Example 9
    def score(self):
        self.output().makedirs()
        train_Xs = []
        train_ys = []
        for fold in range(1, fold_max):
            y = rf_dataset.Dataset().load('valid', fold, as_df=True).is_duplicate.values.squeeze()
            x = self.fold_x(fold, 'valid')
            nose.tools.assert_equal(x.shape[0], y.shape[0])
            train_Xs.append(x)
            train_ys.append(y)
        sns.clustermap(pandas.concat(train_Xs, 0).corr())
        plt.yticks(rotation=90)
        plt.savefig('./corr.png')
        plt.close()
        train_X = pandas.concat(train_Xs, 0).values
        train_y = np.concatenate(train_ys, 0).squeeze()

        cls = AutoExitingGBMLike(XGBClassifier(
            n_estimators=1024,
            learning_rate=0.05,
            max_depth=8,
            gamma=1,
            subsample=0.5
        ), additional_fit_args={'verbose': False})

        #cls = AutoExitingGBMLike(lightgbm.sklearn.LGBMClassifier(
        #    n_estimators=1024,
        #    learning_rate=0.01,
        #    subsample=0.5,
        #    num_leaves=2048
        #), additional_fit_args={'verbose': False})
        #cls = pipeline.Pipeline([
        #    ('poly', preprocessing.PolynomialFeatures(2)),
        #    ('anova', feature_selection.SelectPercentile(feature_selection.f_classif)),
        #    ('lin', linear_model.LogisticRegression(C=1, class_weight=core.dictweights))
        #])
        #cls = keras.wrappers.scikit_learn.KerasClassifier(build_fn=self.simple_nn)

        cls.fit(train_X, train_y)
        if hasattr(cls, 'feature_importances_'):
            ds_names = [repr(d) for d in self.classifiers(0)]
            print(colors.yellow | str(pandas.Series(cls.feature_importances_, index=ds_names).sort_values()))

        test_x = self.fold_x(0, 'valid').values
        test_y = rf_dataset.Dataset().load('valid', 0, as_df=True).is_duplicate.values.squeeze()

        score = core.score_data(test_y, cls.predict_proba(test_x)[:, 1])
        return score, cls
Example 10
 def _load(self, as_df):
     res = np.load(self.output().path)['train']
     if as_df:
         res = pandas.DataFrame(
             res, columns=['ent', 'nent', 'ent_diff', 'ent_ratio'])
     folds = rf_dataset.Dataset().load_dataset_folds()
     return res, folds
Example 11
 def load(self, name, fold, as_df=False, include_smallfeat=True):
     assert self.complete()
     assert not as_df, 'Dataframe mode not supported'
     assert include_smallfeat, 'implement in load_all then remove assert.'
     assert name in {'train', 'test', 'valid'}
     if name == 'test':
         res = np.load(self.make_path('test.npz'))
         if include_smallfeat:
             smallfeat = rf_small_features.SmallFeaturesTask()._load_test(
                 False)
             return res['q1'], res['q2'], smallfeat
         else:
             return res['q1'], res['q2']
     else:
         res = np.load(self.make_path('train.npz'))
         smallfeat, sm_folds = rf_small_features.SmallFeaturesTask()._load(
             False)
         folds = (rf_dataset.Dataset().load_dataset_folds() +
                  fold) % fold_max
         if name == 'valid':
             selection = folds == 0
         else:
             selection = folds != 0
         if include_smallfeat:
             return res['q1'][selection], res['q2'][selection], smallfeat[
                 selection]
         else:
             return res['q1'][selection], res['q2'][selection]
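
Example 11 (and again Example 24) picks its validation rows with (folds + fold) % fold_max: the fixed per-row fold ids are rotated so that the slice equal to 0 is always the held-out one, and each value of self.fold holds out a different original fold. Compactly:

    import numpy as np

    fold_max = 4
    folds = np.array([0, 1, 2, 3, 0, 1, 2, 3])   # fixed per-row fold assignment

    for fold in range(fold_max):
        rotated = (folds + fold) % fold_max
        # fold=0 holds out original fold 0, fold=1 holds out fold 3, etc.
        print(fold, folds[rotated == 0])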
Example 12
 def _load(self, as_df):
     res = np.load(self.output().path)['train_distances']
     if as_df:
         res = pandas.DataFrame(
             res, columns=['cosine', 'dice', 'hamming', 'kulsinski'])
     folds = rf_dataset.Dataset().load_dataset_folds()
     return res, folds
Example 13
 def _load(self, as_df):
     folds = rf_dataset.Dataset().load_dataset_folds()
     features = pandas.read_msgpack(_train_loc).fillna(9999).clip(
         -10000, 10000)
     if not as_df:
         features = features.values
     return features, folds
Example 14
    def run(self):
        self.output().makedirs()
        X_train = RF_LeakyXGB_Dataset().load('train', self.fold,
                                             as_df=True).fillna(-999).clip(
                                                 -1000, 1000)
        y_train = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate
        X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold,
                                             as_df=True).fillna(-999).clip(
                                                 -1000, 1000)
        y_valid = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat(
            (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
        y_train = np.array(
            [0] * neg_train.shape[0] +
            [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
            [0] * neg_train.shape[0])
        del pos_train, neg_train

        #pos_valid = X_valid[y_valid == 1]
        #neg_valid = X_valid[y_valid == 0]
        #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
        #y_valid = np.array(
        #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
        #del pos_valid, neg_valid

        cls = ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=1024)
        cls.fit(X_train.values, y_train.values)

        valid_pred = cls.predict_proba(X_valid)[:, 1]
        print(colors.green | '{:s} == {:f}'.format(
            repr(self), score_data(y_valid, valid_pred)))
        print(colors.yellow | str(
            pandas.Series(cls.feature_importances_,
                          index=X_train.columns).sort_values()))
        X_test = RF_LeakyXGB_Dataset().load('test', None,
                                            as_df=True).fillna(-999).clip(
                                                -1000, 1000)
        test_pred = cls.predict_proba(X_test.values)[:, 1]
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
Example 15
    def run(self):
        self.output().makedirs()
        train_data = rf_dataset.Dataset().load_all('train', as_df=True)
        test_data = rf_dataset.Dataset().load_all('test', as_df=True)

        all_questions = (list(train_data.question1_clean) +
                         list(test_data.question1_clean) +
                         list(train_data.question2_clean) +
                         list(test_data.question2_clean))

        allq = multiprocessing.Pool().imap(self.hash_question,
                                           all_questions,
                                           chunksize=10000)
        hashes = list(tqdm(allq, total=len(all_questions)))
        train_size = train_data.shape[0]
        test_size = test_data.shape[0]
        q1s = hashes[:(train_size + test_size)]
        q2s = hashes[(train_size + test_size):]
        hashes = {
            'train_q1': q1s[:train_size],
            'train_q2': q2s[:train_size],
            'test_q1': q1s[train_size:],
            'test_q2': q2s[train_size:],
        }
        self.hashgraph = networkx.Graph()
        self.hashgraph.add_edges_from(
            zip(hashes['train_q1'], hashes['train_q2']))
        self.hashgraph.add_edges_from(zip(hashes['test_q1'],
                                          hashes['test_q2']))

        train_iter = tqdm(zip(hashes['train_q1'], hashes['train_q2']),
                          total=train_size,
                          desc='train_feat')
        train_feat = [self.rehash(h1, h2) for h1, h2 in train_iter]
        np.savez_compressed(self.make_path('train.npz'),
                            data=np.asarray(train_feat))

        test_iter = tqdm(zip(hashes['test_q1'], hashes['test_q2']),
                         total=test_size,
                         desc='test_feat')
        test_feat = [self.rehash(h1, h2) for h1, h2 in test_iter]
        np.savez_compressed(self.make_path('test.npz'),
                            data=np.asarray(test_feat))

        with self.output().open('w'):
            pass
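
Example 15 hashes every question and links each pair's hashes in one graph spanning train and test; the per-pair features come from self.rehash, which is not shown. Given the column names in Example 16 (l1_neighbours, graph_neighbours, l2_neighbours), a plausible ingredient is neighbour counts, e.g. the number of hashes paired with both questions; a sketch under that assumption:

    import networkx as nx

    g = nx.Graph()
    g.add_edges_from([('a', 'b'), ('a', 'c'), ('b', 'c'), ('c', 'd')])

    def common_neighbours(h1, h2):
        # hashes that have been paired with both h1 and h2
        return len(set(g[h1]) & set(g[h2]))

    print(common_neighbours('a', 'b'))   # 1: only 'c' neighbours both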
Example 16
 def _load(self, as_df):
     res = np.load(self.make_path('train.npz'))['data']
     if as_df:
         res = pandas.DataFrame(
             res,
             columns=['l1_neighbours', 'graph_neighbours', 'l2_neighbours'])
     folds = rf_dataset.Dataset().load_dataset_folds()
     return res, folds
Example 17
 def _load(self, as_df):
     folds = rf_dataset.Dataset().load_dataset_folds()
     feat = kq.core.fillna(
         np.load(self.make_path('train.npz'))['data'],
         9999).clip(-10000, 10000)
     if as_df:
         return pandas.DataFrame(feat, columns=self.columns()), folds
     else:
         return feat, folds
Example 18
    def run(self):
        self.tokenzier = treebank.TreebankWordTokenizer()
        self.kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)

        train_data = rf_dataset.Dataset().load_all(
            'train', as_df=True)[['question1_clean', 'question2_clean']]
        test_data = rf_dataset.Dataset().load_all(
            'test', as_df=True)[['question1_clean', 'question2_clean']]

        all_data = pandas.concat([train_data, test_data], 0)

        distances = list(
            tqdm(multiprocessing.Pool().imap(self.vectorize,
                                             zip(all_data['question1_clean'],
                                                 all_data['question2_clean']),
                                             chunksize=50_000),
                 total=all_data.shape[0],
                 desc='vectorizing the words'))
Example 19
    def requires(self):
        yield rf_dataset.Dataset()
        xs = []
        for fold in range(fold_max):
            for cls in self.classifiers(fold):
                xs.append(cls)

        for v in sorted(xs, key=lambda c: c.__class__.__name__):
            yield v
Example 20
    def run(self):
        self.output().makedirs()
        kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
        train_dataset = rf_dataset.Dataset().load_all('train', as_df=True)
        test_dataset = rf_dataset.Dataset().load_all('test', as_df=True)
        self.tokenzier = treebank.TreebankWordTokenizer()

        all_words = pandas.concat([
            train_dataset.question1_clean.str.lower(),
            train_dataset.question2_clean.str.lower(),
            test_dataset.question1_clean.str.lower(),
            test_dataset.question2_clean.str.lower(),
        ])

        tokenizer = Tokenizer(num_words=250_000)
        tokenizer.fit_on_texts(all_words)
        all_seqs = tokenizer.texts_to_sequences(all_words)
        all_padded_seqs = pad_sequences(all_seqs, 32)

        train_seqs = all_padded_seqs[:train_dataset.shape[0] * 2]
        test_seqs = all_padded_seqs[train_dataset.shape[0] * 2:]
        nose.tools.assert_equal(test_seqs.shape[0], test_dataset.shape[0] * 2)

        train_q1 = train_seqs[:train_dataset.shape[0]]
        train_q2 = train_seqs[train_dataset.shape[0]:]
        test_q1 = test_seqs[:test_dataset.shape[0]]
        test_q2 = test_seqs[test_dataset.shape[0]:]

        np.savez_compressed(self.make_path('train.npz'),
                            q1=train_q1,
                            q2=train_q2)
        np.savez_compressed(self.make_path('test.npz'), q1=test_q1, q2=test_q2)

        embedding_matrix = np.zeros((250_000, 300))
        for word, ix in tokenizer.word_index.items():
            if word in kvecs:
                embedding_matrix[ix, :] = kvecs[word]
        np.savez_compressed(self.make_path('embedding.npz'),
                            data=embedding_matrix)

        with self.output().open('w'):
            pass
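
Example 20 fits a single Keras Tokenizer over all questions, pads every sequence to 32 tokens, and builds a 250,000 x 300 embedding matrix whose row i holds the word2vec vector for the tokenizer's word index i. The core steps in miniature, with random vectors standing in for the word2vec file:

    import numpy as np
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    texts = ['what is a cat', 'what is a dog']
    tokenizer = Tokenizer(num_words=50)
    tokenizer.fit_on_texts(texts)
    seqs = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=8)

    # Row 0 stays zero and doubles as the padding vector.
    embedding = np.zeros((50, 300))
    for word, ix in tokenizer.word_index.items():
        embedding[ix, :] = np.random.randn(300)   # stand-in for kvecs[word]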
Example 21
    def _load(self, as_df):
        decomps = [r._load(as_df)[0] for r in self.requires()]
        if as_df:
            for d, r in zip(decomps, self.requires()):
                self.wangjangle_columns(d, r)
            decomps = pandas.concat(decomps, 1)
        else:
            decomps = np.concatenate(decomps, 1)
        folds = rf_dataset.Dataset().load_dataset_folds()

        return decomps, folds
Example 22
    def run(self):
        self.output().makedirs()
        train = rf_dataset.Dataset().load_all('train', as_df=True)
        test = rf_dataset.Dataset().load_all('test', as_df=True)

        true_qid = {q: id for q, id in zip(train.question1, train.qid1)}
        true_qid.update({q: id for q, id in zip(train.question2, train.qid2)})
        train_max_id = max(true_qid.values())
        step_size = 507000 / 2345806
        current_id = train_max_id + step_size

        for q1, q2 in tqdm(zip(test.question1, test.question2),
                           total=test.shape[0]):
            if q1 not in true_qid:
                true_qid[q1] = current_id
                current_id += step_size
            if q2 not in true_qid:
                true_qid[q2] = current_id
                current_id += step_size

        train_feature = [
            min(true_qid[q1], true_qid[q2])
            for q1, q2 in tqdm(zip(train.question1, train.question2),
                               total=train.shape[0])
        ]

        test_feature = [
            min(true_qid[q1], true_qid[q2])
            for q1, q2 in tqdm(zip(test.question1, test.question2),
                               total=test.shape[0])
        ]

        np.savez_compressed(self.make_path('train.npz'),
                            data=np.asarray(train_feature)[:, None])
        np.savez_compressed(self.make_path('test.npz'),
                            data=np.asarray(test_feature)[:, None])
        with self.output().open('w'):
            pass
Example 23
    def _load(self, as_df):
        res = kq.core.fillna(
            np.load(self.make_path('train.npz'))['data'],
            9999).clip(-10000, 10000)
        if not self.include_space:
            res = res[:, :len(distances)]
            cols = self.colnames()[:len(distances)]
        else:
            cols = self.colnames()

        if as_df:
            res = pandas.DataFrame(res, columns=cols)
        folds = rf_dataset.Dataset().load_dataset_folds()
        return res, folds
Example 24
    def run(self):
        self.output().makedirs()
        m1, m2 = rf_word_count_features.WordCountMatrix().load_raw_vectors(
            'train')
        m1 = m1 > 0
        m2 = m2 > 0
        X = m1.multiply(m2)
        folds = (rf_dataset.Dataset().load_dataset_folds() +
                 self.fold) % fold_max
        train_X = X[folds != 0]
        train_y = rf_dataset.Dataset().load('train',
                                            fold=self.fold,
                                            as_df=True).is_duplicate.values
        cls = naive_bayes.BernoulliNB()
        cls.fit(train_X, train_y)

        valid_X = X[folds == 0]
        valid_y = rf_dataset.Dataset().load('valid',
                                            fold=self.fold,
                                            as_df=True).is_duplicate.values
        valid_pred = cls.predict_proba(valid_X)[:, 1]

        score = score_data(valid_y, valid_pred)

        print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

        t1, t2 = rf_word_count_features.WordCountMatrix().load_raw_vectors(
            'test')
        t1 = t1 > 0
        t2 = t2 > 0
        test_X = t1.multiply(t2)
        test_pred = cls.predict_proba(test_X)[:, 1]
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.make_path('done.npz'))
        return score
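
Example 24 binarizes the two word-count matrices and feeds their elementwise product, a "this word occurs in both questions" indicator matrix, to Bernoulli naive Bayes. The same idea end to end on toy data:

    import numpy as np
    import scipy.sparse as sp
    from sklearn.naive_bayes import BernoulliNB

    q1 = sp.csr_matrix(np.array([[2, 0, 1], [1, 1, 0], [0, 0, 3]])) > 0
    q2 = sp.csr_matrix(np.array([[1, 0, 1], [0, 1, 0], [1, 0, 0]])) > 0
    X = q1.multiply(q2)              # 1 where a word appears in both questions
    y = np.array([1, 1, 0])

    cls = BernoulliNB()
    cls.fit(X, y)
    print(cls.predict_proba(X)[:, 1])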
Example 25
    def _load(self, as_df):
        assert not as_df, 'Pandas mode not supported'
        Xs = []
        n_vs = None
        for r in self.requires():
            x = r.load_all('train', False)
            nose.tools.assert_equal(len(x.shape), 2, repr(r))
            if isinstance(x, np.ndarray):
                x[np.isnan(x)] = 999
                x = np.clip(x, -1000, 1000)
            if n_vs is None:
                n_vs = x.shape[0]
            else:
                nose.tools.assert_equal(n_vs, x.shape[0], repr(r))
            Xs.append(x)

        folds = rf_dataset.Dataset().load_dataset_folds()
        res = sp.hstack(Xs)
        return res.tocsr(), folds
Example 26
    def _load(self, as_df):
        Xs = []
        n_vs = None
        for r in self.requires():
            x = r.load_all('train', as_df)
            nose.tools.assert_equal(len(x.shape), 2, repr(r))
            if n_vs is None:
                n_vs = x.shape[0]
            else:
                nose.tools.assert_equal(n_vs, x.shape[0], repr(r))
            if as_df:
                nose.tools.assert_is_instance(x, pandas.DataFrame, repr(r))
                x.columns = [r.__class__.__name__ + '_' + c for c in x.columns]
                x = x.fillna(999).clip(-1000, 1000)
            Xs.append(x)

        folds = rf_dataset.Dataset().load_dataset_folds()
        if as_df:
            res = pandas.concat(Xs, 1).reset_index(drop=True)
            return res, folds
        else:
            res = np.concatenate(Xs, 1)
            res[np.isnan(res)] = 999
            return np.clip(res, -1000, 1000), folds
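
Example 26's wide frame is built by prefixing each sub-frame's columns with the producing task's class name before concatenation, which keeps column names unique when several tasks emit a generic name. In isolation (the task names here are hypothetical):

    import pandas as pd

    a = pd.DataFrame({'score': [1, 2]})
    b = pd.DataFrame({'score': [3, 4]})
    a.columns = ['TaskA_' + c for c in a.columns]   # hypothetical task names
    b.columns = ['TaskB_' + c for c in b.columns]
    print(pd.concat([a, b], axis=1))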
Example 27
 def requires(self):
     yield rf_dataset.Dataset()
     yield rf_word_count_features.WordCountMatrix()
Example 28
 def requires(self):
     yield rf_dataset.Dataset()
     yield rf_small_features.SmallFeaturesTask()
Example 29
 def requires(self):
     yield rf_seq_data.RFWordSequenceDataset()
     yield rf_dataset.Dataset()
Example 30
 def requires(self):
     yield rf_dataset.Dataset()
     yield rf_ab.ABDataset()