Example #1
    def train(self):
        self.output().makedirs()
        preproc = pipeline.Pipeline([
            ('norm', preprocessing.Normalizer()),
            ('poly', preprocessing.PolynomialFeatures(self.npoly.get()))
        ])

        X = abhishek_feats.AbhishekFeatures().load('train',
                                                   self.fold,
                                                   as_df=True)
        X = preproc.fit_transform(X)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = linear_model.LogisticRegression(C=self.C.get(),
                                              solver='sag',
                                              class_weight=core.dictweights)
        cls.fit(X, y)

        print('Validating')
        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        validX = preproc.transform(validX)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]

        score = core.score_data(y, y_pred)
        np.save(
            'cache/abhishek/logit/{:f}/{:d}/valid.npy'.format(
                self.C.get(), self.fold), y_pred)

        return score, cls, preproc
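
Example #1 fits its preprocessing pipeline on the training fold only and reuses the fitted transforms on the validation fold, which keeps validation statistics from leaking into the features. Below is a minimal, self-contained sketch of that pattern on synthetic data, using only scikit-learn and NumPy (the surrounding task plumbing is omitted):

    import numpy as np
    from sklearn import linear_model, pipeline, preprocessing

    rng = np.random.RandomState(0)
    X_train, y_train = rng.randn(100, 5), rng.randint(0, 2, 100)
    X_valid = rng.randn(20, 5)

    preproc = pipeline.Pipeline([
        ('norm', preprocessing.Normalizer()),
        ('poly', preprocessing.PolynomialFeatures(2)),
    ])
    # Fit the transforms on the training fold only...
    X_train = preproc.fit_transform(X_train)
    # ...and apply the already-fitted transforms to validation data.
    X_valid = preproc.transform(X_valid)

    cls = linear_model.LogisticRegression(solver='sag')
    cls.fit(X_train, y_train)
    p_valid = cls.predict_proba(X_valid)[:, 1]
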
Example #2
    def run(self):
        self.output().makedirs()
        data = self.xdataset()

        X = data.load('train', self.fold)
        y = rf_dataset.Dataset().load('train', self.fold,
                                      as_df=True).is_duplicate

        cls = self.make_cls()
        print('Training classifier {:s} on data of size: {}'.format(
            repr(cls), X.shape))
        cls.fit(X, y)

        self.post_fit(cls)

        X_val = data.load('valid', self.fold)
        y_val = rf_dataset.Dataset().load('valid', self.fold,
                                          as_df=True).is_duplicate

        y_pred = cls.predict_proba(X_val)[:, 1]
        np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
        score = core.score_data(y_val, y_pred)

        del X, y, X_val, y_val
        X_test = data.load('test', None)
        y_test_pred = cls.predict_proba(X_test)[:, 1]
        np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

        print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))

        with self.output().open('w') as f:
            f.write('Score: {:s}: {:f}'.format(repr(self), score))
        return score
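
The `predict_proba(X)[:, 1]` idiom recurs throughout these examples: scikit-learn classifiers return one probability column per class, ordered as in `cls.classes_`, so for a 0/1 target the second column is the probability of the positive class. A tiny illustration:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    cls = LogisticRegression().fit(X, y)

    proba = cls.predict_proba(X)      # shape (n_samples, 2)
    p_pos = proba[:, 1]               # P(y == 1), since cls.classes_ == [0, 1]
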
Example #3
    def run(self):
        self.output().makedirs()
        wc_data = rf_word_count_features.WordCountMatrix()

        X = wc_data.load('train', self.fold).astype(np.float32)
        y = rf_dataset.Dataset().load('train', self.fold,
                                      as_df=True).is_duplicate

        cls = self.make_cls()
        cls.fit(X, y)

        X_val = wc_data.load('valid', self.fold).astype(np.float32)
        y_val = rf_dataset.Dataset().load('valid', self.fold,
                                          as_df=True).is_duplicate

        y_pred = cls.predict_proba(X_val)[:, 1]
        np.savez_compressed(self.make_path('valid.npz'), data=y_pred)
        score = core.score_data(y_val, y_pred)

        del X, y, X_val, y_val
        X_test = wc_data.load('test', None).astype(np.float32)
        y_test_pred = cls.predict_proba(X_test)[:, 1]
        np.savez_compressed(self.make_path('test.npz'), data=y_test_pred)

        print(colors.green | 'Score: {:s}: {:f}'.format(repr(self), score))

        with self.output().open('w') as f:
            f.write('Score: {:s}: {:f}'.format(repr(self), score))
        return score
Example #4
    def run(self):
        self.output().makedirs()
        X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
        y_train = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate
        X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
        y_valid = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat(
            (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
        y_train = np.array(
            [0] * neg_train.shape[0] +
            [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
            [0] * neg_train.shape[0])
        del pos_train, neg_train

        #pos_valid = X_valid[y_valid == 1]
        #neg_valid = X_valid[y_valid == 0]
        #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
        #y_valid = np.array(
        #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
        #del pos_valid, neg_valid
        X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
            X_train, y_train, test_size=0.05)

        d_train = xgb.DMatrix(X_tr_tr, label=y_tr_tr)
        d_es = xgb.DMatrix(X_tr_es, label=y_tr_es)
        d_valid = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(d_train, 'train'), (d_es, 'd_es')]

        params = {}
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        params['eta'] = 0.02
        params['max_depth'] = 7
        params['subsample'] = 0.6
        params['base_score'] = 0.2

        #bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
        bst = xgb.train(params,
                        d_train,
                        1000,
                        watchlist,
                        early_stopping_rounds=50,
                        verbose_eval=50)
        p_valid = bst.predict(d_valid)
        print(score_data(y_valid, p_valid, weighted=False))
        X_test = RF_LeakyXGB_Dataset().load('test', None, as_df=True)
        d_test = xgb.DMatrix(X_test)
        p_test = bst.predict(d_test)

        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=p_valid,
                            test=p_test)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
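
The concat near the top of Example #4 is a class-rebalancing trick: every negative row appears twice and the last 20% of positive rows are dropped, shifting the duplicate base rate downward before training. The same resampling in isolation, on synthetic data:

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.randn(1000, 3), columns=['a', 'b', 'c'])
    y = pd.Series(rng.randint(0, 2, 1000))

    pos = X[y == 1]
    neg = X[y == 0]
    keep = int(0.8 * len(pos))                       # drop the last 20% of positives
    X_res = pd.concat((neg, pos.iloc[:keep], neg))   # negatives included twice
    y_res = np.array([0] * len(neg) + [1] * keep + [0] * len(neg))
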
Example #5
    def run(self):
        self.output().makedirs()
        batch_size = 128
        normalizer = preprocessing.StandardScaler()

        train_q1, train_q2, train_other = rf_seq_data.RFWordSequenceDataset().load('train', fold=self.fold)
        train_other = normalizer.fit_transform(train_other)
        train_labels = rf_dataset.Dataset().load('train', fold=self.fold, as_df=True).is_duplicate
        print(train_q1.shape, train_q2.shape, train_other.shape)
        embedding = rf_seq_data.RFWordSequenceDataset().load_embedding_mat()

        np.random.seed(self.fold)
        model = self.model(embedding, train_q2.shape[1], train_other.shape[1])

        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
        slow_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3)
        model_path = self.make_path('model.h5')
        model_checkpointer = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)

        if self.include_distances():
            train_data = [train_q1, train_q2, train_other]
        else:
            train_data = [train_q1, train_q2]

        model.fit(
            train_data, train_labels,
            validation_split=0.05,
            epochs=20, batch_size=batch_size, shuffle=True,
            class_weight=dictweights,
            callbacks=[early_stopping, slow_plateau, model_checkpointer])
        model.load_weights(model_path)

        valid_q1, valid_q2, valid_other = rf_seq_data.RFWordSequenceDataset().load('valid', fold=self.fold)
        valid_other = normalizer.transform(valid_other)
        valid_labels = rf_dataset.Dataset().load('valid', fold=self.fold, as_df=True).is_duplicate
        if self.include_distances():
            valid_data = [valid_q1, valid_q2, valid_other]
        else:
            valid_data = [valid_q1, valid_q2]

        valid_preds = model.predict(valid_data, verbose=1, batch_size=batch_size)
        valid_preds = np.clip(valid_preds, 1e-7, 1 - 1e-7)

        score = score_data(valid_labels.values, valid_preds)
        print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

        test_q1, test_q2, test_other = rf_seq_data.RFWordSequenceDataset().load('test', None)
        test_other = normalizer.transform(test_other)
        if self.include_distances():
            test_data = [test_q1, test_q2, test_other]
        else:
            test_data = [test_q1, test_q2]

        test_preds = model.predict(test_data, verbose=1, batch_size=batch_size)

        np.savez_compressed(self.make_path('done_tmp.npz'), valid=valid_preds, test=test_preds)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
        return score
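
Clipping the network's outputs to [1e-7, 1 - 1e-7] before scoring guards against infinite log loss when a prediction lands at exactly 0 or 1 on the wrong side. A numeric sketch (the log-loss helper here is a stand-in, not the project's `score_data`):

    import numpy as np

    def log_loss(y_true, p):
        return -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))

    y_true = np.array([1.0, 0.0])
    p_raw = np.array([0.0, 1.0])        # exactly wrong: raw loss is infinite

    p_safe = np.clip(p_raw, 1e-7, 1 - 1e-7)
    print(log_loss(y_true, p_safe))     # large but finite (~16.1)
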
Example #6
    def valid(self):
        pred = self.predict('valid')
        print(colors.green | "prediction sample...")
        print(colors.green | str(pred.head()))
        y = dataset.Dataset().load()[2]
        loss = core.score_data(y.is_duplicate, pred)
        print(colors.green | "Performance: " + str(loss))

        return pred
Example #7
    def run(self):
        batch_size = 128
        self.output().makedirs()
        train_data, train_labels = self.load_dataset('train')
        valid_data, valid_labels = self.load_dataset('valid')
        valid_weights = core.weights[valid_labels]
        class_weights = dict(enumerate(core.weights))
        embedding = keras_kaggle_data.KaggleDataset().load_embedding()

        model = self.model(
            embedding,
            keras_kaggle_data.KaggleDataset().MAX_SEQUENCE_LENGTH,
            train_data[2].shape[1])
        model.summary()

        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=6)
        slow_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                         patience=3)
        model_path = 'cache/%s/model.h5' % self.base_name
        model_checkpointer = keras.callbacks.ModelCheckpoint(
            model_path, save_best_only=True, save_weights_only=True)

        train_data = [
            np.vstack([train_data[0], train_data[1]]),
            np.vstack([train_data[1], train_data[0]]),
            np.vstack([train_data[2], train_data[2]])
        ]
        train_labels = np.concatenate([train_labels, train_labels])

        model.fit(train_data,
                  train_labels,
                  validation_data=(valid_data, valid_labels, valid_weights),
                  epochs=20,
                  batch_size=batch_size,
                  shuffle=True,
                  class_weight=class_weights,
                  callbacks=[early_stopping, slow_plateau, model_checkpointer])
        model.load_weights(model_path)

        valid_preds = model.predict(valid_data, batch_size=batch_size)
        print(colors.green | ('Valid loss: %f ' %
                              core.score_data(valid_labels, valid_preds)))
        del valid_labels, valid_data
        del train_labels, train_data

        merge_data, merge_labels = self.load_dataset('merge')
        merge_preds = model.predict(merge_data, batch_size=batch_size)

        np.save('cache/%s/merge.npy' % self.base_name, merge_preds)

        test_data, _ = self.load_dataset('test')
        test_preds = model.predict(test_data, batch_size=batch_size, verbose=1)

        np.save('cache/%s/classifications.npy' % self.base_name, test_preds)
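
The vstack block in Example #7 doubles the training set by presenting each question pair in both orders, (q1, q2) and (q2, q1), while side features and labels are simply repeated; duplicate detection should be symmetric in the two questions. The same move on toy arrays:

    import numpy as np

    q1 = np.array([[1, 2], [3, 4]])     # encoded question 1, per pair
    q2 = np.array([[5, 6], [7, 8]])     # encoded question 2, per pair
    other = np.array([[0.1], [0.2]])    # order-independent side features
    labels = np.array([1, 0])

    train_data = [
        np.vstack([q1, q2]),            # original order, then swapped
        np.vstack([q2, q1]),
        np.vstack([other, other]),      # side features just repeated
    ]
    train_labels = np.concatenate([labels, labels])
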
Example #8
    def run(self):
        self.output().makedirs()
        X_train = RF_LeakyXGB_Dataset().load('train', self.fold, as_df=True)
        y_train = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate
        X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold, as_df=True)
        y_valid = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat(
            (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
        y_train = np.array(
            [0] * neg_train.shape[0] +
            [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
            [0] * neg_train.shape[0])
        del pos_train, neg_train

        #pos_valid = X_valid[y_valid == 1]
        #neg_valid = X_valid[y_valid == 0]
        #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
        #y_valid = np.array(
        #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
        #del pos_valid, neg_valid

        cls = lightgbm.sklearn.LGBMClassifier(n_estimators=2048,
                                              num_leaves=1024,
                                              learning_rate=0.03,
                                              subsample=0.75)
        X_tr_tr, X_tr_es, y_tr_tr, y_tr_es = model_selection.train_test_split(
            X_train, y_train, test_size=0.05)
        cls.fit(X_tr_tr,
                y_tr_tr,
                eval_set=[(X_tr_es, y_tr_es)],
                early_stopping_rounds=50)
        valid_pred = cls.predict_proba(X_valid)[:, 1]
        print(colors.green | '{:s} == {:f}'.format(
            repr(self), score_data(y_valid, valid_pred, weighted=False)))
        print(colors.yellow | str(
            pandas.Series(cls.feature_importances_,
                          index=X_train.columns).sort_values()))

        X_test = RF_LeakyXGB_Dataset().load('test', None,
                                            as_df=True).fillna(-999).clip(
                                                -1000, 1000)
        test_pred = cls.predict_proba(X_test)[:, 1]

        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
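
Several tasks here write predictions to a `done_tmp.npz` path and then `os.rename` it onto the final output path. Because rename is atomic on a single filesystem, a half-written file can never be mistaken for a finished output by whatever checks for the target's existence. A minimal sketch with hypothetical paths:

    import os
    import numpy as np

    tmp_path = 'out/preds_tmp.npz'      # hypothetical paths
    final_path = 'out/preds.npz'
    os.makedirs('out', exist_ok=True)

    preds = np.random.rand(10)
    np.savez_compressed(tmp_path, data=preds)
    # Atomic on the same filesystem: readers see either nothing or the
    # complete file at final_path, never a partial write.
    os.rename(tmp_path, final_path)
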
Example #9
    def score(self):
        self.output().makedirs()
        train_Xs = []
        train_ys = []
        for fold in range(1, fold_max):
            y = rf_dataset.Dataset().load('valid', fold, as_df=True).is_duplicate.values.squeeze()
            x = self.fold_x(fold, 'valid')
            nose.tools.assert_equal(x.shape[0], y.shape[0])
            train_Xs.append(x)
            train_ys.append(y)
        train_X_df = pandas.concat(train_Xs, axis=0)
        sns.clustermap(train_X_df.corr())
        plt.yticks(rotation=90)
        plt.savefig('./corr.png')
        plt.close()
        train_X = train_X_df.values
        train_y = np.concatenate(train_ys, axis=0).squeeze()

        cls = AutoExitingGBMLike(XGBClassifier(
            n_estimators=1024,
            learning_rate=0.05,
            max_depth=8,
            gamma=1,
            subsample=0.5
        ), additional_fit_args={'verbose': False})

        #cls = AutoExitingGBMLike(lightgbm.sklearn.LGBMClassifier(
        #    n_estimators=1024,
        #    learning_rate=0.01,
        #    subsample=0.5,
        #    num_leaves=2048
        #), additional_fit_args={'verbose': False})
        #cls = pipeline.Pipeline([
        #    ('poly', preprocessing.PolynomialFeatures(2)),
        #    ('anova', feature_selection.SelectPercentile(feature_selection.f_classif)),
        #    ('lin', linear_model.LogisticRegression(C=1, class_weight=core.dictweights))
        #])
        #cls = keras.wrappers.scikit_learn.KerasClassifier(build_fn=self.simple_nn)

        cls.fit(train_X, train_y)
        if hasattr(cls, 'feature_importances_'):
            ds_names = [repr(d) for d in self.classifiers(0)]
            print(colors.yellow | str(pandas.Series(cls.feature_importances_, index=ds_names).sort_values()))

        test_x = self.fold_x(0, 'valid').values
        test_y = rf_dataset.Dataset().load('valid', 0, as_df=True).is_duplicate.values.squeeze()

        score = core.score_data(test_y, cls.predict_proba(test_x)[:, 1])
        return score, cls
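
The loop in Example #9 assembles the level-2 training set for stacking: each base model's out-of-fold validation predictions (folds 1 through fold_max - 1) become feature columns, with fold 0 held back to score the meta-model. A compact sketch of that assembly, with random stand-ins for the per-fold predictions:

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    fold_max = 4
    model_names = ['logit', 'xgb', 'lgbm']        # hypothetical base models

    train_Xs, train_ys = [], []
    for fold in range(1, fold_max):               # fold 0 stays held out
        n = 50
        # stand-ins for each model's validation predictions on this fold
        x = pd.DataFrame({m: rng.rand(n) for m in model_names})
        y = rng.randint(0, 2, n)
        assert x.shape[0] == y.shape[0]
        train_Xs.append(x)
        train_ys.append(y)

    train_X = pd.concat(train_Xs, axis=0).values  # level-2 features
    train_y = np.concatenate(train_ys)            # level-2 labels
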
Example #10
    def run(self):
        self.output().makedirs()
        X_train = RF_LeakyXGB_Dataset().load('train', self.fold,
                                             as_df=True).fillna(-999).clip(
                                                 -1000, 1000)
        y_train = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate
        X_valid = RF_LeakyXGB_Dataset().load('valid', self.fold,
                                             as_df=True).fillna(-999).clip(
                                                 -1000, 1000)
        y_valid = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat(
            (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
        y_train = np.array(
            [0] * neg_train.shape[0] +
            [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
            [0] * neg_train.shape[0])
        del pos_train, neg_train

        #pos_valid = X_valid[y_valid == 1]
        #neg_valid = X_valid[y_valid == 0]
        #X_valid = pd.concat((neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
        #y_valid = np.array(
        #    [0] * neg_valid.shape[0] + [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] + [0] * neg_valid.shape[0])
        #del pos_valid, neg_valid

        cls = ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=1024)
        cls.fit(X_train.values, y_train.values)

        valid_pred = cls.predict_proba(X_valid)[:, 1]
        print(colors.green | '{:s} == {:f}'.format(
            repr(self), score_data(y_valid, valid_pred)))
        print(colors.yellow | str(
            pandas.Series(cls.feature_importances_,
                          index=X_train.columns).sort_values()))
        X_test = RF_LeakyXGB_Dataset().load('test', None,
                                            as_df=True).fillna(-999).clip(
                                                -1000, 1000)
        test_pred = cls.predict_proba(X_test.values)[:, 1]
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
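
Building `pandas.Series(cls.feature_importances_, index=X_train.columns).sort_values()` is a one-line way to rank features after fitting a tree ensemble, with the most important features printed last. In isolation:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import ExtraTreesClassifier

    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.randn(200, 3), columns=['f0', 'f1', 'f2'])
    y = (X['f2'] > 0).astype(int)       # only f2 carries signal

    cls = ExtraTreesClassifier(n_estimators=50, random_state=0).fit(X, y)
    # Sorted Series: least important features first, f2 at the bottom.
    print(pd.Series(cls.feature_importances_, index=X.columns).sort_values())
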
Example #11
    def testicles(self):
        X = self._load_named('train')
        y = dataset.Dataset().load_named('train').is_duplicate.values

        cls = lightgbm.LGBMClassifier(num_leaves=512, n_estimators=500)
        cls.fit(X.values, y)
        X_test = self._load_named('valid').values
        y_test = dataset.Dataset().load_named('valid').is_duplicate.values
        y_pred = cls.predict_proba(X_test)[:, 1]

        scoring = core.score_data(y_test, y_pred)
        importances = pandas.Series(cls.feature_importances_, index=X.columns)
        print(scoring)
        print(importances)
        with self.output().open('w') as f:
            f.write("Score: {:f}\n".format(scoring))
            f.write(str(importances))
Example #12
    def run(self):
        self.output().makedirs()
        X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = xgbsk.XGBClassifier(max_depth=self.max_depth.get(),
                                  learning_rate=self.eta.get(),
                                  n_estimators=self.n_est.get())
        X_tr, X_va, y_tr, y_va = model_selection.train_test_split(
            X, y, test_size=0.05)
        cls.fit(X_tr,
                y_tr,
                sample_weight=core.weight_from(y_tr),
                eval_set=[(X_va, y_va)],
                early_stopping_rounds=10)

        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]
        score = core.score_data(y, y_pred)
        scorestr = "{:s} = {:f}".format(repr(self), score)
        print(colors.green | colors.bold | scorestr)

        valid_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/valid.npy'.format(
            self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)

        np.save(valid_fn, y_pred)

        testX = abhishek_feats.AbhishekFeatures().load('test', None)
        pred = cls.predict_proba(testX)[:, 1]

        test_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/test.npy'.format(
            self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)
        np.save(test_fn, pred)

        with self.output().open('w') as f:
            cols = abhishek_feats.AbhishekFeatures().load('valid',
                                                          self.fold,
                                                          as_df=True).columns
            v = pandas.Series(cls.feature_importances_,
                              index=cols).sort_values()
            v.to_csv(f)
            f.write("\n\n")
            f.write(scorestr)
            f.write("\n")
        return score
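
`sample_weight=core.weight_from(y_tr)` supplies per-row weights derived from the labels, the usual way to train toward a class prior different from the training set's. `weight_from` is project code not shown here; a plausible stand-in that maps a class-weight table onto labels (the weight values below are illustrative only):

    import numpy as np

    # Hypothetical stand-in for core.weight_from: one weight per sample,
    # looked up by label. These weight values are illustrative, not the
    # project's actual constants.
    CLASS_WEIGHTS = {0: 1.309, 1: 0.472}

    def weight_from(y):
        return np.array([CLASS_WEIGHTS[label] for label in y])

    y_tr = np.array([0, 1, 1, 0])
    print(weight_from(y_tr))            # [1.309 0.472 0.472 1.309]
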
Example #13
    def score(self):
        self.output().makedirs()
        poly = preprocessing.PolynomialFeatures(self.npoly.get())
        train_Xs = []
        train_ys = []
        for fold in range(1, 9):
            y = xval_dataset.BaseDataset().load('valid', fold).squeeze()
            x = self.fold_x(fold, 'valid')
            nose.tools.assert_equal(x.shape[0], y.shape[0])
            train_Xs.append(x)
            train_ys.append(y)
        train_X = poly.fit_transform(np.concatenate(train_Xs, 0))
        train_y = np.concatenate(train_ys, 0).squeeze()
        cls = linear_model.LogisticRegression(class_weight=core.dictweights)
        cls.fit(train_X, train_y)

        test_x = poly.transform(self.fold_x(0, 'valid'))
        test_y = xval_dataset.BaseDataset().load('valid', 0).squeeze()

        score = core.score_data(test_y, cls.predict_proba(test_x)[:, 1])
        return score, poly, cls
Example #14
    def run(self):
        self.output().makedirs()

        X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = lgbsklearn.LGBMClassifier(num_leaves=1024,
                                        n_estimators=1024,
                                        is_unbalance=True)
        X_tr, X_va, y_tr, y_va = model_selection.train_test_split(
            X, y, test_size=0.05)
        cls.fit(X_tr,
                y_tr,
                sample_weight=core.weight_from(y_tr),
                eval_set=[(X_va, y_va)],
                early_stopping_rounds=10)

        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]

        scorestr = "{:s} = {:f}".format(repr(self), core.score_data(y, y_pred))
        print(colors.green | colors.bold | scorestr)

        np.save('cache/abhishek/lgbm/{:d}/valid.npy'.format(self.fold), y_pred)

        testX = abhishek_feats.AbhishekFeatures().load('test', None)
        pred = cls.predict_proba(testX)[:, 1]
        np.save('cache/abhishek/lgbm/{:d}/test.npy'.format(self.fold), pred)

        with self.output().open('w') as f:
            cols = abhishek_feats.AbhishekFeatures().load('valid',
                                                          self.fold,
                                                          as_df=True).columns
            v = pandas.Series(cls.feature_importances_,
                              index=cols).sort_values()
            v.to_csv(f)
            f.write("\n\n")
            f.write(scorestr)
            f.write("\n")
Example #15
    def run(self):
        self.output().makedirs()
        m1, m2 = rf_word_count_features.WordCountMatrix().load_raw_vectors(
            'train')
        m1 = m1 > 0
        m2 = m2 > 0
        X = m1.multiply(m2)
        folds = (rf_dataset.Dataset().load_dataset_folds() +
                 self.fold) % fold_max
        train_X = X[folds != 0]
        train_y = rf_dataset.Dataset().load('train',
                                            fold=self.fold,
                                            as_df=True).is_duplicate.values
        cls = naive_bayes.BernoulliNB()
        cls.fit(train_X, train_y)

        valid_X = X[folds == 0]
        valid_y = rf_dataset.Dataset().load('valid',
                                            fold=self.fold,
                                            as_df=True).is_duplicate.values
        valid_pred = cls.predict_proba(valid_X)[:, 1]

        score = score_data(valid_y, valid_pred)

        print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

        t1, t2 = rf_word_count_features.WordCountMatrix().load_raw_vectors(
            'test')
        t1 = t1 > 0
        t2 = t2 > 0
        test_X = t1.multiply(t2)
        test_pred = cls.predict_proba(test_X)[:, 1]
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.make_path('done.npz'))
        return score
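
`m1.multiply(m2)` on the binarized count matrices is an elementwise AND: the resulting sparse matrix marks vocabulary items that occur in both questions of a pair, a natural binary input for BernoulliNB. A small scipy.sparse sketch:

    import numpy as np
    from scipy import sparse
    from sklearn.naive_bayes import BernoulliNB

    # Rows = question pairs, columns = vocabulary; counts per question.
    m1 = sparse.csr_matrix(np.array([[2, 0, 1], [0, 3, 0]]))
    m2 = sparse.csr_matrix(np.array([[1, 1, 0], [0, 1, 0]]))

    b1 = m1 > 0                          # binarize: is the word present at all?
    b2 = m2 > 0
    X = b1.multiply(b2)                  # AND: word present in both questions

    y = np.array([1, 0])
    cls = BernoulliNB().fit(X, y)
    print(cls.predict_proba(X)[:, 1])
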
Example #16
    def run(self):
        self.output().makedirs()

        X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = ensemble.ExtraTreesClassifier(n_estimators=500,
                                            n_jobs=-1,
                                            class_weight=core.dictweights)
        cls.fit(X, y)

        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]
        score = core.score_data(y, y_pred)
        scorestr = "{:s} = {:f}".format(repr(self), score)
        print(colors.green | colors.bold | scorestr)

        np.save('cache/abhishek/xtc/{:d}/valid.npy'.format(self.fold), y_pred)

        testX = abhishek_feats.AbhishekFeatures().load('test', None)
        pred = cls.predict_proba(testX)[:, 1]
        np.save('cache/abhishek/xtc/{:d}/test.npy'.format(self.fold), pred)

        with self.output().open('w') as f:
            cols = abhishek_feats.AbhishekFeatures().load('valid',
                                                          self.fold,
                                                          as_df=True).columns
            v = pandas.Series(cls.feature_importances_,
                              index=cols).sort_values()
            v.to_csv(f)
            f.write("\n")
            f.write("\n")
            f.write(scorestr)
            f.write("\n")

        return score
Example #17
    def run(self):
        self.output().makedirs()
        fold_ord = np.random.permutation(fold_max)

        merge_fold = fold_ord[0]
        test_fold = fold_ord[1]
        stack_folds = fold_ord[2:]

        print(colors.red | 'Fold order: {}/{}/{}'.format(merge_fold, test_fold, stack_folds))

        stack_Xs = [self.fold_x(f, 'valid') for f in stack_folds]
        stack_ys = [self.fold_y(f, 'valid') for f in stack_folds]
        stack_X = pandas.concat(stack_Xs, axis=0)
        stack_y = np.concatenate(stack_ys, axis=0)
        merge_X = self.fold_x(merge_fold, 'valid')
        merge_y = self.fold_y(merge_fold, 'valid')
        test_X = self.fold_x(test_fold, 'valid')
        test_y = self.fold_y(test_fold, 'valid')

        classifiers = list(self.classifiers())
        merge_preds = []
        test_preds = []
        ds_names = [repr(d) for d in self.datasets(0)]
        for cls in classifiers:
            print(colors.blue | colors.bold | "Training {:s}".format(repr(cls)))
            cls.fit(stack_X, stack_y)
            if hasattr(cls, 'feature_importances_'):
                print(colors.yellow | str(pandas.Series(cls.feature_importances_, index=ds_names).sort_values()))
            test_pred = cls.predict_proba(test_X)[:, 1]
            merge_pred = cls.predict_proba(merge_X)[:, 1]
            test_score = score_data(test_y, test_pred)
            print(colors.green | 'Score: {:f}'.format(test_score))

            merge_preds.append(merge_pred)
            test_preds.append(test_pred)

        merge_pred = np.vstack(merge_preds).T
        test_pred = np.vstack(test_preds).T

        #merge_cls = AutoExitingGBMLike(XGBClassifier(
        #    n_estimators=1024,
        #    learning_rate=0.05,
        #    max_depth=4,
        #    subsample=0.5
        #), additional_fit_args={'verbose': False})
        merge_cls = FeatureMean()

        merge_cls.fit(merge_pred, merge_y)

        test_pred = merge_cls.predict_proba(test_pred)[:, 1]
        test_score = score_data(test_y, test_pred)
        print(colors.green | 'Final score: {:f}'.format(test_score))

        fold_preds = []
        for fold in range(fold_max):
            fold_X = self.fold_x(fold, 'test')
            fold_merge_X = np.zeros([fold_X.shape[0], len(classifiers)])
            for ix, cls in enumerate(classifiers):
                fold_merge_X[:, ix] = cls.predict_proba(fold_X)[:, 1]
            fold_preds.append(merge_cls.predict_proba(fold_merge_X)[:, 1])

        predmat = np.vstack(fold_preds).mean(0)

        index = pandas.Index(np.arange(fold_X.shape[0]), name='test_id')
        print(predmat.shape)
        print(index.shape)
        pred = pandas.Series(predmat, index=index, name='is_duplicate').to_frame()
        with gzip.open(self.make_path('stacked_pred.csv.gz.tmp'), 'wt') as f:
            pred.to_csv(f)
        os.rename(self.make_path('stacked_pred.csv.gz.tmp'), self.make_path('stacked_pred.csv.gz'))
        return test_score
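
`FeatureMean` is project code not shown here; from its use above (fitted on the merge predictions, then queried via `predict_proba(...)[:, 1]`) it acts as a mean-blending meta-model over the per-classifier probability columns. A guessed minimal implementation with the same interface:

    import numpy as np

    class FeatureMean(object):
        """Hypothetical reconstruction: averages the per-classifier
        probability columns and exposes a predict_proba interface."""

        def fit(self, X, y=None):
            return self                 # nothing to learn for a plain mean

        def predict_proba(self, X):
            p = np.asarray(X).mean(axis=1)
            return np.column_stack([1 - p, p])

    preds = np.array([[0.2, 0.4], [0.9, 0.7]])   # two models' predictions
    blender = FeatureMean().fit(preds, None)
    print(blender.predict_proba(preds)[:, 1])    # [0.3 0.8]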