Ejemplo n.º 1
0
    def __init__(self):
        """Load the TF-IDF train/validation/test splits and create the stats file.

        Each split is fetched through ``bow_tfidf.load_sparse_dataset`` as a
        ``(features, labels)`` pair and stored on the instance.  The shape and
        the label mean (as a percentage) of the train and validation splits
        are printed; nothing is printed for the test split.  Finally a
        ``common.ModelStatsFile`` instance is stored as ``self.stats_file``.
        """
        self.Xtrain, self.ytrain = bow_tfidf.load_sparse_dataset("train", "tfidf")
        self.Xvalid, self.yvalid = bow_tfidf.load_sparse_dataset("validate", "tfidf")
        self.Xtest, self.ytest = bow_tfidf.load_sparse_dataset("test", "tfidf")

        # print(...) with one argument emits the same text on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        # NOTE(review): the label mean is only a "positive ratio" if the
        # labels are 0/1 -- confirm against the dataset loader.
        print("train set {}, positive ratio: {:.2f}%".format(
            self.Xtrain.shape, self.ytrain.mean() * 100))
        print("validation set {}, positive ratio: {:.2f}%".format(
            self.Xvalid.shape, self.yvalid.mean() * 100))

        self.stats_file = common.ModelStatsFile()
Ejemplo n.º 2
0
    def __init__(self):
        """Read the three TF-IDF splits and set up the model stats file.

        The train, validation and test splits are each obtained from
        ``bow_tfidf.load_sparse_dataset(<split>, 'tfidf')``, which is unpacked
        into a features object and a labels object kept on ``self``.  A one-line
        summary (shape plus label mean in percent) is printed for the train
        and validation splits only.  ``self.stats_file`` is set to a new
        ``common.ModelStatsFile``.
        """
        self.Xtrain, self.ytrain = bow_tfidf.load_sparse_dataset(
            "train", 'tfidf')
        self.Xvalid, self.yvalid = bow_tfidf.load_sparse_dataset(
            "validate", 'tfidf')
        self.Xtest, self.ytest = bow_tfidf.load_sparse_dataset('test', 'tfidf')

        # Build each message first, then print it with the call form, which
        # is valid on both Python 2 and Python 3 (the print statement is not).
        # NOTE(review): "positive ratio" presumes 0/1 labels -- TODO confirm.
        train_summary = "train set {}, positive ratio: {:.2f}%".format(
            self.Xtrain.shape,
            self.ytrain.mean() * 100)
        print(train_summary)
        valid_summary = "validation set {}, positive ratio: {:.2f}%".format(
            self.Xvalid.shape,
            self.yvalid.mean() * 100)
        print(valid_summary)

        self.stats_file = common.ModelStatsFile()
Ejemplo n.º 3
0
import numpy as np
import pandas as pd
import xgboost as xgb
import bow_tfidf
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.cross_validation import StratifiedKFold
import common
# NOTE(review): presumably the random seed for the steps further down; no
# use of it is visible in this part of the script.
seed = 999

# ********************* load the data
# Each split is loaded through bow_tfidf.load_sparse_dataset as a
# (features, labels) pair and wrapped in an xgboost DMatrix.  Progress
# messages use print(...) with a single argument so the script parses on
# Python 3 while printing the same text on Python 2.
Xtrain, ytrain = bow_tfidf.load_sparse_dataset("train", 'tfidf')
print("Train TF-IDF loaded")
train_matrix = xgb.DMatrix(Xtrain, ytrain)

Xvalid, yvalid = bow_tfidf.load_sparse_dataset("validate", 'tfidf')
print("Validation TF-IDF loaded")
valid_matrix = xgb.DMatrix(Xvalid, yvalid)

Xtest, ytest = bow_tfidf.load_sparse_dataset('test', 'tfidf')
print("Test TF-IDF loaded")
# The test matrix is built from the features only; ytest is not attached.
test_matrix = xgb.DMatrix(Xtest)


# ********************* #
def crossval_predict(tag, X, y, params, n_cv=5):

    n_samples = X.shape[0]
    print "totally {} samples, divided into {} folds".format(n_samples, n_cv)

    headers = ["{}_{}".format(tag, t) for t in ["proba", "log_proba"]]
    yvalidates = pd.DataFrame(np.full((n_samples, 2), np.NaN),
Ejemplo n.º 4
0
import numpy as np
import pandas as pd
import xgboost as xgb
import bow_tfidf
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.cross_validation import StratifiedKFold
import common
# NOTE(review): looks like a fixed random seed for later steps; nothing in
# the visible statements reads it -- confirm where it is consumed.
seed = 999

# ********************* load the data
# Train and validation matrices carry their labels; the test matrix is
# created from features alone.  The print calls take one argument, which
# keeps the output unchanged on Python 2 and makes the script valid on
# Python 3 (the print statement form is a SyntaxError there).
Xtrain, ytrain = bow_tfidf.load_sparse_dataset("train", 'tfidf')
print("Train TF-IDF loaded")
train_matrix = xgb.DMatrix(Xtrain, ytrain)

Xvalid, yvalid = bow_tfidf.load_sparse_dataset("validate", 'tfidf')
print("Validation TF-IDF loaded")
valid_matrix = xgb.DMatrix(Xvalid, yvalid)

Xtest, ytest = bow_tfidf.load_sparse_dataset('test', 'tfidf')
print("Test TF-IDF loaded")
test_matrix = xgb.DMatrix(Xtest)

# ********************* #
def crossval_predict(tag, X, y, params, n_cv=5):
    """Prepare the per-sample result table for a cross-validated prediction run.

    NOTE(review): this describes only the setup statements shown below;
    confirm the rest of the contract (use of ``params``, return value)
    against the full function body.

    Args:
        tag: prefix for the two result columns, ``<tag>_proba`` and
            ``<tag>_log_proba``.
        X: feature matrix; ``X.shape[0]`` is taken as the sample count.
        y: labels; must expose ``.index``, which becomes the row index of
            the result table.
        params: not referenced by the setup statements -- presumably model
            parameters; TODO confirm.
        n_cv: number of folds reported in the progress message.
    """
    n_samples = X.shape[0]
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("totally {} samples, divided into {} folds".format(n_samples, n_cv))

    headers = ["{}_{}".format(tag, t) for t in ["proba", "log_proba"]]
    # One row per sample, two columns, pre-filled with NaN.  np.nan is used
    # because the np.NaN alias was removed in NumPy 2.0.
    yvalidates = pd.DataFrame(np.full((n_samples, 2), np.nan),
                              columns=headers, index=y.index)