def __init__(self):
    """Load the TF-IDF train/validate/test splits and open the stats file.

    Each split is read with ``bow_tfidf.load_sparse_dataset(<split>, "tfidf")``
    and stored as an ``X<split>`` / ``y<split>`` attribute pair. For the train
    and validation splits the feature-matrix shape and ``y.mean() * 100``
    (reported as the "positive ratio") are printed; the test split is loaded
    without any report.
    """
    self.Xtrain, self.ytrain = bow_tfidf.load_sparse_dataset("train", "tfidf")
    self.Xvalid, self.yvalid = bow_tfidf.load_sparse_dataset("validate", "tfidf")
    self.Xtest, self.ytest = bow_tfidf.load_sparse_dataset("test", "tfidf")

    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("train set {}, positive ratio: {:.2f}%".format(
        self.Xtrain.shape, self.ytrain.mean() * 100))
    print("validation set {}, positive ratio: {:.2f}%".format(
        self.Xvalid.shape, self.yvalid.mean() * 100))

    self.stats_file = common.ModelStatsFile()
def __init__(self):
    """Read the three TF-IDF datasets and create the model-stats file handle.

    Populates ``Xtrain``/``ytrain``, ``Xvalid``/``yvalid`` and
    ``Xtest``/``ytest`` from ``bow_tfidf.load_sparse_dataset`` and sets
    ``stats_file`` to a new ``common.ModelStatsFile()``. A one-line summary
    (matrix shape and ``y.mean() * 100``, labelled "positive ratio") is
    printed for the train and validation sets only.
    """
    self.Xtrain, self.ytrain = bow_tfidf.load_sparse_dataset(
        "train", 'tfidf')
    self.Xvalid, self.yvalid = bow_tfidf.load_sparse_dataset(
        "validate", 'tfidf')
    self.Xtest, self.ytest = bow_tfidf.load_sparse_dataset('test', 'tfidf')

    # print(<one expression>) is valid and equivalent on Python 2, and is
    # the only form accepted by Python 3.
    train_summary = "train set {}, positive ratio: {:.2f}%".format(
        self.Xtrain.shape, self.ytrain.mean() * 100)
    print(train_summary)
    valid_summary = "validation set {}, positive ratio: {:.2f}%".format(
        self.Xvalid.shape, self.yvalid.mean() * 100)
    print(valid_summary)

    self.stats_file = common.ModelStatsFile()
import numpy as np import pandas as pd import xgboost as xgb import bow_tfidf from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.cross_validation import StratifiedKFold import common seed = 999 # ********************* load the data Xtrain, ytrain = bow_tfidf.load_sparse_dataset("train", 'tfidf') print "Train TF-IDF loaded" train_matrix = xgb.DMatrix(Xtrain, ytrain) Xvalid, yvalid = bow_tfidf.load_sparse_dataset("validate", 'tfidf') print "Validation TF-IDF loaded" valid_matrix = xgb.DMatrix(Xvalid, yvalid) Xtest, ytest = bow_tfidf.load_sparse_dataset('test', 'tfidf') print "Test TF-IDF loaded" test_matrix = xgb.DMatrix(Xtest) # ********************* # def crossval_predict(tag, X, y, params, n_cv=5): n_samples = X.shape[0] print "totally {} samples, divided into {} folds".format(n_samples, n_cv) headers = ["{}_{}".format(tag, t) for t in ["proba", "log_proba"]] yvalidates = pd.DataFrame(np.full((n_samples, 2), np.NaN),
import numpy as np
import pandas as pd
import xgboost as xgb
import bow_tfidf
from sklearn.metrics import accuracy_score, roc_auc_score
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 in
# favour of sklearn.model_selection, whose StratifiedKFold takes different
# constructor arguments. Left unchanged because the call site is not visible
# here -- confirm before migrating.
from sklearn.cross_validation import StratifiedKFold
import common

seed = 999

# ********************* load the data
# Single-argument print(...) is used throughout: it produces the same output
# under the Python 2 print statement and the Python 3 print function.
Xtrain, ytrain = bow_tfidf.load_sparse_dataset("train", 'tfidf')
print("Train TF-IDF loaded")
train_matrix = xgb.DMatrix(Xtrain, ytrain)

Xvalid, yvalid = bow_tfidf.load_sparse_dataset("validate", 'tfidf')
print("Validation TF-IDF loaded")
valid_matrix = xgb.DMatrix(Xvalid, yvalid)

Xtest, ytest = bow_tfidf.load_sparse_dataset('test', 'tfidf')
print("Test TF-IDF loaded")
# The test matrix is built from the features only; ytest is not passed in.
test_matrix = xgb.DMatrix(Xtest)


# ********************* #
def crossval_predict(tag, X, y, params, n_cv=5):
    """Report the fold split and allocate the per-sample result frame.

    Prints the sample count (``X.shape[0]``) together with ``n_cv``, then
    builds a NaN-filled DataFrame with one row per sample, indexed like
    ``y``, and two columns named ``<tag>_proba`` and ``<tag>_log_proba``.

    NOTE(review): the visible source stops after the allocation; ``params``
    and the allocated frame are not used in the part shown, so the rest of
    the function presumably follows -- confirm against the full file.
    """
    n_samples = X.shape[0]
    print("totally {} samples, divided into {} folds".format(n_samples, n_cv))

    headers = ["{}_{}".format(tag, t) for t in ["proba", "log_proba"]]
    # np.nan instead of np.NaN: the NaN alias was removed in NumPy 2.0,
    # while np.nan is available in every NumPy release.
    yvalidates = pd.DataFrame(np.full((n_samples, 2), np.nan),
                              columns=headers, index=y.index)