def main(stack_setting_):
    """
    [rawdata2filterdata Step]
     1. Reading raw datasets
     2. Dropping useless feature columns in the training set
     3. Dropping useless feature columns in the test set
    """
    raw_train_path = os.path.join(Config.get_string('data.path'),
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'),
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])

    print("= Reading raw datasets ...")
    names = ("age, workclass, fnlwgt, education, education-num, marital-status, "
             "occupation, relationship, race, sex, capital-gain, capital-loss, "
             "hours-per-week, native-country, TARGET").split(', ')

    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)  #, index_col=0, sep=','
    #raw_train['TARGET'] = (raw_train['TARGET'].values == ' >50K').astype(np.int32)
    #raw_train = raw_train.apply(lambda x: pd.factorize(x)[0])
    train_path = os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')

    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)  #, index_col=0, sep=','
    #raw_test['TARGET'] = (raw_test['TARGET'].values == ' >50K').astype(np.int32)
    #raw_test = raw_test.apply(lambda x: pd.factorize(x)[0])
    test_path = os.path.join(Config.get_string('data.path'),
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
def write_cv_res_csv(cv_out, cv_csv_out):
    cv_out = os.path.join(Config.get_string('data.path'), 'output', cv_out)
    cv_csv_out = os.path.join(Config.get_string('data.path'), 'output', cv_csv_out)
    param_keys, param_vals, scores = cp.load(open(cv_out, 'rb'))
    assert len(param_vals) == len(scores), \
        'Error: param value list length does not match score list length!'
    assert len(param_keys) == len(param_vals[0]), \
        'Error: param key count and value count do not match!'
    if isinstance(param_vals[0], dict):
        param_keys = param_vals[0].keys()
        param_vals = [param.values() for param in param_vals]
    f = open(cv_csv_out, 'w')
    f.write('idx,')
    for key in param_keys:
        f.write('{0},'.format(key))
    for i in xrange(len(scores[0])):
        f.write('score_{0},'.format(i))
    f.write('score_mean,score_std\n')
    for i, params in enumerate(param_vals):
        f.write('{},'.format(i))
        for p in params:
            f.write('{0},'.format(p))
        for s in scores[i]:
            f.write('{0},'.format(s))
        f.write('{0},{1}\n'.format(scores[i].mean(), scores[i].std()))
    f.close()
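# Usage sketch (illustrative): dump the pickled CV results of one search run to
# a CSV under <data.path>/output. The file names here are assumptions, not
# names fixed by this module.
# write_cv_res_csv(cv_out='xgb-grid-scores.pkl', cv_csv_out='xgb-grid-scores.csv')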
def __init__(self, data_folder, train_fname=None, test_fname=None, k_fold_=None):
    self.random_state = 325243  # do not change it for different l2 models!
    #self.random_state = 98754  # do not change it for different l2 models!
    if not train_fname:
        sys.stderr.write('train_meta_feature is not set\n')
        sys.exit()
    if not test_fname:
        sys.stderr.write('test_meta_feature is not set\n')
        sys.exit()
    train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
    test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

    # load train data
    train = pd.read_csv(train_fname)
    self.train_id = train.values
    self.train_y = train.label.values
    self.train_x = train.drop(['label'], axis=1)

    # load test data
    test = pd.read_csv(test_fname)
    self.test_id = test.values
    self.test_y = test.label.values
    self.test_x = test.drop(['label'], axis=1)
    #print self.train_x.head()
    #print self.test_x.head()

    if k_fold_ is None:
        self.k_fold_ = 5
    else:
        self.k_fold_ = k_fold_
def main(stack_setting_):
    """
    [rawdata2filterdata Step]
     1. Reading raw datasets
     2. Dropping useless feature columns in the training set
     3. Dropping useless feature columns in the test set
    """
    raw_train_path = os.path.join(Config.get_string('data.path'),
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'),
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])

    print("= Reading raw datasets ...")
    names = ("age, workclass, fnlwgt, education, education-num, marital-status, "
             "occupation, relationship, race, sex, capital-gain, capital-loss, "
             "hours-per-week, native-country, TARGET").split(', ')

    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)  #, index_col=0, sep=','
    raw_train['TARGET'] = (raw_train['TARGET'].values == ' >50K').astype(np.int32)
    raw_train = raw_train.apply(lambda x: pd.factorize(x)[0])
    train_path = os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')

    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)  #, index_col=0, sep=','
    raw_test['TARGET'] = (raw_test['TARGET'].values == ' >50K').astype(np.int32)
    raw_test = raw_test.apply(lambda x: pd.factorize(x)[0])
    test_path = os.path.join(Config.get_string('data.path'),
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
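# Illustrative fragment of the stack_setting_ dict consumed above. The keys are
# the ones actually read here; the concrete file names are assumptions for the
# sketch, not values fixed by this module.
# stack_setting_ = {
#     '0-Level': {
#         'folder': 'input',
#         'raw': {'train': 'adult.data', 'test': 'adult.test'},
#         'train': 'filtered_train.csv',
#         'test': 'filtered_test.csv',
#     },
# }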
def __init__(self, wrapper_class, experiment, model_param_keys, model_param_space,
             cv_out=None, cv_pred_out=None, refit_pred_out=None,
             dump_round=10, use_lower=0, n_folds=5):
    """
    Constructor of bayes search.
    Support search on a set of model parameters, and record the cv result of each param configuration.
    :param wrapper_class: model wrapper type string like 'XgboostModel' or 'SklearnModel'
    :param experiment: experiment object of ExperimentL1 or ExperimentL2
    :param model_param_keys: list of model param keys. eg. ['paramA', 'paramB', 'paramC']
    :param model_param_space: list of model param space
    :param cv_out: output pickle file name of cross validation score results.
    :param cv_pred_out: prediction of each cross validation fold.
    :param refit_pred_out: refit on full train set and predict on test set.
    :return: None
    """
    self.wrapper_class = wrapper_class
    self.experiment = experiment
    self.model_param_keys = model_param_keys
    self.model_param_space = model_param_space
    self.integer_params = set()
    self.n_folds = n_folds
    for k, v in model_param_space.iteritems():
        vstr = str(v)
        if vstr.find('quniform') >= 0 \
                or vstr.find('qloguniform') >= 0 \
                or vstr.find('qnormal') >= 0 \
                or vstr.find('qlognormal') >= 0:
            #if v == hp.quniform or v == hp.qlognormal or v == hp.qnormal:
            self.integer_params.add(k)
    self.param_vals_list = []
    self.preds_list = []
    self.scores_list = []
    self.refit_preds_list = []
    self.model_name = self.wrapper_class.__name__
    self.cv_out = os.path.join(Config.get_string('data.path'), 'output', cv_out) if cv_out else None
    self.cv_pred_out = os.path.join(Config.get_string('data.path'), 'output', cv_pred_out) if cv_pred_out else None
    self.refit_pred_out = os.path.join(Config.get_string('data.path'), 'output', refit_pred_out) if refit_pred_out else None
    self.eval_round = 0
    self.dump_round = dump_round
    self.trials = Trials()
    self.use_lower = use_lower
def nn_param_avg_submission(prefix, top_k=1):
    exp = ExperimentL1()
    score_fname = os.path.join(Config.get_string('data.path'), 'output', prefix + '-scores.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', prefix + '-refit-preds.pkl')
    preds = get_top_model_avg_preds(score_fname, refit_pred_fname, topK=top_k)
    submission_fname = os.path.join(Config.get_string('data.path'), 'submission',
                                    'avg-{}-refit-preds{}.csv'.format(prefix, top_k))
    save_submissions(submission_fname, exp.test_id, preds)
def __init__(self, wrapper_class, experiment, model_param_keys, model_param_vals,
             cv_folder=None, cv_out=None, cv_pred_out=None, refit_pred_out=None):
    """
    Constructor of grid search.
    Support search on a set of model parameters, and record the cv result of each param configuration.
    :param wrapper_class: model wrapper type string like 'XgboostModel' or 'SklearnModel'
    :param experiment: experiment object of ExperimentL1 at 1-Level or ExperimentL2 at 2-Level
    :param model_param_keys: list of model param keys. eg. ['paramA', 'paramB', 'paramC']
    :param model_param_vals: list of model param values (iterable). eg. [['valAa', 'valAb'], [0.1, 0.2], (1, 2, 3)]
    :param cv_out: output pickle file name of cross validation score results.
    :param cv_pred_out: prediction of each cross validation fold.
    :param refit_pred_out: refit on full train set and predict on test set.
    :return: None (search_by_cv returns (best parameters, best score))
    """
    self.wrapper_class = wrapper_class
    self.experiment = experiment
    self.model_param_keys = model_param_keys
    self.model_param_vals = model_param_vals
    self.str_match = re.compile(r'loss')
    if wrapper_class == SklearnModel:
        self.model_name = model_param_vals[0]
    else:
        self.model_name = 'xgb'
    self.cv_out = os.path.join(Config.get_string('data.path'), cv_folder, cv_out) if cv_out else None
    self.cv_pred_out = os.path.join(Config.get_string('data.path'), cv_folder, cv_pred_out) if cv_pred_out else None
    self.refit_pred_out = os.path.join(Config.get_string('data.path'), cv_folder, refit_pred_out) if refit_pred_out else None
def ridge_blend(stack_setting_, best_param_):
    folder = stack_setting_['2-Level']['blending']['folder']
    blend_weight_fname = stack_setting_['2-Level']['blending']['weight']
    blend_weight_fname = os.path.join(Config.get_string('data.path'), folder, blend_weight_fname)
    linear_weight = pd.read_csv(blend_weight_fname)

    folder = stack_setting_['1-Level']['meta_features']['folder']
    test_fname = stack_setting_['1-Level']['meta_features']['test']
    test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
    test = pd.read_csv(test_fname)

    folder = stack_setting_['2-Level']['blending']['folder']
    model_fname = stack_setting_['2-Level']['blending']['model']
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'rb') as gf:
        model = cPickle.load(gf)

    y_test = test.label.values
    X_test = test.drop(['label'], axis=1)
    del test
    y_predict = model.predict(X_test)
    #return mean_squared_error(y_test, y_predict)
    return precision_recall(y_test, y_predict)
def __init__(self, data_folder, train_fname=None, test_fname=None, k_fold_=None):
    #self.random_state = 325243  # do not change it for different l1 models!
    self.random_state = 98754  # do not change it for different l1 models!
    if not train_fname:
        train_fname = 'filtered_train.csv'
    if not test_fname:
        test_fname = 'filtered_test.csv'
    train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
    test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

    # load train data
    with gzip.open(train_fname, 'rb') as gf:
        self.train_x, self.train_y = cPickle.load(gf)

    # load test data
    with gzip.open(test_fname, 'rb') as gf:
        self.test_x, self.test_y = cPickle.load(gf)

    if k_fold_ is None:
        self.k_fold_ = 5
    else:
        self.k_fold_ = k_fold_
def __init__(self, data_folder, train_fname=None, test_fname=None, k_fold_=None):
    #self.random_state = 325243  # do not change it for different l1 models!
    self.random_state = 98754  # do not change it for different l1 models!
    if not train_fname:
        train_fname = 'filtered_train.csv'
    if not test_fname:
        test_fname = 'filtered_test.csv'
    train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
    test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

    # load train data
    train = pd.read_csv(train_fname, dtype=np.float32)
    train.sort(columns='ID', inplace=1)
    self.train_id = train.values
    self.train_y = train.TARGET.values
    self.train_x = train.drop(['ID', 'TARGET'], axis=1)

    # load test data
    test = pd.read_csv(test_fname, dtype=np.float32)
    test.sort(columns='ID', inplace=1)
    self.test_id = test.values
    self.test_y = test.TARGET.values
    #self.test_x = test.drop(['ID'], axis=1)
    self.test_x = test.drop(['ID', 'TARGET'], axis=1)
    #print self.train_x.head()
    #print self.test_x.head()

    if k_fold_ is None:
        self.k_fold_ = 5
    else:
        self.k_fold_ = k_fold_
def get_top_cv_and_test_preds(out_fname_prefix, top_k=10, use_lower=0):
    """
    Get the top k cross-validation predictions of trainset and refit predictions
    of testset from experiment_l1 results.
    You can use numpy.hstack to join different model results.
    :param out_fname_prefix: prefix to identify a given experiment (L1)
    :param top_k: top k
    :return: top k cv preds and refit preds (numpy array)
    """
    from utils.config_utils import Config
    # file names
    score_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix + '-scores.pkl')
    pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix + '-preds.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix + '-refit-preds.pkl')
    # load pickle files
    param_keys, param_vals, scores = cp.load(open(score_fname, 'rb'))
    refit_preds = cp.load(open(refit_pred_fname, 'rb'))
    preds = cp.load(open(pred_fname, 'rb'))
    # calculate top results
    scores = np.asarray(scores)
    idxs = np.arange(len(scores))
    mscores = scores.mean(axis=1)
    if use_lower:
        # rank by the lower bound: mean minus per-model fold std
        # (a global scores.std() would shift every mean equally and never change the ranking)
        mscores -= scores.std(axis=1)
    idxs = sorted(idxs, key=lambda x: mscores[x], reverse=1)[:top_k]
    preds = np.transpose(np.asarray(preds)[idxs])
    refit_preds = np.transpose(np.asarray(refit_preds)[idxs])
    return preds, refit_preds
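# Usage sketch (illustrative): join the top-3 fold predictions of two L1
# experiments side by side, as the docstring suggests with numpy.hstack.
# The prefixes are assumptions for the sketch.
# xgb_cv, xgb_refit = get_top_cv_and_test_preds('xgb-bayes', top_k=3)
# nn_cv, nn_refit = get_top_cv_and_test_preds('nn-bayes', top_k=3)
# l2_train_x = np.hstack([xgb_cv, nn_cv])      # meta features for the L2 train set
# l2_test_x = np.hstack([xgb_refit, nn_refit]) # meta features for the L2 test set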
def get_cv_and_test_preds(out_fname_prefix, idxs):
    from utils.config_utils import Config
    pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix + '-preds.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix + '-refit-preds.pkl')
    refit_preds = cp.load(open(refit_pred_fname, 'rb'))
    preds = cp.load(open(pred_fname, 'rb'))
    preds = np.transpose(np.asarray(preds)[idxs])
    refit_preds = np.transpose(np.asarray(refit_preds)[idxs])
    return preds, refit_preds
def save_l2_submission(prefix='stacking-xgb'):
    import os
    exp = ExperimentL1()
    score_fname = os.path.join(Config.get_string('data.path'), 'output', prefix + '-scores.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', prefix + '-refit-preds.pkl')
    topK = 1
    preds = get_top_model_avg_preds(score_fname, refit_pred_fname, topK=topK)
    submission_fname = os.path.join(Config.get_string('data.path'), 'submission',
                                    prefix + '-refit-preds{}.csv'.format(topK))
    save_submissions(submission_fname, exp.test_id, preds)
def write2csv_meta_feature(self, model, meta_folder, meta_train_fname,
                           meta_test_fname, meta_header, best_param_):
    kfold = cross_validation.StratifiedKFold(self.train_y, n_folds=5, shuffle=True,
                                             random_state=self.random_state)
    model.set_params(**best_param_)
    transform_train = np.zeros((self.train_x.shape[0], 2), dtype=np.float32)
    transform_test = np.zeros((self.test_x.shape[0], 2), dtype=np.float32)

    # transform train data
    for i, (train_idx, test_idx) in enumerate(kfold):
        print(' [Meta Feature] --------- fold {0} ---------- '.format(i))
        train_x = self.train_x.iloc[train_idx]
        train_y = self.train_y[train_idx]
        test_x = self.train_x.iloc[test_idx]
        test_y = self.train_y[test_idx]
        model.fit(train_x, train_y)
        #transform_train[test_idx, 0] = model.predict_proba(test_x)[:, 1].astype(np.float32)
        transform_train[test_idx, 0] = self.get_proba(model, test_x).astype(np.float32)
        transform_train[test_idx, 1] = test_y.astype(np.int32)
    meta_train_fname = os.path.join(Config.get_string('data.path'), meta_folder, meta_train_fname)
    np.savetxt(meta_train_fname, transform_train, delimiter=',',
               header=meta_header, comments='', fmt='%1.10e,%d')
    del transform_train

    # transform test data
    model.fit(self.train_x, self.train_y)
    #transform_test = model.predict_proba(self.test_x)[:, 1].astype(np.float32)
    transform_test[:, 0] = self.get_proba(model, self.test_x).astype(np.float32)
    transform_test[:, 1] = self.test_y.astype(np.int32)
    meta_test_fname = os.path.join(Config.get_string('data.path'), meta_folder, meta_test_fname)
    np.savetxt(meta_test_fname, transform_test, delimiter=',',
               header=meta_header, comments='', fmt='%1.10e,%d')
    del transform_test
def make_hold_out_backup(stack_setting_):
    """
    input
     train
    output
     train, ptrain, ptest
    """
    split_ratio = stack_setting_['2-Level']['blending']['hold_out_ratio']

    # train
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
    train = pd.read_csv(train_fname)

    nrows = len(train.index)
    #a, b = int(nrows * split_ratio), nrows - int(nrows * split_ratio)
    a = nrows - int(nrows * split_ratio)
    train, hold_out = train[:a], train[a:]
    # train data for (meta_feature, label)
    train.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['2-Level']['blending']['folder'],
                              stack_setting_['2-Level']['blending']['train']),
                 index=False)

    nrows = len(hold_out.index)
    a = int(nrows * 0.5)  # for the hold-out set, we split into half train and half test
    p_train, p_test = hold_out[:a], hold_out[a:]
    p_train.to_csv(os.path.join(Config.get_string('data.path'),
                                stack_setting_['2-Level']['blending']['folder'],
                                stack_setting_['2-Level']['blending']['ptrain']),
                   index=False)
    p_test.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['2-Level']['blending']['folder'],
                               stack_setting_['2-Level']['blending']['ptest']),
                  index=False)

    print '----------- train data -----------'
    print train['label'].value_counts()
    print '----------- p_train_data -----------'
    print p_train['label'].value_counts()
    print '----------- p_test_data -----------'
    print p_test['label'].value_counts()
    return True
def make_hold_out(stack_setting_):
    """
    input
     train
    output
     train, ptrain, ptest
    """
    split_ratio = stack_setting_['2-Level']['blending']['hold_out_ratio']

    # train
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)

    meta_train_at_blend = os.path.join(Config.get_string('data.path'),
                                       stack_setting_['2-Level']['blending']['folder'],
                                       stack_setting_['2-Level']['blending']['train'])
    meta_ptrain_at_blend = os.path.join(Config.get_string('data.path'),
                                        stack_setting_['2-Level']['blending']['folder'],
                                        stack_setting_['2-Level']['blending']['ptrain'])
    meta_ptest_at_blend = os.path.join(Config.get_string('data.path'),
                                       stack_setting_['2-Level']['blending']['folder'],
                                       stack_setting_['2-Level']['blending']['ptest'])
    meta_test_at_blend = os.path.join(Config.get_string('data.path'),
                                      stack_setting_['2-Level']['blending']['folder'],
                                      'meta_test_at_blend.csv')

    # 1. split train file into train and hold out
    fs = File_Split(test_size=split_ratio)
    fs.__iter__(fname=train_fname,
                train_fname=meta_train_at_blend,
                test_fname=meta_test_at_blend)
    del fs

    # 2. split the hold out into ptrain and ptest
    fs = File_Split(test_size=0.5)
    fs.__iter__(fname=meta_test_at_blend,
                train_fname=meta_ptrain_at_blend,
                test_fname=meta_ptest_at_blend)
    del fs
    return True
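# File_Split is used above but not defined in this section. A minimal sketch of
# what a compatible splitter might look like, assuming a line-oriented CSV split
# that keeps the header row in both outputs (an assumption, not the project's
# actual class):
class File_Split(object):
    def __init__(self, test_size=0.2):
        self.test_size = test_size

    def __iter__(self, fname, train_fname, test_fname):
        with open(fname) as fin:
            lines = fin.readlines()
        header, body = lines[0], lines[1:]
        # keep the first (1 - test_size) fraction for train, the rest for test
        a = len(body) - int(len(body) * self.test_size)
        with open(train_fname, 'w') as ftr:
            ftr.writelines([header] + body[:a])
        with open(test_fname, 'w') as fte:
            fte.writelines([header] + body[a:])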
def xgb_submmision(exp, param=None):
    if not param:
        param = {
            'colsample_bytree': 0.6923529515220681,
            'silent': 1,
            'model_type': XGBClassifier,
            'learning_rate': 0.014582411837608816,
            'nthread': 4,
            'min_child_weight': 6.0,
            'n_estimators': 400,
            'subsample': 0.5530324529773664,
            'seed': 9438,
            'objective': 'binary:logistic',
            'max_depth': 8.0
        }
    xgb_model = SklearnModel(param)
    final_preds = exp.fit_fullset_and_predict(xgb_model)
    submission_path = os.path.join(Config.get_string('data.path'), 'submission')
    fname = os.path.join(submission_path, xgb_model.to_string().split("-")[0] + '_res.csv')
    #fname = os.path.join(submission_path, 'xgb_bayes_param_res.csv')
    #print final_preds
    #print exp.test_id
    save_submissions(fname, exp.test_id, final_preds)
def write_cv_res_csv(cv_out, cv_csv_out):
    cv_out = os.path.join(Config.get_string('data.path'), 'output', cv_out)
    cv_csv_out = os.path.join(Config.get_string('data.path'), 'output', cv_csv_out)
    param_keys, param_vals, scores = cp.load(open(cv_out, 'rb'))
    assert len(param_vals) == len(scores), \
        'Error: param value list length does not match score list length!'
    assert len(param_keys) == len(param_vals[0]), \
        'Error: param key count and value count do not match!'
    if isinstance(param_vals[0], dict):
        param_keys = param_vals[0].keys()
        param_vals = [param.values() for param in param_vals]
    f = open(cv_csv_out, 'w')
    for key in param_keys:
        f.write('{0},'.format(key))
    for i in xrange(len(scores[0])):
        f.write('score_{0},'.format(i))
    f.write('score_mean,score_std\n')
    for i, params in enumerate(param_vals):
        for p in params:
            f.write('{0},'.format(p))
        for s in scores[i]:
            f.write('{0},'.format(s))
        f.write('{0},{1}\n'.format(scores[i].mean(), scores[i].std()))
    f.close()
def get_xgb_feature_importance_plot(best_param_, experiment_,
                                    png_folder, png_fname,
                                    score_threshold=0.8):
    # 1. fit a classifier with the best parameters
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except KeyError:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({'name': index2feature.keys(),
                        'score': index2feature.values()})
    fis = fis.sort('score', ascending=False)
    if len(fis.index) > 20:
        # keep only the features whose score is above the given quantile
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % score_threshold
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2, 2)
    #ax1 = plt.subplot(gs[:, 0])
    #ax2 = plt.subplot(gs[0, 1])
    #ax3 = plt.subplot(gs[1, 1])

    # 2.1 feature importance
    sns.barplot(x='score', y='name', data=fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 2.2 PDF
    confidence_score = clf.oob_decision_function_[:, 1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 2.3 CDF
    num_bins = min(best_param_.get('n_estimators', 1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    # NOTE: writes into the fixed 'graph' folder; the png_folder argument is unused here
    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)  #, bbox_inches='tight', pad_inches=1)
    plt.close()
    return True
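# Usage sketch (illustrative; the experiment construction and file names are
# assumptions for the sketch):
# exp = ExperimentL1(data_folder='input', train_fname='filtered_train.csv',
#                    test_fname='filtered_test.csv')
# get_xgb_feature_importance_plot(best_param_={'n_estimators': 400, 'max_depth': 8},
#                                 experiment_=exp,
#                                 png_folder='graph', png_fname='xgb_fi.png')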
def write2csv_meta_feature(self, model, meta_folder, meta_train_fname,
                           meta_test_fname, meta_header, best_param_):
    kfold = cross_validation.StratifiedKFold(self.train_y, n_folds=self.k_fold_, shuffle=True,
                                             random_state=self.random_state)
    # set model with best parameter
    model.set_params(**best_param_)
    transform_train = np.zeros((self.train_x.shape[0], 2), dtype=np.float32)
    transform_test = np.zeros((self.test_x.shape[0], 2), dtype=np.float32)

    # transform train data
    for i, (train_idx, test_idx) in enumerate(kfold):
        print(' [Meta Feature] --------- fold {0} ---------- '.format(i))
        train_x = self.train_x.iloc[train_idx]
        train_y = self.train_y[train_idx]
        test_x = self.train_x.iloc[test_idx]
        test_y = self.train_y[test_idx]
        model.fit(train_x, train_y)
        #transform_train[test_idx, 0] = model.predict_proba(test_x)[:, 1].astype(np.float32)
        transform_train[test_idx, 0] = self.get_proba(model, test_x).astype(np.float32)
        transform_train[test_idx, 1] = test_y.astype(np.int32)
    meta_train_fname = os.path.join(Config.get_string('data.path'), meta_folder, meta_train_fname)
    np.savetxt(meta_train_fname, transform_train, delimiter=',',
               header=meta_header, comments='', fmt='%1.10e,%d')
    del transform_train

    # transform test data
    model.fit(self.train_x, self.train_y)
    #transform_test = model.predict_proba(self.test_x)[:, 1].astype(np.float32)
    transform_test[:, 0] = self.get_proba(model, self.test_x).astype(np.float32)  # predict label prob
    transform_test[:, 1] = self.test_y.astype(np.int32)  # true label
    meta_test_fname = os.path.join(Config.get_string('data.path'), meta_folder, meta_test_fname)
    np.savetxt(meta_test_fname, transform_test, delimiter=',',
               header=meta_header, comments='', fmt='%1.10e,%d')
    del transform_test
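# The CSVs written above hold one out-of-fold probability column plus the true
# label, so a 2-Level experiment can read them back directly. A minimal sketch
# of the consuming side (the 'proba,label' header is an assumption; it is
# whatever meta_header was passed in):
# meta = pd.read_csv('meta_train.csv')   # columns: proba, label
# l2_train_x = meta.drop(['label'], axis=1)
# l2_train_y = meta.label.values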
def main():
    if len(sys.argv) != 3:
        print 'Usage: python submit_utils.py <model-prefix> <model-idxs>'
        exit()
    from utils.config_utils import Config
    model_prefix = sys.argv[1]
    score_fname = os.path.join(Config.get_string('data.path'), 'output', model_prefix + '-scores.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', model_prefix + '-refit-preds.pkl')
    model_idxs = sys.argv[2].strip()
    idxs = [int(s) for s in model_idxs.split(',')]
    preds = get_selected_model_avg_preds(score_fname, refit_pred_fname, idxs)
    from experiment.stacking.experiment_l1 import ExperimentL1
    exp = ExperimentL1()
    submission_fname = os.path.join(Config.get_string('data.path'), 'submission',
                                    '{}-{}-submission.csv'.format(model_prefix, model_idxs))
    save_submissions(submission_fname, exp.test_id, preds)
def load_mnist_labels(filename):
    filename = os.path.join(Config.get_string('data.path'), 'input', filename)
    if not os.path.exists(filename):
        download(filename)
    # Read the labels in Yann LeCun's binary format.
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    # The labels are vectors of integers now, that's exactly what we want.
    return data
def __init__(self, data_folder, train_fname=None, test_fname=None):
    #self.random_state = 325243  # do not change it for different l1 models!
    self.random_state = 98754  # do not change it for different l1 models!
    if not train_fname:
        train_fname = 'filtered_train.csv'
    if not test_fname:
        test_fname = 'filtered_test.csv'
    train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
    test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

    # load train data
    with gzip.open(train_fname, 'rb') as gf:
        self.train_x, self.train_y = cPickle.load(gf)

    # load test data
    with gzip.open(test_fname, 'rb') as gf:
        self.test_x, self.test_y = cPickle.load(gf)
def load_raw_data(dataset_name):
    pkl_fname = os.path.join(Config.get_string('data.path'), 'input', dataset_name + '.pkl')
    if not os.path.exists(pkl_fname):
        path = os.path.join(Config.get_string('data.path'), 'input', dataset_name)
        order_df = load_order_data(path)
        traffic_df = load_traffic_data(path)
        weather_df = load_weather_data(path)
        cluster_map = pd.read_csv(os.path.join(path, 'cluster_map/cluster_map'),
                                  sep='\t', names=['district_hash', 'district_id'])
        poi_data, poi_cnt = load_poi_data(path)
        data = order_df, traffic_df, weather_df, cluster_map, poi_data, poi_cnt
        cp.dump(data, open(pkl_fname, 'wb'), protocol=2)
    else:
        data = cp.load(open(pkl_fname, 'rb'))
    return data
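# Usage sketch (illustrative; 'season_1' is an assumed dataset folder name
# under <data.path>/input). The first call builds the pickle cache, later
# calls just reload it.
# order_df, traffic_df, weather_df, cluster_map, poi_data, poi_cnt = \
#     load_raw_data('season_1')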
def principal_component_analysis(x_train):
    """
    Principal Component Analysis (PCA) identifies the combination of
    attributes (principal components, or directions in the feature space)
    that account for the most variance in the data.

    Let's calculate the first 2 principal components of the training data,
    and then create a scatter plot visualizing the training data examples
    projected on the calculated components.
    """
    # Extract the variable to be predicted
    y_train = x_train["TARGET"]
    x_train = x_train.drop(labels="TARGET", axis=1)
    classes = np.sort(np.unique(y_train))
    labels = ["Satisfied customer", "Unsatisfied customer"]

    # Normalize each feature to unit norm (vector length)
    x_train_normalized = normalize(x_train, axis=0)

    # Run PCA
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(x_train_normalized)

    # Visualize
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1)
    colors = [(0.0, 0.63, 0.69), 'black']
    markers = ["o", "D"]
    for class_ix, marker, color, label in zip(classes, markers, colors, labels):
        ax.scatter(x_train_projected[np.where(y_train == class_ix), 0],
                   x_train_projected[np.where(y_train == class_ix), 1],
                   marker=marker, color=color, edgecolor='whitesmoke',
                   linewidth=1, alpha=0.9, label=label)
    ax.legend(loc='best')
    plt.title("Scatter plot of the training data examples projected on the "
              "first 2 principal components")
    plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance"
               % (pca.explained_variance_ratio_[0] * 100.0))
    plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance"
               % (pca.explained_variance_ratio_[1] * 100.0))
    #plt.show()
    #plt.savefig("../data/pca.pdf", format='pdf')
    #plt.savefig("../data/pca.png", format='png')
    plt.savefig(os.path.join(Config.get_string('data.path'), 'graph', 'pca.png'),
                format='png')
    plt.close()
def dump_stacking_setting(stack_setting_):
    text = json.dumps(stack_setting_, sort_keys=True, ensure_ascii=False, indent=4)
    data_folder = stack_setting_['setting']['folder']
    fname = stack_setting_['setting']['name']
    fname = os.path.join(Config.get_string('data.path'), data_folder, fname)
    with open(fname, 'w') as fh:
        fh.write(text.encode('utf-8'))
    return True
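# Usage sketch (illustrative): persist the run configuration next to the data
# so an experiment can be reproduced. The 'setting' keys mirror the ones read
# above; the folder and file name are assumptions for the sketch.
# stack_setting_ = {'setting': {'folder': 'setting', 'name': 'stack_setting.json'},
#                   '0-Level': {}, '1-Level': {}, '2-Level': {}}
# dump_stacking_setting(stack_setting_)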
def get_optimal_blend_weigth(exp_, best_param_, folder, fname, model_fname):
    clf = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    clf.set_params(**best_param_)
    clf.fit(X_test, y_test)

    # dump2csv optimal linear weight
    names = np.append(np.array(['intercept'], dtype='S100'), X_test.columns.values)
    coefs = np.append(clf.intercept_, clf.coef_).astype(np.float64)
    optimal_linear_weight = pd.DataFrame(coefs.reshape(1, len(coefs)), columns=names)
    optimal_linear_weight.to_csv(os.path.join(Config.get_string('data.path'),
                                              folder, fname),
                                 index=False)

    # dump2cpkle for ridge model
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'wb') as gf:
        cPickle.dump(clf, gf, cPickle.HIGHEST_PROTOCOL)
    return True
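# Usage sketch (illustrative; folder and file names are assumptions): fit the
# ridge blender and persist both the linear weights and the model that
# ridge_blend() later reloads.
# get_optimal_blend_weigth(exp_=exp, best_param_={'alpha': 1.0},
#                          folder='blend', fname='blend_weight.csv',
#                          model_fname='ridge_model.pkl.gz')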
def load_mnist_images(filename):
    filename = os.path.join(Config.get_string('data.path'), 'input', filename)
    if not os.path.exists(filename):
        download(filename)
    # Read the inputs in Yann LeCun's binary format.
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    # The inputs are vectors now, we reshape them to monochrome 2D images,
    # following the shape convention: (examples, channels, rows, columns)
    data = data.reshape(-1, 1, 28, 28)
    # The inputs come as bytes, we convert them to float32 in range [0,1].
    # (Actually to range [0, 255/256], for compatibility to the version
    # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
    return data / np.float32(256)
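# Usage sketch (illustrative): the standard MNIST archive names, assumed to sit
# under <data.path>/input or to be fetched by download().
# X_train = load_mnist_images('train-images-idx3-ubyte.gz')
# y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
# print(X_train.shape)  # (60000, 1, 28, 28)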
def __init__(self, data_folder, train_fname=None, test_fname=None):
    self.random_state = 325243  # do not change it for different l2 models!
    #self.random_state = 98754  # do not change it for different l2 models!
    if not train_fname:
        sys.stderr.write('train_meta_feature is not set\n')
        sys.exit()
    if not test_fname:
        sys.stderr.write('test_meta_feature is not set\n')
        sys.exit()
    train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
    test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

    # load train data
    train = pd.read_csv(train_fname)
    self.train_id = train.values
    self.train_y = train.label.values
    self.train_x = train.drop(['label'], axis=1)

    # load test data
    test = pd.read_csv(test_fname)
    self.test_id = test.values
    self.test_y = test.label.values
    self.test_x = test.drop(['label'], axis=1)
def __init__(self, data_folder, train_fname=None, test_fname=None):
    #self.random_state = 325243  # do not change it for different l1 models!
    self.random_state = 98754  # do not change it for different l1 models!
    if not train_fname:
        train_fname = 'filtered_train.csv'
    if not test_fname:
        test_fname = 'filtered_test.csv'
    train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
    test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

    # load train data
    train = pd.read_csv(train_fname)
    train.sort(columns='ID', inplace=1)
    self.train_id = train.values
    self.train_y = train.TARGET.values
    self.train_x = train.drop(['ID', 'TARGET'], axis=1)

    # load test data
    test = pd.read_csv(test_fname)
    test.sort(columns='ID', inplace=1)
    self.test_id = test.values
    self.test_y = test.TARGET.values
    #self.test_x = test.drop(['ID'], axis=1)
    self.test_x = test.drop(['ID', 'TARGET'], axis=1)
def __init__(self, data_set, abits, wbits, network_type, seed):
    self.network_type = network_type
    self.abits = abits
    self.wbits = wbits
    self.data_set = data_set
    self.seed = seed
    self.model = Sequential()
    cfDeep = self.myCF(self)
    if self.data_set == 'mnist':
        cfg = 'config_MNIST'
    elif self.data_set == 'fashion':
        cfg = 'config_FASHION'
    elif self.data_set == 'cifar10':
        cfg = 'config_CIFAR-10'
    self.cf = Config(cfg, cmd_args=cfDeep.myDict)
    print("Dataset: " + str("%s_pic/" % self.data_set))
    assure_path_exists("%s_pic/" % self.data_set)
def remove_feat_identicals(data_frame):
    # Find feature vectors having the same values in the same order and
    # remove all but one of those redundant features.
    print("")
    print("Deleting identical features...")
    n_features_originally = data_frame.shape[1]
    # Find the names of identical features by going through all the
    # combinations of features (each pair is compared only once).
    feat_names_delete = []
    for feat_1, feat_2 in itertools.combinations(iterable=data_frame.columns, r=2):
        if np.array_equal(data_frame[feat_1], data_frame[feat_2]):
            feat_names_delete.append(feat_2)
    feat_names_delete = np.unique(feat_names_delete)
    # Delete the identical features
    data_frame = data_frame.drop(labels=feat_names_delete, axis=1)
    n_features_deleted = len(feat_names_delete)
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame


if __name__ == "__main__":
    train_data_path = os.path.join(Config.get_string('data.path'), 'input', 'train.csv')
    x_train = pd.read_csv(filepath_or_buffer=train_data_path, index_col=0, sep=',')
    x_train = remove_feat_constants(x_train)
    x_train = remove_feat_identicals(x_train)
    principal_component_analysis(x_train)
def main(stack_setting_):
    # for train set
    fname = os.path.join(Config.get_string('data.path'),
                         stack_setting_['0-Level']['folder'],
                         stack_setting_['0-Level']['filter']['train'])
    train = pd.read_csv(fname)

    # for test set
    fname = os.path.join(Config.get_string('data.path'),
                         stack_setting_['0-Level']['folder'],
                         stack_setting_['0-Level']['filter']['test'])
    test = pd.read_csv(fname)

    print("= Stats Summary in train and test set ")
    train1 = extend_df(train.copy())
    test1 = extend_df(test.copy())
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['raw_extend']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['raw_extend']['test']),
                 index=False)

    print("= Scaling in train and test set ")
    train1, test1 = scale(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['scaled']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['scaled']['test']),
                 index=False)
    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['scaled_extend']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['scaled_extend']['test']),
                 index=False)

    train1, test1 = standard(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['standard']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['standard']['test']),
                 index=False)
    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['standard_extend']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['standard_extend']['test']),
                 index=False)
    return True
from sklearn import preprocessing
import scipy.special as special
from pandas import DataFrame, Series
from tqdm import tqdm
import time
import random
import numpy as np
# import sys
# sys.path.extend('../')
from utils.data_utils import preprocess
from utils.config_utils import Config
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.io as scio

cfg = Config()
np.random.seed(cfg.seed)
random.seed(cfg.seed)

"""
Feature Extraction Tools
 TF-IDF + W2V + Multi-label + Onehot + Click multiply + Time Sequence + Shuffle
"""


def tfidf(log, pivot_key, out_key, flag, max_df=0.99):
    """
    TF-IDF Features
def main():
    # for train set
    fname = os.path.join(Config.get_string('data.path'), 'input', 'filtered_train.csv')
    train = pd.read_csv(fname)
    # for test set
    fname = os.path.join(Config.get_string('data.path'), 'input', 'filtered_test.csv')
    test = pd.read_csv(fname)

    # for extended
    print '--- extending raw dataset ---'
    train1 = extend_df(train.copy())
    test1 = extend_df(test.copy())
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'raw_extend_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'raw_extend_test.csv'), index=0)

    # for scaled
    print '--- scaling raw dataset to [0, 1] ---'
    train1, test1 = scale(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_test.csv'), index=0)

    # for extended scaled
    print '--- extending scaled dataset ---'
    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_extend_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_extend_test.csv'), index=0)

    # for normalized data
    print '--- standardizing dataset ---'
    train1, test1 = standard(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_test.csv'), index=0)

    # for pca
    print "--- transforming pca dataset ----"
    train2, test2 = pca(train1, test1, components=100)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca100_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca100_test.csv'), index=0)
    train2, test2 = pca(train1, test1, components=200)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca200_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca200_test.csv'), index=0)

    # for pca extend
    print "--- standard -> standard + pca ----"
    train2, test2 = pca_extend(train1, test1, components=10)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca10_and_standard_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca10_and_standard_test.csv'), index=0)
    train2, test2 = pca_extend(train1, test1, components=20)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca20_and_standard_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca20_and_standard_test.csv'), index=0)
    del train2
    del test2

    # for extended normalized data
    print '--- extending standard dataset ---'
    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_extend_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_extend_test.csv'), index=0)
def combine_meta_features(stack_setting_):
    #data_folder = stack_setting_['1-Level']['meta_features']
    #fname = stack_setting_['setting']['name']
    #fname = os.path.join(Config.get_string('data.path'), data_folder, fname)

    train_merge = []
    test_merge = []
    for model_name in stack_setting_['1-Level'].keys():
        try:
            if model_name == 'gbdt_linear':
                # train : the gbdt_linear meta features are spread over several
                # files, so glob them with a shell `ls` and read each one
                folder = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['train']
                cmd = "ls %s%s/%s*%s*.%s" % (Config.get_string('data.path'),
                                             folder,
                                             "_".join(".".join(train_fname.split('.')[:-1]).split("_")[:-1]),
                                             ".".join(train_fname.split('.')[:-1]).split("_")[-1],
                                             train_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    train = pd.read_csv(line.rstrip('\n'))
                    col_name = train.columns.values[:-1]
                    X_train = train[col_name]
                    col_name = train.columns.values[-1]
                    y_train = train[col_name]
                    train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['test']
                cmd = "ls %s%s/%s*%s*.%s" % (Config.get_string('data.path'),
                                             folder,
                                             "_".join(".".join(test_fname.split('.')[:-1]).split("_")[:-1]),
                                             ".".join(test_fname.split('.')[:-1]).split("_")[-1],
                                             test_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    test = pd.read_csv(line.rstrip('\n'))
                    col_name = test.columns.values[:-1]
                    X_test = test[col_name]
                    col_name = test.columns.values[-1]
                    y_test = test[col_name]
                    test_merge.append(X_test)
            else:
                # train
                folder = stack_setting_['1-Level'][model_name]['meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['meta_feature']['train']
                train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
                train = pd.read_csv(train_fname)
                col_name = train.columns.values[:-1]
                X_train = train[col_name]
                col_name = train.columns.values[-1]
                y_train = train[col_name]
                train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['meta_feature']['test']
                test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
                test = pd.read_csv(test_fname)
                col_name = test.columns.values[:-1]
                X_test = test[col_name]
                col_name = test.columns.values[-1]
                y_test = test[col_name]
                test_merge.append(X_test)
        except:
            # entries like 'meta_features' carry no per-model files; skip them
            pass

    train_merge.append(y_train)
    train_merge = pd.concat(train_merge, ignore_index=False, axis=1)
    #print train_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
    train_merge.to_csv(train_fname, index=False)

    test_merge.append(y_test)
    test_merge = pd.concat(test_merge, ignore_index=False, axis=1)
    #print test_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    test_fname = stack_setting_['1-Level']['meta_features']['test']
    test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
    test_merge.to_csv(test_fname, index=False)
    return True
def main_transform(stack_setting_):
    """
    [rawdata2filterdata Step]
     1. Reading raw datasets
     2. Dropping useless feature columns in the training set
     3. Dropping useless feature columns in the test set
    """
    raw_train_path = os.path.join(Config.get_string('data.path'),
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'),
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])

    print("= Reading raw datasets ...")
    names = ("age, workclass, fnlwgt, education, education-num, material, "
             "occupation, relationship, race, sex, capital-gain, capital-loss, "
             "hours-per-week, country, TARGET").split(', ')

    # train
    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)  #, index_col=0, sep=','
    train_labels = pd.DataFrame((raw_train['TARGET'].values == ' >50K').astype(np.int32),
                                columns=['TARGET'])
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    #print 'train summary'
    #print raw_train[['age', 'hours-per-week', 'fnlwgt']].describe()
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_train['%s_%s' % (key, 'log')] = np.log(raw_train[key] + 1.0).astype(np.float32)
        raw_train = raw_train.drop(labels=[key], axis=1)
    raw_train['TARGET'] = train_labels
    del train_labels

    # test
    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)  #, index_col=0, sep=','
    test_labels = pd.DataFrame((raw_test['TARGET'].values == ' >50K').astype(np.int32),
                               columns=['TARGET'])
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    print 'test summary-1'
    print raw_test.describe()
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_test['%s_%s' % (key, 'log')] = np.log(raw_test[key] + 1.0).astype(np.float32)
        raw_test = raw_test.drop(labels=[key], axis=1)
    print '\n'
    print 'test summary-2'
    print raw_test.describe()
    raw_test['TARGET'] = test_labels
    del test_labels

    # main
    print("= Transform Categorical Variables into One-Hot-Encoding ...")
    categoricals = ['workclass', 'education', 'material', 'occupation',
                    'relationship', 'race', 'sex', 'country']
    combined = pd.concat([raw_train, raw_test])
    #print combined.head()
    #combined_categoricals = combined[categoricals]
    clf = onehot_encode()
    clf.fit(combined, categoricals)

    # dump train
    raw_train = clf.transform(raw_train, categoricals)
    train_label = raw_train.TARGET.values
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    raw_train['TARGET'] = pd.DataFrame(train_label)
    del train_label
    train_path = os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')

    # dump test
    raw_test = clf.transform(raw_test, categoricals)
    test_label = raw_test.TARGET.values
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    raw_test['TARGET'] = pd.DataFrame(test_label)
    del test_label
    test_path = os.path.join(Config.get_string('data.path'),
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
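# onehot_encode is used above but not defined in this section. A minimal sketch
# of a compatible fit/transform wrapper built on pandas.get_dummies (an
# assumption about the interface, not the project's actual class):
import pandas as pd

class onehot_encode(object):
    def fit(self, df, categoricals):
        # remember every level seen in the combined train+test frame
        self.levels_ = {c: sorted(df[c].unique()) for c in categoricals}

    def transform(self, df, categoricals):
        for c in categoricals:
            # fix the category levels so train and test get identical dummy columns
            df[c] = pd.Categorical(df[c], categories=self.levels_[c])
        return pd.get_dummies(df, columns=categoricals)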
    for feat_1, feat_2 in itertools.combinations(iterable=data_frame.columns, r=2):
        if np.array_equal(data_frame[feat_1], data_frame[feat_2]):
            feat_names_delete.append(feat_2)
    feat_names_delete = np.unique(feat_names_delete)
    # Delete the identical features
    data_frame = data_frame.drop(labels=feat_names_delete, axis=1)
    n_features_deleted = len(feat_names_delete)
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame, feat_names_delete


if __name__ == '__main__':
    data_path = os.path.join(Config.get_string('data.path'), 'input')
    raw_train_path = os.path.join(data_path, 'train.csv')
    raw_test_path = os.path.join(data_path, 'test.csv')

    print("= Reading raw datasets ...")
    raw_train = pd.read_csv(raw_train_path, index_col=0, sep=',')
    raw_test = pd.read_csv(raw_test_path, index_col=0, sep=',')

    print("= Dropping useless feature columns in the training set ")
    raw_train, feat_to_delete = remove_feat_constants(raw_train)
    raw_train, temp = remove_feat_identicals(raw_train)
    feat_to_delete.extend(temp)

    print("= Dropping useless feature columns in the test set:")
    print feat_to_delete
    raw_test = raw_test.drop(feat_to_delete, axis=1)
import os

from seq2seq_regression.Seq2SeqRegression import train_on_plouffe_copy
from utils.config_utils import Config, flat_dict, flat_dict_helper


if __name__ == "__main__":
    config_path = os.getcwd()
    config = Config(config_path)
    cmd_args = config.config_init_parser()
    load_params = 0
    if cmd_args.config_file is None:
        sess_args = vars(cmd_args)
        load_params = 1
    else:
        yml_args = config.config_parse_yaml()
        sess_args = flat_dict(yml_args)
    train_on_plouffe_copy(sess_args, load_params)
def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):
    """
    upper model is GBDT or Random Forest
    lower model is a linear classifier
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting JSON file\n')
        sys.exit()

    if num_proc is None:
        num_proc = 6

    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state',
                            'subsample', 'max_features', 'max_leaf_nodes',
                            'learning_rate', 'max_depth', 'min_samples_leaf']
    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0],
                            [0.1], [5], [20], [0.1], [2], [8]]

    # grid search for the upper model : GBDT or Random Forest
    # ExperimentL1 is model-free; the data, on the other hand, is fixed
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'],
                       test_fname = stack_setting_['0-Level']['test'])

    # GridSearch wraps a single model; which model is determined by the params
    #gs = GridSearch(SklearnModel, exp, upper_param_keys, upper_param_vals,
    #                cv_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['folder'],
    #                cv_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_out'],
    #                cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_pred_out'],
    #                refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['refit_pred_out'])
    #upper_best_param, upper_best_score = gs.search_by_cv()

    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_train_fname = os.path.join(Config.get_string('data.path'),
                                     model_folder,
                                     model_train_fname)
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    model_test_fname = os.path.join(Config.get_string('data.path'),
                                    model_folder,
                                    model_test_fname)

    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))
    upper_best_params = None  # stays None when the transformed features are already cached
    if not os.path.isfile(model_train_fname) and \
       not os.path.isfile(model_test_fname):

        # upper_param_dict['model_type'] == [GradientBoostingClassifier]
        del upper_param_dict['model_type']
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict,
                              verbose = 10,
                              scoring = "f1",  #scoring = "precision" or "recall"
                              n_jobs = num_proc, cv = 5)

        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print upper_best_params
        del clf_cv

        # refit on the full training set with the best parameters
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder']
        graph_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name']
        graph_fname = os.path.join(Config.get_string('data.path'),
                                   graph_folder,
                                   graph_fname)
        gs = GridSpec(2,2)
        ax1 = plt.subplot(gs[0,1])
        ax2 = plt.subplot(gs[1,1])
        ax3 = plt.subplot(gs[:,0])

        ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        ax1.set_xlabel('number of weak learners : boosting iterations')
        ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax1.legend(loc="best")

        # dump the transformed (tree-leaf one-hot) features
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        if type(X_train) == pd.core.frame.DataFrame:
            clf.fit(X_train.as_matrix().astype(np.float32), y_train)
        elif type(X_train) == np.ndarray:
            clf.fit(X_train.astype(np.float32), y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)
        if type(X_train) == pd.core.frame.DataFrame:
            for n_iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))):
                test_loss[n_iter] = clf.estimator_.loss_(y_test, y_pred)
        elif type(X_train) == np.ndarray:
            for n_iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))):
                test_loss[n_iter] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensembles
        score_threshold = 0.8
        index2feature = dict(zip(np.arange(len(X_train.columns.values)),
                                 X_train.columns.values))
        feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]]
        feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index]
        fis = pd.DataFrame({'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
                            'score':feature_importances_score})
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        #print clf.toarray().shape
        # > (26049, 100) : input_features = 26049, weak_learners = 100
        #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0]
        #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:]

        ## feature transformation : transform the test data with the trees fit on train
        #print transformated_train_features.shape, X_train.shape
        #print transformated_test_features.shape, X_test.shape
        transformated_train_features = clf.one_hot_encoding
        if type(X_test) == pd.core.frame.DataFrame:
            transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32),
                                                        y_test)
        elif type(X_test) == np.ndarray:
            transformated_test_features = clf.transform(X_test, y_test)

        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)
        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

    """
    # 2. lower model (older version, kept commented out)
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']
    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64],
                            ['uniform', 'distance'], ['ball_tree'], [30], ['minkowski'],
                            [2], [4]]
    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # grid search for the lower model : linear classifier
        # ExperimentL1_1 is model-free; the data, on the other hand, is fixed
        model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                             train_fname = model_train_fname,
                             test_fname = model_test_fname)
        # GridSearch wraps a single model; which model is determined by the params
        gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                        cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                        cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'],
                        cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'],
                        refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
        lower_best_param, lower_best_score = gs.search_by_cv()
        print lower_best_param

        # get meta_feature
        exp.write2csv_meta_feature(
            model = LogisticRegression(),
            meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
            meta_train_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'],
            meta_test_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'],
            meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
            best_param_ = lower_best_param
            )
    """

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']
    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64],
                            ['uniform', 'distance'], ['ball_tree'], [30], ['minkowski'],
                            [2], [4]]
    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))

    clf_lower_model = None
    clf_lower_mname = None

    # grid search for the lower model : linear classifier
    # ExperimentL1_1 is model-free; the data, on the other hand, is fixed
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'
    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'
    else:
        sys.stderr.write("You should input a lower linear model\n")
        sys.exit()

    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                         train_fname = model_train_fname,
                         test_fname = model_test_fname)

    # GridSearch wraps a single model; which model is determined by the params
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                    cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'],
                    cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'],
                    refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print lower_best_param

    # get meta_feature : insert the model name into the meta-feature file names
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
        best_param_ = lower_best_param
        )

    ## best parameters for GBDT and the lower classifier
    #return best_param, best_score
    return upper_best_params, lower_best_param
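# --------------------------------------------------------------------------
# TreeTransform is defined elsewhere in this repo. The transformation it
# presumably performs -- one-hot encoding the leaf each sample lands in, per
# boosted tree, so a linear model can then be trained on the leaf indicators
# -- can be sketched with plain scikit-learn. Illustration only, not the
# repo's actual TreeTransform:

def gbdt_leaf_features_sketch(X_train, y_train, X_test, n_estimators=100):
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import OneHotEncoder

    gbdt = GradientBoostingClassifier(n_estimators=n_estimators)
    gbdt.fit(X_train, y_train)
    # apply() returns the leaf index of every sample in every tree;
    # flatten it to shape (n_samples, n_trees)
    train_leaves = gbdt.apply(X_train).reshape(len(X_train), -1)
    test_leaves = gbdt.apply(X_test).reshape(len(X_test), -1)
    # one indicator column per (tree, leaf) pair, fit on train only
    encoder = OneHotEncoder()
    encoder.fit(train_leaves)
    return encoder.transform(train_leaves), encoder.transform(test_leaves)
# --------------------------------------------------------------------------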
def combine_meta_features(stack_setting_):

    #data_folder = stack_setting_['1-Level']['meta_features']
    #fname = stack_setting_['setting']['name']
    #fname = os.path.join(Config.get_string('data.path'), data_folder, fname)

    train_merge = []
    test_merge = []
    for model_name in stack_setting_['1-Level'].keys():
        try:
            if model_name == 'gbdt_linear':
                # gbdt_linear meta features carry a model-name suffix,
                # so match them with a wildcard listing

                # train
                folder = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['train']
                cmd = "ls %s%s/%s*%s*.%s" % (Config.get_string('data.path'),
                                             folder,
                                             "_".join(".".join(train_fname.split('.')[:-1]).split("_")[:-1]),
                                             ".".join(train_fname.split('.')[:-1]).split("_")[-1],
                                             train_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    train = pd.read_csv(line.rstrip('\n'))
                    col_name = train.columns.values[:-1]
                    X_train = train[col_name]
                    col_name = train.columns.values[-1]
                    y_train = train[col_name]
                    train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['test']
                cmd = "ls %s%s/%s*%s*.%s" % (Config.get_string('data.path'),
                                             folder,
                                             "_".join(".".join(test_fname.split('.')[:-1]).split("_")[:-1]),
                                             ".".join(test_fname.split('.')[:-1]).split("_")[-1],
                                             test_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    test = pd.read_csv(line.rstrip('\n'))
                    col_name = test.columns.values[:-1]
                    X_test = test[col_name]
                    col_name = test.columns.values[-1]
                    y_test = test[col_name]
                    test_merge.append(X_test)

            else:
                # train
                folder = stack_setting_['1-Level'][model_name]['meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['meta_feature']['train']
                train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
                train = pd.read_csv(train_fname)
                col_name = train.columns.values[:-1]
                X_train = train[col_name]
                col_name = train.columns.values[-1]
                y_train = train[col_name]
                train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['meta_feature']['test']
                test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
                test = pd.read_csv(test_fname)
                col_name = test.columns.values[:-1]
                X_test = test[col_name]
                col_name = test.columns.values[-1]
                y_test = test[col_name]
                test_merge.append(X_test)
        except:
            # entries under '1-Level' without a meta_feature section
            # (e.g. 'meta_features' itself) are silently skipped
            pass

    # append the label column once; every meta-feature file is assumed to
    # share the same label column, so the last one read is reused here
    train_merge.append(y_train)
    train_merge = pd.concat(train_merge, ignore_index=False, axis=1)
    #print train_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
    train_merge.to_csv(train_fname, index=False)

    test_merge.append(y_test)
    test_merge = pd.concat(test_merge, ignore_index=False, axis=1)
    #print test_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    test_fname = stack_setting_['1-Level']['meta_features']['test']
    test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
    test_merge.to_csv(test_fname, index=False)

    return True
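# --------------------------------------------------------------------------
# Shelling out to `ls` via Popen (as above) works, but it spawns a
# subprocess per pattern and is fragile with unusual paths. glob performs
# the same wildcard matching in-process; a sketch mirroring the pattern
# built above (helper name is hypothetical):

def list_meta_feature_files_sketch(data_path, folder, fname):
    import glob
    stem = ".".join(fname.split('.')[:-1])   # file name without extension
    prefix = "_".join(stem.split("_")[:-1])  # part before the last "_"
    suffix = stem.split("_")[-1]             # part after the last "_"
    ext = fname.split('.')[-1]
    pattern = "%s%s/%s*%s*.%s" % (data_path, folder, prefix, suffix, ext)
    return sorted(glob.glob(pattern))
# --------------------------------------------------------------------------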
        print(checkpoint_name)
        #print(len(name_list))
        #exit()
        train_from_config(lr, batch_size, num_nodes, dataset_size,
                          teacher_forcing, checkpoint_name,
                          log_dir_num, log_dir_path, train_option, sys.argv)
        log_dir_num += 1
        #print(checkpoint_name_idx)
        if checkpoint_name_idx < len(name_list) - 1:
            checkpoint_name_idx += 1
        else:
            checkpoint_name_idx = 0


if __name__ == "__main__":
    config_path = os.getcwd()
    config = Config(config_path)
    yml_args = config.config_parse_yaml()
    sess_args = flat_dict(yml_args)
    train_many_jobs(sess_args)
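# Note: the index wrap-around above can be collapsed into modulo arithmetic,
# which behaves identically for any non-empty name_list:
#   checkpoint_name_idx = (checkpoint_name_idx + 1) % len(name_list)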
                    default=None, help='Configuration file')
parser.add_argument('-o', '--override', action='store', nargs='*', default=[])
arguments = parser.parse_args()

override_dir = {}
for s in arguments.override:
    s_s = s.split("=")
    k = s_s[0].strip()
    v = "=".join(s_s[1:]).strip()
    override_dir[k] = v
arguments.override = override_dir

cfg = arguments.config_path
cf = Config(cfg, cmd_args=arguments.override)

# if necessary, only use the CPU for debugging
if cf.cpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = cf.cuda

# ## Construct the network
print('Construct the Network\n')
model = build_model(cf)

print('loading data\n')
train_data, val_data, test_data = load_dataset(cf.dataset, cf)

print('setting up the network and creating callbacks\n')
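# --------------------------------------------------------------------------
# Quick illustration of the "key=value" override parsing above. Splitting on
# the first "=" and rejoining the rest means values may themselves contain
# "=". The values here are hypothetical; any list of strings works:

overrides = ["lr=0.01", "schedule=step=30"]
parsed = {}
for s in overrides:
    k, _, v = s.partition("=")  # partition splits on the first "=" only
    parsed[k.strip()] = v.strip()
assert parsed == {'lr': '0.01', 'schedule': 'step=30'}
# --------------------------------------------------------------------------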
def main_transform(stack_setting_):
    """
    [rawdata2filterdata Step]
     1. Reading raw datasets
     2. Log-transforming heavy-tailed numeric columns
     3. One-hot-encoding categorical columns in train and test sets
    """
    raw_train_path = os.path.join(Config.get_string('data.path'),
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'),
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])

    print("= Reading raw datasets ...")
    names = ("age, workclass, fnlwgt, education, education-num, marital-status, "
             "occupation, relationship, race, sex, capital-gain, capital-loss, "
             "hours-per-week, native-country, TARGET").split(', ')

    # train
    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)  #, index_col=0, sep=','
    train_labels = pd.DataFrame((raw_train['TARGET'].values == ' >50K').astype(np.int32),
                                columns=['TARGET'])
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    #print 'train summary'
    #print raw_train[['age', 'hours-per-week', 'fnlwgt']].describe()
    # heavy-tailed counts are mapped to log scale
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_train['%s_%s' % (key, 'log')] = np.log(raw_train[key] + 1.0).astype(np.float32)
        raw_train = raw_train.drop(labels=[key], axis=1)
    raw_train['TARGET'] = train_labels
    del train_labels

    # test
    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)  #, index_col=0, sep=','
    test_labels = pd.DataFrame((raw_test['TARGET'].values == ' >50K').astype(np.int32),
                               columns=['TARGET'])
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    print 'test summary-1'
    print raw_test.describe()
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_test['%s_%s' % (key, 'log')] = np.log(raw_test[key] + 1.0).astype(np.float32)
        raw_test = raw_test.drop(labels=[key], axis=1)
    print '\n'
    print 'test summary-2'
    print raw_test.describe()
    raw_test['TARGET'] = test_labels
    del test_labels

    # main
    print("= Transform Categorical Variables into One-Hot-Encoding ...")
    categoricals = ['workclass', 'education', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'native-country']
    # fit the encoder on train and test combined so both splits
    # end up with an identical dummy-column layout
    combined = pd.concat([raw_train, raw_test])
    #print combined.head()
    #combined_categoricals = combined[categoricals]
    clf = onehot_encode()
    clf.fit(combined, categoricals)

    # dump train (TARGET is re-appended so it stays the last column)
    raw_train = clf.transform(raw_train, categoricals)
    train_label = raw_train.TARGET.values
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    raw_train['TARGET'] = pd.DataFrame(train_label)
    del train_label
    train_path = os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')

    # dump test
    raw_test = clf.transform(raw_test, categoricals)
    test_label = raw_test.TARGET.values
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    raw_test['TARGET'] = pd.DataFrame(test_label)
    del test_label
    test_path = os.path.join(Config.get_string('data.path'),
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
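# --------------------------------------------------------------------------
# onehot_encode is defined elsewhere in this repo. Fitting it on the
# concatenated train+test frame (as above) keeps the dummy-column layout
# identical across both splits; the same idea in plain pandas, as an
# illustration only (function name is hypothetical):

def onehot_fit_transform_sketch(train_df, test_df, categoricals):
    import pandas as pd
    combined = pd.concat([train_df, test_df], keys=['train', 'test'])
    combined = pd.get_dummies(combined, columns=categoricals)
    # the keys added above mark which rows belong to which split
    return combined.loc['train'], combined.loc['test']
# --------------------------------------------------------------------------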
def get_ridge_plot(best_param_, experiment_,
                   param_keys_, param_vals_,
                   png_folder,
                   png_fname,
                   score_threshold=0.8):

    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']

    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    clf.set_params(**best_param_)
    clf.fit(X_train, y_train)
    best_alpha = best_param_['alpha']
    result = {'alphas':[],
              'coefs':np.zeros( (len(parameters['alpha']), len(X_train.columns.values) + 1) ),
              'scores':[],
              'score':None}

    for i, alpha in enumerate(parameters.get('alpha', None)):
        result['alphas'].append(alpha)
        del best_param_['alpha']
        best_param_['alpha'] = alpha
        clf.set_params(**best_param_)
        clf.fit(X_train, y_train)

        # regularization path : intercept (if any) followed by the coefficients
        tmp = np.array([0 for j in xrange(len(X_train.columns.values) + 1)], dtype=np.float32)
        if best_param_['fit_intercept']:
            tmp = np.append(clf.intercept_, clf.coef_)
        else:
            tmp[1:] = clf.coef_
        result['coefs'][i,:] = tmp
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train

    # 2. feature names
    tmp_len = len(experiment_.get_data_col_name())
    index2feature = dict(zip(np.arange(1, tmp_len + 1),
                             experiment_.get_data_col_name()))
    if best_param_['fit_intercept']:
        index2feature[0] = 'intercept'

    # 3. plot
    gs = GridSpec(2,2)
    ax1 = plt.subplot(gs[:,0])
    ax2 = plt.subplot(gs[0,1])
    ax3 = plt.subplot(gs[1,1])

    # 3.1 regularization path
    labels = np.append(np.array(['intercept'], dtype='S100'), experiment_.get_data_col_name())
    nrows, ncols = result['coefs'].shape
    for ncol in xrange(ncols):
        ax1.plot(np.array(result['alphas']), result['coefs'][:,ncol], label = labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    # note: clf still carries the last alpha from the loop above,
    # not necessarily best_alpha
    X_test, y_test = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 3.3 CDF
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, normed=True)
    except:
        counts, bin_edges = np.histogram(result['score'], normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
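# Hypothetical call (experiment and file names are placeholders): trace the
# regularization path over the alpha grid and plot the decision-function
# PDF/CDF for the test split.
#
#   exp = ExperimentL1(data_folder='...', train_fname='...', test_fname='...')
#   get_ridge_plot(best_param_ = {'alpha': 1.0, 'fit_intercept': True},
#                  experiment_ = exp,
#                  param_keys_ = ['model_type', 'alpha', 'fit_intercept'],
#                  param_vals_ = [[RidgeClassifier], [0.01, 0.1, 1.0, 10.0], [True]],
#                  png_folder = 'graph',
#                  png_fname = 'ridge_path.png')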
def get_xgb_feature_importance_plot(best_param_, experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1. train with the best parameters
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)

    # booster() is the older xgboost sklearn-wrapper accessor;
    # newer releases expose get_booster() instead
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({'name':index2feature.keys(),
                        'score':index2feature.values()})
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        # keep only the top quantile of positively scored features
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)  #, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
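# Newer xgboost releases renamed the Booster accessor: clf.booster() (used
# above) became clf.get_booster(). A small compatibility shim that works
# under either API (function name is hypothetical):

def get_fscore_compat(clf):
    booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
    return booster.get_fscore()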
def get_rf_feature_importance_plot(best_param_, experiment_,
                                   png_folder,
                                   png_fname,
                                   score_threshold=0.8):

    # 1. oob_decision_function_ is only available with out-of-bag scoring
    best_param_['oob_score'] = True

    # 2. train with the best parameters
    train_X, train_y = experiment_.get_train_data()
    clf = RandomForestClassifier()
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)

    index2feature = dict(zip(np.arange(len(train_X.columns.values)),
                             train_X.columns.values))
    feature_importances_index = [str(j) for j in clf.feature_importances_.argsort()[::-1]]
    feature_importances_score = [clf.feature_importances_[int(j)] for j in feature_importances_index]
    fis = pd.DataFrame({'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
                        'score':feature_importances_score})
    if len(fis.index) > 20:
        # keep only the top quantile of positively scored features
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 3. plot
    gs = GridSpec(2,2)
    ax1 = plt.subplot(gs[:,0])
    ax2 = plt.subplot(gs[0,1])
    ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                ax=ax1,
                color="blue")
    #ax1.set_title("Feature_Importance", fontsize=10)
    ax1.set_ylabel("Feature", fontsize=10)
    ax1.set_xlabel("Feature_Importance", fontsize=10)

    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    except:
        counts, bin_edges = np.histogram(confidence_score, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
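# Hypothetical call (experiment and file names are placeholders). Note that
# oob_decision_function_ only exists after fitting with oob_score=True --
# which the function forces above -- and bootstrap=True (sklearn's default).
#
#   exp = ExperimentL1(data_folder='...', train_fname='...', test_fname='...')
#   get_rf_feature_importance_plot(best_param_ = {'n_estimators': 100, 'random_state': 0},
#                                  experiment_ = exp,
#                                  png_folder = 'graph',
#                                  png_fname = 'rf_feature_importance.png')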
override_dir = {}
#arguments.override =
#for s in arguments.override:
#    s_s = s.split("=")
#    k = s_s[0].strip()
#    v = "=".join(s_s[1:]).strip()
#    override_dir[k] = v
#arguments.override = override_dir

# hard-coded overrides in place of command-line ones
override_dir['lr'] = 0.01
override_dir['wbits'] = 4
override_dir['abits'] = 4
override_dir['network_type'] = 'full-qnn'

# config_path
cfg = "config_CIFAR-10"
cf = Config(cfg, cmd_args=override_dir)

# if necessary, only use the CPU for debugging
#if cf.cpu:
#    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# ## Construct the network
print('Construct the Network\n')
model = build_model(cf)

print('setting up the network and creating callbacks\n')
early_stop = EarlyStopping(monitor='loss', min_delta=0.001,