def split_continuum_value_data (data) :
    """
    split each continuous feature into SPLITCONTINUUM intervals of equal width,
    then convert the resulting categorical variable into binary (one-hot) variables
    @params:
        data: original data (ndarray)
    @return:
        the corresponding data after splitting (ndarray)
    """
    logging.info ('begin split_continuum_value_data')
    print data.shape
    if os.path.exists (ROOT + '/data/split_' + str (SPLITCONTINUUM)) :
        logging.info (ROOT + '/data/split_' + str (SPLITCONTINUUM) + ' exists!')
        return io.grab (ROOT + '/data/split_' + str (SPLITCONTINUUM))
    else :
        data = pd.DataFrame (data)
        feature_list = data.columns
        for feature in feature_list :
            # split_value () reads these module-level bounds to pick the interval index
            global min_val, max_val
            min_val = min (data[feature].values)
            max_val = max (data[feature].values)
            data[feature] = data[feature].map (lambda x : split_value (x))
            data = convert.binary_feature (data, feature)
            data.drop (feature, axis = 1, inplace = True)
        io.store (data.values[:, 1:], ROOT + '/data/split_' + str (SPLITCONTINUUM))
        return data.values[:, 1:]
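# --- Illustrative sketch (not part of the project) ---------------------------
# A minimal, self-contained example of the same idea as split_continuum_value_data:
# cut a continuous column into equal-width bins and one-hot encode the resulting
# categories. It uses plain pandas (pd.cut / pd.get_dummies) instead of the
# project's split_value () and convert.binary_feature () helpers, whose exact
# behaviour is not shown here; the bin count of 5 is an arbitrary example value.
import numpy as np
import pandas as pd

def equal_width_onehot (values, n_bins = 5) :
    """bin a 1-D array into n_bins equal-width intervals and one-hot encode them"""
    bins = pd.cut (pd.Series (values), bins = n_bins, labels = False)   # interval index 0 .. n_bins - 1
    return pd.get_dummies (bins, prefix = 'bin')                        # one binary column per interval

# usage:
#   demo = np.random.RandomState (0).rand (10) * 100.0
#   print (equal_width_onehot (demo, n_bins = 5))
# -----------------------------------------------------------------------------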
def clean_data (usePCA = False) :
    """
    read the raw data, clean it and cache the result on disk;
    optionally apply PCA for dimensionality reduction
    """
    logging.info ('begin to clean the data')
    if os.path.exists (ROOT + '/data/cleandata.csv') :
        # we do not need to clean the data every time;
        # if you want to re-clean the data, delete the '../data/cleandata.csv' file
        logging.info ('the clean data already exists')
        data = pd.read_csv (ROOT + '/data/cleandata.csv')
        train_number, val_number, test_number, unlabel_number, label, uid = io.grab (ROOT + '/data/datadescribe')
    else :
        data, train_number, val_number, test_number, unlabel_number, label, uid = read.read_data ()
        data = feature_handler (data)
        # store the result
        data.to_csv (ROOT + '/data/cleandata.csv')
        io.store ([train_number, val_number, test_number, unlabel_number, label, uid], ROOT + '/data/datadescribe')
    logging.info ('finished cleaning the data')
    if usePCA :
        # dimensionality reduction
        if not os.path.exists (ROOT + '/data/datapca') :
            # we do not need to rerun this step;
            # if you change the parameters and want to relearn it, delete the '../data/datapca' file
            data_values = decomposition.pca_solver (data)
            io.store (data_values, ROOT + '/data/datapca')
        data_values = io.grab (ROOT + '/data/datapca')
    else :
        data_values = data.values[:, 1:]
    return data_values, train_number, val_number, test_number, unlabel_number, label, uid
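# --- Illustrative sketch (not part of the project) ---------------------------
# clean_data () follows a "compute once, cache on disk" pattern: if the cached file
# exists it is loaded, otherwise the result is computed and written out. The sketch
# below shows that pattern with plain pickle; it assumes (but does not know) that
# the project's io.store / io.grab are similar thin serialization wrappers.
import os
import pickle

def cached (path, compute) :
    """return the cached object at `path`, computing and storing it on a miss"""
    if os.path.exists (path) :
        with open (path, 'rb') as f :
            return pickle.load (f)
    result = compute ()
    with open (path, 'wb') as f :
        pickle.dump (result, f)
    return result

# usage: stats = cached ('/tmp/stats.pkl', lambda : expensive_computation ())
# -----------------------------------------------------------------------------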
def gbdt_feature_importance(train, label):
    """
    fit a GBDT on the training data and return the per-feature importances,
    scaled so that the most important feature has a score of 100;
    the result is cached under ROOT + "/data/feature_importance"
    """
    if os.path.exists(ROOT + "/data/feature_importance"):
        logging.info("feature_importance exists!")
        feature_importance = io.grab(ROOT + "/data/feature_importance")
    else:
        logging.info("feature_importance start!")
        gb = GradientBoostingClassifier(
            n_estimators=500, learning_rate=0.05, max_depth=3, random_state=1000000007
        ).fit(train, label)
        feature_importance = gb.feature_importances_
        # normalize the importances so the largest one equals 100
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        io.store(feature_importance, ROOT + "/data/feature_importance")
    return feature_importance
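# --- Illustrative sketch (not part of the project) ---------------------------
# A self-contained example of what gbdt_feature_importance() computes: fit a
# GradientBoostingClassifier on synthetic data and rescale feature_importances_
# so the strongest feature scores 100. The hyper-parameters here are small demo
# values, not the project's (500 trees, learning_rate=0.05, max_depth=3).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

def demo_feature_importance():
    X, y = make_classification(n_samples=300, n_features=10, n_informative=3, random_state=0)
    gb = GradientBoostingClassifier(
        n_estimators=50, learning_rate=0.1, max_depth=3, random_state=0
    ).fit(X, y)
    importance = gb.feature_importances_
    importance = 100.0 * importance / importance.max()   # scale so the maximum is 100
    return importance

# usage: print(demo_feature_importance())
# -----------------------------------------------------------------------------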
def gbdt_dimreduce_threshold(
    train_data, train_label, validation, test, unlabel, feature_threshold=GBDTFEATURETHRESHOLD
):
    """
    keep only the features whose GBDT importance exceeds feature_threshold,
    ordered by decreasing importance, and apply the same column selection to
    the validation, test and unlabeled sets
    """
    logging.info("begin gbdt_dimreduce_threshold")
    if os.path.exists(ROOT + "/data/gbdt_threshold_" + str(GBDTFEATURETHRESHOLD)):
        logging.info(ROOT + "/data/gbdt_threshold_" + str(GBDTFEATURETHRESHOLD) + " exists!")
        important_index, sorted_index = io.grab(ROOT + "/data/gbdt_threshold_" + str(GBDTFEATURETHRESHOLD))
    else:
        feature_importance = gbdt_feature_importance(train_data, train_label)
        important_index = np.where(feature_importance > feature_threshold)[0]
        sorted_index = np.argsort(feature_importance[important_index])[::-1]
        io.store([important_index, sorted_index], ROOT + "/data/gbdt_threshold_" + str(GBDTFEATURETHRESHOLD))
    new_train_data = train_data[:, important_index][:, sorted_index]
    new_val = validation[:, important_index][:, sorted_index]
    new_test = test[:, important_index][:, sorted_index]
    new_unlabel = unlabel[:, important_index][:, sorted_index]
    return new_train_data, new_val, new_test, new_unlabel
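# --- Illustrative sketch (not part of the project) ---------------------------
# The column selection performed by gbdt_dimreduce_threshold(), on toy numbers:
# keep the columns whose importance exceeds a threshold, then reorder the kept
# columns by decreasing importance. The threshold value 10.0 is just an example.
import numpy as np

def demo_threshold_select():
    importance = np.array([3.0, 80.0, 15.0, 100.0, 1.0])   # toy importances (max == 100)
    X = np.arange(20).reshape(4, 5)                        # 4 samples, 5 features
    important_index = np.where(importance > 10.0)[0]       # kept columns: 1, 2, 3
    sorted_index = np.argsort(importance[important_index])[::-1]   # [2, 0, 1] within the kept set
    # final column order corresponds to original features 3, 1, 2 (importances 100, 80, 15)
    return X[:, important_index][:, sorted_index]

# usage: print(demo_threshold_select())
# -----------------------------------------------------------------------------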
def mix_pca_gbdt(train_data, train_label, validation, test, unlabel):
    """
    hybrid dimensionality reduction: keep the GBDT-important features as they are,
    compress the remaining (low-importance) features with PCA, and concatenate the
    two parts for the train, validation, test and unlabeled sets
    """
    if os.path.exists(ROOT + "/data/mix_pca_gbdt"):
        logging.info(ROOT + "/data/mix_pca_gbdt exists!")
        new_train_data, new_val, new_test, new_unlabel = io.grab(ROOT + "/data/mix_pca_gbdt")
    else:
        logging.info("before mix_pca_gbdt dim-reducing : (%d %d)" % train_data.shape)
        feature_importance = gbdt_feature_importance(train_data, train_label)
        important_index = np.where(feature_importance > GBDTFEATURETHRESHOLD)[0]
        sorted_index = np.argsort(feature_importance[important_index])[::-1]
        other_index = np.where(feature_importance <= GBDTFEATURETHRESHOLD)[0]
        # run PCA on the low-importance columns of all four sets stacked together
        pca_data = np.vstack(
            (train_data[:, other_index], validation[:, other_index], test[:, other_index], unlabel[:, other_index])
        )
        pca_data = pca_solver(pca_data)
        # concatenate the important raw features with the PCA components, row block by row block
        new_train_data = np.hstack(
            (train_data[:, important_index][:, sorted_index], pca_data[: train_data.shape[0], :])
        )
        new_val = np.hstack(
            (
                validation[:, important_index][:, sorted_index],
                pca_data[train_data.shape[0] : train_data.shape[0] + validation.shape[0], :],
            )
        )
        new_test = np.hstack(
            (
                test[:, important_index][:, sorted_index],
                pca_data[train_data.shape[0] + validation.shape[0] : -unlabel.shape[0], :],
            )
        )
        new_unlabel = np.hstack((unlabel[:, important_index][:, sorted_index], pca_data[-unlabel.shape[0] :, :]))
        logging.info("after mix_pca_gbdt dim-reducing : (%d %d)" % new_train_data.shape)
        io.store([new_train_data, new_val, new_test, new_unlabel], ROOT + "/data/mix_pca_gbdt")
    return new_train_data, new_val, new_test, new_unlabel
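# --- Illustrative sketch (not part of the project) ---------------------------
# The idea behind mix_pca_gbdt() on toy data: keep high-importance columns as-is,
# compress the remaining columns with PCA, and hstack the two parts back together.
# sklearn's PCA is used here as a stand-in for the project's pca_solver(), whose
# exact configuration is not shown; the threshold and component count are examples.
import numpy as np
from sklearn.decomposition import PCA

def demo_mix_pca(X, importance, threshold=10.0, n_components=2):
    keep = np.where(importance > threshold)[0]
    rest = np.where(importance <= threshold)[0]
    order = np.argsort(importance[keep])[::-1]              # most important kept column first
    compressed = PCA(n_components=n_components).fit_transform(X[:, rest])
    return np.hstack((X[:, keep][:, order], compressed))

# usage:
#   X = np.random.RandomState(0).rand(100, 8)
#   importance = np.array([90.0, 2.0, 5.0, 40.0, 1.0, 3.0, 60.0, 4.0])
#   print(demo_mix_pca(X, importance).shape)   # (100, 5): 3 kept columns + 2 PCA components
# -----------------------------------------------------------------------------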
import sys
sys.path.insert(0, '../..')

import pandas as pd

import utils.io as io
import feature.splitvalue as split
import model.evaluate as evaluate
# NOTE: this script also relies on the project's datahandler and decomposition
# modules; their import lines are not shown in this excerpt.

if __name__ == '__main__' :
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data ()
    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]
    val_label = pd.read_csv ('../../data/val_cv_y.csv').y.values

    io.store ([train, label, validation, val_label, test, unlabel], '../../data/data_standard')
    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold (train, label, validation, test, unlabel)
    io.store ([train, label, validation, val_label, test, unlabel], '../../data/data_standard_decompose')
    # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel)

    train_data, train_label, validation_data, validation_label, test, unlabel = io.grab ('../../data/data_standard')
    print 'training set:', train_data.shape
    print 'validation set:', validation_data.shape
    print 'testing set:', test.shape
    print 'unlabel set:', unlabel.shape
    assert train_data.shape[0] == len (train_label)
    assert validation_data.shape[0] == len (validation_label)