from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

def grid_search(tuned_parameters, data, train_size, seed):
    '''
    Parameter optimization.

    :param tuned_parameters: dict of parameters to tune
    :param data: the data set
    :param train_size: size of the training set
    :param seed: seed for the random number generator
    :return:
    '''
    print("----- Begin run grid_search at %s -------" % current_time())
    X = data[:, :-1]
    y = data[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, stratify=data[:, -1], random_state=seed)
    clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters,
                       cv=10, scoring="roc_auc")
    clf.fit(X_train, y_train)
    print("Best parameters set found:", clf.best_params_)
    print("Grid scores:")
    # cv_results_ holds the per-candidate scores as parallel arrays
    # (grid_scores_ was removed in scikit-learn 0.20).
    results = clf.cv_results_
    for params, mean_score, std_score in zip(results["params"],
                                             results["mean_test_score"],
                                             results["std_test_score"]):
        print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, std_score * 2, params))
    print("Optimized Score:", clf.score(X_test, y_test))
    print("Detailed classification report:")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("----- End run grid_search at %s -------" % current_time())
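A minimal usage sketch on synthetic data, assuming the imports above are in scope; the grid values below are illustrative only, not the grid actually tuned on the competition data:

import numpy as np
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
demo_data = np.hstack((X_demo, y_demo.reshape(-1, 1)))  # last column is the 0/1 label
demo_grid = {'n_estimators': [50, 100], 'max_depth': [2, 3]}
grid_search(demo_grid, demo_data, train_size=0.7, seed=0)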
def _save_data(self): print("----- Begin run save_data at %s -------" % current_time()) with open(self.fname, 'wb') as file: #保存训练集、测试集、编码器、归一化器 pickle.dump([ self.train_datas, self.test_datas, self.encoders, self.scalers ], file) print("----- End run save_data at %s -------" % current_time())
def _scaled(self):
    '''
    Feature scaling, using MaxAbsScaler.

    :return:
    '''
    print("----- Begin run scaled at %s -------" % current_time())
    train_scales = {}
    test_scales = {}
    self.scalers = {}
    for _type in self.types:
        if _type == 'type 1':
            train_last_index = 5  # last 5 columns: group_1/date_act/date_people/char_38/outcome
            test_last_index = 4   # last 4 columns: group_1/date_act/date_people/char_38
        else:
            train_last_index = 6  # last 6 columns: group_1/char_10_act/date_act/date_people/char_38/outcome
            test_last_index = 5   # last 5 columns: group_1/char_10_act/date_act/date_people/char_38
        scaler = MaxAbsScaler()
        train_array = self.train_datas[_type].toarray()
        train_front = train_array[:, :-train_last_index]
        train_mid = scaler.fit_transform(train_array[:, -train_last_index:-1])  # outcome is not scaled
        train_end = train_array[:, -1].reshape((-1, 1))  # outcome
        train_scales[_type] = np.hstack((train_front, train_mid, train_end))

        test_array = self.test_datas[_type].toarray()
        test_front = test_array[:, :-test_last_index]
        test_end = scaler.transform(test_array[:, -test_last_index:])
        test_scales[_type] = np.hstack((test_front, test_end))
        self.scalers[_type] = scaler
    self.train_datas = train_scales
    self.test_datas = test_scales
    print("----- End run scaled at %s -------" % current_time())
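A small standalone sketch of what MaxAbsScaler does (pure illustration, not part of the pipeline): each column is divided by its maximum absolute value, so results fall in [-1, 1] and zeros stay zero, which is why it suits the sparse matrices produced by one-hot encoding:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

demo = np.array([[1.0, -2.0], [2.0, 4.0], [4.0, 0.0]])
print(MaxAbsScaler().fit_transform(demo))
# [[ 0.25 -0.5 ]
#  [ 0.5   1.  ]
#  [ 1.    0.  ]]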
def _load_data(self):
    print('----- Begin run load_data at %s -----' % current_time())
    with open(self.fname, 'rb') as file:
        self.train_datas, self.test_datas = pickle.load(file)
    print('----- End run load_data at %s -----' % current_time())
def _save_data(self):
    print('----- Begin run save_data at %s -----' % current_time())
    with open(self.fname, 'wb') as file:
        pickle.dump([self.train_datas, self.test_datas], file)
    print('----- End run save_data at %s -----' % current_time())
def _curve(self):
    print('----- Begin run learning_curve (%s) at %s -----' % (self.curve_name, current_time()))
    abs_trains_sizes, train_scores, test_scores = learning_curve(
        self.estimator, self.X, self.y, cv=3, scoring='roc_auc',
        train_sizes=self.train_sizes)
    print('----- End run learning_curve (%s) at %s -----' % (self.curve_name, current_time()))
    # Mean and standard deviation across the 3 cross-validation folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    return abs_trains_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
def _merge_data(self):
    '''
    Merge the people data with the activity data.

    :return:
    '''
    print('----- Begin run merge_data at %s -----' % current_time())
    self.train_data = pd.merge(self.act_train, self.people, how='left',
                               left_index=True, right_index=True,
                               suffixes=('_act', '_people'))
    self.test_data = pd.merge(self.act_test, self.people, how='left',
                              left_index=True, right_index=True,
                              suffixes=('_act', '_people'))
    print('----- End run merge_data at %s -----' % current_time())
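A toy sketch of this index-aligned left merge (the frames are made up): both sides are indexed by people_id, and column names shared by the two tables pick up the _act/_people suffixes:

import pandas as pd

people = pd.DataFrame({'date': ['2021-01-01'], 'char_1': ['type 2']},
                      index=pd.Index(['ppl_1'], name='people_id'))
acts = pd.DataFrame({'date': ['2021-06-01'], 'char_1': ['type 5']},
                    index=pd.Index(['ppl_1'], name='people_id'))
merged = pd.merge(acts, people, how='left', left_index=True,
                  right_index=True, suffixes=('_act', '_people'))
print(merged.columns.tolist())
# ['date_act', 'char_1_act', 'date_people', 'char_1_people']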
def _typecast_data(self):
    '''
    Type conversion: cast every column to floating point.

    :return:
    '''
    print('----- Begin run typecast_data at %s -----' % current_time())
    str_col_list = ['group_1'] + ['char_%d_act' % i for i in range(1, 11)] \
        + ['char_%d_people' % i for i in range(1, 10)]
    bool_col_list = ['char_10_people'] + ['char_%d' % i for i in range(11, 18)]
    for _type in self.types:
        for data_set in [self.train_datas, self.test_datas]:
            # Convert the date columns to days since the Unix epoch.
            data_set[_type].date_act = (data_set[_type].date_act
                - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')
            data_set[_type].date_people = (data_set[_type].date_people
                - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')
            # 'group 123' -> 123.0
            data_set[_type].group_1 = data_set[_type].group_1.str \
                .replace('group', '').str.strip().astype(np.float64)
            # Boolean columns cast directly to 0.0/1.0.
            for col in bool_col_list:
                if col in data_set[_type]:
                    data_set[_type][col] = data_set[_type][col].astype(np.float64)
            # 'type 12' -> 12.0 (group_1 was already handled above).
            for col in str_col_list[1:]:
                if col in data_set[_type]:
                    data_set[_type][col] = data_set[_type][col].str \
                        .replace('type', '').str.strip().astype(np.float64)
            data_set[_type] = data_set[_type].astype(np.float64)
    print('----- End run typecast_data at %s -----' % current_time())

def _is_ready(self):
    return os.path.exists(self.fname)

def _save_data(self):
    print('----- Begin run save_data at %s -----' % current_time())
    with open(self.fname, 'wb') as file:
        pickle.dump([self.train_datas, self.test_datas], file)
    print('----- End run save_data at %s -----' % current_time())

def _load_data(self):
    print('----- Begin run load_data at %s -----' % current_time())
    with open(self.fname, 'rb') as file:
        self.train_datas, self.test_datas = pickle.load(file)
    print('----- End run load_data at %s -----' % current_time())
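A standalone sketch of the epoch-days conversion used above (the dates are made up): subtracting datetime64('1970-01-01') yields timedeltas, and dividing by one day gives floats:

import numpy as np
import pandas as pd

dates = pd.Series(pd.to_datetime(['1970-01-02', '2022-08-01']))
days = (dates - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')
print(days.tolist())  # [1.0, 19205.0]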
def _scaled(self):
    '''
    Feature scaling, using MaxAbsScaler.

    :return:
    '''
    print('----- Begin run scaled at %s -----' % current_time())
    train_scales = {}
    test_scales = {}
    self.scalers = {}
    for _type in self.types:
        if _type == 'type 1':
            train_last_index = 5
            test_last_index = 4
        else:
            train_last_index = 6
            test_last_index = 5
        scaler = MaxAbsScaler()
        train_array = self.train_datas[_type].toarray()
        train_front = train_array[:, :-train_last_index]
        train_middle = scaler.fit_transform(train_array[:, -train_last_index:-1])
        train_end = train_array[:, -1].reshape((-1, 1))
        train_scales[_type] = np.hstack((train_front, train_middle, train_end))

        test_array = self.test_datas[_type].toarray()
        test_front = test_array[:, :-test_last_index]
        # Reuse the scaler fitted on the training data; never refit on the test set.
        test_end = scaler.transform(test_array[:, -test_last_index:])
        test_scales[_type] = np.hstack((test_front, test_end))
        self.scalers[_type] = scaler
    self.train_datas = train_scales
    self.test_datas = test_scales
    print('----- End run scaled at %s -----' % current_time())
def _load_csv(self):
    '''
    Load the CSV files.

    :return:
    '''
    print('----- Begin run load_csv at %s -----' % current_time())
    self.people = pd.read_csv(self.p_fname, sep=',', header=0,
                              keep_default_na=True, parse_dates=['date'])
    self.act_train = pd.read_csv(self.train_fname, sep=',', header=0,
                                 keep_default_na=True, parse_dates=['date'])
    self.act_test = pd.read_csv(self.test_fname, sep=',', header=0,
                                keep_default_na=True, parse_dates=['date'])
    self.people.set_index(keys=['people_id'], drop=True, append=False, inplace=True)
    self.act_train.set_index(keys=['people_id'], drop=True, append=False, inplace=True)
    self.act_test.set_index(keys=['people_id'], drop=True, append=False, inplace=True)
    print('----- End run load_csv at %s -----' % current_time())
def _curve(self): print("----- Begin run validation_curve(%s) at %s -------" % (self.curve_name, current_time())) train_scores, test_scores = validation_curve(self.estimator, self.X, self.y, param_name=self.p_name, param_range=self.p_range, cv=3, scoring="roc_auc", n_jobs=-1, verbose=1) print("----- End run validation_curve(%s) at %s -------" % (self.curve_name, current_time())) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) return [ item for item in self.p_range ], train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
def _curve(self):
    print("----- Begin run learning_curve(%s) at %s -------" % (self.curve_name, current_time()))
    # Compute the learning curve.
    abs_trains_sizes, train_scores, test_scores = learning_curve(
        self.estimator, self.X, self.y, cv=3, scoring="roc_auc",
        train_sizes=self.train_sizes, n_jobs=-1, verbose=1)
    print("----- End run learning_curve(%s) at %s -------" % (self.curve_name, current_time()))
    # For each train size, take the mean and standard deviation of the
    # prediction scores across the 3 cross-validation folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    return abs_trains_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
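The five return values are shaped for the usual banded learning-curve plot. A hedged sketch (matplotlib assumed available; `curve` is a hypothetical instance of the class this method belongs to):

import matplotlib.pyplot as plt

sizes, tr_mean, tr_std, te_mean, te_std = curve._curve()
fig, ax = plt.subplots()
ax.plot(sizes, tr_mean, label='train AUC')
ax.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std, alpha=0.2)
ax.plot(sizes, te_mean, label='cv AUC')
ax.fill_between(sizes, te_mean - te_std, te_mean + te_std, alpha=0.2)
ax.set_xlabel('training set size')
ax.set_ylabel('roc_auc')
ax.legend()
plt.show()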
def _onehot_encode(self):
    '''
    One-hot encoding.

    :return:
    '''
    print('----- Begin run onehot_encode at %s -----' % current_time())
    train_results = {}
    test_results = {}
    self.encoders = {}
    for _type in self.types:
        if _type == 'type 1':
            one_hot_cols = ['char_%d_act' % i for i in range(1, 10)] \
                + ['char_%d_people' % i for i in range(1, 10)]
            train_end_cols = ['group_1', 'date_act', 'date_people', 'char_38', 'outcome']
            test_end_cols = ['group_1', 'date_act', 'date_people', 'char_38']
        else:
            one_hot_cols = ['char_%d_people' % i for i in range(1, 10)]
            train_end_cols = ['group_1', 'char_10_act', 'date_act', 'date_people', 'char_38', 'outcome']
            test_end_cols = ['group_1', 'char_10_act', 'date_act', 'date_people', 'char_38']
        train_front_array = self.train_datas[_type][one_hot_cols].values
        train_end_array = self.train_datas[_type][train_end_cols].values
        train_middle_array = self.train_datas[_type].drop(
            train_end_cols + one_hot_cols, axis=1, inplace=False).values
        test_front_array = self.test_datas[_type][one_hot_cols].values
        test_end_array = self.test_datas[_type][test_end_cols].values
        test_middle_array = self.test_datas[_type].drop(
            test_end_cols + one_hot_cols, axis=1, inplace=False).values
        encoder = OneHotEncoder()  # encodes every column passed in; sparse output by default
        train_result = hstack([encoder.fit_transform(train_front_array),
                               csr_matrix(train_middle_array),
                               csr_matrix(train_end_array)])
        # Reuse the encoder fitted on the training data for the test data.
        test_result = hstack([encoder.transform(test_front_array),
                              csr_matrix(test_middle_array),
                              csr_matrix(test_end_array)])
        train_results[_type] = train_result
        test_results[_type] = test_result
        self.encoders[_type] = encoder
    self.train_datas = train_results
    self.test_datas = test_results
    print('----- End run onehot_encode at %s -----' % current_time())
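A toy sketch of the fit-on-train / transform-on-test pattern with sparse concatenation (the arrays are made up): the encoder learns its categories from the training block only, and the untouched numeric blocks are wrapped in csr_matrix so hstack stays sparse end to end:

import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import OneHotEncoder

train_cat = np.array([[0.], [1.], [2.]])   # categorical codes seen in training
test_cat = np.array([[2.], [0.]])
train_rest = np.array([[10.], [20.], [30.]])
test_rest = np.array([[40.], [50.]])

enc = OneHotEncoder()
train_sparse = hstack([enc.fit_transform(train_cat), csr_matrix(train_rest)])
test_sparse = hstack([enc.transform(test_cat), csr_matrix(test_rest)])
print(train_sparse.shape, test_sparse.shape)  # (3, 4) (2, 4)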
def _split_data(self):
    '''
    Split the data into type 1 - type 7.

    :return:
    '''
    print('----- Begin run split_data at %s -----' % current_time())
    self.train_datas = {}
    self.test_datas = {}
    for _type in self.types:
        # Split by activity category, dropping all-NaN rows and columns.
        self.train_datas[_type] = self.train_data[
            self.train_data.activity_category == _type] \
            .dropna(axis=0, how='all').dropna(axis=1, how='all')
        self.test_datas[_type] = self.test_data[
            self.test_data.activity_category == _type] \
            .dropna(axis=0, how='all').dropna(axis=1, how='all')
        # Drop the activity_category column.
        self.train_datas[_type].drop('activity_category', axis=1, inplace=True)
        self.test_datas[_type].drop('activity_category', axis=1, inplace=True)
        # Make activity_id part of the index.
        self.train_datas[_type].set_index(keys=['activity_id'], drop=True,
                                          append=True, inplace=True)
        self.test_datas[_type].set_index(keys=['activity_id'], drop=True,
                                         append=True, inplace=True)
    print('----- End run split_data at %s -----' % current_time())
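A toy illustration of the per-category split (the frame is made up): columns that only apply to other activity types become all-NaN inside a given slice, and dropna(axis=1, how='all') removes them, which is why each type ends up with its own column set:

import numpy as np
import pandas as pd

df = pd.DataFrame({'activity_category': ['type 1', 'type 2', 'type 1'],
                   'char_10_act': ['type 5', np.nan, 'type 7'],
                   'char_1_act': [np.nan, 'type 3', np.nan]})
sub = df[df.activity_category == 'type 1'] \
    .dropna(axis=0, how='all').dropna(axis=1, how='all')
print(sub.columns.tolist())
# ['activity_category', 'char_10_act']  -- char_1_act was all NaN here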
def _load_data(self):
    print("----- Begin run _load_data at %s -------" % current_time())
    with open(self.fname, 'rb') as file:
        # Restore the training sets, test sets, encoders, and scalers.
        self.train_datas, self.test_datas, self.encoders, self.scalers = pickle.load(file)
    print("----- End run _load_data at %s -------" % current_time())