inputs = (features, labels)
# Convert the inputs to a Dataset.
dataset = tf.data.Dataset.from_tensor_slices(inputs)
# Batch the examples.
assert batch_size is not None, "batch_size must not be None"
dataset = dataset.batch(batch_size)
# Return the read end of the pipeline.
return dataset.make_one_shot_iterator().get_next()


newuser_dataset = pd.read_excel('/Users/andpay/Documents/job/model/newuser_marketing_credithands/newuser_marketing_dataset_v4_1.xlsx')
var_list = list(newuser_dataset.columns)
model_var_list = remove_list(var_list, ['partyid', 'cate'])
category_var = ['sex', 'city-id', 'channel', 'brandcode']
continue_var = remove_list(model_var_list, category_var)

newuser_dataset = disper_split(newuser_dataset, category_var)
newuser_dataset[continue_var] = newuser_dataset[continue_var].fillna(-1)
newuser_dataset = newuser_dataset[model_var_list + ['cate']].apply(pd.to_numeric)

# Standardize the features; this can shorten training time
newuser_dataset[model_var_list] = preprocessing.scale(newuser_dataset[model_var_list])

# Fix the train and test sets with a constant random seed
traindata, testdata = train_test_split(newuser_dataset, test_size=0.25, random_state=1)
x_train, y_train = traindata[model_var_list], traindata['cate']
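# NOTE: remove_list and disper_split are project helpers whose definitions
# are not shown in this snippet. The sketch below is one plausible reading,
# assuming remove_list is an order-preserving list difference and
# disper_split one-hot encodes the categorical columns; the real
# implementations may differ.
import pandas as pd

def remove_list(source, to_remove):
    # Keep the items of `source` that are not slated for removal.
    return [item for item in source if item not in to_remove]

def disper_split(df, category_vars):
    # One-hot encode the categorical columns so the whole frame is numeric.
    dummies = pd.get_dummies(df[category_vars].astype(str), prefix=category_vars)
    return pd.concat([df.drop(columns=category_vars), dummies], axis=1)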
# Randomly under-sample the good (cate == 0) samples
# random_sample=RandomSample(np.array(end_user_info[end_user_info['cate']==0]),0.3).random_under_sample()
# random_sample_df=pd.DataFrame(random_sample,columns=end_user_info.columns)
# end_user_info=pd.concat([end_user_info[end_user_info['cate']==1],random_sample_df],axis=0)
# print(end_user_info.shape)

'''Missing-value detection'''
unuse_list = check_nullvalue(end_user_info)
end_user_info = end_user_info.drop(unuse_list, axis=1)
new_column = list(end_user_info.columns)

cat_var = ['id_city']
con_var = remove_list(new_column, cat_var + ['over_dueday', 'cate'])
end_user_info = disper_split(end_user_info, cat_var)

'''Correlation check between variables'''
var_relative = regression_analysis(end_user_info, con_var)
relative_df = pd.DataFrame(var_relative, columns=['var1', 'var2', 'p_value', 'relative_coeffient'])
print(relative_df)

# Chi-square (ChiMerge) binning of the continuous variables
split_point_chi, chi_df_1 = chi_equalwide(end_user_info, con_var, 'cate', max_interval=5, numOfSplit=300, mont=False, special_list=['age'])
end_col = list(chi_df_1.columns)
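# NOTE: check_nullvalue is defined elsewhere in the project. A plausible
# sketch, assuming it flags columns whose missing-value ratio is too high
# to be useful; the 0.8 threshold is an assumption, not the project's value.
def check_nullvalue(df, null_ratio=0.8):
    # Return the columns whose share of missing values exceeds `null_ratio`,
    # so the caller can drop them before modeling.
    ratios = df.isnull().mean()
    return list(ratios[ratios > null_ratio].index)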
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score, recall_score
from sklearn import metrics
import time

start_time = time.time()

dataset = pd.read_excel('/Users/andpay/Documents/job/model/behave_model/behave_model_dataset_v1_1.xlsx')
dataset.loc[dataset['last_overday'] >= 10, 'cate'] = 1
dataset.loc[dataset['last_overday'] < 10, 'cate'] = 0

var_list = list(dataset.columns)
model_var_list = remove_list(var_list, ['partyid', 'loanid', 'last_overday', 'cate', 'register_duration'])
category_var = ['sex', 'city_id', 'channel_type', 'brandcode']
continue_var = remove_list(model_var_list, category_var)

# Note: this changes the number of elements in newvar_list
newuser_dataset = disper_split(dataset, category_var)
newuser_dataset[continue_var] = newuser_dataset[continue_var].fillna(0)
# x_train,x_test,y_train,y_test= train_test_split(,test_size=0.25,random_state=1)

XGC = XGBClassifier(n_estimators=150, max_depth=9, learning_rate=0.03)
XGC.fit(newuser_dataset[continue_var].astype(int), dataset['cate'].astype(int))
xgc_col = list(np.round(XGC.feature_importances_, 3))

# Rank the variables by importance
var_importance = pd.DataFrame({
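# NOTE: the script above is cut off mid-statement. A hypothetical completion
# of the ranking step, assuming the table simply pairs continue_var with the
# rounded importances in xgc_col and sorts them descending.
var_importance = pd.DataFrame({'var': continue_var, 'importance': xgc_col})
var_importance = var_importance.sort_values('importance', ascending=False)
print(var_importance.head(20))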
start_time = time.time()

dataset = pd.read_excel('/Users/andpay/Documents/job/model/behave_model/behave_model_dataset_v2.xlsx')
dataset.loc[dataset['last_overday'] >= 30, 'cate'] = 1
dataset.loc[dataset['last_overday'] < 30, 'cate'] = 0
print(dataset['partyid'].groupby(dataset['cate']).count())

test_dataset = pd.read_excel('/Users/andpay/Documents/job/model/behave_model/model_practice/behave_userlist_v2_2.xlsx')

var_list = list(dataset.columns)
model_var_list = remove_list(var_list, ['partyid', 'loanid', 'last_overday', 'cate'])
category_var = ['sex', 'city_id', 'channel_type', 'brandcode']
continue_var = remove_list(model_var_list, category_var)

# Note: this changes the number of elements in newvar_list
newuser_dataset = disper_split(dataset, category_var)  # handle the categorical variables
newuser_dataset[continue_var] = newuser_dataset[continue_var].fillna(0)
test_dataset[continue_var] = test_dataset[continue_var].fillna(0)

'''Correlation check between variables'''
var_relative = regression_analysis(dataset, continue_var, rsquare_limit=0.8)
relative_df = pd.DataFrame(var_relative, columns=['var1', 'var2', 'p_value', 'relative_coeffient'])
print(relative_df)
relative_var = [
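# NOTE: regression_analysis is another project helper. A sketch of the
# pairwise check it appears to perform, assuming it regresses every pair of
# continuous variables and reports pairs whose R^2 exceeds rsquare_limit;
# the returned row layout matches the relative_df columns above.
import itertools
from scipy import stats

def regression_analysis(df, con_vars, rsquare_limit=0.8):
    # Flag near-collinear pairs as [var1, var2, p_value, correlation].
    flagged = []
    for v1, v2 in itertools.combinations(con_vars, 2):
        result = stats.linregress(df[v1], df[v2])
        if result.rvalue ** 2 >= rsquare_limit:
            flagged.append([v1, v2, result.pvalue, result.rvalue])
    return flagged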
        dif = self.samples[nnarray[nn]] - self.samples[i]
        # Draw a random number in [0, 1)
        gap = random.random()
        # Synthesize an artificial sample by linear interpolation
        self.synthetic[self.newindex] = self.samples[i] + gap * dif
        self.newindex += 1


if __name__ == '__main__':
    df = pd.read_excel('/Users/andpay/Documents/job/data/帮还活动/activity_history/marketing_modedata3_14.xlsx')
    df = df[0:100]
    print(df.shape)

    cate_list = ['sex', 'brandcode', 'channel_type', 'marry', 'ccerate']
    df = df.fillna(0)
    var_list = list(df.columns)
    var_list.remove('partyid')
    var_list.remove('name')
    continue_list = remove_list(var_list, cate_list)

    # a=np.array([[1,2,3],[4,5,6],[2,3,1],[2,1,2],[2,3,4],[2,3,4]])
    data = np.array(df[continue_list])
    # print(np.round(data,3))

    s = Smote(data, N=50)
    dataset = s.over_sampling()
    print(dataset.shape)
    print(s.newindex)
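# NOTE: the hand-rolled Smote class interpolates between each sample and one
# of its k nearest neighbours, which is the same technique packaged in
# imbalanced-learn. A sketch of the library route, assuming the frame carries
# a binary label column ('cate' here is a guess, borrowed from the other
# scripts in this repo).
from imblearn.over_sampling import SMOTE

X, y = df[continue_list], df['cate']  # 'cate' is an assumed label column
X_res, y_res = SMOTE(random_state=1).fit_resample(X, y)
print(X_res.shape, y_res.value_counts())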