#==============================================================================
# Investment start year (BTYEAR): one-hot per record, summed per company
# (EID), then PCA-compressed to 18 components via dankit.compress.
#==============================================================================
invest_year_dummies = pd.get_dummies(invest['BTYEAR'])
#addColumnsPrefix(invest_year_dummies, 'invest_year')
invest_year_dummies['EID'] = invest['EID']
invest_year = invest_year_dummies.groupby('EID').sum()
#pca_ratio_curve(invest_year, 40, 25)
invest_year = compress(invest_year, 18, 'invest_year', keepEID=True)
#X_train = pd.merge(X_train, invest_year, how='left', on='EID')
#X_answer = pd.merge(X_answer, invest_year, how='left', on='EID')

#BTENDYEAR -- investment end year: keep only the rows where it is present.
invest_year_end = invest.loc[~np.isnan(invest['BTENDYEAR']), ['EID', 'BTENDYEAR']]
invest_year_end_dummies = pd.get_dummies(invest_year_end['BTENDYEAR'])
# FIX: prefix was 'invest_year', which mislabeled these END-year columns and
# could collide with the compressed start-year features that use the same
# prefix; 'invest_year_end' matches the naming of 'branch_end_year' elsewhere.
addColumnsPrefix(invest_year_end_dummies, 'invest_year_end')
invest_year_end_dummies['EID'] = invest_year_end['EID']
invest_year_end = invest_year_end_dummies.groupby('EID').sum()
# Move 'EID' out of the index so pd.merge can join on it.
invest_year_end.reset_index(inplace=True)
X_train = pd.merge(X_train, invest_year_end, how='left', on='EID')
X_answer = pd.merge(X_answer, invest_year_end, how='left', on='EID')

#==============================================================================
# X_invested -- the "being invested in" side (BTEID renamed to EID so the
# invested-in company becomes the join key).
#==============================================================================
invested.rename(columns={'BTEID': 'EID'}, inplace=True)
X_invested = pd.merge(X, invested, how='left', on='EID').loc[:, ['EID', 'BTBL', 'is_invested']]
# Companies with no invested-in records get NaN after the left merge; zero them.
X_invested.fillna(0, inplace=True)
# Rights/patent features: load the frames produced by the invest step, join in
# 5right.csv, and derive per-company one-hot types and grant/application counts.
X_train = pd.read_pickle('X_train_with_invest_pickle')
X_answer = pd.read_pickle('X_answer_with_invest_pickle')
right = pd.read_csv('5right.csv')
# All companies (train minus the label column, plus answer) for the joins below.
X = pd.concat([X_train.drop('TARGET', axis=1), X_answer])
X_right = pd.merge(X, right, how='left', on='EID').loc[:, ['EID', 'RIGHTTYPE', 'TYPECODE', 'ASKDATE', 'FBDATE']]
#RIGHTTYPE -- one-hot the right type and sum occurrences per company.
right_type_dummies = pd.get_dummies(right['RIGHTTYPE'].fillna(0))
right_type_dummies['EID'] = right['EID']
right_type_dummies = right_type_dummies.groupby('EID').sum()
addColumnsPrefix(right_type_dummies, 'right_type')
right_type_dummies.reset_index(inplace=True)
#pca_ratio_curve(right_type_dummies, 7, 4)
# Two new attributes -- right_get counts rights actually granted (FBDATE set).
# FIX: .copy() avoids chained assignment on a slice of X_right
# (SettingWithCopyWarning; the column assignment may silently not stick), and
# selecting only ['EID', <count>] keeps the raw FBDATE/RIGHTTYPE nuisance
# columns out of the groupby sum (newer pandas raises instead of dropping them).
right_get = X_right[['EID', 'FBDATE']].copy()
right_get.loc[:, 'right_get'] = isExist(X_right['FBDATE'])
right_get = right_get[['EID', 'right_get']].groupby('EID').sum()
# right_applied counts rights applied for (RIGHTTYPE present).
right_applied = X_right[['EID', 'RIGHTTYPE']].copy()
right_applied.loc[:, 'right_applied'] = isExist(X_right['RIGHTTYPE'])
right_applied = right_applied[['EID', 'right_applied']].groupby('EID').sum()
# Move 'EID' out of the index so pd.merge can join on it.
right_get.reset_index(inplace=True)
right_applied.reset_index(inplace=True)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate

# Lawsuit features: per-company lawsuit counts plus one-hot lawsuit year/month,
# merged onto the frames produced by the project step.
X_train = pd.read_pickle('X_train_with_project_pickle')
X_answer = pd.read_pickle('X_answer_with_project_pickle')
lawsuit = pd.read_csv('7lawsuit.csv')
# Constant 1 per record so the groupby-sum below yields a per-company count.
lawsuit['lawsuit_number'] = 1
lawsuit['lawsuit_year'], lawsuit['lawsuit_month'] = splitDate(lawsuit['LAWDATE'])
lawsuit_year_dummies = pd.get_dummies(lawsuit['lawsuit_year'])
lawsuit_month_dummies = pd.get_dummies(lawsuit['lawsuit_month'])
addColumnsPrefix(lawsuit_year_dummies, 'lawsuit_year')
addColumnsPrefix(lawsuit_month_dummies, 'lawsuit_month')
# Combine the year and month dummies into a single DataFrame.
lawsuit_date_dummies = lawsuit_year_dummies.join(lawsuit_month_dummies)
lawsuit_date_dummies[['EID']] = lawsuit[['EID']]
lawsuit_date_dummies = lawsuit_date_dummies.groupby('EID').sum()
# Move 'EID' out of the index so pd.merge can join on it.
lawsuit_date_dummies.reset_index(inplace=True)
lawsuit_number = lawsuit[['EID', 'lawsuit_number']].groupby('EID').sum()
lawsuit_number.reset_index(inplace=True)
lawsuit_date_and_number = pd.merge(lawsuit_number, lawsuit_date_dummies, on='EID')
X_train = pd.merge(X_train, lawsuit_date_and_number, how='left', on='EID')
X_answer = pd.merge(X_answer, lawsuit_date_and_number, how='left', on='EID')
# Companies with no lawsuit records get NaN after the left merge; zero them.
X_train.fillna(0, inplace=True)
# NOTE(review): this chunk starts mid-statement -- the ').loc[...]' below
# closes a merge expression whose opening lines are outside this view
# (presumably building X_project); left untouched rather than guessed at.
).loc[:, ['EID', 'DJDATE', 'project_number', 'project_at_home_number']]
# Companies with no project records get NaN after the left merge; zero them.
X_project.fillna(0, inplace=True)
X_project_numbers = X_project[[
    'EID', 'project_number', 'project_at_home_number'
]].groupby('EID').sum()
X_project_numbers.reset_index(inplace=True)
#X_train = pd.merge(X_train, X_project_numbers, how='left', on='EID')
#X_answer = pd.merge(X_answer, X_project_numbers, how='left', on='EID')
#ASKDATE -- split the date into separate year and month attributes.
project['project_year'], project['project_month'] = splitDate(
    project['DJDATE'])
project_year_dummies = pd.get_dummies(project['project_year'])
project_month_dummies = pd.get_dummies(project['project_month'])
addColumnsPrefix(project_year_dummies, 'project_year')
addColumnsPrefix(project_month_dummies, 'project_month')
# Combine the year and month dummies into a single DataFrame.
project_date_dummies = project_year_dummies.join(project_month_dummies)
project_date_dummies[['EID']] = project[['EID']]
project_date_dummies = project_date_dummies.groupby('EID').sum()
# Move 'EID' out of the index so pd.merge can join on it.
project_date_dummies.reset_index(inplace=True)
X_train = pd.merge(X_train, project_date_dummies, how='left', on='EID')
X_answer = pd.merge(X_answer, project_date_dummies, how='left', on='EID')
X_train.fillna(0, inplace=True)
X_answer.fillna(0, inplace=True)
# Persist for the next feature-engineering stage (consumed by the lawsuit step).
X_train.to_pickle('X_train_with_project_pickle')
X_answer.to_pickle('X_answer_with_project_pickle')
# Branch features: per-company branch counts plus one-hot registration
# (B_REYEAR) and closing (B_ENDYEAR) years, with old years clamped into a
# single floor bucket, then PCA-compressed.
X_answer = pd.read_pickle('X_answer_with_alter_pickle')
branch = pd.read_csv('3branch.csv')
# Constant 1 per record so the groupby-sum below yields a per-company count.
branch['branch_number'] = 1
#branch['branch_end_number'] = isExist(branch['B_ENDYEAR'])
branch_number = branch[['EID', 'IFHOME', 'branch_number']].groupby('EID').sum()
branch_number.reset_index(inplace=True)
#B_REYEAR
# Clamp very old years (<=2000 / <=1990) into one bucket to limit the number
# of dummy columns.  NaN compares False with <=, so NaN entries pass through.
# (FIX: dropped the redundant list(...) wrapper around each comprehension.)
branch['year_or_old_endyear'] = [2000 if x <= 2000 else x for x in branch['B_ENDYEAR']]
branch['year_or_old_reyear'] = [1990 if x <= 1990 else x for x in branch['B_REYEAR']]
re_year_dummies = pd.get_dummies(branch['year_or_old_reyear'])
addColumnsPrefix(re_year_dummies, 'branch_re_year')
re_year_dummies['EID'] = branch['EID']
re_year_dummies = re_year_dummies.groupby('EID').sum()
#B_ENDYEAR
end_year_dummies = pd.get_dummies(branch['year_or_old_endyear'])
addColumnsPrefix(end_year_dummies, 'branch_end_year')
end_year_dummies['EID'] = branch['EID']
end_year_dummies = end_year_dummies.groupby('EID').sum()
# Plot the PCA retained-variance curve to choose the compression dimension.
#pca_ratio_curve(re_year_dummies, 20, 4)
re_year_compressed_df = compress(re_year_dummies, 4, 're_year')
re_year_dummies.reset_index(inplace=True)
re_year_compressed_df['EID'] = re_year_dummies['EID']
# Plot the PCA retained-variance curve to choose the compression dimension.
#pca_ratio_curve(end_year_dummies, 15, 8)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate

# Recruiting features: per-company recruiting activity by date and by website,
# merged onto the frames produced by the breakfaith step.
X_train = pd.read_pickle('X_train_with_breakfaith_pickle')
X_answer = pd.read_pickle('X_answer_with_breakfaith_pickle')
recruit = pd.read_csv('9recruit.csv')
# Constant 1 per record so a groupby-sum yields a per-company recruiting count.
recruit['recruit_times'] = 1
recruit['recruit_year'], recruit['recruit_month'] = splitDate(
    recruit['RECDATE'])
recruit_year_dummies = pd.get_dummies(recruit['recruit_year'])
recruit_month_dummies = pd.get_dummies(recruit['recruit_month'])
addColumnsPrefix(recruit_year_dummies, 'recruit_year')
addColumnsPrefix(recruit_month_dummies, 'recruit_month')
# Combine the year and month dummies into a single DataFrame.
recruit_date_dummies = recruit_year_dummies.join(recruit_month_dummies)
recruit_date_dummies[['EID']] = recruit[['EID']]
recruit_date_dummies = recruit_date_dummies.groupby('EID').sum()
# Move 'EID' out of the index so pd.merge can join on it.
recruit_date_dummies.reset_index(inplace=True)
recruit_website_dummies = pd.get_dummies(recruit['WZCODE'])
# FIX: prefix was 'recruit_website_' -- every other addColumnsPrefix call in
# this project passes the prefix without a trailing underscore, so the extra
# '_' produced inconsistently named feature columns.
addColumnsPrefix(recruit_website_dummies, 'recruit_website')
recruit_website_dummies[['EID']] = recruit[['EID']]
recruit_website_dummies = recruit_website_dummies.groupby('EID').sum()
recruit_website_dummies.reset_index(inplace=True)
# Median-impute missing headcount (RECRNUM) values.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# >= 0.22; migrate to sklearn.impute.SimpleImputer when upgrading.
imp_nan = Imputer(missing_values='NaN', strategy='median', axis=0)
imp_nan.fit(recruit.loc[:, ['RECRNUM']])
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTBE']]) #alter.loc[np.where(l == 'null')[0],'ALTBE'] = '0' #alter['ALTBE'] = np.array( # [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTBE']]) #alter['ALTAF'].fillna('0', inplace=True) #l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTAF']]) #alter.loc[np.where(l == 'null')[0],'ALTAF'] = '0' #alter['ALTAF'] = np.array( # [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTAF']]) #ALTDATE alter['alt_year'], alter['alt_month'] = splitDate(alter['ALTDATE']) alter_year_dummies = pd.get_dummies(alter['alt_year']) alter_month_dummies = pd.get_dummies(alter['alt_month']) addColumnsPrefix(alter_year_dummies, 'alter_year') addColumnsPrefix(alter_month_dummies, 'alter_month') alter_date_dummies = alter_year_dummies.join(alter_month_dummies) alter_date_dummies[['EID']] = alter[['EID']] alter_date_dummies = alter_date_dummies.groupby('EID').sum() alter_date_dummies.reset_index(inplace=True) #ALTERNO alterno_dummies = pd.get_dummies(X_alter['ALTERNO']) alterno_dummies[['EID']] = X_alter[['EID']] alterno_dummies = alterno_dummies.groupby('EID').sum() addColumnsPrefix(alterno_dummies, 'alterno') alterno_dummies.reset_index(inplace=True) #add alterno ont-hot columns