Exemple #1
0
# BTYEAR -- one-hot encode the investment start year, aggregate per company
# (EID), then PCA-compress the sparse year columns down to 18 dimensions.
invest_year_dummies = pd.get_dummies(invest['BTYEAR'])
#addColumnsPrefix(invest_year_dummies, 'invest_year')
invest_year_dummies['EID'] = invest['EID']

invest_year = invest_year_dummies.groupby('EID').sum()
#pca_ratio_curve(invest_year, 40, 25)
invest_year = compress(invest_year, 18, 'invest_year', keepEID=True)

#X_train = pd.merge(X_train, invest_year, how='left', on='EID')
#X_answer = pd.merge(X_answer, invest_year, how='left', on='EID')

#BTENDYEAR -- same treatment for the investment end year, restricted to rows
# where BTENDYEAR is present.
# FIX: use .notna() instead of ~np.isnan() -- np.isnan raises TypeError on
# object-dtype columns, while notna() handles any dtype (including NaT).
invest_year_end = invest.loc[invest['BTENDYEAR'].notna(),
                             ['EID', 'BTENDYEAR']]
invest_year_end_dummies = pd.get_dummies(invest_year_end['BTENDYEAR'])
# FIX: the prefix was 'invest_year', which names these BTENDYEAR features
# identically to the BTYEAR family above; use a distinct prefix so the two
# feature families cannot collide after merging.
addColumnsPrefix(invest_year_end_dummies, 'invest_year_end')
invest_year_end_dummies['EID'] = invest_year_end['EID']
invest_year_end = invest_year_end_dummies.groupby('EID').sum()
invest_year_end.reset_index(inplace=True)

X_train = pd.merge(X_train, invest_year_end, how='left', on='EID')
X_answer = pd.merge(X_answer, invest_year_end, how='left', on='EID')

#==============================================================================
#                  X_invested -- features for the invested-in (passive) side
#==============================================================================
invested.rename(columns={'BTEID': 'EID'}, inplace=True)
X_invested = pd.merge(X, invested, how='left',
                      on='EID').loc[:, ['EID', 'BTBL', 'is_invested']]

# Companies never invested in get NaN from the left merge; treat as 0.
X_invested.fillna(0, inplace=True)
Exemple #2
0
# Load the feature matrices produced by the previous (invest) step and the
# raw rights table.
X_train = pd.read_pickle('X_train_with_invest_pickle')
X_answer = pd.read_pickle('X_answer_with_invest_pickle')
right = pd.read_csv('5right.csv')

# All companies (train + answer) without the target label.
X = pd.concat([X_train.drop('TARGET', axis=1), X_answer])

X_right = pd.merge(X, right, how='left', on='EID').loc[:,
                                                       ['EID', 'RIGHTTYPE',
                                                        'TYPECODE', 'ASKDATE',
                                                        'FBDATE']]

#RIGHTTYPE -- one-hot counts of right types per company
right_type_dummies = pd.get_dummies(right['RIGHTTYPE'].fillna(0))
right_type_dummies['EID'] = right['EID']
right_type_dummies = right_type_dummies.groupby('EID').sum()
addColumnsPrefix(right_type_dummies, 'right_type')
right_type_dummies.reset_index(inplace=True)
#pca_ratio_curve(right_type_dummies, 7, 4)

# Two derived features -- right_get counts rights actually granted (FBDATE
# present); right_applied counts rights applied for (RIGHTTYPE present).
# FIX: the original assigned a new column into a slice of X_right
# (right_get = X_right[['EID', 'FBDATE']]; right_get.loc[:, ...] = ...),
# which triggers SettingWithCopyWarning (and may silently fail), and it
# dragged the raw FBDATE/RIGHTTYPE columns into the groupby-sum. Build
# fresh two-column frames instead.
right_get = pd.DataFrame({'EID': X_right['EID'],
                          'right_get': isExist(X_right['FBDATE'])})
right_get = right_get.groupby('EID').sum()
right_applied = pd.DataFrame({'EID': X_right['EID'],
                              'right_applied': isExist(X_right['RIGHTTYPE'])})
right_applied = right_applied.groupby('EID').sum()
# Move 'EID' back from the index to a column so merge(on='EID') works.
right_get.reset_index(inplace=True)
right_applied.reset_index(inplace=True)
Exemple #3
0
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate

# Load the feature matrices from the previous (project) step and the raw
# lawsuit table.
X_train = pd.read_pickle('X_train_with_project_pickle')
X_answer = pd.read_pickle('X_answer_with_project_pickle')
lawsuit = pd.read_csv('7lawsuit.csv')

# One row per lawsuit; summing this column per EID yields a lawsuit count.
lawsuit['lawsuit_number'] = 1
lawsuit['lawsuit_year'], lawsuit['lawsuit_month'] = splitDate(lawsuit['LAWDATE'])


# One-hot encode the year and month of each lawsuit.
lawsuit_year_dummies = pd.get_dummies(lawsuit['lawsuit_year'])
lawsuit_month_dummies = pd.get_dummies(lawsuit['lawsuit_month'])
addColumnsPrefix(lawsuit_year_dummies, 'lawsuit_year')
addColumnsPrefix(lawsuit_month_dummies, 'lawsuit_month')
# Combine the year and month dummies into a single DataFrame and aggregate
# counts per company.
lawsuit_date_dummies = lawsuit_year_dummies.join(lawsuit_month_dummies)
lawsuit_date_dummies[['EID']] = lawsuit[['EID']]
lawsuit_date_dummies = lawsuit_date_dummies.groupby('EID').sum()
lawsuit_date_dummies.reset_index(inplace=True)

lawsuit_number = lawsuit[['EID', 'lawsuit_number']].groupby('EID').sum()
lawsuit_number.reset_index(inplace=True)

lawsuit_date_and_number = pd.merge(lawsuit_number, lawsuit_date_dummies, on='EID')
X_train = pd.merge(X_train, lawsuit_date_and_number, how='left', on='EID')
X_answer = pd.merge(X_answer, lawsuit_date_and_number, how='left', on='EID')

# Companies with no lawsuits get NaN from the left merges; treat as 0.
X_train.fillna(0, inplace=True)
# FIX: X_answer was merged the same way but never filled, leaving NaNs in the
# answer matrix -- the parallel project step (and others) fill both frames.
X_answer.fillna(0, inplace=True)
Exemple #4
0
).loc[:, ['EID', 'DJDATE', 'project_number', 'project_at_home_number']]
# Zero out NaNs introduced by the upstream left merge.
X_project.fillna(0, inplace=True)

# Per-company totals of the two project counters.
X_project_numbers = (
    X_project[['EID', 'project_number', 'project_at_home_number']]
    .groupby('EID')
    .sum()
)
X_project_numbers.reset_index(inplace=True)

#X_train = pd.merge(X_train, X_project_numbers, how='left', on='EID')
#X_answer = pd.merge(X_answer, X_project_numbers, how='left', on='EID')

#DJDATE -- split the registration date into year and month features
project['project_year'], project['project_month'] = splitDate(
    project['DJDATE'])
project_year_dummies = pd.get_dummies(project['project_year'])
project_month_dummies = pd.get_dummies(project['project_month'])
addColumnsPrefix(project_year_dummies, 'project_year')
addColumnsPrefix(project_month_dummies, 'project_month')
# Year and month one-hots side by side, tagged with the company id and
# summed into per-company counts.
project_date_dummies = pd.concat(
    [project_year_dummies, project_month_dummies], axis=1)
project_date_dummies['EID'] = project['EID']
project_date_dummies = project_date_dummies.groupby('EID').sum().reset_index()

X_train = X_train.merge(project_date_dummies, how='left', on='EID')
X_answer = X_answer.merge(project_date_dummies, how='left', on='EID')

# Companies without any project rows got NaN from the left merges.
X_train.fillna(0, inplace=True)
X_answer.fillna(0, inplace=True)

X_train.to_pickle('X_train_with_project_pickle')
X_answer.to_pickle('X_answer_with_project_pickle')
Exemple #5
0
# Load the feature matrix from the previous (alter) step and the raw branch
# table.
X_answer = pd.read_pickle('X_answer_with_alter_pickle')
branch = pd.read_csv('3branch.csv')

# One row per branch; summing this column per EID yields a branch count.
branch['branch_number'] = 1
#branch['branch_end_number'] = isExist(branch['B_ENDYEAR'])

# NOTE(review): IFHOME is summed together with branch_number -- this assumes
# IFHOME is numeric (e.g. 0/1); confirm against 3branch.csv.
branch_number = branch[['EID', 'IFHOME', 'branch_number']].groupby('EID').sum()
branch_number.reset_index(inplace=True)

#B_REYEAR -- clamp very old years into one bucket before one-hot encoding.
# (NaN compares False against the threshold, so missing years pass through.)
# FIX: dropped the redundant list() wrappers -- a list comprehension is
# already a list.
branch['year_or_old_endyear'] = [
    2000 if x <= 2000 else x for x in branch['B_ENDYEAR']]
branch['year_or_old_reyear'] = [
    1990 if x <= 1990 else x for x in branch['B_REYEAR']]
re_year_dummies = pd.get_dummies(branch['year_or_old_reyear'])
addColumnsPrefix(re_year_dummies, 'branch_re_year')
re_year_dummies['EID'] = branch['EID']
re_year_dummies = re_year_dummies.groupby('EID').sum()
#B_ENDYEAR -- same treatment for the branch end year
end_year_dummies = pd.get_dummies(branch['year_or_old_endyear'])
addColumnsPrefix(end_year_dummies, 'branch_end_year')
end_year_dummies['EID'] = branch['EID']
end_year_dummies = end_year_dummies.groupby('EID').sum()

# Plot the PCA retained-variance curve to choose the compression dimension.
#pca_ratio_curve(re_year_dummies, 20, 4)
re_year_compressed_df = compress(re_year_dummies, 4, 're_year')
re_year_dummies.reset_index(inplace=True)
re_year_compressed_df['EID'] = re_year_dummies['EID']
# Plot the PCA retained-variance curve to choose the compression dimension.
#pca_ratio_curve(end_year_dummies, 15, 8)
Exemple #6
0
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate

# Load the feature matrices from the previous (breakfaith) step and the raw
# recruitment table.
X_train = pd.read_pickle('X_train_with_breakfaith_pickle')
X_answer = pd.read_pickle('X_answer_with_breakfaith_pickle')
recruit = pd.read_csv('9recruit.csv')

# One row per recruitment posting; summing per EID yields a posting count.
recruit['recruit_times'] = 1
recruit['recruit_year'], recruit['recruit_month'] = splitDate(
    recruit['RECDATE'])

# One-hot encode the year and month of each posting.
recruit_year_dummies = pd.get_dummies(recruit['recruit_year'])
recruit_month_dummies = pd.get_dummies(recruit['recruit_month'])
addColumnsPrefix(recruit_year_dummies, 'recruit_year')
addColumnsPrefix(recruit_month_dummies, 'recruit_month')
# Combine the year and month dummies into a single DataFrame and aggregate
# counts per company.
recruit_date_dummies = recruit_year_dummies.join(recruit_month_dummies)
recruit_date_dummies[['EID']] = recruit[['EID']]
recruit_date_dummies = recruit_date_dummies.groupby('EID').sum()
recruit_date_dummies.reset_index(inplace=True)

# One-hot counts of the recruiting website code per company.
recruit_website_dummies = pd.get_dummies(recruit['WZCODE'])
# FIX: the prefix had a trailing underscore ('recruit_website_'), unlike
# every other addColumnsPrefix call in this pipeline; normalized for
# consistent feature naming.
addColumnsPrefix(recruit_website_dummies, 'recruit_website')
recruit_website_dummies[['EID']] = recruit[['EID']]
recruit_website_dummies = recruit_website_dummies.groupby('EID').sum()
recruit_website_dummies.reset_index(inplace=True)

# Median-impute the missing recruit headcounts.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22; migrate to sklearn.impute.SimpleImputer when upgrading.
imp_nan = Imputer(missing_values='NaN', strategy='median', axis=0)
imp_nan.fit(recruit.loc[:, ['RECRNUM']])
Exemple #7
0
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTBE']])
#alter.loc[np.where(l == 'null')[0],'ALTBE'] = '0'
#alter['ALTBE'] = np.array(
#        [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTBE']])

#alter['ALTAF'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTAF']])
#alter.loc[np.where(l == 'null')[0],'ALTAF'] = '0'
#alter['ALTAF'] = np.array(
#        [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTAF']])

#ALTDATE -- derive year/month features from the alteration date
alter['alt_year'], alter['alt_month'] = splitDate(alter['ALTDATE'])
alter_year_dummies = pd.get_dummies(alter['alt_year'])
alter_month_dummies = pd.get_dummies(alter['alt_month'])
addColumnsPrefix(alter_year_dummies, 'alter_year')
addColumnsPrefix(alter_month_dummies, 'alter_month')

# Year and month one-hots side by side, tagged with the company id and
# summed into per-company counts.
alter_date_dummies = pd.concat([alter_year_dummies, alter_month_dummies],
                               axis=1)
alter_date_dummies['EID'] = alter['EID']
alter_date_dummies = alter_date_dummies.groupby('EID').sum().reset_index()

#ALTERNO -- one-hot counts of the alteration type code per company
alterno_dummies = pd.get_dummies(X_alter['ALTERNO'])
alterno_dummies['EID'] = X_alter['EID']
alterno_dummies = alterno_dummies.groupby('EID').sum()
addColumnsPrefix(alterno_dummies, 'alterno')
alterno_dummies = alterno_dummies.reset_index()

#add alterno one-hot columns