def deal_marriage_income(df, features1, features2, unique_features):
    """Add male/female income columns and their difference to ``df``.

    ``df['s_income']`` is first passed through ``amend_income``.  Three float
    columns are then created (initialised to 0.) and their names appended to
    ``unique_features``: ``'s_income_diff'``, ``'ma_income_m'`` and
    ``'ma_income_f'``.

    For each row, ``isMale(df, i)`` decides which of ``'income'`` /
    ``'s_income'`` is the male value and which is the female one.  Rows where
    either value makes ``mathUtil.Float`` return ``None`` keep the 0. defaults.

    Args:
        df: DataFrame addressed with ``df.loc[i, ...]`` for
            ``i in range(df.shape[0])`` (assumes a 0..n-1 index -- TODO confirm).
        features1: Unused by this function.
        features2: Unused by this function.
        unique_features: List that receives the three new column names.

    Returns:
        The same ``df`` object, modified in place.
    """
    df['s_income'] = df['s_income'].apply(amend_income)

    added_columns = ('s_income_diff', 'ma_income_m', 'ma_income_f')
    for column in added_columns:
        df[column] = 0.
    unique_features.extend(added_columns)

    for row in range(df.shape[0]):
        if isMale(df, row):
            husband_raw = df.loc[row, 'income']
            wife_raw = df.loc[row, 's_income']
        else:
            wife_raw = df.loc[row, 'income']
            husband_raw = df.loc[row, 's_income']

        wife = mathUtil.Float(wife_raw)
        husband = mathUtil.Float(husband_raw)
        if wife is None or husband is None:
            # Leave the 0. placeholders when either side is unusable.
            continue

        df.loc[row, 's_income_diff'] = husband - wife
        df.loc[row, 'ma_income_m'] = husband
        df.loc[row, 'ma_income_f'] = wife

    return df
def get_all_nan_cells(df):
    """Return ``(column, row)`` for every cell that ``mathUtil.Float`` rejects.

    A cell is reported when ``mathUtil.Float(df.loc[row, column])`` returns
    ``None``.  Results are ordered column by column, rows ascending within a
    column.  Rows are addressed as ``0 .. df.shape[0] - 1`` through ``df.loc``
    (assumes a default integer index -- TODO confirm).
    """
    return [
        (col, row)
        for col in df.columns
        for row in range(df.shape[0])
        if mathUtil.Float(df.loc[row, col]) is None
    ]
def min_max_mean_nan(list):
    """Summarise a sequence as ``(min, max, mean, contains_missing)``.

    Each element is converted with ``mathUtil.Float``; elements for which it
    returns ``None`` are excluded from the statistics and set the flag.

    Args:
        list: Sequence supporting ``len()`` and integer indexing.  The name
            shadows the builtin but is kept because it is part of the
            function's interface.

    Returns:
        ``(None, None, None, True)`` for an empty sequence.  Otherwise a tuple
        ``(min, max, mean, contains_missing)``; when no element is usable the
        minimum stays ``np.inf``, the maximum ``-np.inf`` and the mean is
        ``None``.
    """
    if len(list) == 0:
        return None, None, None, True

    lowest, highest = np.inf, -np.inf
    running_sum = 0.
    usable = 0
    saw_missing = False

    for idx in range(len(list)):
        number = mathUtil.Float(list[idx])
        if number is None:
            saw_missing = True
            continue
        highest = max(highest, number)
        lowest = min(lowest, number)
        running_sum += number
        usable += 1

    average = running_sum / usable if usable else None
    return lowest, highest, average, saw_missing
def fillna(df, column_name, fill_value):
    """Overwrite unusable cells of one column with ``fill_value`` in place.

    A cell is replaced when ``mathUtil.Float`` returns ``None`` for it, so the
    test is whatever that helper rejects rather than pandas' own NaN check.
    Rows are addressed as ``0 .. df.shape[0] - 1`` through ``df.loc``
    (assumes a default integer index -- TODO confirm).

    Returns:
        Always ``True``.
    """
    for row in range(df.shape[0]):
        if mathUtil.Float(df.loc[row, column_name]) is None:
            df.loc[row, column_name] = fill_value
    return True
def deal_marriage_edu(df, features1, features2, unique_features):
    """Add the male-minus-female education difference column to ``df``.

    Creates ``'s_edu_diff'`` (initialised to 0.) and appends its name to
    ``unique_features``.  For each row, ``isMale(df, i)`` decides which of
    ``'edu_a'`` / ``'s_edu'`` is the male score and which is the female one.
    Rows where either score makes ``mathUtil.Float`` return ``None`` keep 0.

    Args:
        df: DataFrame addressed with ``df.loc[i, ...]`` for
            ``i in range(df.shape[0])`` (assumes a 0..n-1 index -- TODO confirm).
        features1: Unused by this function.
        features2: Unused by this function.
        unique_features: List that receives ``'s_edu_diff'``.

    Returns:
        None; ``df`` is modified in place.
    """
    df['s_edu_diff'] = 0.
    unique_features.append('s_edu_diff')

    for row in range(df.shape[0]):
        if isMale(df, row):
            husband_raw = df.loc[row, 'edu_a']
            wife_raw = df.loc[row, 's_edu']
        else:
            wife_raw = df.loc[row, 'edu_a']
            husband_raw = df.loc[row, 's_edu']

        wife_score = mathUtil.Float(wife_raw)
        husband_score = mathUtil.Float(husband_raw)
        if wife_score is None or husband_score is None:
            # Keep the 0. placeholder when either score is unusable.
            continue

        df.loc[row, 's_edu_diff'] = husband_score - wife_score
def check_all_float(df):
    """Report whether every cell of ``df`` is accepted by ``mathUtil.Float``.

    A column is an offender as soon as one of its cells makes
    ``mathUtil.Float`` return ``None``; scanning of that column stops there.
    When offenders exist, the list of their names is printed.

    Returns:
        ``True`` if no column has a rejected cell, ``False`` otherwise.
    """
    n_rows = df.shape[0]
    offenders = [
        column
        for column in df.columns
        if any(
            mathUtil.Float(df.loc[row, column]) is None
            for row in range(n_rows)
        )
    ]
    if not offenders:
        return True
    print(offenders)
    return False
def get_not_nan_data(arr, num):
    """Collect up to ``num`` distinct usable values from the start of ``arr``.

    Elements are converted with ``mathUtil.Float`` in order; a converted value
    is kept when it is not ``None`` and not already collected.  Scanning stops
    once ``num`` values have been gathered.

    Args:
        arr: Sequence supporting ``len()``, integer indexing and slicing.
        num: Number of distinct values wanted.

    Returns:
        A list of the converted values, or the raw slice ``arr[0:num]`` when
        no usable value was found.
    """
    picked = []
    for pos in range(len(arr)):
        if len(picked) == num:
            break
        candidate = mathUtil.Float(arr[pos])
        if candidate is None or candidate in picked:
            continue
        picked.append(candidate)

    if not picked:
        # Nothing usable: hand back the unconverted leading elements instead.
        return arr[0:num]
    return picked
def get_nan_cols(df):
    """Return the columns whose row-0 cell is rejected by ``mathUtil.Float``.

    Only the cell at row label 0 of each column is inspected; the rest of the
    column is not looked at.
    NOTE(review): confirm that sampling just the first row is intended.
    """
    return [
        col
        for col in df.columns
        if mathUtil.Float(df.loc[0, col]) is None
    ]
def scan_nan(df, non_float_list=None, column_name=None, nan_list=None):
    """Print missing-value rates for ``df``; a diagnostic, not used directly.

    A cell counts as missing when it is ``None``, when it is contained in
    ``nan_list``, or -- unless its column is named in ``non_float_list`` --
    when ``mathUtil.Float`` returns ``None`` for it.

    Args:
        df: DataFrame addressed with ``df.loc[i, column]`` for
            ``i in range(df.shape[0])`` (assumes a 0..n-1 index -- TODO confirm).
        non_float_list: Column names exempt from the ``mathUtil.Float`` check.
            ``None`` (the default) means no column is exempt.
        column_name: When given, only this column is scanned and its rate is
            printed even if it is zero.  When ``None``, every column is
            scanned, only columns with at least one missing cell are printed,
            and the share of rows with any missing cell is printed last.
        nan_list: Extra values to treat as missing.  ``None`` (the default)
            means no extra values.

    Returns:
        None; the rates are written to stdout.
    """
    # None sentinels replace the former shared mutable list defaults.
    exempt_columns = () if non_float_list is None else non_float_list
    missing_markers = () if nan_list is None else nan_list
    total_rows = df.shape[0]

    def _rate(hits):
        # An empty frame has nothing missing; avoid dividing by zero.
        return hits / total_rows if total_rows else 0.

    rows_with_missing = set()
    for column in df.columns:
        if column_name is not None and column != column_name:
            continue

        # Reset for every scanned column, in both calling modes.
        count = 0
        for i in range(total_rows):
            v = df.loc[i, column]
            if (v is None
                    or v in missing_markers
                    or (column not in exempt_columns
                        and mathUtil.Float(v) is None)):
                count += 1
                rows_with_missing.add(i)

        if column_name is not None or count > 0:
            print('{}缺少率{:.2%}'.format(column, _rate(count)))

    if column_name is None:
        print('全部row缺少率{:.2%}'.format(_rate(len(rows_with_missing))))