コード例 #1
0
def deal_marriage_income(df, features1, features2, unique_features):
    """Add per-row male/female income columns and their difference to ``df``.

    ``s_income`` is first rewritten through ``amend_income``.  Three columns
    are then created with a default of ``0.`` and their names appended to
    ``unique_features``: ``s_income_diff``, ``ma_income_m``, ``ma_income_f``.

    For every row, ``isMale(df, row)`` decides which of ``income`` /
    ``s_income`` is treated as the male value and which as the female value
    (``s_income`` is presumably the spouse's income -- confirm with the data
    dictionary).  The derived columns are only written when
    ``mathUtil.Float`` returns a non-``None`` value for both sides;
    otherwise the row keeps the ``0.`` defaults.

    Rows are addressed as ``df.loc[0..n-1, ...]``, so ``df`` is expected to
    carry a default 0..n-1 index.  ``features1`` and ``features2`` are not
    used by this function.

    Returns:
        The same ``df`` object, modified in place.
    """
    df['s_income'] = df['s_income'].apply(amend_income)

    derived_columns = ('s_income_diff', 'ma_income_m', 'ma_income_f')
    for name in derived_columns:
        df[name] = 0.
    for name in derived_columns:
        unique_features.append(name)

    for row in range(df.shape[0]):
        if isMale(df, row):
            husband, wife = df.loc[row, 'income'], df.loc[row, 's_income']
        else:
            wife, husband = df.loc[row, 'income'], df.loc[row, 's_income']

        wife = mathUtil.Float(wife)
        husband = mathUtil.Float(husband)
        if wife is None or husband is None:
            # At least one side is not numeric: leave the 0. defaults.
            continue

        df.loc[row, 's_income_diff'] = husband - wife
        df.loc[row, 'ma_income_m'] = husband
        df.loc[row, 'ma_income_f'] = wife

    return df
コード例 #2
0
def get_all_nan_cells(df):
    """Return ``(column, row)`` pairs for every cell that is not a float.

    A cell is reported when ``mathUtil.Float`` returns ``None`` for its
    value.  Columns are scanned in ``df.columns`` order and, within each
    column, rows are read as ``df.loc[0..n-1, col]`` -- so ``df`` is expected
    to carry a default 0..n-1 index.

    Returns:
        A list of ``(column, row)`` tuples; empty when nothing is missing or
        when ``df`` has no rows or no columns.
    """
    row_count = df.shape[0]
    missing_cells = []
    for col in df.columns:
        missing_cells.extend(
            (col, row)
            for row in range(row_count)
            if mathUtil.Float(df.loc[row, col]) is None
        )
    return missing_cells
コード例 #3
0
def min_max_mean_nan(list):
    """Summarise a sequence as ``(min, max, mean, contain_nan)``.

    Each element is converted with ``mathUtil.Float``; elements for which
    that returns ``None`` are skipped and make ``contain_nan`` true.

    Args:
        list: Sized iterable of raw values.  (The parameter name shadows the
            builtin but is kept so keyword callers are not broken.)

    Returns:
        A 4-tuple ``(min, max, mean, contain_nan)``:

        * empty input -> ``(None, None, None, True)``;
        * input with no convertible element -> ``(inf, -inf, None, True)``
          (min/max keep their sentinel start values);
        * otherwise the min, max and arithmetic mean of the convertible
          elements, plus whether any element was skipped.
    """
    if len(list) == 0:
        return None, None, None, True

    contain_nan = False
    lowest = np.inf
    highest = -np.inf
    total = 0.
    count = 0
    for raw in list:
        v = mathUtil.Float(raw)
        if v is None:
            contain_nan = True
            continue
        if v > highest:
            highest = v
        if v < lowest:
            lowest = v
        total += v
        count += 1

    mean = total / count if count else None
    return lowest, highest, mean, contain_nan
コード例 #4
0
def fillna(df, column_name, fill_value):
    """Overwrite non-float cells of one column with ``fill_value``, in place.

    A cell is replaced when ``mathUtil.Float`` returns ``None`` for its
    current value.  Rows are addressed as ``df.loc[0..n-1, column_name]``,
    so ``df`` is expected to carry a default 0..n-1 index.

    Returns:
        Always ``True``.
    """
    for row in range(df.shape[0]):
        if mathUtil.Float(df.loc[row, column_name]) is not None:
            continue
        df.loc[row, column_name] = fill_value
    return True
コード例 #5
0
def deal_marriage_edu(df, features1, features2, unique_features):
    """Add the male-minus-female education score column ``s_edu_diff``.

    ``s_edu_diff`` is created with a default of ``0.`` and its name appended
    to ``unique_features``.  For every row, ``isMale(df, row)`` decides which
    of ``edu_a`` / ``s_edu`` is treated as the male score and which as the
    female score (``s_edu`` is presumably the spouse's score -- confirm with
    the data dictionary).  The difference is only written when
    ``mathUtil.Float`` returns a non-``None`` value for both sides;
    otherwise the row keeps the ``0.`` default.

    Rows are addressed as ``df.loc[0..n-1, ...]``, so ``df`` is expected to
    carry a default 0..n-1 index.  ``features1`` and ``features2`` are not
    used by this function.

    Returns:
        The same ``df`` object, modified in place (mirrors
        ``deal_marriage_income`` so ``df = deal_marriage_edu(...)`` works).
    """
    df['s_edu_diff'] = 0.
    unique_features.append('s_edu_diff')

    for row in range(df.shape[0]):
        if isMale(df, row):
            male_edu_score = df.loc[row, 'edu_a']
            female_edu_score = df.loc[row, 's_edu']
        else:
            female_edu_score = df.loc[row, 'edu_a']
            male_edu_score = df.loc[row, 's_edu']

        female_edu_score = mathUtil.Float(female_edu_score)
        male_edu_score = mathUtil.Float(male_edu_score)
        if female_edu_score is not None and male_edu_score is not None:
            df.loc[row, 's_edu_diff'] = male_edu_score - female_edu_score

    return df
コード例 #6
0
def check_all_float(df):
    """Report whether every cell of ``df`` converts via ``mathUtil.Float``.

    A column is considered bad as soon as one of its cells makes
    ``mathUtil.Float`` return ``None``; the rest of that column is not
    inspected.  Rows are addressed as ``df.loc[0..n-1, column]``, so ``df``
    is expected to carry a default 0..n-1 index.

    Returns:
        ``True`` when no bad column exists.  Otherwise prints the list of
        bad column names and returns ``False``.
    """
    row_count = df.shape[0]
    bad_columns = [
        column
        for column in df.columns
        if any(
            mathUtil.Float(df.loc[row, column]) is None
            for row in range(row_count)
        )
    ]
    if bad_columns:
        print(bad_columns)
        return False
    return True
コード例 #7
0
def get_not_nan_data(arr, num):
    """Collect up to ``num`` distinct float values from the start of ``arr``.

    Elements are converted with ``mathUtil.Float`` in order; values for
    which that returns ``None`` and values already collected are skipped.
    Scanning stops as soon as ``num`` values have been gathered.

    Args:
        arr: Sliceable sequence of raw values.
        num: Maximum number of distinct values to return.

    Returns:
        A list of the converted values.  When no value was collected at all
        (including ``num == 0``), falls back to the raw slice
        ``arr[0:num]``.
    """
    rtn = []
    for raw in arr:
        if len(rtn) == num:
            break
        v = mathUtil.Float(raw)
        if v is not None and v not in rtn:
            rtn.append(v)
    if len(rtn) == 0:
        # Nothing usable found: hand back the raw leading elements instead.
        rtn = arr[0:num]
    return rtn
コード例 #8
0
def get_nan_cols(df):
    """Return the columns whose row-0 value is not a float.

    Only the cell at index label ``0`` is inspected for each column; a
    column is listed when ``mathUtil.Float`` returns ``None`` for that cell.
    ``df`` must therefore contain a row labelled ``0`` whenever it has at
    least one column.

    Returns:
        A list of column names, in ``df.columns`` order.
    """
    return [
        col for col in df.columns
        if mathUtil.Float(df.loc[0, col]) is None
    ]
コード例 #9
0
def scan_nan(df, non_float_list=None, column_name=None, nan_list=None):
    """Print the missing-value rate per column (diagnostic helper).

    Inspects how much data is missing; intended for ad-hoc checks, not for
    direct use in the processing pipeline.

    A cell counts as missing when it is ``None``, when it is contained in
    ``nan_list``, or -- for columns not named in ``non_float_list`` -- when
    ``mathUtil.Float`` returns ``None`` for it.  Rows are addressed as
    ``df.loc[0..n-1, column]``, so ``df`` is expected to carry a default
    0..n-1 index.

    Args:
        df: Frame to inspect.
        non_float_list: Columns that are exempt from the float-conversion
            check.  Defaults to no columns.
        column_name: When given, only this column is scanned and its rate is
            always printed.  When ``None``, every column is scanned, only
            columns with at least one missing cell are printed, and a final
            line reports the share of rows with any missing cell.
        nan_list: Extra values to treat as missing.  Defaults to none.

    Returns:
        ``None``; results are written to stdout.
    """
    non_float_list = [] if non_float_list is None else non_float_list
    nan_list = [] if nan_list is None else nan_list

    row_count = df.shape[0]
    # Guard the rate computation against an empty frame (counts are 0 then).
    denominator = row_count if row_count else 1

    rows_with_missing = set()
    for column in df.columns:
        if column_name is not None and column != column_name:
            continue

        # Scanning must happen for the requested column as well as in
        # "all columns" mode; it used to be skipped when column_name was set.
        count = 0
        for i in range(row_count):
            v = df.loc[i, column]
            if (v is None
                    or v in nan_list
                    or (column not in non_float_list
                        and mathUtil.Float(v) is None)):
                count += 1
                rows_with_missing.add(i)

        if column_name is not None or count > 0:
            print('{}缺少率{:.2%}'.format(column, count / denominator))

    if column_name is None:
        print('全部row缺少率{:.2%}'.format(len(rows_with_missing) / denominator))