def mp_get_corr(args):
    """Compute the correlation of one pair of frames and label it.

    Args:
        args: A two-item iterable ``(df1, df2)``. Each item is indexed with
            ``data_util.KEY_SYMBOL`` and the last value of that column is
            used as the item's label.

    Returns:
        A 2-tuple ``(pair, value)`` where ``pair`` is the tuple of both
        labels in sorted order and ``value`` is whatever
        ``correlation.get_correlation(df1, df2)`` returns.

    Side effects:
        Increments the module-level counter ``ctr``; on every 100th call
        prints ``ctr / tot * 100`` together with the current process id.
        NOTE(review): the ``mp_`` prefix suggests this runs in worker
        processes, in which case ``ctr`` is per-process -- confirm.
    """
    global ctr, tot
    left, right = args

    # Progress reporting, throttled to one line per 100 calls.
    ctr += 1
    if ctr % 100 == 0:
        print(ctr / tot * 100, os.getpid())

    # Sorting the two labels makes the key independent of argument order.
    labels = [
        left[data_util.KEY_SYMBOL].iloc[-1],
        right[data_util.KEY_SYMBOL].iloc[-1],
    ]
    labels.sort()
    pair = tuple(labels)
    return pair, correlation.get_correlation(left, right)
# Example #2
# 0
def _continuous_outlier_log_feature(values, name):
    """Build the log-compressed outlier feature of one continuous column.

    Rows lying more than four standard deviations away from the column mean
    receive ``sign * log(|value - mean| / (4 * std))``; every other row is 0.

    Returns:
        A one-column DataFrame named *name* (same index as *values*), or
        ``None`` when the column has no such outlier.
    """
    center = values.mean()
    reach = 4 * values.std()
    outliers = (values < (center - reach)) | (values > (center + reach))
    if not outliers.any():
        return None
    deviation = values - center
    sign = deviation / abs(deviation)
    # Initialise with a float so that writing the log values below does not
    # rely on an implicit int -> float upcast of the column.
    feature = pd.DataFrame(data=0.0, columns=[name], index=values.index)
    feature.loc[outliers, name] = sign.loc[outliers] * np.log(
        sign.loc[outliers] * deviation.loc[outliers] / reach)
    return feature


def _continuous_missing_flag_feature(values, name):
    """Build a 0/1 indicator of the missing rows of one column.

    Returns:
        A one-column DataFrame named *name* (same index as *values*), or
        ``None`` when the column has no missing value.
    """
    missing = values.isnull()
    if not missing.any():
        return None
    flag = pd.DataFrame(data=0, columns=[name], index=values.index)
    flag.loc[missing, name] = 1
    return flag


def _continuous_regression(E, Y, R_dico, dic, R_cont_y, R_Cramer_y, vprint):
    """Run the ``'regression'`` branch of ``treat_continuous_columns``."""
    table_columns = ['col_name', 'R2']
    forced_quant = pd.DataFrame(columns=table_columns)   # -> 'variables'
    forced_flags = pd.DataFrame(columns=table_columns)   # -> 'variables'
    kept_quant = pd.DataFrame(columns=table_columns)
    kept_flags = pd.DataFrame(columns=table_columns)
    dropped_quant = pd.DataFrame(columns=table_columns)

    observed = ~Y.isnull()

    # Iterate over a snapshot: E is extended / pruned inside the loop.
    for col in list(E.columns):
        forced = col in dic

        # 1) Log feature for extreme values.
        log_feature = _continuous_outlier_log_feature(E[col], 'log_' + col)
        if log_feature is not None:
            if forced:
                R = dic[col][1]
                table = forced_quant
            else:
                R = corr.get_R_continuous(log_feature.iloc[:, 0], Y, m=3)
                table = kept_quant if R > R_cont_y else None
            if table is not None:
                vprint('garde log_' + col + ' avec R2 = ' + str(round(R, 3)))
                table.loc[len(table)] = ['log_' + col, R]
                E = pd.concat([E, log_feature], axis=1)

        # 2) Missing-value indicator.
        flag = _continuous_missing_flag_feature(E[col], 'nan_' + col)
        if flag is not None:
            if forced:
                R = dic[col][1]
                table = forced_flags
            else:
                R, _ = corr.get_correlation(
                    flag.loc[observed, 'nan_' + col], Y.loc[observed],
                    seuil_cramer=1, seuil_corr=1)
                table = kept_flags if R > R_Cramer_y else None
            if table is not None:
                vprint('garde nan_' + col + ' avec R2 = ' + str(round(R, 3)))
                table.loc[len(table)] = ['nan_' + col, R]
                E = pd.concat([E, flag], axis=1)

        # 3) The column itself: always kept when forced, otherwise filtered.
        if forced:
            R = dic[col][1]
            vprint('garde ' + col + ' with R2 = ' + str(round(R, 3)))
            forced_quant.loc[len(forced_quant)] = [col, R]
            continue
        R = corr.get_R_continuous(E[col], Y, 3)
        if R > R_cont_y:
            vprint('garde ' + col + ' with R2 = ' + str(round(R, 3)))
            kept_quant.loc[len(kept_quant)] = [col, R]
        else:
            vprint('vire ' + col + ' with R2 = ' + str(round(R, 3)))
            E.drop(col, axis=1, inplace=True)
            dropped_quant.loc[len(dropped_quant)] = [col, R]

    R_dico['variables'] = pd.concat([R_dico['variables'], forced_quant], axis=0)
    R_dico['variables'] = pd.concat([R_dico['variables'], forced_flags], axis=0)
    R_dico['variables continues gardees'] = pd.concat(
        [R_dico['variables continues gardees'], kept_quant], axis=0)
    R_dico['variables continues jetees'] = pd.concat(
        [R_dico['variables continues jetees'], dropped_quant], axis=0)
    R_dico['variables discretes gardees'] = pd.concat(
        [R_dico['variables discretes gardees'], kept_flags], axis=0)
    return E, R_dico


def _continuous_cramer(E, Y, R_dico, dic, R_min, vprint):
    """Run the ``'Cramer'`` branch of ``treat_continuous_columns``."""
    G = pd.DataFrame(index=E.index)
    dropped = pd.DataFrame(columns=['col_name', 'R2'])
    kept = pd.DataFrame(columns=['col_name', 'R2'])
    observed = ~Y.isnull()

    for col in list(E.columns):
        # Cast the real, non-null entries to int, then the column to str.
        # `.loc` is used instead of chained `E[col][mask] = ...`, which is
        # not guaranteed to write back into E.
        numeric = E[col].apply(np.isreal) & ~E[col].isnull()
        E.loc[numeric, col] = E.loc[numeric, col].astype(int)
        E[col] = E[col].astype(str)

        if len(E.loc[observed, col].unique()) <= 1:
            vprint('vire ' + col + ' car valeurs constantes')
            dropped.loc[len(dropped)] = [col, 0]
            continue

        if col in dic:
            # Forced column: the score comes from `dic`, so the correlation
            # does not need to be computed at all.
            # NOTE(review): treat_discrete_columns logs forced columns under
            # 'variables'; here they go to 'variables continues gardees'.
            R = dic[col][1]
        else:
            R, _ = corr.get_correlation(E.loc[observed, col], Y.loc[observed], 1, 1)
            if R < R_min:
                vprint('vire ' + col + ' avec R2 = ' + str(round(R, 3)))
                dropped.loc[len(dropped)] = [col, R]
                continue

        vprint('garde ' + col + ' avec R2 = ' + str(round(R, 3)))
        kept.loc[len(kept)] = [col, R]
        # Encode each level by the rounded mean of Y over the rows holding it.
        level_to_mean = {
            level: round(Y.loc[E[col] == level].mean(), 0)
            for level in E[col].unique()
        }
        G = pd.concat([G, E[col].map(level_to_mean)], axis=1)

    dropped.sort_values(by='R2', inplace=True, ascending=False)
    kept.sort_values(by='R2', inplace=True, ascending=False)
    R_dico['variables continues gardees'] = pd.concat(
        [R_dico['variables continues gardees'], kept], axis=0)
    R_dico['variables continues jetees'] = pd.concat(
        [R_dico['variables continues jetees'], dropped], axis=0)
    return G, R_dico


def treat_continuous_columns(E, Y, R_dico, dic, method='regression', R_min=0.1,
                             R_cont_y=0.3, R_Cramer_y=0.25, verbose=False):
    """Select and derive features from the continuous columns of *E*.

    Args:
        E: DataFrame of continuous explanatory columns. It may be modified
            in place (columns dropped with ``method='regression'``, columns
            converted to strings with ``method='Cramer'``).
        Y: Target Series sharing the index of *E*.
        R_dico: Dict of bookkeeping DataFrames (columns ``col_name``/``R2``).
            The keys ``'variables'``, ``'variables continues gardees'``,
            ``'variables continues jetees'`` and
            ``'variables discretes gardees'`` are read and replaced for
            ``'regression'``; only the two ``'variables continues ...'`` keys
            for ``'Cramer'``.
        dic: Mapping of column name to a sequence whose item ``[1]`` is used
            as that column's score. Columns present in *dic* are always kept
            and are not scored again.
        method: ``'regression'`` or ``'Cramer'``.
        R_min: Minimum score for keeping a column (``'Cramer'`` only).
        R_cont_y: Score a column or its ``log_`` feature must exceed to be
            kept (``'regression'`` only).
        R_Cramer_y: Score a ``nan_`` indicator must exceed to be kept
            (``'regression'`` only).
        verbose: When truthy, print every keep/drop decision.

    Returns:
        ``(frame, R_dico)``. For ``'regression'`` the frame is *E* with
        rejected columns removed and the kept ``log_<col>`` (log-compressed
        outliers beyond four standard deviations) and ``nan_<col>`` (missing
        indicator) columns appended. For ``'Cramer'`` it is a new frame in
        which each kept column is replaced by the rounded mean of *Y* per
        level.

    Raises:
        ValueError: If *method* is neither ``'regression'`` nor ``'Cramer'``.

    Scores of columns absent from *dic* come from the project helpers
    ``corr.get_R_continuous`` / ``corr.get_correlation`` (presumably
    association measures with the target -- confirm against that module).
    """
    if verbose:
        def vprint(*args):
            # One line per argument, so callers need not pre-join them.
            for arg in args:
                print(arg)
    else:
        def vprint(*args):
            pass

    if method == 'regression':
        return _continuous_regression(E, Y, R_dico, dic, R_cont_y, R_Cramer_y, vprint)
    if method == 'Cramer':
        return _continuous_cramer(E, Y, R_dico, dic, R_min, vprint)
    raise ValueError('methode non reconnue')
# Example #3
# 0
def _discrete_dummy_table(T, col):
    """One-hot encode column *col* of *T*.

    The column is cast to ``str`` then to ``category`` (written back into
    *T*), and every dummy is named ``<col>_<level>``.
    """
    T[col] = T[col].astype(str)
    T[col] = T[col].astype('category')
    dummies = pd.get_dummies(T[col])
    dummies.columns = [col + '_' + str(level) for level in dummies.columns]
    return dummies


def _discrete_target_mean_codes(values, Y):
    """Replace every level of *values* by the rounded mean of *Y* over it."""
    level_to_mean = {
        level: round(Y.loc[values == level].mean(), 0)
        for level in values.unique()
    }
    return values.map(level_to_mean)


def treat_discrete_columns(T, Y, R_dico, dic, method='regression', R_min=0.1, verbose=False):
    """Select and encode the discrete columns of *T*.

    Args:
        T: DataFrame of discrete explanatory columns. It is modified in
            place: ``"."`` becomes ``'Na'`` in object columns, null entries
            are set to NaN, and every column is converted to strings
            (``'Cramer'``) or to a categorical of strings (``'regression'``).
        Y: Target Series sharing the index of *T*.
        R_dico: Dict of bookkeeping DataFrames (columns ``col_name``/``R2``);
            the keys ``'variables'``, ``'variables discretes gardees'`` and
            ``'variables discretes jetees'`` are read and replaced.
        dic: Mapping of column name to a sequence whose item ``[1]`` is used
            as that column's score. Columns present in *dic* are always kept
            and are not scored again.
        method: ``'regression'`` (one-hot dummies) or ``'Cramer'`` (each
            level replaced by the rounded mean of *Y*).
        R_min: Score below which a dummy / column is rejected.
        verbose: When truthy, print every keep/drop decision.

    Returns:
        ``(F, R_dico)`` where ``F`` holds the encoded columns that were kept.

    Raises:
        ValueError: If *method* is neither ``'regression'`` nor ``'Cramer'``.
            Raised before *T* is touched.

    Scores of columns absent from *dic* come from the project helper
    ``corr.get_correlation`` (presumably an association measure with the
    target -- confirm against that module).
    """
    if method not in ('regression', 'Cramer'):
        raise ValueError('methode non reconnue')

    if verbose:
        def vprint(*args):
            # One line per argument, so callers need not pre-join them.
            for arg in args:
                print(arg)
    else:
        def vprint(*args):
            pass

    # Normalise missing-value markers.
    for col in T.columns:
        if T.dtypes[col] == 'object':
            T.loc[T[col] == ".", col] = 'Na'
        T.loc[T[col].isnull(), col] = float('NaN')

    F = pd.DataFrame(index=T.index)
    drop_tb = pd.DataFrame(columns=['col_name', 'R2'])
    keep_tb = pd.DataFrame(columns=['col_name', 'R2'])
    keep_tb_1 = pd.DataFrame(columns=['col_name', 'R2'])   # forced via `dic`
    observed = ~Y.isnull()

    if method == 'regression':
        for col in list(T.columns):
            dummies = _discrete_dummy_table(T, col)

            if col in dic:
                # Forced column: every dummy stays in F; only those taking
                # exactly two values on the observed rows are logged.
                for dummy in dummies.columns:
                    if len(dummies.loc[observed, dummy].unique()) == 2:
                        R = dic[col][1]
                        vprint('keep ' + dummy + ' with R2 = ' + str(round(R, 3)))
                        keep_tb_1.loc[len(keep_tb_1)] = [dummy, R]
            else:
                rejected = []
                for dummy in dummies.columns:
                    # Scored once; the value is needed for both outcomes.
                    R, _ = corr.get_correlation(
                        dummies.loc[observed, dummy], Y.loc[observed],
                        seuil_cramer=1, seuil_corr=1)
                    two_valued = len(dummies.loc[observed, dummy].unique()) == 2
                    if two_valued and not R < R_min:
                        vprint('keep ' + dummy + ' with R2 = ' + str(round(R, 3)))
                        keep_tb.loc[len(keep_tb)] = [dummy, R]
                    else:
                        vprint('vire ' + dummy + ' with R2 = ' + str(round(R, 3)))
                        rejected.append(dummy)
                        drop_tb.loc[len(drop_tb)] = [dummy, R]
                dummies = dummies.drop(rejected, axis=1)

            F = pd.concat([F, dummies], axis=1)
    else:
        for col in list(T.columns):
            # Cast the real, non-null entries to int, then the column to str.
            # `.loc` is used instead of chained `T[col][mask] = ...`, which
            # is not guaranteed to write back into T.
            numeric = T[col].apply(np.isreal) & ~T[col].isnull()
            T.loc[numeric, col] = T.loc[numeric, col].astype(int)
            T[col] = T[col].astype(str)

            if len(T.loc[observed, col].unique()) <= 1:
                vprint('vire ' + col + ' car valeurs constantes')
                drop_tb.loc[len(drop_tb)] = [col, 0]
                continue

            if col in dic:
                # Forced column: score taken from `dic`, no need to compute
                # the correlation.
                R = dic[col][1]
                table = keep_tb_1
            else:
                R, _ = corr.get_correlation(T.loc[observed, col], Y.loc[observed], 1, 1)
                if R < R_min:
                    vprint('vire ' + col + ' avec R2 = ' + str(round(R, 3)))
                    drop_tb.loc[len(drop_tb)] = [col, R]
                    continue
                table = keep_tb

            vprint('garde ' + col + ' avec R2 = ' + str(round(R, 3)))
            table.loc[len(table)] = [col, R]
            F = pd.concat([F, _discrete_target_mean_codes(T[col], Y)], axis=1)

    drop_tb.sort_values(by='R2', inplace=True, ascending=False)
    keep_tb.sort_values(by='R2', inplace=True, ascending=False)
    R_dico['variables'] = pd.concat([R_dico['variables'], keep_tb_1], axis=0)
    R_dico['variables discretes gardees'] = pd.concat(
        [R_dico['variables discretes gardees'], keep_tb], axis=0)
    R_dico['variables discretes jetees'] = pd.concat(
        [R_dico['variables discretes jetees'], drop_tb], axis=0)
    return F, R_dico