def mp_get_corr(args):
    """Compute the correlation for one pair of frames and report progress.

    Args:
        args: A two-item sequence ``(df1, df2)``; each item is indexed with
            ``data_util.KEY_SYMBOL`` and passed to
            ``correlation.get_correlation``.

    Returns:
        A 2-tuple: the sorted pair of the last ``KEY_SYMBOL`` value of each
        frame, and the result of ``correlation.get_correlation(df1, df2)``.

    Side effects:
        Increments the module-level counter ``ctr`` and, on every 100th
        call, prints ``ctr / tot * 100`` together with the current process id.
    """
    global ctr, tot

    first, second = args

    ctr += 1
    if not ctr % 100:
        # Progress line: percentage of `tot` handled so far, plus the pid.
        print(ctr / tot * 100, os.getpid())

    last_symbols = (
        first[data_util.KEY_SYMBOL].iloc[-1],
        second[data_util.KEY_SYMBOL].iloc[-1],
    )
    # Sorting makes the key independent of the order of the two frames.
    pair_key = tuple(sorted(last_symbols))
    return pair_key, correlation.get_correlation(first, second)
def _continuous_new_table():
    """Return an empty bookkeeping table with columns ``col_name`` and ``R2``."""
    return pd.DataFrame(columns=['col_name', 'R2'])


def _continuous_record(table, name, score):
    """Append the row ``[name, score]`` to *table* in place.

    Rows are labelled 0, 1, 2, ... so ``len(table)`` is always the next label.
    """
    table.loc[len(table)] = [name, score]


def _continuous_merge(R_dico, key, table):
    """Append *table* below the table already stored under ``R_dico[key]``."""
    R_dico[key] = pd.concat([R_dico[key], table], axis=0)


def _continuous_outlier_log_feature(E, col):
    """Build the ``log_<col>`` feature, or return None when *col* has no outlier.

    A value is an outlier when it lies more than 4 standard deviations from
    the column mean. Outlier rows receive
    ``sign * log(|value - mean| / (4 * std))``; every other row is 0.
    """
    values = E[col]
    mean = values.mean()
    std = values.std()
    is_outlier = (values < (mean - 4 * std)) | (values > (mean + 4 * std))
    if not is_outlier.any():
        return None

    centered = values[is_outlier] - mean
    sign = centered / abs(centered)
    # Start from float zeros: the values written below are floats, and writing
    # floats into an integer column is a deprecated implicit upcast.
    feature = pd.Series(0.0, index=E.index, name='log_' + col)
    feature.loc[is_outlier] = (sign * np.log(sign * centered / (4 * std))).values
    return feature


def _continuous_missing_flag(E, col):
    """Build the 0/1 ``nan_<col>`` indicator, or return None when nothing is missing."""
    is_missing = E[col].isnull()
    if not is_missing.any():
        return None
    return is_missing.astype('int64').rename('nan_' + col)


def _continuous_target_encode(column, Y):
    """Replace each distinct value of *column* by the rounded mean of *Y* on its rows."""
    mapping = {
        value: round(Y.loc[column == value].mean(), 0)
        for value in column.unique()
    }
    return column.replace(mapping)


def _continuous_regression(E, Y, R_dico, dic, R_cont_y, R_Cramer_y, vprint):
    """Run the 'regression' method of ``treat_continuous_columns``."""
    forced = _continuous_new_table()        # columns listed in `dic` (+ their log_)
    forced_flags = _continuous_new_table()  # nan_ indicators of columns in `dic`
    kept = _continuous_new_table()          # screened columns that passed
    kept_flags = _continuous_new_table()    # screened nan_ indicators that passed
    dropped = _continuous_new_table()       # screened columns that failed
    y_known = ~Y.isnull()

    # Only the columns present on entry are visited; features appended to E
    # while looping are not themselves re-processed.
    for col in E.columns:
        if col in dic:
            # Columns listed in `dic` are kept unconditionally, with the
            # score stored at position 1 of their `dic` entry.
            R = dic[col][1]

            log_feature = _continuous_outlier_log_feature(E, col)
            if log_feature is not None:
                vprint('garde log_' + col + ' avec R2 = ' + str(round(R,3)))
                _continuous_record(forced, 'log_' + col, R)
                E = pd.concat([E, log_feature], axis=1)

            flag = _continuous_missing_flag(E, col)
            if flag is not None:
                vprint('garde nan_' + col + ' avec R2 = ' + str(round(R,3)))
                _continuous_record(forced_flags, 'nan_' + col, R)
                E = pd.concat([E, flag], axis=1)

            vprint('garde ' + col + ' with R2 = ' + str(round(R,3)))
            _continuous_record(forced, col, R)
            continue

        # Log feature: kept only if its score against Y exceeds R_cont_y.
        log_feature = _continuous_outlier_log_feature(E, col)
        if log_feature is not None:
            R = corr.get_R_continuous(log_feature, Y, m=3)
            if R > R_cont_y:
                vprint('garde log_' + col + ' avec R2 = ' + str(round(R,3)))
                _continuous_record(kept, 'log_' + col, R)
                E = pd.concat([E, log_feature], axis=1)

        # Missing-value indicator: scored on rows where Y is known, kept only
        # if the score exceeds R_Cramer_y.
        flag = _continuous_missing_flag(E, col)
        if flag is not None:
            R, _ = corr.get_correlation(flag.loc[y_known], Y.loc[y_known],
                                        seuil_cramer=1, seuil_corr=1)
            if R > R_Cramer_y:
                vprint('garde nan_' + col + ' avec R2 = ' + str(round(R,3)))
                _continuous_record(kept_flags, 'nan_' + col, R)
                E = pd.concat([E, flag], axis=1)

        # The column itself: keep or drop.
        R = corr.get_R_continuous(E[col], Y, 3)
        if R > R_cont_y:
            vprint('garde ' + col + ' with R2 = ' + str(round(R,3)))
            _continuous_record(kept, col, R)
        else:
            vprint('vire ' + col + ' with R2 = ' + str(round(R,3)))
            # NOTE: in-place drop, as in the original. If no feature has been
            # concatenated yet, E is still the caller's frame and is mutated.
            E.drop(col, axis=1, inplace=True)
            _continuous_record(dropped, col, R)

    _continuous_merge(R_dico, 'variables', forced)
    _continuous_merge(R_dico, 'variables', forced_flags)
    _continuous_merge(R_dico, 'variables continues gardees', kept)
    _continuous_merge(R_dico, 'variables continues jetees', dropped)
    _continuous_merge(R_dico, 'variables discretes gardees', kept_flags)
    return E, R_dico


def _continuous_cramer(E, Y, R_dico, dic, R_min, vprint):
    """Run the 'Cramer' method of ``treat_continuous_columns``."""
    G = pd.DataFrame(index=E.index)
    kept = _continuous_new_table()
    dropped = _continuous_new_table()
    y_known = ~Y.isnull()

    for col in E.columns:
        # Convert the column to strings; non-null real entries are cast to int
        # first. `.loc` replaces the original chained assignment
        # `E[col][index] = ...`, which writes to a temporary under pandas
        # Copy-on-Write. NOTE: this rewrites the caller's column in place.
        is_number = E[col].apply(np.isreal)
        is_number = is_number & ~E[col].isnull()
        E.loc[is_number, col] = E.loc[is_number, col].astype(int)
        E[col] = E[col].astype(str)

        observed = E.loc[y_known, col]
        if len(observed.unique()) > 1:
            R, _ = corr.get_correlation(observed, Y.loc[y_known], 1, 1)
            if col in dic:
                # Listed columns are always kept, with the score from `dic`.
                R = dic[col][1]
                keep = True
            else:
                # `not (R < R_min)` rather than `R >= R_min`: a NaN score is
                # kept, exactly as in the original if/elif/else chain.
                keep = not (R < R_min)

            if keep:
                vprint('garde ' + col + ' avec R2 = ' + str(round(R,3)))
                _continuous_record(kept, col, R)
                G = pd.concat([G, _continuous_target_encode(E[col], Y)], axis=1)
            else:
                vprint('vire ' + col + ' avec R2 = ' + str(round(R,3)))
                _continuous_record(dropped, col, R)
        else:
            # A single distinct value on the rows where Y is known.
            vprint('vire ' + col + ' car valeurs constantes')
            _continuous_record(dropped, col, 0)

    dropped.sort_values(by='R2', inplace=True, ascending=False)
    kept.sort_values(by='R2', inplace=True, ascending=False)
    _continuous_merge(R_dico, 'variables continues gardees', kept)
    _continuous_merge(R_dico, 'variables continues jetees', dropped)
    return G, R_dico


def treat_continuous_columns(E, Y, R_dico, dic, method = 'regression',R_min = 0.1, R_cont_y = 0.3, R_Cramer_y = 0.25, verbose = False):
    """Select and derive features from the columns of *E* against target *Y*.

    Args:
        E: DataFrame whose columns are the candidate variables.
        Y: Series holding the target, indexed like *E*.
        R_dico: dict of DataFrames with columns ``col_name`` / ``R2``. It is
            updated in place and returned. The 'regression' method reads and
            writes the keys 'variables', 'variables continues gardees',
            'variables continues jetees' and 'variables discretes gardees';
            the 'Cramer' method uses only the two 'variables continues ...'
            keys.
        dic: mapping from column name to an indexable entry. Columns present
            in *dic* are kept without screening and reported with the score
            ``dic[col][1]``.
        method: 'regression' or 'Cramer'.
        R_min: 'Cramer' only; columns scoring below it are dropped.
        R_cont_y: 'regression' only; a column or its ``log_`` feature is kept
            when ``corr.get_R_continuous`` exceeds it.
        R_Cramer_y: 'regression' only; a ``nan_`` indicator is kept when
            ``corr.get_correlation`` exceeds it.
        verbose: when truthy, each keep/drop decision is printed.

    Returns:
        ``(frame, R_dico)``. With 'regression', *frame* is *E* with rejected
        columns removed and any kept ``log_<col>`` (outlier) and
        ``nan_<col>`` (missing-value) features appended. With 'Cramer',
        *frame* is a new DataFrame holding each kept column with its values
        replaced by the rounded mean of *Y* for that value.

    Raises:
        ValueError: if *method* is neither 'regression' nor 'Cramer'.

    Note:
        The caller's *E* may be modified in place (column drops in
        'regression', string conversion in 'Cramer'); use the returned frame.
    """
    # The original verbose printer ended with a bare `print` expression, which
    # does nothing in Python 3; print(*args) emits one line per call.
    vprint = print if verbose else (lambda *args: None)

    if method == 'regression':
        return _continuous_regression(E, Y, R_dico, dic, R_cont_y, R_Cramer_y, vprint)
    if method == 'Cramer':
        return _continuous_cramer(E, Y, R_dico, dic, R_min, vprint)
    raise ValueError('methode non reconnue')
def _discrete_new_table():
    """Return an empty bookkeeping table with columns ``col_name`` and ``R2``."""
    return pd.DataFrame(columns=['col_name', 'R2'])


def _discrete_record(table, name, score):
    """Append the row ``[name, score]`` to *table* in place.

    Rows are labelled 0, 1, 2, ... so ``len(table)`` is always the next label.
    """
    table.loc[len(table)] = [name, score]


def _discrete_merge(R_dico, key, table):
    """Append *table* below the table already stored under ``R_dico[key]``."""
    R_dico[key] = pd.concat([R_dico[key], table], axis=0)


def _discrete_dummies(T, col):
    """Return the dummy-variable table of ``T[col]``, columns named ``<col>_<level>``.

    ``T[col]`` is converted in place to a categorical of strings first.
    """
    T[col] = T[col].astype(str).astype('category')
    dummies = pd.get_dummies(T[col])
    dummies.columns = [col + '_' + str(level) for level in dummies.columns]
    return dummies


def _discrete_target_encode(column, Y):
    """Replace each distinct value of *column* by the rounded mean of *Y* on its rows."""
    mapping = {
        value: round(Y.loc[column == value].mean(), 0)
        for value in column.unique()
    }
    return column.replace(mapping)


def _discrete_regression(T, Y, R_dico, dic, R_min, vprint):
    """Run the 'regression' method of ``treat_discrete_columns``."""
    F = pd.DataFrame(index=T.index)
    forced = _discrete_new_table()   # dummies of columns listed in `dic`
    kept = _discrete_new_table()     # screened dummies that passed
    dropped = _discrete_new_table()  # screened dummies that failed
    y_known = ~Y.isnull()

    for col in T.columns:
        dummies = _discrete_dummies(T, col)

        if col in dic:
            # Listed columns: every dummy stays in the output. Only those
            # taking two distinct values on the rows where Y is known are
            # reported, with the score stored in `dic`.
            for name in list(dummies.columns):
                if len(dummies.loc[y_known, name].unique()) == 2:
                    R = dic[col][1]
                    vprint('keep ' + name + ' with R2 = ' + str(round(R,3)))
                    _discrete_record(forced, name, R)
        else:
            for name in list(dummies.columns):
                observed = dummies.loc[y_known, name]
                # Evaluated once; the original repeated this identical call a
                # second time inside the two-value branch.
                R, _ = corr.get_correlation(observed, Y.loc[y_known],
                                            seuil_cramer=1, seuil_corr=1)
                # A dummy is kept when it takes two values where Y is known
                # and its score is not below R_min (`not <` keeps NaN scores,
                # as the original did).
                if len(observed.unique()) == 2 and not (R < R_min):
                    vprint('keep ' + name + ' with R2 = ' + str(round(R,3)))
                    _discrete_record(kept, name, R)
                else:
                    vprint('vire ' + name + ' with R2 = ' + str(round(R,3)))
                    dummies.drop(name, axis=1, inplace=True)
                    _discrete_record(dropped, name, R)

        # Append the surviving dummies of this column to the output.
        F = pd.concat([F, dummies], axis=1)

    dropped.sort_values(by='R2', inplace=True, ascending=False)
    kept.sort_values(by='R2', inplace=True, ascending=False)
    _discrete_merge(R_dico, 'variables', forced)
    _discrete_merge(R_dico, 'variables discretes gardees', kept)
    _discrete_merge(R_dico, 'variables discretes jetees', dropped)
    return F, R_dico


def _discrete_cramer(T, Y, R_dico, dic, R_min, vprint):
    """Run the 'Cramer' method of ``treat_discrete_columns``."""
    F = pd.DataFrame(index=T.index)
    forced = _discrete_new_table()
    kept = _discrete_new_table()
    dropped = _discrete_new_table()
    y_known = ~Y.isnull()

    for col in T.columns:
        # Convert the column to strings; non-null real entries are cast to int
        # first. `.loc` replaces the original chained assignment
        # `T[col][index] = ...`, which writes to a temporary under pandas
        # Copy-on-Write. NOTE: this rewrites the caller's column in place.
        is_number = T[col].apply(np.isreal)
        is_number = is_number & ~T[col].isnull()
        T.loc[is_number, col] = T.loc[is_number, col].astype(int)
        T[col] = T[col].astype(str)

        observed = T.loc[y_known, col]
        if len(observed.unique()) > 1:
            R, _ = corr.get_correlation(observed, Y.loc[y_known], 1, 1)
            if col in dic:
                # Listed columns are always kept, with the score from `dic`,
                # and reported in their own table.
                R = dic[col][1]
                target_table = forced
            elif R < R_min:
                target_table = None
            else:
                target_table = kept

            if target_table is None:
                vprint('vire ' + col + ' avec R2 = ' + str(round(R,3)))
                _discrete_record(dropped, col, R)
            else:
                vprint('garde ' + col + ' avec R2 = ' + str(round(R,3)))
                _discrete_record(target_table, col, R)
                F = pd.concat([F, _discrete_target_encode(T[col], Y)], axis=1)
        else:
            # A single distinct value on the rows where Y is known.
            vprint('vire ' + col + ' car valeurs constantes')
            _discrete_record(dropped, col, 0)

    dropped.sort_values(by='R2', inplace=True, ascending=False)
    kept.sort_values(by='R2', inplace=True, ascending=False)
    _discrete_merge(R_dico, 'variables', forced)
    _discrete_merge(R_dico, 'variables discretes gardees', kept)
    _discrete_merge(R_dico, 'variables discretes jetees', dropped)
    return F, R_dico


def treat_discrete_columns(T, Y, R_dico, dic, method = 'regression', R_min = 0.1, verbose = False):
    """Encode and select the discrete columns of *T* against target *Y*.

    Args:
        T: DataFrame whose columns are the candidate discrete variables.
        Y: Series holding the target, indexed like *T*.
        R_dico: dict of DataFrames with columns ``col_name`` / ``R2``. It is
            updated in place and returned; the keys read and written are
            'variables', 'variables discretes gardees' and
            'variables discretes jetees'.
        dic: mapping from column name to an indexable entry. Columns present
            in *dic* are kept without screening and reported under
            'variables' with the score ``dic[col][1]``.
        method: 'regression' (one dummy column per level) or 'Cramer' (one
            column per variable, each value replaced by the rounded mean of
            *Y* for that value).
        R_min: minimum ``corr.get_correlation`` score for a screened dummy
            or column to be kept.
        verbose: when truthy, each keep/drop decision is printed.

    Returns:
        ``(F, R_dico)`` where *F* is a new DataFrame, indexed like *T*, that
        holds the retained encoded columns.

    Raises:
        ValueError: if *method* is neither 'regression' nor 'Cramer'.

    Note:
        *T* is modified in place: in object columns "." becomes 'Na' and
        nulls become NaN, and processed columns are converted to strings
        (categorical strings for 'regression').
    """
    # The original verbose printer ended with a bare `print` expression, which
    # does nothing in Python 3; print(*args) emits one line per call.
    vprint = print if verbose else (lambda *args: None)

    # Normalise missing-value markers in object columns.
    for col in T.columns:
        if T.dtypes[col] == 'object':
            T.loc[T[col] == ".", col] = 'Na'
            T.loc[T[col].isnull(), col] = float('NaN')

    if method == 'regression':
        return _discrete_regression(T, Y, R_dico, dic, R_min, vprint)
    if method == 'Cramer':
        return _discrete_cramer(T, Y, R_dico, dic, R_min, vprint)
    raise ValueError('methode non reconnue')