import pandas as pd from Week6.ggikko_helper import get_file_abs_path mm1 = pd.read_csv(get_file_abs_path('202001_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm2 = pd.read_csv(get_file_abs_path('202002_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm3 = pd.read_csv(get_file_abs_path('202003_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm4 = pd.read_csv(get_file_abs_path('202004_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm5 = pd.read_csv(get_file_abs_path('202005_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm6 = pd.read_csv(get_file_abs_path('202006_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm7 = pd.read_csv(get_file_abs_path('202007_refined.csv')).drop_duplicates( ['사업장명'], keep='first') mm8 = pd.read_csv(get_file_abs_path('202008_refined.csv')).drop_duplicates( ['사업장명'], keep='first') print(mm8.shape) concat_df = pd.concat([mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8], axis=0) concat_df = concat_df[concat_df['사업장명'].notna()] groupby_df = concat_df.groupby(['사업장명']) new_df = pd.DataFrame({ '사업장명': groupby_df['사업장명'].first(), '광역시코드': groupby_df['광역시코드'].first(), '평균연봉': groupby_df['평균연봉'].mean(),
import pandas as pd import numpy as np from Week6.ggikko_helper import get_file_abs_path df = pd.read_csv(get_file_abs_path('concat_salary.csv')) print(df.head(10)) df['사업장명'] = df['사업장명'].str.strip() df['사업장명'].replace('', np.nan, inplace=True) df = df[df['사업장명'].notna()] df.to_csv('concat_salary2.csv', index=False)
import pandas as pd import numpy as np from pycaret.classification import * from sklearn.model_selection import train_test_split from Week6.ggikko_helper import get_file_abs_path # 사업장명,광역시코드,평균연봉,joinlossratio sample = pd.read_csv(get_file_abs_path('train.csv')) sample['joinlossratio'] = sample['가입대비상실'] print(sample.info()) print(sample.shape) sample = sample.loc[:, ['사업장명', '광역시코드', '평균연봉', 'joinlossratio', 'status']] sample.to_csv("train2.csv", index=False) # 사업장명,광역시코드,평균연봉,joinlossratio # 사업장명,가입자수,도로명주소,신규,상실,고지금액,평균월급,평균연봉,가입대비상실,status # 베인앤드컴퍼니코리아인크,181,서울특별시 중구 퇴계로,10,14,74173220,4553297,54639572,7.73,1 # 브레이브모바일,61,경기도 성남시 분당구 황새울로200번길,6,4,17825180,3246845,38962142,6.56,1 # 카카오페이,544,경기도 성남시 분당구 판교역로,12,12,214293900,4376917,52523014,2.21,1 # train_features = sample['사업장명', '도로명주소'] # train_labels = train_df['status'] # test_features = test_df.loc[:100, :] # # x_train, x_valid, y_train, y_valid = train_test_split(sample[feature], sample[label], test_size=0.2, shuffle=True, # random_state=30)
# refined_salary_data("202001.csv") # refined_salary_data("202002.csv") # refined_salary_data("202003.csv") # refined_salary_data("202004.csv") # refined_salary_data("202005.csv") # refined_salary_data("202006.csv") # refined_salary_data("202007.csv") # refined_salary_data("202008.csv") temp_list: list = [] # temp_list.append(pd.read_csv(get_file_abs_path('202001.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202002.csv'))) temp_list.append(pd.read_csv(get_file_abs_path('202001_refined.csv'))) temp_list.append(pd.read_csv(get_file_abs_path('202002_refined.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202003_refined.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202004_refined.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202005_refined.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202006_refined.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202007_refined.csv'))) # temp_list.append(pd.read_csv(get_file_abs_path('202008_refined.csv'))) # temp_list[0]['joinlossratio'] = temp_list[0]['joinlossratio'].add(temp_list[1]['joinlossratio'], fill_value=0) # print(temp_list[0]['joinlossratio']) root_df: DataFrame = temp_list[0] second_df: DataFrame = temp_list[1] # def hoho(company_name):