def move_to_second_valid(best_select=[], path='', rank=0, gain=0, key_list=[]):
    """Move every unselected ``../features/*.gz`` file to ``../features/no_use/``.

    The selection rule comes from ``sys.argv[3]`` in the form
    ``<type>_<number>``:

    * ``rank_<n>`` keeps rows of ``best_select`` with ``rank <= n``
    * ``gain_<n>`` keeps rows of ``best_select`` with ``importance >= n``

    When ``sys.argv[4]`` is given, only kept features whose name contains
    that substring remain selected.

    Args:
        best_select: Table with a ``feature`` column and a ``rank`` /
            ``importance`` column, queried via ``.query`` (presumably a
            pandas DataFrame). When empty it is loaded with
            ``pd.read_csv(path)``.
        path: CSV path used when ``best_select`` is empty. When ``''`` it
            falls back to ``sys.argv[2]``.
        rank: Not used by this function; kept for call compatibility.
        gain: Not used by this function; kept for call compatibility.
        key_list: Not used by this function; kept for call compatibility.

    Raises:
        ValueError: If ``sys.argv[3]`` is missing or is not of the form
            ``rank_<n>`` / ``gain_<n>``.
        SystemExit: If no feature survives the selection.
    """
    logger = logger_func()

    if len(best_select) == 0:
        try:
            if path == '':
                path = sys.argv[2]
        except IndexError:
            pass
        best_select = pd.read_csv(path)

    # Parse "<type>_<number>". Both parts start as None so that a missing or
    # malformed argument yields a clear error instead of an unbound local.
    select_type = None
    select_num = None
    try:
        select_list = sys.argv[3].split('_')
        select_type = select_list[0]
        # np.int was only an alias of the builtin and no longer exists in
        # current NumPy releases.
        select_num = int(select_list[1])
    except IndexError:
        pass

    if select_num is None or select_type not in ('rank', 'gain'):
        raise ValueError(
            "sys.argv[3] must look like 'rank_<n>' or 'gain_<n>'")

    if select_type == 'rank':
        condition = f"rank<={select_num}"
    else:
        condition = f"importance>={select_num}"
    best_feature = best_select.query(condition)['feature'].values

    # An absent argv[4] means "no substring filter": '' is in every string.
    keyword = sys.argv[4] if len(sys.argv) > 4 else ''
    best_feature = [col for col in best_feature if keyword in col]

    if len(best_feature) == 0:
        sys.exit()

    path_list = glob.glob('../features/*.gz')
    wanted = set(best_feature)
    select_path = set()
    for feature_path in path_list:
        filename = re.search(r'/([^/.]*).gz', feature_path).group(1)
        # Drop a leading 6-char / 5-char prefix (presumably "train_" and
        # "test_" -- confirm against the feature writer).
        # NOTE(review): both checks run in sequence, so a name such as
        # "train_test_x" is stripped twice; kept as in the original.
        if filename[:3] == 'tra':
            filename = filename[6:]
        if filename[:3] == 'tes':
            filename = filename[5:]
        if filename in wanted:
            select_path.add(feature_path)

    # Everything that is not a selected feature gets parked in no_use/.
    move_path = list(set(path_list) - select_path)
    for move in move_path:
        try:
            shutil.move(move, '../features/no_use/')
        except FileNotFoundError:
            # Log the file that failed, not a leftover loop variable.
            logger.info(f'FileNotFoundError: {move}')
        except shutil.Error:
            logger.info(f'Shutil Error: {move}')

    print(f'move to third_valid:{len(best_feature)}')
def move_to_use():
    """Move the features listed in a CSV from ``3_third_valid`` to ``win_path``.

    The CSV path is read from ``sys.argv[2]`` and must provide a ``feature``
    column. A file in ``../features/3_third_valid/`` belongs to a feature
    when its base name (the part captured before ``.gz``), with every
    ``stan_`` occurrence removed, equals the feature name.

    After the moves, every requested feature that was not moved is written
    to the logger under the heading ``Loss List:``.

    Raises:
        Whatever ``pd.read_csv`` raises when ``sys.argv[2]`` is missing (the
        path then defaults to ``''``) or unreadable.
    """
    try:
        path = sys.argv[2]
    except IndexError:
        path = ''
    best_select = pd.read_csv(path)
    best_feature = best_select['feature'].values

    # Resolve each candidate file to its comparable feature name once,
    # instead of re-running the regex for every (feature, file) pair.
    candidates = []
    for file_path in glob.glob('../features/3_third_valid/*'):
        matched = re.search(r'/([^/.]*).gz', file_path)
        if matched is None:
            continue  # name does not match the expected pattern
        candidates.append((file_path, matched.group(1).replace('stan_', '')))

    done_list = []
    for feature in best_feature:
        for file_path, name in candidates:
            if name != feature:
                continue
            try:
                shutil.move(file_path, win_path)
            except (shutil.Error, FileNotFoundError):
                # Best effort: a failed move is not recorded as done, so the
                # feature shows up in the loss list below.
                continue
            # Record the feature name, not the raw file name: the file may
            # carry a "stan_" prefix, which would never match the requested
            # names when the loss list is computed.
            done_list.append(feature)

    logger = logger_func()
    loss_list = set(best_feature) - set(done_list)
    logger.info(f"Loss List:")
    for loss in loss_list:
        logger.info(f"{loss}")
def move_to_second_valid(best_select=[], path='', rank=0, key_list=[]):
    """Move features ranked at or below a threshold from ``4_winner`` to ``second_path``.

    Rows of ``best_select`` with ``rank >= rank`` are selected, optionally
    narrowed to feature names containing the substring ``sys.argv[4]``. For
    each selected feature, every file in ``../features/4_winner/`` whose
    base name (the part captured before ``.gz``), with ``stan_`` removed,
    equals the feature name is moved to ``second_path``.

    Args:
        best_select: Table with ``feature`` and ``rank`` columns, queried
            via ``.query`` (presumably a pandas DataFrame). When empty it is
            loaded with ``pd.read_csv(path)``.
        path: CSV path used when ``best_select`` is empty. When ``''`` it
            falls back to ``sys.argv[2]``.
        rank: Rank threshold. When 0 it is read from ``sys.argv[3]`` if
            present.
        key_list: Not used by this function; kept for call compatibility.

    Raises:
        SystemExit: If no feature survives the selection.
    """
    logger = logger_func()

    if len(best_select) == 0:
        try:
            if path == '':
                path = sys.argv[2]
        except IndexError:
            pass
        best_select = pd.read_csv(path)

    try:
        if rank == 0:
            rank = int(sys.argv[3])
    except IndexError:
        pass

    best_feature = best_select.query(f"rank>={rank}")['feature'].values

    # An absent argv[4] means "no substring filter": '' is in every string.
    keyword = sys.argv[4] if len(sys.argv) > 4 else ''
    best_feature = [col for col in best_feature if keyword in col]

    if len(best_feature) == 0:
        sys.exit()

    # Index the winner directory by comparable feature name in one pass.
    # The glob is "*", so entries that do not match the expected pattern can
    # appear; skip them instead of failing on a None match.
    paths_by_feature = {}
    for file_path in glob.glob('../features/4_winner/*'):
        matched = re.search(r'/([^/.]*).gz', file_path)
        if matched is None:
            continue
        name = matched.group(1).replace('stan_', '')
        paths_by_feature.setdefault(name, []).append(file_path)

    for feature in best_feature:
        for move in paths_by_feature.get(feature, []):
            try:
                shutil.move(move, second_path)
            except FileNotFoundError:
                logger.info(f'FileNotFoundError: {feature}')
            except shutil.Error:
                logger.info(f'Shutil Error: {feature}')

    print(f'move to third_valid:{len(best_feature)}')
def move_to_use():
    """Move files matching the features listed in a CSV into ``win_path``.

    The CSV path comes from ``sys.argv[2]`` (``''`` when absent) and must
    provide a ``feature`` column. Candidate files are gathered from
    ``../season1_features/3_third_valid/``, ``../features/5_tmp/`` and
    ``win_path``. A file is a match when its path, with ``.0`` rewritten to
    ``_0``, contains both ``feature[:7]`` and ``feature[9:]``.

    Moved files are recorded by base name with a fixed-length prefix removed
    (14 characters when the name contains ``train``, 13 when it contains
    ``test``). Requested features, with their first 8 characters removed,
    that are absent from that record are logged under ``Loss List:``.
    """
    try:
        path = sys.argv[2]
    except IndexError:
        path = ''
    best_select = pd.read_csv(path)
    best_feature = best_select['feature'].values

    win_list = glob.glob(win_path + '*')
    # The first/second valid listings are collected but not searched.
    first_list = glob.glob('../season1_features/1_first_valid/*')
    second_list = glob.glob('../season1_features/2_second_valid/*')
    third_list = glob.glob('../season1_features/3_third_valid/*')
    tmp_list = glob.glob('../features/5_tmp/*')
    path_list = third_list + tmp_list + win_list

    done_list = []
    for feature in best_feature:
        for path in path_list:
            normalized = path.replace('.0', '_0')
            if feature[:7] not in normalized:
                continue
            if feature[9:] not in normalized:
                continue
            try:
                shutil.move(path, win_path)
                filename = re.search(r'/([^/.]*).gz', normalized).group(1)
                if 'train' in filename:
                    done_list.append(filename[14:])
                elif 'test' in filename:
                    done_list.append(filename[13:])
            except (shutil.Error, FileNotFoundError):
                # Failed moves are skipped silently.
                pass

    logger = logger_func()
    requested = {name[8:] for name in best_feature}
    loss_list = requested - set(done_list)
    logger.info(f"Loss List:")
    for loss in loss_list:
        logger.info(f"{loss}")
def move_to_second_valid(best_select=[], path='', rank=0, key_list=[]):
    """Move features ranked at or below a threshold from ``4_winner`` to ``second_path``.

    Rows of ``best_select`` with ``rank >= rank`` are selected, optionally
    narrowed to names containing ``sys.argv[4]``. For each selected feature
    not present in ``ignore_list``, every path under
    ``../season1_features/4_winner/`` containing both ``feature[:7]`` and
    ``feature[9:]`` is moved to ``second_path``.

    Args:
        best_select: Table with ``feature`` and ``rank`` columns; loaded
            with ``pd.read_csv(path)`` when empty.
        path: CSV path; ``''`` falls back to ``sys.argv[2]``.
        rank: Rank threshold; 0 means "read it from ``sys.argv[3]``".
        key_list: Not used by this function.

    Raises:
        SystemExit: If no feature is selected.
    """
    logger = logger_func()

    if len(best_select) == 0:
        try:
            if path == '':
                path = sys.argv[2]
        except IndexError:
            pass
        best_select = pd.read_csv(path)

    if rank == 0:
        try:
            rank = int(sys.argv[3])
        except IndexError:
            pass

    ranked = best_select.query(f"rank>={rank}")['feature'].values

    # Without argv[4] the keyword is '', which every name contains.
    try:
        keyword = sys.argv[4]
    except IndexError:
        keyword = ''
    best_feature = [col for col in ranked if col.count(keyword)]

    if not best_feature:
        sys.exit()

    path_list = glob.glob('../season1_features/4_winner/*')
    for feature in best_feature:
        move_path = []
        for candidate in path_list:
            if not candidate.count(feature[:7]):
                continue
            if not candidate.count(feature[9:]):
                continue
            if feature in ignore_list:
                continue
            move_path.append(candidate)

        for move in move_path:
            try:
                shutil.move(move, second_path)
            except FileNotFoundError:
                print(f'FileNotFound. : {feature}.gz')
            except shutil.Error:
                logger.info(f'Shutil Error: {feature}')

    print(f'move to third_valid:{len(best_feature)}')
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder

#========================================================================
# original library
# NOTE(review): `os` and `sys` are used below but not imported in this
# span -- presumably imported earlier in the file; confirm.
HOME = os.path.expanduser('~')
# Make the project-local library and model directories importable.
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
sys.path.append(f"{HOME}/kaggle/data_analysis/model/")
import MS_utils
from params_MS import params_lgb
import utils, ml_utils
from utils import logger_func
# Create the logger unless a truthy `logger` already exists; the NameError
# branch covers the case where `logger` has not been defined yet.
try:
    if not logger:
        logger = logger_func()
except NameError:
    logger = logger_func()
#========================================================================

# Command-line arguments expected by this script (bare string, not a docstring).
""" argv[1]: comment argv[2]: feature_key argv[3]: group """

# Columns
# The first base_path is overwritten on the next line; only the second
# pattern is used.
base_path = '../input/base_exclude*'
base_path = '../input/base_Av*'
key, target, ignore_list = MS_utils.get_basic_var()
# The ignore_list returned above is discarded in favour of this explicit list.
ignore_list = [key, target, 'country_group', 'down_flg']
# Load the base table and keep only the key, target and country_group columns.
base = utils.read_df_pkl(base_path)[[key, target, 'country_group']]