Beispiel #1
0
def move_to_second_valid(best_select=[], path='', rank=0, gain=0, key_list=[]):
    """Move every ``../features/*.gz`` file that is not selected to ``no_use``.

    The function only acts when ``best_select`` is empty; with a non-empty
    ``best_select`` it returns without touching anything.

    Command-line arguments read from ``sys.argv``:
        argv[2]: CSV path, used when ``path`` is ``''``.
        argv[3]: selection spec ``"<type>_<int>"``. ``rank`` keeps rows with
            ``rank <= int``; ``gain`` keeps rows with ``importance >= int``.
        argv[4]: optional substring; only features containing it are kept.

    Args:
        best_select: Existing selection; only its length is inspected.
        path: CSV file providing a ``feature`` column plus the ``rank`` or
            ``importance`` column used by the selection spec.
        rank: Not used by this function.
        gain: Not used by this function.
        key_list: Not used by this function.

    Raises:
        ValueError: If argv[3] is missing, malformed, or names a selection
            type other than ``rank`` / ``gain``.
        SystemExit: If no feature survives the filters.
    """
    logger = logger_func()
    if len(best_select) != 0:
        return

    if path == '':
        try:
            path = sys.argv[2]
        except IndexError:
            # No CSV path on the command line either; path stays ''.
            pass
    best_select = pd.read_csv(path)

    # Parse "<type>_<int>". Failing here with a clear message replaces the
    # NameError that used to surface later when the spec was absent.
    try:
        spec = sys.argv[3].split('_')
        select_type = spec[0]
        # Builtin int: the ``np.int`` alias no longer exists in NumPy >= 1.24.
        select_num = int(spec[1])
    except (IndexError, ValueError) as err:
        raise ValueError(
            "sys.argv[3] must look like 'rank_<int>' or 'gain_<int>'"
        ) from err

    if select_type == 'rank':
        selected = best_select.query(f"rank<={select_num}")
    elif select_type == 'gain':
        selected = best_select.query(f"importance>={select_num}")
    else:
        raise ValueError(f"unknown selection type in sys.argv[3]: {select_type!r}")
    best_feature = selected['feature'].values

    try:
        keyword = sys.argv[4]
    except IndexError:
        # str.count('') is always >= 1, so an empty keyword keeps everything.
        keyword = ''
    best_feature = [col for col in best_feature if col.count(keyword)]

    if len(best_feature) == 0:
        sys.exit()

    path_list = glob.glob('../features/*.gz')

    # Parse each file name once and test membership against a set instead of
    # re-running the regex for every (feature, path) pair.
    wanted = set(best_feature)
    select_path = set()
    for gz_path in path_list:
        # .group(1) assumes the pattern matches every globbed path.
        filename = re.search(r'/([^/.]*).gz', gz_path).group(1)
        # Drop 6 leading characters for names starting with 'tra' and 5 for
        # names starting with 'tes' (presumably 'train_' / 'test_' prefixes).
        # Both checks run in sequence, exactly as before.
        if filename[:3] == 'tra':
            filename = filename[6:]
        if filename[:3] == 'tes':
            filename = filename[5:]
        if filename in wanted:
            select_path.add(gz_path)

    for move in set(path_list) - select_path:
        try:
            shutil.move(move, '../features/no_use/')
        except FileNotFoundError:
            # Report the file that failed, not a leftover loop variable.
            logger.info(f'FileNotFoundError: {move}')
        except shutil.Error:
            logger.info(f'Shutil Error: {move}')
    print(f'move to third_valid:{len(best_feature)}')
Beispiel #2
0
def move_to_use():
    """Move the feature files listed in a CSV from ``3_third_valid`` to ``win_path``.

    The CSV path comes from ``sys.argv[2]`` (``''`` when absent) and must have
    a ``feature`` column. A file under ``../features/3_third_valid/`` is moved
    when its base name, as captured by the ``.gz`` pattern and with every
    ``'stan_'`` removed, equals a listed feature. Entries the pattern does
    not match are skipped.

    Features for which no file was moved are logged under "Loss List:".
    ``win_path`` and ``logger_func`` are module-level names defined outside
    this function.
    """
    try:
        path = sys.argv[2]
    except IndexError:
        path = ''
    best_select = pd.read_csv(path)
    best_feature = best_select['feature'].values

    logger = logger_func()

    # Index candidate files by normalized base name once, instead of running
    # the regex for every (feature, path) pair.
    paths_by_name = {}
    for candidate in glob.glob('../features/3_third_valid/*'):
        match = re.search(r'/([^/.]*).gz', candidate)
        if match is None:
            continue
        name = match.group(1).replace('stan_', '')
        paths_by_name.setdefault(name, []).append(candidate)

    done = set()
    for feature in best_feature:
        for candidate in paths_by_name.get(feature, ()):
            try:
                shutil.move(candidate, win_path)
            except shutil.Error:
                logger.info(f'Shutil Error: {feature}')
            except FileNotFoundError:
                logger.info(f'FileNotFoundError: {feature}')
            else:
                # Record the feature name itself: the loss list is computed
                # against feature names, so storing the raw file name
                # (possibly carrying 'stan_') would misreport a moved feature.
                done.add(feature)

    loss_list = set(best_feature) - done
    logger.info("Loss List:")
    for loss in loss_list:
        logger.info(f"{loss}")
Beispiel #3
0
def move_to_second_valid(best_select=[], path='', rank=0, key_list=[]):
    """Move winner files of features with ``rank >= rank`` to ``second_path``.

    The function only acts when ``best_select`` is empty; with a non-empty
    ``best_select`` it returns without touching anything.

    Command-line arguments read from ``sys.argv``:
        argv[2]: CSV path, used when ``path`` is ``''``.
        argv[3]: integer rank threshold, used when ``rank`` is ``0``.
        argv[4]: optional substring; only features containing it are kept.

    Args:
        best_select: Existing selection; only its length is inspected.
        path: CSV file with ``feature`` and ``rank`` columns.
        rank: Threshold; rows with ``rank >= rank`` are selected.
        key_list: Not used by this function.

    Raises:
        SystemExit: If no feature survives the filters.

    ``second_path`` and ``logger_func`` are module-level names defined
    outside this function.
    """
    logger = logger_func()
    if len(best_select) != 0:
        return

    if path == '':
        try:
            path = sys.argv[2]
        except IndexError:
            # No CSV path on the command line either; path stays ''.
            pass
    best_select = pd.read_csv(path)

    if rank == 0:
        try:
            rank = int(sys.argv[3])
        except IndexError:
            # No threshold given: rank stays 0.
            pass
    best_feature = best_select.query(f"rank>={rank}")['feature'].values

    try:
        keyword = sys.argv[4]
    except IndexError:
        # str.count('') is always >= 1, so an empty keyword keeps everything.
        keyword = ''
    best_feature = [col for col in best_feature if col.count(keyword)]

    if len(best_feature) == 0:
        sys.exit()

    # Index the directory once by normalized base name. The glob is '*', so
    # entries the '.gz' pattern does not match are skipped rather than
    # aborting the whole run with AttributeError on a None match.
    paths_by_name = {}
    for candidate in glob.glob('../features/4_winner/*'):
        match = re.search(r'/([^/.]*).gz', candidate)
        if match is None:
            continue
        name = match.group(1).replace('stan_', '')
        paths_by_name.setdefault(name, []).append(candidate)

    for feature in best_feature:
        for move in paths_by_name.get(feature, ()):
            try:
                shutil.move(move, second_path)
            except FileNotFoundError:
                logger.info(f'FileNotFoundError: {feature}')
            except shutil.Error:
                logger.info(f'Shutil Error: {feature}')
    print(f'move to third_valid:{len(best_feature)}')
def move_to_use():
    """Move files matching the CSV's features into ``win_path`` and log misses.

    The CSV path is ``sys.argv[2]`` (``''`` when absent); its ``feature``
    column drives the search. A path is a match when, after replacing
    ``'.0'`` with ``'_0'``, it contains both ``feature[:7]`` and
    ``feature[9:]``. After a successful move the captured base name is
    recorded without its first 14 characters (names containing ``'train'``)
    or first 13 characters (names containing ``'test'``).

    Finally every ``feature[8:]`` that was not recorded is logged under
    "Loss List:". ``win_path`` and ``logger_func`` come from module scope.
    """
    try:
        path = sys.argv[2]
    except IndexError:
        path = ''
    best_feature = pd.read_csv(path)['feature'].values

    win_list = glob.glob(win_path + '*')
    # The first/second globs are evaluated but do not feed path_list.
    first_list = glob.glob('../season1_features/1_first_valid/*')
    second_list = glob.glob('../season1_features/2_second_valid/*')
    third_list = glob.glob('../season1_features/3_third_valid/*')
    tmp_list = glob.glob('../features/5_tmp/*')
    path_list = third_list + tmp_list + win_list

    done_list = []
    for feature in best_feature:
        for candidate in path_list:
            normalized = candidate.replace('.0', '_0')
            if feature[:7] not in normalized or feature[9:] not in normalized:
                continue

            try:
                shutil.move(candidate, win_path)
            except (shutil.Error, FileNotFoundError):
                # Failed moves are ignored and leave nothing recorded.
                continue

            # Parsed only after the move succeeded, from the normalized path.
            filename = re.search(r'/([^/.]*).gz', normalized).group(1)
            if 'train' in filename:
                done_list.append(filename[14:])
            elif 'test' in filename:
                done_list.append(filename[13:])

    logger = logger_func()
    # Compare on the feature name with its first 8 characters dropped.
    best_feature = [name[8:] for name in best_feature]

    loss_list = set(best_feature) - set(done_list)
    logger.info("Loss List:")
    for loss in loss_list:
        logger.info(f"{loss}")
def move_to_second_valid(best_select=[], path='', rank=0, key_list=[]):
    """Move season-1 winner files of features with ``rank >= rank`` to ``second_path``.

    Nothing happens unless ``best_select`` is empty. In that case:

    * the CSV at ``path`` (``sys.argv[2]`` when ``path`` is ``''``) is read;
    * rows with ``rank >= rank`` are selected (``sys.argv[3]`` supplies the
      threshold when ``rank`` is ``0``);
    * features are optionally narrowed to those containing ``sys.argv[4]``;
    * each ``../season1_features/4_winner/*`` path that contains both
      ``feature[:7]`` and ``feature[9:]`` is moved to ``second_path``,
      unless the feature is in ``ignore_list``.

    ``key_list`` is not used. ``second_path``, ``ignore_list`` and
    ``logger_func`` come from module scope. Exits via ``sys.exit()`` when no
    feature is left after filtering.
    """
    logger = logger_func()

    if len(best_select) != 0:
        return

    if path == '':
        try:
            path = sys.argv[2]
        except IndexError:
            pass
    best_select = pd.read_csv(path)

    if rank == 0:
        try:
            rank = int(sys.argv[3])
        except IndexError:
            pass
    candidates = best_select.query(f"rank>={rank}")['feature'].values

    try:
        keyword = sys.argv[4]
    except IndexError:
        # An empty keyword lets every feature through (count('') >= 1).
        keyword = ''
    best_feature = [name for name in candidates if name.count(keyword)]

    if not best_feature:
        sys.exit()

    path_list = glob.glob('../season1_features/4_winner/*')
    for feature in best_feature:
        head = feature[:7]
        tail = feature[9:]

        # Collect all matches first, then move them.
        move_path = []
        for candidate in path_list:
            if (head in candidate and tail in candidate
                    and feature not in ignore_list):
                move_path.append(candidate)

        for move in move_path:
            try:
                shutil.move(move, second_path)
            except FileNotFoundError:
                print(f'FileNotFound. : {feature}.gz')
            except shutil.Error:
                logger.info(f'Shutil Error: {feature}')
    print(f'move to third_valid:{len(best_feature)}')
Beispiel #6
0
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder

#========================================================================
# original library
# Put the project-local `library/` and `model/` directories (under the
# user's home directory) on sys.path so the imports below can resolve.
# NOTE(review): `os` and `sys` are not imported in the lines visible here;
# presumably they are imported earlier in the file — confirm.
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
sys.path.append(f"{HOME}/kaggle/data_analysis/model/")
import MS_utils
from params_MS import params_lgb
import utils, ml_utils
from utils import logger_func
# Keep an already-defined, truthy `logger`; create one with logger_func()
# when the name is undefined (NameError) or its value is falsy.
try:
    if not logger:
        logger = logger_func()
except NameError:
    logger = logger_func()
#========================================================================
"""
argv[1]: comment
argv[2]: feature_key
argv[3]: group
"""

# Columns
# The first pattern is overwritten immediately; only '../input/base_Av*'
# reaches read_df_pkl below.
base_path = '../input/base_exclude*'
base_path = '../input/base_Av*'
# The `ignore_list` returned here is replaced on the next line; only `key`
# and `target` are kept from this call.
key, target, ignore_list = MS_utils.get_basic_var()
ignore_list = [key, target, 'country_group', 'down_flg']
# Read the data matching base_path and keep the key, target and
# 'country_group' columns (presumably a DataFrame — verify read_df_pkl).
base = utils.read_df_pkl(base_path)[[key, target, 'country_group']]