Example #1
def main():

    logger.debug('config: {}'.format(options.config))
    logger.debug(feats)
    logger.debug(model_params)
    # load data for the specified features
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    cols = X_train_all.columns

    # stacking
    if "stacking" in config and config["stacking"] == True:
        oof_df, test_df = stack_load_df(config["stacking_name"])
        X_train_all = pd.concat([X_train_all, oof_df], axis=1)
        X_test = pd.concat([X_test, test_df], axis=1)

    if (model_name != "lightgbm") or ("sampling" in config):
        logger.debug("rank gauss")
        scaler = QuantileTransformer(n_quantiles=100,
                                     random_state=model_params["seed"],
                                     output_distribution="normal")
        all_df = pd.concat([X_train_all, X_test])
        all_df = all_df.fillna(all_df.median())  # fill missing values
        all_df[cols] = scaler.fit_transform(all_df[cols])  # scale
        X_train_all = all_df[:X_train_all.shape[0]].reset_index(drop=True)
        X_test = all_df[X_train_all.shape[0]:].reset_index(drop=True)

    logger.debug("X_train_all shape: {}".format(X_train_all.shape))
    X_train_all.info()  # .info() prints directly and returns None

    # loop over seeds
    class_cols = list(range(model_params["num_class"]))
    oof_df = pd.DataFrame(index=range(X_train_all.shape[0]),
                          columns=class_cols)
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    oof_df[class_cols] = 0
    sub[target_name] = 0
    for seed_num in range(config["seed_num"]):
        logger.debug(f"SEED: {seed_num}")
        one_oof_df, one_sub = train_and_predict(X_train_all,
                                                y_train_all,
                                                X_test,
                                                seed_num=seed_num)
        oof_df[class_cols] += one_oof_df[class_cols] / config["seed_num"]
        sub[target_name] += one_sub[target_name] / config["seed_num"]

    auc_score = evaluate_score(y_train_all.values, oof_df.values[:, 1], "auc")
    acc_score = evaluate_score(y_train_all.values,
                               oof_df.values.argmax(axis=1), "acc")
    logloss_score = evaluate_score(y_train_all.values, oof_df.values[:, 1],
                                   "logloss")
    logger.debug('=== OOF CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    sub = sub.rename(columns={ID_name: 'Id', target_name: "label"})
    oof_df.to_csv(f'./data/output/oof_{config_filename}.csv', index=False)
    sub.to_csv(f'./data/output/sub_{config_filename}.csv', index=False)
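A note on the "rank gauss" branch above: QuantileTransformer with output_distribution="normal" maps each feature through its empirical CDF onto a Gaussian, which is why train and test are concatenated first so both share one quantile mapping. A minimal self-contained sketch of the same pattern (synthetic data; the column name is hypothetical):

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
train = pd.DataFrame({'f0': rng.exponential(size=100)})
test = pd.DataFrame({'f0': rng.exponential(size=50)})

# fit one mapping on train + test together, then split back by row count
all_df = pd.concat([train, test])
scaler = QuantileTransformer(n_quantiles=100, random_state=0,
                             output_distribution='normal')
all_df[['f0']] = scaler.fit_transform(all_df[['f0']])
train = all_df[:train.shape[0]].reset_index(drop=True)
test = all_df[train.shape[0]:].reset_index(drop=True)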
Example #2
def main():
    logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

    logging.debug('config: {}'.format(options.config))
    logging.debug(feats)
    logging.debug(params)

    # load data for the specified features
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    logging.debug("X_train_all shape: {}".format(X_train_all.shape))

    stacking(X_train_all, y_train_all, X_test)
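load_datasets and load_target are project helpers that never appear in these excerpts. A plausible minimal implementation, assuming one feather file per feature (the directory layout and file naming are assumptions):

import pandas as pd

def load_datasets(feats):
    # assumption: features are stored as features/<name>_train.feather
    # and features/<name>_test.feather, one block per file
    train_dfs = [pd.read_feather(f'features/{f}_train.feather') for f in feats]
    test_dfs = [pd.read_feather(f'features/{f}_test.feather') for f in feats]
    return pd.concat(train_dfs, axis=1), pd.concat(test_dfs, axis=1)

def load_target(target_name):
    # assumption: the raw training table holds the target column
    return pd.read_feather('data/input/train.feather')[target_name]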
Example #3
def main():
    logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

    logging.debug('config: {}'.format(options.config))
    logging.debug(feats)
    logging.debug(params)

    # load data for the specified features
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    logging.debug("X_train_all shape: {}".format(X_train_all.shape))
    if config['model'] == 'LightGBM':
        train_and_predict_lightgbm(X_train_all, y_train_all, X_test)
    elif config['model'] in ['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet',
                             'KernelRidge', 'SVR', 'XGBoost', 'RandomForest',
                             'GradientBoosting', 'CatBoost']:
        train_and_predict_linear(X_train_all, y_train_all, X_test)
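The if/elif dispatch above has to grow with every new model family. A table-driven equivalent (sketch, reusing the same two train_and_predict_* functions):

DISPATCH = {'LightGBM': train_and_predict_lightgbm}
for name in ['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet', 'KernelRidge',
             'SVR', 'XGBoost', 'RandomForest', 'GradientBoosting', 'CatBoost']:
    DISPATCH[name] = train_and_predict_linear

DISPATCH[config['model']](X_train_all, y_train_all, X_test)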
Example #4
gc.enable()

from collections import OrderedDict

parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/config001.debug.json')
parser.add_argument('--debug', default='False')  # note: the value stays a string ('False' is truthy)
parser.add_argument('--tuning', action='store_true')
options = parser.parse_args()

with open(options.config, "r") as fp:
    conf = json.load(fp, object_pairs_hook=OrderedDict)

trn, tst = load_feature_sets(options.config)
target = utils.load_target()

trn = pd.concat([target, trn], axis=1)
trn['target_outlier'] = 0
trn.loc[(trn.target < -30), 'target_outlier'] = 1
trn.drop(columns=['target'], inplace=True)


def get_feature_importances(trn, y, shuffle, seed=None):
    # Gather real features

    if shuffle:
        # Here you could as well use a binomial distribution
        y = y.copy().sample(frac=1.0)

    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
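get_feature_importances is cut off above. A plausible completion following the standard null-importances recipe (fit LightGBM in random-forest mode on the real and on the shuffled target, then compare importances); the parameter values are assumptions:

import lightgbm as lgb
import pandas as pd

def get_feature_importances(trn, y, shuffle, seed=None):
    if shuffle:
        # shuffling the target yields the null-importance distribution
        y = y.copy().sample(frac=1.0, random_state=seed).reset_index(drop=True)

    dtrain = lgb.Dataset(trn, label=y, free_raw_data=False)
    params = {
        'objective': 'binary',
        'boosting_type': 'rf',    # LightGBM in random-forest mode
        'bagging_fraction': 0.8,  # rf mode requires bagging
        'bagging_freq': 1,
        'feature_fraction': 0.8,
        'seed': seed,
        'verbosity': -1,
    }
    clf = lgb.train(params, dtrain, num_boost_round=200)

    imp = pd.DataFrame({'feature': list(trn.columns)})
    imp['importance_gain'] = clf.feature_importance(importance_type='gain')
    imp['importance_split'] = clf.feature_importance(importance_type='split')
    return imp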
Example #5
files_tr = []
for pref in PREFS:
    files_tr += glob(f'../data/train_{pref}*.pkl')

files_te = [f'../feature/test_{c}.pkl' for c in COL]
sw = False
for i in files_te:
    if not os.path.exists(i):
        print(i)
        sw = True
if sw:
    raise Exception('missing test feature files')

X = pd.concat([pd.read_pickle(f) for f in tqdm(files_tr, mininterval=60)],
              axis=1)[COL]
y = utils.load_target().target

#X.drop(DROP, axis=1, inplace=True)

target_dict = {}
target_dict_r = {}
for i, e in enumerate(y.sort_values().unique()):
    target_dict[e] = i
    target_dict_r[i] = e

y = y.replace(target_dict)

if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')
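The target_dict loop above is a hand-rolled dense label encoding. pd.factorize with sort=True produces the same mapping in one call (sketch):

codes, uniques = pd.factorize(y, sort=True)  # codes run 0..n_classes-1 by sorted value
target_dict = {v: i for i, v in enumerate(uniques)}
target_dict_r = dict(enumerate(uniques))
y = pd.Series(codes, index=y.index, name=y.name)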
Example #6
options = parser.parse_args()
with open(options.config) as f:
    config = json.load(f)

now = datetime.datetime.now()
logging.basicConfig(
    filename='./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now), level=logging.DEBUG
)
logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

feats = config['features']
logging.debug(feats)

target_name = config['target_name']

X_train_all, X_test = load_datasets(feats, target_name)
y_train_all = load_target(target_name)
logging.debug(X_train_all.shape)

y_preds = []
models = []

lgbm_params = config['lgbm_params']

kf = KFold(n_splits=10, shuffle=True, random_state=0)
for train_index, valid_index in kf.split(X_train_all):
    X_train, X_valid = (
        X_train_all.iloc[train_index, :], X_train_all.iloc[valid_index, :]
    )
    y_train, y_valid = (
        y_train_all.iloc[train_index], y_train_all.iloc[valid_index]
    )

    # run LightGBM
Example #7
# keep only the files whose name contains one of the USE_PREF prefixes
files_tr = [f for f in files_tr if any(pref in f for pref in USE_PREF)]

for i, f in enumerate(files_tr):
    print(i, f)

X_train = pd.concat(
    [pd.read_feather(f) for f in tqdm(files_tr, mininterval=30)] +
    [joblib.load('../external/X_train_nejumi.pkl.gz')],
    axis=1)

y_train = utils.load_target()['HasDetections']

# drop
if len(col_drop) > 0:
    X_train.drop(col_drop, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()

CAT = list(set(X_train.columns) & set(utils_cat.ALL))
print(f'CAT: {CAT}')
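The duplicated-column guard recurs in Examples #5, #7, and #10; a small reusable helper (the function name is hypothetical):

def assert_no_duplicated_columns(df):
    """Raise if any column name appears more than once."""
    dup = df.columns[df.columns.duplicated()]
    if len(dup) > 0:
        raise ValueError(f'duplicated columns: {list(dup)}')
    print('no dup :) ')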
Example #8
def train_model(config, _debug, logger, start_dt, train_and_predict):
    """
    train model with features. model and features are designated in config
    """
    features = config['features']
    label_name = config['label_name']
    id_name = config['id_name']

    # load only train features and label
    x_train_all = load_features(features, _debug, target='train')
    y_train_all = load_target(label_name, _debug)

    gc.collect()

    logger.debug('x_train_all:{0}'.format(x_train_all.shape))
    logger.debug('y_train_all:{0}'.format(y_train_all.shape))

    # save feature names and index
    feature_names = x_train_all.columns.tolist()
    x_train_idx = x_train_all.index

    # convert from df to matrix
    x_train_all = df_to_matrix(x_train_all)

    # load model params
    params = config['params']
    seed = config['seed']
    model_name = config['model_name']

    # generate stratified k-fold instance
    n_splits = config['n_splits']
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # to store results
    y_te_prs = np.zeros(len(y_train_all))
    scores_tr, scores_te = defaultdict(list), defaultdict(list)
    importances_df = pd.DataFrame()
    trained_models = []

    # cross validation
    for _fold, (tr_idx, te_idx) in enumerate(
            skf.split(x_train_idx, y_train_all), start=1):
        logger.debug('------ {0} / {1} fold ------'.format(_fold, n_splits))

        # extract dataset
        x_tr, x_te = x_train_all[tr_idx, :], x_train_all[te_idx, :]
        y_tr, y_te = y_train_all[tr_idx], y_train_all[te_idx]

        logger.debug('x_tr:{0} x_te:{1}'.format(x_tr.shape, x_te.shape))
        logger.debug('y_tr:{0} y_te:{1}'.format(y_tr.shape, y_te.shape))

        # train model
        y_tr_pr, y_te_pr, model = train_and_predict(x_tr, y_tr, x_te, params)

        # save out-of-fold prediction (each index falls in exactly one fold,
        # so no averaging is needed)
        y_te_prs[te_idx] = y_te_pr

        # compute metric
        scores_tr = calc_metrics(scores_tr, y_tr_pr, y_tr)
        scores_te = calc_metrics(scores_te, y_te_pr, y_te)

        logger.debug('[{0}f] train_acc:{1} test_acc:{2}'.format(
            _fold, scores_tr['acc'][-1], scores_te['acc'][-1]))
        logger.debug('[{0}f] train_auc:{1} test_auc:{2}'.format(
            _fold, scores_tr['auc'][-1], scores_te['auc'][-1]))

        # save model
        trained_models.append(model)

        # feature importance
        if hasattr(model, 'feature_importances_'):
            importances_df['{}_fold'.format(
                _fold)] = model.feature_importances_
        elif hasattr(model, 'coef_'):
            importances_df['{}_fold'.format(_fold)] = model.coef_.flatten()

        del x_tr, x_te, y_tr, y_te, y_tr_pr, y_te_pr, model
        gc.collect()

    # mean metrics
    scores_cv_tr = np.mean(pd.DataFrame(scores_tr), axis=0)
    scores_cv_te = np.mean(pd.DataFrame(scores_te), axis=0)

    logger.debug('------ cross validation ------')
    logger.debug('[cv] train_acc:{0}, test_acc:{1}'.format(
        scores_cv_tr['acc'], scores_cv_te['acc']))
    logger.debug('[cv] train_auc:{0}, test_auc:{1}'.format(
        scores_cv_tr['auc'], scores_cv_te['auc']))

    if importances_df.any(axis=None):
        # mean feature importance
        importances_df = pd.DataFrame({
            'feature': feature_names,
            'importance': np.mean(importances_df, axis=1)
        })

        # save
        file_name = 'importances_{0:%m%d_%H%M%S}_{1:.5f}_{2}'.format(
            start_dt, scores_cv_te['auc'], model_name)
        importances_df.to_csv('../../data/output/{0}.csv'.format(file_name),
                              index=False)

        # plot
        fig = plot_importances(importances_df, file_name)
        fig.savefig(
            '../../figures/feature_importance/{0}.png'.format(file_name))

    # save prediction on te dataset
    train_df = pd.read_pickle('../../data/input/train.pkl')
    if _debug:
        train_df = train_df.iloc[:int(train_df.shape[0] / 100)]

    y_te_prs_df = pd.DataFrame({
        'id': train_df[id_name],
        'pred': y_te_prs,
        'truth': y_train_all
    })
    logger.debug('y_te_prs_df:{0}'.format(y_te_prs_df.shape))

    del train_df
    gc.collect()

    # save prediction on cross-validation test
    y_te_prs_df.to_pickle(
        '../../data/output/val_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
            start_dt, scores_cv_te['auc'], model_name))

    del y_te_prs_df
    gc.collect()

    # save models
    model_path = '../../models/models_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
        start_dt, scores_cv_te['auc'], model_name)
    with open(model_path, 'wb') as f:
        pickle.dump(trained_models, f)
Example #9
#        'f006',
    #        'f007',
    #        'f008',
    #        'f009',
    #        'f010',
    #        'f011',
    #        'f012',
]

var_names = [f'var_{i:03}' for i in range(200)]

# =============================================================================
# load
# =============================================================================

y_train = utils.load_target()['target']


def load(var):

    files_tr = sorted(glob(f'../data/{var}/train_f*.pkl'))

    # USE_PREF: keep only files whose name contains one of the prefixes
    files_tr = [f for f in files_tr if any(pref in f for pref in USE_PREF)]
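    # plausible completion (assumption): each pickle holds one feature block
    # for this variable; load and concatenate them column-wise
    X = pd.concat([pd.read_pickle(f) for f in files_tr], axis=1)
    return X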
Example #10
NROUND = 500  # max boosting rounds
ESR = 50  # early-stopping rounds
VERBOSE_EVAL = 25  # evaluation log interval

feature_size = 30
file_tr = '../data/f008/train_f008_1.f'
file_te = '../data/f008/test_f008_1.f'
outpath_tr = '../data/train_f008_1.f'
outpath_te = '../data/test_f008_1.f'

# =============================================================================
# load
# =============================================================================

X_train = pd.read_feather(file_tr).sample(frac=0.5, random_state=SEED)
y_train = utils.load_target().sample(frac=0.5,
                                     random_state=SEED)['HasDetections']

if len(DROP) > 0:
    X_train.drop(DROP, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')
#print(f'X_valid.shape {X_valid.shape}')

gc.collect()

CAT = list(set(X_train.columns) & set(utils_cat.ALL))
print(f'CAT: {CAT}')
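NROUND, ESR, and VERBOSE_EVAL are defined above but the training call is cut off. A plausible use with LightGBM's callback API (the holdout split and the minimal params dict are assumptions):

import lightgbm as lgb
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2,
                                            random_state=SEED)
dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=CAT)
dvalid = lgb.Dataset(X_val, label=y_val, categorical_feature=CAT)

model = lgb.train({'objective': 'binary', 'seed': SEED},
                  dtrain,
                  num_boost_round=NROUND,
                  valid_sets=[dvalid],
                  callbacks=[lgb.early_stopping(ESR),
                             lgb.log_evaluation(VERBOSE_EVAL)])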
Example #11
from sklearn.metrics import roc_auc_score

import os
import sys
sys.path.append(f'/home/{os.environ.get("USER")}/PythonLibrary')
import GA

import utils
#utils.start(__file__)

# =============================================================================
# load
# =============================================================================
X = pd.read_pickle('../external/share_904_oof_preds.pkl.gz')
oof_pred_array = X.values

y = utils.load_target()['target']

print(
    roc_auc_score(y, (9 * oof_pred_array / (1 - oof_pred_array)).prod(axis=1)))


# =============================================================================
# def
# =============================================================================
def myfitness(gtype):
    """
    gtype[:200]: per-column weights
    gtype[200:]: binary flags (use the column or not)
    """
Example #12
# prepare the log file
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)
sc = logging.StreamHandler()
logger.addHandler(sc)
fh = logging.FileHandler(f'logs/log_{now}.log')
logger.addHandler(fh)
logger.debug(f'logs/log_{now}.log')
logger.debug(config_path)

# load in datasets and target
feats = config['feats']
target_name = config['target_name']
train, test = load_datasets(feats)
target = load_target(target_name)
molecule_name = feather.read_dataframe(
    './data/input/train.feather')['molecule_name'].values

if is_debug_mode:
    print("Debug mode is ON!")
    train = train.iloc[:10000]
    test = test.iloc[:1000]
    target = target.iloc[:10000]
    molecule_name = molecule_name[:10000]

train_type = train['type'].values
test_type = test['type'].values
logger.debug(feats)

train.drop(['PropertyFunctor', 'type'], axis=1, inplace=True)  # PropertyFunctor is always NaN; 'type' was saved above
Example #13
logger.debug(f'logs/log_{now}.log')
logger.debug(config_path)
logger.debug(f'is_debug_mode: {is_debug_mode}')
logger.debug(f'keep_nans: {keep_nans}')


# load in datasets and target
if is_debug_mode:
    print("Debug mode is ON!")
    molecule_name = feather.read_dataframe(
        './data/input/train.feather')['molecule_name'].head(10000).values
else:
    molecule_name = feather.read_dataframe(
        './data/input/train.feather')['molecule_name'].values
feats = config['feats']
target_name = config['target_name']
train, test = load_datasets(feats, is_debug_mode)
target = load_target(target_name, is_debug_mode)


train_type = train['type'].values
test_type = test['type'].values
logger.debug(feats)

#train.drop(['PropertyFunctor', 'type'], axis=1, inplace=True) #always nan
#test.drop(['PropertyFunctor', 'type'], axis=1, inplace=True) #always nan


if keep_nans:
    # simply keep nans as they are and let the lightgbm handle it
    categorical_cols = list(train.columns[train.dtypes == object])
    logger.debug(categorical_cols) 
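    # plausible continuation (assumption): cast object columns to 'category'
    # so LightGBM can consume them directly and handle NaN natively
    for col in categorical_cols:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')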
Example #14
parser.add_argument('--config', default='./configs/default.json')
options = parser.parse_args()
with open(options.config) as f:
    config = json.load(f)

now = datetime.datetime.now()
logging.basicConfig(filename='./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now),
                    level=logging.DEBUG)
logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

feats = config['features']
logging.debug(feats)

# target_name = config['target_name']

feats_train, feats_test = load_datasets(feats)
y_train_all = load_target()

lr_Train = pd.concat([y_train_all, feats_train], axis=1)

print(lr_Train.head())  # quick sanity check; a bare .head() does nothing in a script

sc = MinMaxScaler(feature_range=(0, 1))

lr_Train_scaled = sc.fit_transform(lr_Train)

X_Train = []
y_Train = []

for i in range(timesteps, 1913 - startDay):
    X_Train.append(
        lr_Train_scaled[i - timesteps:i])  # e.g. i = 14 -> rows [0:14]; i = 15 -> rows [1:15]
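    # plausible continuation (assumptions: numpy imported as np; column 0 of
    # lr_Train_scaled is the target, since the target was concatenated first)
    y_Train.append(lr_Train_scaled[i, 0])

# shape the windows for an LSTM: (samples, timesteps, features)
X_Train, y_Train = np.array(X_Train), np.array(y_Train)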