Example #1
def job(args, train_csv, test_csv, embeddings, cache):
    """ Reads data, makes preprocessing, trains model and records results.
        Gets args as argument and passes values of it's fields to functions."""

    data = Data(train_csv, test_csv, cache)

    # read and preprocess data
    to_cache = not args.no_cache
    data.read_embedding(embeddings, args.unk_std, args.max_vectors, to_cache)
    data.preprocess(args.tokenizer, args.var_length)
    data.embedding_lookup()

    # split train dataset
    data_iter = data.split(args.kfold, args.split_ratio, args.stratified, args.test, args.seed)

    # iterate through folds
    loss_function = nn.BCEWithLogitsLoss()
    for fold, d in enumerate(data_iter):
        print(f'\n__________ fold {fold} __________')
        # get dataloaders
        if len(d) == 2:
            train, val = d
            test = data.test
        else:
            train, val, test = d
        dataloaders = iterate(train, val, test, args.batch_size) # train, val and test dataloader

        # choose model, optimizer, lr scheduler
        model = choose_model(args.model, data.text, args.n_layers, args.hidden_dim, args.dropout)
        optimizer = choose_optimizer(filter(lambda p: p.requires_grad, model.parameters()), args)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lrstep, gamma=0.1)
        learn = Learner(model, dataloaders, loss_function, optimizer, scheduler, args)
        learn.fit(args.epoch, args.n_eval, args.f1_tresh, args.early_stop, args.warmup_epoch, args.clip)

        # load best model
        learn.model, info = learn.recorder.load()
        # save val predictions
        y_pred, y_true, ids = learn.predict_probs()
        val_ids = [data.qid.vocab.itos[i] for i in ids]
        pred_to_csv(val_ids, y_pred, y_true)
        # choose best threshold for val predictions
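        # choose_thresh presumably scans candidate thresholds from 0.1 to 0.5
        # in 0.01 steps and returns the threshold maximizing F1 (an assumption
        # read off the [start, stop, step]-style argument below)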
        best_th, max_f1 = choose_thresh(y_pred, y_true, [0.1, 0.5, 0.01], message=True)
        learn.recorder.append_info({'best_th': best_th, 'max_f1': max_f1})


        # predict test labels
        test_label, test_prob, _, test_ids, thresh = learn.predict_labels(is_test=True, thresh=args.f1_tresh)
        if args.test:
            test_loss, test_f1, _, _, _ = learn.evaluate(learn.test_dl, args.f1_tresh)
            learn.recorder.append_info({'test_loss': test_loss, 'test_f1': test_f1}, message='Test set results: ')

        # save test predictions to submission.csv
        test_ids = [data.qid.vocab.itos[i] for i in test_ids]
        submit(test_ids, test_label, test_prob)
        record_path = learn.recorder.record(fold)  # directory path with all records
        print('\n')
    return record_path
Example #2
def main():
    submit = sys.argv[1]
    try:
        comment = sys.argv[2]
        utils.submit(file_path=submit, comment=comment)
    except IndexError:
        utils.submit(file_path=submit)

    shutil.move(submit, '../log_submit/')
Example #3
def main():
    COMMENT_TEXT_COL = 'comment_text'
    EMB_MAX_FEAT = 300
    MAX_LEN = 220
    MAX_FEATURES = 100000
    #BATCH_SIZE = 1024
    BATCH_SIZE = 256
    
    #BATCH_SIZE = 2048
    NUM_EPOCHS = 1
    LSTM_UNITS = 64
    if args.debug:
        print('running in debug mode')
        result_dir = os.path.join(utils.RESULT_DIR, 'debug-'+datetime.strftime(datetime.now(), '%Y%m%d%H%M%S'))
    else:
        result_dir = os.path.join(utils.RESULT_DIR, datetime.strftime(datetime.now(), '%Y%m%d%H%M%S'))
    os.mkdir(result_dir)
    print(f'created: {result_dir}')

#     convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
#         os.path.join(utils.BERT_MODEL_PATH, 'bert_model.ckpt'),
#         os.path.join(utils.BERT_MODEL_PATH, 'bert_config.json'),
#         utils.PYTORCH_BERT_MODEL_PATH)
    train_data = ToxicDataset(mode='train', debug=args.debug)
    test_data = ToxicDataset(mode='test')
    train, test = train_data.data, test_data.data
    train = utils.preprocess_data(train, mode='train')
    test = utils.preprocess_data(test)
    #tokenizer = Tokenizer(num_words=MAX_FEATURES, lower=True)
    tokenizer = BertTokenizer.from_pretrained(utils.BERT_MODEL_PATH, 
                                              do_lower_case=True)
    X_train, X_test, y_train = utils.run_bert_tokenizer(tokenizer, train, test, 
                                                               seq_len=MAX_LEN)
    #word_index = tokenizer.word_index
    word_index = None
    #print(word_index)
#    print(f'vocab size: {len(word_index)}')
#     embedding_matrix = utils.build_embeddings(word_index, emb_max_feat=EMB_MAX_FEAT)
#     print(embedding_matrix.shape)
    embedding_matrix = None
    sub_preds, oof_df = utils.run_model_pytorch(result_dir, X_train, X_test, y_train, embedding_matrix, 
                                        word_index, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 
                                        max_len=MAX_LEN, lstm_units=LSTM_UNITS, oof_df=train)
    bias_metrics_df = utils.compute_bias_metrics_for_model(dataset=oof_df, 
                                                           subgroups=utils.IDENTITY_COLS,
                                                           model=utils.PREDICT_COL, 
                                                           label_col=utils.TOXICITY_COLUMN)
    validation_final_score = utils.get_final_metric(
        bias_metrics_df, utils.calculate_overall_auc(oof_df, utils.TOXICITY_COLUMN))
    print(f'validation final score: {validation_final_score}')
    utils.submit(result_dir, sub_preds)
    print('finish!!!')
Example #4
def main():
    # load feathers
    files = sorted(glob('../feats/*.feather'))
    df = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
                   axis=1)
    df = df[configs['features']]
    feats = [f for f in df.columns if f not in FEATS_EXCLUDED]

    # load model
    reg = lgb.Booster(model_file='../output/lgbm_all_data.txt')

    # Recursive prediction
    print('Recursive prediction...')
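    # recursive forecasting: lag features for day `day` are rebuilt from the
    # window [day - 28, day], which already contains predictions written back
    # into df on earlier iterations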
    for day in tqdm(range(1914, 1914 + 28)):
        mask_test = (df['d_numeric'] >= day - 28) & (df['d_numeric'] <= day)
        tmp_df = df[mask_test]
        tmp_df = make_lags(tmp_df)
        df.loc[df['d_numeric'] == day, 'demand'] = reg.predict(
            tmp_df[tmp_df['d_numeric'] == day][feats],
            num_iteration=reg.best_iteration)

        del tmp_df
        gc.collect()

    # split test
    test_df = df[df['date'] >= '2016-04-25']

    del df
    gc.collect()

    # reshape prediction for submit
    preds = test_df[['id', 'd', 'demand']].reset_index()
    preds = preds.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2 (copies avoid pandas SettingWithCopy warnings)
    preds1 = preds[['id'] + COLS_TEST1].copy()
    preds2 = preds[['id'] + COLS_TEST2].copy()

    # change column names
    preds1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    preds2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test2 id
    preds2['id'] = preds2['id'].str.replace('_validation', '_evaluation')

    # merge
    preds = pd.concat([preds1, preds2])

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # submission by API
    submit(submission_file_name, comment='model301 recursive prediction')
Example #5
def model_xgb(train, test, flag):
    train_x, train_y = train[0], train[1]
    test_x, test_y = test[0], test[1]
    print(train_x.shape)
    print(train_y[2].shape)
    print(fea_names)
    dtrain = xgb.DMatrix(train_x, label=train_y[2].values)
    dtest = xgb.DMatrix(test_x)

    def mapeobj(preds, dtrain):
        # custom objective: gradient and hessian of a smoothed MAPE-like loss,
        # clipped for large relative errors and zeroed where the label is 0
        gaps = dtrain.get_label()
        delta = (preds - gaps) / gaps
        k = 6
        e = np.exp(k * delta)
        grad = (e - 1 / e) / (e + 1 / e)
        for i, t in enumerate(delta):
            if t > 1:
                grad[i] = 1
            elif t < -1:
                grad[i] = -1
        grad = grad / gaps
        hess = (4 * k * gaps**2) / (e + 1 / e)**2
        for i, t in enumerate(delta):
            if abs(t) > 1:
                hess[i] = 0
        for i, g in enumerate(gaps):
            if g == 0:
                grad[i] = 0
                hess[i] = 0
        return grad, hess

    def evalmape(preds, dtrain):
        # evaluation metric: mean absolute percentage error,
        # treating rows with a zero label as zero error
        gaps = dtrain.get_label()
        errs = abs(gaps - preds) / gaps
        for i, g in enumerate(gaps):
            if g == 0:
                errs[i] = 0
        err = np.mean(errs)
        return 'error', err

    if flag == 'online':
        watchlist = [(dtrain, 'train')]
        m_xgb = xgb.train(Params.xgb_reg_params, dtrain, 800, watchlist, mapeobj, evalmape)
        prd = m_xgb.predict(dtest)
        Util.submit(test_y.values, prd)
    elif flag == 'offline':
        dtest.set_label(test_y[2].values)
        watchlist = [(dtrain, 'train'), (dtest, 'eval')]
        #m_xgb = xgb.train(Params.xgb_reg_params, dtrain, 100, watchlist)
        m_xgb = xgb.train(Params.xgb_reg_params, dtrain, 800, watchlist, mapeobj, evalmape)
        prd = m_xgb.predict(dtest)
        #prd = postprocess(train, test_y.values, prd)
        print(Util.score(test_y.values, prd))
        imp = m_xgb.get_fscore()
        print(sorted(imp.items(), key=lambda d: d[1], reverse=True))
Example #6
def predict_test(self, method, thresh, method_params):
    y_preds = []
    last_ids = None
    for pp in self.test_pred_paths:
        ids, y_prob, _ = load_pred_from_csv(pp)
        y_preds.append(y_prob)
        # every prediction file must share the same ids for ensembling
        if last_ids is not None and not np.array_equal(last_ids, ids):
            raise Exception(
                'Prediction ids should be the same for ensemble')
        last_ids = ids
    test_ens_prob = methods[method](
        y_preds, args=method_params)  # target probability after ensembling
    test_ens_label = (test_ens_prob > thresh).astype(int)
    submit(ids, test_ens_label, test_ens_prob)
Example #7
def model_rf(train, test, flag):
    train_x, train_y = train[0], train[1]
    test_x, test_y = test[0], test[1]
    rf = RandomForestRegressor()
    rf.set_params(**Params.rf_reg_params)
    print("start training")
    rf.fit(train_x, train_y['gap'].values)
    if flag == 'online':
        prd = rf.predict(test_x)
        prd = postprocess(train, test_y.values, prd)
        Util.submit(test_y.values, prd)
    elif flag == 'offline':
        prd = rf.predict(test_x)
        prd = postprocess(train, test_y.values, prd)
        print('test : ', Util.score(test_y.values, prd))
        prd = rf.predict(train_x)
        print('train : ', Util.score(train_y.values, prd))
Example #8
def submit_task():
    start_ts = request.args.get('start_ts', '')
    end_ts = request.args.get('end_ts', '')
    submit_user = request.args.get('submit_user', '')
    topic = request.args.get('topic', '')
    status = submit(topic, start_ts, end_ts, submit_user)

    print(status)

    return json.dumps(status)
Example #9
def submit_task():
    start_ts = request.args.get('start_ts', '')
    end_ts = request.args.get('end_ts', '')
    submit_user = request.args.get('submit_user', '')
    topic = request.args.get('topic', '')
    status = submit(topic, start_ts, end_ts, submit_user)

    print(status)

    return json.dumps(status)
Example #10
def main():
    # load submission files
    sub = pd.read_csv('../output/submission_lgbm_diff.csv')

    # load train data
    df = pd.read_csv('../input/sales_train_validation.csv')

    # to cumsum
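    # the loaded submission apparently holds day-over-day differences, so
    # demand levels are rebuilt by cumulative summation starting from the
    # last known training day (d_1913), clipping negatives to zero each step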
    sub['F1'] += df['d_1913']
    sub.loc[:, 'F1'] = sub['F1'].where(sub['F1'] > 0, 0)

    for i in range(2, 29):
        sub[f'F{i}'] += sub[f'F{i-1}']
        sub.loc[:, f'F{i}'] = sub[f'F{i}'].where(sub[f'F{i}'] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)

    # submission by API
    submit(submission_file_name, comment='model401 weekly prediction')
Example #11
def model_rf(train, test, flag):
    train_x, train_y = train[0], train[1]
    test_x, test_y = test[0], test[1]
    if os.path.exists(configs['rf_model']):
        print("model exists")
        rf = joblib.load(configs['rf_model'])
    else:
        rf = RandomForestRegressor()
        rf.set_params(**Params.rf_reg_params)
        print("start training")
        rf.fit(train_x, train_y[3].values)
        joblib.dump(rf, configs['rf_model'], compress=3)
    if flag == 'online':
        prd = rf.predict(test_x)
        #prd = postprocess(train, test_y.values, prd)
        Util.submit(test_y.values, prd)
    elif flag == 'offline':
        prd = rf.predict(test_x)
        #prd = postprocess(train, test_y.values, prd)
        print('test : ', Util.score2(test_y.values, prd))
Example #12
def output(train_df, test_df, models, model_params, feature_importance_df,
           train_preds, test_preds, scores, now, model_name):
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)
    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)
    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
                     '{}/importance.csv'.format(folder_path))

    # the following part needs to be adjusted for each competition
    submission_file_name = '{0}/submit_{1:%Y-%m-%d-%H-%M-%S}_{2}.csv'.format(
        folder_path, now, score)

    test_df.loc[:, 'target'] = test_preds
    # NOTE: test_preds_bin (binary outlier predictions) is not defined in this
    # snippet; it is assumed to come from the enclosing scope
    test_df.loc[:, 'Outlier_Likelihood'] = test_preds_bin
    q = test_df['Outlier_Likelihood'].quantile(.9999)  # 1.0930%
    test_df.loc[:, 'target'] = test_df['Outlier_Likelihood'].apply(
        lambda x: 1 if x > q else x)
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv(submission_file_name, index=False)

    train_df.loc[:, 'OOF_PRED'] = train_preds
    train_df = train_df.reset_index()
    train_df[['card_id',
              'OOF_PRED']].to_csv('{0}/oof.csv'.format(folder_path))

    # submit via the API
    if not is_debug:
        submit(competition_name,
               submission_file_name,
               comment='user02 cv: %.6f' % score)
Example #13
model.compile(Adam(1e-3), loss=['mse', 'mse'], loss_weights=[1, 1.7], metrics=[['acc'], []])

if os.path.exists(model_path): 
    model.load_weights(model_path)
    print('\033[32;1mLoad Model\033[0m')

plot_model(model, 'model.jpg')
if training:
    checkpoint = ModelCheckpoint(model_path, 'val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    reduce_lr = ReduceLROnPlateau('val_loss', 0.5, 10, verbose=1, min_lr=1e-6)
    early_stopping = EarlyStopping('val_loss', patience=100, restore_best_weights=True)
    logger = CSVLogger(model_path+'.csv', append=True)
    tensorboard = TensorBoard(model_path[:model_path.rfind('.')]+'_logs', batch_size=1024, update_freq='epoch')
    model.fit(trainX, [trainY, missing_col], batch_size=128, epochs=500, validation_data=(validX, [validY, valid_missing_col]), verbose=2, callbacks=[checkpoint, reduce_lr, logger, tensorboard, early_stopping])

if submit:
    testX = utils.load_test_data(submit)
    Y = np.array(list(map(idx2word.get, np.argmax(model.predict(testX, batch_size=1024)[0], axis=-1))))
    utils.submit(Y)
elif svm:
    X, Y = utils.load_train_data(data_path)
    Y = np.array(list(map(word2idx.get, Y.ravel())))
    trainX = X[:, 1:]
    _, f1 = model.predict(trainX, batch_size=1024)
    X[:, 0] = f1.ravel()
    clf_svm(X, Y, save_model=svm)
else:
    model.load_weights(model_path)
    print(f'\n\033[32;1mTraining score: {model.evaluate(trainX, [trainY, missing_col], verbose=0, batch_size=1024)}')
    print(f'Validation Score: {model.evaluate(validX, [validY, valid_missing_col], verbose=0, batch_size=1024)}\033[0m')
Example #14
import os
import sys
from utils import submit

cache_dir = sys.argv[1]
host_name = sys.argv[2]
time_series = sys.argv[3]
environmental_data = sys.argv[4]
output_dir = sys.argv[5]

submit(host_name, cache_dir, "datatrans.PreprocPerPatSeriesNearestRoad",
       "--patgeo_data=" + time_series, "--output_file=" + output_dir,
       "--nearestroad_data=" + environmental_data, *sys.argv[6:])
Example #15
def mk_submit():

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    col = [c for c in X_train.columns if c.startswith('f702_')]
    X_train.drop(col, axis=1, inplace=True)

    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))
    print('CAT :', CAT)

    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
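    # rank-average ensembling: each model's test predictions are converted to
    # ranks, accumulated, then rescaled into [0, 1] below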
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
Example #16
sub = X[['click_id']]
sub.click_id = sub.click_id.map(int)

X.drop('click_id', axis=1, inplace=True)
X.fillna(-1, inplace=True)


dtest = xgb.DMatrix(X[train_head.columns])
del X; gc.collect()

sub['is_attributed'] = 0
y_pred = model.predict(dtest)
sub['is_attributed'] += pd.Series(y_pred).rank()
#sub['is_attributed'] /= LOOP
sub['is_attributed'] /= sub['is_attributed'].max()
sub['click_id'] = sub.click_id.map(int)

sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    utils.submit(SUBMIT_FILE_PATH)


#==============================================================================
utils.end(__file__)

Example #17
improved my scores.

"""

FILE_out = '../output/1022-1_Giba-post2.csv.gz'
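# fixed class_99 priors: 0.017 for galactic objects and 0.17 for
# extragalactic ones; the lines below also zero out the class probabilities
# that are impossible for each group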

sub.loc[sub.object_id.isin(oid_gal), 'class_99'] = 0.017
sub.loc[sub.object_id.isin(oid_exgal), 'class_99'] = 0.17

sub.loc[sub.object_id.isin(oid_gal),
        [f'class_{i}' for i in utils.classes_exgal]] = 0
sub.loc[sub.object_id.isin(oid_exgal),
        [f'class_{i}' for i in utils.classes_gal]] = 0

sub.to_csv(FILE_out, index=False, compression='gzip')
utils.submit(FILE_out, '0.017 and 0.17')

# =============================================================================
# yuval post
# =============================================================================
"""
yuval says

I believe a prerequisite for predicting Class 99 is having a very good model 
for the other classes. The good new is that a score of ~1.0 is achievable 
without really dealing with class 99. The only thing I did up to this point 
with this prediction is: class_99=np.where(other_classes.max>0.9 , 0.01, 0.1) 
[it is slightly better then uniform, If I use 0.8 the score degrades, 
and also other values of probabilities degrade the score]
"""
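
A minimal sketch of the rule yuval describes, assuming `sub` is a submission
DataFrame whose first column is the object id and the rest are class
probabilities (the helper name fill_class_99 is hypothetical):

import numpy as np
import pandas as pd

def fill_class_99(sub: pd.DataFrame, conf_thresh=0.9, lo=0.01, hi=0.1):
    # probabilities of every class except class_99
    other_cols = [c for c in sub.columns
                  if c.startswith('class_') and c != 'class_99']
    # where the model is confident about some other class (max prob > 0.9),
    # give class_99 a low probability; otherwise a higher one
    sub['class_99'] = np.where(sub[other_cols].max(axis=1) > conf_thresh, lo, hi)
    # renormalize each row so probabilities sum to 1, as the snippets above do
    sub.iloc[:, 1:] = sub.iloc[:, 1:].values / sub.iloc[:, 1:].sum(1).values[:, None]
    return sub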

Example #18
    test[target] = manage.base_test[target].values
    stack_cols = [key, target, pred_col]

    df_stack = pd.concat([train[stack_cols], test[stack_cols]],
                         ignore_index=True,
                         axis=0)

    #========================================================================
    # Saving
    feim.to_csv(
        f'../valid/{start_time[4:12]}_valid_{model_type}_SET-{set_type}_feat{n_feature}_{comment}_CV{str(cv_score)[:7]}_LB.csv',
        index=True)
    utils.to_pkl_gzip(
        obj=df_stack,
        path=f'../stack/{start_time[4:12]}_stack_{model_type}_SET-{set_type}_feat{n_feature}_{comment}_CV{str(cv_score)[:7]}_LB')

    submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
    submit[target] = test[pred_col].values
    submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_SET-{set_type}_feat{n_feature}_{comment}_CV{str(cv_score)[:7]}_LB.csv'
    submit.to_csv(submit_path, index=True)

    if is_submit:
        utils.submit(file_path=submit_path,
                     comment=comment,
                     COMPETITION_NAME=COMPETITION_NAME)
        shutil.move(submit_path, '../log_submit/')

    #========================================================================
Example #19
def DO(frm, to, fileno):
    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }

    print('loading train data...', frm, to)
    train_df = pd.read_csv("../input/train.csv.zip",
                           parse_dates=['click_time'],
                           skiprows=range(1, frm),
                           nrows=to - frm,
                           dtype=dtypes,
                           usecols=[
                               'ip', 'app', 'device', 'os', 'channel',
                               'click_time', 'is_attributed'
                           ])

    print('loading test data...')
    if debug:
        test_df = pd.read_csv("../input/test.csv.zip",
                              nrows=100000,
                              parse_dates=['click_time'],
                              dtype=dtypes,
                              usecols=[
                                  'ip', 'app', 'device', 'os', 'channel',
                                  'click_time', 'click_id'
                              ])
    else:
        test_df = pd.read_csv("../input/test.csv.zip",
                              parse_dates=['click_time'],
                              dtype=dtypes,
                              usecols=[
                                  'ip', 'app', 'device', 'os', 'channel',
                                  'click_time', 'click_id'
                              ])

    len_train = len(train_df)
    train_df = pd.concat([train_df, test_df])

    del test_df
    gc.collect()

    print('Extracting new features...')
    train_df['hour'] = pd.to_datetime(
        train_df.click_time).dt.hour.astype('uint8')
    train_df['day'] = pd.to_datetime(
        train_df.click_time).dt.day.astype('uint8')

    gc.collect()

    # (selcols, QQ) pairs: the columns to group by and the aggregation code
    # (QQ: 0=count, 1=mean, 2=var, 3=skew, 4=nunique, 5=cumcount)
    feature_specs = [
        (['ip', 'channel'], 4),
        (['ip', 'device', 'os', 'app'], 5),
        (['ip', 'day', 'hour'], 4),
        (['ip', 'app'], 4),
        (['ip', 'app', 'os'], 4),
        (['ip', 'device'], 4),
        (['app', 'channel'], 4),
        (['ip', 'os'], 5),
        (['ip', 'device', 'os', 'app'], 4),
    ]
    naddfeat = len(feature_specs)
    for i, (selcols, QQ) in enumerate(feature_specs):
        print('selcols', selcols, 'QQ', QQ)

        filename = 'X%d_%d_%d.csv' % (i, frm, to)

        if os.path.exists(filename):
            if QQ == 5:
                gp = pd.read_csv(filename, header=None)
                train_df['X' + str(i)] = gp
            else:
                gp = pd.read_csv(filename)
                train_df = train_df.merge(gp,
                                          on=selcols[0:len(selcols) - 1],
                                          how='left')
        else:
            if QQ == 5:
                gp = train_df[selcols].groupby(
                    by=selcols[:-1])[selcols[-1]].cumcount()
                train_df['X' + str(i)] = gp.values
            else:
                # QQ codes 0-4 map onto standard groupby aggregations
                agg_name = {0: 'count', 1: 'mean', 2: 'var',
                            3: 'skew', 4: 'nunique'}[QQ]
                gp = train_df[selcols].groupby(
                    by=selcols[:-1])[selcols[-1]].agg(agg_name).reset_index().rename(
                        index=str, columns={selcols[-1]: 'X' + str(i)})
                train_df = train_df.merge(gp, on=selcols[:-1], how='left')

            if not debug:
                gp.to_csv(filename, index=False)

        del gp
        gc.collect()

    print('doing nextClick')
    predictors = []

    new_feature = 'nextClick'
    filename = 'nextClick_%d_%d.csv' % (frm, to)
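    # nextClick: for each row, the number of seconds until the next click with
    # the same hashed (ip, app, device, os) combination, computed by scanning
    # the data in reverse while tracking the last-seen time per hash bucket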

    if os.path.exists(filename):
        print('loading from save file')
        QQ = pd.read_csv(filename).values
    else:
        D = 2**26
        train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
            + "_" + train_df['os'].astype(str)).apply(hash) % D
        click_buffer = np.full(D, 3000000000, dtype=np.uint32)

        train_df['epochtime'] = train_df['click_time'].astype(
            np.int64) // 10**9
        next_clicks = []
        for category, t in zip(reversed(train_df['category'].values),
                               reversed(train_df['epochtime'].values)):
            next_clicks.append(click_buffer[category] - t)
            click_buffer[category] = t
        del (click_buffer)
        QQ = list(reversed(next_clicks))

        if not debug:
            print('saving')
            pd.DataFrame(QQ).to_csv(filename, index=False)

    train_df[new_feature] = QQ
    predictors.append(new_feature)

    train_df[new_feature + '_shift'] = pd.DataFrame(QQ).shift(+1).values
    predictors.append(new_feature + '_shift')

    del QQ
    gc.collect()

    print('grouping by ip-day-hour combination...')
    gp = train_df[['ip', 'day',
                   'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[[
                       'channel'
                   ]].count().reset_index().rename(
                       index=str, columns={'channel': 'ip_tcount'})
    train_df = train_df.merge(gp, on=['ip', 'day', 'hour'], how='left')
    del gp
    gc.collect()

    print('grouping by ip-app combination...')
    gp = train_df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[[
        'channel'
    ]].count().reset_index().rename(index=str,
                                    columns={'channel': 'ip_app_count'})
    train_df = train_df.merge(gp, on=['ip', 'app'], how='left')
    del gp
    gc.collect()

    print('grouping by ip-app-os combination...')
    gp = train_df[['ip', 'app',
                   'os', 'channel']].groupby(by=['ip', 'app', 'os'])[[
                       'channel'
                   ]].count().reset_index().rename(
                       index=str, columns={'channel': 'ip_app_os_count'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
    del gp
    gc.collect()

    # Adding features with var and mean hour (inspired from nuhsikander's script)
    print('grouping by : ip_day_chl_var_hour')
    gp = train_df[['ip', 'day',
                   'hour', 'channel']].groupby(by=['ip', 'day', 'channel'])[[
                       'hour'
                   ]].var().reset_index().rename(
                       index=str, columns={'hour': 'ip_tchan_count'})
    train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_os_var_hour')
    gp = train_df[['ip',
                   'app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[[
                       'hour'
                   ]].var().reset_index().rename(
                       index=str, columns={'hour': 'ip_app_os_var'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_channel_var_day')
    gp = train_df[['ip', 'app',
                   'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[[
                       'day'
                   ]].var().reset_index().rename(
                       index=str, columns={'day': 'ip_app_channel_var_day'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_chl_mean_hour')
    gp = train_df[['ip', 'app',
                   'channel', 'hour']].groupby(by=['ip', 'app', 'channel'])[[
                       'hour'
                   ]].mean().reset_index().rename(
                       index=str, columns={'hour': 'ip_app_channel_mean_hour'})
    print("merging...")
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()

    print("vars and data type: ")
    train_df.info()
    train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16')
    train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
    train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')

    target = 'is_attributed'
    predictors.extend([
        'app', 'device', 'os', 'channel', 'hour', 'day', 'ip_tcount',
        'ip_tchan_count', 'ip_app_count', 'ip_app_os_count', 'ip_app_os_var',
        'ip_app_channel_var_day', 'ip_app_channel_mean_hour'
    ])
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
    for i in range(0, naddfeat):
        predictors.append('X' + str(i))

    print('predictors', predictors)

    test_df = train_df[len_train:]
    val_df = train_df[(len_train - val_size):len_train]
    train_df = train_df[:(len_train - val_size)]

    print("train size: ", len(train_df))
    print("valid size: ", len(val_df))
    print("test size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    gc.collect()

    print("Training...")
    start_time = time.time()

    params = {
        'learning_rate': 0.20,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # minimum data needed in a leaf (min_data_in_leaf)
        'max_bin': 100,  # number of bucketed bins for feature values
        'subsample': 0.7,  # subsample ratio of the training instances
        'subsample_freq': 1,  # frequency of subsample, <=0 means disabled
        'colsample_bytree': 0.9,  # subsample ratio of columns per tree
        'min_child_weight': 0,  # minimum sum of instance weight (hessian) in a leaf
        'scale_pos_weight': 200  # because the training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params,
                                              train_df,
                                              val_df,
                                              predictors,
                                              target,
                                              objective='binary',
                                              metrics='auc',
                                              early_stopping_rounds=30,
                                              verbose_eval=True,
                                              num_boost_round=1000,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    #    print('Plot feature importances...')
    #    ax = lgb.plot_importance(bst, max_num_features=100)
    #    plt.show()

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors],
                                       num_iteration=best_iteration)
    if not debug:
        print("writing...")
        sub.to_csv('sub_it%d.csv.gz' % (fileno),
                   index=False,
                   compression='gzip')
        utils.submit('sub_it%d.csv.gz' % (fileno))
    print("done...")
    return sub
Example #20
import os
import sys
from utils import submit

cache_dir = sys.argv[1]
host_name = sys.argv[2]
time_series = sys.argv[3]
environmental_data = sys.argv[4]
acs_data = sys.argv[5]
output_dir = sys.argv[6]

submit(host_name, cache_dir, "datatrans.PreprocPerPatSeriesACS2",
       "--patgeo_data=" + time_series, "--output_file=" + output_dir,
       "--geoid_data=" + environmental_data, "--acs_data=" + acs_data,
       *sys.argv[7:])
Example #21
            else:
                out_pred = df_pred[df_pred[key].isin(
                    out_ids)]['pred_mean'].values
            out_score = np.sqrt(mean_squared_error(out_val, out_pred))
        else:
            out_score = 0
    else:
        out_score = 0

#========================================================================
# Submission
test_pred = seed_pred / len(seed_list)
submit[target] = test_pred
submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_lr{learning_rate}_{feature_num}feats_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_OUT{str(out_score)[:7]}_CV{cv_score}_LB.csv'
submit.to_csv(submit_path, index=False)

if go_submit:
    import shutil
    comment = sys.argv[1]
    if len(comment):
        try:
            lb_pb = utils.submit(file_path=submit_path, comment=comment)
        except IndexError:
            lb_pb = utils.submit(file_path=submit_path)

        shutil.move(submit_path, '../log_submit/')

    submit_path = submit_path.replace('LB', f'LB{lb_pb[0]}')
    submit.to_csv(submit_path, index=False)
    #========================================================================
Example #22
def mk_submit():

    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)

    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))

    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)], axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # groupKfold
    # =============================================================================
    sk_tbl = pd.read_csv('../data/user_id_v8.csv.gz')  # TODO: check
    user_tbl = sk_tbl.user_id.drop_duplicates().reset_index(
        drop=True).to_frame()

    sub_train = pd.read_csv('../input/application_train.csv.zip',
                            usecols=['SK_ID_CURR']).set_index('SK_ID_CURR')
    sub_train['y'] = y_train.values

    group_kfold = GroupKFold(n_splits=NFOLD)

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train,
                             y_train,
                             categorical_feature=CAT,
                             free_raw_data=False)

        # shuffle fold
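        # give every user a random fold id so that all SK_ID_CURR rows
        # belonging to the same user land in the same CV fold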
        ids = list(range(user_tbl.shape[0]))
        np.random.shuffle(ids)
        user_tbl['g'] = np.array(ids) % NFOLD
        sk_tbl_ = pd.merge(sk_tbl, user_tbl, on='user_id',
                           how='left').set_index('SK_ID_CURR')

        sub_train['g'] = sk_tbl_.g
        folds = group_kfold.split(X_train, sub_train['y'], sub_train['g'])

        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             folds=folds,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
Example #23
if os.path.exists(model_path): 
    model.load_weights(model_path)
    print('\033[32;1mLoad Model\033[0m')

plot_model(model, 'model.jpg')
if training:
    checkpoint = ModelCheckpoint(model_path, 'val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    reduce_lr = ReduceLROnPlateau('val_loss', 0.5, 10, verbose=1, min_lr=1e-6)
    logger = CSVLogger(model_path+'.csv', append=True)
    tensorboard = TensorBoard(model_path[:model_path.rfind('.')]+'_logs', batch_size=1024, update_freq='epoch')
    model.fit(trainX, [trainY, missing_col[:, 0], missing_col[:, 1], missing_col[:, 2]], batch_size=256, epochs=100, validation_data=(validX, [validY, valid_missing_col[:, 0], valid_missing_col[:, 1], valid_missing_col[:, 2]]), verbose=2, callbacks=[checkpoint, reduce_lr, logger, tensorboard])

if submit:
    out = tf.cast(out*2, tf.int32)
    submit_model = Model(I, out)
    utils.submit(submit_model, submit)
elif svm:
    X, Y = utils.load_train_data(data_path)
    Y = Y.astype(int)
    trainX = np.delete(X, [1, 6, 11], axis=1)
    _, f2, f7, f12 = model.predict(trainX, batch_size=1024)
    X[:, 1] = f2.ravel()
    X[:, 6] = f7.ravel()
    X[:, 11] = f12.ravel()
    clf_svm(X, Y, save_model=svm)
else:
    model.load_weights(model_path)
    print(f'\n\033[32;1mTraining score: {model.evaluate(trainX, [trainY, missing_col[:, 0], missing_col[:, 1], missing_col[:, 2]], verbose=0)}')
    print(f'Validation Score: {model.evaluate(validX, [validY, valid_missing_col[:, 0], valid_missing_col[:, 1], valid_missing_col[:, 2]], verbose=0)}\033[0m')
Example #24
import sys
from utils import submit

host_name, input_dir, resc_types, skip_preproc, output_dir, *args = sys.argv[1:]

# NOTE: cache_dir is not defined in this snippet as published; the companion
# examples read it from its own command-line argument
submit(host_name, cache_dir, "datatrans.PreprocFIHR",
       "--input_dir=" + input_dir,
       "--resc_types=" + resc_types,
       "--skip_preproc=" + skip_preproc,
       "--output_dir=" + output_dir, *args)

Example #25
x_train = x_train[:size * int(len(x_train) / size)]
y_train = y_train[:size * int(len(y_train) / size)]
x_test = x_test[:size * int(len(x_test) / size)]
y_test = y_test[:size * int(len(y_test) / size)]
# print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

x_train_mean = np.mean(x_train, axis=0)
x_train -= x_train_mean
x_test -= x_train_mean
print("start parseLabel")
y_train, y_test = parseLabel(y_train, y_test)
print("end parseLabel")

# model = resnet_v1(depth=32, num_classes=num_classes, metrics='acc')
model = zsl_res(metrics={"attr_out": attr_acc, "emb_out": emb_acc})
# model = res_pretrain_finetune()
model.summary()
model.load_weights(
    "../data/saved_models_zsl/zsl_model_res_pretrain_finetune.050.h5",
    by_name=False)
# model = load_model("weights/my1/w1.h5")
model.fit(x_train,
          y_train,
          batch_size=size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          shuffle=True,
          callbacks=callbacks())
model.save("../data/saved_models_zsl/zsl_model_res_pretrain_finetune2.h5")
submit(model, x_train_mean)
Example #26
def _path(used, adjacency):
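    # build a path through currently-unused nodes: run two DFS walks from a
    # random unused root and join them back-to-back (the first is reversed
    # and its copy of the root dropped so the root appears only once)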
    available = [el for el in range(len(adjacency)) if not used[el]]
    root = np.random.choice(available)
    path_1 = _dfs(root, used, adjacency)
    path_2 = _dfs(root, used, adjacency)
    path_1.reverse()
    return path_1[:-1] + path_2


def solve(adjacency):
    used = np.full(shape=len(adjacency), fill_value=False)
    solution = []
    with tqdm.tqdm(total=len(adjacency)) as progress:
        while not np.all(used):
            path = _path(used, adjacency)
            solution.extend(path)
            progress.update(len(path))
    return solution


if __name__ == "__main__":
    filename = input_files[1]
    alignments, tags = utils.read_input(filename)
    graph = convert_to_graph(tags)
    adj_size = list(map(len, graph))
    print(max(adj_size), min(adj_size), sum(adj_size) / len(adj_size))
    solution = solve(graph)
    print("Total Score for %s: %s" %
          (filename, utils.score(filename, solution)))
    utils.submit(filename, solution)
Example #27
import os
import sys
from utils import submit

cache_dir = sys.argv[1]
host_name = sys.argv[2]
patient_dir = sys.argv[3]
environment_dir = sys.argv[4]
input_files = sys.argv[5]
deidentify = sys.argv[6]
output_dir = sys.argv[7]

submit(host_name, cache_dir, "datatrans.PreprocCSVTable",
       "--patient_directory=" + patient_dir,
       "--environment_directory=" + environment_dir,
       "--input_files=" + input_files, "--deidentify=" + deidentify,
       "--output_directory=" + output_dir, *sys.argv[8:])
Example #28
from utils import submit
import sys

host = sys.argv[1]
cache_dir = sys.argv[2]
args = sys.argv[3:]

submit(host, cache_dir, "tic.Transform", *args)
Example #29
import sys
from utils import submit

cache_dir = sys.argv[1]
host_name = sys.argv[2]
data_dir = sys.argv[3]  # renamed from `dir` to avoid shadowing the builtin
year = sys.argv[4]

submit(host_name, cache_dir, "datatrans.PreprocDailyEnvData",
       "--input_directory={0}/cmaq{1}".format(data_dir, year),
       "--output_prefix={0}/cmaq{1}/".format(data_dir, year))


Example #30
import numpy as np
import pandas as pd
import utils

EXE_SUBMIT = True

SEED = 71
np.random.seed(SEED)

FILE_in = '../output/matsuken-875_onodera-884_taguchi-888u_akiyama-889u.csv.gz'
FILE_out = '../output/LB839_c99_uniform.csv.gz'

COMMENT = 'np.random.uniform(1.2, 1.4)'

sub = pd.read_csv(FILE_in)

sub.class_99 *= np.random.uniform(1.2, 1.4, size=sub.shape[0])
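# after scaling class_99 up, renormalize below so each row's class
# probabilities still sum to 1 (column 0 is the object id)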

sub.iloc[:, 1:] = sub.iloc[:, 1:].values / sub.iloc[:, 1:].sum(1).values[:, None]

sub.to_csv(FILE_out, index=False, compression='gzip')

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    print('submit')
    utils.submit(FILE_out, COMMENT)
Example #31
def main():

    #========================================================================
    # Data Load
    #========================================================================

    base = utils.read_df_pkl('../input/base_app*')
    win_path_list = glob.glob(win_path)
    train_path_list = []
    test_path_list = []
    for path in win_path_list:
        if path.count('train'):
            train_path_list.append(path)
        elif path.count('test'):
            test_path_list.append(path)

    base_train = base[~base[target].isnull()].reset_index(drop=True)
    base_test = base[base[target].isnull()].reset_index(drop=True)
    train_feature_list = utils.pararell_load_data(path_list=train_path_list)
    test_feature_list = utils.pararell_load_data(path_list=test_path_list)
    train = pd.concat(train_feature_list, axis=1)
    train = pd.concat([base_train, train], axis=1)
    test = pd.concat(test_feature_list, axis=1)
    test = pd.concat([base_test, test], axis=1)

    ir_list = [col for col in test.columns if col.count('ir_')]
    #  test[ir_list] = test[ir_list] + 0.005
    #  train['CNT_PAYMENT@'] = train['CNT_PAYMENT@'].where( train['CNT_PAYMENT@']<=39, np.nan)
    #  test['CNT_PAYMENT@'] = test['CNT_PAYMENT@'].where( test['CNT_PAYMENT@']<=39, np.nan)

    # for experiments
    #  df = utils.read_df_pkl('../input/clean_app*').sample(50000)
    #  train = df[df[target]>=0]
    #  test = df[df[target].isnull()]

    metric = 'auc'
    fold = 5
    fold_type = 'stratified'
    group_col_name = ''
    dummie = 1
    oof_flg = True
    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)

    train, test, drop_list = LGBM.data_check(train=train,
                                             test=test,
                                             target=target)
    if len(drop_list):
        train.drop(drop_list, axis=1, inplace=True)
        test.drop(drop_list, axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)

    #========================================================================
    # Result
    #========================================================================
    cv_score = LGBM.cv_score
    result = LGBM.prediction
    cv_feim = LGBM.cv_feim
    feature_num = len(LGBM.use_cols)

    cv_feim.to_csv(
        f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv',
        index=False)

    #========================================================================
    # Compute and output X-RAY
    # Args:
    #     model    : the trained model
    #     train    : the dataset used to train the model
    #     col_list : columns for which to compute X-RAY. If omitted, all
    #                columns of the dataset are used, but given the
    #                computation time, at most ~30 columns is recommended.
    #========================================================================
    if xray:
        train.reset_index(inplace=True)
        train = train[LGBM.use_cols]
        result_xray = pd.DataFrame()
        N_sample = 500000
        max_point = 30
        for fold_num in range(fold):
            model = LGBM.fold_model_list[fold_num]
            if fold_num == 0:
                xray_obj = Xray_Cal(logger=logger,
                                    ignore_list=ignore_list,
                                    model=model)
            xray_obj, tmp_xray = xray_obj.get_xray(base_xray=train,
                                                   col_list=train.columns,
                                                   fold_num=fold_num,
                                                   N_sample=N_sample,
                                                   max_point=max_point,
                                                   Pararell=True)
            tmp_xray.rename(columns={'xray': f'xray_{fold_num}'}, inplace=True)

            if len(result_xray):
                result_xray = result_xray.merge(tmp_xray.drop('N', axis=1),
                                                on=['feature', 'value'],
                                                how='inner')
            else:
                result_xray = tmp_xray.copy()
            del tmp_xray
            gc.collect()

        xray_col = [col for col in result_xray.columns if col.count('xray')]
        result_xray['xray_avg'] = result_xray[xray_col].mean(axis=1)
        result_xray.to_csv(
            f'../output/{start_time[4:10]}_xray_{model_type}_CV{LGBM.cv_score}.csv',
            index=False)
        sys.exit()

    submit = pd.read_csv('../input/sample_submission.csv')
    #  submit = []

    #========================================================================
    # STACKING
    #========================================================================
    if len(stack_name) > 0:
        logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
        utils.to_pkl(
            path=f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features.fp",
            obj=LGBM.result_stack)
    logger.info(
        f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_auc_{cv_score}.csv'
    )

    #========================================================================
    # Submission
    #========================================================================
    if len(submit) > 0:
        if stack_name == 'add_nest':
            test[target] = result
            test = test.reset_index()[[
                key, target
            ]].groupby(key)[target].mean().reset_index()
            submit = submit[key].to_frame().merge(test, on=key, how='left')
            submit[target].fillna(0, inplace=True)
            submit_path = f'../submit/{start_time[4:12]}_submit_{fname}_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv'
            submit.to_csv(submit_path, index=False)
            utils.submit(file_path=submit_path)

        else:
            submit[target] = result
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
Example #32
def mk_submit():

    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # remove old users
    X_train = X_train[new_train_users]
    y_train = y_train[new_train_users]

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))

    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)], axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train,
                             y_train,
                             categorical_feature=CAT,
                             free_raw_data=False)

        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
Example #33
sub = X[['click_id']]
sub.click_id = sub.click_id.map(int)

X.drop('click_id', axis=1, inplace=True)
X.fillna(-1, inplace=True)


dtest = xgb.DMatrix(X[train_head.columns])
del X; gc.collect()

sub['is_attributed'] = 0
y_pred = model.predict(dtest)
sub['is_attributed'] += pd.Series(y_pred).rank()
#sub['is_attributed'] /= LOOP
sub['is_attributed'] /= sub['is_attributed'].max()
sub['click_id'] = sub.click_id.map(int)

sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    utils.submit(SUBMIT_FILE_PATH, COMMENT)


#==============================================================================
utils.end(__file__)

Example #34
def main():
    # load submission files
    print('load files...')
    sub_weekday = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_weekday.csv')
    sub_holiday = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_holiday.csv')

    # load oof files
    oof_weekday = pd.read_csv('../output/oof_lgbm_group_k_fold_weekday.csv')
    oof_holiday = pd.read_csv('../output/oof_lgbm_group_k_fold_holiday.csv')

    # merge
    sub = pd.concat([sub_weekday, sub_holiday])

    oof = pd.concat([oof_weekday, oof_holiday])

    del sub_weekday, sub_holiday, oof_weekday, oof_holiday
    gc.collect()

    # to pivot
    print('to pivot...')
    sub = sub.pivot(index='id', columns='d', values='demand').reset_index()
    oof = oof.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2 (copies avoid pandas SettingWithCopy warnings)
    sub1 = oof[['id'] + COLS_TEST1].copy()
    sub2 = sub[['id'] + COLS_TEST2].copy()

    # change column names
    sub1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test1 id
    sub1['id'] = sub1['id'].str.replace('_evaluation', '_validation')

    # merge
    sub = pd.concat([sub1, sub2])

    # postprocessing
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if 'd_' in c]

    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name, index=False)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # submission by API
    submit(submission_file_name, comment='model411 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))