def predictTests_Bulk(model, inference_config, validpath, plot, cnt=None):
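    """Batched inference for the Airbus ship-detection model.

    Feeds IMAGES_PER_GPU images at a time to model.detect(), converts each
    result to run-length-encoded masks with convert_result(), and writes and
    uploads an ImageId/EncodedPixels submission CSV via kaggle_util.
    """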
    print('BATCH SIZE', inference_config.IMAGES_PER_GPU)
    dataset_valid = ShapesDataset()
    dataset_valid.load_imgs(validpath)
    dataset_valid.prepare()

    new_test_ids = []
    rles = []

    tot = len(dataset_valid.image_ids)
    prog = tqdm(total=tot)
    for i in range(0, tot, inference_config.IMAGES_PER_GPU):
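        # Assumes the dataset size is a multiple of IMAGES_PER_GPU, so the
        # i + j indexing below stays in range and model.detect() always
        # receives a full batch.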
        if cnt is not None and i > cnt:
            break
        imgs = []
        img_metas = []
        file_ids = []
        for j in range(inference_config.IMAGES_PER_GPU):
            idx = i + j
            image_id = dataset_valid.image_ids[idx]
            file_id = dataset_valid.image_info[image_id]['id']
            scaled_image, image_meta, _, _, _ =\
                modellib.load_image_gt(dataset_valid, inference_config,
                                       image_id, use_mini_mask=True, augment=False)
            file_ids.append(file_id)
            imgs.append(scaled_image)
            img_metas.append(image_meta)

        results = model.detect(imgs, verbose=0)
        for k in range(len(results)):
            rle = convert_result(imgs[k], results[k], img_metas[k],
                                 dataset_valid, plot)
            file_id = file_ids[k]

            rles.extend(rle)
            new_test_ids.extend([file_id] * len(rle))

        prog.update(inference_config.IMAGES_PER_GPU)

    sub = pd.DataFrame()
    sub['ImageId'] = new_test_ids
    sub['EncodedPixels'] = pd.Series(rles).apply(
        lambda x: ' '.join(str(y) for y in x))
    sub.to_csv('../result/{}_org.csv'.format(inference_config.NAME),
               index=False)
    kaggle_util.save_result(sub,
                            '../result/{}.csv'.format(inference_config.NAME),
                            competition='airbus-ship-detection',
                            send=True,
                            index=False)
    return sub
def doodle_predict(model, model_path, x_test):
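    """Inference helper for the Quick, Draw! Doodle Recognition submission.

    Loads trained weights, predicts class probabilities for x_test, keeps the
    top-3 categories per drawing, and uploads a key_id/word submission.
    `test`, `preds2catids`, `id2cat` and `model_prefix` are defined elsewhere
    in the original script.
    """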
    model.load_weights(model_path)

    test_predictions = model.predict(x_test, batch_size=128, verbose=1)
    top3 = preds2catids(test_predictions)
    top3cats = top3.replace(id2cat)
    test['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
    submission = test[['key_id', 'word']]

    import kaggle_util
    kaggle_util.save_result(submission,  
                            '../result/{}.csv'.format(model_prefix), 
                            'quickdraw-doodle-recognition', 
                            send=True, index=False)
Example No. 3
def save_ensemble(LABELS, prediction, prefix, mname, send):
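    """Save raw ensemble predictions and build a Freesound Audio Tagging submission.

    Dumps `prediction` to an .npy file, picks the top-3 labels per clip, and
    writes (and optionally uploads) the fname/label submission CSV.
    """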
    np.save('../result/ensembles/{}_{}.npy'.format(mname, prefix), prediction)
    # Make a submission file
    top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test = pd.read_csv('../data/sample_submission.csv')
    test['label'] = predicted_labels

    filename = '../result/{}_{}.csv'.format(mname, prefix)
    kaggle_util.save_result(test[['fname', 'label']],
                            filename,
                            'freesound-audio-tagging',
                            send=send,
                            index=False)
def predictTests(model,
                 config,
                 validpath,
                 plot,
                 flex='sub',
                 send=False,
                 fold=-1,
                 valid=-1):
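    """Single-image inference for the Airbus ship-detection model.

    Loads either the fold's validation images (fold >= 0 and valid == 1) or
    the test images, predicts one image at a time with predictOne(), collects
    the RLE masks, and writes/optionally uploads an ImageId/EncodedPixels
    submission CSV.
    """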
    print('fold', fold, 'valid', valid)
    dataset_valid = ShapesDataset(fold, valid, -1)
    if fold >= 0 and valid == 1:
        dataset_valid.load_imgs(validpath)
    else:
        dataset_valid.load_test_imgs(validpath)
    dataset_valid.prepare()

    #file_id, rle = predictOne(model, dataset_valid, inference_config, 1)

    new_test_ids = []
    rles = []

    for i in tqdm(range(len(dataset_valid.image_ids))):
        file_id, rle = predictOne(model, dataset_valid, config, i, plot=plot)
        rles.extend(rle)
        new_test_ids.extend([file_id] * len(rle))

        #if i > 10:
        #    break

    from keras import backend as K
    K.clear_session()

    sub = pd.DataFrame()
    sub['ImageId'] = new_test_ids
    sub['EncodedPixels'] = pd.Series(rles).apply(
        lambda x: ' '.join(str(y) for y in x))

    sub.to_csv('../result/{}_{}.csv'.format(flex, config.NAME), index=False)
    kaggle_util.save_result(sub,
                            '../result/{}_{}.csv'.format(config.NAME, flex),
                            competition='airbus-ship-detection',
                            send=send,
                            index=False)
    return sub
Example No. 5
def main_crossvalid_xgboost(frm, to):
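    """5-fold cross-validated XGBoost regressor for Avito Demand Prediction.

    Stacks the dense feature frame with the pre-built sparse matrix (ready_df),
    trains one XGBRegressor per StratifiedKFold split, reports validation RMSE,
    saves a per-fold submission, and averages the folds with
    kaggle_util.ensemble().
    """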
    import xgboost as xgb

    nfold = 5
    df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, tfvocab, testdex = get_crossvalid_data(
        frm, to)

    cat_features = []
    cols = list(df.columns)
    for col in categorical:
        cat_features.append(cols.index(col))

    #lgtest = xgb.DMatrix(testing.toarray())
    #del testing
    #gc.collect()

    skf = StratifiedKFold(y, n_folds=nfold)
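    # Note: StratifiedKFold(y, n_folds=...) above is the legacy
    # sklearn.cross_validation API; modern scikit-learn would use
    # StratifiedKFold(n_splits=nfold).split(X, y) instead.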

    for i, (train_split, val_split) in enumerate(skf):
        #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=5)
        print(train_split)
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values),
             ready_df[val_split]])  # Sparse Matrix
        y_train = y[train_split]
        y_valid = y[val_split]

        #lgtrain = xgb.DMatrix(X_train.toarray(), label = y_train)
        #lgvalid = xgb.DMatrix(X_valid.toarray(), label = y_valid)

        #del X_train, X_valid, y_train
        #gc.collect()

        modelstart = time.time()

        bst = xgb.XGBRegressor(n_estimators=400,
                               booster='gbtree',
                               learning_rate=0.016,
                               gamma=0,
                               subsample=0.75,
                               colsample_bylevel=0.5,
                               max_depth=16,
                               nthread=6)

        bst.fit(X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                verbose=False,
                early_stopping_rounds=100)

        print("Model Evaluation Stage")
        ypre = bst.predict(X_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_valid, ypre))
        print('RMSE:', rmse)
        """
        f, ax = plt.subplots(figsize=[7,10])
        xgb.plot_importance(bst, ax=ax, max_num_features = 50)
        plt.title("Light GBM Feature Importance")
        plt.savefig('xgb_feature_import.png', bbox_inches='tight')
        """

        lgpred = bst.predict(testing)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1

        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='xgb_avg')
Example No. 6
def main_crossvalid(frm, to):
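    """5-fold cross-validated LightGBM regressor for Avito Demand Prediction.

    Uses the same StratifiedKFold splits and dense + sparse feature layout as
    the XGBoost variant above; each fold's model and submission are saved and
    the fold submissions are then averaged with kaggle_util.ensemble().
    """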
    nfold = 5
    df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, tfvocab, testdex = get_crossvalid_data(
        frm, to)

    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 270,  # 37,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.65,
        'bagging_freq': 2,
        'learning_rate': 0.016,
        #'max_depth' : 8,
        #'min_split_gain' : 0.0222415,
        #'min_child_weight' : 20,
        'nthread': 5,
        'verbose': 0,
        #'reg_alpha' : 0.041545473,
        #'reg_lambda' : 0.0735294,
        'drop_rate': 0.08
    }

    skf = StratifiedKFold(y, n_folds=nfold)

    for i, (train_split, val_split) in enumerate(skf):
        #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=5)
        print(train_split)
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values),
             ready_df[val_split]])  # Sparse Matrix
        y_train = y[train_split]
        y_valid = y[val_split]

        lgtrain = lgb.Dataset(X_train,
                              y_train,
                              feature_name=tfvocab,
                              categorical_feature=categorical)
        lgvalid = lgb.Dataset(X_valid,
                              y_valid,
                              feature_name=tfvocab,
                              categorical_feature=categorical)

        modelstart = time.time()
        lgb_clf = lgb.train(lgbm_params,
                            lgtrain,
                            num_boost_round=26000,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train', 'valid'],
                            early_stopping_rounds=100,
                            verbose_eval=100)

        print("Model Evaluation Stage")
        rmse = np.sqrt(
            metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
        print('RMSE:', rmse)

        f, ax = plt.subplots(figsize=[7, 10])
        lgb.plot_importance(lgb_clf, max_num_features=100, ax=ax)
        plt.title("Light GBM Feature Importance")
        plt.savefig('feature_import.png', bbox_inches='tight')

        str_now = datetime.now().strftime("%m-%d-%H-%M")
        if not debug:
            lgb_clf.save_model('../model/model_{}.txt'.format(i),
                               lgb_clf.best_iteration)
        else:
            lgb_clf.save_model('../model/model_debug_{}.txt'.format(i),
                               lgb_clf.best_iteration)

        lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1

        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         False,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='lgb_avg')
Example No. 7
def main(frm, to):
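    """Single-split LightGBM run for Avito Demand Prediction.

    Loads the cached train/test frame if present, otherwise rebuilds it from
    the raw CSVs, assembles a sparse feature matrix, trains LightGBM against a
    random 5% validation split, and writes/uploads the clipped
    deal_probability submission. `debug` and `notebookstart` are module-level
    globals in the original script.
    """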

    testing = pd.read_csv('../input/test.csv',
                          skiprows=range(1, frm),
                          nrows=to - frm,
                          index_col="item_id",
                          parse_dates=["activation_date"])
    testdex = testing.index
    len_test = len(testing)

    tot_filename = '/media/extend/cache/total_{}_{}.csv'.format(frm, to)
    tot_yname = '/media/extend/cache/total_y_{}_{}.csv'.format(frm, to)
    if os.path.exists(tot_filename) and os.path.exists(tot_yname):
        print('load from cached csv')
        #df = pd.read_feather(tot_filename).set_index("item_id")
        #y = pd.read_feather(tot_yname).set_index("item_id").deal_probability.copy()
        df = pd.read_csv(tot_filename).set_index("item_id")
        y = pd.read_csv(tot_yname).set_index("item_id").deal_probability.copy()

        len_train = to - frm
    else:
        training = pd.read_csv('../input/train.csv',
                               skiprows=range(1, frm),
                               nrows=to - frm,
                               index_col="item_id",
                               parse_dates=["activation_date"])
        len_train = len(training)

        y = training.deal_probability.copy()
        training.drop("deal_probability", axis=1, inplace=True)
        #y.reset_index().to_feather(tot_yname)
        y.reset_index().to_csv(tot_yname)

        print('Train shape: {} Rows, {} Columns'.format(*training.shape))
        print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

        df = pd.concat([training, testing], axis=0)
        del training, testing

    predictors = []
    y, df, ready_df, tfvocab, predictors, len_train, categorical =  \
        preparTotalData(y, df, predictors, len_train, len_test, frm, to, tot_filename)

    none_categorical = [x for x in df.columns if x not in categorical]

    df = df[predictors]
    print(df.info())

    print("Modeling Stage")
    X = hstack([csr_matrix(df[:len_train].values),
                ready_df[0:len_train]])  # Sparse Matrix
    testing = hstack([csr_matrix(df[len_train:].values), ready_df[len_train:]])
    tfvocab = df.columns.tolist() + tfvocab
    for shape in [X, testing]:
        print("{} Rows and {} Cols".format(*shape.shape))
    print("Feature Names Length: ", len(tfvocab))
    del df
    gc.collect()

    print("\nModeling Stage")

    # Training and Validation Set
    """
    Using Randomized train/valid split doesn't seem to generalize LB score, so I will try time cutoff
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.05,
                                                          random_state=5)
    """
    total_len = X.shape[0]
    train_len = int(total_len * 0.9)
    X = X.tocsr()
    X_train = X[:train_len]
    X_valid = X[train_len:]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    """
    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        #'max_depth': 15,
        'num_leaves': 270,  # 37,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.85,
        # 'bagging_freq': 5,
        'learning_rate': 0.018,
        'nthread': 6,
        'verbose': 0,
        #'device':'gpu',
        #'gpu_platform_id':0,
        #'gpu_device_id':0
    }

    # LGBM Dataset Formatting
    lgtrain = lgb.Dataset(X_train,
                          y_train,
                          feature_name=tfvocab,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid,
                          y_valid,
                          feature_name=tfvocab,
                          categorical_feature=categorical)

    # Go Go Go
    modelstart = time.time()
    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=26000,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=100,
                        verbose_eval=100)

    # Feature Importance Plot
    #f, ax = plt.subplots(figsize=[7,10])
    #lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
    #plt.title("Light GBM Feature Importance")
    #plt.savefig('feature_import.png', bbox_inches='tight')

    print("Model Evaluation Stage")
    rmse = np.sqrt(
        metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
    print('RMSE:', rmse)

    str_now = datetime.now().strftime("%m-%d-%H-%M")
    if not debug:
        lgb_clf.save_model('../model/model_{}.txt'.format(str_now),
                           lgb_clf.best_iteration)
    else:
        lgb_clf.save_model('../model/model_debug.txt', lgb_clf.best_iteration)

    #lgb_clf = lgb.Booster(model_file='../model/model_05-13-21-50.txt')

    lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
    lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
    lgsub['deal_probability'].clip(0.0, 1.0, inplace=True)  # Between 0 and 1
    #lgsub.to_csv("lgsub.csv",index=True,header=True)

    if not debug:
        kaggle_util.save_result(
            lgsub,
            '../result/dense_feature_{}.csv'.format(str_now),
            competition='avito-demand-prediction',
            send=True,
            index=True)
    print("Model Runtime: %0.2f Minutes" % ((time.time() - modelstart) / 60))
    print("Notebook Runtime: %0.2f Minutes" %
          ((time.time() - notebookstart) / 60))
Example No. 8
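# Truncated snippet: selects the fold with the lowest validation RMSE from
# Kfold_preds_final, then submits both the fold-averaged predictions
# (pred_final1) and the best single fold (pred_final2). RMSE, min_value,
# pred_final1, frm, to and debug are defined earlier in the original script.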
RMSE_idx = RMSE.index(min_value)
print(RMSE_idx)
pred_final2 = Kfold_preds_final[RMSE_idx]
print(pred_final2.shape)

#del Kfold_preds_final, train1
gc.collect()

test_cols = ['item_id']
test = pd.read_csv('../input/test.csv', skiprows=range(1, frm), nrows=to - frm, usecols=test_cols)

# using Average of KFOLD preds 

submission1 = pd.DataFrame(columns=['item_id', 'deal_probability'])

submission1['item_id'] = test['item_id']
submission1['deal_probability'] = pred_final1

print("Check Submission NOW!!!!!!!!@")
#submission1.to_csv("Avito_Shanth_RNN_AVERAGE.csv", index=False)
kaggle_util.save_result(submission1, '../result/rnn_avg.csv', competition='avito-demand-prediction', send=not debug, index=False)

# Using KFOLD preds with Minimum value 
submission2 = pd.DataFrame(columns=['item_id', 'deal_probability'])

submission2['item_id'] = test['item_id']
submission2['deal_probability'] = pred_final2

print("Check Submission NOW!!!!!!!!@")
#submission2.to_csv("Avito_Shanth_RNN_MIN.csv", index=False)
kaggle_util.save_result(submission2, '../result/rnn_min.csv', competition='avito-demand-prediction', send=False, index=False)
Example No. 9
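# Truncated snippet: the tail of loadData() followed by the GRU/Capsule
# inference driver, which restores the best checkpoint, predicts on x_test,
# clips deal_probability to [0, 1] and submits the result.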
    
    return X_tra, X_val, y_tra, y_val, x_test

if __name__ == "__main__":
    
    
    X_tra, X_val, y_tra, y_val, x_test = loadData()
    
    
    model = getModel()
    batch_size = 3000
    epochs = 10
    # filepath="../input/best-model/best.hdf5"
    filepath="../model/weights_base.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_root_mean_squared_error', verbose=1, save_best_only=True, mode='min')
    early = EarlyStopping(monitor="val_root_mean_squared_error", mode="min", patience=5)
    callbacks_list = [checkpoint, early]
    
    #model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks = callbacks_list,verbose=1)
    #Loading model weights
    model.load_weights(filepath)
    print('Predicting....')
    y_pred = model.predict(x_test, batch_size=1024, verbose=1)
    
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = y_pred
    sub['deal_probability'].clip(0.0, 1.0, inplace=True)
    sub.to_csv('gru_capsule_description.csv', index=False)
    
    kaggle_util.save_result(sub, '../result/capsule.csv', competition='avito-demand-prediction', send=True)
Example No. 10
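# Truncated snippet: the end of a per-fold RNN loop. Each fold's predictions
# are clipped and saved as a numbered submission, and the fold files are then
# averaged with kaggle_util.ensemble() (the call is cut off below).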
        lgsub = pd.DataFrame(y_pred,
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1
        del modelRNN
        gc.collect()

        print("Number of folds completed...." + str(k))
        #print(Kfold_preds_final[k][0:10])
        k += 1
        K.clear_session()

        kaggle_util.save_result(lgsub,
                                '../result/rnn_{}.csv'.format(k),
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    print("All Folds completed" + str(k + 1))
    print("RNN FOLD MODEL Done")

    result_list = []
    for i in range(nfold):
        subfile = '../result/rnn_{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
Example No. 11
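# Truncated snippet: the end of a per-fold LightGBM loop (the "ridge" variant).
# Each fold's predictions are clipped and saved, then the fold submissions are
# averaged with kaggle_util.ensemble().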
            lgb_clf.save_model('../model/ridge_debug_{}.txt'.format(i),
                               lgb_clf.best_iteration)

        lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1

        subfile = '../result/ridge_{}.csv'.format(i)
        if debug:
            subfile = '../result/ridge_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/ridge_{}.csv'.format(i)
        if debug:
            subfile = '../result/ridge_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability')
Example No. 12
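# Truncated snippet: the tail of a Keras regression run ("mixnn"). Zero-target
# rows get a larger sample_weight, the best checkpoint is restored, and the
# predictions are written to a timestamped submission.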
    epochs = 200

    sample_weight = np.ones(y.shape)
    sample_weight[y < 1e-7] = 1 + len(y[y < 1e-7]) / len(y)
    history = model.fit(X_train,
                        y,
                        sample_weight=sample_weight,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.05,
                        verbose=1,
                        callbacks=[check_point, early_stop])

    model.load_weights(file_path)
    pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    print('pred shape {}'.format(pred.shape))

    sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
    print('sub shape {}'.format(sub.shape))
    sub[target_col] = pred

    scr = min(history.history['val_root_mean_squared_error'])
    print('save to ' + f'mixnn_{scr}.csv')

    str_now = datetime.now().strftime("%m-%d-%H-%M")
    kaggle_util.save_result(sub,
                            '../result/mixnn_{}.csv'.format(str_now),
                            competition='avito-demand-prediction',
                            send=not debug,
                            index=False)