Ejemplo n.º 1
0
import numpy as np
import pandas as pd
import multiprocessing
from functools import partial
from sklearn.preprocessing import LabelEncoder

from kairbnb.preprocessing import process_user_actions
from kairbnb.preprocessing import process_user_secs_elapsed
from kairbnb.io import load_users

# Row cap handed to both loaders below (pandas reads every row when it is None).
NROWS = None
# Version tag of this run; not referenced in the visible part of the script.
VERSION = '5'

if __name__ == '__main__':
    # Both raw sources are read with the same options: the optional row cap
    # and the literal '-unknown-' parsed as a missing value.
    read_options = {'nrows': NROWS, 'na_values': '-unknown-'}
    train_users, test_users = load_users(**read_options)
    sessions = pd.read_csv('../data/sessions.csv', **read_options)

    # Stack train and test users into a single frame keyed by user id and
    # leave out date_first_booking (empty since competition's restart).
    users = (
        pd.concat((train_users, test_users), axis=0, ignore_index=True)
        .set_index('id')
        .drop('date_first_booking', axis=1)
    )

    # Blank out implausible ages: anything above 100 or below 13.
    implausible_age = (users['age'] > 100) | (users['age'] < 13)
    users.loc[implausible_age, 'age'] = np.nan

    # Change type to date
import numpy as np
import pandas as pd
import multiprocessing
from functools import partial
from sklearn.preprocessing import LabelEncoder

from kairbnb.preprocessing import process_user_actions
from kairbnb.preprocessing import process_user_secs_elapsed
from kairbnb.io import load_users

# Maximum number of rows requested from each loader; None leaves it unset.
NROWS = None
# Version tag of this run; not referenced in the visible part of the script.
VERSION = '5'

if __name__ == '__main__':
    # Raw inputs: the user tables from the project loader and the session
    # log from CSV. '-unknown-' entries are parsed as missing values.
    train_users, test_users = load_users(na_values='-unknown-', nrows=NROWS)
    sessions = pd.read_csv(
        '../data/sessions.csv',
        na_values='-unknown-',
        nrows=NROWS,
    )

    # One frame holding train and test users, indexed by user id.
    users = pd.concat(
        (train_users, test_users), axis=0, ignore_index=True
    ).set_index('id')

    # date_first_booking is dropped (empty since competition's restart).
    users = users.drop('date_first_booking', axis=1)

    # Ages outside the 13..100 range are treated as missing.
    too_old = users['age'] > 100
    too_young = users['age'] < 13
    users.loc[too_old | too_young, 'age'] = np.nan
Ejemplo n.º 3
0
#!/usr/bin/env python

from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

from kairbnb.io import generate_submission, load_users

# Version tag passed to load_users() below.
VERSION = '5'
# Run name derived from the version; not used in the visible part of the
# script (presumably names the generated output -- confirm).
NAME = 'gb_' + VERSION

if __name__ == '__main__':
    # Train a gradient-boosting classifier on the user tables.
    train_users, test_users = load_users(version=VERSION)

    # Split the training frame into target and features: keep the destination
    # column as the target, then drop it together with the id column.
    y_train = train_users['country_destination']
    train_users.drop(['country_destination', 'id'], axis=1, inplace=True)
    # Missing feature values are replaced with -1.
    train_users = train_users.fillna(-1)
    x_train = train_users.values
    # Encode the destination labels as integers for the classifier.
    label_encoder = LabelEncoder()
    encoded_y_train = label_encoder.fit_transform(y_train)

    # Keep the test ids aside before dropping the column (presumably needed
    # for the submission; the generate_submission call is not visible here).
    test_users_ids = test_users['id']
    test_users.drop('id', axis=1, inplace=True)
    test_users = test_users.fillna(-1)
    x_test = test_users.values

    # NOTE(review): "rank:pairwise" is a ranking objective; confirm it is
    # intended for a classifier trained on the encoded destination labels.
    clf = XGBClassifier(max_depth=7,
                        learning_rate=0.18,
                        n_estimators=80,
                        objective="rank:pairwise",
                        gamma=0,
                        min_child_weight=1,
Ejemplo n.º 4
0
import pandas as pd

from kairbnb.preprocessing import one_hot_encoding
from kairbnb.io import load_users

# Version tag passed to load_users() below.
VERSION = '5'

if __name__ == '__main__':

    # Load raw data
    train_users, test_users = load_users(version=VERSION)

    # Combined train + test frame, indexed by user id.
    users = pd.concat(
        (train_users, test_users), axis=0, ignore_index=True
    ).set_index('id')

    # The individual frames get the same id index.
    train_users, test_users = (
        frame.set_index('id') for frame in (train_users, test_users)
    )

    # Date/timestamp columns removed from the combined frame.
    drop_list = [
        'date_account_created',
        'date_first_active',
        'timestamp_first_active',
    ]
    users = users.drop(drop_list, axis=1)

    # IDEA: Add interaction features
Ejemplo n.º 5
0
from kairbnb.io import load_users

# Version tag passed to load_users() below.
VERSION = '1'

if __name__ == '__main__':
    # NOTE(review): argparse, LabelEncoder and XGBClassifier are used below
    # but are not imported in the visible lines -- confirm they are imported
    # elsewhere in the file.

    # Hyper-parameters exposed as command-line options, each with a default.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--max_depth', default=7, type=int)
    parser.add_argument('-lr', '--learning_rate', default=0.18, type=float)
    parser.add_argument('-n', '--n_estimators', default=80, type=int)
    parser.add_argument('-ct', '--colsample_bytree', default=1, type=float)
    parser.add_argument('-cl', '--colsample_bylevel', default=1, type=float)
    parser.add_argument('-sub', '--subsample', default=1, type=float)
    parser.add_argument('-md', '--max_delta', default=0, type=float)
    args = parser.parse_args()

    # Only the training users are needed; the second return value is ignored.
    train_users, _ = load_users(version=VERSION)
    # Missing values become -1 across the whole frame (the target column is
    # still present at this point).
    train_users.fillna(-1, inplace=True)
    # Separate the target, then drop it and the id column from the features.
    y_train = train_users['country_destination']
    train_users.drop(['country_destination', 'id'], axis=1, inplace=True)
    x_train = train_users.values

    # Encode the destination labels as integers for the classifier.
    label_encoder = LabelEncoder()
    encoded_y_train = label_encoder.fit_transform(y_train)

    # Classifier configured from the parsed options; gamma and
    # min_child_weight are fixed.
    xgb = XGBClassifier(
        max_depth=args.max_depth,
        learning_rate=args.learning_rate,
        n_estimators=args.n_estimators,
        objective="multi:softprob",
        gamma=0,
        min_child_weight=1,
Ejemplo n.º 6
0
from kairbnb.io import load_users

VERSION = '1'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--max_depth', default=7, type=int)
    parser.add_argument('-lr', '--learning_rate', default=0.18, type=float)
    parser.add_argument('-n', '--n_estimators', default=80, type=int)
    parser.add_argument('-ct', '--colsample_bytree', default=1, type=float)
    parser.add_argument('-cl', '--colsample_bylevel', default=1, type=float)
    parser.add_argument('-sub', '--subsample', default=1, type=float)
    parser.add_argument('-md', '--max_delta', default=0, type=float)
    args = parser.parse_args()

    train_users, _ = load_users(version=VERSION)
    train_users.fillna(-1, inplace=True)
    y_train = train_users['country_destination']
    train_users.drop(['country_destination', 'id'], axis=1, inplace=True)
    x_train = train_users.values

    label_encoder = LabelEncoder()
    encoded_y_train = label_encoder.fit_transform(y_train)

    xgb = XGBClassifier(max_depth=args.max_depth,
                        learning_rate=args.learning_rate,
                        n_estimators=args.n_estimators,
                        objective="multi:softprob",
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,