Ejemplo n.º 1
0
def load_data():
    """Read the train/test CSV pair selected by the ``preprocessing`` flag.

    When the module-level ``preprocessing`` flag is truthy the plain files
    are read; otherwise the ``*_agg`` files are read instead.

    Returns:
        The two objects produced by ``utils.read_csv_files``, in the order
        (train, test).
    """
    if preprocessing:
        csv_paths = ('./data/train.csv', './data/test.csv')
    else:
        csv_paths = ('./data/train_agg.csv', './data/test_agg.csv')

    train_org, test_org = utils.read_csv_files(*csv_paths)
    return train_org, test_org
Ejemplo n.º 2
0
def load_data():
    """Read the 1000-row train/test CSV files and list the test columns.

    Returns:
        A ``(train, test, features)`` tuple where ``features`` is the list
        of column labels of the test set.
    """
    train_set, test_set = utils.read_csv_files(
        './data/train1000.csv',
        './data/test1000.csv',
    )
    # The feature list is taken from the test set's columns.
    return train_set, test_set, test_set.columns.tolist()
Ejemplo n.º 3
0
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe


store = pd.read_csv('./data/store.csv')

# Load data.
# StateHoliday is read as text. ``str`` replaces the former ``pd.np.string_``:
# the ``pandas.np`` alias was deprecated in pandas 1.0 and removed in 2.0.
# NOTE(review): assumes utils.read_csv_files forwards ``dtype`` to
# pandas.read_csv -- confirm against the helper.
if sample:
    if gridsearch:
        train_org, test_org = utils.read_csv_files(
            './data/train.csv',
            './data/test.csv',
            dtype={'StateHoliday': str})
    else:
        # Hold out part of the training file as a local test set.
        df = pd.read_csv('./data/train.csv', dtype={'StateHoliday': str})
        # Deterministic pseudo-random split: a row goes to the training part
        # when (15 * Sales + 3 * Customers) mod 17 is at most 13.
        df['is_train'] = (df['Sales'] * 15 + df['Customers'] * 3) % 17 <= 13
        # 'is_train' is a boolean column, so it can be used as a mask directly.
        train_org, test_org = df[df['is_train']], df[~df['is_train']]
else:
    # To run with real data
    train_org, test_org = utils.read_csv_files(
        './data/train.csv',
        './data/test.csv',
        dtype={'StateHoliday': str})
Ejemplo n.º 4
0
# coding: utf-8
"""
Beating the benchmark @ Kaggle Springleaf
@author: Abhishek Thakur
"""


import pandas as pd
import numpy as np
import xgboost as xgb
import utils
from sklearn import preprocessing, linear_model

train, test = utils.read_csv_files('data/train.csv', 'data/test.csv')

# Separate the label from the features and discard the row identifier.
y = train.target.values
train = train.drop(['ID', 'target'], axis=1)
test = test.drop('ID', axis=1)

# Drop every column that contains a missing value. The two frames are
# filtered independently, so they can end up with different column sets.
# (The former fillna(-1) calls that followed were no-ops: no NaN is left
# after dropna(axis=1).)
train = train.dropna(axis=1)
test = test.dropna(axis=1)

# Keep only the columns that survived in BOTH frames; otherwise the encoding
# loop below raises KeyError on test[f] for a column dropped from test only.
train = train.drop([c for c in train.columns if c not in test.columns], axis=1)
test = test.drop([c for c in test.columns if c not in train.columns], axis=1)

# Label-encode string columns. The encoder is fitted on the values of both
# frames so that every category seen in test has a code.
for f in train.columns:
    if train[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))
Ejemplo n.º 5
0
# Silence pandas' chained-assignment warning: columns are assigned on
# frames that were obtained by slicing ``df`` below.
pd.options.mode.chained_assignment = None

sample = True

goal = 'Survived'
myid = 'PassengerId'

# Load data
if sample:
    # Split the training file into two parts: rows whose id modulo 10 is
    # 5 or more become the training part, the remaining rows the test part.
    df = pd.read_csv('./data/train.csv')
    df['is_train'] = (df[myid] % 10) >= 5
    train = df[df['is_train']]
    test = df[~df['is_train']]
else:
    # To run with real data
    train, test = utils.read_csv_files('./data/train.csv', './data/test.csv')

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
features_non_numeric = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Fare', 'Embarked',
                        'firstname', 'titles']

# Apply the same feature engineering to both frames.
for _frame in (train, test):
    # Text before the first comma of Name.
    _frame['firstname'] = _frame['Name'].apply(lambda full_name: full_name.split(',')[0])
    # Text between the first comma and the following period of Name.
    _frame['titles'] = _frame['Name'].apply(
        lambda full_name: full_name.split(',')[1].split('.')[0])
    # Replace missing ages with 35 and missing fares with 30.
    _frame['Age'] = _frame['Age'].apply(lambda age: 35 if math.isnan(age) else age)
    _frame['Fare'] = _frame['Fare'].apply(lambda fare: 30 if math.isnan(fare) else fare)

# Pre-processing non-number values
le = LabelEncoder()
Ejemplo n.º 6
0
def rmspe_xg(yhat, y):
    """Compute a weighted root-mean-square error on the un-logged scale.

    Both the labels and the predictions are mapped back with ``exp(v) - 1``
    before the error is computed; the squared errors are weighted by
    ``ToWeight`` of the un-logged labels.

    Args:
        yhat: Predictions on the log scale.
        y: Object exposing ``get_label()`` that returns the log-scale labels
            (presumably an xgboost ``DMatrix`` -- confirm against the caller).

    Returns:
        The tuple ``("rmspe", value)``.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    weights = ToWeight(actual)
    squared_error = (actual - predicted) ** 2
    score = np.sqrt(np.mean(weights * squared_error))
    return "rmspe", score


store = pd.read_csv('./data/store.csv')

# Load data.
# StateHoliday is read as text. ``str`` replaces the former ``pd.np.string_``:
# the ``pandas.np`` alias was deprecated in pandas 1.0 and removed in 2.0.
# NOTE(review): assumes utils.read_csv_files forwards ``dtype`` to
# pandas.read_csv -- confirm against the helper.
if sample:
    if gridsearch:
        train_org, test_org = utils.read_csv_files('./data/train.csv', './data/test.csv',
                                                   dtype={'StateHoliday': str})
    else:
        # Hold out part of the training file as a local test set.
        df = pd.read_csv('./data/train.csv', dtype={'StateHoliday': str})
        # Deterministic pseudo-random split: a row goes to the training part
        # when (15 * Sales + 3 * Customers) mod 17 is at most 13.
        df['is_train'] = (df['Sales'] * 15 + df['Customers'] * 3) % 17 <= 13
        # 'is_train' is a boolean column, so it can be used as a mask directly.
        train_org, test_org = df[df['is_train']], df[~df['is_train']]
else:
    # To run with real data
    train_org, test_org = utils.read_csv_files('./data/train.csv', './data/test.csv',
                                               dtype={'StateHoliday': str})

# Attach the per-store attributes to every row; the left join keeps all
# rows of the train/test frames.
train = pd.merge(train_org, store, on='Store', how='left')
test = pd.merge(test_org, store, on='Store', how='left')

# Date is split on '-'; the first piece is stored as a float 'year' and the
# second piece as 'month'.
train['year'] = train.Date.apply(lambda x: x.split('-')[0])
train['year'] = train['year'].astype(float)
train['month'] = train.Date.apply(lambda x: x.split('-')[1])