Example #1
import numpy as np
from catboost import CatBoostClassifier
from catboost.datasets import titanic


def test_titanic():
    # Load the training split of the Titanic dataset and fill missing values
    train_df = titanic()[0].fillna(-999)
    X, y = train_df.drop('Survived', axis=1), train_df.Survived
    # Treat every non-float column as a categorical feature
    categorical_features_indices = np.where(X.dtypes != float)[0]

    model = CatBoostClassifier(iterations=5)
    model.fit(X, y, cat_features=categorical_features_indices)
    preds = model.predict(X)
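An illustrative follow-up that could be appended inside test_titanic() to check the in-sample predictions; accuracy_score comes from scikit-learn and is not part of the original snippet:

    from sklearn.metrics import accuracy_score

    # In-sample accuracy only; an illustrative check, not in the original test
    print(accuracy_score(y, preds))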
Example #2
import numpy as np
from catboost.datasets import titanic
from sklearn.model_selection import train_test_split


def build_titanic_dataset():
    df_train, df_test = titanic()
    df_train.fillna(-999, inplace=True)
    df_test.fillna(-999, inplace=True)
    X = df_train.drop('Survived', axis=1)  # X: training features
    y = df_train.Survived  # y: training labels

    # Columns whose dtype is not float are treated as categorical features
    cate_feat_idx = np.where(X.dtypes != float)[0]
    x_train, x_vali, y_train, y_vali = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=42)
    x_test = df_test
    return X, y, x_train, x_vali, y_train, y_vali, x_test, cate_feat_idx
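A minimal usage sketch for the helper above; the CatBoostClassifier settings are illustrative assumptions, not taken from the original example:

from catboost import CatBoostClassifier

X, y, x_train, x_vali, y_train, y_vali, x_test, cate_feat_idx = build_titanic_dataset()

# Hypothetical hyper-parameters chosen only for illustration
model = CatBoostClassifier(iterations=200, verbose=False)
model.fit(x_train, y_train,
          cat_features=cate_feat_idx,
          eval_set=(x_vali, y_vali))
test_preds = model.predict(x_test)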
Example #3
from catboost import CatBoostClassifier
from catboost.datasets import titanic


def main(**args):
    titanic_train, _ = titanic()
    titanic_train.fillna(-999, inplace=True)
    cols = [
        'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
        'Cabin', 'Embarked'
    ]
    # Train on the first 10% of the rows and evaluate on the remaining 90%
    train_sz = int(titanic_train.shape[0] * 0.1)
    x_train = titanic_train[:train_sz][cols]
    y_train = titanic_train[:train_sz]['Survived'].astype(int)
    x_test = titanic_train[train_sz:][cols]
    y_test = titanic_train[train_sz:]['Survived'].astype(int)
    try:
        model = CatBoostClassifier(random_seed=42, **args)
        model.fit(x_train, y_train, cat_features=[1, 2, 6, 8, 9], silent=True)
        accuracy = model.score(x_test, y_test)
        # Print the negated accuracy so an external optimiser can minimise it
        print(-accuracy)
    except Exception:
        print(0)
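The negated accuracy suggests main() is meant to serve as the objective of an external hyper-parameter search that minimises its printed output. A minimal direct call, with hypothetical hyper-parameter values:

if __name__ == '__main__':
    # Hypothetical hyper-parameters; any CatBoostClassifier keyword is accepted
    main(iterations=100, depth=4, learning_rate=0.1)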
Example #4
def load_test(self):
    # Load only the test split and mark it with a placeholder target value
    _, self.df_test = titanic()
    self.df_test["Survived"] = -1
Example #5
def load_train(self):
    self.df_train, _ = titanic()
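Examples #4 and #5 are methods shown without their class. A minimal sketch of a hypothetical loader class that could host them (the class name is an assumption):

from catboost.datasets import titanic


class TitanicData:
    """Hypothetical container for the Titanic train/test splits."""

    def load_train(self):
        self.df_train, _ = titanic()

    def load_test(self):
        _, self.df_test = titanic()
        self.df_test["Survived"] = -1  # placeholder label for the unlabelled split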
Example #6
from pagi.models.binary_classifiers import get_CatBoostClassifier

if __name__ == "__main__":
    from catboost.datasets import titanic
    import numpy as np

    train_df, test_df = titanic()

    train_df.head()
    train_df.fillna(-999, inplace=True)
    test_df.fillna(-999, inplace=True)
    X = train_df.drop('Survived', axis=1)
    y = train_df.Survived
    print(X.dtypes)

    categorical_features_indices = np.where(X.dtypes != float)[0]
    from sklearn.model_selection import train_test_split

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.75, random_state=42)

    X_test = test_df

    model = get_CatBoostClassifier()

    model.fit(X_train,
              y_train,
              cat_features=categorical_features_indices,
              eval_set=(X_validation, y_validation),
              plot=True)
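get_CatBoostClassifier comes from the external pagi package, so its return type is an assumption here. Assuming it behaves like a regular CatBoostClassifier, the fitted model can then score the unlabelled Kaggle test split:

    # Assumes the fitted model exposes the usual predict() interface
    predictions = model.predict(X_test)
    print(predictions[:10])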
Example #7
"""
Objective: CatBoost algorithm implementation
Author: Samuel Adebayo
Blog: https://dataaspirant.com
Date: 2021-01-02
===============================================
"""

## import the libraries needed
import pandas as pd
import numpy as np

# Here we import our dataset from the CatBoost dataset library
from catboost.datasets import titanic

## The titanic() loader returns both the train and the test set, so we unpack them separately
titanic_train, titanic_test = titanic()

## Here we create a list to sort the columns so that the "Survived" column comes last
## This is because "Survived" is the target
column_sort = [
    'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
    'Cabin', 'Embarked', 'Survived'
]

## Now we apply the sorted columns to the train data
train = titanic_train[column_sort]
train.set_index('Pclass')  ## Not necessary, just to get rid of the default index

test = titanic_test
train.head()
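The snippet stops before any model is trained. A minimal continuation sketch, assuming the usual CatBoost Titanic recipe of filling missing values and marking non-float columns as categorical; the iteration count is an illustrative choice:

from catboost import CatBoostClassifier

train = train.fillna(-999)
X = train.drop('Survived', axis=1)
y = train['Survived']
cat_idx = np.where(X.dtypes != float)[0]

model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X, y, cat_features=cat_idx)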
Example #8
def _data_loading(self):
    # The data for this tutorial can be obtained from [this page](https://www.kaggle.com/c/titanic/data)
    self.train_df, self.test_df = titanic()
    print(self.train_df.head())
Example #9
# coding=utf-8
import json
import os

import numpy as np

from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split

from catboost.datasets import titanic

# get training data
train, test = titanic()

# replace missing values (NaNs) with a placeholder
train, test = train.fillna(-999), test.fillna(-999)

# separate features from the target, then split into train and test sets
X, y = train.drop(['PassengerId', 'Survived'], axis=1), train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# define categorical features
cat_indices = np.where(X_train.dtypes != float)[0]

# define model params
params = {
    'iterations': 1000,
    'depth': 2,
    'loss_function': 'Logloss',
    'eval_metric': 'F1',
    'use_best_model': True,