コード例 #1
0
# Notebook-style XGBoost workflow: cross-validate, configure, evaluate,
# predict and save.
# NOTE(review): `sml` and `tuned_params` are not defined in this excerpt;
# they are assumed to come from earlier cells.
sml.xgb.cv(tuned_params)

# Show the last five rows of the cross-validation results.
sml.xgb.cv_results.tail(5)

# Use (number of rows in cv_results - 1) as n_estimators -- presumably one
# row per boosting round; TODO confirm -- then pass the parameters on.
tuned_params['n_estimators'] = sml.xgb.cv_results.shape[0] - 1
sml.xgb.params(tuned_params)

# Presumably builds the XGBoost classifier from the parameters set above.
sml.xgb.classifier()

# Evaluate the models and plot their ranks.
sml.model.evaluate()
sml.plot.model_ranks()

sml.model.ranks()


# Fit the XGBoost model and generate predictions.
sml.xgb.fit()
sml.xgb.predict()

sml.plot.xgb_importance()

sml.xgb.feature_selection()

sml.xgb.sample_accuracy()

# Write the uid and prediction columns to a CSV whose name embeds sml.slug().
sml.save_results(
    columns={ 'PassengerId': sml.uid,
             'Survived': sml.xgb.predictions }, 
    file_path='output/titanic-speedml-{}.csv'.format(sml.slug()))
# Bare expression: in a notebook this displays the slug value.
sml.slug()

コード例 #2
0
class TitanicML:
    """Titanic survival workflow driven through a Speedml session.

    ``run()`` executes three stages in order: ``data()`` (load the CSVs and
    engineer features), ``models()`` (tune, evaluate and predict with
    XGBoost) and ``results()`` (save the predictions).

    NOTE(review): ``Speedml`` and ``check_output`` are referenced but not
    defined in this class; they are assumed to be imported at file level.
    """

    def __init__(self):
        # Speedml session; stays None until setup_speedml() has run.
        self.sml = None

    def run(self):
        """Run the whole pipeline: data, then models, then results."""
        print("running")
        self.data()
        self.models()
        self.results()

    def data(self):
        """Create the Speedml session and prepare the datasets."""
        self.setup_speedml()
        self.prepare_data()

    def models(self):
        """Tune, evaluate and predict with the XGBoost model."""
        self.prepare_models()
        self.evaluate_models()
        self.predict_models()

    def results(self):
        """Save the predictions and list the input directory."""
        self.save_results()
        self.write_results()

    def prepare_data(self):
        """Run every feature-engineering step in its required order."""
        print("preparing data")
        self.strip_outliers()
        self.create_features()
        # Mapping must precede imputation: impute_ages() compares the
        # numeric Title codes produced by create_title().
        self.map_features()
        self.impute_features()
        self.feature_densities()
        self.drop_features()
        print("data prepared")

    def setup_speedml(self):
        """Create the Speedml session from the train and test CSV files."""
        print("Setting up Speedml")
        self.sml = Speedml('../input/train.csv', '../input/test.csv',
                           target='Survived', uid='PassengerId')

    ################ DATA PREPARATION  ################

    def strip_outliers(self):
        """Apply Speedml outlier handling to 'Fare' and 'SibSp'."""
        print("Stripping Outliers")
        self.sml.feature.outliers('Fare', upper=99)
        self.sml.feature.outliers('SibSp', upper=98)

    def create_features(self):
        """Derive the FamilySize, Title and Deck features."""
        self.create_family_size()
        self.create_title()
        self.create_deck()

    def map_features(self):
        """Map the categorical Sex and Embarked features to integers."""
        self.map_sex()
        self.map_embarked()

    def impute_features(self):
        """Fill missing ages per title, then impute whatever is left."""
        self.impute_ages()
        self.impute_values()

    def feature_densities(self):
        """Add density features (only Age is currently enabled)."""
        #self.create_ticket_density()
        self.create_age_density()
        #self.create_fare_density()

    def drop_features(self):
        """Drop the raw columns that are no longer needed."""
        self.drop_cabin()
        self.drop_ticket()
        #self.drop_fare()
        #self.drop_age()
        #self.drop_embarked()
        self.drop_sibsp()
        self.drop_parch()
        self.drop_name()

    ### CREATE FEATURES

    def create_family_size(self):
        """Create FamilySize as Parch + SibSp + 1."""
        print("Merge Parch and SibSp into FamilySize")
        self.sml.feature.sum(new='FamilySize', a='Parch', b='SibSp')
        self.sml.feature.add('FamilySize', 1)

    def create_title(self):
        """Extract Title from Name, merge rare titles and encode as ints.

        Titles absent from the mapping end up as 0 via the final fillna.
        """
        print("extract Title from Name")
        feature = self.sml.feature
        feature.extract(new='Title', a='Name', regex=r" ([A-Za-z]+)\.")
        feature.replace(a='Title', match=['Lady', 'Countess', 'Dona', 'Mme'], new='Mrs')
        feature.replace(a='Title', match=['Don', 'Sir', 'Jonkheer'], new='Mr')
        feature.replace(a='Title', match=['Capt', 'Col', 'Dr', 'Major', 'Rev'], new='Crew')
        feature.replace(a='Title', match=['Mlle', 'Ms'], new='Miss')
        feature.mapping('Title', {'Miss': 1, 'Master': 2, 'Mrs': 3, 'Mr': 4, 'Crew': 5})
        feature.fillna(a='Title', new=0)

    def create_deck(self):
        """Create Deck from the first capital letter found in Cabin.

        Missing cabins are filled with 'Z' first, so they yield deck 'Z'.
        """
        print("create deck")
        self.sml.feature.fillna(a='Cabin', new='Z')
        self.sml.feature.extract(new='Deck', a='Cabin', regex='([A-Z]){1}')
        self.sml.feature.labels(['Deck', 'Cabin'])
        ## TODO ^^ figure out Deck from TicketPrice, Cabin, and PClass

    ### MAP FEATURES

    def map_sex(self):
        """Encode Sex as male=0, female=1."""
        print("map sex")
        self.sml.feature.mapping('Sex', {'male': 0, 'female': 1})

    def map_embarked(self):
        """Encode Embarked as S=0, C=1, Q=2, with missing values as 3."""
        print("map embarked")
        self.sml.feature.fillna(a='Embarked', new='Z')
        self.sml.feature.mapping('Embarked', {'S': 0, 'C': 1, 'Q': 2, 'Z': 3})

    ### IMPUTE FEATURES

    def impute_ages(self):
        """Replace missing ages with the median age of the same title.

        Missing ages are first set to the sentinel 0. Then, separately for
        the train and test frames and for each title code 1-5, every
        sentinel age is overwritten with the median of the known ages for
        that title. A title with no known ages in a frame is left alone.
        """
        print("Impute ages")
        self.sml.feature.fillna('Age', 0)
        for df in (self.sml.train, self.sml.test):
            for title in range(1, 6):
                has_title = df['Title'] == title
                missing = has_title & (df['Age'] == 0)
                known_ages = df.loc[has_title & (df['Age'] != 0), 'Age']
                if known_ages.empty or not missing.any():
                    continue
                # Write through .loc on the frame itself; assigning to a
                # boolean-filtered copy would leave the frame unchanged.
                df.loc[missing, 'Age'] = known_ages.median()

    def impute_values(self):
        """Let Speedml impute any remaining empty fields."""
        print("Impute remaining empty fields")
        self.sml.feature.impute()

    ### FEATURE DENSITIES

    def create_ticket_density(self):
        """Add a density feature for Ticket (currently not called)."""
        print("create ticket density")
        self.sml.feature.density('Ticket')
        ## TODO: ^^ let's figure out Deck using this and PClass

    def create_age_density(self):
        """Add a density feature for Age."""
        print("FOR NOW add Age densities")
        self.sml.feature.density(['Age'])

    def create_fare_density(self):
        """Add a density feature for Fare (currently not called)."""
        print("FOR NOW add Fare densities")
        self.sml.feature.density(['Fare'])

    ### DROP FEATURES

    def drop_ticket(self):
        """Drop the Ticket column."""
        print("drop ticket")
        self.sml.feature.drop('Ticket')

    def drop_cabin(self):
        """Drop the Cabin column."""
        print("drop cabin")
        self.sml.feature.drop('Cabin')

    def drop_fare(self):
        """Drop the Fare column (currently not called)."""
        print("drop fare")
        self.sml.feature.drop('Fare')

    def drop_age(self):
        """Drop the Age column (currently not called)."""
        print("drop age")
        self.sml.feature.drop('Age')

    def drop_embarked(self):
        """Drop the Embarked column (currently not called)."""
        print("drop embarked")
        self.sml.feature.drop('Embarked')

    def drop_parch(self):
        """Drop the Parch column."""
        print("drop parch")
        self.sml.feature.drop('Parch')

    def drop_sibsp(self):
        """Drop the SibSp column."""
        print("drop sibsp")
        self.sml.feature.drop("SibSp")

    def drop_name(self):
        """Drop the Name column."""
        print("drop name")
        self.sml.feature.drop("Name")

    ################ MODEL PREPARATION  ################

    def prepare_models(self):
        """Prepare the model data and tune the XGBoost parameters."""
        print("prepare models")
        self.sml.model.data()
        self.set_model_parameters()

    def set_model_parameters(self):
        """Run both tuning passes and apply the combined result."""
        print("set model parameters")
        ret1 = self.refine_max_depth_and_min_child_weight()
        ret2 = self.refine_learning_rate_and_subsample(ret1)
        self.assign_tuned_variables(ret1, ret2)

    def refine_max_depth_and_min_child_weight(self):
        """Search max_depth and min_child_weight with other params fixed.

        Returns:
            The first entry of the ``'params'`` column returned by
            ``sml.xgb.hyper``.
        """
        print("refine max depth and min child weight")
        select_params = {'max_depth': list(range(3, 9)),
                         'min_child_weight': list(range(1, 7))}
        fixed_params = {'learning_rate': 0.1, 'subsample': 0.8,
                        'colsample_bytree': 0.8, 'seed': 0,
                        'objective': 'binary:logistic'}
        ret = self.sml.xgb.hyper(select_params, fixed_params)
        # NOTE(review): assumes entry 0 is the best-ranked combination --
        # confirm against the ordering/indexing of hyper()'s return value.
        return ret['params'][0]

    def refine_learning_rate_and_subsample(self, results):
        """Search learning_rate and subsample with the tuned tree shape.

        Args:
            results: Mapping holding the chosen ``'max_depth'`` and
                ``'min_child_weight'`` values.

        Returns:
            The first entry of the ``'params'`` column returned by
            ``sml.xgb.hyper``.
        """
        print("refine learning rate and subsamples with max_depth")
        learning_rate_range = [0.3, 0.2, 0.1, 0.05, 0.01]
        # subsample is a numeric ratio, so search over floats (0.6-0.8)
        # rather than their string representations.
        subsample_range = [tenths / 10.0 for tenths in range(6, 9)]
        select_params = {'learning_rate': learning_rate_range,
                         'subsample': subsample_range}
        fixed_params = {'max_depth': results['max_depth'],
                        'min_child_weight': results['min_child_weight'],
                        'colsample_bytree': 0.8, 'seed': 0,
                        'objective': 'binary:logistic'}
        ret = self.sml.xgb.hyper(select_params, fixed_params)
        # NOTE(review): assumes entry 0 is the best-ranked combination --
        # confirm against the ordering/indexing of hyper()'s return value.
        return ret['params'][0]

    def assign_tuned_variables(self, ret1, ret2):
        """Print the tuned parameters and assign them to XGBoost.

        Args:
            ret1: Mapping with ``'max_depth'`` and ``'min_child_weight'``.
            ret2: Mapping with ``'learning_rate'`` and ``'subsample'``.
        """
        print("assign tuned variables")
        print("learning_rate: " + str(ret2['learning_rate']))
        print("subsample: " + str(ret2['subsample']))
        print("max_depth: " + str(ret1['max_depth']))
        print("min_child_weight: " + str(ret1['min_child_weight']))
        tuned_params = {'learning_rate': ret2['learning_rate'],
                        'subsample': ret2['subsample'],
                        'max_depth': ret1['max_depth'],
                        'min_child_weight': ret1['min_child_weight'],
                        'seed': 0, 'colsample_bytree': 0.8,
                        'objective': 'binary:logistic'}
        self.sml.xgb.cv(tuned_params)
        # n_estimators is the number of rows in cv_results minus one
        # (presumably one row per boosting round -- TODO confirm).
        tuned_params['n_estimators'] = self.sml.xgb.cv_results.shape[0] - 1
        self.sml.xgb.params(tuned_params)

    def evaluate_models(self):
        """Build the XGBoost classifier, then evaluate and rank the models."""
        print("Show best models")
        self.sml.xgb.classifier()
        self.sml.model.evaluate()
        self.sml.plot.model_ranks()
        self.sml.model.ranks()

    def predict_models(self):
        """Fit XGBoost, predict, and run feature selection and accuracy."""
        print("predict and get accuracy")
        self.sml.xgb.fit()
        self.sml.xgb.predict()
        self.sml.xgb.feature_selection()
        self.sml.xgb.sample_accuracy()

    ################ RESULTS  ################

    def save_results(self):
        """Save the uid and prediction columns to a slug-named CSV."""
        print("save results when happy")
        self.sml.save_results(
            columns={'PassengerId': self.sml.uid,
                     'Survived': self.sml.xgb.predictions},
            file_path='output/titanic-speedml-{}.csv'.format(self.sml.slug()))
        # Return value is discarded here; kept from the notebook version,
        # where the bare call displayed the slug.
        self.sml.slug()

    def write_results(self):
        """Print a listing of the ``../input`` directory."""
        print(check_output(["ls", "../input"]).decode("utf8"))