Ejemplo n.º 1
0
    def build(self):
        """Build target-encoded features for the train and test sets.

        ``squish`` is called for its side effect on the combined frame
        (presumably it adds the binned ``<name>S`` column -- confirm against
        ``squish``); the original columns are then dropped so only the
        derived columns remain.  Every derived column is replaced by the
        value looked up in the mapping returned by ``wa_dict`` (presumably a
        smoothed per-category mean of ``cardio`` -- confirm against
        ``wa_dict``).  Values missing from the mapping fall back to the
        global mean of ``y``.

        Test rows are encoded with a mapping fitted on the whole training
        set.  Training rows are encoded out-of-fold using 10 shuffled folds
        (``random_state=1``).

        Returns:
            A ``(train, test, None)`` tuple; both frames are ``float32``.
        """
        train, y, test, _ = data.get()
        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns

        for c in ['ap_hi', 'ap_lo', 'height', 'weight']:
            squish(df, c, c + 'S', 10)
        for c in ['gluc', 'cholesterol']:
            squish(df, c, c + 'S', 1)

        # Keep only the columns added above.
        df = df.drop(to_drop, axis=1)
        train = df[:ntrain]
        test = df[ntrain:].copy()
        # Attach the target so wa_dict can read the 'cardio' column.
        train = pd.concat([train, y], axis=1)

        global_avg = y.mean()
        feature_cols = list(test.columns)

        # Test set: mapping fitted on the full training data.
        for c in feature_cols:
            d = wa_dict(train, c, 'cardio', global_avg, 10)
            test[c] = test[c].map(d).astype('float32')
        test.fillna(global_avg, inplace=True)

        # Training set: out-of-fold encoding.  Results go into a separate
        # frame so the raw columns stay untouched; overwriting them in place
        # would make later folds fit on already-encoded values.
        encoded = pd.DataFrame(
            index=train.index, columns=feature_cols, dtype='float64')
        kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
        for itrain, iset in kf.split(train, y):
            # KFold yields positional indices, hence .iloc (the removed .ix
            # indexer treated integers as labels on an integer index).
            fit_rows = train.iloc[itrain]
            held_out = train.iloc[iset]
            for j, c in enumerate(feature_cols):
                d = wa_dict(fit_rows, c, 'cardio', global_avg, 10)
                encoded.iloc[iset, j] = held_out[c].map(d).values
        encoded.fillna(global_avg, inplace=True)

        return encoded.astype('float32'), test.astype('float32'), None
Ejemplo n.º 2
0
    def build(self):
        """Build derived body-mass and blood-pressure features.

        Adds the following columns to the concatenated train/test frame and
        then drops every original column:

        * ``bwi``   -- ``weight / (height / 100) ** 2``
        * ``ap_p``  -- ``ap_hi - ap_lo``
        * ``ap_m4``, ``ap_m3``, ``ap_m2`` -- weighted means of ``ap_hi`` and
          ``ap_lo`` with weights 1:3, 1:2 and 1:1
        * ``ap_hi_e``, ``ap_lo_e`` -- linear functions of ``age / 365.25``
          and ``weight`` whose coefficients depend on whether
          ``gender == 2`` (treated as "male" by this code)

        Returns:
            A ``(train_features, test_features, None)`` tuple, split at the
            original length of the training frame.
        """
        train, _, test, _ = data.get()
        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns

        h = df.height / 100
        df['bwi'] = df['weight'] / (h * h)
        df['ap_p'] = df['ap_hi'] - df['ap_lo']
        df['ap_m4'] = (df['ap_hi'] + 3 * df['ap_lo']) / 4
        df['ap_m3'] = (df['ap_hi'] + 2 * df['ap_lo']) / 3
        df['ap_m2'] = (df['ap_hi'] + df['ap_lo']) / 2

        # .loc replaces the removed .ix indexer; for a boolean row mask plus
        # a column label the two behave the same.
        male = df['gender'] == 2
        not_male = ~male
        df.loc[male, 'ap_hi_e'] = 109 + 0.5 * df.loc[
            male, 'age'] / 365.25 + 0.1 * df.loc[male, 'weight']
        df.loc[male, 'ap_lo_e'] = 74 + 0.1 * df.loc[
            male, 'age'] / 365.25 + 0.15 * df.loc[male, 'weight']
        df.loc[not_male, 'ap_hi_e'] = 102 + 0.7 * df.loc[
            not_male, 'age'] / 365.25 + 0.15 * df.loc[not_male, 'weight']
        df.loc[not_male, 'ap_lo_e'] = 78 + 0.17 * df.loc[
            not_male, 'age'] / 365.25 + 0.1 * df.loc[not_male, 'weight']

        # Keep only the derived columns.
        df = df.drop(to_drop, axis=1)
        ftrain = df[:ntrain]
        ftest = df[ntrain:]
        return ftrain, ftest, None
Ejemplo n.º 3
0
    def build(self):
        """Build derived body-mass and blood-pressure features, with residuals.

        Adds the following columns to the concatenated train/test frame and
        then drops every original column:

        * ``bwi``, ``ap_p``, ``ap_m4``, ``ap_m3``, ``ap_m2`` -- ratio,
          difference and weighted means of the raw measurements
        * ``ap_hi_e``, ``ap_lo_e`` -- linear functions of ``age / 365.25``
          and ``weight`` whose coefficients depend on whether
          ``gender == 2`` (treated as "male" by this code)
        * ``weight_ah``, ``weight_al`` -- the same linear relations solved
          for weight, given the measured ``ap_hi`` / ``ap_lo``
        * ``ap_hi_ed``, ``ap_lo_ed``, ``dw_ah``, ``dw_al`` -- measured value
          minus the corresponding computed value

        Returns:
            A ``(train_features, test_features, None)`` tuple, split at the
            original length of the training frame.
        """
        train, _, test, _ = data.get()
        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns

        h = df.height / 100
        df['bwi'] = df['weight'] / (h * h)
        df['ap_p'] = df['ap_hi'] - df['ap_lo']
        df['ap_m4'] = (df['ap_hi'] + 3 * df['ap_lo']) / 4
        df['ap_m3'] = (df['ap_hi'] + 2 * df['ap_lo']) / 3
        df['ap_m2'] = (df['ap_hi'] + df['ap_lo']) / 2

        # .loc replaces the removed .ix indexer; for a boolean row mask plus
        # a column label the two behave the same.
        male = df['gender'] == 2
        not_male = ~male

        # Pressure computed from age and weight.
        df.loc[male, 'ap_hi_e'] = 109 + 0.5 * df.loc[male, 'age'] / 365.25 + 0.1 * df.loc[male, 'weight']
        df.loc[male, 'ap_lo_e'] = 74 + 0.1 * df.loc[male, 'age'] / 365.25 + 0.15 * df.loc[male, 'weight']
        df.loc[not_male, 'ap_hi_e'] = 102 + 0.7 * df.loc[not_male, 'age'] / 365.25 + 0.15 * df.loc[not_male, 'weight']
        df.loc[not_male, 'ap_lo_e'] = 78 + 0.17 * df.loc[not_male, 'age'] / 365.25 + 0.1 * df.loc[not_male, 'weight']

        # The same relations inverted: weight implied by measured pressure.
        df.loc[male, 'weight_ah'] = (df.loc[male, 'ap_hi'] - 109 - 0.5 * df.loc[male, 'age'] / 365.25) / 0.1
        df.loc[male, 'weight_al'] = (df.loc[male, 'ap_lo'] - 74 - 0.1 * df.loc[male, 'age'] / 365.25) / 0.15
        df.loc[not_male, 'weight_ah'] = (df.loc[not_male, 'ap_hi'] - 102 - 0.7 * df.loc[not_male, 'age'] / 365.25) / 0.15
        df.loc[not_male, 'weight_al'] = (df.loc[not_male, 'ap_lo'] - 78 - 0.17 * df.loc[not_male, 'age'] / 365.25) / 0.1

        # Residuals: measured minus computed.
        df['ap_hi_ed'] = df['ap_hi'] - df['ap_hi_e']
        df['ap_lo_ed'] = df['ap_lo'] - df['ap_lo_e']
        df['dw_ah'] = df['weight'] - df['weight_ah']
        df['dw_al'] = df['weight'] - df['weight_al']

        # Keep only the derived columns.
        df = df.drop(to_drop, axis=1)
        ftrain = df[:ntrain]
        ftest = df[ntrain:]
        return ftrain, ftest, None
Ejemplo n.º 4
0
def predict():
    """Return model predictions, using the cached result when one exists.

    Looks up the ``'model'`` entry via ``state.load``.  When nothing is
    cached, loads the data, prepares two frames keyed by ``id`` (``v`` for
    the training rows, initialised with ``y``; ``z`` for the test rows,
    initialised with 0), passes them to ``run`` and caches the outcome with
    ``state.save``.

    Returns:
        A ``(v, z, cv, extra)`` tuple.

    NOTE(review): on a fresh run ``extra`` is the second value returned by
    ``run``, but ``None`` is what gets cached, so a cached call returns
    ``None`` in that slot.  Behaviour kept as-is; confirm which is intended.
    """
    saved = state.load('model')
    # Identity check: `== None` can be overridden by the stored object.
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, extra = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, extra = saved
    return v, z, cv, extra
Ejemplo n.º 5
0
def predict():
    """Return model predictions, using the cached result when one exists.

    Looks up the ``'model'`` entry via ``state.load``.  When nothing is
    cached, loads the data, prepares two frames keyed by ``id`` (``v`` for
    the training rows, initialised with ``y``; ``z`` for the test rows,
    initialised with 0), passes them to ``run`` and caches the outcome with
    ``state.save``.

    Returns:
        A ``(v, z, cv, extra)`` tuple.

    NOTE(review): on a fresh run ``extra`` is the second value returned by
    ``run``, but ``None`` is what gets cached, so a cached call returns
    ``None`` in that slot.  Behaviour kept as-is; confirm which is intended.
    """
    saved = state.load('model')
    # Identity check: `== None` can be overridden by the stored object.
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, extra = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, extra = saved
    return v, z, cv, extra
Ejemplo n.º 6
0
    def build(self):
        """Build per-character features from the text form of numeric columns.

        For each of ``height``, ``weight``, ``ap_hi`` and ``ap_lo`` the value
        is converted with ``str``.  For every position ``n`` up to the
        longest string in the column, two columns are added:

        * ``ft_l_<col>_<n>`` -- code point of the n-th character counted
          from the left (0-based)
        * ``ft_r_<col>_<n>`` -- code point of the n-th character counted
          from the right (0-based, so ``n == 0`` is the last character)

        Positions beyond the end of a string yield ``-1``.  The original
        columns are dropped, and the result is also stored on
        ``self.train_`` / ``self.test_``.

        Returns:
            A ``(train_features, test_features, None)`` tuple.
        """
        train, _, test, _ = data.get()
        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns
        for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
            tc = df[sc].apply(str)
            maxc = tc.apply(len).max()
            for n in range(maxc):
                df['ft_l_' + sc + '_' + str(n)] = tc.apply(
                    lambda s: ord(s[n]) if n < len(s) else -1)
                # s[-1 - n], not s[-n]: s[-0] is s[0], which would make the
                # right-hand feature at n == 0 a copy of the left-hand one.
                df['ft_r_' + sc + '_' + str(n)] = tc.apply(
                    lambda s: ord(s[-1 - n]) if n < len(s) else -1)

        # Keep only the derived columns.
        df = df.drop(to_drop, axis=1)
        self.train_ = df[:ntrain]
        self.test_ = df[ntrain:]
        return self.train_, self.test_, None
Ejemplo n.º 7
0
    def build(self):
        """Build one-hot encoded k-means cluster membership features.

        Runs ``cluster.KMeans`` with 2, 5, 10, 15 and 25 clusters on the
        concatenated train/test frame, stores each labelling in a
        ``kmeans_<n>`` column, one-hot encodes those columns with
        ``pd.get_dummies`` and drops the original columns.

        Returns:
            A ``(train_features, test_features, None)`` tuple; both frames
            are cast to ``int32``.
        """
        train, _, test, _ = data.get()

        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns

        # Snapshot of the original columns, so every clustering sees the same
        # inputs rather than the labels produced by earlier iterations.
        features = df.copy()

        dcn = []
        for n in [2, 5, 10, 15, 25]:
            cname = 'kmeans_' + str(n)
            dcn.append(cname)
            # Fixed seed so the generated features are reproducible.
            df[cname] = cluster.KMeans(
                n_clusters=n, random_state=1).fit_predict(features)

        df = pd.get_dummies(df, columns=dcn)

        # Keep only the one-hot cluster columns.
        df = df.drop(to_drop, axis=1)
        train = df[:ntrain]
        test = df[ntrain:].copy()

        return train.astype('int32'), test.astype('int32'), None
Ejemplo n.º 8
0
    def build(self):
        """Build one-hot encoded per-character features of numeric columns.

        For each of ``height``, ``weight``, ``ap_hi`` and ``ap_lo`` the value
        is converted with ``str``.  For every position ``n`` up to the
        longest string in the column, two columns are added:

        * ``ft_l_<col>_<n>`` -- code point of the n-th character counted
          from the left (0-based)
        * ``ft_r_<col>_<n>`` -- code point of the n-th character counted
          from the right (0-based, so ``n == 0`` is the last character)

        Positions beyond the end of a string yield ``-1``.  All of these
        columns are then one-hot encoded with ``pd.get_dummies`` and the
        original columns are dropped.  The result is also stored on
        ``self.train_`` / ``self.test_``.

        Returns:
            A ``(train_features, test_features, None)`` tuple.
        """
        train, _, test, _ = data.get()
        cset = []
        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns
        for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
            tc = df[sc].apply(str)
            maxc = tc.apply(len).max()
            for n in range(maxc):
                left_name = 'ft_l_' + sc + '_' + str(n)
                right_name = 'ft_r_' + sc + '_' + str(n)
                df[left_name] = tc.apply(
                    lambda s: ord(s[n]) if n < len(s) else -1)
                # s[-1 - n], not s[-n]: s[-0] is s[0], which would make the
                # right-hand feature at n == 0 a copy of the left-hand one.
                df[right_name] = tc.apply(
                    lambda s: ord(s[-1 - n]) if n < len(s) else -1)
                cset.append(left_name)
                cset.append(right_name)

        # One-hot encode the character codes, then keep only derived columns.
        df = pd.get_dummies(df, columns=cset).drop(to_drop, axis=1)
        self.train_ = df[:ntrain]
        self.test_ = df[ntrain:]
        return self.train_, self.test_, None
Ejemplo n.º 9
0
    def build(self):
        """Restore non-binary ``smoke`` values in the test set with XGBoost.

        Test rows whose ``smoke`` value lies strictly between 0 and 1 are
        treated as values to restore.  A binary classifier is trained on the
        training rows plus the remaining test rows, with ``smoke`` as the
        target and all other columns as features.  The number of boosting
        rounds is the number of rows in the ``xgb.cv`` result (10-fold,
        stratified, early stopping after 50 rounds).  Predictions replace
        the values to restore, and the whole ``smoke`` column is then
        thresholded at 0.5 into 0/1.

        The ``test`` frame obtained from ``data_src.get()`` is modified in
        place.

        Returns:
            A ``(train, y, test, None)`` tuple.
        """
        train, y, test, _ = data_src.get()
        # NOTE(review): recent xgboost releases replaced `silent` with
        # `verbosity` -- confirm against the installed version.
        xgb_params = dict(
            max_depth=5,
            learning_rate=0.005,
            subsample=0.7,
            gamma=5,
            alpha=0.01,
            #colsample_bytree = 0.8,
            objective='binary:logistic',
            eval_metric='logloss',
            seed=1,
            silent=1)

        # Rows whose smoke value is neither 0 nor 1.
        idx = (test.smoke > 0).values & (test.smoke < 1).values
        print('values to restore:', np.sum(idx))
        if not idx.any():
            # Nothing to predict: skip training instead of handing xgboost
            # an empty prediction matrix.
            test['smoke'] = (test['smoke'] > 0.5) * 1
            return train, y, test, None

        xtrain = pd.concat([train, test[~idx]])
        ytrain = xtrain['smoke']
        xtrain.drop('smoke', axis=1, inplace=True)
        print(xtrain.shape, ytrain.shape, test[idx].shape)

        dtrain = xgb.DMatrix(xtrain.values, ytrain.values)
        dpred = xgb.DMatrix(test[idx].drop('smoke', axis=1).values)

        cv = xgb.cv(params=xgb_params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=1,
                    metrics='error',
                    stratified=True)
        print('smoke num_boost_rounds =', len(cv))
        bst = xgb.train(params=xgb_params,
                        dtrain=dtrain,
                        num_boost_round=len(cv))
        # .loc replaces the removed .ix indexer (boolean mask + label).
        test.loc[idx, 'smoke'] = bst.predict(dpred)
        test['smoke'] = (test['smoke'] > 0.5) * 1
        return train, y, test, None
Ejemplo n.º 10
0
    def build(self):
        """Restore non-binary ``smoke`` values in the test set with XGBoost.

        Test rows whose ``smoke`` value lies strictly between 0 and 1 are
        treated as values to restore.  A binary classifier is trained on the
        training rows plus the remaining test rows, with ``smoke`` as the
        target and all other columns as features.  The number of boosting
        rounds is the number of rows in the ``xgb.cv`` result (10-fold,
        stratified, early stopping after 50 rounds).  Predictions replace
        the values to restore, and the whole ``smoke`` column is then
        thresholded at 0.5 into 0/1.

        The ``test`` frame obtained from ``data_src.get()`` is modified in
        place.

        Returns:
            A ``(train, y, test, None)`` tuple.
        """
        train, y, test, _ = data_src.get()
        # NOTE(review): recent xgboost releases replaced `silent` with
        # `verbosity` -- confirm against the installed version.
        xgb_params = dict(
                max_depth = 5,
                learning_rate = 0.005,
                subsample = 0.7,
                gamma = 5,
                alpha = 0.01,
                #colsample_bytree = 0.8,
                objective = 'binary:logistic',
                eval_metric = 'logloss',
                seed = 1,
                silent = 1
            )

        # Rows whose smoke value is neither 0 nor 1.
        idx = (test.smoke > 0).values & (test.smoke < 1).values
        print('values to restore:', np.sum(idx))
        if not idx.any():
            # Nothing to predict: skip training instead of handing xgboost
            # an empty prediction matrix.
            test['smoke'] = (test['smoke'] > 0.5) * 1
            return train, y, test, None

        xtrain = pd.concat([train, test[~idx]])
        ytrain = xtrain['smoke']
        xtrain.drop('smoke', axis=1, inplace=True)
        print(xtrain.shape, ytrain.shape, test[idx].shape)

        dtrain = xgb.DMatrix(xtrain.values, ytrain.values)
        dpred = xgb.DMatrix(test[idx].drop('smoke', axis=1).values)

        cv = xgb.cv(params=xgb_params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=1,
                    metrics='error',
                    stratified=True)
        print('smoke num_boost_rounds =', len(cv))
        bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
        # .loc replaces the removed .ix indexer (boolean mask + label).
        test.loc[idx, 'smoke'] = bst.predict(dpred)
        test['smoke'] = (test['smoke'] > 0.5) * 1
        return train, y, test, None