Code Example #1
File: stacking.py Project: godkillok/daguan
X_train, X_test, y_train, y_test = train_test_split(trn_term_doc,
                                                    y,
                                                    test_size=0.01,
                                                    random_state=111)
print('tttt')
# X_train=X_train.toarray()
# X_test=X_test.toarray()
print('to array')
# Create the dataset
dataset = Dataset(X_train, y_train, test_term_doc, use_cache=False)
# Create the NB and LR models

class_use_cache = False
model_nb = Classifier(dataset=dataset,
                      estimator=MultinomialNB,
                      name='nb',
                      use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={
                          'C': 4,
                          'dual': True,
                          'n_jobs': -1
                      },
                      name='lr',
                      use_cache=class_use_cache)
model_lr2 = Classifier(dataset=dataset,
                       estimator=LogisticRegression,
                       parameters={
                           'C': 4,
                           'multi_class': 'multinomial',
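The snippet above is cut off mid-definition. A minimal sketch of how such a heamy setup typically continues, assuming model_lr2 closes like model_lr (the stacking parameters below are illustrative, not from the original file):

# Hypothetical continuation: stack the base models with 5-fold CV, then
# fit a second-level logistic regression on the out-of-fold predictions.
pipeline = ModelsPipeline(model_nb, model_lr, model_lr2)
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, use_cache=False)
predictions = stacker.predict()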
Code Example #2
File: test_estimator.py Project: zwt233/heamy
def func_estimator(X_train, y_train, X_test, y_test):
    return np.zeros(X_test.shape[0])


def random_param():
    return random.randint(1, 100)


model_func = Regressor(estimator=func_estimator, dataset=TestDataset)
model_cls = TestEstimator(dataset=TestDataset())
model_param = Regressor(estimator=LinearRegression,
                        parameters={'random_param': random_param},
                        dataset=TestDataset)
model_param2 = Classifier(estimator=LogisticRegression,
                          parameters={'colsample_bylevel': 0.9},
                          dataset=TestDataset)


def test_hashing():
    assert str(
        model_func) == 'func_estimator(54743c7a5484d1bf2a64ac1d7b68f8cc)'
    assert str(model_cls) == 'TestEstimator(da29cb8766f96e6561a51e8e3c13f661)'
    assert str(
        model_param) == 'LinearRegression(2e789a766f6dc2457fb6a63452ad2859)'
    assert str(
        model_param2) == 'LogisticRegression(74efb248db47d168aed2fc37c0016e6f)'

    assert model_param2.hash == '74efb248db47d168aed2fc37c0016e6f'

    e_hash = TestEstimator(dataset=TestDataset()).hash
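    # Hypothetical completion (the original test is cut off here): heamy
    # derives the hash from the estimator and its dataset, so two
    # identically-constructed estimators should hash identically.
    assert e_hash == TestEstimator(dataset=TestDataset()).hash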
Code Example #3
lr_param = {
    'multi_class': 'multinomial',
    'solver': 'newton-cg',
    'random_state': 1
}

mlp_param = {
    'hidden_layer_sizes': (132, ),
    'activation': 'logistic',
    'max_iter': 500
}
#------------------------------------------------------------------------------
knn = Classifier(dataset=dataset,
                 estimator=KNeighborsClassifier,
                 use_cache=CACHE,
                 parameters=knn_param,
                 name='knn')
rf = Classifier(dataset=dataset,
                estimator=RandomForestClassifier,
                use_cache=CACHE,
                parameters=rf_param,
                name='rf')
et = Classifier(dataset=dataset,
                use_cache=CACHE,
                estimator=ExtraTreesClassifier,
                parameters=et_param,
                name='et')
lgb = Classifier(dataset=dataset,
                 estimator=LGBMClassifier,
                 use_cache=CACHE,
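                 # Plausible completion of this truncated call, by analogy
                 # with its complete one-line form in Code Example #5:
                 parameters=lgb_param,
                 name='lgb')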
Code Example #4
    train_xf = train_x[filtered_cols_t]
    # val_xr = val_x[filtered_cols]
    test_xf = test_x[filtered_cols_t]

    dataset_full = Dataset(train_x.astype(np.float64), train_yt, test_x.astype(np.float64))
    dataset_f = Dataset(train_xf.astype(np.float64), train_yt, test_xf.astype(np.float64))

    xgb_params = {
        'learning_rate': 0.05,
        'max_depth': 6,
        'n_estimators': 500,
        # 'num_class': 5,
        'objective': 'multi:softprob',
        'subsample': 0.8}

    model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier,
                           parameters=xgb_params, name='xgb')
    model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier,
                             parameters=xgb_params, name='xgb_f')
    model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier,
                          parameters={'n_estimators': 700}, name='rf')

    pipeline = ModelsPipeline(
        # model_xgb,
        model_rf,
        model_xgb_f)

    stack_ds = pipeline.stack(k=5, full_test=True, seed=111)
    stacker = Classifier(stack_ds, LogisticRegression)
    stacker.validate(k=5, scorer=log_loss)

    # logging.info(val_results)
    #
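    # Hypothetical final step (not part of the original snippet): use the
    # trained second-level model to predict test-set class probabilities.
    predictions = stacker.predict()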
Code Example #5
File: thirteenth_try.py Project: amaity/kaggle-
xg_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'multi:softprob',
    'num_class': 7,
    'max_depth': 4,
    'min_child_weight': 1,
    'eval_metric': 'mlogloss',
    'nrounds': 200
}

#------------------------------------------------------------------------------
knn = Classifier(dataset=dataset, estimator=KNeighborsClassifier, use_cache=CACHE, parameters=knn_param, name='knn')
rf = Classifier(dataset=dataset, estimator=RandomForestClassifier, use_cache=CACHE, parameters=rf_param, name='rf')
et = Classifier(dataset=dataset, estimator=ExtraTreesClassifier, use_cache=CACHE, parameters=et_param, name='et')
lgb = Classifier(dataset=dataset, estimator=LGBMClassifier, use_cache=CACHE, parameters=lgb_param, name='lgb')
lr = Classifier(dataset=dataset, estimator=LogisticRegression, use_cache=CACHE, parameters=lr_param, name='lr')
xgf = Classifier(dataset=dataset, estimator=XGBClassifier, use_cache=CACHE, parameters=xg_params, name='xgf')
#------------------------------------------------------------------------------
# Stack the models; returns a new dataset with out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)
print(stack_ds.X_train.shape, stack_ds.X_test.shape)
#------------------------------------------------------------------------------
dtrain = xgb.DMatrix(stack_ds.X_train, label=stack_ds.y_train)
dtest = xgb.DMatrix(stack_ds.X_test)

xgb_params = {
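    # Plausible completion (the snippet is cut off here; the values are
    # illustrative, echoing xg_params above):
    'objective': 'multi:softprob',
    'num_class': 7,
    'eta': 0.1,
    'max_depth': 4,
    'eval_metric': 'mlogloss',
}
# Hypothetical second-level step: train XGBoost on the stacked features.
bst = xgb.train(xgb_params, dtrain, num_boost_round=200)
predictions = bst.predict(dtest)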
Code Example #6
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from xgboost import XGBClassifier

# Create the dataset
dataset = Dataset(X_train, y_train, X_n)  # replace X_test with X_n when predicting on the unlabeled training set

model_xgb = Classifier(dataset=dataset,
                       estimator=XGBClassifier,
                       parameters={
                           'reg_alpha': 0.01,
                           'n_estimators': 100,
                           'objective': 'binary:logistic',
                           'seed': 32,
                           'gamma': 0.4,
                           'colsample_bytree': 0.75,
                           'subsample': 0.8,
                       },
                       name='xgb')

model_xgb2 = Classifier(dataset=dataset,
                        estimator=XGBClassifier,
                        parameters={
                            'seed': 128,
                            'gamma': 0.4,
                            'reg_alpha': 0.01,
                            'n_estimators': 100,
                            'objective': 'binary:logistic',
                            'colsample_bytree': 0.75,
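                            # Plausible completion mirroring model_xgb above:
                            'subsample': 0.8,
                        },
                        name='xgb2')

# Hypothetical continuation suggested by the imports at the top of this
# example: stack the two XGBoost models and validate with ROC AUC.
pipeline = ModelsPipeline(model_xgb, model_xgb2)
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, use_cache=False)
stacker.validate(k=5, scorer=roc_auc_score)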
Code Example #7
data_train = pd.read_csv('data_analysis/data_train.csv', encoding='gb2312')
targets = data_train['TARGET']
train_data = data_train.drop(labels=['TARGET'], axis=1)

data_test = pd.read_csv('data_analysis/data_test.csv', encoding='gb2312')

test_data = data_test.drop(labels=['FORTARGET', 'PROB'], axis=1)
# ------------------------------------------------------- Split the sample set -----------------------------------#
# train_x,test_x,train_y,test_y = train_test_split(train_data,targets,test_size=0.5,random_state=66)
# create dataset
# dataset = Dataset(train_data,targets,test_data)
dataset = Dataset(train_data,targets,test_data)
#xgb = XGBClassifier(n_estimators = 1350,scale_pos_weight=4,nthread=-1,seed=6,max_depth=3,min_child_weight=6,learning_rate=0.05,
#							gamma=0,subsample=0.9,colsample_bytree=0.9,reg_alpha=8)
#--------------------------------------------------------stacking model----------------------#
model_rf1 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'entropy',
                                   'min_samples_split': 15, 'n_jobs': -1},
                       name='rf1')
model_rf2 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'gini',
                                   'min_samples_split': 15, 'n_jobs': -1},
                       name='rf2')

model_gdbt1 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.06, 'random_state': 1},
                         name='gdbt1')
model_gdbt2 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07, 'random_state': 2},
                         name='gdbt2')
model_gdbt3 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'deviance', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07, 'random_state': 3},
                         name='gdbt3')
# XGBoost-specific parameters (nthread, min_child_weight, gamma, reg_alpha)
# require XGBClassifier as the estimator, not GradientBoostingClassifier.
model_xgbt = Classifier(dataset=dataset, estimator=XGBClassifier,
                        parameters={'n_estimators': 1350, 'nthread': -1, 'max_depth': 3,
                                    'min_child_weight': 6, 'learning_rate': 0.05, 'gamma': 0,
                                    'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 8},
                        name='xgbt')
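The models in this example are defined but never combined. A hypothetical continuation, following the stacking pattern used throughout these examples:

# Hypothetical continuation: stack the base models and fit a second-level
# logistic regression on the out-of-fold predictions.
pipeline = ModelsPipeline(model_rf1, model_rf2, model_gdbt1, model_gdbt2, model_gdbt3, model_xgbt)
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression)
predictions = stacker.predict()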
Code Example #8
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')
model_knn = Regressor(dataset=dataset,
                      estimator=KNeighborsRegressor,
                      parameters={'n_neighbors': 15},
                      name='knn')
model_lgt = Regressor(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={'penalty': 'l2'},
                      name='lgt')
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)

# Stack the models
# Returns a new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr, model_knn, xgbclf)
weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights)
stack_ds = pipeline.stack(k=10, seed=111)

# Then, train LinearRegression on the stacked data
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

results = stacker.validate(k=10, scorer=mean_absolute_error)
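The weighted blend computed above is never materialized. Assuming heamy's PipeApply interface (pipeline.weight returns an object with an execute method, as in the library's README), its test predictions could be obtained directly:

# Hypothetical: materialize the weighted-average test predictions.
blend_predictions = result.execute()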
Code Example #9
File: stacking.py Project: godkillok/heamy
                      smooth_idf=1,
                      sublinear_tf=1)
trn_term_doc = vec.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(trn_term_doc,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=111)
# X_train=X_train.toarray()
# X_test=X_test.toarray()
print(type(X_train))
# Create the dataset
dataset = Dataset(X_train, y_train, X_test, use_cache=False)
class_use_cache = False
# Create the base models
model_nb = Classifier(dataset=dataset,
                      estimator=MultinomialNB,
                      name='nb',
                      use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={
                          'C': 4,
                          'dual': True,
                          'n_jobs': -1
                      },
                      name='lr',
                      use_cache=class_use_cache)
model_svm = Classifier(dataset=dataset,
                       estimator=svm.SVC,
                       parameters={'probability': True},
                       name='svm',
                       use_cache=class_use_cache)
Code Example #10
from sklearn.model_selection import train_test_split

"""数据集设置"""
X_train = df_data.loc[df_data['sample']=='train', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)
X_test = df_data.loc[df_data['sample']=='test', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)

y_train = df_data.loc[df_data['sample']=='train', 'isDefault']
# Train/validation split
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)


from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)

from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_xgb, model_lgb)
pipeline

# Build the first-level features: k defaults to 5 (5-fold cross-validation); with full_test=True each base learner is also trained on the full training set and then used to predict the test set, yielding the new features
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)

from sklearn.linear_model import LogisticRegression
# Second level: stack with a logistic regression
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# Predictions for the test set
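# Hypothetical completion (the original snippet is cut off here):
test_pred = stacker.predict()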