Example #1
def test_properties():
    d = Dataset(X_train, y_train, X_test, y_test)
    assert d.X_train is X_train
    assert d.y_train is y_train
    assert d.X_test is X_test
    assert d.y_test is y_test

    d = Dataset(X_train, y_train)

    assert d.X_test is None
    assert d.y_test is None
def test_merge():
    d1 = Dataset(X_train, y_train, X_test, y_test)
    d2 = Dataset(X_train, y_train, X_test, y_test)
    ds = d1.merge(d2)
    assert ds.X_train.shape[1] == 26
    assert ds.X_test.shape[1] == 26

    d1 = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train),
                 pd.DataFrame(X_test), pd.DataFrame(y_test))
    d2 = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train),
                 pd.DataFrame(X_test), pd.DataFrame(y_test))
    d1.merge(d2, inplace=True)
    assert d1.X_train.shape[1] == 26
    assert d1.X_test.shape[1] == 26
Example #3
def test_initialization():
    Dataset(X_train, y_train, X_test, y_test)
    Dataset(X_train, y_train, X_test)
    Dataset(X_train, y_train)

    Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train), pd.DataFrame(X_test))

    # with pytest.raises(ValueError):
    #     Dataset()

    Dataset(preprocessor=preprocess)

    CustomDataset()
Example #4
def test_hashing():
    assert Dataset(X_train, y_train, X_test,
                   y_test).hash == '0cb7e710b7319bb71e7328e4b422b374'
    assert Dataset(X_train, y_train,
                   X_test).hash == 'c9b316f827981b3d0b53f8ab139234ea'
    assert Dataset(
        pd.DataFrame(X_train), pd.DataFrame(y_train),
        pd.DataFrame(X_test)).hash == 'c9b316f827981b3d0b53f8ab139234ea'

    assert Dataset(
        np.asfortranarray(X_train), np.asfortranarray(y_train),
        np.asfortranarray(X_test)).hash == '8087697aa8460a25314edc85cc915ec8'

    d_hash = TestDataset().hash
    assert d_hash == TestDataset().hash
Example #5
def test_hashing():
    assert Dataset(X_train, y_train, X_test,
                   y_test).hash == '13fceb92d1485772af58252810646711'
    assert Dataset(X_train, y_train,
                   X_test).hash == '116d39a012c2b54df573a8b8d0eae85c'
    assert Dataset(
        pd.DataFrame(X_train), pd.DataFrame(y_train),
        pd.DataFrame(X_test)).hash == '116d39a012c2b54df573a8b8d0eae85c'

    assert Dataset(
        np.asfortranarray(X_train), np.asfortranarray(y_train),
        np.asfortranarray(X_test)).hash == 'e2ebd8834b5b3bfb6b9ff3e4697ceb24'

    d_hash = CustomDataset().hash
    assert d_hash == CustomDataset().hash
Example #6
def test_shapes():
    x_t = np.random.rand(100, 5)
    y_t = np.random.rand(5, 1)

    with pytest.raises(ValueError):
        assert Dataset(X_train, y_train, x_t, y_test)

    with pytest.raises(ValueError):
        assert Dataset(X_train, y_t, x_t, y_test)

    with pytest.raises(ValueError):
        assert Dataset(X_train, y_train, X_test, y_t)

    with pytest.raises(ValueError):
        assert Dataset(X_train, y_train, x_t)
Example #7
def test_merge():
    d1 = Dataset(X_train, y_train, X_test, y_test)
    d2 = Dataset(X_train, y_train, X_test, y_test)
    ds = d1.merge(d2)
    assert ds.X_train.shape[1] == 26
    assert ds.X_test.shape[1] == 26

    d1 = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train),
                 pd.DataFrame(X_test), pd.DataFrame(y_test))
    d2 = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train),
                 pd.DataFrame(X_test), pd.DataFrame(y_test))
    d1.merge(d2, inplace=True)
    assert d1.X_train.shape[1] == 26
    assert d1.X_test.shape[1] == 26
def test_slicing():
    train_index = np.array(range(100))
    test_index = np.array(range(100, 250))

    d = Dataset(X_train, y_train, X_test, y_test)

    Xtrain, ytrain, Xtest, ytest = d.split(indices=(train_index, test_index))
    assert Xtrain.shape[0] == 100
    assert ytrain.shape[0] == Xtrain.shape[0]

    assert Xtest.shape[0] == 150
    assert Xtest.shape[0] == ytest.shape[0]

    d = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train),
                pd.DataFrame(X_test), pd.DataFrame(y_test))
    Xtrain, ytrain, Xtest, ytest = d.split(indices=(train_index, test_index))
    assert Xtrain.shape[0] == 100
    assert ytrain.shape[0] == Xtrain.shape[0]

    assert Xtest.shape[0] == 150
    assert Xtest.shape[0] == ytest.shape[0]
def test_split():
    d = Dataset(X_train, y_train)
    d.split(inplace=True)
    assert d.X_test is not None
    assert d.y_test is not None

    d = Dataset(X_train, y_train)
    data = d.split(inplace=False)
    assert all(x is not None for x in data)

    d = Dataset(X_train, y_train)
    data = d.split(inplace=False)
    assert all([x is not None for x in data])

    d = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train))
    train_ind = np.array(range(0, 70))
    test_ind = np.array(range(30, 100))
    data = d.split(indices=[train_ind, test_ind])
    assert isinstance(data[0], (pd.DataFrame, pd.Series))
    assert isinstance(data[2], (pd.DataFrame, pd.Series))
Example #10
def stacking(self):
    from heamy.dataset import Dataset
    from heamy.estimator import Regressor
    from heamy.pipeline import ModelsPipeline
    # %%time
    dataset = Dataset(X_train, y_train, X_test)
    models_dic = {
        'random forest': RandomForestRegressor(n_estimators=50, random_state=seed),
        'linear regression': LinearRegression(normalize=True),
        'knn': KNeighborsRegressor(),
        'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')
    }
    for name, model in models_dic.items():
        # shuffle=True is required when passing random_state in recent scikit-learn
        kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
        print(f'{name} : {-np.mean(cv_results):.2f}')
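    # A minimal sketch (assumed, not part of the original snippet) of the stacking
    # step the heamy imports above set up, reusing two of the base models:
    models = [
        Regressor(dataset=dataset, estimator=RandomForestRegressor,
                  parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
        Regressor(dataset=dataset, estimator=LinearRegression,
                  parameters={'normalize': True}, name='lr'),
    ]
    stack_ds = ModelsPipeline(*models).stack(k=10, seed=seed)
    stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
    predictions = stacker.predict()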
Example #11
def test_slicing():
    train_index = np.array(range(100))
    test_index = np.array(range(100, 250))

    d = Dataset(X_train, y_train, X_test, y_test)

    Xtrain, ytrain, Xtest, ytest = d.split(indices=(train_index, test_index))
    assert Xtrain.shape[0] == 100
    assert ytrain.shape[0] == Xtrain.shape[0]

    assert Xtest.shape[0] == 150
    assert Xtest.shape[0] == ytest.shape[0]

    d = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train),
                pd.DataFrame(X_test), pd.DataFrame(y_test))
    Xtrain, ytrain, Xtest, ytest = d.split(indices=(train_index, test_index))
    assert Xtrain.shape[0] == 100
    assert ytrain.shape[0] == Xtrain.shape[0]

    assert Xtest.shape[0] == 150
    assert Xtest.shape[0] == ytest.shape[0]
"""
Created on Mon Oct 29 15:25:05 2018

@author: Administrator
"""

from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from xgboost import XGBClassifier

# create the dataset
dataset = Dataset(X_train, y_train, X_n)  # substitute X_n for X_test when predicting on the unlabeled training set

model_xgb = Classifier(dataset=dataset,
                       estimator=XGBClassifier,
                       parameters={
                           'reg_alpha': 0.01,
                           'n_estimators': 100,
                           'objective': 'binary:logistic',
                           'seed': 32,
                           'gamma': 0.4,
                           'colsample_bytree': 0.75,
                           'subsample': 0.8,
                       },
                       name='xgb')

model_xgb2 = Classifier(dataset=dataset,
                        estimator=XGBClassifier,
                        name='xgb2')
"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作"""
from sklearn.model_selection import train_test_split

"""数据集设置"""
X_train = df_data.loc[df_data['sample']=='train', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)
X_test = df_data.loc[df_data['sample']=='test', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)

y_train = df_data.loc[df_data['sample']=='train', 'isDefault']
# train/validation split
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)


from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)

from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_xgb, model_lgb)
pipeline

# Build the first-layer features: k defaults to 5 (5-fold cross-validation); with full_test=True each base learner is fitted on the full training set, and its predictions on the test set become the new features
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)

from sklearn.linear_model import LogisticRegression
# second layer: stack with logistic regression
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
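# A possible next step (an assumption, mirroring other examples on this page):
# get the stacked model's test-set predictions with heamy's predict().
predictions = stacker.predict()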
Example #14
vec = TfidfVectorizer(ngram_range=(1, 1),
                      min_df=3,
                      max_df=0.8,
                      use_idf=1,
                      smooth_idf=1,
                      sublinear_tf=1)
trn_term_doc = vec.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(trn_term_doc,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=111)
# X_train=X_train.toarray()
# X_test=X_test.toarray()
print(type(X_train))
# create the dataset
dataset = Dataset(X_train, y_train, X_test, use_cache=False)
class_use_cache = False
# create the NB and LR models
model_nb = Classifier(dataset=dataset,
                      estimator=MultinomialNB,
                      name='nb',
                      use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={
                          'C': 4,
                          'dual': True,
                          'n_jobs': -1
                      },
                      name='lr',
                      use_cache=class_use_cache)
Example #15
models_dic = {
    'random forest':
    RandomForestRegressor(n_estimators=50, random_state=seed),
    'linear regression':
    LinearRegression(normalize=True),
    'knn':
    KNeighborsRegressor(),
    'catboost':
    CatBoostRegressor(custom_metric=['MAE'],
                      random_seed=seed,
                      logging_level='Silent')
}

from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from catboost import CatBoostRegressor
#import adaboost
# prepare the dataset
dataset = Dataset(x_test, t_test, x_train2)  # X_test isn't used this time, but omitting it raises an error

# define the models used in the ensemble
models = [
    Regressor(dataset=dataset,
              estimator=RandomForestRegressor,
              parameters={
                  'n_estimators': 50,
                  'random_state': seed
              },
              name='rf'),
    Regressor(dataset=dataset,
              estimator=LinearRegression,
              parameters={'normalize': True},
              name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
]

from heamy.dataset import Dataset
from sklearn.svm import SVC

#---------------------------------------------------------load the datasets-------------------------------#
data_train = pd.read_csv('data_analysis/data_train.csv',encoding='gb2312')
targets = data_train['TARGET']
train_data = data_train.drop(labels=['TARGET'],axis=1)

data_test = pd.read_csv('data_analysis/data_test.csv',encoding='gb2312')

test_data = data_test.drop(labels=['FORTARGET','PROB'],axis=1)
# ------------------------------------------------------- split the samples -----------------------------------#
# train_x,test_x,train_y,test_y = train_test_split(train_data,targets,test_size=0.5,random_state=66)
# create dataset
# dataset = Dataset(train_data,targets,test_data)
dataset = Dataset(train_data,targets,test_data)
#xgb = XGBClassifier(n_estimators=1350, scale_pos_weight=4, nthread=-1, seed=6, max_depth=3, min_child_weight=6, learning_rate=0.05,
#                    gamma=0, subsample=0.9, colsample_bytree=0.9, reg_alpha=8)
#--------------------------------------------------------stacking model----------------------#
model_rf1 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'entropy',
                                   'min_samples_split': 15, 'n_jobs': -1}, name='rf1')
model_rf2 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'gini',
                                   'min_samples_split': 15, 'n_jobs': -1}, name='rf2')

model_gdbt1 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.06, 'random_state': 1}, name='gdbt1')
model_gdbt2 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07, 'random_state': 2}, name='gdbt2')
model_gdbt3 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'deviance',
Example #17
from heamy.cache import np_hash
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.utils.main import generate_columns, group_models, report_score
from heamy.utils.optimizer import Optimizer


def boston_dataset():
    data = load_boston()
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
    return X_train, y_train, X_test, y_test


dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
model = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True})
model_2 = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50})


def test_generate_columns():
    x = np.random.rand(100, 10)
    output = generate_columns(x, 'test')
    assert len(output) == 10
    assert all([True if col.find('test') == 0 else False for col in output])

    output = generate_columns(np.random.rand(100), 'test')
    assert output[0] == 'test'


def test_optimizer():
stacking_pre = []
Y_test = []
err_shop_rf = []
err_shop_gbrt = []
i = 0
while i < 500:
    # readfile
    X = []
    Y = readfile_oneshop_Y(fr2)
    start = len(Y)
    #print(start)
    X = readfile_oneshop_X(fr1,xday,xweekend,xweekday,xholiday)[-(start+14):-14]
    x_train = X[:-7]
    y_train = Y[:-7]
    x_test = X[-7:]
    y_test = Y[-7:]
    dataset = Dataset(x_train, y_train, x_test)
    ###
    params_rcv = {'cv': 8, 'normalize': True, 'gcv_mode': 'auto', 'scoring': 'neg_mean_squared_error'}
    rcv = RidgeCV
    params_lascv = {'n_jobs': 4, 'cv': 8}
    lascv = LassoCV

    params_rf = {'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2, 'warm_start': True, 'n_jobs': 4, 'oob_score': True, 'max_features': 'log2'}
    params_rf2 = {'n_estimators': 400, 'max_depth': 10, 'min_samples_split': 2, 'warm_start': True, 'n_jobs': 4, 'oob_score': True, 'max_features': 'log2'}
    params_br = {'n_iter': 300}
    br = BayesianRidge
    params_knn = {'n_neighbors': 14, 'algorithm': 'auto'}
    knn = KNeighborsRegressor

    rf = RandomForestRegressor
    rf2 = RandomForestRegressor
Example #19
def test_cache():
    d = Dataset(X_train, y_train, X_test, y_test)
    assert not d._cache()

    cache_dir = '.cache/heamy/'

    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    d = CustomDataset()
    d._cache()

    d = CustomDataset(use_cache=True)
    d.load()
    assert d.loaded

    d = CustomDataset(use_cache=False)
    d.load()
    assert d.loaded

    d = CustomDataset3(use_cache=True)
    d.load()
    d._cache()

    d = CustomDataset3(use_cache=True)
    d.load()
    assert isinstance(d.X_train, (pd.DataFrame, pd.Series))
Example #20
scaler = StandardScaler()
data_X = scaler.fit_transform(data_X)
data_y = scaler.fit_transform(data_y)
data_X = pd.DataFrame(data_X)
data_y = pd.DataFrame(data_y)
# train/test split
# train_X, test_X, train_y, test_y = train_test_split(data_X, data_y1, test_size=0.8, random_state=7)
# print(np.shape(train_X))
# print("################ model ensembling ##############")
x_train, x_test, y_train, y_test = train_test_split(data_X,
                                                    data_y,
                                                    test_size=0.33,
                                                    random_state=2018)
y_train = np.asarray(y_train).reshape(-1, 1)
# create the dataset
dataset = Dataset(x_train, y_train.ravel(), x_test)
# create the LR, RF, and GBDT models
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_gbdt = Regressor(dataset=dataset,
                       estimator=GradientBoostingRegressor,
                       parameters={
                           'n_estimators': 50,
                           'learning_rate': 0.05,
                           'max_depth': 4,
Example #21
logging.info('read from .....')
with open('../input_data/trn_term_doc_13.pil', 'rb') as f:
    trn_term_doc = pickle.load(f)
with open('../input_data/test_term_doc_13.pil', 'rb') as f:
    test_term_doc = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(trn_term_doc,
                                                    y,
                                                    test_size=0.01,
                                                    random_state=111)
print('tttt')
# X_train=X_train.toarray()
# X_test=X_test.toarray()
print('to array')
# create the dataset
dataset = Dataset(X_train, y_train, test_term_doc, use_cache=False)
# create the NB and LR models

class_use_cache = False
model_nb = Classifier(dataset=dataset,
                      estimator=MultinomialNB,
                      name='nb',
                      use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={
                          'C': 4,
                          'dual': True,
                          'n_jobs': -1
                      },
                      name='lr',
                      use_cache=class_use_cache)
Example #22
def test_repr():
    assert str(Dataset(X_train, y_train, X_test,
                       y_test)) == 'Dataset(13fceb92d1485772af58252810646711)'
def test_cache():
    d = Dataset(X_train, y_train, X_test, y_test)
    assert not d._cache()

    cache_dir = '.cache/heamy/'

    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    d = TestDataset()
    d._cache()

    d = TestDataset(use_cache=True)
    d.load()
    assert d.loaded

    d = TestDataset(use_cache=False)
    d.load()
    assert d.loaded

    d = TestDataset3(use_cache=True)
    d.load()
    d._cache()

    d = TestDataset3(use_cache=True)
    d.load()
    assert isinstance(d.X_train, (pd.DataFrame, pd.Series))
Example #24
def test_repr():
    assert str(Dataset(X_train, y_train, X_test,
                       y_test)) == 'Dataset(0cb7e710b7319bb71e7328e4b422b374)'

    xgb_params = {
        'n_estimators': 500,
        'num_class': 5,
        'objective': 'multi:softprob',
        'subsample': 0.8}

    bst = xgb.train(xgb_params, xgb.DMatrix(train_x, train_yt))
    fscores = bst.get_fscore()

    filtered_fscores_t = {k: v for k, v in fscores.items() if v > 0}
    filtered_cols_t = list(filtered_fscores_t.keys())

    train_xf = train_x[filtered_cols_t]
    # val_xr = val_x[filtered_cols]
    test_xf = test_x[filtered_cols_t]

    dataset_full = Dataset(train_x.astype(np.float64), train_yt, test_x.astype(np.float64))
    dataset_f = Dataset(train_xf.astype(np.float64), train_yt, test_xf.astype(np.float64))

    xgb_params = {
        'learning_rate': 0.05,
        'max_depth': 6,
        'n_estimators': 500,
        # 'num_class': 5,
        'objective': 'multi:softprob',
        'subsample': 0.8}

    model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb')
    model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb_f')
    model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier, parameters={'n_estimators': 700},
                          name='rf')
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

#stack
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)
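# A possible second stage (an assumption; it mirrors the Boston-housing stacking
# example later on this page): train a LinearRegression stacker on the
# out-of-fold features and predict the test set.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()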
Example #27
    with open('../input_data/test_term_doc_13.pil', 'rb') as f:
        test_term_doc = pickle.load(f)

    with open('../input_data/trn_term_doc_wc_13.pil', 'rb') as f:
        trn_term_doc_wc = pickle.load(f)
    with open('../input_data/test_term_doc_wc_13.pil', 'rb') as f:
        test_term_doc_wc = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(trn_term_doc, y, test_size=0.01, random_state=111)

X_train_wc, X_test_wc, y_train_wc, y_test_wc = train_test_split(trn_term_doc_wc, y, test_size=0.01, random_state=111)
print('tttt')
# X_train=X_train.toarray()
# X_test=X_test.toarray()
print('to array')
# create the datasets
dataset = Dataset(X_train, y_train, test_term_doc, use_cache=False)
dataset_wc = Dataset(X_train_wc, y_train_wc, test_term_doc_wc, use_cache=False)

# create the NB, LR, SVM, and KNN models
class_use_cache = False
model_nb = Classifier(dataset=dataset_wc, estimator=MultinomialNB, name='nb', use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset, estimator=LogisticRegression, parameters={'C': 4, 'dual': True, 'n_jobs': -1}, name='lr', use_cache=class_use_cache)
model_lr2 = Classifier(dataset=dataset, estimator=LogisticRegression, parameters={'C': 4, 'multi_class': 'multinomial', 'solver': 'sag', 'dual': False, 'n_jobs': -1}, name='lr2', use_cache=class_use_cache)
model_svm = Classifier(dataset=dataset, estimator=svm.SVC, parameters={'probability': True}, name='svm', use_cache=class_use_cache)
model_svc = Classifier(dataset=dataset, estimator=svm.LinearSVC, name='LinearSVC', use_cache=class_use_cache)
model_knn = Classifier(dataset=dataset, estimator=KNeighborsClassifier, name='knn', use_cache=class_use_cache)
# stack the models
# Returns a new dataset with out-of-fold predictions
logging.info('stack_ds....')
pipeline = ModelsPipeline(model_knn)
# pipeline = ModelsPipeline(model_nb, model_lr, model_lr2)
Example #28
        test[c] *= weights[i]

    train = dtrn_.values
    test = dtst_.values
    y = y_train.ravel() - 1

    return {
        'X_train': train,
        'X_test': test,
        'y_train': y
    }  #dtrn_, dtst_, y_train


#------------------------------------------------------------------------------
#create the dataset
dataset = Dataset(preprocessor=preprocessData, use_cache=True)
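# Note: heamy also accepts a callable preprocessor in place of raw arrays; here it
# returns the {'X_train', 'y_train', 'X_test'} dict built above, while Example #17
# shows the same idea with a preprocessor that returns a plain tuple.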
#------------------------------------------------------------------------------

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, \
    AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
Example #29
def test_convertion():
    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_csc()
    d1.to_dense()

    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_csr()
    d1.to_dense()

    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_dense()

dummy_fea = [
    'sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound',
    'account_grade', 'industry'
]
for _fea in dummy_fea:
    print(_fea)
    le = LabelEncoder()
    le.fit(train_data[_fea].tolist() + test_data[_fea].tolist())
    tmp = le.transform(train_data[_fea].tolist() +
                       test_data[_fea].tolist())
    train_data[_fea] = tmp[:train_data.shape[0]]
    test_data[_fea] = tmp[train_data.shape[0]:]
train_x = train_data.drop(['target'], axis=1)
test_x = test_data.drop(['target'], axis=1)
lgb_dataset = Dataset(train_x,
                      train_data['target'],
                      test_x,
                      use_cache=False)
##############################
train_data = pd.read_csv('8288train.csv', engine='python')
train_data = train_data.fillna(0)
test_data = pd.read_csv('8288test.csv', engine='python')
test_data = test_data.fillna(0)
dummy_fea = [
    'sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound',
    'account_grade', 'industry'
]
train_test_data = pd.concat([train_data, test_data],
                            axis=0,
                            ignore_index=True)
train_test_data = train_test_data.fillna(0)
dummy_df = pd.get_dummies(train_test_data.loc[:, dummy_fea])
def test_convertion():
    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_csc()
    d1.to_dense()

    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_csr()
    d1.to_dense()

    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_dense()
# load boston dataset from sklearn
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
Data = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression, parameters={'normalize': True}, name='lr')


# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel, LRModel)
StackModel = Pipeline.stack(k=10, seed=2)

# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()
# Validate results using 10 fold cross-validation
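# An assumed completion using heamy's validate() and the mean_absolute_error
# imported above:
Results = Stacker.validate(k=10, scorer=mean_absolute_error)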
Example #33
def test_split():
    d = Dataset(X_train, y_train)
    d.split(inplace=True)
    assert d.X_test is not None
    assert d.y_test is not None

    d = Dataset(X_train, y_train)
    data = d.split(inplace=False)
    assert all(x is not None for x in data)

    d = Dataset(X_train, y_train)
    data = d.split(inplace=False)
    assert all([x is not None for x in data])

    d = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train))
    train_ind = np.array(range(0, 70))
    test_ind = np.array(range(30, 100))
    data = d.split(indices=[train_ind, test_ind])
    assert isinstance(data[0], (pd.DataFrame, pd.Series))
    assert isinstance(data[2], (pd.DataFrame, pd.Series))