# Drop incomplete training rows, then separate the features (every column but
# the last) from the target (the last column).
dataset = dataset.dropna()
# Work on an explicit copy so the clean-up below edits X itself rather than a
# view of `dataset`. The rows already contain no NaN, so a second dropna on X
# would be a no-op.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1].values
submit = submit.fillna(0)

# Replacing 0's and unknown values of gender by unknown.
# The result is assigned back instead of calling replace(..., inplace=True) on
# an attribute-accessed column: that form mutates a temporary Series and is
# silently ignored when pandas Copy-on-Write is active.
X['Gender'] = X['Gender'].replace(['0', 'unknown'], 'unknown')
submit['Gender'] = submit['Gender'].replace(['0', 'unknown'], 'unknown')

# Replacing values of 0's by No in the university column
X['University Degree'] = X['University Degree'].replace(['0', 0], 'No')
submit['University Degree'] = submit['University Degree'].replace(['0', 0], 'No')

# Dropping the instance column only as it gives best result
X = X.drop(columns=['Instance'])
submit = submit.drop(columns=['Instance'])

# Train test split of 80 : 20
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Using Categorical Boost Regressor
from catboost import CatBoostRegressor
model = CatBoostRegressor(task_type='GPU', iterations=100000, learning_rate=0.005)
# cat_features are positional column indices counted after 'Instance' was dropped.
model.fit(X_train, y_train, cat_features=[1, 3, 5, 6, 8], eval_set=(X_test, y_test))
# NOTE(review): the score is computed but not stored or printed; it is only
# visible when this runs as the last expression of a notebook cell.
model.score(X_test, y_test)

# Getting the predicted values
ans = model.predict(submit)
예제 #2
0
def train_model(X, X_test, y, params=None, folds=folds, model_type='lgb', plot_feature_importance=False, model=None):
    """Cross-validate one model type and average its test predictions.

    For every split produced by ``folds.split(X)`` a model is fitted on the
    training part, scored with RMSE on the validation part, and used to
    predict ``X_test``.

    Args:
        X: Training features. Indexed as ``X.values[...]`` unless
            ``model_type == 'sklearn'``, in which case it is indexed directly.
        X_test: Test features to predict for each fold.
        y: Training target, indexed with the fold index arrays.
        params: Keyword parameters for the 'lgb', 'xgb' and 'cat' models;
            ``None`` is treated as an empty dict.
        folds: Splitter object exposing ``split(X)``.
        model_type: One of 'lgb', 'xgb', 'sklearn' or 'cat'.
        plot_feature_importance: For 'lgb' only, plot the top 50 features.
        model: Pre-built estimator, used only when ``model_type == 'sklearn'``.

    Returns:
        ``(oof, prediction)``, or ``(oof, prediction, feature_importance)``
        when ``model_type == 'lgb'`` and ``plot_feature_importance`` is true.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('lgb', 'xgb', 'sklearn', 'cat'):
        raise ValueError('Unsupported model_type: {!r}'.format(model_type))
    # ``**params`` below cannot unpack None.
    params = {} if params is None else params

    oof = np.zeros(X.shape[0])
    prediction = np.zeros(X_test.shape[0])
    scores = []
    feature_importance = pd.DataFrame()
    # Count the folds actually run so the averages below always match the
    # splitter that was passed in, rather than relying on a module global.
    n_folds_run = 0
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        n_folds_run += 1
        print('Fold', fold_n, 'started at', time.ctime())
        if model_type == 'sklearn':
            X_train, X_valid = X[train_index], X[valid_index]
        else:
            X_train, X_valid = X.values[train_index], X.values[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=20000, nthread=4, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='rmse',
                      verbose=1000, early_stopping_rounds=200)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist,
                              early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test.values), ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            # The caller-supplied estimator is refitted on every fold.
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            y_pred = model.predict(X_test)

        else:  # 'cat'
            model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)

        prediction += y_pred

        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_folds_run

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_folds_run
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            # The importance frame is only returned when plotting was requested.
            return oof, prediction, feature_importance

    return oof, prediction
예제 #3
0
    # Split the three score columns and the target into train/test parts
    # (train_test_split defaults; no random_state is fixed here).
    print("Train test split")
    X_train, X_test, y_train, y_test = train_test_split(
        X[['score_x', 'score_y', 'score']].values, y.values)

    # Hard-coded switch: the training branch is disabled, so the LightGBM
    # model is always restored from ./model_lgb.pk.
    # NOTE(review): pickle executes arbitrary code on load; only safe if the
    # .pk files are produced locally by this same script.
    if False:
        model = LGBMRegressor(n_estimators=200)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)
        with open("./model_lgb.pk", "wb") as f:
            pickle.dump(model, f)
    else:
        with open("./model_lgb.pk", "rb") as f:
            model = pickle.loads(f.read())

    # The whole CatBoost section is disabled by the outer `if False`, so
    # `model2` is never assigned in this fragment.
    if False:
        if False:
            model2 = CatBoostRegressor(iterations=1000, learning_rate=0.1)
            model2.fit(X_train, y_train)
            with open("./model_cb.pk", "wb") as f:
                pickle.dump(model2, f)
        else:
            with open("./model_cb.pk", "rb") as f:
                model2 = pickle.load(f)

    # XGBoost: training is disabled, the model is restored from ./model_xgb.pk.
    if False:
        model3 = XGBRegressor(n_estimators=1000, verbosity=2)
        model3.fit(X_train, y_train)
    else:
        with open("./model_xgb.pk", "rb") as f:
            model3 = pickle.loads(f.read())

    # NOTE(review): presumably a 20-worker process pool rather than a
    # catboost.Pool — confirm against the file's imports.
    pool = Pool(20)
예제 #4
0
def test_predict_sklearn_regress():
    """Fit a two-iteration regressor, save it, and compare it with the canonical model."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(pool)
    # The saved model file is what gets compared against the canonical output.
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
예제 #5
0
def test_wrong_params_regressor():
    """Constructing a regressor with an unknown parameter must raise CatboostError."""
    # Callable form of pytest.raises: invokes CatBoostRegressor(wrong_param=1).
    pytest.raises(CatboostError, CatBoostRegressor, wrong_param=1)
예제 #6
0
                      shuffle=True)
# num_bins = np.int(1 + np.log2(len(train)))
# bins = pd.cut(train['Global_Sales'], bins=num_bins, labels=False)
# for i, (train_idx, valid_idx) in enumerate(skf.split(train, bins.values)):
for i, (train_idx, valid_idx) in enumerate(skf.split(train, publisher)):
    x_train, x_valid = train.iloc[train_idx], train.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # # Publisherでfoldを割ってるので、trainはデータを分割した後にカラムをドロップ
    # x_train = x_train.drop(drop_column, axis=1)
    # x_valid = x_valid.drop(drop_column, axis=1)

    train_data = Pool(x_train, y_train)
    valid_data = Pool(x_valid, y_valid)

    model = CatBoostRegressor(**cab_params)
    model.fit(train_data,
              eval_set=valid_data,
              early_stopping_rounds=50,
              verbose=False,
              use_best_model=True)
    cab_valid_pred = model.predict(x_valid)
    score = mean_squared_error(y_valid, cab_valid_pred)**.5
    print(f'Fold {i} CAB RMSLE: {score}')

    cab_oof_pred[valid_idx] = cab_valid_pred
    models.append(model)
    scores.append(score)

    model = lgbm.LGBMRegressor(**lgbm_params)
    model.fit(
예제 #7
0
def run_train_all_sklearn(file, fp_name, cv=5, verbose=0, seed=1):
    """Cross-validate a fixed set of regressors on each synergy target.

    For every target key the function builds one pipeline per regressor, runs
    ``cross_validate`` with the ``pearson`` and ``rmse`` scorers, and stores
    the mean scores. When the final ``'name'`` key is reached the accumulated
    results are pickled to disk.

    Args:
        file: Mapping indexed by the target keys; each value is used as a
            DataFrame (``.columns``, ``.drop``, column selection).
            NOTE(review): a ``drug_row_col`` column is dropped in place, so
            the caller's frames are mutated.
        fp_name: Tag inserted into the output pickle's file name.
        cv: Number of folds; also used as ``n_jobs`` for all non-CatBoost models.
        verbose: Verbosity passed to ``ProgIter`` and ``cross_validate``.
        seed: Seed for ``np.random.seed``.

    Returns:
        ``defaultdict`` mapping target key -> model name ->
        ``{'test_pearsonr': ..., 'test_rmse': ...}``.
    """
    np.random.seed(seed)
    c = defaultdict(list)

    # NOTE(review): total=5 although six keys are iterated ('name' only
    # triggers the save step) — confirm this is intentional.
    for k in ProgIter([
            'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa',
            'css_ri', 'name'
    ],
                      verbose=verbose,
                      total=5):
        v = file[k]

        if k != 'name':
            # for results storage. Assuming that "name" comes last
            temp = dict()

            if 'drug_row_col' in v.columns:
                v.drop(columns=['drug_row_col'], inplace=True)

            cat_cols = ['cell_line_name']
            # manually find all available categories for one-hot
            categories = [v[column].unique() for column in v[cat_cols]]

            # pipelines: one-hot encode the categorical column and drop the
            # target column k from the features
            encode = Pipeline(steps=[('one-hot-encode',
                                      OneHotEncoder(categories=categories))])
            processor = ColumnTransformer(
                transformers=[('cat_encoding', encode, cat_cols),
                              ('dropping', 'drop', [k])],
                remainder='passthrough')

            # CatBoost receives the categorical column un-encoded; only the
            # target column is dropped.
            catbst = ColumnTransformer(
                transformers=[('dropping', 'drop', [k])],
                remainder='passthrough')

            # regressions
            lr = make_pipeline(processor, linear_model.LinearRegression())
            ridge = make_pipeline(processor, linear_model.Ridge())
            lasso = make_pipeline(processor, linear_model.Lasso())
            elastic = make_pipeline(processor, linear_model.ElasticNet())
            lassolars = make_pipeline(processor, linear_model.LassoLars())
            b_ridge = make_pipeline(processor, linear_model.BayesianRidge())
            kernel = DotProduct() + WhiteKernel()
            gpr = make_pipeline(processor,
                                GaussianProcessRegressor(kernel=kernel))
            linSVR = make_pipeline(processor, LinearSVR())
            hist_gbr = make_pipeline(
                processor,
                HistGradientBoostingRegressor(warm_start=True, max_depth=6))
            rfr = make_pipeline(
                processor,
                RandomForestRegressor(warm_start=True, max_depth=6, n_jobs=3))
            iso = make_pipeline(processor,
                                IsotonicRegression(increasing='auto'))
            xgb = make_pipeline(
                processor, XGBRegressor(tree_method='gpu_hist', max_depth=6))
            cbt = make_pipeline(
                catbst,
                CatBoostRegressor(task_type='GPU',
                                  depth=6,
                                  cat_features=np.array([0]),
                                  verbose=False))

            # NOTE(review): `gpr` appears twice below (the second run
            # overwrites the first result) and the `xgb` pipeline built above
            # is never evaluated — confirm whether the second 'gpr' was meant
            # to be 'xgb'.
            mls = [
                cbt, rfr, gpr, hist_gbr, lr, ridge, lasso, elastic, lassolars,
                b_ridge, gpr, linSVR, iso
            ]
            mls_names = [
                "cbt", "rfr", "gpr", "hist_gbr", "lr", "ridge", "lasso",
                "elastic", "lassolars", "b_ridge", "gpr", "linSVR", "iso"
            ]

            # results
            start = time.time()
            for MODEL, name in zip(mls, mls_names):
                print(f'\n{name}')
                # The GPU CatBoost pipeline is run with a single job; every
                # other model uses one job per fold.
                if 'cbt' == name:
                    n_jobs = 1
                else:
                    n_jobs = cv
                cv_dict = cross_validate(
                    MODEL,
                    v,
                    v[k],
                    cv=cv,
                    scoring={
                        "pearsonr": pearson,
                        "rmse": rmse
                    },
                    return_train_score=False,
                    verbose=verbose,
                    n_jobs=n_jobs,
                )
                temp[name] = {
                    'test_pearsonr': np.nanmean(cv_dict['test_pearsonr']),
                    'test_rmse': abs(np.nanmean(cv_dict['test_rmse']))
                }
                print(temp[name])
            print(f'{k} took {int(time.time()-start)/60} mins')

            c[k] = temp
        else:
            nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_13models_5foldCV_{time.ctime()}.pickle'
            # Use a distinct handle name: the original `as file` rebound the
            # `file` parameter that this loop indexes.
            with open(nm, 'wb') as out_fh:
                pickle.dump(c, out_fh)
            print(f'saving complete to {nm}')
    return c
예제 #8
0
파일: example.py 프로젝트: henadzit/flatico
from catboost import CatBoostRegressor

# Restore the previously fitted regressor from disk.
model = CatBoostRegressor()
model.load_model("fitted_model")

# Feature order: floor_number, total_floors, area, latitude, longitude, apt_state
sample_rows = [[2, 5, 45, 53.908681, 27.572759, 1]]

# Predict for the single sample apartment (the result is not stored).
model.predict(sample_rows)
예제 #9
0
train = pd.read_csv('FINAL_TRAIN_month3.csv')
test = pd.read_csv('FINAL_TEST_month3.csv')

# getting cat features indexes
cat_ff = [
    'date1', 'month', 'Класс объекта', 'Огорожена территория',
    'Входные группы', 'Спортивная площадка', 'Автомойка', 'Кладовые',
    'Колясочные', 'Кондиционирование', 'Вентлияция', 'Лифт',
    'Система мусоротведения', 'Видеонаблюдение', 'Подземная парковка',
    'Двор без машин', 'most_otdelka', 'most_vid', 'most_plan_size'
]
cat_ff = name_to_col_num(train.drop(['value', 'bulk_id'], axis=1), cat_ff)

if CROSS_VALIDATION:
    model = CatBoostRegressor(random_state=19, iterations=1500)
    # model = CatBoostRegressor(random_state=1, iterations=1300, learning_rate=0.03, depth=10)

    local_validation_cutoff = pd.DatetimeIndex(['2017-12-01'
                                                ]).astype(np.int64)[0]
    X_train = train[train.date1 < local_validation_cutoff].drop(
        ['value', 'bulk_id'], axis=1)
    y_train = train[train.date1 < local_validation_cutoff]['value']
    X_validation = train[train.date1 >= local_validation_cutoff].drop(
        ['value', 'bulk_id'], axis=1)
    y_validation = train[train.date1 >= local_validation_cutoff]['value']

    f_pool = Pool(X_train, y_train, cat_features=cat_ff)
    model.fit(
        X_train,
        y_train,
예제 #10
0
def main(args):
    """Run the GBDT-guided architecture search loop.

    Steps, as implemented below:
      1. Build the search space and evaluate ``args.n`` sampled architectures
         to obtain initial (architecture, validation score) pairs.
      2. For each of ``args.iterations`` rounds: fit a CatBoost regressor on
         all evaluated architectures, prune the search space, sample
         ``args.m`` candidates, rank them by predicted score, really train the
         top ``args.k``, then test the ``args.k_test`` best architectures seen
         so far and cache the round state.

    Architectures whose training raises a CUDA ``RuntimeError`` are skipped.

    Args:
        args: Parsed options; reads ``dataset``, ``seed``, ``n``, ``m``, ``k``,
            ``k_test``, ``p``, ``iterations`` and ``gbdt_lr``.

    Side effects:
        Writes pickles under ``embeddings/`` and ``cache/gbdt/`` (both
        directories must already exist) and prints progress.
    """

    # build search space
    data = load_data(args.dataset, args.seed)
    ss, _ = pruning_search_space_by_eda(data)

    if data.setting == 'inductive':
        trainer = InductiveTrainer()
    else:
        trainer = TransductiveTrainer()

    sampler = Sampler(args.dataset, ss)

    archs = []
    val_scores = []

    top_archs = []
    top_val_scores = []
    top_test_scores = []

    # init training data for GBDT
    sampled_archs = sampler.sample(args.n)

    i = 0
    while i < len(sampled_archs):
        arch = sampled_archs[i]
        data = sampler.load_data(arch)
        try:
            model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
            trainer.init_trainer(model, arch[7], arch[6])
            val_score = trainer.train(data)
        except RuntimeError as e:
            if "cuda" in str(e) or "CUDA" in str(e):     # CUDA OOM, sample another arch
                print(e)
                sampled_archs += sampler.sample(1)
                i += 1
                continue
            else:
                raise

        archs.append(arch)
        val_scores.append(val_score)
        print(arch, f'real val score: {val_score}')
        print(f'Number of evaluated archs: {len(archs)}')

        i += 1

    # train GBDT predictor
    for iter_round in range(1, args.iterations + 1):
        print(f'Iteration round {iter_round}, ReTraining model and sampling archs...', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # train GBDT: every architecture component is treated as a categorical
        # feature, hence the conversion to strings
        X = [[str(e) for e in row] for row in archs]
        y = np.array(val_scores)
        train_pool = Pool(X, y, cat_features=list(range(len(X[0]))))
        gbdt_model = CatBoostRegressor(
            learning_rate=args.gbdt_lr,
            verbose=False
        )
        gbdt_model.fit(train_pool)
        # pruning search space
        ss = pruning_search_space_by_shap(archs, gbdt_model, ss, args.p)
        sampler.update_search_space(ss)

        # predict some archs
        sampled_archs = sampler.sample(args.m)
        X = [[str(e) for e in row] for row in sampled_archs]
        test_pool = Pool(X, cat_features=list(range(len(X[0]))))
        predicted_val_scores = gbdt_model.predict(test_pool)

        # sort the archs according to the predicted value
        zipped = zip(sampled_archs, predicted_val_scores)
        zipped = sorted(zipped, key=lambda e: e[1], reverse=True)  # sort in decreasing order
        sampled_archs, predicted_val_scores = zip(*zipped)
        sampled_archs, predicted_val_scores = list(sampled_archs), list(predicted_val_scores)

        print(f'Iteration round {iter_round}, evaluating top k archs on valid set', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # evaluate top k archs
        i = 0
        while i < len(sampled_archs):
            arch = sampled_archs[i]
            data = sampler.load_data(arch)
            try:
                model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
                trainer.init_trainer(model, arch[7], arch[6])
                val_score = trainer.train(data)
                predicted_val_score = predicted_val_scores[i]
            except RuntimeError as e:
                if "cuda" in str(e) or "CUDA" in str(e):     # CUDA OOM, sample another arch
                    print(e)
                    sampled_archs += sampler.sample(1)
                    i += 1
                    continue
                else:
                    raise

            archs.append(arch)
            val_scores.append(val_score)
            print(arch, f'predicted val score: {predicted_val_score} | real val score: {val_score}')
            print(f'Number of evaluated archs: {len(archs)}')

            if i + 1 >= args.k:
                break

            i += 1

        # sort all the evaluated archs
        zipped = zip(archs, val_scores)
        zipped = sorted(zipped, key=lambda e: e[1], reverse=True)
        archs, val_scores = zip(*zipped)
        archs, val_scores = list(archs), list(val_scores)

        print(f'Iteration round {iter_round}, evaluating top k_test archs on test set', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # evaluate top k_test archs on test set
        i = 0
        while i < len(archs):
            arch = archs[i]
            data = sampler.load_data(arch)
            try:
                model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
                trainer.init_trainer(model, arch[7], arch[6])
                val_score = trainer.train(data)
                test_score, z = trainer.test(data, return_logits=True)
                # Context manager so the handle is closed (and flushed) right away.
                with open(f'embeddings/{args.dataset}_AutoGRL-round{iter_round}-top{i + 1}.pt', 'wb') as emb_fh:
                    pickle.dump((z, data.y[data.test_mask]), emb_fh)

            except RuntimeError as e:
                if "cuda" in str(e) or "CUDA" in str(e):     # CUDA OOM, skip this arch
                    print(e)
                    i += 1
                    continue
                else:
                    raise

            top_archs.append(arch)
            top_val_scores.append(val_score)
            top_test_scores.append(test_score)

            print(arch)
            print(f'Testing... round {iter_round} | arch top {i + 1} | real val score {val_score} | real test score {test_score}', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

            if i + 1 >= args.k_test:  # only test top k_test models for every round
                break

            i += 1

        # Test score that corresponds to the best validation score seen so far.
        zipped = zip(top_val_scores, top_test_scores)
        zipped = sorted(zipped, key=lambda e: e[0], reverse=True)
        best_val_score, corr_test_score = zipped[0][0], zipped[0][1]

        # logging
        print(f'Iteration {iter_round} | best val score {best_val_score} | corresponding test score {corr_test_score} | best test score {max(top_test_scores)}', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        # Cache the full round state.
        with open(f'cache/gbdt/{args.dataset}_seed{args.seed}_round{iter_round}.pt', 'wb') as cache_fh:
            pickle.dump((ss, sampler, trainer, archs, val_scores, gbdt_model, sampled_archs, predicted_val_scores, top_val_scores, top_test_scores), cache_fh)
예제 #11
0
from catboost import CatBoostRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the diabetes dataset as (features, target) arrays.
X, y = load_diabetes(return_X_y=True)

# Split the data, holding out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

# Train the model.
clf = CatBoostRegressor(
    iterations=800,
    learning_rate=0.8,
    depth=6,
    loss_function='RMSE',
)
fit_model = clf.fit(X_train, y_train)

# Model parameters.
print(fit_model.get_params())

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(y_test, y_pred)
print(f'mean squared error: {mse}')
r2 = r2_score(y_test, y_pred)
print(f'r2 score: {r2}')
예제 #12
0
def main():
    """Predict a well log on seismic slices with a per-slice CatBoost model.

    Driven by the parsed command line (``getcommandline()``):

    * With ``wellscsv``: read the wells table, derive the vertical increment
      from its second column, and take the last column as the log to predict.
    * With ``segyfileslist``: copy the first SEG-Y file to the output name and
      zero it; then, for each slice in ``startendslice``, join the scaled
      seismic attributes with the wells at that level, fit a
      ``CatBoostRegressor``, predict the log at the wells and on the whole
      slice, and optionally plot. Finally write the per-slice predictions
      CSV, the QC CSV, the predicted SEG-Y traces and the wells prediction CSV.

    NOTE(review): the SEG-Y branch uses ``allwells``, ``dz``, ``wfname``,
    ``logname`` and ``outfw`` that are only defined in the ``wellscsv`` branch,
    so both options are effectively required together.
    """
    cmdl = getcommandline()
    if cmdl.wellscsv:
        allwells = pd.read_csv(cmdl.wellscsv)
        # Vertical increment taken from the second column (depth or time).
        dz = np.diff(allwells[allwells.columns[1]])[2]
        print('Well Vertical increment {}'.format(dz))
        wdirsplit, wfextsplit = os.path.split(cmdl.wellscsv)
        wfname, wfextn = os.path.splitext(wfextsplit)
        wcols = allwells.columns.tolist()
        print(wcols)
        logname = wcols[-1]
        print('logname:', logname)
        lognamepred = logname + 'pred'
        wcols.append(lognamepred)

        if cmdl.outdir:
            outfw = os.path.join(cmdl.outdir, wfname) + "_pred.csv"
        else:
            outfw = os.path.join(wdirsplit, wfname) + "_pred.csv"

    if cmdl.segyfileslist:
        sflist = process_segylist(cmdl.segyfileslist)

        dirsplit, fextsplit = os.path.split(sflist[0])
        fname, fextn = os.path.splitext(fextsplit)
        if cmdl.outdir:
            outfsegy = os.path.join(cmdl.outdir,
                                    wfname) + "_p%s.sgy" % (logname)
        else:
            outfsegy = os.path.join(dirsplit, wfname) + "_p%s.sgy" % (logname)

        # The first SEG-Y is copied and zeroed to serve as the output volume.
        print('Copying file, please wait ........')
        start_copy = datetime.now()
        copyfile(sflist[0], outfsegy)
        end_copy = datetime.now()
        print('Duration of copying: {}'.format(end_copy - start_copy))

        sr = get_samplerate(outfsegy)
        print('Seismic Sample Rate: {}'.format(sr))

        print('Zeroing segy file, please wait ........')
        start_zero = datetime.now()
        zero_segy(outfsegy)
        end_zero = datetime.now()
        print('Duration of zeroing: {}'.format(end_zero - start_zero))

        xclst, yclst = get_xy(fextsplit, cmdl.segyxhdr, cmdl.segyyhdr,
                              cmdl.xyscalerhdr)
        xydf = pd.DataFrame({'XC': xclst, 'YC': yclst})
        preddf = xydf.copy()
        # One attribute column per SEG-Y file, named after the file stem.
        # Note: dirsplit ends up pointing at the directory of the LAST file.
        scols = list()
        for f in sflist:
            dirsplit, fextsplit = os.path.split(f)
            fname, fextn = os.path.splitext(fextsplit)
            scols.append(fname)

        sfname = 'allattrib'
        sstart = int(cmdl.startendslice[0] // dz)
        send = int(cmdl.startendslice[1] // dz)
        start_process = datetime.now()
        slicelst = list()
        slicenumlst = list()
        wnlst = list()
        slicewnlst = list()
        coef0lst = list()
        coef1lst = list()
        r2lst = list()
        # Accumulated well predictions; stays None when no slice is processed.
        allwellspred = None
        for slicenum in range(sstart, send):
            if cmdl.outdir:
                outfslice = os.path.join(cmdl.outdir,
                                         sfname) + "_slice%d.csv" % slicenum
            else:
                outfslice = os.path.join(dirsplit,
                                         sfname) + "_slice%d.csv" % slicenum
            zslice = slicenum * dz
            if cmdl.intime:
                wdf = allwells[allwells.TIME == zslice]
            else:
                wdf = allwells[allwells.DEPTH == zslice]
            c = wdf.columns[4]  # log name
            nw = wdf[~wdf[c].isnull()].count()[4]
            if cmdl.intime:
                print('# of wells for time slice {} is {}'.format(zslice, nw))
            else:
                print('# of wells for depth slice {} is {}'.format(zslice, nw))

            slicefiles = [get_slice(segyf, slicenum) for segyf in sflist]
            slicear = np.array(slicefiles).T
            slicedf = pd.DataFrame(slicear, columns=scols)

            alldata = pd.concat((xydf, slicedf), axis=1)
            if cmdl.intime:
                print('Slice#: {} @ Time : {} ms'.format(slicenum, zslice))
            else:
                print('Slice#: {} @ Depth : {} ms'.format(slicenum, zslice))

            if cmdl.slicesout:
                alldata.to_csv(outfslice, index=False)
            alldatas = process_sscalecols(alldata, includexy=cmdl.includexy)
            wdfsa = process_seiswellattrib(alldatas, wdf, cmdl.intime)
            print(wdfsa.tail())
            # Features are the columns after the first four; target is the last.
            X = wdfsa.iloc[:, 4:-1]
            y = wdfsa.iloc[:, -1]
            # Number of real wells, remembered before optional augmentation.
            inshape = y.size
            if y.size > 2 and cmdl.generatesamples:
                X, y = gensamples(X,
                                  y,
                                  nsamples=cmdl.generatensamples,
                                  ncomponents=cmdl.generatencomponents,
                                  kind='r',
                                  func='cbr')
            Xpred = alldatas.iloc[:, 2:]

            model = CatBoostRegressor(iterations=cmdl.cbriterations,
                                      learning_rate=cmdl.cbrlearningrate,
                                      depth=cmdl.cbrdepth,
                                      loss_function='RMSE',
                                      random_seed=42,
                                      logging_level='Silent')
            # Fit model
            model.fit(X, y)
            # Get predictions
            ypred = model.predict(X)
            # Calculating Mean Squared Error
            mse = np.mean((ypred - y)**2)
            print('Metrics on input data: ')
            print('MSE: %.4f' % (mse))
            r2 = r2_score(y, ypred)
            print('R2 : %10.3f' % r2)

            ccmdl = sts.pearsonr(y, ypred)
            if slicenum == sstart:
                wellsdf = wdfsa[wdfsa.columns[:4]].copy()
                wellsdf[logname] = wdfsa[wdfsa.columns[-1]].copy()
                # Only the first `inshape` predictions belong to real wells.
                if cmdl.generatesamples:
                    wellsdf[lognamepred] = ypred[:inshape]
                else:
                    wellsdf[lognamepred] = ypred
                # Makes a single-slice run still produce a wells table
                # (previously `allwellspred` was left undefined).
                allwellspred = wellsdf
            else:
                wellsdf0 = wdfsa[wdfsa.columns[:4]].copy()
                wellsdf0[logname] = wdfsa[wdfsa.columns[-1]].copy()
                if cmdl.generatesamples:
                    wellsdf0[lognamepred] = ypred[:inshape]
                else:
                    wellsdf0[lognamepred] = ypred
                # pd.concat replaces DataFrame.append, removed in pandas 2.0.
                allwellspred = pd.concat([wellsdf, wellsdf0])
                wellsdf = allwellspred[wcols].copy()
                print(allwellspred.tail())
                print(allwellspred.shape)

            pred = model.predict(Xpred)
            alldatas[wdfsa.columns[4]] = pred
            slicestr = '{:.0f}'.format(zslice)
            preddf[slicestr] = pred

            # Linear fit of predicted vs. actual, used for the QC line/coefficients.
            qc0 = np.polyfit(y, ypred, 1)
            xrngmin, xrngmax = y.min(), y.max()
            xvi = np.linspace(xrngmin, xrngmax)
            yvi0 = np.polyval(qc0, xvi)

            if slicenum % cmdl.plotincrement == 0:
                slicedepth = slicenum * dz
                fig, ax = plt.subplots()

                plt.scatter(y,
                            ypred,
                            alpha=0.5,
                            c='b',
                            s=15,
                            label='Model Predicted')
                if cmdl.generatesamples:
                    ax.scatter(y[inshape:],
                               ypred[inshape:],
                               c='r',
                               marker='X',
                               s=25,
                               label='Generated Samples')

                plt.plot(xvi, yvi0, c='k', lw=2)

                ax.annotate('Model = %-.*f * Actual + %-.*f' %
                            (2, qc0[0], 2, qc0[1]),
                            xy=(xvi[0], yvi0[0]),
                            xytext=(0.14, 0.85),
                            textcoords='figure fraction',
                            fontsize=10)
                ax.annotate('Model Pearson cc = %-.*f   Pearson p = %-.*f' %
                            (2, ccmdl[0], 3, ccmdl[1]),
                            xy=(xvi[0], yvi0[0]),
                            xytext=(0.14, 0.81),
                            textcoords='figure fraction',
                            fontsize=10)
                ax.set_title(f'CBR Slice {slicedepth:.0f} Pseudo {logname}')
                ax.set_xlabel('Actual')
                ax.set_ylabel('Predicted')
                if not cmdl.hideplots:
                    plt.show()
                swfname = 'SWAttrib'
                if cmdl.outdir:
                    pdfcl = os.path.join(
                        cmdl.outdir, swfname
                    ) + f"{slicedepth:.0f}" + "_cbr%s.pdf" % (logname)
                    wsdf = os.path.join(
                        cmdl.outdir, swfname
                    ) + f"{slicedepth:.0f}" + "_cbr%s.csv" % (logname)
                else:
                    pdfcl = os.path.join(
                        dirsplit, swfname
                    ) + f"{slicedepth:.0f}" + "_cbr%s.pdf" % (logname)
                    wsdf = os.path.join(
                        dirsplit, swfname
                    ) + f"{slicedepth:.0f}" + "_cbr%s.csv" % (logname)
                fig.savefig(pdfcl)
                # Release the figure so long runs do not accumulate open figures.
                plt.close(fig)
                wdfsa.to_csv(wsdf, index=False)
                print(f'Successfully generated {wsdf}')

            slicelst.append(zslice)
            wnlst.append(nw)
            slicewnlst.append(wdfsa.shape[0])
            slicenumlst.append(slicenum)
            r2lst.append(r2)
            coef0lst.append(qc0[0])
            coef1lst.append(qc0[1])

        end_process = datetime.now()
        print('Duration of ML model building and prediction : {}'.format(
            end_process - start_process))

        qccols = [
            'SLICENUM', 'SLICEZ', 'WELLSFOUND', 'WELLSUSED', 'COEF0', 'COEF1',
            'R2'
        ]
        qcdf = pd.DataFrame({
            'SLICENUM': slicenumlst,
            'SLICEZ': slicelst,
            'WELLSFOUND': wnlst,
            'WELLSUSED': slicewnlst,
            'COEF0': coef0lst,
            'COEF1': coef1lst,
            'R2': r2lst
        })
        qcdf = qcdf[qccols].copy()

        if cmdl.outdir:
            outseispred = os.path.join(cmdl.outdir, wfname) + "_slices.csv"
            outqc = os.path.join(cmdl.outdir, wfname) + "_qc.csv"
        else:
            outseispred = os.path.join(dirsplit, wfname) + "_slices.csv"
            outqc = os.path.join(dirsplit, wfname) + "_qc.csv"

        preddf.to_csv(outseispred, index=False)
        print('Successfully generated {}'.format(outseispred))
        print('DataFrame size: ', preddf.shape)
        # Number of predicted slices (all columns except XC and YC).
        endsmpl = preddf.shape[1] - 2

        qcdf.to_csv(outqc, index=False)
        print('Successfully generated {}'.format(outqc))

        # Write each trace's predicted samples into the zeroed output SEG-Y.
        with sg.open(outfsegy, "r+") as srcp:
            for trnum, tr in enumerate(srcp.trace):
                trplog = preddf.iloc[trnum, 2:].values
                tr[sstart:(sstart + endsmpl)] = trplog
                srcp.trace[trnum] = tr
        print('Successfully generated {}'.format(outfsegy))

        # Nothing to write or plot when the slice range was empty.
        if allwellspred is not None:
            allwellspred.to_csv(outfw, index=False)
            print('Successfully generated {}'.format(outfw))
            plotwells(allwellspred, hideplots=cmdl.hideplots)
예제 #13
0
# Mean absolute error of the current model on the held-out set.
pred = model.predict(X_test_sn)
print(sklearn.metrics.mean_absolute_error(y_test_sn, pred))


from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

# Train one CatBoost model per fold; all of them are kept for averaging later.
kfolds = 4
models = []
kfold = KFold(n_splits=kfolds, shuffle=True)
for i, (train_index, test_index) in enumerate(kfold.split(X)):
    print('Training cat model with fold {}...'.format(i + 1))
    # KFold yields positional indices, so select rows with .iloc
    # (.ix was removed in pandas 1.0).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = CatBoostRegressor(iterations=200, learning_rate=0.03,
                              depth=6, l2_leaf_reg=3,
                              loss_function='MAE', eval_metric='MAE')
    models.append(model.fit(X_train, y_train))


# For each month, overwrite the 'month' feature and average the fold models.
months = np.array([10, 11, 12])
y_pred = []

for month in months:
    pred = 0
    print(month)
    if month != 0:
        X_validation['month'] = month
    for model in models:
        print('next model...')
        pred += model.predict(X_validation) / kfolds
    y_pred.append(pred)
예제 #14
0
from pytrends.request import TrendReq
import datetime
import pandas as pd
import regex as re
from catboost import CatBoostRegressor, Pool

# Restore the two pre-trained CatBoost regressors from disk at import time.
post_model = CatBoostRegressor()
post_model.load_model("models/posts_model")

doc_model = CatBoostRegressor()
doc_model.load_model("models/doc_model")


def trends(topic):
    """Return the mean hourly Google Trends interest for ``topic`` over the past week.

    Requests hourly interest from seven days ago until now through pytrends
    and averages the most recent 168 samples (7 days * 24 hours).

    Parameters
    ----------
    topic
        Search term; passed to pytrends as the only keyword and used to
        select the matching column of the result.

    Returns
    -------
    float
        Mean interest over the window, or ``0.0`` when no samples came back.
    """
    hours_per_week = 7 * 24

    # Use real datetime arithmetic: subtracting 7 from the day-of-month gave
    # an invalid start date during the first week of every month.
    end = datetime.datetime.now()
    start = end - datetime.timedelta(days=7)

    pytrends = TrendReq(hl='ru-RU', tz=360)
    interest = pytrends.get_historical_interest(
        [topic],
        year_start=start.year, month_start=start.month,
        day_start=start.day, hour_start=start.hour,
        year_end=end.year, month_end=end.month,
        day_end=end.day, hour_end=end.hour,
        cat=0, geo='', gprop='', sleep=0)[topic]

    # Positional slice of the newest samples.  The previous loop indexed with
    # -0 (the *oldest* sample) and summed only 167 values while dividing by 168.
    recent = interest.iloc[-hours_per_week:]
    if recent.empty:
        return 0.0
    return float(recent.mean())
예제 #15
0
    def test_benchmark_classification(self):
        """Run the benchmark on iris with every selector and pin ten score columns."""

        features, target = get_data_label(load_iris())

        top_k = 3
        threshold = 0.5
        penalty = 1000
        estimator_kwargs = {"random_state": 123, "n_estimators": 100}

        # Selector name -> selection method; insertion order is kept as-is.
        selectors = {
            "corr_pearson": SelectionMethod.Correlation(threshold, method="pearson"),
            "corr_kendall": SelectionMethod.Correlation(threshold, method="kendall"),
            "corr_spearman": SelectionMethod.Correlation(threshold, method="spearman"),
            "univ_anova": SelectionMethod.Statistical(top_k, method="anova"),
            "univ_chi_square": SelectionMethod.Statistical(top_k, method="chi_square"),
            "univ_mutual_info": SelectionMethod.Statistical(top_k, method="mutual_info"),
            "linear": SelectionMethod.Linear(top_k, regularization="none"),
            "lasso": SelectionMethod.Linear(top_k, regularization="lasso", alpha=penalty),
            "ridge": SelectionMethod.Linear(top_k, regularization="ridge", alpha=penalty),
            "random_forest": SelectionMethod.TreeBased(top_k),
            "xgboost_clf": SelectionMethod.TreeBased(top_k, estimator=XGBClassifier(**estimator_kwargs)),
            "xgboost_reg": SelectionMethod.TreeBased(top_k, estimator=XGBRegressor(**estimator_kwargs)),
            "extra_clf": SelectionMethod.TreeBased(top_k, estimator=ExtraTreesClassifier(**estimator_kwargs)),
            "extra_reg": SelectionMethod.TreeBased(top_k, estimator=ExtraTreesRegressor(**estimator_kwargs)),
            "lgbm_clf": SelectionMethod.TreeBased(top_k, estimator=LGBMClassifier(**estimator_kwargs)),
            "lgbm_reg": SelectionMethod.TreeBased(top_k, estimator=LGBMRegressor(**estimator_kwargs)),
            "gradient_clf": SelectionMethod.TreeBased(top_k, estimator=GradientBoostingClassifier(**estimator_kwargs)),
            "gradient_reg": SelectionMethod.TreeBased(top_k, estimator=GradientBoostingRegressor(**estimator_kwargs)),
            "adaboost_clf": SelectionMethod.TreeBased(top_k, estimator=AdaBoostClassifier(**estimator_kwargs)),
            "adaboost_reg": SelectionMethod.TreeBased(top_k, estimator=AdaBoostRegressor(**estimator_kwargs)),
            "catboost_clf": SelectionMethod.TreeBased(top_k, estimator=CatBoostClassifier(**estimator_kwargs, silent=True)),
            "catboost_reg": SelectionMethod.TreeBased(top_k, estimator=CatBoostRegressor(**estimator_kwargs, silent=True))
        }

        # Benchmark
        score_df, selected_df, _ = benchmark(selectors, features, target, output_filename=None)
        calculate_statistics(score_df, selected_df)

        # Expected per-feature scores, checked in this order.
        expected_scores = {
            "corr_pearson": [0.7018161715727902, 0.47803395524999537, 0.8157648279049796, 0.7867331225527027],
            "corr_kendall": [0.6127053183332257, 0.35502921869499415, 0.6778502590804124, 0.6548312268837866],
            "corr_spearman": [0.7207411401565564, 0.4413611232398492, 0.7823000090067262, 0.7652468370362326],
            "univ_anova": [119.26450218449871, 49.16004008961098, 1180.1611822529776, 960.0071468018025],
            "univ_chi_square": [10.81782087849401, 3.7107283035324987, 116.31261309207022, 67.04836020011116],
            "univ_mutual_info": [0.4742659474041446, 0.2458627871667194, 0.9899864089960027, 0.9892550496360593],
            "linear": [0.28992981466266715, 0.5607438535573831, 0.2622507287680856, 0.04272068858604694],
            "lasso": [0.7644807315853743, 0.594582626209646, 0.3661598482641388, 1.0152555188158772],
            "ridge": [1.646830819860649e-15, 1.572815951552305e-15, 3.2612801348363973e-15, 5.773159728050814e-15],
            "random_forest": [0.09210348279677849, 0.03045933928742506, 0.4257647994615192, 0.45167237845427727],
        }
        for selector_name, expected in expected_scores.items():
            self.assertListAlmostEqual(expected, score_df[selector_name].to_list())
예제 #16
0
                # For IJHN create 2 clusters for high and low group
                knclf = KNeighborsClassifier(n_neighbors=5)
                y_kn = [1 if x > 170 else 0 for x in y_train]
                knclf.fit(X_train_nona, y_kn)
                X_train['high_low_ind'] = knclf.predict(X_train_nona)
                X_valid['high_low_ind'] = knclf.predict(X_valid_nona)
                X_test_type['high_low_ind'] = knclf.predict(
                    X_test_type[X_train_nona.columns])
            train_dataset = Pool(data=X_train, label=y_train)
            valid_dataset = Pool(data=X_valid, label=y_valid)
            test_dataset = Pool(data=X_test_type)
            model = CatBoostRegressor(
                iterations=N_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                #loss_function=EVAL_METRIC,
                task_type="GPU")  # Train on GPU

            model.fit(train_dataset,
                      eval_set=valid_dataset,
                      early_stopping_rounds=500)
            now = timer()
            update_tracking(run_id,
                            '{}_tr_sec_f{}'.format(bond_type, fold_n + 1),
                            (now - fold_start),
                            integer=True)
            logger.info('Saving model file')
            model.save_model('models/{}-{}-{}-{}.model'.format(
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor

# Tuned hyper-parameter set.  It is not passed to the CatBoostRegressor
# constructed right below, which uses its own explicit settings.
best_params = dict(
    bagging_temperature=0.6,
    border_count=200,
    depth=8,
    iterations=350,
    l2_leaf_reg=30,
    learning_rate=0.30,
    random_strength=0.01,
    scale_pos_weight=0.48,
)

# RMSE-optimised regressor that is handed to cross-validation afterwards.
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=1000,
    learning_rate=0.1,
    depth=3,
)
# model = LogisticRegression(
#                 penalty='l2',
#                 C=1.0,
#                 fit_intercept=True,
#                 random_state=432,
#                 solver = 'liblinear',
#                 max_iter = 1000,
#         )
stats = cross_validate(model,
                       X,
                       y,
                       groups=None,
                       scoring='roc_auc',
                       cv=5,
예제 #18
0
    'Hour', 'week_day', 'Year', 'Day', 'season'
]

# Gradient-boosted tree regressor from XGBoost.
xgb_model = xgb.XGBRegressor(
    booster='gbtree',
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.8,
)
# Feature columns used for both the XGBoost and the CatBoost model.
xgb_cols = [
    'weather',
    'atemp',
    'humidity',
    'windspeed',
    'holiday',
    'workingday',
    'Hour',
    'week_day',
    'Year',
    'Day',
    'season',
]

# NOTE(review): `params` is not passed to the CatBoost model created below.
params = dict(depth=6, learning_rate=0.05, iterations=150)
cat_model = CatBoostRegressor(iterations=1000)
cat_model.fit(train[xgb_cols], train['registered_log'])

# Stack the four base regressors under a linear meta-learner.
lr = LinearRegression()
streg_model = StackingCVRegressor(
    meta_regressor=lr,
    regressors=[cat_model, rf_model, gbm_model, xgb_model],
)

scores_casual_cat = cross_val_score(cat_model,
                                    train[xgb_cols],
                                    train['casual_log'],
                                    cv=5,
                                    scoring=make_scorer(
                                        log_rmsle, greater_is_better=False))
scores_r_cat = cross_val_score(cat_model,
                               train[xgb_cols],
                               train['registered_log'],
예제 #19
0
#          'grow_policy': 'Depthwise',
#          'l2_leaf_reg': 126,
#          'learning_rate': 0.30065425194784257,
#          'max_depth': 16,
#          #'max_leaves': 54,
#          'min_data_in_leaf': 90,
#          'random_strength': 10,
#          'iterations':2000,
#          'eval_metric': 'RMSE',
#          'random_seed':13,
#          'verbose':25,
#          'task_type': 'GPU',
#          'od_type':'Iter',
#          'od_wait': 20 }

# Build the CatBoost regressor from `param`, which is assigned outside this
# excerpt (the dict literal commented out above is not what is used here).
model_cat = CatBoostRegressor(**param)
# model.fit(train_pool, eval_set=validation_pool, verbose= True)
# cost(np.log1p(y_test), model.predict(validation_pool))
# Delegate training/prediction to the project helper `cluster_id`.
# NOTE(review): presumably it fits `model_cat` per cluster of 'building_id'
# values; the helper is not visible here, so confirm against its definition.
pred_cat = cluster_id('cat',
                      train_new,
                      test_new,
                      'building_id',
                      model_cat,
                      num_cluster,
                      num_iters,
                      skip=True,
                      param=None)
print('Cat predict end')

#### Lasso
# Lasso regressor with fixed regularisation strength and seed.
model_lasso = Lasso(alpha=1, random_state=13)
예제 #20
0
def catboost_regressor_learner(df: pd.DataFrame,
                               features: List[str],
                               target: str,
                               learning_rate: float = 0.1,
                               num_estimators: int = 100,
                               extra_params: Dict[str, Any] = None,
                               prediction_column: str = "prediction",
                               weight_column: str = None) -> LearnerReturnType:
    """
    Fits a CatBoost regressor to the dataset. A Pool is built from the
    `features` and `target` columns of `df`, a CatBoostRegressor is trained
    on it, and the prediction function, the predictions for `df` and a
    training log are returned.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be numerical and continuous, since this is a regression model.

    learning_rate : float
        Float in range [0,1].
        Step size shrinkage applied at each boosting step; it is passed to
        CatBoost as the `eta` hyper-parameter. See:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in range [0, inf]
        Number of boosted trees to fit; passed to CatBoost as `iterations`.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.
    """
    import catboost
    from catboost import CatBoostRegressor, Pool

    # Per-row weights are optional; the learning rate is injected as "eta"
    # into a copy of the extra parameters.
    if weight_column:
        weights = df[weight_column].values
    else:
        weights = None
    params = assoc(extra_params or {}, "eta", learning_rate)

    train_pool = Pool(df[features].values,
                      df[target].values,
                      weight=weights,
                      feature_names=[str(name) for name in features])
    cbr = CatBoostRegressor(iterations=num_estimators, **params).fit(train_pool, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        score_pool = Pool(new_df[features].values,
                          feature_names=[str(name) for name in features])
        col_dict = {prediction_column: cbr.predict(score_pool)}

        # Without SHAP only the prediction column is appended.
        if not apply_shap:
            return new_df.assign(**col_dict)

        import shap
        explainer = shap.TreeExplainer(cbr)
        shap_values = list(explainer.shap_values(new_df[features]))
        expected_value = explainer.expected_value

        # One SHAP vector and one copy of the expected value per row.
        shap_output = {
            "shap_values": shap_values,
            "shap_expected_value": np.repeat(expected_value, len(shap_values)),
        }
        return new_df.assign(**merge(col_dict, shap_output))

    p.__doc__ = learner_pred_fn_docstring("CatBoostRegressor", shap=False)

    learner_info = {
        'features': features,
        'target': target,
        'prediction_column': prediction_column,
        'package': "catboost",
        'package_version': catboost.__version__,
        'parameters': assoc(params, "num_estimators", num_estimators),
        'feature_importance': cbr.feature_importances_,
        'training_samples': len(df),
    }
    log = {'catboost_regression_learner': learner_info}

    return p, p(df), log
예제 #21
0
def run_train(file,
              fp_name,
              cv=10,
              for_valid=0.4,
              ordered=False,
              ram_fraction=0.95,
              save=False,
              cv_params=None):
    """Train CatBoost regressors on repeated random train/validation splits.

    For each target key (``synergy_zip``, ``synergy_bliss``, ``synergy_loewe``,
    ``synergy_hsa``, ``css_ri``) the frame ``file[key]`` is split ``cv`` times
    (NumPy seed 1..cv), a ``CatBoostRegressor`` is fitted on the training part
    and evaluated on the validation part with Pearson correlation and RMSE.
    The ``name`` key is handled last and only records metadata.

    Parameters
    ----------
    file : mapping
        Indexed by the target keys above plus ``'name'``.  Each target entry
        must be a DataFrame containing a column named like its key; a
        ``drug_row_col`` column, if present, is dropped.
    fp_name
        Interpolated into the pickle file name when ``save`` is true.
    cv : int
        Number of random splits per target.
    for_valid : float
        Fraction of rows sampled without replacement for validation.
    ordered : bool
        When true, ``boosting_type='Ordered'`` is added to the parameters.
    ram_fraction : float
        Passed to CatBoost as ``gpu_ram_part``.
    save : bool
        When true, every split also keeps its labels and predictions and the
        collected results are pickled when the ``name`` key is reached.
    cv_params : dict, optional
        CatBoost constructor parameters; must contain ``'cat_features'``.
        When omitted a GPU configuration is used.  The dict is copied, so the
        caller's object is never modified.

    Returns
    -------
    collections.defaultdict
        ``{target: [list of per-split results]}`` plus
        ``'name': [[file['name'], for_valid, cv]]``.  A per-split result is
        ``[pearson, rmse]`` or, with ``save``, ``[pearson, rmse, labels, preds]``.
    """
    if cv_params is None:
        cv_params = {
            'bootstrap_type': 'Poisson',
            'l2_leaf_reg': 9,
            'learning_rate': 0.15,
            'depth': 10,
            'cat_features': ['cell_line_name'],
            'use_best_model': True,
            'early_stopping_rounds': 50,
            'iterations': 5000,
            'task_type': 'GPU',
        }
    else:
        # Work on a copy: the keys added below must not leak into the
        # dictionary owned by the caller.
        cv_params = dict(cv_params)
    if ordered:
        cv_params['boosting_type'] = 'Ordered'

    cat_features = cv_params['cat_features']
    cv_params['gpu_ram_part'] = ram_fraction

    c = defaultdict(list)
    keys = [
        'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa',
        'css_ri', 'name'
    ]

    for k in ProgIter(keys, total=len(keys), verbose=1):
        v_temp = file[k]

        if k == 'name':
            # Record the name entry itself.  The previous code stored the
            # leftover DataFrame of the last target here instead.
            c['name'].append([v_temp, for_valid, cv])
            if save:
                nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_noreplicates_{for_valid}_{time.ctime()}.pickle'
                # Dedicated handle name so the `file` argument is not shadowed.
                with open(nm, 'wb') as fh:
                    pickle.dump(c, fh)
            continue

        if 'drug_row_col' in v_temp.columns:
            v = v_temp.drop(columns=['drug_row_col'], inplace=False)
        else:
            v = v_temp
        size = int(v.shape[0] * for_valid)  # number of validation rows

        a = []
        for i in range(1, cv + 1):
            print(k)
            # Seeded sampling makes split i reproducible across runs.
            np.random.seed(i)
            idx_valid = pd.Index(
                np.random.choice(v.index, size, replace=False))
            idx_train = v.index.difference(idx_valid)
            train = v.loc[idx_train, :]
            valid = v.loc[idx_valid, :]

            # Separate the target column from the features.
            true_labels = valid.pop(k)
            y = train.pop(k)
            eval_dataset = Pool(valid,
                                true_labels,
                                cat_features=cat_features)

            model = CatBoostRegressor(**cv_params)
            model.fit(train,
                      y,
                      eval_set=eval_dataset,
                      plot=False,
                      verbose=1000)

            preds = model.predict(valid)
            corr = pearsonr(true_labels, preds)
            rmse = np.sqrt(mean_squared_error(true_labels, preds))
            print(f'iteration: {i}, pearson: {corr}, rmse: {rmse}')
            if save:
                a.append([corr, rmse, true_labels, preds])
            else:
                a.append([corr, rmse])
        c[k].append(a)
    return c
예제 #22
0
# CatBoostRegressor with every constructor argument spelled out.  Only
# iterations, learning_rate, depth, loss_function and thread_count carry a
# value here; all remaining arguments are passed as None.
# NOTE(review): presumably None means "use the library default" -- confirm
# against the installed catboost version, which must also still accept every
# keyword listed below.
model = CatBoostRegressor(iterations=1000,
                          learning_rate=0.026,
                          depth=4,
                          l2_leaf_reg=None,
                          model_size_reg=None,
                          rsm=None,
                          loss_function='RMSE',
                          border_count=None,
                          feature_border_type=None,
                          fold_permutation_block_size=None,
                          od_pval=None,
                          od_wait=None,
                          od_type=None,
                          nan_mode=None,
                          counter_calc_method=None,
                          leaf_estimation_iterations=None,
                          leaf_estimation_method=None,
                          thread_count=10,
                          random_seed=None,
                          use_best_model=None,
                          verbose=None,
                          logging_level=None,
                          metric_period=None,
                          ctr_leaf_count_limit=None,
                          store_all_simple_ctr=None,
                          max_ctr_complexity=None,
                          has_time=None,
                          one_hot_max_size=None,
                          random_strength=None,
                          name=None,
                          ignored_features=None,
                          train_dir=None,
                          custom_metric=None,
                          eval_metric=None,
                          bagging_temperature=None,
                          save_snapshot=None,
                          snapshot_file=None,
                          fold_len_multiplier=None,
                          used_ram_limit=None,
                          gpu_ram_part=None,
                          allow_writing_files=None,
                          approx_on_full_history=None,
                          boosting_type=None,
                          simple_ctr=None,
                          combinations_ctr=None,
                          per_feature_ctr=None,
                          task_type=None,
                          device_config=None,
                          devices=None,
                          bootstrap_type=None,
                          subsample=None,
                          max_depth=None,
                          n_estimators=None,
                          num_boost_round=None,
                          num_trees=None,
                          colsample_bylevel=None,
                          random_state=None,
                          reg_lambda=None,
                          objective=None,
                          eta=None,
                          max_bin=None,
                          gpu_cat_features_storage=None,
                          data_partition=None)
예제 #23
0
def test_invalid_loss_regressor():
    """A regressor configured with an unknown loss function must raise CatboostError."""
    # Build the pool outside the raises-block: a CatboostError caused by
    # loading the data must fail the test instead of satisfying it.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
예제 #24
0
def fun_cat_fs(x, *args):
    """Objective function: cross-validated error of a CatBoost regressor.

    Parameters
    ----------
    x : sequence of float
        Candidate solution.  ``x[0]`` is the learning rate, ``x[1]`` the
        number of estimators (rounded), ``x[2]`` the tree depth (rounded),
        ``x[3]`` ``l2_leaf_reg`` and ``x[4]`` ``bagging_temperature``.  When
        ``len(x) > 6`` a feature mask is additionally derived from ``x[2:]``.
    *args
        Exactly ``(X, y, flag, n_splits, random_seed)``: the 2-D feature
        array, the targets, the mode flag, the number of K-fold splits and
        the seed used for both the estimator and the fold shuffling.

    Returns
    -------
    float or dict
        With ``flag == 'eval'`` the RMSE of the out-of-fold predictions.
        Otherwise the estimator is refitted on the selected columns and a
        dict with predictions, parameters, the estimator and the error
        measures is returned.  If cross-validation fails, every error
        measure is the penalty value ``1e12`` and ``Y_PRED`` is ``[None]``.
    """
    X, y, flag, n_splits, random_seed = args
    clf = CatBoostRegressor(random_state=int(random_seed), verbose=0)

    _, n_var = X.shape

    p = {
        'learning_rate': x[0],
        'n_estimators': int(round(x[1])),
        'depth': int(round(x[2])),
        'loss_function': 'RMSE',
        'l2_leaf_reg': x[3],
        'bagging_temperature': x[4],
    }
    clf.set_params(**p)

    # Feature mask: all features unless the solution vector carries extra
    # entries.
    # NOTE(review): the mask is read from x[2:], which overlaps the
    # hyper-parameter slots x[2]..x[4] -- confirm the intended offset.
    if len(x) <= 6:
        ft = np.array([1 for i in range(n_var)])
    else:
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])

    ft = np.where(ft > 0.5)

    penalty = 1e12
    try:
        cv = KFold(n_splits=n_splits,
                   shuffle=True,
                   random_state=int(random_seed))
        # NOTE(review): cross-validation runs on all columns of X while the
        # final fit below uses only X[:, ft] -- confirm this is intended.
        y_p = cross_val_predict(clf, X, y.ravel(), cv=cv, n_jobs=1)

        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        # NOTE(review): if r2_score is scikit-learn's, its signature is
        # (y_true, y_pred); the argument order here looks swapped.
        r4 = -r2_score(y_p, y)
    except Exception:
        # A failed evaluation must yield a complete, heavily penalised
        # result: previously only `r` was set, so building the result dict
        # below raised NameError.  KeyboardInterrupt/SystemExit now propagate.
        y_p = [None]
        r = r2 = r3 = r4 = penalty

    if flag == 'eval':
        return r

    # X[:, ft] indexes with the tuple returned by np.where, which adds a
    # singleton axis; squeeze() removes it again.
    clf.fit(X[:, ft].squeeze(), y)
    return {
        'Y_TRUE': y,
        'Y_PRED': y_p,
        'EST_PARAMS': p,
        'PARAMS': x,
        'EST_NAME': 'CAT',
        'ESTIMATOR': clf,
        'ACTIVE_VAR': ft,
        'DATA': X,
        'SEED': random_seed,
        'ERROR_TRAIN': {
            'RMSE': r,
            'MAPE': r2,
            'RRMSE': r3,
            'R2_SCORE': r4
        }
    }
예제 #25
0
# Clamp negative MLP predictions to zero, in place.
y_pred_mlp_rd[y_pred_mlp_rd < 0] = 0

test_eval = eval_metrics(y_test, y_pred_mlp_rd)

# Average the three prediction arrays element-wise and cast to int.
ensemble_sum = (pd.DataFrame(y_pred_test_rd)
                + pd.DataFrame(y_pred_X_test_rd.ravel())
                + pd.DataFrame(y_pred_mlp_rd.ravel()))
y_test_total2 = (ensemble_sum / 3).astype(int)
metrics_total2 = eval_metrics(y_test, y_test_total2[0].values)

from catboost import Pool, CatBoostRegressor, cv
from sklearn.metrics import accuracy_score
model = CatBoostRegressor()

# First fit: monitored on the test split.
model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

# Second fit: same training data, no evaluation set.
# NOTE(review): the model is fitted twice in a row; presumably this second
# call replaces the result of the first -- confirm both are needed.
model.fit(X_train, y_train, plot=True)

# Predict, round to the nearest integer and score.
pred_cat = model.predict(X_test)
pred_cat_rd = np.round(pred_cat)
cat_eval = eval_metrics(y_test, pred_cat_rd)
예제 #26
0
파일: 3train.py 프로젝트: houxiaona/tianyi
    "Feature", "importance"
]].groupby("Feature").mean().sort_values(by="importance", ascending=False)
###############################################
from catboost import CatBoostRegressor

# model = "cat"
for fold_, (trn_idx, val_idx) in enumerate(
        folds.split(train_x.values, train_y.values)):
    print("fold {}".format(fold_))

    # Positional split of features and targets for this fold.
    trn_x, val_x = train_x.iloc[trn_idx], train_x.iloc[val_idx]
    trn_y, val_y = train_y.iloc[trn_idx], train_y.iloc[val_idx]

    num_round = 10000
    clf = CatBoostRegressor(objective="RMSE",  # MultiClass 0.8957
                            n_estimators=num_round,
                            max_depth=6,
                            reg_lambda=0.01,
                            random_seed=2019,
                            verbose=True)
    # Both the training and the validation split are passed as eval sets.
    clf.fit(
        trn_x,
        trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        verbose=200,
        early_stopping_rounds=100,
        # cat_features=cat_features
    )
    # Out-of-fold predictions for the validation rows
    # (original author's note: "n*6 matrix").
    oof[val_idx] = clf.predict(val_x)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
예제 #27
0
def _take_rows(data, index):
    """Select rows by position from a pandas object or a numpy array."""
    return data.iloc[index] if hasattr(data, 'iloc') else data[index]


def train_model_regression(X,
                           X_test,
                           y,
                           params,
                           folds,
                           model_type='lgb',
                           eval_metric='mae',
                           columns=None,
                           plot_feature_importance=False,
                           model=None,
                           verbose=10000,
                           early_stopping_rounds=200,
                           n_estimators=50000,
                           mol_type=-1):
    """
    Train a regression model with cross-validation.

    For every fold a model is fitted on the training part, predictions for the
    held-out part are written into the out-of-fold vector, and predictions for
    ``X_test`` are accumulated and finally averaged over the folds.

    Returns a dictionary with the keys ``'oof'`` (out-of-fold predictions),
    ``'prediction'`` (fold-averaged test predictions), ``'scores'`` (one score
    per fold) and, for ``model_type='lgb'`` with ``plot_feature_importance``,
    ``'feature_importance'``.

    :params: X - training data, pd.DataFrame or 2-D np.ndarray
    :params: X_test - test data, pd.DataFrame or 2-D np.ndarray
    :params: y - target (pandas object or np.ndarray), aligned with X by position
    :params: params - keyword arguments forwarded to the lgb / xgb / cat model
    :params: folds - splitter exposing ``split(X)`` and ``n_splits``
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - one of 'mae', 'group_mae', 'mse'; 'group_mae'
             needs a 'type' column among the selected columns of a DataFrame
    :params: columns - columns to use. If None - use all columns. For a
             np.ndarray these are positional column indices
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, required for the "sklearn" model type
    :params: verbose - logging period passed to lgb / xgb
    :params: early_stopping_rounds - early stopping patience for lgb / xgb
    :params: n_estimators - number of boosting rounds for lgb
    :params: mol_type - tag used in the name of the saved importance CSV

    :raises ValueError: for an unknown ``model_type`` or a missing sklearn model
    :raises KeyError: for an unknown ``eval_metric``
    """
    if model_type not in ('lgb', 'xgb', 'sklearn', 'cat'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'lgb', 'xgb', 'sklearn' or 'cat'.")
    if model_type == 'sklearn' and model is None:
        raise ValueError("model_type='sklearn' requires a `model` instance.")

    # Column selection is done once, up front. A numpy array has no column
    # labels, so `columns` are positional indices there and xgboost gets no
    # explicit feature names.
    if isinstance(X, np.ndarray):
        if columns is None:
            columns = list(range(X.shape[1]))
        X_selected = X[:, columns]
        feature_names = None
    else:
        columns = X.columns if columns is None else columns
        X_selected = X[columns]
        # Must match the selected columns, not every column of X.
        feature_names = list(columns)
    if isinstance(X_test, np.ndarray):
        X_test = X_test[:, columns]
    else:
        X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            # CatBoost has no 'MSE' loss; 'RMSE' optimises the same objective.
            'catboost_metric_name': 'RMSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }
    metric_cfg = metrics_dict[eval_metric]

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        X_train = _take_rows(X_selected, train_index)
        X_valid = _take_rows(X_selected, valid_index)
        y_train = _take_rows(y, train_index)
        y_valid = _take_rows(y, valid_index)

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metric_cfg['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=feature_names)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=feature_names)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # NOTE(review): the number of rounds is fixed at 20000 and does
            # not follow `n_estimators`; kept to preserve existing results.
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(
                X_valid, feature_names=feature_names),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=feature_names),
                                   ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            # The same estimator instance is refitted on every fold.
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            y_pred = model.predict(X_test).reshape(-1, )

        else:  # model_type == 'cat'
            # NOTE(review): iterations are fixed at 20000 and do not follow
            # `n_estimators`; kept to preserve existing results.
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metric_cfg['catboost_metric_name'],
                **params,
                loss_function=metric_cfg['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )

        # One scoring path for every model type, so 'group_mae' also works
        # with sklearn models.
        if eval_metric != 'group_mae':
            fold_score = metric_cfg['sklearn_scoring_function'](y_valid,
                                                                y_pred_valid)
        else:
            fold_score = metric_cfg['scoring_function'](y_valid, y_pred_valid,
                                                        X_valid['type'])
        scores.append(fold_score)

        if model_type == 'sklearn':
            # sklearn models do not log their own progress, so report here;
            # the fold number is 1-based like the "started at" line above.
            print(f'Fold {fold_n + 1}. {eval_metric}: {fold_score:.4f}.')
            print('')

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb' and plot_feature_importance:
        # NOTE(review): importances are divided by n_splits here and then
        # averaged per feature below, so the plotted/saved values are scaled
        # down twice; the ranking is unaffected. Kept as is.
        feature_importance["importance"] /= folds.n_splits
        cols = feature_importance[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[
            feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance",
                    y="feature",
                    data=best_features.sort_values(by="importance",
                                                   ascending=False))
        plt.title('LGB Features (avg over folds)')
        # `log_path` is a module-level path defined elsewhere in the file.
        feature_importance.to_csv(log_path / f"importance_{mol_type}.csv")
        result_dict['feature_importance'] = feature_importance

    return result_dict
예제 #28
0
    def test_benchmark_regression(self):
        """Run the selector benchmark on a reduced Boston dataset and pin the scores."""

        data, label = get_data_label(load_boston())
        dropped_columns = ["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"]
        data = data.drop(columns=dropped_columns)

        num_features = 3
        corr_threshold = 0.5
        alpha = 1000
        tree_params = {"random_state": 123, "n_estimators": 100}

        # Selectors are registered in a fixed order; the benchmark receives
        # them in exactly this order.
        selectors = {}
        for key, method in (("corr_pearson", "pearson"),
                            ("corr_kendall", "kendall"),
                            ("corr_spearman", "spearman")):
            selectors[key] = SelectionMethod.Correlation(corr_threshold, method=method)
        for key, method in (("univ_anova", "anova"),
                            ("univ_chi_square", "chi_square"),
                            ("univ_mutual_info", "mutual_info")):
            selectors[key] = SelectionMethod.Statistical(num_features, method=method)
        selectors["linear"] = SelectionMethod.Linear(num_features, regularization="none")
        for key in ("lasso", "ridge"):
            selectors[key] = SelectionMethod.Linear(num_features, regularization=key, alpha=alpha)
        selectors["random_forest"] = SelectionMethod.TreeBased(num_features)

        tree_estimators = (
            ("xgboost_clf", XGBClassifier(**tree_params)),
            ("xgboost_reg", XGBRegressor(**tree_params)),
            ("extra_clf", ExtraTreesClassifier(**tree_params)),
            ("extra_reg", ExtraTreesRegressor(**tree_params)),
            ("lgbm_clf", LGBMClassifier(**tree_params)),
            ("lgbm_reg", LGBMRegressor(**tree_params)),
            ("gradient_clf", GradientBoostingClassifier(**tree_params)),
            ("gradient_reg", GradientBoostingRegressor(**tree_params)),
            ("adaboost_clf", AdaBoostClassifier(**tree_params)),
            ("adaboost_reg", AdaBoostRegressor(**tree_params)),
            ("catboost_clf", CatBoostClassifier(**tree_params, silent=True)),
            ("catboost_reg", CatBoostRegressor(**tree_params, silent=True)),
        )
        for key, estimator in tree_estimators:
            selectors[key] = SelectionMethod.TreeBased(num_features, estimator=estimator)

        # Benchmark
        score_df, selected_df, runtime_df = benchmark(selectors, data, label, output_filename=None)
        _ = calculate_statistics(score_df, selected_df)

        # Expected scores per selector, checked in this order.
        expected_scores = {
            "corr_pearson": [0.4787777784012165, 0.47170429073431874, 0.5596288196730658,
                             0.4400410275414326, 0.5674082968785575],
            "corr_kendall": [0.5357134888110283, 0.48128808343101986, 0.5132201793752295,
                             0.3384081264406572, 0.49448886053070107],
            "corr_spearman": [0.6542231557010167, 0.5538583519391704, 0.6267310661636885,
                              0.3924548536221991, 0.5984933578623318],
            "univ_anova": [89.48611475768125, 75.25764229895405, 83.47745921923685,
                           63.05422911249312, 601.6178711099022],
            "univ_chi_square": [0, 0, 0, 0, 0],
            "univ_mutual_info": [0.3421450205863028, 0.1806168920395521, 0.31266011627421086,
                                 0.16107911083428794, 0.666208499757925],
            "linear": [0.06901111285092865, 0.05408618283036938, 0.06145227292569164,
                       0.006510036424819454, 0.9546615660373198],
            "lasso": [0.05682706487290267, 0.051008405488957305, 0.05319245109490162,
                      0.007176306398647428, 0.9231211889322195],
            "ridge": [0.0690214777400926, 0.054087779998048285, 0.06144441861097637,
                      0.006510854482697315, 0.95459417786841],
            "random_forest": [0.10947144861974874, 0.020211076089938374, 0.08416074180466389,
                              0.045604950489313435, 0.7405517829963355],
        }
        for selector_name, expected in expected_scores.items():
            self.assertListAlmostEqual(expected, score_df[selector_name].to_list())
예제 #29
0
파일: 0.8429.py 프로젝트: zhaojinxi/tianchi
print("Replacing NaN values by -999 !!")
# Missing values get the sentinel -999, in place, in both frames.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

print("Training time !!")
X_train, y_train = train_df[train_features], train_df['血糖']
print(X_train.shape, y_train.shape)
X_test = test_df[train_features]
print(X_test.shape)

# Seed-averaged CatBoost ensemble: the same configuration is trained once per
# seed and the test predictions are averaged.
num_ensembles = 5
cat_params = dict(iterations=1000,
                  learning_rate=0.03,
                  depth=6,
                  l2_leaf_reg=3,
                  loss_function='RMSE',
                  eval_metric='RMSE')
y_pred_cat = 0.0
for i in tqdm(range(num_ensembles)):
    model = CatBoostRegressor(random_seed=i, **cat_params)
    model.fit(X_train, y_train, cat_features=cat_feature_inds)
    y_pred_cat += model.predict(X_test)
y_pred_cat /= num_ensembles

# Drop the raw frames and reclaim memory before the next model.
del train, test
gc.collect()
################
################
##    OLS     ##
################
################
np.random.seed(17)
예제 #30
0
def test_regression_ctr():
    """Fit a 5-iteration regressor with explicit CTR descriptions and compare the saved model to the canonical one."""
    ctr_settings = [
        'Borders:TargetBorderCount=5:TargetBorderType=Uniform',
        'Counter',
    ]
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(iterations=5,
                                  random_seed=0,
                                  ctr_description=ctr_settings)
    regressor.fit(train_pool)
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)