Example #1
def main():
    train_data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')
    train_y = train_data.iloc[:, -1]
    datas = pd.concat([train_data, test_data], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

    datas.drop(['SalePrice'], axis=1, inplace=True)
    datas.drop(['Id'], axis=1, inplace=True)
    # train_data.info()
    # train_data.describe()
    # Inspect missing values in the data
    # print(datas.isnull().sum().sort_values(ascending=True))
    # print(train_data['MSZoning'].mode())
    datas = data_value_deal(datas)
    train_data, test_data = datapca(train_data, datas)
    # print(train_data[:5],test_data[:5])
    model=XGBRegressor()
    grid=datastrain(model).gradient_get(train_data,train_y,{
        'max_depth':[8],
        'learning_rate':[0.01],
        'n_estimators':[10000]
    })
    model=grid.best_estimator_
    result = rmse_cv(model, train_data, train_y)
    cv_mean = result.mean()
    cv_std = result.std()
    print('cv_mean:', cv_mean, 'cv_std:', cv_std)
    prey = model.predict(train_data)
    model.save_model('001.model')
    acc = np.sqrt(np.power(prey - train_y, 2))  # per-sample absolute error
    print(acc[:5], acc.sum())
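The helper rmse_cv is not shown in this example. A minimal sketch consistent with how it is used here (returning an array of per-fold RMSE values) might be:

from sklearn.model_selection import cross_val_score
import numpy as np

def rmse_cv(model, X, y, folds=5):
    # cross_val_score negates MSE; flip the sign and take the square root
    mse = -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=folds)
    return np.sqrt(mse)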
Example #2
def train_model(train_set_path, model_out_file):
    """
    Train the wine predictor, with parameters discovered in hyper-parameter
    tuning phase. The model is then saved for future use.
    """

    assert train_set_path.endswith('.csv'), f'Received {train_set_path}! ' \
                                            f'Please provide a .csv file'
    hp = {
        'colsample_bytree': 0.3,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'max_depth': 12,
        'min_child_weight': 7
    }
    train_set = pd.read_csv(train_set_path)
    train_y = train_set[['points']]
    train_x = train_set.drop(columns=['points'])

    logger.info(f'XGBoost Regression with parameters: {hp}')
    model = XGBRegressor(random_state=42,
                         colsample_bytree=hp['colsample_bytree'],
                         learning_rate=hp['learning_rate'],
                         max_depth=hp['max_depth'],
                         min_child_weight=hp['min_child_weight'],
                         gamma=hp['gamma'])

    logger.info('Training model...')
    started = time()
    model.fit(train_x, train_y)

    logger.info(f'Model trained in {time() - started} seconds')
    os.makedirs(os.path.dirname(model_out_file), exist_ok=True)
    model.save_model(Path(model_out_file))
    logger.info(f'Model saved to {model_out_file}')
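To reuse the saved model later, a fresh XGBRegressor can load the file; a short sketch (the path is illustrative only):

from xgboost import XGBRegressor

model = XGBRegressor()
model.load_model('models/wine_model.json')  # hypothetical path
predictions = model.predict(train_x)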
Example #3
def train(name="features.pkl"):
    data = pd.read_pickle(name)
    # Keep only configured features, preserving their declared order
    # (set intersection alone would give a nondeterministic column order)
    data = data[[col for col in settings.FEATURES if col in data.columns]]

    X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
    Y_train = data[data.date_block_num < 33]['item_cnt_month']
    X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
    Y_valid = data[data.date_block_num == 33]['item_cnt_month']
    X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)

    del data
    gc.collect()

    ts = time.time()
    model = XGBRegressor(**settings.REGRESSOR_PARAMS)
    model.fit(X_train,
              Y_train,
              eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
              **settings.FIT_PARAMS)
    model.save_model("model.pkl")
    print(f"Training the model in {time.time() - ts}s")

    Y_pred = model.predict(X_valid).clip(0, 20)
    Y_test = model.predict(X_test).clip(0, 20)
    test = pd.read_csv(os.path.join(settings.DATA_PATH, 'sales_test.csv'))
    test = remove_duplicates(test)
    submission = pd.DataFrame({'ID': test.index, 'item_cnt_month': Y_test})
    submission.to_csv('xgb_submission.csv', index=False)
Example #4
def train_trainable(data):
    df0 = data[[
        "date_block_num", "shop_id", "item_id", "id_struct", "item_category",
        "Price_agg", "keyz", "item_cnt_month_lag1", "item_cnt_month_lag2",
        "item_cnt_month_lag3", "item_cnt_month_lag4", "item_cnt_month_lag5",
        "item_cnt_month_lag6", "item_cnt_month_lag7", "Price_agg_lag1",
        "Price_agg_lag2"
    ]]
    df1 = data[["item_cnt_month"]]
    param = {
        'colsample_bytree': 0.8,
        'subsample': 0.75,
        'eta': 0.02,
        'n_estimators': 1100,
        'max_depth': 7,
        'min_child_weight': 1
    }
    model = XGBRegressor(**param)
    model.fit(df0,
              df1,
              eval_metric="rmse",
              eval_set=[(df0, df1)],
              verbose=False,
              early_stopping_rounds=1)
    model.save_model("./models/xgbmodelprime")
Example #5
def predict(course_code, user_id):
    filename = get_path(course_code, '%s_model.xgb' % course_code)

    X, y = load_data(course_code)

    user_X = X.loc[[user_id]]  # keep a 2-D row for predict

    # Normalization
    if course_code not in data_transformer:
        scaler = MinMaxScaler()
        scaler.fit(X)
        data_transformer[course_code] = scaler
    scaler = data_transformer[course_code]

    if course_code not in model_cache:
        model = XGBRegressor()
        if os.path.isfile(filename):
            model.load_model(filename)
        else:
            X_scaled = scaler.transform(X)  # avoid transforming X twice below
            model.fit(X_scaled, y)
            model.save_model(filename)
        model_cache[course_code] = model

    model = model_cache[course_code]
    X = scaler.transform(X)

    y_ = model.predict(X)
    hist, bin_edges = np.histogram(y_, bins=10, range=[0, 1])

    return {
        "classFinalExamDistribution": hist.tolist(),
        "myChapterScore": get_user_chapter_grades(course_code, user_id),
        "myPredictedFinalExamScore": float(model.predict(user_X)[0])
    }
Example #6
    def generate_models(self):
        params = dict(n_estimators=1000, max_depth=7, eta=0.1,
                      subsample=0.7, colsample_bytree=0.8,
                      early_stopping_rounds=20)
        training_set = self.load_training_set()
        for label in labels:
            train, test = self.prepare_data(training_set, label)
            model = XGBRegressor(**params)
            model.fit(train.X, train.y, eval_set=[(test.X, test.y)])
            model.save_model(f"resources/{label}.json")
Example #7
def main():
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if MODEL_FILE.is_file():
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)
    else:
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)

    # Generate predictions on both training and tournament data
    print("Generating predictions...")
    training_data[PREDICTION_NAME] = model.predict(
        training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(
        tournament_data[feature_names])

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )

    # Check the per-era correlations on the validation set (out of sample)
    validation_data = tournament_data[tournament_data.data_type ==
                                      "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std()}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Save predictions as a CSV and upload to https://numer.ai
    tournament_data[PREDICTION_NAME].to_csv(TOURNAMENT_NAME +
                                            "_submission.csv")
Example #8
def xgb_model(n_estimators=[],
              learning_rate=[],
              validation_data=(),
              training_data=(),
              testing_data=(),
              directory='',
              filename=''):
    '''
    Takes lists of estimator counts and learning rates
    along with train/valid/test data.

    Runs the XGB regressor for each combination, saves
    the weights in .model format and the performances
    in a CSV file, and returns the performance
    results in a DataFrame.
    '''
    mse = {}
    for estimator in n_estimators:
        for rate in learning_rate:
            #Instantiating the model
            model = XGBRegressor(n_estimators=estimator, learning_rate=rate)

            #Training the model
            model.fit(training_data[0],
                      training_data[1],
                      early_stopping_rounds=50,
                      eval_set=[(validation_data[0], validation_data[1])],
                      verbose=False)

            #Evaluating the model
            prediction = model.predict(testing_data[0])

            #saving the model
            model.save_model(
                '../src/models/xgb_weights/n_estimator{}_learning_rate{}.model'
                .format(estimator, rate))

            #Calculating the error
            error = mean_squared_error(prediction, testing_data[1])
            mse[error] = [estimator, rate]

    #Converting the dict to a DataFrame
    xgb_performance = pd.DataFrame(data=mse)
    xgb_performance = xgb_performance.transpose()
    xgb_performance.columns = ['n_estimator', 'learning_rate']
    xgb_performance.index.name = 'mse'

    #Saving the performances in a CSV file
    out_dir = os.path.join('../src/models', directory)
    os.makedirs(out_dir, exist_ok=True)
    xgb_performance.to_csv(os.path.join(out_dir, filename + '.csv'))
    return xgb_performance
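A call might look like the following, assuming the splits are already prepared as (X, y) tuples; directory and filename are hypothetical:

results = xgb_model(n_estimators=[100, 500],
                    learning_rate=[0.05, 0.1],
                    training_data=(X_train, y_train),
                    validation_data=(X_val, y_val),
                    testing_data=(X_test, y_test),
                    directory='xgb_runs',
                    filename='grid_results')
print(results.sort_index().head())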
Example #9
class GDPGrowthPredictor:
    """Gbm class"""
    def __init__(self, *args, **kwargs):
        """Create model with given parameters"""
        self.model = XGBRegressor(*args, **kwargs)

    def train(self, filename, split, previous_year, plot, *args, **kwargs):
        """Train model, and plot results"""
        X_train, X_test, y_train, y_test, features = _io.retrieve_training_dataset(
            split, previous_year)
        self.model.fit(X_train, y_train, *args, **kwargs)
        self.save(filename)

        if split != 0:
            self.test(X_test, y_test, features, split, plot)

    def test(self, X_test, y_test, features, split, plot):
        """Test model"""
        model_y_pred = self.model.predict(X_test)

        results_df = X_test.drop(columns=features)
        results_df["y_real"] = y_test
        results_df["y_pred"] = model_y_pred
        results_df["err"] = np.absolute(results_df["y_real"] -
                                        results_df["y_pred"])
        results_df["%_err"] = ((results_df["err"]) /
                               (np.absolute(results_df["y_real"])) * 100)

        logging.info("Test results with %s split:", split)
        logging.info("\t RMSE: %.3f",
                     mean_squared_error(y_test, model_y_pred)**0.5)
        logging.info("\t R^2: %.3f", r2_score(y_test, model_y_pred))

        if plot:
            logging.info("Generating plots")
            plots.plot_performance_results(y_test, model_y_pred)
            plots.plot_shap_results(X_test, features, self.model)

    def predict(self, filename, previous_year, year, *args, **kwargs):
        """Make predictions for next year GDP growth,
        returns a pandas df"""
        self.load(filename)
        predictions, X_predict = _io.retrieve_predict_dataset(
            previous_year, year)
        predictions["Value"] = self.model.predict(X_predict, *args, **kwargs)
        return predictions

    def save(self, filename):
        """ Save model to file"""
        self.model.save_model(filename)
        logging.info("Model saved")

    def load(self, filename):
        """ Load model from file"""
        self.model.load_model(filename)
        logging.info("Model loaded")
Example #10
def train_xgbr(X,
               y,
               param,
               param1,
               param2,
               model_path='./model',
               test_size=0.2,
               estimator=XGBRegressor,
               score=mean_absolute_error):
    '''
    Train the model with the best parameters found.

    Inputs: X, y, param, param1, param2: see the parameter-search function search_best_param()
                - 'name' and 'date' are dropped ('date' could be feature-engineered,
                  but this version does not do so)
            test_size: test-set fraction
            model_path: where to save the model
            estimator: model used for the parameter search
            score: scoring metric for the parameter search
             *Note: the final model is optimized with a Huber loss, which, compared to the
              usual squared loss, is less sensitive to outliers and therefore more robust.
    Output: saves the trained xgb model to model_path
    '''

    X = X.drop(['name', 'date'], axis=1)
    # CV dataset split without shuffling (not sure if shuffling would be better)
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=test_size,
                                                      shuffle=False,
                                                      random_state=None)

    #find the best parameters
    best_score, best_param = search_best_param(X_train,
                                               y_train,
                                               X_val,
                                               y_val,
                                               param,
                                               param1,
                                               param2,
                                               estimator=estimator,
                                               score=score)

    #initialize and train on the full dataset
    best_xgbr = XGBRegressor(objective=huber_approx_obj, **best_param)
    best_xgbr.fit(X, y)

    # output found best_param and trained model
    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    try:
        best_xgbr.save_model(os.path.join(model_path, 'xgb_model.json'))
    except Exception as e:
        print(f"Error saving the model: {e}")

    return best_xgbr
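huber_approx_obj is defined elsewhere; a common pseudo-Huber objective for XGBoost's scikit-learn API looks roughly like this. A sketch only, not necessarily the author's version; delta (assumed here) controls the transition between quadratic and linear behaviour:

import numpy as np

def huber_approx_obj(y_true, y_pred, delta=1.0):
    # gradient and Hessian of the pseudo-Huber loss delta^2 * (sqrt(1 + (d/delta)^2) - 1)
    d = y_pred - y_true
    scale = 1.0 + (d / delta) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1.0 / (scale * scale_sqrt)
    return grad, hess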
Example #11
class XGBModel(GenericModel):
    def __init__(self, name, version=1, classifier=True, xgb_kwargs=None):
        super().__init__(name, version)
        self.xgb_kwargs = xgb_kwargs or {}
        if classifier:
            self.model = XGBClassifier(**self.xgb_kwargs)
        else:
            self.model = XGBRegressor(**self.xgb_kwargs)

    def train(self):
        print(
            'No custom train method implemented. Instead call self.model.fit(...)'
        )

    def save_model(self,
                   notes=None,
                   update_version=False,
                   config=None,
                   save_attributes=True):
        if update_version:
            self.version += 1

        try:
            model_path = self.model_dir / Path(f'v{self.version}.json')
            self.model.save_model(model_path.as_posix())
        except Exception as e:
            print('Error saving model')
            print(e)
            raise

        if save_attributes:
            self._save_attributes()

        if notes is not None:
            self._save_notes(notes)

        if config is not None:
            self._save_config(config)

    def load_model(self, version, load_attributes=True):
        # First load the xgb_kwargs so that we can create a new instance of XGB
        if load_attributes:
            self._load_attributes(self.attr_dir)
        if hasattr(self, 'xgb_kwargs'):
            self.model = type(self.model)(**self.xgb_kwargs)

        # Next load the model
        self.version = version
        model_path = self.model_dir / Path(f'v{self.version}.json')
        assert model_path.exists(
        ), f'No model exists at {model_path.as_posix()}'
        self.model.load_model(model_path)
Example #12
class XGBConfidenceIntervalBootstrap:
    def __init__(self,
                 n_regressors=100,
                 n_common_trees=0,
                 sample_rate=1.0,
                 **xgb_args):
        self.n_regressors = n_regressors
        self.n_common_trees = n_common_trees
        self.sample_rate = sample_rate
        self.xgb_args = xgb_args
        self.base_regressor = None
        self.regressors = []

    def fit(self, X, y):
        # gpu_args = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', 'gpu_id': 0, 'n_jobs': 16}
        if self.n_common_trees:
            base_regressor_args = {
                'objective': 'reg:squarederror',
                'n_estimators': self.n_common_trees
            }
            self.base_regressor = XGBRegressor(**base_regressor_args)
            self.base_regressor.fit(X, y, verbose=False)
            self.base_regressor.save_model('base.model')

        for i in tqdm(range(self.n_regressors)):
            regressor = XGBRegressor(**self.xgb_args)
            n_samples = int(len(X) * self.sample_rate)
            sample_indexes = np.random.choice(range(len(X)),
                                              n_samples,
                                              replace=True)
            train_args = {}
            if self.n_common_trees:
                train_args['xgb_model'] = 'base.model'
            regressor.fit(X[sample_indexes],
                          y[sample_indexes],
                          verbose=False,
                          **train_args)
            self.regressors.append(regressor)

    def predict(self, X):
        result = np.array([r.predict(X) for r in self.regressors])
        mean = result.mean(axis=0)
        lower = np.quantile(result, 0.05, axis=0)
        upper = np.quantile(result, 0.95, axis=0)
        return mean, lower, upper
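A minimal usage sketch on synthetic data; mean is the point estimate and (lower, upper) an empirical 90% interval from the bootstrap ensemble:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.1, size=500)

booster = XGBConfidenceIntervalBootstrap(n_regressors=20, n_estimators=50)
booster.fit(X, y)
mean, lower, upper = booster.predict(X[:5])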
Example #13
def main():
    course = 'VJx__VJx_2__3T2016'
    filename = 'model.xgb'
    X, y = load_data(course)

    # Normalization
    scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    model = XGBRegressor()
    if os.path.isfile(filename):
        model.load_model(filename)
    else:
        model.fit(X, y)
        model.save_model(filename)
    y_ = model.predict(X)
    print(y_)
Example #14
def main():

    print(MODEL_FILE)

    contest = str(233)
    directory = 'F:\\Numerai\\numerai' + contest + '\\'

    print("Loading data...")

    # The training data is used to train your model how to predict the targets.
    training_data = pd.read_csv(directory +
                                "numerai_training_data.csv").set_index("id")

    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = pd.read_csv(
        directory + "numerai_tournament_data.csv").set_index("id")

    #MODEL_FILE = directory + "example_model.xgb"

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)

    print("Training model...")
    model.fit(training_data[feature_names], training_data[TARGET_NAME])
    print("Training model... {MODEL_FILE}")
    model.save_model("F:\\Numerai\\numerai233\\example_model.xgb")
Example #15
class XGBModel(Model):
    def Build(self):
        self.model = XGBRegressor(max_depth=10, n_estimators=1000,
                                  objective='reg:squarederror',
                                  seed=config.random_state, nthread=12,
                                  tree_method='gpu_hist')

    def Load(self, fileName):
        self.Build()
        self.model.load_model(fileName + '.xgb')

    def Save(self, fileName):
        self.model.save_model(fileName + '.xgb')

    def Fit(self, X_trn, y_trn, X_tst, y_tst, plot=False):
        self.model.fit(X_trn, y_trn, eval_metric='rmse',
                       eval_set=[(X_trn, y_trn), (X_tst, y_tst)],
                       verbose=True, early_stopping_rounds=50)
        if plot: 
            results = self.model.evals_result()
            loss = results['validation_0']['rmse']
            val_loss = results['validation_1']['rmse']
            plot_loss(loss, val_loss)

    def Predict(self, X):
        return self.model.predict(X).reshape(-1,1)
Example #16
class HousePricePredictor(BaseModel):
    def __init__(self):
        self.model = XGBRegressor()

    def predict(self, X):
        X = self._prepare_data(X)
        return self.model.predict(X)

    def _prepare_data(self, X):
        return pd.DataFrame(X, columns=FEATURES)

    def fit(self, X, y):
        model = XGBRegressor()
        clf = GridSearchCV(
            model,
            {
                'max_depth': [6, ],
                'learning_rate': [0.05, ],
                'n_estimators': [450, 470, 475, 480, 485, ]
            },
            n_jobs=4,
            cv=3,
            verbose=1
        )
        clf.fit(X, y)
        logging.info("Best Score: {}".format(clf.best_score_))
        logging.info("Best Params: {}".format(clf.best_params_))
        self.model = clf.best_estimator_

        return self.model

    def dump(self, path):
        self.model.save_model(path)

    @classmethod
    def load(cls, path):
        house_model = HousePricePredictor()
        house_model.model.load_model(path)

        return house_model
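A save/load round trip with this class might look as follows (the path and the X, y inputs are illustrative); load works because save_model persists the learned booster, which the fresh XGBRegressor created in __init__ then restores:

predictor = HousePricePredictor()
predictor.fit(X, y)
predictor.dump('house_prices.model')  # hypothetical path

restored = HousePricePredictor.load('house_prices.model')
print(restored.predict(X[:5]))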
Example #17
def build_models(x_train, y_train, x_test, y_test, best_grida, best_gridb):
    root_folder = lib.features.STORAGE
    file_patha = os.path.join(root_folder, "modela.xgb")
    file_pathb = os.path.join(root_folder, "modelb.xgb")

    modela = XGBRegressor()
    modela.load_model(file_patha)
    y_preda = modela.predict(x_test)
    base_scorea = mean_absolute_error(y_test[:, 0], y_preda)

    modelb = XGBRegressor()
    modelb.load_model(file_pathb)
    y_predb = modelb.predict(x_test)
    base_scoreb = mean_absolute_error(y_test[:, 1], y_predb)

    modela = XGBRegressor(**best_grida)
    modela = modela.fit(x_train,
                        y_train[:, 0],
                        eval_set=[(x_test, y_test[:, 0])],
                        early_stopping_rounds=100,
                        verbose=False)
    y_preda = modela.predict(x_test)
    scorea = mean_absolute_error(y_test[:, 0], y_preda)
    print("score A : {} vs {}".format(scorea, base_scorea))
    if scorea <= base_scorea:
        modela.save_model(file_patha)
        print("model A saved !")

    modelb = XGBRegressor(**best_gridb)
    modelb = modelb.fit(x_train,
                        y_train[:, 1],
                        eval_set=[(x_test, y_test[:, 1])],
                        early_stopping_rounds=100,
                        verbose=False)
    y_predb = modelb.predict(x_test)
    scoreb = mean_absolute_error(y_test[:, 1], y_predb)
    print("score B : {} vs {}".format(scoreb, base_scoreb))
    if scoreb <= base_scoreb:
        modelb.save_model(file_pathb)
        print("model B saved !")
Example #18
y_test = proton_test

kfold = KFold(n_splits=5, shuffle=True, random_state=41)

model = XGBRegressor(learning_rate=0.01,
                     n_estimators=1000,
                     booster='gblinear',  # linear booster: the tree/GPU settings below are unused
                     colsample_bytree=0.8,
                     n_jobs=-1,
                     objective='reg:squaredlogerror',
                     gpu_id=0,
                     tree_method='gpu_hist').fit(x_train, y_train)

scores = cross_val_score(model, x_val, y_val, cv=kfold, verbose=2)

model.save_model("./AI_2020/task19/model")

y_predict = model.predict(x_test)

print("====================")
print(scores)
print(y_predict)

y_predict = pd.DataFrame(y_predict)
y_test = pd.DataFrame(y_test)

y_test = np.append(y_test, y_predict, axis=0)
y_test = pd.DataFrame(y_test[575136:])

y_test.to_csv('./AI_2020/task19/predict.csv', header=False, index=False)
Example #19
                                  (select_x_test, y_test)],
                        early_stopping_rounds=20)

    y_pred = selection_model.predict(select_x_test)
    r2 = r2_score(y_test, y_pred)

    print("Thresh=%.3f, n = %d, R2 : %.2f%%" %
          (thres, select_x_train.shape[1], r2 * 100.0))

    result = selection_model.evals_result()
    # print("eval's result : ", result)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # model.save_model("./model/xgb_save/boston_thresh=%.3f-r2=%.2f.model"%(thres, r2))
    model.save_model("./model/xgb_save/boston_rmse=%.3f-r2=%.2f.model" %
                     (rmse, r2))

# Thresh=0.003, n = 13, R2 : 93.54%
# Thresh=0.005, n = 12, R2 : 93.71%
# Thresh=0.006, n = 11, R2 : 93.69%
# Thresh=0.009, n = 10, R2 : 93.78%
# Thresh=0.012, n = 9, R2 : 94.11%
# Thresh=0.014, n = 8, R2 : 94.31%
# Thresh=0.015, n = 7, R2 : 93.76%
# Thresh=0.017, n = 6, R2 : 92.80%
# Thresh=0.017, n = 5, R2 : 93.63%
# Thresh=0.039, n = 4, R2 : 92.26%
# Thresh=0.045, n = 3, R2 : 89.30%
# Thresh=0.248, n = 2, R2 : 81.05%
# Thresh=0.569, n = 1, R2 : 69.21%
Example #20
model = XGBRegressor(n_estimators = 100, learning_rate = 0.05, n_jobs = -1) 

model.fit(x_train, y_train)

threshold = np.sort(model.feature_importances_)

for thres in threshold:
    selection = SelectFromModel(model, threshold = thres, prefit = True)
    
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = LGBMRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-1)

    selection_model.fit(select_x_train, y_train, verbose=False,
                        eval_metric=['logloss', 'rmse'],
                        eval_set=[(select_x_train, y_train), (select_x_test, y_test)],
                        early_stopping_rounds=20)

    y_pred = selection_model.predict(select_x_test)
    r2 = r2_score(y_test, y_pred)

    print("Thresh=%.3f, n = %d, R2 : %.2f%%" %(thres, select_x_train.shape[1], r2*100.0))

    # result = selection_model.evals_result()
    # print("eval's result : ", result)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # model.save_model("./model/xgb_save/boston_thresh=%.3f-r2=%.2f.model"%(thres, r2))
    model.save_model("./model/sample/boston/boston_rmse=%.3f-r2=%.2f.model"%(rmse, r2))
Example #21
model_input = train
print(model_input.head())

X, y = model_input.iloc[:, np.r_[6:24]], model_input.iloc[:, 3]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

xg_reg = XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=1000,
                      objective='reg:squarederror',  # 'reg:linear' is the deprecated alias
                      gamma=0, min_child_weight=1, max_delta_step=0, subsample=1,
                      colsample_bytree=1, colsample_bylevel=1, reg_alpha=0,
                      reg_lambda=1, scale_pos_weight=1, base_score=0.5)

xg_reg.fit(X_train, y_train)

plt.rcParams['figure.figsize'] = [50, 10]
xgb.plot_importance(xg_reg, max_num_features=3)
plt.show()
plt.rcParams['figure.figsize'] = [5, 5]

preds = xg_reg.predict(X_test)
predictions = np.reshape(preds, (preds.shape[0], 1))
plt.plot(y_test, predictions, 'ro')
plt.show()

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

modelName = name + '_XGB.model'
xg_reg.save_model(modelName)
Example #22
class Regressor:

    # Initializes train/test sets, the model, and error bookkeeping.
    # Add 'tree_method': 'gpu_hist' to params if you want xgboost to run on a GPU.
    def __init__(self,
                 params={
                     'objective': 'reg:squarederror',
                     'verbosity': 0
                 }):
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBRegressor(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        if isinstance(self.X_train, np.ndarray):
            return self.X_train.size
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        if isinstance(self.X_train, np.ndarray) and self.X_train.size > 0:
            self.X_train = self.X_train.tolist()
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train,
                       y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()
        if metric == 'error':
            validations = []
            for val in evals_result.values():
                lst = val.get("error")
                validations.append(sum(lst) / len(lst))
            return 1 - (sum(validations) / len(validations))
        else:
            validations = []
            for val in evals_result.values():
                lst = val.get(metric)
                validations.append(lst[-1])
            return validations

    # input test labels if you want to check accuracy
    def label(self, label):
        self.test_labels.append(label)

    def input_test(self, features):
        if isinstance(self.test, np.ndarray) and self.test.size > 0:
            self.test = self.test.tolist()
        self.test.append(features)

    # test data
    def predict(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate (you want error close to 0)
    def check_error(self):
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        self.test = []
        self.test_labels = []
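Typical usage of this wrapper feeds rows one at a time before training; a short sketch with made-up data:

reg = Regressor()
for row, target in [([1.0, 2.0], 3.0), ([2.0, 1.0], 2.5), ([0.5, 0.5], 1.0)]:
    reg.input_train(row, target)
reg.train()

reg.input_test([1.5, 1.5])
print(reg.predict())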
Example #23
          early_stopping_rounds=10)

aaa = model.score(x_test, y_test)
print('aaa :', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 :', r2)

print('====================================')
results = model.evals_result()
# print(results)

# Save the model
import pickle
# pickle.dump(model, open('../data/xgb_save/m39.pickle.dat', 'wb'))
# print('Saved')
import joblib
# joblib.dump(model, '../data/xgb_save/m39.joblib.dat')
model.save_model('../data/xgb_save/m39.xgb.model')

print('================ loading xgb model ====================')
# 불러오기
# model2 = pickle.load('../data/xgb_save/m39.pickle.dat', 'wb')
# model2 = joblib.load('../data/xgb_save/m39.joblib.dat')
model2 = XGBRegressor()
model2.load_model('../data/xgb_save/m39.xgb.model')
print('Loaded!')
r22 = model2.score(x_test, y_test)
print('r22 :', r22)
Example #24
from xgboost import XGBRegressor
import xgboost as xgb

# Data preprocessing
data = pd.read_csv('Lyangchi1.csv')
data2 = pd.read_csv('Lyangchi1.csv')
Frequency = data.pop('Frequency')
x = data2.pop('Data')

#print(data2)

# Regression prediction
reg = XGBRegressor()
reg.fit(data2, data)
#joblib.dump(reg,"train.m")
reg.save_model('lyangchi1.model')

#bst2 = xgb.Booster(model_file='001.model')
#fig,ax = plt.subplots()
#fig.set_size_inches(60,30)
#xgb.plot_tree(reg,ax=ax, num_trees=2, rankdir='LR')
#fig.savefig('xgb_tree22.jpg')
#plt.show()
#tar = xgb.Booster(model_file='001.model')
#dtest = xgb.DMatrix(data2)
#preds = tar.predict(dtest)
y_pred = reg.predict(data2)

#plt.scatter(Frequency, data, s=5, label='True dates')
#plt.plot(data2,y_pred, lw=2, color='g',alpha=0.2,label='Model')
#plt.title("L-Shenmen-Train")
Example #25
    selec_model = XGBRegressor()
    # selec_model = GridSearchCV(model,parameters,cv=3, n_jobs=n_jobs)
    selec_model.fit(selec_x_train, y_train)
    ''' Is there any difference between creating a new model here and reusing the model from before the loop? '''
    # print(thresh)

    selec_x_test = selection.transform(x_test)
    y_pred = selec_model.predict(selec_x_test)

    score = r2_score(y_test, y_pred)
    # print(f'select model score : {score}')
    # print(f"model.feature_importances_ : {model.feature_importances_}")

    if max <= score:
        selec_model.save_model(
            f'./model/xgb_save/model_{filename}_save_{selec_x_train.shape[1]}_{np.round(score*100,2)}.dat'
        )
        max = score
    # selec_model.save_model(f'./model/xgb_save/{__file__}_{np.round(thresh,2)}_{np.round(score*100,2)}.data')
    print(
        f"select model score : Thresh={np.round(thresh,2)} \t n={selec_x_train.shape[1]} \t r2={np.round(score*100,2)}"
    )

# Mail subject: so-and-so, **th place

# model.fit(x_train,y_train, verbose=True, eval_metric='error',eval_set=[(x_train, y_train), (x_test, y_test)])
# model.fit(x_train,y_train, verbose=True, eval_metric=['rmse','logloss'],eval_set=[(x_train, y_train), (x_test, y_test)],
#                             early_stopping_rounds=500)

# selec_model.fit(x_train,y_train, verbose=True, eval_metric=['rmse','logloss'],eval_set=[(x_train, y_train), (x_test, y_test)],
#                             early_stopping_rounds=500)
Example #26
thresholds = np.sort(model.feature_importances_)
print(thresholds)


for thresh in thresholds: 
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_pred = selection_model.predict(select_x_test)
    
    score = r2_score(y_test, y_pred)

    print("thresh=%.3f, n = %d, R2 : %2.f%%" %(thresh, select_x_train.shape[1], score*100.0))

model.save_model('./model/xgb_save/boston_rmse')
print("저장 됬다.")

model2=XGBRegressor()
model2.load_model('./model/xgb_save/boston_rmse')
print("불러왔다.")


y_pred = model2.predict(x_test)
score = r2_score(y_test, y_pred)  # y_true comes first

print("score : ", score)
Example #27
selection = SelectFromModel(xgb, threshold=thresholds[idx_max], prefit=True)
selection_x_train = selection.transform(x_train)
selection_x_test = selection.transform(x_test)
# 2) Build the model
selection_model = XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1)

# 3) Train
selection_model.fit(selection_x_train,
                    y_train,
                    verbose=False,
                    eval_metric=["logloss", "rmse"],
                    eval_set=[(selection_x_train, y_train),
                              (selection_x_test, y_test)],
                    early_stopping_rounds=20)
path = f"./model/xgb_save/{__file__[-24:-3]}-idx{idx_max}.dat"
selection_model.save_model(path)
'''
r2    0.9328556062354909
score 0.9328556062354909
idx   0
r2    0.9328556062354909
idx   1
r2    0.932501384781691
idx   2
'''
Example #28
    'max_depth': [100, 150, 200, 250, 300, 350, 400, 450],
    #'min_child_weight': [6, 7, 8],
    # 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
}
optimized_GBM = GridSearchCV(estimator=model,
                             param_grid=cv_params,
                             scoring='r2',
                             cv=3,
                             verbose=1,
                             n_jobs=-1)
optimized_GBM.fit(x_train, y_train)
evaluate_result = optimized_GBM.cv_results_
print('Per-round CV results: {0}'.format(evaluate_result))
print('Best parameter values: {0}'.format(optimized_GBM.best_params_))
print('Best model score: {0}'.format(optimized_GBM.best_score_))
best_model = optimized_GBM.best_estimator_
best_model.save_model('xgboost.model')
y_pred = best_model.predict(x_test)

# Lanzhou   Chevrolet  2.4L  2016  8.9  0  20.61  automatic  8.1
# Shanghai  Ford       1.8L  2009  5    0  14.64  automatic  3.2
# Feature rows to predict; the true values for the examples below are [8.1, 3.2]
# t = ['兰州', '雪佛兰',	'2.4L',	2016,	8.9, 0, 20.61, 0]
# city_id = list(city_le.classes_).index('兰州')
# band_id = list(brand_le.classes_).index('雪佛兰')
# t = [city_id, band_id, 2.4, 2016, 8.9, 0, 20.61, 0]
t = ['上海', '福特', '1.8', 2016, 8.9, 0, 20.61, 0]
city_id = list(city_le.classes_).index('上海')
band_id = list(brand_le.classes_).index('福特')
t = [city_id, band_id, 1.8, 2009, 5, 0, 14.64, 0]

tu = tuple(t)
Example #29
    search = GridSearchCV(XGBRegressor(), parameter, cv=5, n_jobs=-1)

    select_x_train = selection.transform(x_train)

    search.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    x_pred = search.predict(select_x_test)

    score = r2_score(y_test, x_pred)
    # print('R2는',score)

    print("Thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, select_x_train.shape[1], score * 100.0))
    model.save_model('./model/xgb_save/m34sfm/cancer.xgb' + str(thresh) +
                     '.model')
'''
[0.00497141 0.00802845 0.00874821 0.00903318 0.00930241 0.01546535
 0.02015997 0.02073635 0.02245208 0.03186943 0.03302769 0.15981925
 0.65638626]
Thresh=0.005, n=13, R2: 90.59%
Thresh=0.008, n=12, R2: 90.09%
Thresh=0.009, n=11, R2: 90.93%
Thresh=0.009, n=10, R2: 90.57%
Thresh=0.009, n=9, R2: 91.49%
Thresh=0.015, n=8, R2: 92.46%
Thresh=0.020, n=7, R2: 92.93%
Thresh=0.021, n=6, R2: 90.61%
Thresh=0.032, n=4, R2: 88.70%
Thresh=0.033, n=3, R2: 87.77%
Thresh=0.160, n=2, R2: 70.24%
Example #30
def main():
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if MODEL_FILE.is_file():
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)
    else:
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)

    # Generate predictions on both training and tournament data
    print("Generating predictions...")
    training_data[PREDICTION_NAME] = model.predict(
        training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(
        tournament_data[feature_names])

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )
    """Validation Metrics"""
    # Check the per-era correlations on the validation set (out of sample)
    validation_data = tournament_data[tournament_data.data_type ==
                                      "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std(ddof=0)}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Check the "sharpe" ratio on the validation set
    validation_sharpe = validation_correlations.mean(
    ) / validation_correlations.std(ddof=0)
    print(f"Validation Sharpe: {validation_sharpe}")

    print("checking max drawdown...")
    rolling_max = (validation_correlations + 1).cumprod().rolling(
        window=100, min_periods=1).max()
    daily_value = (validation_correlations + 1).cumprod()
    max_drawdown = -(rolling_max - daily_value).max()
    print(f"max drawdown: {max_drawdown}")

    # Check the feature exposure of your validation predictions
    feature_exposures = validation_data[feature_names].apply(
        lambda d: correlation(validation_data[PREDICTION_NAME], d), axis=0)
    max_per_era = validation_data.groupby("era").apply(
        lambda d: d[feature_names].corrwith(d[PREDICTION_NAME]).abs().max())
    max_feature_exposure = max_per_era.mean()
    print(f"Max Feature Exposure: {max_feature_exposure}")

    # Check feature neutral mean
    print("Calculating feature neutral mean...")
    feature_neutral_mean = get_feature_neutral_mean(validation_data)
    print(f"Feature Neutral Mean is {feature_neutral_mean}")

    # Load example preds to get MMC metrics
    example_preds = pd.read_csv("example_predictions.csv").set_index(
        "id")["prediction"]
    validation_example_preds = example_preds.loc[validation_data.index]
    validation_data["ExamplePreds"] = validation_example_preds

    print("calculating MMC stats...")
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, x in validation_data.groupby("era"):
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                   pd.Series(unif(x["ExamplePreds"])))
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29**2))
        corr_scores.append(
            correlation(unif(x[PREDICTION_NAME]), x[TARGET_NAME]))

    val_mmc_mean = np.mean(mmc_scores)
    val_mmc_std = np.std(mmc_scores)
    val_mmc_sharpe = val_mmc_mean / val_mmc_std
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_mean = np.mean(corr_plus_mmcs)
    corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe

    print(f"MMC Mean: {val_mmc_mean}\n"
          f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n"
          f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}")

    # Check correlation with example predictions
    full_df = pd.concat([
        validation_example_preds, validation_data[PREDICTION_NAME],
        validation_data["era"]
    ],
                        axis=1)
    full_df.columns = ["example_preds", "prediction", "era"]
    per_era_corrs = full_df.groupby('era').apply(
        lambda d: correlation(unif(d["prediction"]), unif(d["example_preds"])))
    corr_with_example_preds = per_era_corrs.mean()
    print(f"Corr with example preds: {corr_with_example_preds}")

    # Save predictions as a CSV and upload to https://numer.ai
    tournament_data[PREDICTION_NAME].to_csv("submission.csv", header=True)
Example #31
import pickle

data = pd.read_csv('dataset/car data.csv')

# The column car name doesn't seem to add much value to our analysis and hence dropping the column
data = data.drop('Car_Name', axis=1)

# It's important to know how many years old the car is.
data['Car_age'] = 2020-data['Year']
data.drop('Year', axis=1, inplace=True)

fuel = pd.get_dummies(data['Fuel_Type'])
transmission = pd.get_dummies(data['Transmission'])
seller = pd.get_dummies(data['Seller_Type'])

data.drop(['Fuel_Type', 'Transmission', 'Seller_Type'], axis=1, inplace=True)

data_final = pd.concat([data, fuel, transmission, seller], axis=1)


X = data_final.iloc[:, 1:]
y = data_final.iloc[:, 0]


model = XGBRegressor()
model.fit(X.values, y.values)

model.save_model('model.pkl')  # writes XGBoost's own format despite the .pkl extension

# pickle.dump(model, open('model.pkl', 'wb'))
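Despite the .pkl extension, save_model writes XGBoost's own serialization format rather than a pickle (the commented pickle.dump line is what would actually produce one), so reloading goes through load_model:

restored = XGBRegressor()
restored.load_model('model.pkl')
print(restored.predict(X.values[:5]))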