Example No. 1
def train(array, embedDim, interval):
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {
        'C': uniform(1, 99),
        'gamma': uniform(0.01, 0.29),
        'kernel': ['rbf', 'poly']
    }
    bestModels = []
    for i in range(len(yTrain[0])):
        svr = svm.SVR()
        clf = grid_search.RandomizedSearchCV(svr,
                                             param_distributions=params,
                                             n_iter=30,
                                             cv=kfold,
                                             scoring='mean_squared_error',
                                             n_jobs=1,
                                             verbose=0)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(1, 12):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval,
                                         i)  # the prediction horizon grows each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, i)  # the prediction input grows with it
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack(
            (array, np.array(copy(subyPredict))))  # feed this model's predictions back in as known data for the next model
    yPredict = array[0, -65:-5]  # 66 days can be predicted in total; take the matching slice
    return yPredict
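The rolling-forecast examples that call pp.makeTrainset / pp.makeXPredict (Nos. 1, 3, 4 and 8) assume module-level imports such as numpy as np, copy from the copy module, randint and uniform from scipy.stats, and the pre-0.18 sklearn modules cross_validation, grid_search and svm, plus a preprocessing module pp that is never shown. The sketch below is only a guess at what those helpers might do, assuming a simple delay-embedding of a 1-row series with embedding dimension embedDim, lag interval and prediction horizon distance; the real pp module may differ.

# Hypothetical sketch of the unseen pp helpers -- not the original implementation.
import numpy as np

def makeTrainset(array, embedDim, interval, distance):
    """Delay-embed a 1 x N series: each row of XTrain holds embedDim past values
    spaced `interval` apart, and the matching yTrain row holds the next `distance` values."""
    series = np.asarray(array, dtype=float).ravel()
    span = (embedDim - 1) * interval
    X, y = [], []
    for start in range(len(series) - span - distance):
        X.append([series[start + k * interval] for k in range(embedDim)])
        y.append(series[start + span + 1:start + span + 1 + distance])
    return np.array(X), np.array(y)

def makeXPredict(array, embedDim, interval, distance):
    """Single input row whose (unknown) targets are the next `distance` values."""
    series = np.asarray(array, dtype=float).ravel()
    span = (embedDim - 1) * interval
    start = len(series) - 1 - span
    return np.array([[series[start + k * interval] for k in range(embedDim)]])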
Example No. 2
def lr_model(all_file, num=200, debug=True):
    if debug:
        all_data = pd.read_csv(all_file, nrows=500, encoding='gb18030')
    else:
        all_data = pd.read_csv(all_file, encoding='gb18030')
    train_data = all_data[all_data['tag'] == 1]
    feature_data = train_data.drop(['Idx', 'ListingInfo', 'target', 'tag'],
                                   axis=1)
    feature_data.fillna(-1, inplace=True)
    # labels = train_data['target']
    feature_importance = pd.read_csv(features_importance_file)
    feature_importance_columns = feature_importance['feature'].tolist()
    feature_importance_columns = feature_importance_columns[:num]
    final_train_data = feature_data[feature_importance_columns]
    # final_train_data = feature_data
    print(final_train_data.shape)
    labels = train_data['target']
    clf = LogisticRegression()
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    }
    model = grid_search.RandomizedSearchCV(estimator=clf,
                                           param_distributions=param_grid,
                                           n_jobs=1,
                                           cv=2,
                                           verbose=0,
                                           n_iter=5,
                                           scoring=AUC)
    model.fit(final_train_data, labels)
    report(model.grid_scores_)
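Example No. 2 also relies on names defined elsewhere in its module: features_importance_file, an AUC scoring object and a report helper. Below is a hedged sketch of the last two, assuming AUC is a plain ROC-AUC scorer and report follows the usual pattern for the old grid_scores_ attribute (each entry exposes parameters, mean_validation_score and cv_validation_scores).

# Assumed definitions of AUC and report -- the original module is not shown.
import numpy as np
from sklearn.metrics import make_scorer, roc_auc_score

AUC = make_scorer(roc_auc_score, needs_threshold=True)

def report(grid_scores, n_top=3):
    """Print the n_top parameter settings ranked by mean cross-validation score."""
    ranked = sorted(grid_scores, key=lambda s: s.mean_validation_score, reverse=True)
    for rank, score in enumerate(ranked[:n_top], start=1):
        print("Rank %d: mean %.4f (std %.4f) for %r" % (
            rank, score.mean_validation_score,
            np.std(score.cv_validation_scores), score.parameters))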
Example No. 3
def train(array, embedDim, interval):
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    params = {
        "n_estimators": randint(5, 100),
        "max_depth": [1, 2, 3, 5, 8, 10, None],
        "max_features": randint(1, len(XTrain[0])),
        "min_samples_split": randint(1, 3),
        "min_samples_leaf": randint(1, 3)
    }
    bestModels = []
    for i in range(len(yTrain[0])):
        erf = ExtraTreesRegressor()
        clf = grid_search.RandomizedSearchCV(erf,
                                             param_distributions=params,
                                             n_iter=10,
                                             scoring='mean_squared_error',
                                             cv=kfold,
                                             n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(60):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval,
                                         1)  # the embedding dimension grows each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, 1)  # the prediction input's embedding grows with it
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack(
            (array, np.array(copy(subyPredict))))  # feed this model's predictions back in as known data for the next model
        embedDim += 1
    yPredict = array[0, -60:]  # 60 days can be predicted in total; take the matching slice
    return yPredict
Example No. 4
def train(array, embedDim, interval):
    distance = 7
    for i in range(9):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval,
                                         distance)  # prediction horizon of the model
        XPredict = pp.makeXPredict(array, embedDim, interval, distance)
        params = {
            'C': uniform(1, 99),
            'gamma': uniform(0.01, 0.29),
            'kernel': ['rbf', 'poly']
        }
        kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
        subyPredict = []
        for j in range(len(yTrain[0])):
            svr = svm.SVR()
            clf = grid_search.RandomizedSearchCV(svr,
                                                 param_distributions=params,
                                                 n_iter=10,
                                                 cv=kfold,
                                                 scoring='mean_squared_error',
                                                 n_jobs=1,
                                                 verbose=0)
            clf.fit(XTrain, yTrain[:, j])
            subyPredict.append(clf.predict(XPredict))
        array = np.hstack(
            (array, np.array(copy(subyPredict))))  # feed this model's predictions back in as known data for the next model
        embedDim += distance
    yPredict = array[0, -62:-2]  # 63 days can be predicted in total; take the matching slice
    return yPredict
Example No. 5
    def test_grid_search(self):
        def scorer(network, X, y):
            result = network.predict(X)
            return rmsle(result[:, 0], y)

        dataset = datasets.load_diabetes()
        x_train, x_test, y_train, y_test = train_test_split(dataset.data,
                                                            dataset.target,
                                                            train_size=0.7)

        grnnet = algorithms.GRNN(std=0.5, verbose=False)
        grnnet.train(x_train, y_train)
        error = scorer(grnnet, x_test, y_test)

        self.assertAlmostEqual(0.513, error, places=3)

        random_search = grid_search.RandomizedSearchCV(
            grnnet,
            param_distributions={'std': np.arange(1e-2, 0.1, 1e-4)},
            n_iter=10,
            scoring=scorer)
        random_search.fit(dataset.data, dataset.target)
        scores = random_search.grid_scores_

        best_score = min(scores, key=itemgetter(1))
        self.assertAlmostEqual(0.4303, best_score[1], places=3)
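The rmsle used by the scorer above is not shown in the test; a plausible definition, assuming the usual root mean squared logarithmic error, is:

# Assumed definition of rmsle (root mean squared logarithmic error).
import numpy as np

def rmsle(predicted, actual):
    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(actual)) ** 2))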
Example No. 6
def train(XTrain, yTrain, XPredict):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {
        'n_estimators': randint(50, 150),
        'max_depth': randint(1, 4),
        'learning_rate': uniform(0.01, 0.19),
        'min_child_weight': [1],
        'max_delta_step': randint(0, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'colsample_bylevel': uniform(0.5, 0.5),
        'scale_pos_weight': [0],
        'gamma': uniform(1, 6)
    }
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    xgbr = xgb.XGBRegressor()
    clf = grid_search.RandomizedSearchCV(xgbr,
                                         param_distributions=params,
                                         n_iter=5,
                                         n_jobs=1,
                                         scoring='mean_squared_error',
                                         cv=kfold,
                                         verbose=0)
    yPredict = []
    for i in range(yTrain.shape[1]):
        clf.fit(XTrain, yTrain[:, i])  # train one model per step of the horizon (distance models in total)
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
Example No. 7
def train(XTrain, yTrain, XPredict):
    params = {
        'n_estimators': randint(20, 200),
        'max_depth': randint(1, 4),
        'learning_rate': uniform(0.01, 0.19),
        'min_child_weight': [1],
        'max_delta_step': randint(0, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'colsample_bylevel': uniform(0.5, 0.5),
        'scale_pos_weight': [0],
        'gamma': uniform(1, 10)
    }
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    xgbr = xgb.XGBRegressor()
    clf = grid_search.RandomizedSearchCV(xgbr,
                                         param_distributions=params,
                                         n_iter=50,
                                         n_jobs=1,
                                         scoring='mean_absolute_error',
                                         cv=kfold,
                                         verbose=0)
    clf.fit(XTrain, yTrain)  # train the model in a single pass
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict
Example No. 8
def train(array, embedDim, interval):
    distance = 7
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, distance)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    bestModels = []
    for i in range(len(yTrain[0])):
        gbrt = GradientBoostingRegressor()
        clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params, n_iter=30,
                                             scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(9):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, distance)  # the embedding dimension grows each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, distance)  # the prediction input's embedding grows with it
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack((array, np.array(copy(subyPredict))))  # feed this model's predictions back in as known data for the next model
        embedDim += distance
    yPredict = array[0, -62:-2]  # 63 days can be predicted in total; take the matching slice
    return yPredict
Example No. 9
def train(XTrain, yTrain, XPredict):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {
        'n_estimators': randint(50, 150),
        'loss': ['ls', 'lad', 'huber'],
        'learning_rate': uniform(0.01, 0.19),
        'subsample': uniform(0.5, 0.5),
        'max_depth': randint(1, 4),
        'min_samples_split': randint(1, 4),
        'min_samples_leaf': randint(1, 4),
        'max_features': randint(1, len(XTrain[0]))
    }
    gbrt = GradientBoostingRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    clf = grid_search.RandomizedSearchCV(gbrt,
                                         param_distributions=params,
                                         n_iter=5,
                                         scoring='mean_squared_error',
                                         cv=kfold,
                                         n_jobs=-1)
    yPredict = []
    for i in range(yTrain.shape[1]):
        clf.fit(XTrain, yTrain[:, i])  # train one model per step of the horizon (distance models in total)
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
Example No. 10
def train(XTrain, yTrain, testsize):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {'C': uniform(1, 99),
              'gamma': uniform(0.01, 0.29),
              'kernel': ['rbf', 'poly']}
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    models = []
    for i in range(len(yTrain[0])):
        svr = svm.SVR()
        clf = grid_search.RandomizedSearchCV(svr, param_distributions=params, n_iter=30, cv=kfold,
                                             scoring='mean_squared_error', n_jobs=-1, verbose=1)
        clf.fit(transArray(XTrain), yTrain[:, i])
        models.append(clf.best_estimator_)
    yPredict = []
    XPredict = copy(XTrain[-1])
    for i in range(testsize):
        XPredict = np.delete(XPredict, 0, axis=0)
        XPredict = np.insert(XPredict, len(XPredict), yTrain[-1], axis=0)
        subyPredict = np.array([])
        for j in range(len(models)):
            models[j].fit(transArray(XTrain), yTrain[:, j])  # retrain each model on the updated window
            newPredict = models[j].predict([transRow(XPredict)])
            subyPredict = np.hstack((subyPredict, newPredict))
        XTrain = np.delete(XTrain, 0, axis=0)
        XTrain = np.insert(XTrain, len(XTrain), copy(XPredict), axis=0)
        yTrain = np.delete(yTrain, 0, axis=0)
        yTrain = np.insert(yTrain, len(yTrain), copy(subyPredict), axis=0)
        yPredict.append(copy(subyPredict[0]))
    return np.array(yPredict)
Example No. 11
def randomized_grid_search_linear(datamat, classvect, C=4.6, n=20, n_jobs=1):
    """Retun best parameters from randomized grid search"""
    clf = svm.LinearSVC(class_weight='auto')
    param_dist = {'C': scipy.stats.expon(scale=C)}
    # run randomized search
    random_search = grid_search.RandomizedSearchCV(
        clf, param_distributions=param_dist, n_iter=n, n_jobs=n_jobs)
    random_search.fit(datamat, classvect)
    return random_search
Example No. 12
def fit(train, target):

    # set up pipeline
    est = pipeline.Pipeline([
        ('xgb', xgb.XGBRegressor(silent=True)),
    ])

    # create param grid for grid search
    params = {
        'xgb__learning_rate': [
            0.05,
            0.1,
            0.3,
        ],
        'xgb__min_child_weight': [
            1,
            2,
        ],
        'xgb__subsample': [
            1,
        ],
        'xgb__colsample_bytree': [
            1,
        ],
        'xgb__max_depth': [
            15,
            20,
        ],
        'xgb__n_estimators': [
            1000,
        ],
    }

    # set up scoring mechanism
    gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True)

    # initialize gridsearch
    gridsearch = grid_search.RandomizedSearchCV(
        estimator=est,
        param_distributions=params,
        scoring=gini_scorer,
        verbose=10,
        n_jobs=-1,
        cv=5,
        n_iter=3,
    )

    # fit gridsearch
    gridsearch.fit(train, target)
    print('Best score: %.3f' % gridsearch.best_score_)
    print('Best params:')
    for k, v in sorted(gridsearch.best_params_.items()):
        print("\t%s: %r" % (k, v))

    # get best estimator
    return gridsearch.best_estimator_
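Example No. 12 scores with make_scorer(normalized_gini, greater_is_better=True) but does not show the metric. A common definition, assumed here to be what the author imports, is the Kaggle-style normalized Gini coefficient:

# Assumed implementation of normalized_gini (Kaggle-style normalized Gini coefficient).
import numpy as np

def gini(actual, pred):
    actual = np.asarray(actual, dtype=float)
    order = np.argsort(-np.asarray(pred, dtype=float), kind='mergesort')  # sort by prediction, descending
    cum_share = np.cumsum(actual[order]) / actual.sum()                   # cumulative share of the target
    n = len(actual)
    return cum_share.sum() / n - (n + 1.0) / (2.0 * n)

def normalized_gini(actual, pred):
    """Gini of the predictions divided by the Gini of a perfect ordering."""
    return gini(actual, pred) / gini(actual, actual)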
Example No. 13
def train(XTrain, yTrain, XPredict):
    params = {'n_estimators': randint(1, 100)}
    kfold = cross_validation.KFold(len(XTrain), n_folds=3)
    svr = svm.SVR(kernel='rbf', C=50, gamma=0.1)
    baggingsvr = ensemble.BaggingRegressor(svr)
    clf = grid_search.RandomizedSearchCV(baggingsvr, param_distributions=params, n_iter=10,
                                         scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    clf.fit(XTrain, yTrain)  # train the model in a single pass
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict, clf.best_params_
Example No. 14
def fitAlgo(clf, Xtrain, Ytrain, opt = False, param_dict = None, opt_metric = 'roc_auc', n_iter = 5, n_optFolds = 3):
    '''Return the fitted classifier
    Keyword arguments:
    clf - - base classifier
    Xtrain - - training feature matrix
    Ytrain - - training target array
    param_dict - - parameter distributions (or grid) for the search; if opt == False, every element should have length 1
    opt_metric - - optimization metric
    opt - - whether to do optimization or not
    '''

    if opt and (param_dict is not None):
        assert all(isinstance(param_dict[x], list) for x in param_dict)
        prod_feature_05 = np.prod([math.pow(len(v), 0.5) for x, v in param_dict.items()])
        prod_feature = np.prod([len(v) for x, v in param_dict.items()])
        N_iter = int(np.ceil(prod_feature_05* n_iter / 5 * 1.5))
        N_iter = N_iter if N_iter < prod_feature else prod_feature
        print("Using N_iter = " + str(N_iter))
        if n_iter != 0:
            rs = gd.RandomizedSearchCV(estimator = clf, n_iter = N_iter,
                                    param_distributions = param_dict,
                                    scoring = opt_metric,
                                    refit = True,
                                    n_jobs=-1, cv = n_optFolds, verbose = 1)
        else:
            rs = gd.GridSearchCV(estimator = clf,
                                    param_grid = param_dict,
                                    scoring = opt_metric,
                                    refit = True,
                                    n_jobs=-1, cv = n_optFolds, verbose = 1)
        print("Simulation with num_features=", num_features)
        print("max_features=")
        print(param_dict)
        rs.fit(Xtrain, Ytrain)
        print("\n### Optimal parameters: ###")
        pprint(rs.best_params_)
        print("####################### \n")

        imp = []
        if clf.__class__.__name__ == "RandomForestClassifier":
            imp = rs.best_estimator_.feature_importances_
        return rs.best_estimator_, rs.grid_scores_, imp
    else:
        if param_dict is not None:
            assert all(not isinstance(param_dict[x], list) for x in param_dict)
            for k in param_dict.keys():
                # print k
                # print opt
                # print param_dict
                clf.set_params(**{k: param_dict[k]})
        clf.fit(Xtrain, Ytrain)
        return clf, [], []
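A short usage sketch for fitAlgo follows; everything below (the dataset, the grid and the module-level num_features that fitAlgo prints) is illustrative, and the module is assumed to import sklearn.grid_search as gd plus numpy, math and pprint.

# Hypothetical usage of fitAlgo -- names below are illustrative, not from the original project.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

num_features = 20  # module-level name referenced inside fitAlgo
Xtrain, Ytrain = make_classification(n_samples=300, n_features=num_features, random_state=0)
param_dict = {"n_estimators": [50, 100, 200], "max_depth": [3, 5, None]}

best_clf, cv_scores, importances = fitAlgo(
    RandomForestClassifier(), Xtrain, Ytrain,
    opt=True, param_dict=param_dict, opt_metric='roc_auc', n_iter=5, n_optFolds=3)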
Example No. 15
    def test_example_randomized_search(self):
        # The classic example from the sklearn documentation
        iris = datasets.load_iris()
        parameters = {'kernel': ('linear', 'rbf'), 'C': range(1, 10)}
        svr = svm.SVC()
        clf = grid_search.RandomizedSearchCV(svr, parameters, random_state=4)
        clf.fit(iris.data, iris.target)

        clf2 = RandomizedSearchCV(self.sc, svr, parameters, random_state=4)
        clf2.fit(iris.data, iris.target)

        b1 = clf.estimator
        b2 = clf2.estimator
        self.assertEqual(b1.get_params(), b2.get_params())
Example No. 16
def randomized_grid_search_rbf(datamat,
                               classvect,
                               C=4.6,
                               gamma=0.01,
                               n=20,
                               n_jobs=1):
    """Retun best parameters from randomized grid search"""
    clf = svm.SVC(kernel='rbf', cache_size=4000, class_weight='auto')
    param_dist = {
        'C': scipy.stats.expon(scale=C),
        'gamma': scipy.stats.expon(scale=gamma)
    }
    # run randomized search
    random_search = grid_search.RandomizedSearchCV(
        clf, param_distributions=param_dist, n_iter=n, n_jobs=n_jobs)
    random_search.fit(datamat, classvect)
    return random_search
Example No. 17
def train(XTrain, yTrain, testsize):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {
        'n_estimators': randint(50, 150),
        'max_depth': randint(1, 4),
        'learning_rate': uniform(0.01, 0.19),
        'min_child_weight': [1],
        'max_delta_step': randint(0, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'colsample_bylevel': uniform(0.5, 0.5),
        'scale_pos_weight': [0],
        'gamma': uniform(1, 6)
    }
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    models = []
    for i in range(len(yTrain[0])):
        xgbr = xgb.XGBRegressor()
        clf = grid_search.RandomizedSearchCV(xgbr,
                                             param_distributions=params,
                                             n_iter=10,
                                             n_jobs=1,
                                             scoring='mean_squared_error',
                                             cv=kfold,
                                             verbose=0)
        clf.fit(transArray(XTrain), yTrain[:, i])
        models.append(clf.best_estimator_)
    yPredict = []
    XPredict = copy(XTrain[-1])
    for i in range(testsize):
        XPredict = np.delete(XPredict, 0, axis=0)
        XPredict = np.insert(XPredict, len(XPredict), yTrain[-1], axis=0)
        subyPredict = np.array([])
        XTrainTrans = transArray(XTrain)
        XPredictTrans = transRow(XPredict)
        for j in range(len(models)):
            models[j].fit(XTrainTrans, yTrain[:, j])  # retrain each model on the updated window
            newPredict = models[j].predict([XPredictTrans])
            subyPredict = np.hstack((subyPredict, newPredict))
        XTrain = np.delete(XTrain, 0, axis=0)
        XTrain = np.insert(XTrain, len(XTrain), copy(XPredict), axis=0)
        yTrain = np.delete(yTrain, 0, axis=0)
        yTrain = np.insert(yTrain, len(yTrain), copy(subyPredict), axis=0)
        yPredict.append(copy(subyPredict[0]))
    return np.array(yPredict)
Example No. 18
def classifier_comparison(X, y):
    """
    Classifier comparison

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    import scipy

    # Exhaustive Grid Search
    exhaustive_parameters = {'kernel':['rbf'], 'C':[1, 10, 100, 1000], 'gamma':[1e-3, 1e-4]}
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)
    # Randomized Parameter Optimization
    randomized_parameter = {'kernel':['rbf'], 'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1)}
    clf_SVC_randomized = grid_search.RandomizedSearchCV(SVC(), randomized_parameter)

    names = ["Linear SVM", "RBF SVM",
             "RBF SVM with Grid Search", "RBF SVM with Random Grid Search", 
             "Decision Tree", "Random Forest", 
             "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    classifiers = [
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        clf_SVC_exhaustive,
        clf_SVC_randomized,
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()]

    for name, clf in zip(names, classifiers):
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)
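classifier_comparison expects a module-level logger and a train_classifier helper that are not shown; a minimal, assumed scaffold plus a sample call might look like this:

# Hypothetical scaffolding for classifier_comparison -- logger and train_classifier
# are defined elsewhere in the original project.
import logging
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def train_classifier(clf, X, y):
    """Fit/evaluate one classifier with 5-fold cross-validation and log the mean accuracy."""
    scores = cross_val_score(clf, X, y, cv=5)
    logger.info('mean CV accuracy: %.3f', scores.mean())

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
classifier_comparison(X, y)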
Example No. 19
def train(XTrain, yTrain, XPredict):
    params = {
        'C': uniform(1, 999),
        'gamma': uniform(0.01, 0.29),
        'kernel': ['rbf', 'poly']
    }
    kfold = cross_validation.KFold(len(XTrain), n_folds=3, shuffle=False)
    svr = svm.SVR()
    clf = grid_search.RandomizedSearchCV(svr,
                                         param_distributions=params,
                                         n_iter=20,
                                         cv=kfold,
                                         scoring='mean_squared_error',
                                         n_jobs=-1)
    clf.fit(XTrain, yTrain)  # train the model in a single pass
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict, clf.best_params_
Example No. 20
def find_best_estimator(base_estimator,
                        X,
                        y,
                        cfg,
                        section,
                        grid_search_params_key,
                        random_search=True,
                        scoring="accuracy",
                        verbosity=3):
    # grid_search_params_key : key under the indicated section of the
    # configuration YML file containing the grid search parameters
    cv_nfold = cfg[section]["cv_nfold"]
    name = type(base_estimator).__name__
    n_iter = cfg[section]["n_iters"]
    n_jobs = cfg[section]["n_jobs"]
    param_dist = cfg[section][grid_search_params_key]
    random_state = cfg["common"]["seed"]
    logger.info("Finding the best %s based on %s score" % (name, scoring))
    if random_search == cfg[section]["use_random_search"]:
        logger.info("Using random search to find the best %s" % name)
        search = grid_search.RandomizedSearchCV(estimator=base_estimator,
                                                param_distributions=param_dist,
                                                n_iter=n_iter,
                                                n_jobs=n_jobs,
                                                cv=cv_nfold,
                                                random_state=random_state,
                                                scoring=scoring,
                                                verbose=verbosity)
    else:
        logger.info("Using grid search to find the best %s" % name)
        search = grid_search.GridSearchCV(estimator=base_estimator,
                                          param_grid=param_dist,
                                          n_jobs=n_jobs,
                                          cv=cv_nfold,
                                          scoring=scoring,
                                          verbose=verbosity)

    logger.info(search)
    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s." %
                ((time.time() - start), name))
    report_grid_search_scores(search.grid_scores_, n_top=3)
    return search.best_estimator_
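The cfg passed to find_best_estimator is loaded from a YAML file elsewhere; the dictionary below sketches the shape the function reads, with the section and parameter-key names being illustrative only.

# Illustrative cfg structure -- only the keys read by find_best_estimator matter here.
from scipy.stats import randint

cfg = {
    "common": {"seed": 42},
    "random_forest": {                  # passed as `section`
        "cv_nfold": 5,
        "n_iters": 20,
        "n_jobs": -1,
        "use_random_search": True,
        "rf_params": {                  # passed as `grid_search_params_key`
            "n_estimators": randint(50, 300),
            "max_depth": [3, 5, 10, None],
        },
    },
}

With random_search=True this matches use_random_search, so the randomized branch runs; the GridSearchCV branch would instead need plain lists (not scipy distributions) under the parameter key.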
Example No. 21
    def grid_search(self, col2fit, **kwargs):
        """Using grid search to find the best parameters."""
        n_jobs = kwargs.get('n_jobs', 1)

        # use a full grid over all parameters
        parameters = {
            "max_depth": sp_randint(1, 30),
            "max_features": [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
            "min_samples_leaf": sp_randint(1, 25),
            "learning_rate": [0.01, 0.02, 0.05, 0.1]
        }

        if not self.iscleaned:
            print('Preparing the data...')
            self.prepare_data(self.df_full, True, col2fit)
        else:
            print('Data frame is already cleaned...')
        train_values = self.df_full[col2fit].values
        target_values = self.df_full['Expected'].values

        pre_dispatch = '2*n_jobs'

        # Fit the grid
        print('Fitting the grid with njobs = {}...'.format(n_jobs))
        start = time()
        estimator = GradientBoostingRegressor(n_estimators=200)
        rf_grid = grid_search.RandomizedSearchCV(estimator,
                                                 parameters,
                                                 n_jobs=n_jobs,
                                                 verbose=2,
                                                 pre_dispatch=pre_dispatch,
                                                 scoring=kaggle_score,
                                                 error_score=0,
                                                 n_iter=25)
        rf_grid.fit(train_values, target_values)
        print('Grid search finished')

        print(
            "\n\nGridSearchCV took %.2f seconds for %d candidate parameter settings."
            % (time() - start, len(rf_grid.grid_scores_)))
        self.grid_report(rf_grid.grid_scores_, 15)

        print('\n\nBest score = {}'.format(rf_grid.best_score_))
        print('Best params = {}\n\n'.format(rf_grid.best_params_))
Example No. 22
def do_gs(clf, X, y, params, n_samples=1000, n_iter=3, 
    n_jobs=-2, scoring=None, fit_params=None, 
    random_iterations=None):
  start('starting grid search')
  if type(n_samples) is float: n_samples = int(n_samples)
  reseed(clf)
  cv = cross_validation.ShuffleSplit(n_samples, n_iter=n_iter, random_state=cfg['sys_seed'])
  if random_iterations is None:
    gs = grid_search.GridSearchCV(clf, params, cv=cv, 
      n_jobs=n_jobs, verbose=2, scoring=scoring or cfg['scoring'], fit_params=fit_params)
  else:
    gs = grid_search.RandomizedSearchCV(clf, params, random_iterations, cv=cv, 
      n_jobs=n_jobs, verbose=2, scoring=scoring or cfg['scoring'], 
      fit_params=fit_params, refit=False)
  X2, y2 = utils.shuffle(X, y, random_state=cfg['sys_seed'])  
  gs.fit(X2[:n_samples], y2[:n_samples])
  stop('done grid search')
  dbg(gs.best_params_, gs.best_score_)  
  return gs
Example No. 23
def generate_model(data, classes, args):

    # Define the parameters
    tuned_parameters = {'C': C_RANGE,
                        'class_weight': CLASS_WEIGHTS}

    # Define the classifier
    if args.kernel == 'rbf':
        clf = svm.SVC(cache_size=CACHE_SIZE)
        tuned_parameters['gamma'] = GAMMA_RANGE
    else:
        clf = svm.LinearSVC(dual=False)

    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)

    # Generate the K-fold development
    skf = cross_validation.StratifiedKFold(classes, n_folds=K_FOLD, shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)

    # Generate the grid search
    if args.search == 'grid':
        gscv = grid_search.GridSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                        n_jobs=1, verbose=get_verbose_level())
    else:
        gscv = grid_search.RandomizedSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                              n_jobs=1, verbose=get_verbose_level(), n_iter=args.iter)

    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)

    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose("%0.6f (+/-%0.06f) for %r"
                      % (mean_score, scores.std() / 2, params), 5)

    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)

    return gscv
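generate_model reads several module-level constants and an args object with kernel, search and iter attributes; plausible, purely illustrative values (keeping class_weight='auto' to match the older sklearn API used throughout these examples) could be:

# Illustrative constants and args for generate_model -- not from the original project.
from argparse import Namespace
import scipy.stats

C_RANGE = scipy.stats.expon(scale=100)      # works for RandomizedSearchCV; use a list for GridSearchCV
GAMMA_RANGE = scipy.stats.expon(scale=0.1)
CLASS_WEIGHTS = ['auto', None]
K_FOLD = 5
CACHE_SIZE = 2000                            # SVC kernel cache in MB

args = Namespace(kernel='rbf', search='random', iter=20)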
Example No. 24
def train(XTrain, yTrain, XPredict):
    params = {
        "n_estimators": randint(5, 100),
        "max_depth": [1, 2, 3, 5, 8, 10, None],
        "max_features": randint(1, len(XTrain[0])),
        "min_samples_split": randint(1, 3),
        "min_samples_leaf": randint(1, 3)
    }
    erf = ExtraTreesRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    clf = grid_search.RandomizedSearchCV(erf,
                                         param_distributions=params,
                                         n_iter=50,
                                         scoring='mean_absolute_error',
                                         cv=kfold,
                                         n_jobs=-1)
    clf.fit(XTrain, yTrain)
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict
Example No. 25
def train(XTrain, yTrain, XPredict):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {
        'C': uniform(1, 999),
        'gamma': uniform(0.01, 0.29),
        'kernel': ['rbf', 'poly']
    }
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    svr = svm.SVR()
    clf = grid_search.RandomizedSearchCV(svr,
                                         param_distributions=params,
                                         n_iter=20,
                                         cv=kfold,
                                         scoring='mean_squared_error',
                                         n_jobs=-1)
    yPredict = []
    for i in range(yTrain.shape[1]):
        clf.fit(XTrain, yTrain[:, i])  # train one model per step of the horizon (distance models in total)
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
Example No. 26
def GRNN(data_model):
    x_train, x_test, y_train, y_test = create_dataset(data_model['2017'])

    grnnet = algorithms.GRNN(std=0.5, verbose=True)
    grnnet.train(x_train, y_train)
    error = scorer(grnnet, x_test, y_test)
    print("GRNN RMSLE = {:.3f}\n".format(error))
    part_to_predict = data_model['2018'].copy()
    df_test = part_to_predict.copy()
    index_predict = df_test.index
    df_test.reset_index(inplace=True)
    df_test.drop(["Date"], axis=1, inplace=True)
    # fix random seed for reproducibility
    pd.np.random.seed(7)
    X = df_test.drop([pr.PowerPV], axis=1)
    y = df_test.drop([x for x in df_test.columns if x not in [pr.PowerPV]],
                     axis=1)
    pred = grnnet.predict(X)
    prediction_to_plot = pd.DataFrame(index=index_predict,
                                      data={
                                          'observed':
                                          pd.np.array(y[pr.PowerPV]),
                                          'predicted':
                                          pred.reshape(pred.shape[0], )
                                      })
    pr.plot_data(prediction_to_plot['2018-04-01':'2018-04-05'],
                 prediction_to_plot.columns, 1)

    print("Run Random Search CV")
    grnnet.verbose = False
    random_search = grid_search.RandomizedSearchCV(
        grnnet,
        param_distributions={'std': np.arange(1e-2, 1, 1e-4)},
        n_iter=400,
        scoring=scorer,
    )
    random_search.fit(
        data_model[[x for x in df_test.columns if x not in [pr.PowerPV]]],
        data_model[pr.PowerPV])
    report(random_search.grid_scores_)
Example No. 27
def train(XTrain, yTrain, XPredict):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {
        "n_estimators": randint(50, 150),
        "max_depth": [1, 3, 5, None],
        "max_features": randint(1, len(XTrain[0])),
        "min_samples_split": randint(1, 4),
        "min_samples_leaf": randint(1, 4)
    }
    erf = ExtraTreesRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    clf = grid_search.RandomizedSearchCV(erf,
                                         param_distributions=params,
                                         n_iter=5,
                                         scoring='mean_squared_error',
                                         cv=kfold,
                                         n_jobs=-1)
    yPredict = []
    for i in range(yTrain.shape[1]):
        clf.fit(XTrain, yTrain[:, i])  # train one model per step of the horizon (distance models in total)
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
Example No. 28
def train(XTrain, yTrain, XPredict):
    params = {
        'n_estimators': randint(20, 200),
        'loss': ['ls', 'lad', 'huber'],
        'learning_rate': uniform(0.01, 0.19),
        'subsample': uniform(0.5, 0.5),
        'max_depth': randint(1, 5),
        'min_samples_split': randint(1, 3),
        'min_samples_leaf': randint(1, 3),
        'max_features': randint(1, len(XTrain[0]))
    }
    gbrt = GradientBoostingRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    clf = grid_search.RandomizedSearchCV(gbrt,
                                         param_distributions=params,
                                         n_iter=50,
                                         scoring='mean_absolute_error',
                                         cv=kfold,
                                         n_jobs=-1)
    clf.fit(XTrain, yTrain)
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict
Example No. 29
def random_parameter_search(variants,
                            n_iter=10,
                            classifier_name='RandomForest',
                            folds_name='kfold',
                            folds_params=None,
                            label_name=None):
    # Get X, y and classifier
    X, y = _get_X_y(variants)
    estimator = _get_estimator(classifier_name)

    # Get labels if given
    labels = variants[label_name] if label_name else None

    # Get folds
    logging.info('Generating %s folds, params: %s', folds_name,
                 str(folds_params))
    folds = _get_folds(folds_name, y, folds_params, labels)

    # Use the Matthews correlation coefficient as the scorer
    mcc_scorer = metrics.make_scorer(metrics.matthews_corrcoef)

    # Get parameters distribution for estimator
    param_distributions = {
        'classifier__' + k: v
        for k, v in CLASSIFIERS[classifier_name]
        ['param_distributions'].items()
    }

    search = grid_search.RandomizedSearchCV(estimator,
                                            param_distributions,
                                            cv=folds,
                                            n_iter=n_iter,
                                            scoring=mcc_scorer,
                                            verbose=1)

    search.fit(X, y)
    return search
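random_parameter_search prefixes every parameter with 'classifier__', which implies that _get_estimator returns a Pipeline containing a step named 'classifier', and that CLASSIFIERS maps each classifier name to (at least) a 'param_distributions' dict. A hedged sketch of that shape:

# Illustrative CLASSIFIERS registry and _get_estimator -- the originals are not shown.
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

CLASSIFIERS = {
    'RandomForest': {
        'estimator': RandomForestClassifier(),
        'param_distributions': {
            'n_estimators': randint(50, 300),
            'max_depth': [3, 5, 10, None],
        },
    },
}

def _get_estimator(name):
    """Wrap the base classifier in a Pipeline so the 'classifier__*' parameter names resolve."""
    return Pipeline([('scale', StandardScaler()),
                     ('classifier', CLASSIFIERS[name]['estimator'])])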
Example No. 30
def benchmark():
    from solaris.run import load_data
    from sklearn import grid_search
    from sklearn import metrics

    def rmse(y_true, pred):
        return np.sqrt(metrics.mean_squared_error(y_true, pred))

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    x = Interpolate._grid_data()

    fx = 0
    day = 180
    y = X.nm[day, fx].mean(axis=0)[3]
    #nugget = X.nm[day, fx].std(axis=0)[3]
    mask = np.ones_like(y, dtype=np.bool)
    rs = np.random.RandomState(5)
    test_idx = np.c_[rs.randint(2, 7, 20),
                     rs.randint(3, 13, 20)]
    print(test_idx.shape)
    mask[test_idx[:, 0], test_idx[:, 1]] = False
    mask = mask.ravel()
    y = y.ravel()

    print('_' * 80)
    est = GaussianProcess(corr='squared_exponential', theta0=(10, 10, 10))
    est.fit(x[mask], y[mask])
    pred = est.predict(x[~mask])
    print('MAE: %.2f' % metrics.mean_absolute_error(y[~mask], pred))

    print('_' * 80)

    sys.exit(0)

    #import IPython
    #IPython.embed()

    class KFold(object):

        n_folds = 1

        def __iter__(self):
            yield mask, ~mask

        def __len__(self):
            return 1

    est = Ridge()
    params = {'normalize': [True, False],
              'alpha': 10.0 ** np.arange(-7, 1, 1)}
    gs = grid_search.GridSearchCV(est, params, cv=KFold(),
                                  scoring='mean_squared_error').fit(x, y)
    print(gs.grid_scores_)
    print(gs.best_score_)

    est = GaussianProcess()
    params = {'corr': ['squared_exponential'],
               'theta0': MultivariateNormal(),
               }

    ## params = {'corr': ['squared_exponential'],
    ##           #'regr': ['constant', 'linear', 'quadratic'],
    ##           'theta0': np.arange(4, 11),
    ##           }

    # gs = grid_search.GridSearchCV(est, params, cv=KFold(),
    #                               loss_func=rmse).fit(x, y)
    gs = grid_search.RandomizedSearchCV(est, params, cv=KFold(),
                                        scoring='mean_squared_error',
                                        n_iter=100).fit(x, y)
    print(gs.grid_scores_)
    print(gs.best_params_)
    print(gs.best_score_)
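The 'theta0' entry in the last search is sampled from MultivariateNormal(), a custom object; RandomizedSearchCV only requires that such a value expose an rvs() method. A hedged sketch of a compatible wrapper (the original class is not shown):

# Hypothetical MultivariateNormal sampler -- param_distributions values only need an rvs() method.
import numpy as np

class MultivariateNormal(object):
    """Draw a positive 3-component theta0 vector for GaussianProcess."""

    def __init__(self, mean=(10.0, 10.0, 10.0), std=3.0, seed=None):
        self.mean = np.asarray(mean, dtype=float)
        self.std = std
        self.rng = np.random.RandomState(seed)

    def rvs(self, random_state=None):
        # random_state is accepted for compatibility with newer ParameterSampler calls.
        return np.abs(self.rng.normal(self.mean, self.std))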