Code example #1
def create_project(params_e_pre, params_e_post, params_g_pre, params_g_post,
        baseline_period_start_date, baseline_period_end_date,
        reporting_period_start_date, reporting_period_end_date,
        has_electricity, has_gas, weather_source, zipcode):

    model_e = AverageDailyTemperatureSensitivityModel(heating=True, cooling=True)
    model_g = AverageDailyTemperatureSensitivityModel(heating=True, cooling=False)

    # generate consumption
    baseline_period = Period(baseline_period_start_date, reporting_period_start_date)
    datetimes_pre = generate_monthly_billing_datetimes(baseline_period, dist=randint(29,31))

    reporting_period = Period(datetimes_pre[-1], reporting_period_end_date)
    datetimes_post = generate_monthly_billing_datetimes(reporting_period, dist=randint(29,31))
    location = Location(zipcode=zipcode)
    baseline_period = Period(baseline_period_start_date, baseline_period_end_date)
    reporting_period = Period(reporting_period_start_date, reporting_period_end_date)


    cds = []

    if has_electricity:
        cd_e = generate_consumption_records(model_e, params_e_pre, params_e_post, datetimes_pre, datetimes_post, "electricity", "kWh", weather_source)
        cds.append(cd_e)

    if has_gas:
        cd_g = generate_consumption_records(model_g, params_g_pre, params_g_post, datetimes_pre, datetimes_post, "natural_gas", "therm", weather_source)
        cds.append(cd_g)

    return Project(location, cds, baseline_period, reporting_period)
Code example #2
    def Gradient(self):
        X_train, y_train = self.X_train, self.y_train
        parameters_boost = {'max_depth': randint(3, self.max_depth_max + 1),
                            'n_estimators': randint(80, 100 + self.n_estimators_max)}
        boost_reg = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                       param_distributions=parameters_boost,
                                       cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        boost_reg.fit(X_train, y_train)
        self.boost_reg = boost_reg.best_estimator_
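
A quick aside, not part of the snippet above: scipy.stats.randint(low, high) is a frozen distribution over the half-open interval [low, high), which is why these methods pass max_depth_max + 1 as the upper bound. A minimal illustration:

from scipy.stats import randint

dist = randint(3, 11)                    # integers 3..10 inclusive, 11 excluded
print(dist.rvs(size=5, random_state=0))  # five integers drawn from that range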
Code example #3
 def get_param_grid(cur_model, points, rand):
     print('\nRetrieving parameter grid...')
     try:
         c_range = 10.0 ** np.arange(-2, 3)
         # print 'Getting Parameter grid...'
         # out_txt.write('Getting Parameter grid...')
         gamma_range = [0, .01, .1, .3]
         # neighbor_range = np.arange(2, points, step=5)
         # leaf_range = np.arange(10, points, step=5)
         neighbor_range = np.arange(2, 17, step=5)
         leaf_range = np.arange(10, 60, step=5)
         if not rand:
             grid_params = {'SVC()': [{'C': c_range,
                                       'kernel': ['poly'],
                                       'degree': [3, 5, 8],
                                       'gamma': gamma_range,
                                       'probability': [True],
                                       'class_weight': ['auto', None]},
                                      {'C': c_range,
                                       'kernel': ['rbf', 'sigmoid'],
                                       'gamma': gamma_range,
                                       'probability': [True],
                                       'class_weight': ['auto', None]},
                                      {'C': c_range,
                                       'kernel': ['linear'],
                                       'random_state': [10],
                                       'probability': [True],
                                       'class_weight': ['auto', None]}],
                            'KNeighborsClassifier()': [{'n_neighbors': neighbor_range,
                                                        'weights': ['uniform'],
                                                        'algorithm': ['brute'],
                                                        'metric': ['euclidean', 'manhattan']},
                                                       {'n_neighbors': neighbor_range,
                                                        'weights': ['uniform'],
                                                        'algorithm': ['ball_tree', 'kd_tree'],
                                                        'metric': ['euclidean', 'manhattan'],
                                                        'leaf_size': leaf_range}],
                            'LogisticRegression()': [{'penalty': ['l1', 'l2'],
                                                      'C': c_range,
                                                      'class_weight': [None, 'auto']}]}
             return grid_params[cur_model]
         else:
             rand_params = {'SVC()': {'C': stats.expon(scale=300),
                                      'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                      'degree': [3, 4, 5, 6, 7, 8],
                                      'gamma': stats.expon(scale=1/3),
                                      'random_state': [10],
                                      'probability': [True],
                                      'class_weight': ['auto', None]},
                            'KNeighborsClassifier()': {'n_neighbors': stats.randint(low=2, high=20),
                                                       'weights': ['uniform', 'distance'],
                                                       'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                                                       'metric': ['euclidean', 'manhattan'],
                                                       'leaf_size': stats.randint(low=10, high=60)},
                            'LogisticRegression()': {'penalty': ['l1', 'l2'],
                                                     'C': stats.expon(scale=300),
                                                     'class_weight': [None, 'auto']}}
             return rand_params[cur_model]
     except:
         print('could not get parameter grid')
Code example #4
File: gbrtModel.py  Project: ifenghao/tianchi_contest
def train(array, embedDim, interval):
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    bestModels = []
    for i in range(len(yTrain[0])):
        gbrt = GradientBoostingRegressor()
        clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params, n_iter=20,
                                             scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(1, 12):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, i)  # the model's forecast horizon grows by one day each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, i)  # the inputs to predict grow accordingly
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack((array, np.array(copy(subyPredict))))  # append this model's predictions as known data for training the next model
    yPredict = array[0, -65:-5]  # 66 days can be predicted in total; take the relevant slice
    return yPredict
Code example #5
File: erfModel.py  Project: ifenghao/tianchi_contest
def train(array, embedDim, interval):
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    params = {"n_estimators": randint(5, 100),
              "max_depth": [1, 2, 3, 5, 8, 10, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 3),
              "min_samples_leaf": randint(1, 3)}
    bestModels = []
    for i in range(len(yTrain[0])):
        erf = ExtraTreesRegressor()
        clf = grid_search.RandomizedSearchCV(erf, param_distributions=params, n_iter=10,
                                         scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(60):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)  # the model's embedding dimension grows each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, 1)  # the embedding dimension of the inputs to predict grows accordingly
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack((array, np.array(copy(subyPredict))))  # append this model's predictions as known data for training the next model
        embedDim += 1
    yPredict = array[0, -60:]  # 60 days can be predicted in total; take the relevant slice
    return yPredict
Code example #6
File: improve.py  Project: gusseppe/pymach
    def decisiontree_param(self, method='grid'):
        parameters = {
            # 'selector__extraTC__n_estimators':  [10],
            # 'selector__extraTC__n_estimators':  [10, 15],
            # # 'selector__extraTC__criterion': ['entropy'],
            # 'selector__extraTC__criterion': ['gini','entropy'],
            # 'selector__extraTC__n_jobs': [-1],
            # 'selector__pca__svd_solver': ['randomized'],
            'selector__pca__svd_solver': ['full', 'arpack', 'randomized'],
            # 'selector__pca__whiten': [True],
            'selector__pca__whiten': [True,False],
            'DecisionTreeClassifier__criterion': ['gini','entropy'],
            'DecisionTreeClassifier__splitter': ['best','random'],
            'DecisionTreeClassifier__max_features': ['sqrt','log2', None]
            # 'DecisionTreeClassifier__max_leaf_nodes': [2,3, None],
            # 'DecisionTreeClassifier__max_depth': [2,3, None],
            # 'DecisionTreeClassifier__min_samples_leaf': [1,3,5, None]

        }
        if method == 'random':
            parameters['DecisionTreeClassifier__min_samples_leaf'] = randint(1,20)
            parameters['DecisionTreeClassifier__max_leaf_nodes'] = randint(2,20)
            parameters['DecisionTreeClassifier__max_depth'] = randint(1,20)

        return parameters
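
The dictionary returned above stays list-only when method='grid' and mixes in scipy distributions when method='random'; the former suits GridSearchCV, the latter RandomizedSearchCV. A minimal, assumed illustration of that split (the project's pipeline with the 'selector__pca' and 'DecisionTreeClassifier__' steps is not reproduced here):

from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
grid_params = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}
rand_params = dict(grid_params, max_depth=randint(1, 20))  # distributions are only valid for the randomized search

GridSearchCV(DecisionTreeClassifier(), grid_params, cv=3).fit(X, y)
RandomizedSearchCV(DecisionTreeClassifier(), rand_params, n_iter=5, cv=3, random_state=0).fit(X, y)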
Code example #7
File: test_big.py  Project: tgsmith61591/skutil
    def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
Code example #8
File: estimators.py  Project: cristi-zz/TeamVisoft
def get_random_forest():
    classifier = sklearn.ensemble.RandomForestClassifier(max_features=None,oob_score=False,n_jobs=1)
    pipeline = sklearn.pipeline.Pipeline([('RF',classifier)])

    meta_dict={'RF__n_estimators':stats.randint(5,100),'RF__max_features':['sqrt','log2','auto',None],
               'RF__max_depth':stats.randint(2,10)}

    return pipeline,meta_dict
Code example #9
    def Extra(self):
        parameters_extra = {'max_depth': randint(1, self.max_depth_max + 1),
                            'bootstrap': [True, False],
                            'min_samples_split': randint(1, self.min_samples_split_max + 1),
                            'min_samples_leaf': randint(1, self.min_samples_leaf_max + 1),
                            'n_estimators': randint(20, 20 + self.n_estimators_max)}
        X_train, y_train = self.X_train, self.y_train
        extra_reg = RandomizedSearchCV(ExtraTreesRegressor(), param_distributions=parameters_extra,
                                       cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        extra_reg.fit(X_train, y_train)
        self.extra_reg = extra_reg.best_estimator_
Code example #10
def do_train_rand(X, y, params=None, n_iter=50):
    if not params:
        params = {'n_estimators': stats.randint(40, 90),
                  'max_depth': stats.randint(20, 40),
                  'min_samples_leaf': stats.randint(80, 110),
                  'max_features': ['auto', 'sqrt']}
    clf = RandomizedSearchCV(GradientBoostingClassifier(), params,
                             n_iter=n_iter, scoring=do_test, n_jobs=15,
                             verbose=1, cv=4)
    clf.fit(X, y)
    return clf
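
The scoring=do_test argument above points to a custom scorer that is not shown in the source. Purely as an illustration (the real do_test may differ), any callable with the signature scorer(estimator, X, y) -> float is accepted by RandomizedSearchCV:

from sklearn.metrics import f1_score

def do_test(estimator, X, y):
    # hypothetical stand-in for the project's scorer; returns a float where higher is better
    return f1_score(y, estimator.predict(X))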
Code example #11
    def RandomFo(self):
        parameters_forest = {'max_depth': randint(1, self.max_depth_max + 1),
                             'bootstrap': [True, False],
                             'min_samples_split': randint(1, self.min_samples_split_max + 1),
                             'min_samples_leaf': randint(1, self.min_samples_leaf_max + 1),
                             'max_features': randint(1, self.max_features_max),
                             'n_estimators': randint(15, self.n_estimators_max)}
        ### Randomized search
        X_train, y_train = self.X_train, self.y_train
        forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions=parameters_forest,
                                        cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        forest_reg.fit(X_train, y_train)
        self.forest_reg = forest_reg.best_estimator_
Code example #12
    def simple_tree(self):  #### training method using a single decision tree
                            ### cv: number of cross-validation folds
                            ### n_iter: number of iterations for the randomized search
        parameters_tree = {'max_depth': randint(1, self.max_depth_max + 1),
                           'min_samples_split': randint(1, self.min_samples_split_max + 1),
                           'min_samples_leaf': randint(1, self.min_samples_leaf_max + 1),
                           'max_leaf_nodes': randint(2, self.max_leaf_nodes_max),
                           'max_features': randint(1, self.max_features_max)}
        tree_reg = RandomizedSearchCV(DecisionTreeRegressor(), param_distributions=parameters_tree,
                                      cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        X_train, y_train = self.X_train, self.y_train
        tree_reg.fit(X_train, y_train)
        self.tree_reg = tree_reg.best_estimator_
Code example #13
File: rfModel.py  Project: ifenghao/tianchi_contest
def train(XTrain, yTrain, XPredict):
    params = {"n_estimators": randint(5, 100),
              "max_depth": [1, 2, 3, 5, 10, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 3),
              "min_samples_leaf": randint(1, 3)}
    rf = RandomForestRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=3, shuffle=False)
    clf = grid_search.RandomizedSearchCV(rf, param_distributions=params, n_iter=30,
                                         scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    clf.fit(XTrain, yTrain)
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict, clf.best_params_
Code example #14
def _get_random_params(model_name): 
    """Return some random model parameters to search over. 

    Args: 
    ----
        model_name: str

    Return: 
    ------
        param_dct: dct
    """

    if model_name == 'logit': 
        param_dct = {'penalty': ['l1', 'l2'], 'C': scs.uniform(0.00001, 0.0099)}
    elif model_name == 'random_forest': 
        param_dct = {'n_estimators': scs.randint(400, 1200), 
                'max_depth': scs.randint(2, 32)}
    elif model_name == 'extra_trees': 
        param_dct = {'n_estimators': scs.randint(400, 1200), 
                'max_depth': scs.randint(2, 32)}
    elif model_name == 'gboosting': 
        param_dct = {'n_estimators': scs.randint(400, 1200), 
                'learning_rate': scs.uniform(0.001, 0.099), 
                'max_depth': scs.randint(1, 8), 
                'max_features': scs.uniform(0.5, 0.5), 
                'subsample': scs.uniform(0.5, 0.5)}
    elif model_name == 'xgboost': 
        param_dct = {'learning_rate': scs.uniform(0.001, 0.099), 
                'n_estimators': scs.randint(400, 1200), 
                'max_depth': scs.randint(1, 8), 
                'subsample': scs.uniform(0.5, 0.5), 
                'colsample_bytree': scs.uniform(0.5, 0.5)}

    return param_dct
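
A hedged sketch of how such a dictionary is typically consumed; the surrounding module is not shown, so the dataset and search settings below are assumptions:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
param_dct = _get_random_params('random_forest')
search = RandomizedSearchCV(RandomForestClassifier(random_state=0), param_dct,
                            n_iter=3, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)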
Code example #15
File: test_generator.py  Project: bryongloden/eemeter
def test_generate_monthly_billing_datetimes():
    period = Period(datetime(2012,1,1),datetime(2013,1,1))
    datetimes_30d = generate_monthly_billing_datetimes(period,
            randint(30,31))
    assert datetimes_30d[0] == datetime(2012,1,1)
    assert datetimes_30d[1] == datetime(2012,1,31)
    assert datetimes_30d[11] == datetime(2012,11,26)
    assert datetimes_30d[12] == datetime(2012,12,26)

    datetimes_1d = generate_monthly_billing_datetimes(period, randint(1,2))
    assert datetimes_1d[0] == datetime(2012,1,1)
    assert datetimes_1d[1] == datetime(2012,1,2)
    assert datetimes_1d[330] == datetime(2012,11,26)
    assert datetimes_1d[331] == datetime(2012,11,27)
Code example #16
File: test_pipe.py  Project: thetuxedo/skutil
def test_random_grid():
    # get our train/test
    X_train, X_test, y_train, y_test = train_test_split(X, iris.target, train_size=0.75, random_state=42)

    # default CV does not shuffle, so we define our own
    custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True, random_state=42)

    # build a pipeline
    pipe = Pipeline([
        ('retainer'    , FeatureRetainer()), # will retain all
        ('dropper'     , FeatureDropper()),  # won't drop any
        ('mapper'      , FunctionMapper()),  # pass through
        ('encoder'     , OneHotCategoricalEncoder()), # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer'     , SelectiveImputer()), # pass through
        ('scaler'      , SelectiveScaler()),
        ('boxcox'      , BoxCoxTransformer()),
        ('nzv'         , NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca'         , SelectivePCA(n_components=0.9)),
        ('model'       , RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold' : uniform(loc=.8, scale=.15),
        'collinearity__method'    : ['pearson','kendall','spearman'],
        'scaler__scaler'          : [StandardScaler(), RobustScaler()],
        'pca__n_components'       : uniform(loc=.75, scale=.2),
        'pca__whiten'             : [True, False],
        'model__n_estimators'     : randint(5,100),
        'model__max_depth'        : randint(2,25),
        'model__min_samples_leaf' : randint(1,15),
        'model__max_features'     : uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes'   : randint(10,75)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2, # just to test it even works
                                scoring='accuracy',
                                cv=custom_cv,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    the_report = report_grid_score_detail(search, charts=False)
Code example #17
    def RandomFo(self):
        parameters_forest = {'n_estimators': randint(10, self.n_estimators_max),
                             'bootstrap': [True, False]}
        X_train, y_train = self.X_train, self.y_train
        forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions=parameters_forest,
                                        cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        forest_reg.fit(X_train, y_train)
        self.forest_reg = forest_reg.best_estimator_
Code example #18
def do_train_rand(train, valid, params=None, max_models=32):
    """Do randomized hyper-parameter search
    Args:
        train (SFrame): training set
        valid (SFrame): validation set
        params (dict): parameters for random search
        max_models (int): maximum number of models to run
    Returns:
        res (SFrame): table of choices of parameters sorted by valid RMSE
    """
    if not params:
        params = {'user_id': ['username'], 'item_id': ['course_id'],
                  'target': ['label'], 'binary_target': [True],
                  'num_factors': stats.randint(4, 128),
                  'regularization': stats.expon(scale=1e-4),
                  'linear_regularization': stats.expon(scale=1e-7)}
    try:
        job = gl.toolkits.model_parameter_search \
                         .random_search.create((train, valid),
                                               gl.recommender.
                                               factorization_recommender.create,
                                               params, max_models=max_models)
        res = job.get_results()
        res = res.sort('validation_rmse')
        print 'Best params for random search are: {}'.format(res[0])
        res.save('rand_search.csv', format='csv')
    except:
        print job.get_metrics()
        res = None
    return res
Code example #19
File: resource.py  Project: irwinsnet/DesPy
 def get_available_resource(self, random = False):
     """Gets the index number of an available resource.
     
     *Arguments*
         ``random`` (Boolean)
             If set to True, randomly chooses the index of an
             available resource. Otherwise, returns the index of the
             available resource with the lowest index value.
             
     *Returns:* A positive integer representing the index number of
     the resource. ``None`` if all resources are busy.
     
     """
 
     empty_resources = []    
     for index in range(self.num_resources):
         if self[index].get_available_station() is not None:
             empty_resources.append(index)
     
     if len(empty_resources) == 0:
         return None
     elif not random:
         return empty_resources[0]
     else:
         return empty_resources[randint(0, len(empty_resources) - 1)] 
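
Note that, unlike most snippets on this page, the randint used here has to be the standard library's random.randint (its return value is used directly as a list index), which is inclusive at both ends. A small reminder of the difference:

import random
from scipy import stats

choices = ['a', 'b', 'c']
print(choices[random.randint(0, len(choices) - 1)])   # stdlib: inclusive bounds, returns an int
print(choices[stats.randint(0, len(choices)).rvs()])  # scipy: half-open [0, 3), samples via .rvs()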
Code example #20
File: parallel.py  Project: neuroidss/mHTM
	def make_data(self):
		"""
		Create a generator for building the data.
		"""
		
		# Build a logging format suitable for sorting
		log_dir = self.param_distributions['log_dir']
		log_fmt = '{{0:0{0}d}}-{{1:0{1}d}}'.format(len(str(self.niter)),
			len(str(self.nsplits)))
		
		# Create the parameters for each instance
		for i in xrange(1, self.niter + 1):
			# Make the params
			ncolumns = self.param_distributions['ncolumns'].rvs()
			param = {'ncolumns':ncolumns}
			
			####
			# Ensure that parameters make sense
			####
			
			# Compute nactive as a function of ncolumns
			# Ensure that nactive is bounded by (0, ncolumns)
			nactive = int(self.param_distributions['nactive'].rvs() * ncolumns)
			while ((nactive == 0) or (nactive == ncolumns)):
				nactive = int(self.param_distributions['nactive'].rvs() *
					ncolumns)
			param['nactive'] = nactive
			
			# Ensure that each input is seen at least once
			nsynapses = self.param_distributions['nsynapses'].rvs()
			p = prob.p_c(ncolumns, nsynapses, self.ninputs)
			e = int(p * self.ninputs)
			while e > 0:
				nsynapses = self.param_distributions['nsynapses'].rvs()
				p = prob.p_c(ncolumns, nsynapses, self.ninputs)
				e = int(p * self.ninputs)
			param['nsynapses'] = nsynapses
			
			# Compute seg_th as a function of nsynapses
			seg_th = int(self.param_distributions['seg_th'].rvs() * nsynapses)
			param['seg_th'] = seg_th
			
			# Make a useful log directory
			param['log_dir'] = os.path.join(log_dir, log_fmt.format(i, 1))
			
			####
			# Add all other parameters
			####
			
			added = set(param.keys())
			missing = [key for key in self.keys if key not in added]
			for key in missing:
				if hasattr(self.param_distributions[key], 'rvs'):
					param[key] = self.param_distributions[key].rvs()
				else:
					param[key] = self.param_distributions[key][randint(0, len(self.param_distributions[key])).rvs()]
			
			# Yield each item
			for key in self.keys:
				yield param[key]
Code example #21
def generate_altitudes_for_traj(trajectories, distr_file = None, distr_type = "flat", min_FL = 240., max_FL = 350., save_file = None, starting_date = [2010, 6, 5, 10, 0, 0]):
	"""
	@trajectories: a list of tuple (lat, lon, alt, time).
	TODO: do a distribution for entry and for exit?
	"""
	print "Generating altitudes from distribution..."
	trajectories = [[list(p) for p in traj] for traj in trajectories]

	if distr_file!=None:
		print "Getting distribution of altitudes from file", distr_file
		distr_type = "data"
		data = []
		with open(distr_file, 'r') as f:
			for columns in (raw.strip().split() for raw in f):  
				data.append(columns[0])
		min_FL, max_FL = min(data), max(data)
		distr = getDistribution(data)
	else:
		if distr_type == 'flat':
			distr = stats.randint(low = min_FL, high = max_FL).rvs
		else:
			print "You asked for a distribution of type", distr_type
			raise Exception("This type of distribution is not implemented.")

	for traj in trajectories:
		alt = distr()
		for p in traj: #same altitude for the whole trajectory
			p[2] = 10*int(alt/10.) # To have trajectories separated by 10 FL.



	if save_file!=None:
		write_trajectories_for_tact(trajectories, fil = save_file, starting_date = starting_date)

	return trajectories
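
The distr callable built above from stats.randint(...).rvs is the bound .rvs method of a frozen distribution, so calling it with no arguments draws one value. A minimal reproduction of that pattern, using the function's default flight-level bounds:

from scipy import stats

draw_fl = stats.randint(low=240, high=350).rvs  # bound method, not a sample
alt = draw_fl()                                 # one flight level in [240, 350)
print(10 * int(alt / 10.))                      # rounded down to a multiple of 10, as in the loop above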
Code example #22
File: improve.py  Project: gusseppe/pymach
    def extratrees_param(self, method='grid'):
        parameters = {
            # 'selector__extraTC__n_estimators': [10],
            # 'selector__extraTC__n_estimators': [10, 15],
            # 'selector__extraTC__criterion': ['gini', 'entropy'],
            # # 'selector__extraTC__criterion': ['entropy'],
            # 'selector__extraTC__n_jobs': [-1],
            # 'selector__pca__svd_solver': ['randomized'],
            'selector__pca__svd_solver': ['full', 'arpack', 'randomized'],
            # 'selector__pca__whiten': [True],
            'selector__pca__whiten': [True,False],
            'ExtraTreesClassifier__n_estimators': [10, 15, 20],
            'ExtraTreesClassifier__criterion': ['gini', 'entropy']
            # 'ExtraTreesClassifier__min_samples_leaf': [1,2,3,4,5],
            # 'ExtraTreesClassifier__min_samples_leaf': range(200,1001,200),
            # 'ExtraTreesClassifier__max_leaf_nodes': [2,3,4,5],
            # 'ExtraTreesClassifier__max_depth': [2,3,4,5]
        }

        if method == 'random':
            parameters['ExtraTreesClassifier__min_samples_leaf'] = randint(200,1001)
            # parameters['ExtraTreesClassifier__max_leaf_nodes'] = randint(2,20)
            # parameters['ExtraTreesClassifier__max_depth'] = randint(1,20)
            pass

        return parameters
Code example #23
def test_sklearn_cv():

    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters
    randint = stats.randint(low=1, high=65)
    randint.random_state = 42
    gamma = stats.gamma(a=1.2, loc=0, scale=0.13)
    gamma.random_state = 42
    distr = {'no_components': randint, 'learning_rate': gamma}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model, param_distributions=distr,
                                n_iter=10, scoring=scorer, random_state=42,
                                cv=cv)
    search.fit(train)
    assert search.best_params_['no_components'] == 52
Code example #24
File: __init__.py  Project: bgruening/EDeN
def fit_estimator(estimator, positive_data_matrix=None, negative_data_matrix=None, target=None, cv=10, n_jobs=-1, n_iter_search=40, random_state=1):
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    n_iter_search = n_iter_search
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross validation
    for scoring in ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(random_search.best_estimator_, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores), np.std(scores)))

    return random_search.best_estimator_
Code example #25
def test_sklearn_():
    '''
    Test whether the booster indeed gets updated
    :return:
    '''
    Xtrain = np.random.randn(100,10)
    ytrain = np.random.randint(0,2,100)

    Xval = np.random.randn(20, 10)
    yval = np.random.randint(0, 2, 20)


    classifier = SHSklearnEstimator(model=RandomForestClassifier(n_estimators=4),\
                                      ressource_name='n_estimators')

    param_grid = {'max_depth': randint(1,10),
                   'min_impurity_decrease':lognorm(0.1)
                  }
    scoring = make_scorer(accuracy_score)
    successiveHalving = SuccessiveHalving(
        estimator=classifier,
        n = 10,
        r = 100,
        param_grid=param_grid,
        ressource_name='n_estimators',
        scoring=scoring,
        n_jobs=1,
        cv=None,
        seed=0
    )

    T = successiveHalving.apply(Xtrain,ytrain,Xval,yval)
    print(T)

    assert(True)
Code example #26
File: erfModel.py  Project: ifenghao/tianchi_contest
def train(XTrain, yTrain, XPredict):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {"n_estimators": randint(50, 150),
              "max_depth": [1, 3, 5, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 4),
              "min_samples_leaf": randint(1, 4)}
    erf = ExtraTreesRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    clf = grid_search.RandomizedSearchCV(erf, param_distributions=params, n_iter=5,
                                         scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    yPredict = []
    for i in range(yTrain.shape[1]):
        clf.fit(XTrain, yTrain[:, i])  # train one model per forecast distance
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
Code example #27
File: gbrtModel.py  Project: ifenghao/tianchi_contest
def train(XTrain, yTrain, XPredict):
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    gbrt = GradientBoostingRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params, n_iter=50,
                                         scoring='mean_absolute_error', cv=kfold, n_jobs=-1)
    clf.fit(XTrain, yTrain)
    # print clf.best_score_, clf.best_estimator_
    yPredict = clf.predict(XPredict)
    return yPredict
Code example #28
    def Extra(self):
        parameters_extra = {'bootstrap': [True, False],
                            'n_estimators': randint(20, self.n_estimators_max)}
        X_train, y_train = self.X_train, self.y_train
        extra_reg = RandomizedSearchCV(ExtraTreesRegressor(), param_distributions=parameters_extra,
                                       cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        extra_reg.fit(X_train, y_train)
        self.extra_reg = extra_reg.best_estimator_
Code example #29
File: consumption.py  Project: bryongloden/eemeter
def time_span_1(request, consumption_generator_1,
        gsod_722880_2012_2014_weather_source):
    period, n_days = request.param
    generator,_ = consumption_generator_1
    datetimes = generate_monthly_billing_datetimes(period, dist=randint(30,31))
    consumption_data = generator.generate(
            gsod_722880_2012_2014_weather_source, datetimes)
    return consumption_data, n_days
Code example #30
    def __init__(self,
                 extent:tuple,
                 layer_range:tuple,
                 fault_range:tuple,
                 verbose:bool=False):
        """A Noddy history random generator.

        Args:
            extent: (x,X,y,Y,z,Z)
            layer_range:  (low, high)
            fault_range: (low, high)
            verbose: True / False
        """
        self.extent = extent
        self.x = abs(extent[1] - extent[0])
        self.y = abs(extent[3] - extent[2])
        self.z = abs(extent[5] - extent[4])
        self.layer_low, self.layer_high = layer_range
        self.faults_low, self.faults_high = fault_range
        self.verbose = verbose

        # defaults
        self.n_layers = self.layer_high
        self.n_faults = self.faults_low

        self.dist_faults = {
            "pos": self._random_pos(),
            "dip_dir": np.random.choice([stats.uniform(60, 120),
                                         stats.uniform(240, 300)]),
            "dip": stats.norm(45, 5),
            "slip": stats.uniform(0, self.z / 4)  # np.random.uniform(0, self.z / 4)
        }

        self.dist_strat = {
            "layer_thickness": [stats.randint(self.z / self.n_layers,
                                              self.z / self.n_layers + self.z / 8 * self.n_faults)
                                for l in range(self.n_layers)]
        }

        self.dist_tilt = {
            "pos": self._random_pos(),
            "rotation": stats.norm(0, 10),
            "plunge_direction": stats.uniform(0, 360),
            "plunge": stats.norm(0, 10)
        }

        self.dist_fold = {
            "pos": self._random_pos(),
            "wavelength": stats.uniform(self.x * 0.1, self.x * 2),
            "amplitude": stats.uniform(self.z * 0.05, self.z * 0.15)
        }

        self.dist_unconf = {
            "pos": self._random_pos(z_offset=self.z / 2),
            "dip_direction": stats.uniform(0, 360),
            "dip": stats.norm(0, 5),
        }
Code example #31

from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

#-------------- finding best parameters and initialising instance models with good parameters
# random forest
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
    'max_depth': randint(low=1, high=20),
    'min_samples_leaf': randint(low=1, high=4),
}
my_model = RandomForestClassifier()
rnd_search = RandomizedSearchCV(my_model,
                                param_distributions=param_distribs,
                                n_iter=5,
                                cv=3,
                                scoring='balanced_accuracy',
                                random_state=42)
rnd_search.fit(X_train, y_train)
print(rnd_search.best_params_)

rnd_forest = RandomForestClassifier(max_depth=11,
Code example #32
    def rand_search(self):
        '''running a randomized search to find the parameter combination for a random forest
     which gives the best accuracy score'''
        print('*' * 80)
        print(
            '*    Running RandomizedSearch for best parameter combination for ExtremeRandomForest'
        )
        print('*' * 80)

        #create the decision forest
        extra_clf_rand = ExtraTreesClassifier(random_state=100,
                                              max_depth=1,
                                              n_jobs=-1)

        with open(
                os.path.join(self.output_dir,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write('Created extreme random forest: extra_clf_rand \n')

        #set up randomized search
        param_rand = {
            "criterion": ["gini",
                          "entropy"],  #metric to judge reduction of impurity
            'class_weight': ['balanced', None],
            'n_estimators': randint(100, 10000),  #number of trees in forest
            'max_features':
            randint(2, 31),  #max number of features when splitting
            "min_samples_split":
            randint(2, 20),  #min samples per node to induce split
            #"max_depth": randint(1, 10),#max number of splits to do
            "min_samples_leaf": randint(1,
                                        20),  #min number of samples in a leaf
            "max_leaf_nodes": randint(10, 20)
        }  #max number of leaves

        with open(
                os.path.join(self.output_dir,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write(
                'Running randomized search for the following parameters: %s \n'
                % param_rand)
            text_file.write('use cv=3, scoring=accuracy \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(extra_clf_rand,
                                         param_rand,
                                         random_state=5,
                                         cv=3,
                                         n_iter=500,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_metrix_train, self.y_train)
        with open(
                os.path.join(self.output_dir,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write('Best parameters: ' +
                            str(rand_search_fitted.best_params_) + '\n')
            text_file.write('Best score: ' +
                            str(rand_search_fitted.best_score_) + '\n')
        feature_importances_fitted = rand_search_fitted.best_estimator_.feature_importances_
        feature_importances_fitted_ls = sorted(zip(feature_importances_fitted,
                                                   self.X_metrix_train),
                                               reverse=True)
        with open(
                os.path.join(self.output_dir,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write('Feature importances: %s \n' %
                            feature_importances_fitted_ls)

        self.best_params = rand_search_fitted.best_params_

        self.feature_importances_fitted_ls = feature_importances_fitted_ls

        def feature_importances_best_estimator(feature_list, directory):
            datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
            feature_list.sort(key=lambda x: x[1], reverse=True)
            feature = list(zip(*feature_list))[1]
            score = list(zip(*feature_list))[0]
            x_pos = np.arange(len(feature))
            plt.bar(x_pos, score, align='center')
            #plt.figure(figsize=(20,10))
            plt.xticks(x_pos, feature, rotation=90)
            plt.title(
                'Histogram of Feature Importances for best RandomForest using features '
            )
            plt.xlabel('Features')
            plt.tight_layout()
            plt.savefig(
                os.path.join(
                    directory, 'feature_importances_best_bar_plot_rand_bag_' +
                    datestring + '.png'))
            plt.close()

        feature_importances_best_estimator(self.feature_importances_fitted_ls,
                                           self.output_dir)
Code example #33
File: xgboost_model.py  Project: P79N6A/atlas-1
    x_train, y_train, x_test = feature_engineering_titanic.read_titanic()

    x_train = x_train.as_matrix()
    y_train = y_train.as_matrix()
    x_test = x_test.as_matrix()

    # split train validate
    # x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

    # get best model
    one_to_left = st.beta(10, 1)
    from_zero_positive = st.expon(0, 50)

    params = {
        "n_estimators": st.randint(3, 40),
        "max_depth": st.randint(3, 10),
        "learning_rate": st.uniform(0.05, 0.4),
        "colsample_bytree": one_to_left,
        "subsample": one_to_left,
        "gamma": st.uniform(0, 10),
        'reg_alpha': from_zero_positive,
        "min_child_weight": from_zero_positive,
    }
    xgb_clf = XGBClassifier(nthreads=-1)

    best_xgb_model = best_model.get_best_model(x_train,
                                               y_train,
                                               model=xgb_clf,
                                               params=params,
                                               n_iter=500,
Code example #34
        "loss": ["hinge", "squared_hinge"],
        "tol": [0.00001, 0.0001, 0.001]
        # should probably include this in grid search, as
        # dual=False is preferred when samples>features. However:
        # Unsupported set of arguments: The combination of
        # penalty='l2' and loss='hinge' are not supported when
        # dual=False
        # "dual": [True, False]
    },
    'svm': {
        "C": [1, 0.5, 0.1, 0.9, 0.8],
        "tol": [0.00001, 0.0001, 0.001, 0.01],
        "shrinking": [True, False]
    },
    'kneighbors': {
        "n_neighbors": randint(2, 15),
        "weights": ['uniform', 'distance'],
        "leaf_size": randint(15, 100)
    },
    'linear': {
        "alpha": [0.0001, 0.01, 1.0, 10.0, 1000.0],
        "tol": [0.00001, 0.0001, 0.001, 0.01]
    }
}

TEMPLATES = pkg_resources.resource_filename('q2_sample_classifier', 'assets')


def _load_data(feature_data, targets_metadata):
    '''Load data and generate training and test sets.
Code example #35
    'c__colsample_bylevel': [1, 0.8, 0.6], # Subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree.
    'c__colsample_bynode': [1, 0.8, 0.6],# Subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level.
    'c__num_parallel_tree': [1], # Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
    'c__max_depth': [2, 3, 6], # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
    # 'c__reg_alpha': [0], # L1 regularization term on weights. Increasing this value will make model more conservative.
    # 'c__reg_lambda': [1], # L2 regularization term on weights. Increasing this value will make model more conservative.
    'c__learning_rate': [0.3], # 0.3 Step size shrinkage used in update to prevents overfitting. Shrinks the feature weights to make the boosting process more conservative.
    # 'c__scale_pos_weight': [1] # should be negative_samples_count / positive_samples_count
    #'c__objective': ['multi:softmax'], #XGBoost will adjust this between binary:logistic and multi:softmax based on # of classes
    'c__eval_metric': ['mlogloss'],  # logloss heavily penalizes false-positives (better precision)
    'c__tree_method': ['hist']
}

PARAMETER_DISTRIBUTION = {
    'i__strategy': ['mean'],  # 'median', 'most_frequent', 'constant'
    'c__n_estimators': randint(100, 1000),
    'c__subsample': [1], # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees.
    'c__colsample_bytree': uniform(0.4, 0.6), # Subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.
    'c__colsample_bylevel': uniform(0.4, 0.6), # Subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree.
    'c__colsample_bynode': uniform(0.4, 0.6),# Subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level.
    'c__num_parallel_tree': [1], # Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
    'c__max_depth': randint(2, 12), # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
    # 'c__reg_alpha': [0], # L1 regularization term on weights. Increasing this value will make model more conservative.
    # 'c__reg_lambda': [1], # L2 regularization term on weights. Increasing this value will make model more conservative.
    'c__learning_rate': [0.3], # 0.3 Step size shrinkage used in update to prevents overfitting. Shrinks the feature weights to make the boosting process more conservative.
    # 'c__scale_pos_weight': [1] # should be negative_samples_count / positive_samples_count
    # 'c__objective': ['multi:softmax'], #XGBoost will adjust this between binary:logistic and multi:softmax based on # of classes
    'c__eval_metric': ['mlogloss'],  # logloss heavily penalizes false-positives (better precision)
    'c__tree_method': ['hist']
}
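
The 'i__' and 'c__' prefixes route parameters to named pipeline steps; judging from the keys, 'i' is an imputer and 'c' an XGBoost-style classifier. A minimal, assumed reconstruction of that wiring (the project's real pipeline is not shown, and a scikit-learn classifier stands in for XGBoost, so only keys it supports are kept):

from scipy.stats import randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([('i', SimpleImputer()), ('c', GradientBoostingClassifier())])
dist = {'i__strategy': ['mean', 'median'],
        'c__n_estimators': randint(100, 1000),
        'c__max_depth': randint(2, 12)}
search = RandomizedSearchCV(pipe, dist, n_iter=5, cv=3, random_state=0)
# search.fit(X, y) would then tune the imputer and the classifier jointly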
Code example #36
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the decision forest
        clf1 = DecisionTreeClassifier(random_state=20,
                                      class_weight='balanced',
                                      max_features=self.numf)

        ada = AdaBoostClassifier(base_estimator=clf1,
                                 algorithm="SAMME.R",
                                 random_state=55)

        logging.info(f'Initialised classifier using balanced class weights \n')

        #set up randomized search
        param_dict = {
            'base_estimator__criterion': ['gini', 'entropy'],
            'n_estimators': randint(100,
                                    10000),  #number of base estimators to use
            'learning_rate': uniform(0.0001, 1.0),
            'base_estimator__min_samples_split': randint(2, 20),
            'base_estimator__max_depth': randint(1, 10),
            'base_estimator__min_samples_leaf': randint(1, 20),
            'base_estimator__max_leaf_nodes': randint(10, 20)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(ada,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        best_clf_feat_import = self.model.feature_importances_
        best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                                 self.X_train_scaled.columns),
                                             reverse=True)

        logging.info(
            f'Feature importances for best classifier {best_clf_feat_import_sorted} \n'
        )

        all_clf_feat_import_mean = np.mean(
            [tree.feature_importances_ for tree in self.model.estimators_],
            axis=0)
        all_clf_feat_import_mean_sorted = sorted(zip(
            all_clf_feat_import_mean, self.X_train_scaled.columns),
                                                 reverse=True)

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(best_clf_feat_import_sorted,
                                           self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n'
        )
        feature_importances_error_bars(self.model, self.X_train_scaled.columns,
                                       self.directory)
        logging.info(
            f'Plotting feature importances for best classifier with errorbars \n'
        )
Code example #37
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# define the parameter space that will be searched over
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10)
}

# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)
search.fit(X_train, y_train)

print(search.best_params_)

# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
print(search.score(X_test, y_test))
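
As a small follow-up (not part of the original script), the fitted search also exposes the refit estimator, so the winning forest can be inspected or reused directly:

best_rf = search.best_estimator_  # already refit on the full training set
print(best_rf.n_estimators, best_rf.max_depth)
print(best_rf.predict(X_test[:5]))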
Code example #38
File: pipelines.py  Project: m0rr15/workflow.py
r2 = gm_cv.score(X_test, y_test)  # Accuracy test (R^2)
mse = mean_squared_error(y_test, y_pred)  # Accuracy test
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))

# RandomizedSearchCV (w/ DecisionTreeClassifier)
# problem: large hyperparam spaces, many hyperparams -> GridSearchCV comp. exp.
# solution: fixed number of hyperparam values is sampled
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
# Setup the grid and hyperparam spaces to sample from: param_dist
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}
tree = DecisionTreeClassifier()  # Model
tree_cv = RandomizedSearchCV(
    tree, param_dist, cv=5, n_iter=10
)  # Inst. RandomizedSearchCV. n_iter: the number of parameter settings tried
tree_cv.fit(X_train, y_train)  # Fitting on training data
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

#endregion (HYPERPARAMETER TUNING)

#region PREPROCESSING
Code example #39
File: parameters.py  Project: mirrorzsd/CCF-Dianxin
# featureGain = {}
# featureScore = {}
# lastScore = 0
# count = 0

xx_score = []
cv_pred = []
splitsNum = 5
randomSeed = 42
numBoostRound = 200  # 50
earlyStoppingRounds = 60
#=====================================================================================
tunedParameters = {
    "learning_rate": expon(scale=0.2),#np.exp(range(-3,0)),
    #"lambda_l2": 0.25,
    "max_depth": randint(low=5,high=15),
    "num_leaves": [64,128,256,512,1024],
    "bagging_fraction":uniform(),
    "feature_fraction":uniform(),
    #"min_data_in_leaf":
}
model = lgb.LGBMClassifier(objective="multiclass",num_class= 11,seed=42)
searchResult = RandomizedSearchCV(model,tunedParameters,scoring ="f1_macro",n_iter=10,n_jobs=1,cv=3)
searchResult.fit(trainData,trainLabel)
print(searchResult.cv_results_)
print(searchResult.best_params_)
exit()
#=====================================================================================
skf = StratifiedKFold(n_splits=splitsNum, random_state=randomSeed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(trainData, trainLabel)):
    print("[+] "+str(index)+" iteration")
Code example #40
from sklearn.linear_model import LogisticRegression
from scipy.stats import randint, uniform

seed = 0
model = LogisticRegression()
param_dist = {
    # "penalty": ['l1', 'l2'],
    "penalty": ['l2'],

    # "C": [0.1, 0.5, 1.0, 2, 10],
    "C": uniform(0.001, 0.01),
    "random_state": [seed],
    "max_iter": randint(500, 1000),
}
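
A hedged continuation (the original stops at the dictionary, so the data and remaining settings below are assumptions): scipy's uniform(loc, scale) samples from [loc, loc + scale], so "C" above is drawn from [0.001, 0.011], and the dictionary is meant to be fed to a randomized search:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import RandomizedSearchCV

X, y = load_breast_cancer(return_X_y=True)
search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=5,
                            scoring='accuracy', random_state=seed)
search.fit(X, y)
print(search.best_params_)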
Code example #41
def msm_distance_measure_getter(X):
    """
    generate the msm distance measure
    :param X: dataset to derive parameter ranges from
    :return: distance measure and parameter range dictionary
    """
    n_dimensions = 1  # todo use other dimensions
    return {
        "distance_measure": [cython_wrapper(msm_distance)],
        "dim_to_use":
        stats.randint(low=0, high=n_dimensions),
        "c": [
            0.01,
            0.01375,
            0.0175,
            0.02125,
            0.025,
            0.02875,
            0.0325,
            0.03625,
            0.04,
            0.04375,
            0.0475,
            0.05125,
            0.055,
            0.05875,
            0.0625,
            0.06625,
            0.07,
            0.07375,
            0.0775,
            0.08125,
            0.085,
            0.08875,
            0.0925,
            0.09625,
            0.1,
            0.136,
            0.172,
            0.208,
            0.244,
            0.28,
            0.316,
            0.352,
            0.388,
            0.424,
            0.46,
            0.496,
            0.532,
            0.568,
            0.604,
            0.64,
            0.676,
            0.712,
            0.748,
            0.784,
            0.82,
            0.856,
            0.892,
            0.928,
            0.964,
            1,
            1.36,
            1.72,
            2.08,
            2.44,
            2.8,
            3.16,
            3.52,
            3.88,
            4.24,
            4.6,
            4.96,
            5.32,
            5.68,
            6.04,
            6.4,
            6.76,
            7.12,
            7.48,
            7.84,
            8.2,
            8.56,
            8.92,
            9.28,
            9.64,
            10,
            13.6,
            17.2,
            20.8,
            24.4,
            28,
            31.6,
            35.2,
            38.8,
            42.4,
            46,
            49.6,
            53.2,
            56.8,
            60.4,
            64,
            67.6,
            71.2,
            74.8,
            78.4,
            82,
            85.6,
            89.2,
            92.8,
            96.4,
            100,
        ],
    }
コード例 #42
0
                                   scoring='roc_auc',
                                   fit_params=None,
                                   cv=None,
                                   verbose=2).fit(X_train, y_train)

# In[97]:

gridSearchAda.best_params_, gridSearchAda.best_score_

# #### GradientBoosting

# In[98]:

gbHyperParams = {
    'loss': ['deviance', 'exponential'],
    'n_estimators': randint(10, 500),
    'max_depth': randint(1, 10)
}

# In[99]:

gridSearchGB = RandomizedSearchCV(estimator=gbMod,
                                  param_distributions=gbHyperParams,
                                  n_iter=10,
                                  scoring='roc_auc',
                                  fit_params=None,
                                  cv=None,
                                  verbose=2).fit(X_train, y_train)

# In[100]:
コード例 #43
0
ests = {
    'case-1': [  #('lin', LinearRegression()),
        ('bay', BayesianRidge(tol=1e-5)),
        #('hub', HuberRegressor(max_iter=5000, tol=1e-5)),
        ('ard', ARDRegression(tol=1e-5)),
        ('par', PassiveAggressiveRegressor(max_iter=5000, tol=1e-5)),
        ('rdg', Ridge(max_iter=5000, random_state=seed)),
        ('las', Lasso(max_iter=5000, random_state=seed)),
        # ('eln', ElasticNet(max_iter=5000, tol=1e-5, random_state=seed)),
        ('svr', SVR(kernel='linear')),
        ('mlp', MLPRegressor())
    ]
}

r = uniform(0, 30)
d = randint(2, 10)
f = randint(1, 100)
e = uniform(0, 3)
ee = uniform(0, 1)

pars_1 = {
    'case-1.mlp': {
        'alpha': ee,
        'beta_1': e,
        'beta_2': e,
        'epsilon': ee
    },
    'case-1.eln': {
        'alpha': e,
        'l1_ratio': e
    },
コード例 #44
0
class FixedLengthTupleDistribution:
    """
    Tuples where each element stems from a specified distribution
    Note: this is not a normalized distribution
    """
    def __init__(self, distributions):
        self.distributions = distributions

    def rvs(self, random_state=None):
        return tuple([
            dist.rvs(random_state=random_state) for dist in self.distributions
        ])
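
# A quick usage sketch (not from the original file): combine two scipy
# distributions into a single tuple-valued distribution, e.g. to draw a pair of
# hidden-layer sizes in one call.
import numpy as np
from scipy.stats import randint

_rng = np.random.RandomState(0)
pair_dist = FixedLengthTupleDistribution([randint(16, 129), randint(16, 129)])
print(pair_dist.rvs(random_state=_rng))  # one draw per element distribution -> a 2-tuple of ints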


param_grid = {
    'mlp__hidden_layer_sizes': randint(2, 150),
    'mlp__activation': ['logistic', 'tanh', 'relu'],
    'mlp__solver': ['lbfgs'],
    'mlp__alpha': ExpDistribution(uniform(-6, 5)),
    'mlp__learning_rate': ['constant'],
    'mlp__max_iter': [200],
    'mlp__learning_rate_init': ExpDistribution(uniform(-6, 5)),
    # no2 results analysis shows good range between 0.1 and 0.001
    # 'filter__alpha': ExpDistribution(uniform(-3, 2)),
    'mlp__early_stopping': [True]
}

# CO_best = {'mlp__activation': 'relu', 'mlp__alpha': 1.899210794532138e-06,
#            'mlp__hidden_layer_sizes': 112,
#            'mlp__learning_rate_init': 0.07908998568339845,
#            'mlp__solver': 'lbfgs', 'mlp__learning_rate': 'constant',
コード例 #45
0
ファイル: ex05-2.py プロジェクト: dron512/narsha
    For random search you do not pass lists of parameter values; instead you pass probability
    distribution objects that the parameters can be sampled from. "Probability distribution" may
    sound intimidating, but it is simple. First, let's import two probability distribution
    classes from SciPy.

    What is SciPy?
    SciPy is one of Python's core scientific libraries: a library dedicated to numerical
    computation, covering integration, interpolation, linear algebra, probability, and more.
    scikit-learn makes heavy use of NumPy and SciPy. It comes preinstalled on Colab.
'''
'''
    The uniform and randint classes in SciPy's stats subpackage both draw values evenly from a
    given range; this is called sampling from a uniform distribution. randint draws integers,
    while uniform draws floats, and they are used the same way. Let's create a randint object
    over the range 0 to 10 and sample 10 numbers.
'''

rgen = randint(0, 10)
nansu = rgen.rvs(10)
print(nansu)
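
# For comparison (not in the original file): uniform draws floats in exactly the
# same way, over the interval [loc, loc + scale).
from scipy.stats import uniform

ugen = uniform(0, 1)  # floats drawn evenly from [0, 1)
print(ugen.rvs(10))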
'''
    Good. You can think of these much like random number generators. You hand randint and
    uniform objects to the random search and tell it how many times to sample in total while it
    looks for the best parameters. The more samples the better, within whatever your computing
    resources allow.

    Now let's build the dictionary of parameters to search. Here we also add the
    min_samples_leaf parameter to the search. It is the minimum number of samples required for a
    node to be a leaf: a split is not performed if it would create a child node with fewer
    samples than this value.
'''

params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
コード例 #46
0
def test_smoke_hyperband(min_iter):
    seed = 10
    n_splits = 4
    eta = 3
    max_iter = 27
    params = dict(a=randint(low=1, high=100))
    cmax_param = {'b': max_iter}
    if min_iter is not None:
        cmin_param = {'b': min_iter}
    else:
        cmin_param = None
    hyperband_search = HyperbandSearchCV(
        DummyCVedEstimator(),
        cost_parameter_max=cmax_param,
        cost_parameter_min=cmin_param,
        cv=n_splits,
        iid=False,
        return_train_score=False,
        eta=eta,
        param_distributions=params,
        random_state=seed)

    hyperband_search.fit(np.random.normal(size=(100, 3)),
                         np.random.choice(2, size=100, replace=True))

    rng = np.random.RandomState(seed=seed)
    ri = randint(low=1, high=100)

    # b_vals is the geometric sequence of values of b hyperband will test
    if min_iter is None or min_iter == 1:
        b_vals = np.array(
            [1] * 27 + [3] * 9 + [9] * 3 + [27] +
            [3] * 12 + [9] * 4 + [27] +
            [9] * 6 + [27] * 2 +
            [27] * 4)
        a_itrs = [[27, 9, 3, 1], [12, 4, 1], [6, 2], [4]]
    else:
        b_vals = np.array(
            [3] * 9 + [9] * 3 + [27] +
            [9] * 5 + [27] +
            [27] * 3)
        a_itrs = [[9, 3, 1], [5, 1], [3]]

    # now draw the a_vals in the proper order
    a_vals = []
    for bstart, nks in enumerate(a_itrs):
        a_vals_orig = ri.rvs(random_state=rng, size=nks[0]).tolist()
        for i, nk in enumerate(nks):
            scores = np.array(
                [np.random.RandomState(seed=s).uniform()
                 for s in a_vals_orig]) + np.power(3, i + bstart)
            sinds = np.argsort(scores)[::-1]  # bigger is better
            msk = np.zeros_like(a_vals_orig)
            msk[sinds[0:nk]] = 1
            msk = msk.astype(bool)
            a_vals_orig = [a for i, a in enumerate(a_vals_orig) if msk[i]]
            a_vals += a_vals_orig
    a_vals = np.array(a_vals)
    mn_scores = np.array(
        [np.random.RandomState(seed=a).uniform() for a in a_vals]) + b_vals
    best_index = np.argmax(mn_scores)

    # now make sure it got the right values
    assert hyperband_search.best_index_ == best_index, "Best index is wrong!"
    assert hyperband_search.best_score_ == mn_scores[best_index], (
        "Best score is wrong!")
    assert (hyperband_search.best_params_ ==
            {'a': a_vals[best_index], 'b': b_vals[best_index]}), (
        "Best parameters are wrong!")
コード例 #47
0
                        param_grid=params_grid,
                        cv=cv,
                        scoring='neg_mean_squared_error')

bst_grid.fit(X, y)
bst_grid.grid_scores_

print("Best accuracy obtained: {0}".format(bst_grid.best_score_))
print("Parameters:")
for key, value in bst_grid.best_params_.items():
    print("\t{}: {}".format(key, value))

params_dist_grid = {
    'max_depth': [1, 2, 3, 4],
    'gamma': [0, 0.5, 1],
    'n_estimators': randint(1, 1001),  # uniform discrete distribution over 1..1000
    'learning_rate': uniform(),  # uniform continuous distribution on [0, 1)
    'subsample': uniform(),  # uniform continuous distribution on [0, 1)
    'colsample_bytree': uniform()  # uniform continuous distribution on [0, 1)
}

rs_grid = RandomizedSearchCV(estimator=XGBRegressor(**params_fixed, seed=seed),
                             param_distributions=params_dist_grid,
                             n_iter=10,
                             cv=cv,
                             scoring='neg_mean_squared_error',
                             random_state=seed)

rs_grid.fit(X, y)
rs_grid.grid_scores_
rs_grid.best_estimator_
コード例 #48
0
ファイル: seq_class.py プロジェクト: jdc5884/hsi_atk
          ('rfr', RandomForestClassifier(n_jobs=1)),
          ('mlp', MLPClassifier(tol=1e-4)), ('svc', SVC(tol=1e-4, degree=9)),
          ('rdc', RidgeClassifierCV()), ('gbc', GradientBoostingClassifier()),
          ('ada', AdaBoostClassifier()),
          ('svc', SVC(tol=1e-4, degree=7, kernel='linear')),
          ('bag', BaggingClassifier(n_jobs=1))]

ests_1 = {
    'case-1': est_l1,
    # 'case-2': est_l1,
    # 'case-3': est_l1,
    # 'case-4': est_l1
}

r = uniform(0, 30)
d = randint(2, 10)
f = randint(100, 200)
e = uniform(0, 3)
ee = uniform(0, 1)

pars_1 = {}

sc = StandardScaler()
pca = PCA()
fa = FactorAnalysis()
nmf = NMF()

pre_cases = {
    'case-1': [sc],
    # 'case-2': [sc],
    # 'case-3': [pca],
コード例 #49
0
    device=('cuda' if USE_CUDA else 'cpu'),
    max_epochs=5,
    lr=0.01,
    optimizer=torch.optim.RMSprop,
)

pipe = Pipeline(steps + [('net', net)])
# pipe.fit(X, y)

pipe.set_params(net__verbose=0, net__train_split=None)

params = {
    'to_idx__stop_words': ['english', None],
    'to_idx__lowercase': [False, True],
    'to_idx__ngram_range': [(1, 1), (2, 2)],
    'net__module__embedding_dim': stats.randint(32, 256 + 1),
    'net__module__rec_layer_type': ['gru', 'lstm'],
    'net__module__num_units': stats.randint(32, 256 + 1),
    'net__module__num_layers': [1, 2, 3],
    'net__module__dropout': stats.uniform(0, 0.9),
    'net__module__bidirectional': [True, False],
    'net__lr': [10**(-stats.uniform(1, 5).rvs()) for _ in range(NUM_CV_STEPS)],
    'net__max_epochs': [5, 10],
}

search = RandomizedSearchCV(pipe,
                            params,
                            n_iter=NUM_CV_STEPS,
                            verbose=2,
                            refit=False,
                            scoring='accuracy',
コード例 #50
0
ファイル: random_forest.py プロジェクト: Yard1/tune-sklearn
"""
An example training a RandomForestClassifier, performing
randomized search using TuneSearchCV.
"""

from tune_sklearn import TuneSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from scipy.stats import randint
import numpy as np

digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

clf = RandomForestClassifier()
param_distributions = {
    "n_estimators": randint(20, 80),
    "max_depth": randint(2, 10)
}

tune_search = TuneSearchCV(clf, param_distributions, n_iter=3)

tune_search.fit(x_train, y_train)

pred = tune_search.predict(x_test)
accuracy = np.count_nonzero(np.array(pred) == np.array(y_test)) / len(pred)
print(accuracy)
コード例 #51
0
def sample_preprocess():
    preprocess_distr = {
        "filter_type": ["none", "ma_smoothing"],
        "win_len": randint(2, 20)
    }
    return sample_scikit(preprocess_distr)
コード例 #52
0
pd.DataFrame(grid_search.cv_results_)


# Random search
# n_iter is the number of parameter settings that get sampled before the search stops
# There is a backstory to the choice random_state=42... it's probably mentioned in the book
# RandomForest is good at surfacing individual features (it makes it easy to pick out the keys that actually matter most)

# In[64]:


from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)


# In[65]:


cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
コード例 #53
0
# Use a scipy statistical distribution to answer the questions below:

from scipy import stats
from scipy.stats import norm, binom

die_distribution = stats.randint(1, 7)

# What is the probability of rolling a 1?
die_distribution.pmf(1)

# There's a 1 in 2 chance that I'll roll higher than what number?
die_distribution.isf(.5)

# What is the probability of rolling less than or equal to 2?
die_distribution.cdf(2)

# There's a 5 in 6 chance that my roll will be less than or equal to what number?
die_distribution.ppf(5/6)

# There's a 1 in 2 chance that my roll will be less than or equal to what number?
die_distribution.ppf(.5)

# What is the probability of rolling less than or equal to 6?
die_distribution.cdf(6)

# There's a 1 in 3 chance that I'll roll higher than what number?
die_distribution.isf(1/3)

# What is the probability of rolling higher than a 1?
die_distribution.sf(1)
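
# For reference (not part of the original exercise), the expected results for a
# fair six-sided die modeled as stats.randint(1, 7):
#   pmf(1)   -> 1/6 (~0.1667)   probability of rolling a 1
#   isf(.5)  -> 3               half the time the roll is higher than 3
#   cdf(2)   -> 1/3             probability of rolling 2 or less
#   ppf(5/6) -> 5               5 in 6 rolls are 5 or less
#   ppf(.5)  -> 3               half of rolls are 3 or less
#   cdf(6)   -> 1.0             a roll of 6 or less is certain
#   isf(1/3) -> 4               1 in 3 rolls is higher than 4
#   sf(1)    -> 5/6 (~0.8333)   probability of rolling higher than 1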
コード例 #54
0
class Training:
    def __init__(self, n_calls):
        self.n_calls = n_calls

    def afl_loss(y_true, y_pred):
        return -np.sum(1 + np.log2(y_true * y_pred + (1 - y_true) *
                                   (1 - y_pred)))

    scorer = make_scorer(afl_loss, greater_is_better=False, needs_proba=True)

    spaceR = {
        'n_estimators': randint(low=350, high=700),
        'max_depth': randint(low=3, high=20),
        'min_samples_split': uniform(0.01, 0.99),
        'min_samples_leaf': randint(low=1, high=10),
        'min_weight_fraction_leaf': uniform(0, 0.5),
        'max_features': randint(low=1, high=18),
        'max_leaf_nodes': randint(low=2, high=1000),
        'min_impurity_decrease': uniform(0, 2)
    }

    spaceB = {
        'n_estimators': Integer(200, 1000),
        'max_depth': Integer(3, 20),
        'min_samples_split': Real(0.01, .99, "uniform"),
        'min_samples_leaf': Integer(1, 10),
        'min_weight_fraction_leaf': Real(0, 0.5, "uniform"),
        'max_features': Integer(1, 17),
        'max_leaf_nodes': Integer(2, 1000),
        'min_impurity_decrease': Real(0, 2)
    }

    def trainR(self, X_list, y_list, space=spaceR, cv=5):
        """
        RandomSearchCV method
        :param X_list: List of training sets
        :param y_list: List of targets
        :param space: parameter space
        :return: models an metrics
        """
        n_calls = self.n_calls

        scores = []
        val_scores = []
        best_models = []

        for j in range(len(X_list)):
            classifier = RandomForestClassifier(n_jobs=-1)
            y = y_list.copy()
            X = X_list.copy()
            y_test = y.pop(j)
            X_test = X.pop(j)
            y_train = np.concatenate(y, axis=0)
            X_train = np.concatenate(X, axis=0)

            X_train = Features().div_cols(X_train).values
            X_test = Features().div_cols(X_test).values

            start = time()

            opt = RandomizedSearchCV(classifier,
                                     param_distributions=space,
                                     n_iter=n_calls,
                                     scoring=self.scorer,
                                     cv=cv,
                                     n_jobs=-1,
                                     iid=False)

            opt.fit(X_train, y_train)
            model = opt.best_estimator_
            print('Season', 2019 - j)
            print("Random CV search took %.2f seconds for %d candidates"
                  " parameter settings." % ((time() - start), n_calls))
            print("val. score:", opt.best_score_)
            print("test score:", opt.score(X_test, y_test))
            # print(model)
            print("")
            best_models.append(model)
            val_scores.append(opt.best_score_)
            scores.append(opt.score(X_test, y_test))
        return scores, val_scores, best_models

    def trainB(self, X_list, y_list, n_points=1, space=spaceB, cv=5):
        """
        BayesianSearchCV method
        :param X_list: List of training sets
        :param y_list: List of targets
        :param space: parameter space
        :return: models an metrics
        """
        n_calls = self.n_calls

        scores = []
        val_scores = []
        best_models = []

        for j in range(len(X_list)):
            classifier = RandomForestClassifier(n_jobs=-1)
            y = y_list.copy()
            X = X_list.copy()
            y_test = y.pop(j)
            X_test = X.pop(j)
            y_train = np.concatenate(y, axis=0)
            X_train = np.concatenate(X, axis=0)

            X_train = Features().div_cols(X_train).values
            X_test = Features().div_cols(X_test).values

            start = time()
            opt = BayesSearchCV(classifier,
                                search_spaces=space,
                                scoring=self.scorer,
                                cv=cv,
                                n_points=n_points,
                                n_iter=n_calls,
                                n_jobs=-1)

            opt.fit(X_train, y_train)
            model = opt.best_estimator_
            print('Season', 2019 - j)
            print("Bayes CV search took %.2f seconds for %d candidates"
                  " parameter settings." % ((time() - start), n_calls))
            print("val. score:", opt.best_score_)
            print("test score:", opt.score(X_test, y_test))
            # print(model)
            print("")
            best_models.append(model)
            val_scores.append(opt.best_score_)
            scores.append(opt.score(X_test, y_test))
        return scores, val_scores, best_models
コード例 #55
0
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier

if __name__ == '__main__':

    x, y, num, cat = data_prep()

    classifiers = [
        ('KNN', KNeighborsClassifier(), {
            'KNN__algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
            'KNN__n_neighbors': stats.randint(4, 40),
            'KNN__p': stats.randint(1, 5),
            'KNN__weights': ('uniform', 'distance')
        }),
        (
            'ETclf',
            ExtraTreesClassifier(),
            {
                'ETclf__n_estimators': stats.randint(10, 500),
                'ETclf__criterion': ('gini', 'entropy'),
                'ETclf__max_depth': stats.randint(10, 50),
                'ETclf__max_features': ('sqrt', 'log2', 'auto'),
                #                             'ETclf__max_leaf_nodes' :(None , 2 , 4 , 5 , 6 , 7 , 9 , 10) ,
                #                             'ETclf__min_samples_leaf':[2, 4 , 6 , 8 , 10],
                #                             'ETclf__min_samples_split':[2, 4 , 6 , 8 , 10]
            }),
コード例 #56
0
    sh.fit(X, y)
    assert sh.n_candidates_[0] == expected_n_candidates
    if n_candidates == 'exhaust':
        # Make sure 'exhaust' makes the last iteration use as much resources as
        # we can
        assert sh.n_resources_[-1] == max_resources


@pytest.mark.parametrize(
    'param_distributions, expected_n_candidates',
    [
        ({
            'a': [1, 2]
        }, 2),  # all lists, sample less than n_candidates
        ({
            'a': randint(1, 3)
        }, 10),  # not all list, respect n_candidates
    ])
def test_random_search_discrete_distributions(param_distributions,
                                              expected_n_candidates):
    # Make sure random search samples the appropriate number of candidates when
    # we ask for more than what's possible. How many parameters are sampled
    # depends whether the distributions are 'all lists' or not (see
    # ParameterSampler for details). This is somewhat redundant with the checks
    # in ParameterSampler but interaction bugs were discovered during
    # development of SH

    n_samples = 1024
    X, y = make_classification(n_samples=n_samples, random_state=0)
    base_estimator = FastClassifier()
    sh = HalvingRandomSearchCV(base_estimator,
コード例 #57
0
#scaling and splitting data
#X_data = preprocessing.scale(X_data)
#Y_data = preprocessing.scale(Y_data)
print("done")

X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data)

#Exhaustive grid search parameters
param_grid = {'C': [1,10, 100, 1000], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear'], 'degree': [0, 1, 2, 3, 4, 5], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6], 'coef0': [0,1,2,3,4,5]}
#param_grid = {'C': [1,10,100,1000], 'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6], }, {'C': [1,10,100,1000], 'kernel': ['poly'], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6], 'degree': [1,2,3,4,5,6,7]}, {'C': [1,10,100,1000], 'kernel': ['linear']}
#Randomized grid search parameters
param_dist = {'C': expon(scale=100),
              'gamma': expon(scale=.1),
              'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
              'degree': randint(1, 100),
              'coef0': randint(0, 100)}
n_iter_search = 20


#Creating and training model
    # Regular model
# model = SVC(C=10, gamma=.0001, kernel='rbf', verbose=True)
svc = SVC(verbose=False)  # new name so the instance does not shadow the SVC class
    # Regular grid search
# model = GridSearchCV(svc, param_grid=param_grid)
# searchtype = "GridSearch"
    # Randomized grid search
model = RandomizedSearchCV(svc, param_distributions=param_dist, n_iter=n_iter_search)
searchtype = "Randomized, n_iter_search = {}".format(n_iter_search)
    #Training
コード例 #58
0
# + [markdown] id="d0k9DQTNlaD6"
# ### Random Search
# #### Pass probability distribution objects as the parameters

# +
# When there are too many combinations (i.e. it would take too long), sample randomly and search -> the optimum is said to land roughly in that neighborhood
# If the cost is too high, a hybrid approach: search randomly over the whole space first, and once a rough minimum is found, lay a grid around it and repeat (see the sketch below)
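
# +
# A self-contained sketch of that two-stage idea (not from the original
# notebook): a coarse RandomizedSearchCV first, then a narrow GridSearchCV
# around the best value it found.
from scipy.stats import randint as sp_randint
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=42)

coarse = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                            {'max_depth': sp_randint(1, 30)},
                            n_iter=10, cv=3, random_state=42)
coarse.fit(X_demo, y_demo)
d = int(coarse.best_params_['max_depth'])

fine = GridSearchCV(DecisionTreeClassifier(random_state=42),
                    {'max_depth': [max(1, d - 2), d, d + 2]},
                    cv=3)
fine.fit(X_demo, y_demo)
print(coarse.best_params_, fine.best_params_)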

# + id="_T9KTEk1GBcY"
from scipy.stats import uniform, randint

# + id="fd0UJpCGGDhz"
# randint: a uniform discrete random variable
rgen = randint(low=0, high=9)
# rgen.rvs(size = 2)

# + id="ch3zTUohIJR6"
# Find the unique elements of an array.
np.unique(rgen.rvs(1000), return_counts=True)
# np.unique(rgen.rvs(1000))

# + id="bGhshTn0IjkI"
# A uniform continuous random variable.
ugen = uniform(0, 1)
ugen.rvs(10)

# + id="irDX9e6WYTIH"
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
コード例 #59
0
classifier.fit(X_train, y_train)
""" Random forest is a classifier for various decision trees, n_estimator specifies how many 
decision trees we are using. criterion = 'entropy, we can use gini also.

We have selected these parameters randomly, how can we get to know number of n_estimator and 
criterion, for this we use RandomizedSearchCV"""

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint

est = RandomForestClassifier(n_jobs=-1)
rf_p_dist = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [10, 100, 200, 300, 400, 500],
    'max_features': randint(1, 3),
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_leaf': randint(1, 4),
}


def hypertuning_rscv(est, p_distr, nbr_iter, X, y):
    rdmsearch = RandomizedSearchCV(est,
                                   param_distributions=p_distr,
                                   n_jobs=-1,
                                   n_iter=nbr_iter,
                                   cv=9)
    # cv = cross-validation (a stratified k-fold split is used here since the estimator is a classifier)
    rdmsearch.fit(X, y)
    ht_params = rdmsearch.best_params_
コード例 #60
0
    # 'model__C': stats.loguniform(1, 2),
    # 'gamma': 'auto',
    # 'model__degree': stats.randint(1, 3),
    "preprocessor__scaler": [StandardScaler(),
                             RobustScaler(),
                             MinMaxScaler()]
}
#
params_lr = {"random_grid_search": grid_lr, "model": LinearRegression()}
######################################################

######################################################
# RandomForestRegressor model
######################################################
grid_rfr = {
    'model__n_estimators': stats.randint(1, 300),
    'model__max_depth': stats.randint(1, 300),
    'model__max_samples': stats.randint(1, 300),
    "preprocessor__scaler": [StandardScaler(),
                             RobustScaler(),
                             MinMaxScaler()]
}
#
params_rfr = {"random_grid_search": grid_rfr, "model": RandomForestRegressor()}
######################################################

######################################################
# GradientBoostingRegressor model
######################################################
grid_gbr = {  #'model__loss': ["ls", "lad", "huber", "quantile"],
    'model__learning_rate': stats.loguniform(0.001, 10),