def nearest_neighbors(self):
    neighbors_array = [11, 31, 201, 401, 601]
    tuned_parameters = {"n_neighbors": neighbors_array}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, tuned_parameters, cv=5, n_jobs=5, scoring="f1")
    clf.fit(self.train_data_x, self.train_labels_y)
    self.models.append(clf)
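# Follow-up sketch (self-contained, with synthetic data) showing how a search
# like the one above is queried afterwards: GridSearchCV refits the best model,
# so the fitted object can be used directly for prediction.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = make_classification(n_samples=700, random_state=0)
demo = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": [11, 31]},
                    cv=5, scoring="f1")
demo.fit(X_demo, y_demo)
print(demo.best_params_["n_neighbors"], demo.best_score_)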
Example #2
def tuner(clf, parameters, data):
    from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import make_scorer

    labels, features = targetFeatureSplit(data)
    scaler = MinMaxScaler()
    select = SelectKBest()

    steps = [("scale", scaler),
             ("select", select),
             ("classifier", clf)]

    pipeline = Pipeline(steps)

    shuffle = StratifiedShuffleSplit(n_splits=1000, test_size=0.3,
                                     random_state=42)

    # my_score_func is defined elsewhere in the project
    my_scorer = make_scorer(my_score_func)
    scoring_metric = my_scorer

    grid_searcher = GridSearchCV(pipeline, param_grid=parameters,
                                 cv=shuffle, scoring=scoring_metric)

    # The pipeline already applies SelectKBest inside each CV split;
    # fitting the selector on the full data here as well would apply
    # selection twice and leak information into the cross-validation.
    grid_searcher.fit(features, labels)

    print("Cross-validated {0} score: {1}".format(scoring_metric,
                                                  grid_searcher.best_score_))

    print("Params: ", grid_searcher.best_params_)
Example #3
def test_grid_search_precomputed_kernel():
    # Test that grid search works when the input features are given in the
    # form of a precomputed kernel matrix
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
Example #4
def build_and_train():

	data = pd.read_csv('../data/training.csv')
	data = data.dropna(subset=['Gender', 'Married', 'Credit_History', 'LoanAmount'])

	pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\
				'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

	X_train, X_test, y_train, y_test = train_test_split(data[pred_var], data['Loan_Status'], \
														test_size=0.25, random_state=42)
	y_train = y_train.replace({'Y':1, 'N':0}).values
	y_test = y_test.replace({'Y':1, 'N':0}).values

	pipe = make_pipeline(PreProcessing(),
						RandomForestClassifier())

	param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],
				 "randomforestclassifier__max_depth" : [None, 6, 8, 10],
				 "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], 
				 "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

	grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)

	grid.fit(X_train, y_train)

	return grid
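# Why the grid keys above carry the "randomforestclassifier__" prefix:
# make_pipeline names each step after its lowercased class name. A small
# self-contained sketch with stock transformers:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

demo_pipe = make_pipeline(StandardScaler(), RandomForestClassifier())
print(list(demo_pipe.named_steps))  # ['standardscaler', 'randomforestclassifier']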
def tune_parameters(features, labels):
    """ 
        Use GridSearchCV to identify and return the best parameters to use 
            for the Decision Tree algorithm.
        
        features = features list as returned by the targetFeatureSplit script
        labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn import tree
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer
    
    # Make scorer for the GridSearchCV function
    scorer = make_scorer(custom_scorer, greater_is_better = True)
    
    # Parameters names and settings to be used by GridSearchCV
    parameters = [{"criterion": ["gini", "entropy"], 
                   "splitter": ["best", "random"], 
                   "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], 
                   "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                   "min_impurity_split": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5], 
                   "presort": [True, False], 
                   "random_state": [42]}]
    
    # Use GridSearchCV to identify the best parameters
    # K-fold cross-validation is used (100 folds)
    # F1 score from custom_scorer function is used as the evaluator
    clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv = 100, scoring = scorer)
    
    clf.fit(features, labels)
    
    best_parameters = clf.best_params_
    
    return best_parameters
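# `custom_scorer` is defined elsewhere in that project; a minimal sketch,
# assuming (per the comments above) it is an F1-based metric:
from sklearn.metrics import f1_score

def custom_scorer(y_true, y_pred):
    # hypothetical stand-in for the project's actual scoring function
    return f1_score(y_true, y_pred)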
def test_ovo_gridsearch():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovo, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert_true(best_C in Cs)
def svm_hitters_params(to_predict_hitters, x_hitters, hitter_predictions):
	# create lists of parameters to search through
	c = [10**i for i in np.arange(-3,3)]
	gamma = c
	poly_coeff0 = [10**i for i in np.arange(0,3)]

	# finding optimal parameters for svm

	best_params = []

	# preprocess the x values
	x_hitters = preprocessing.scale(x_hitters)

	for col in to_predict_hitters:
	    y = hitter_predictions[col].tolist()
	    x_train, x_test, y_train, y_test = train_test_split(x_hitters, y)
	    
	    svr = svm.SVC()

	    # 'C' is included so the list built above is actually searched;
	    # note that 'coef0' only affects the poly/sigmoid kernels and is
	    # ignored when kernel='rbf'
	    parameters = {'kernel': ['rbf'], 'C': c, 'gamma': gamma, 'coef0': poly_coeff0}
	    clf = GridSearchCV(svr, parameters)
	    clf.fit(x_train, y_train)
	    best_params.append({col:clf.best_params_})
	    
	return best_params
def test_pipeline():
    param_grid = [{'logisticregression__C': [1, 0.1, 10]}]
    pipe = make_pipeline(StandardScaler(),
                         CopyTransformer(),
                         LogisticRegression())
    grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1)
    grid.fit(X, y)
Example #9
def test_stochastic_gradient_loss_param():
    # Make sure the predict_proba works when loss is specified
    # as one of the parameters in the param_grid.
    param_grid = {
        'loss': ['log'],
    }
    X = np.arange(24).reshape(6, -1)
    y = [0, 0, 0, 1, 1, 1]
    clf = GridSearchCV(estimator=SGDClassifier(loss='hinge'),
                       param_grid=param_grid)

    # When the estimator is not fitted, `predict_proba` is not available as the
    # loss is 'hinge'.
    assert_false(hasattr(clf, "predict_proba"))
    clf.fit(X, y)
    clf.predict_proba(X)
    clf.predict_log_proba(X)

    # Make sure `predict_proba` is not available when setting loss=['hinge']
    # in param_grid
    param_grid = {
        'loss': ['hinge'],
    }
    clf = GridSearchCV(estimator=SGDClassifier(loss='hinge'),
                       param_grid=param_grid)
    assert_false(hasattr(clf, "predict_proba"))
    clf.fit(X, y)
    assert_false(hasattr(clf, "predict_proba"))
def plot_cross_val_selection():
    iris = load_iris()
    X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data,
                                                              iris.target,
                                                              random_state=0)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_trainval, y_trainval)
    # grid_scores_ is the old (pre-0.20) results API; newer scikit-learn
    # exposes the same information through cv_results_
    scores = grid_search.grid_scores_[15:]

    best = np.argmax([x.mean_validation_score for x in scores])
    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(scores))
    plt.ylim(0, 1.1)
    for i, score in enumerate(scores):
        marker_cv, = plt.plot([i] * 5, score.cv_validation_scores, '^', c='gray', markersize=5, alpha=.5)
        marker_mean, = plt.plot(i, score.mean_validation_score, 'v', c='none', alpha=1, markersize=10)
        if i == best:
            marker_best, = plt.plot(i, score.mean_validation_score, 'o', c='red', fillstyle="none", alpha=1, markersize=20, markeredgewidth=3)

    plt.xticks(range(len(scores)), [str(score.parameters).strip("{}").replace("'", "") for score in scores], rotation=90);
    plt.ylabel("validation accuracy")
    plt.xlabel("parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best], ["cv accuracy", "mean accuracy", "best parameter setting"], loc=(1.05, .4))
        def build(X, y=None):
            """
            Inner build function that builds a single model.
            :param X:
            :param y:
            :return:
            """
            model = Pipeline([
                ('vectorizer', TfidfVectorizer(
                    tokenizer=self.spacy_tokenizer, preprocessor=None, lowercase=False)),
                ('clf', SVC(C=1,kernel="linear",
                            probability=True,
                            class_weight='balanced'))])

            from sklearn.model_selection import GridSearchCV

            items, counts = np.unique(y, return_counts=True)

            # cap the number of CV folds by the size of the rarest class
            cv_splits = max(2, min(5, np.min(counts) // 5))

            Cs = [0.01, 0.25, 1, 2, 5, 10, 20, 100]
            param_grid = {'clf__C': Cs, 'clf__kernel': ["linear"]}
            grid_search = GridSearchCV(model,
                                       param_grid=param_grid,
                                       scoring='f1_weighted',
                                       cv=cv_splits,
                                       verbose=2,
                                       n_jobs=-1
                                       )
            grid_search.fit(X, y)

            return grid_search
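# Worked example of the fold heuristic above (hypothetical class counts):
# the number of CV splits is bounded below by 2, above by 5, and never
# exceeds one fifth of the rarest class.
import numpy as np
for counts in ([12, 40], [60, 80]):
    print(counts, max(2, min(5, np.min(counts) // 5)))  # -> 2, then 5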
Example #12
def train_sgd(data, result, scoring=None):
    print("train SGDClassifier {}".format(len(data)))
    # no scaling by default; swap in MinMaxScaler() to rescale the data
    scaler = None
    #scaler = preprocessing.MinMaxScaler()
    print("Scale: {}".format(type(scaler)))
    if scaler is not None:
        data = scaler.fit_transform(data)

    #classifier = SGDClassifier(loss="hinge", penalty="l2")
    #classifier.fit(data, result)
    #return scaler, classifier

    parameters = {
        'loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
                 'squared_loss', 'huber', 'epsilon_insensitive',
                 'squared_epsilon_insensitive'),
        'penalty': ('none', 'l2', 'l1', 'elasticnet')
    }
    print(parameters)
    search = GridSearchCV(SGDClassifier(), parameters, scoring=scoring, n_jobs=1)
    search.fit(data, result)
    print("best params: {}".format(search.best_params_))
    print("best score: {}".format(search.best_score_))
    print()
    return scaler, search.best_estimator_.fit(data,result)
Example #13
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit Ridge regression model after searching for the best mu and tau.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values

        sample_weight : float or array-like of shape [n_samples]
            Sample weight

        Returns
        -------
        self : Returns self.
        """
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        y = self._label_binarizer.fit_transform(y)
        if self._label_binarizer.y_type_.startswith('multilabel'):
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))
        else:
            y = column_or_1d(y, warn=False)

        param_grid = {'tau': self.taus, 'lamda': self.lamdas}
        fit_params = {'sample_weight': sample_weight,
                      'check_input': check_input}
        estimator = L1L2TwoStepClassifier(
            mu=self.mu, fit_intercept=self.fit_intercept,
            use_gpu=self.use_gpu, threshold=self.threshold,
            normalize=self.normalize, precompute=self.precompute,
            max_iter=self.max_iter,
            copy_X=self.copy_X, tol=self.tol, warm_start=self.warm_start,
            positive=self.positive,
            random_state=self.random_state, selection=self.selection)
        gs = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid, fit_params=fit_params, cv=self.cv,
            scoring=self.scoring, n_jobs=self.n_jobs, iid=self.iid,
            refit=self.refit, verbose=self.verbose,
            pre_dispatch=self.pre_dispatch, error_score=self.error_score,
            return_train_score=self.return_train_score)
        gs.fit(X, y)
        estimator = gs.best_estimator_
        self.tau_ = estimator.tau
        self.lamda_ = estimator.lamda
        self.coef_ = estimator.coef_
        self.intercept_ = estimator.intercept_
        self.best_estimator_ = estimator  # XXX DEBUG

        if self.classes_.shape[0] > 2:
            ndim = self.classes_.shape[0]
        else:
            ndim = 1
            self.coef_ = self.coef_.reshape(ndim, -1)

        return self
Example #14
    def score_nestedCV(self, G1, model, param_grid, effect, nested):
        k_fold = model_selection.KFold(n_splits=self.n_folds).split(range(self.Y.shape[0]))
        i_fold = 0
        scores = sp.zeros(self.n_folds)
        params = list()

        for train, test in k_fold:
            (trainData, trainY) = self._packData(G1, train, effect)
            (testData, testY) = self._packData(G1, test, effect)

            if nested:
                clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs = self.n_jobs_grid,
                                   cv=self.n_folds_params, scoring=self.scoring, verbose=self.verbose)

                clf.fit(trainData, trainY.flatten())

                params.append(clf.best_params_)

                scores[i_fold] = clf.score(testData, testY.flatten(), method_scorer=False)
            else:

                model.fit(trainData, trainY.flatten())
                scores[i_fold] = SCORERS[self.scoring](model, testData, testY.flatten())
            i_fold += 1

        return scores,params
Example #15
def fit_branchmodel(self, xwl2, hm_y):
    clf = LogisticRegression(C=1., class_weight='balanced', penalty='l1', solver='liblinear', max_iter=300)
    param_grid = dict(C=np.logspace(-.2, 1, 15))
    #gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedKFold(hm_y, n_folds=4), n_jobs=-1, scoring='precision_weighted')
    gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedShuffleSplit(n_splits=50, test_size=.2, random_state=1), n_jobs=-1, scoring='accuracy')
    gridclf.fit(xwl2, hm_y)
    return gridclf.best_estimator_
Example #16
def optimize_model_regress(data, tc):
    train_data = data.sample(frac=.8)
    test_data = data.drop(train_data.index)
    train_y = train_data['temperature']/tc
    train_X = train_data.drop(['T/Tc','temperature'], axis=1)
    test_y = test_data['temperature']/tc
    test_X = test_data.drop(['T/Tc','temperature'], axis=1)

    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1,.5,.1,1e-2,1e-3, 1e-4],
                     'C': [.1,.5, 1,5, 10, 50, 100,500, 1000]},
                    {'kernel': ['linear'], 'C': [.1,.5, 1,5, 10, 50, 100,500, 1000]}]

    model = GridSearchCV(svm.SVR(), tuned_parameters, cv=5)
    model.fit(train_X, train_y)
    print()
    print("Best parameters:")
    print()
    print(model.best_params_)
    print()
    print("Grid scores:")
    print()
    means = model.cv_results_['mean_test_score']
    stds = model.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, model.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()
    y_true, y_pred = test_y, model.predict(test_X)
    print("Mean Absolute Error : " + str(mean_absolute_error(y_pred,y_true)))
    print()
Example #17
def model_selection(
        x_matrix, y_vector, param_grid, cv=None, scoring=None):
    pipeline = Pipeline(
        [('resampler', None), ('classifier', DummyClassifier())])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring)
    grid_search_cv.fit(x_matrix, y_vector)
    return grid_search_cv
def inner_cv_loop(Xtrain, Ytrain, clf, parameters,
                  oversample=None, fa_dims=20,
                  verbose=False):
    """
    use GridSearchCV to find best classifier for training set
    """

    rocscore = {}
    best_est = {}
    facanal = {}
    for fa_d in [0, fa_dims]:
        clfname = 'fa' if fa_d > 0 else "nofa"
        if fa_d > 0:
            facanal[clfname] = FactorAnalysis(fa_d)
            Xtrain = facanal[clfname].fit_transform(Xtrain)
        else:
            facanal[clfname] = None

        if verbose:
            print(clfname)
        gs = GridSearchCV(clf, parameters, scoring='roc_auc')
        gs.fit(Xtrain, Ytrain)
        rocscore[clfname] = gs.best_score_
        best_est[clfname] = gs.best_estimator_

    bestscore = numpy.max([rocscore[i] for i in rocscore.keys()])
    bestclf = [i for i in rocscore.keys() if rocscore[i] == bestscore][0]
    if verbose:
        print('best:', bestclf, bestscore, best_est[bestclf], facanal[bestclf])
    return best_est[bestclf], bestscore, facanal[bestclf]
Example #19
def _search_param(self, metric, X, y):
    '''
    Find best potential parameters set using few n_estimators
    '''
    # Make sure user-specified params are in the grid.
    max_depth_grid = list(np.unique([self.model_instance.max_depth, 5, 7]))
    colsample_bytree_grid = list(
        np.unique([self.model_instance.colsample_bytree, 0.66, 0.9]))
    reg_lambda_grid = list(np.unique([self.model_instance.reg_lambda, 1, 5]))
    param_grid = {
        'max_depth': max_depth_grid,
        'learning_rate': [max(self.model_instance.learning_rate, 0.3)],
        'n_estimators': [min(self.model_instance.n_estimators, 60)],
        'gamma': [self.model_instance.gamma],
        'min_child_weight': [self.model_instance.min_child_weight],
        'max_delta_step': [self.model_instance.max_delta_step],
        'subsample': [self.model_instance.subsample],
        'colsample_bytree': colsample_bytree_grid,
        'colsample_bylevel': [self.model_instance.colsample_bylevel],
        'reg_alpha': [self.model_instance.reg_alpha],
        'reg_lambda': reg_lambda_grid,
        'scale_pos_weight': [self.model_instance.scale_pos_weight],
        'base_score': [self.model_instance.base_score],
        'seed': [self.model_instance.seed]
    }
    grid_search = GridSearchCV(
        self.model_instance, param_grid, cv=2, refit=False, scoring=metric)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    # Change these params back to the original values
    best_params['learning_rate'] = self.model_instance.learning_rate
    best_params['n_estimators'] = self.model_instance.n_estimators
    return best_params
def test_gridsearch():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=2)

    sfs1 = SFS(estimator=knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=5)

    pipe = Pipeline([('sfs', sfs1),
                     ('knn', knn)])

    param_grid = [
        {'sfs__k_features': [1, 2, 3, 4],
         'sfs__estimator__n_neighbors': [1, 2, 3, 4]}
    ]

    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      n_jobs=1,
                      iid=False,
                      cv=5,
                      refit=False)

    gs = gs.fit(X, y)

    assert gs.best_params_['sfs__k_features'] == 3
    def __param_search(self, clasif, train_data):
        """

        :param clasif: classifier to run the CV with. String
        :param train_data: data used for training
        :return: GridSearchCV object

        Reassembles the parameters to use and performs a cross-validation

        """

        param_grid = dict()
        if self.preprocess_data is False:
            pipeline = self.pipelines[clasif]
            self.parameters[clasif].update(parameters.vect_params)
            param_grid = self.parameters[clasif]
        else:
            pipeline = self.pipelines_train[clasif]
            for k in self.parameters[clasif].keys():
                param_grid[k[4:]] = self.parameters[clasif][k]
            print(param_grid)
            print(pipeline)
        print("\n#Searching parameters for ", clasif) if self.v >= 1 else None
        print("Parametros: ", param_grid) if self.v >= 2 else None
        print("train_data: ", train_data) if self.v >= 3 else None
        print("Longitudes: %d %d" % (len(train_data[0]), len(train_data[1]))) if self.v >= 2 else None
        if self.bin_flag:
            try:
                grid_search = GridSearchCV(pipeline, param_grid, verbose=self.v, scoring='roc_auc', refit=True, cv=3)
            except Exception as e:
                os.system('cat %s >> archivo.txt' % e)
        else:
            grid_search = GridSearchCV(pipeline, param_grid, verbose=self.v, scoring='accuracy', refit=True, cv=3)
        grid_search.fit(train_data[0], train_data[1])
        return grid_search
Example #22
def svr_linear(X, Y, x, y):
    # note: 'degree' only affects the poly kernel; it is ignored by kernel='linear'
    reg = GridSearchCV(SVR(kernel='linear'), cv=10, param_grid={"C": [1e0, 1e1, 1e2, 1e3], "degree": [1, 2, 3, 4]})
    reg.fit(X, Y)
    y_predict = reg.predict(x)
    rmse = RMSE(y=y, y_predict=y_predict)
    print("rmse: ", str(rmse))
    return rmse, y_predict
Example #23
def PipeFeauture(Xtrain, Ytrain):
    from pprint import pprint
    from time import time

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        #'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        #'tfidf__use_idf': (True, False),
        #'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
        #'clf__n_iter': (10, 50, 80),
    }
    
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(Xtrain, Ytrain)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Example #24
def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=.2, random_state=0)

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'svc__loss': ('hinge', 'squared_hinge')
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset every parameter setting converges to a 100%
    # accuracy model, so the tie is broken in favour of the first
    # candidate, the unigram representation
    assert_equal(grid_search.best_score_, 1.0)
    best_vectorizer = grid_search.best_estimator_.named_steps['vect']
    assert_equal(best_vectorizer.ngram_range, (1, 1))
def logistic_regression(self):
    C_array = [2**i for i in range(-10, 10)]
    tuned_parameters = {'C': C_array}
    logi_reg = LogisticRegression()
    clf = GridSearchCV(logi_reg, tuned_parameters, cv=5, scoring="recall")  # or scoring=make_scorer(my_scorer)
    clf.fit(self.train_data_x, self.train_labels_y)
    self.models.append(clf)
Example #26
def kernel_ridge_linear(X, Y, x, y):
    reg = GridSearchCV(KernelRidge(kernel='linear'), cv=10, param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "degree": [1, 2, 3, 4]})
    reg.fit(X, Y)
    y_predict = reg.predict(x)
    rmse = RMSE(y=y, y_predict=y_predict)
    print("rmse: ", str(rmse))
    return y_predict
Example #27
def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(np.in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score'
                                             % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
Example #28
accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{
    'C': [1, 2, 3, 4],
    'kernel': ['linear']
}, {
    'C': [1, 2, 3, 4],
    'kernel': ['rbf'],
    'gamma': [0.67, 0.68, 0.69, 0.675, 0.685]
}]
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
def get_grid_result(self, param_grid):
    model = KerasClassifier(build_fn=self.build_fn, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
    clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=self.split)
    grid_result = clf.fit(self.train_x, self.train_y)
    return grid_result
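# A hypothetical grid for the wrapped Keras model above; keys must match
# either KerasClassifier fit arguments or parameters of self.build_fn:
example_param_grid = {
    'batch_size': [16, 32],
    'epochs': [10, 20],
}
# grid_result = self.get_grid_result(example_param_grid)
# print(grid_result.best_score_, grid_result.best_params_)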
Example #30
def A2_SVM_ParameterTuning(x_train, y_train):
    param_grid = {'kernel': ('linear', 'poly', 'rbf'), 'C': [1, 10]}
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=4)  # GridSearchCV
    grid.fit(x_train, y_train)
    return grid.best_params_
Example #31
kfold = StratifiedKFold(n_splits=10)

rf_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 6],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [50, 100, 300, 500],
    "criterion": ["gini", "entropy"]
}

from sklearn.model_selection import GridSearchCV
gsRFC = GridSearchCV(classifier,
                     param_grid=rf_param_grid,
                     cv=kfold,
                     scoring="accuracy",
                     n_jobs=-1,
                     verbose=1)

gsRFC.fit(X_train, y_train)

RFC_best = gsRFC.best_estimator_

gsRFC.best_score_

# Generate competition submission
X_comp, y_none = pre_process(comp_test_dataset)

comp_test_pred = gsRFC.predict(X_comp)

comp_output = pd.DataFrame(data=np.append(comp_test_dataset[["PassengerId"
# Model
estimator = RandomForestRegressor(n_estimators=250, criterion='mse',
                                  n_jobs=15, verbose=1, random_state=0)

pipeline = Pipeline([
    ('imputation', make_union(SimpleImputer(strategy="median"),
                              MissingIndicator(error_on_new=False))),
    ('estimator', estimator)])

cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)

param_grid = {'estimator__max_depth': [5, 10, 20, 40, None],
              'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]}

grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                           cv=5, verbose=2, n_jobs=15)
metrics = []


def predict_collect_save(data_pred, data_collect, y_true, test_index,
                         split, save_type):
    scores = {}
    pred_ = grid_search.predict(data_pred)
    y_true_ = y_true.iloc[test_index]

    predictions = pd.DataFrame(pred_, columns=['predicted'],
                               index=y_true_.index)
    predictions['true'] = y_true_
    predictions['test_indices'] = pd.DataFrame(test_index,
                                               columns=['test indices'],
                                               index=y_true_.index)
Example #33
# We create an instance of the model.
Estimator_DTree = DecisionTreeClassifier()

# Now, we are going to use a grid search cross-validation to explore combinations of parameters.
param_grid = {
    'criterion': ['gini'],
    'max_features': ['auto'],
    'splitter': ['random', 'best'],
    'min_samples_split': [25, 30, 35, 40, 45],
    'max_depth': range(4, 6),
    'random_state': [0]
}

Grid_DTree = GridSearchCV(Estimator_DTree,
                          param_grid,
                          cv=10,
                          verbose=2,
                          scoring='f1')
Grid_DTree.fit(X_train, Y_train)

# Once it has been fitted, we get several parameters.

#print("ParameterGrid: ",'\n',list(ParameterGrid(param_grid)),'\n')
print("Best estimator: ", Grid_DTree.best_estimator_, '\n')
print("Best Score: ", round(Grid_DTree.best_score_, 2))

print("Best Parameters ", Grid_DTree.best_params_)

# Now we go back and retrieve the best estimator found by the grid search so we can refit with it.

Best_Grid_estimator_DTree = Grid_DTree.best_estimator_
    return normalized_gini

gini_scorer = make_scorer(normalized_gini, greater_is_better=True)

cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2)

gsc = GridSearchCV(
    estimator=rf,
    param_grid={
        #'class_weight': [{0: 1, 1: x} for x in range(300, 701, 100)] + ['balanced'],
        #'min_samples_leaf': range(5,51,5),
        #'min_samples_split': range(5,56,10),
        #'n_estimators': range(200, 601, 200),
        #'criterion': ('gini', 'entropy')
        #'max_features': ('auto', 'sqrt'),
        #'max_features': range(3, 9),
        #'max_depth': range(3, 7),
    },
    #scoring='neg_log_loss',
    scoring='roc_auc',
    #scoring='f1',
    #scoring=gini_scorer,
    cv=cv,
    verbose=2
)

rsc = RandomizedSearchCV(
     estimator=rf,
     param_distributions={
         'n_estimators': randint(250, 2500),
         #'class_weight': [{0: 1, 1: x} for x in range(15, 51, 5)],
Example #35
        environ["PYTHONWARNINGS"] = "ignore"  # Also affects subprocesses (n_jobs > 1)

        # Linear SVM
        print("\rLinear SVM         ", end='')
        parameters = {'C': [0.01, 0.1, 1, 10, 100]}
        # svc = svm.LinearSVC(class_weight=args.class_weight, random_state=seed)
        svc = svm.SVC(kernel='linear',
                      class_weight=args.class_weight,
                      random_state=args.seed,
                      probability=True,
                      max_iter=max_iters)
        clf = GridSearchCV(svc,
                           parameters,
                           cv=sss,
                           n_jobs=-1,
                           scoring=scoring,
                           refit='roc_auc',
                           return_train_score=True)
        try:
            clf.fit(data, labels)
        except Exception as e:
            if hasattr(e, 'message'):
                print(e.message)
            else:
                print(e)

        save_results(args.savefile, 'a', 'Linear SVM', clf)

        # RBF SVM
        print("\rRBF SVM             ", end='')
from sklearn.preprocessing import MinMaxScaler
Scaler = MinMaxScaler()
X_train_scaler = Scaler.fit_transform(X_train)
X_test_scaler = Scaler.transform(X_test)


# In[ ]:



from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,roc_auc_score,precision_score,recall_score
LoR = LogisticRegression(random_state=0)
c_values = {'C':[1,15,10,100,150,250]}
grdClf = GridSearchCV(LoR,param_grid=c_values,scoring='precision')
grdClf.fit(X_train_scaler,y_train)
y_decs = grdClf.decision_function(X_test_scaler)
(grdClf.best_params_,grdClf.best_score_,roc_auc_score(y_test,y_decs))


# Performing Cross validation to evaluate the model

# In[ ]:


from sklearn.model_selection import cross_val_score
cross_val_score(DTClf,X_train,y_train,cv=5,scoring='precision')


# In[ ]:

print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))


confusion_matrix(y_test,y_pred)


# From our confusion matrix we can see that accuracy is pretty low.

# In[35]:


from sklearn.model_selection import GridSearchCV
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]
param_grid = {'tol':tol, 'max_iter': max_iter}


# In[38]:


grid_model = GridSearchCV(logreg,param_grid,cv = 5)
rescaledX = scaler.fit_transform(X)
grid_model_result = grid_model.fit(rescaledX,y)
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print(best_score, best_params)


# Best Score : 0.186
from sklearn.pipeline import Pipeline

# TODO: 1. Generate the dataset and split it into training and test sets
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5, random_state=16)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16)

# TODO: 2. Create the pipeline model with a scaler preprocessing step and an MLP neural-network step
params = {
    'mlp__hidden_layer_sizes': [[50], [100], [100, 100]],
    'mlp__alpha': [0.0001, 0.001, 0.01, 0.1]
}

pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                           ('mlp', MLPClassifier(max_iter=1600,
                                                 random_state=16))],
                    verbose=0)

# TODO: 3. Create the grid search model and print the results
grid = GridSearchCV(pipeline,
                    param_grid=params,
                    cv=5,
                    iid=False,
                    n_jobs=8,
                    verbose=1)
grid.fit(X_train, y_train)
print('Cross-validation score: {:.2f}'.format(grid.best_score_))
print('Best model parameters: {}'.format(grid.best_params_))
print('Test set score: {}'.format(grid.score(X_test, y_test)))

print(pipeline.steps)
Example #39
def model_selection(X, y, rep=10, random_state=42):
    """ Uses grid search and cross validation to choose the best clf for the task (X,y)"""

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # models = [
    # 		 ## ("XGB",XGBClassifier(seed=random_state,objective = "multi:softmax")),
    # 		  ("NaiveBayes",GaussianNB()),
    # 		  ("RF",RandomForestClassifier(random_state=random_state)),
    #           ("AdaBoost",AdaBoostClassifier(random_state=random_state)),
    #           ("LR",LogisticRegression(random_state=random_state)),
    #           ## ("linSVC",LinearSVC(multi_class="ovr")),
    #           ("SVC",SVC(random_state=random_state,probability=True)),
    #           ("MLP",MLPClassifier(random_state=random_state))
    #           # ("KNN",KNeighborsClassifier())
    #           ]

    # hyperparameters = [
    # 					 ## [("max_depth",[15,20]),("n_estimators",[100,200])],
    # 					 [],
    # 					 [("max_depth",[5,6,10])],
    # 			  	     [("n_estimators",[10,50,100])],
    # 			  	     [("C",[1.0,0.95,0.9])],
    # 					 ## [],
    # 					 [],
    # 					 [("hidden_layer_sizes",[(200,),(150,),(100,100),(300,)])]
    # 			  	     # [("n_neighbors",[5,6,7,10])]
    # 			  	]

    best_est = None
    best_score = 0.0
    results_summary = []
    all_models = []
    for model, hyperp_setting in zip(models_clfs, hyperparameters_clfs):
        print("Fitting " + model[0])
        pipeline = Pipeline([model])
        # pipeline = Pipeline([("scaling",StandardScaler()),model])
        param_grid = {}
        for param in hyperp_setting:
            param_grid[model[0] + "__" + param[0]] = param[1]
        grid_search = GridSearchCV(pipeline,
                                   param_grid=param_grid,
                                   verbose=True,
                                   scoring="f1_weighted",
                                   cv=3,
                                   n_jobs=5)
        grid_search.fit(X_train, y_train)

        clf = grid_search.best_estimator_
        scores = []
        np.random.seed(random_state)
        for i in range(0, rep):
            rows = np.random.randint(2, size=len(X_train)).astype('bool')
            clf.fit(X_train[rows], y_train[rows])
            preds = clf.predict(X_test)
            scores.append(f1_score(y_test, preds, average='weighted'))

        results_summary.append([model, scores])
        print(results_summary[-1])
        avg_score = pd.DataFrame(scores).mean()[0]
        if (avg_score > best_score):
            best_score = avg_score
            best_est = clf
        clf.fit(X, y)
        all_models.append([model[0], clf])

    y_pred = best_est.predict(X_test)
    rocs = []
    preds_score = best_est.predict_proba(X_test)
    for i in range(0, len(best_est.classes_)):
        correct_class = best_est.classes_[i]
        fpr, tpr, _ = roc_curve(np.asarray(y_test),
                                [p[i] for p in preds_score],
                                pos_label=correct_class)
        roc_df = pd.DataFrame(tpr, columns=["tpr"]).join(
            pd.DataFrame(fpr, columns=["fpr"])).join(
                pd.DataFrame([correct_class] * len(tpr), columns=["class"]))
        rocs.append(roc_df)
    rocs_df = pd.concat(rocs)

    #using whole data after cv
    best_est.fit(X, y)

    return best_est, best_score, confusion_matrix(
        y_test, y_pred), rocs_df, results_summary, all_models
    # model details
    model_name = f'{args.model}_t{args.topic_num:02d}_l{args.max_seq_len}_d{args.drop_out}'
    model_group = models_info[args.model]['group']
    params = models_info[args.model]['params']

    # model training
    print(f'{model_name} training...')

    if model_group == 'sklearn':
        # build models
        model = build_SKM(model_type=args.model,
                          max_features=20000,
                          selectK=10000)

        # grid search
        clf = GridSearchCV(model, params, cv=3, n_jobs=-1)
        clf.fit(X_train, y_train)

        # train with best params
        model.set_params(**clf.best_params_)
        model.fit(X_train, y_train)

        # test and save
        save_file(model, 'models', model_name)

        results = model_evaluation(X_test, y_test, model=model)

    if model_group == 'keras':
        # preprocessing
        X_train, word_index, doc2seq = tokenizer_transform(
            X_train,
    #               #'model__beta_2' : [1e-7, 1e-8, 1e-9]
    #
    # }

    # Two options:
    #   1) Grid search to find best model
    #   2) Train best model and save to disk

    # Option 1) Grid search to find best model
    if args.type == "gridsearch":
        print("Start Grid search...")

        X_train, X_test, y_train, y_test = train_test_split(
            V, labels.values.ravel(), test_size=0.3, random_state=0)

        grid = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1, verbose=1)

        #grid = grid.fit(V, labels.values.ravel())
        grid = grid.fit(X_train, y_train)

        y_pred = grid.best_estimator_.predict(X_test)

        print("\nBest: %f using %s" % (grid.best_score_, grid.best_params_))

        print("F1-Score:", f1_score(y_test, y_pred))
        print("Precision: ", precision_score(y_test, y_pred))
        print("Recall: ", recall_score(y_test, y_pred))
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("roc auc: ", roc_auc_score(y_test, y_pred))
        print("Performance overall: ")
        print(classification_report(y_test, y_pred))
                                   random_state=42)

rnd_search_cv.fit(train_prepared, price_labels)

#%%
rnd_search_cv.best_estimator_

#%%

#Ridge Regression
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(train_prepared, price_labels)
print(ridge_reg.predict(test_prepared))
#%%

#Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': (1000, 10, 100), 'max_features': [15, 5, 10]}
forest = RandomForestRegressor(random_state=0)

gsearch = GridSearchCV(forest, parameters, cv=5)
gsearch.fit(X_train, y_train)
#%%
gsearch.best_estimator_
gsearch.best_score_

#%%
Example #43
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores("Random Forest Regression", forest_rmse_scores)

#SupportVectorRegressor Validation
sv_scores = cross_val_score(sv_reg,
                            X_train,
                            Y_train,
                            scoring="neg_mean_squared_error",
                            cv=10)
sv_rmse_scores = np.sqrt(-sv_scores)
display_scores("Support Vector Regression", sv_rmse_scores)

#8.Fine-Tune RandomForestRegressor Model using Grid Search
param_grid = [{'n_estimators': [10, 20, 30, 40], 'max_features': [4, 8]}]
grid_search = GridSearchCV(forest_reg,
                           param_grid,
                           cv=10,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train, Y_train)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
print("Best Parameter: ")
print(grid_search.best_params_)

final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("Final Prediction: ")
print(final_rmse)
Example #44
class GradientBoost:
    def __init__(self):
        """
        Don't pass in anything
        """
        self.m = GradientBoostingClassifier()

    def log_loss_score(self, y_true, y_pred):
        """
        input:
            y_true, y_pred: 1d arrays of size n
        output:
            float: log loss score. It's a negative number, closer to zero is better.
        """
        return -log_loss(y_true, y_pred)

    def _change_data(self, old_df):
        #return jconvert(a_convert(old_df))
        """
        take dataframe, output dataframe
        """
        return max_data_pipeline(jconvert(a_convert(do_it(old_df))))

    def fit(self, data):
        """
        data is a dataframe
        no output
        """

        self.data = self._change_data(data)

        self.X = self.data.drop(['fraud'], axis=1).values
        self.y = self.data['fraud'].values

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y)
        self.m.fit(self.X_train, self.y_train)

    def optimize(self, param_grid, n_jobs=-1, cv=5):
        """
        run grid search to change model
        """
        self.grid = GridSearchCV(self.m,
                                 param_grid=param_grid,
                                 cv=cv,
                                 n_jobs=n_jobs,
                                 scoring='neg_log_loss')
        self.grid.fit(self.X_train, self.y_train)
        self.m = self.grid.best_estimator_

    def score(self):
        return self.log_loss_score(self.y_test,
                                   self.m.predict_proba(self.X_test))

    def predict(self, df):
        X = self._change_data(df).values
        return self.m.predict(X)

    def predict_proba(self, df):
        converted = self._change_data(df)
        default_col = pd.Series([0] * len(converted))
        for col in self.data.columns:
            if col not in converted.columns and col != 'fraud':
                converted[col] = default_col
        return self.m.predict_proba(converted.values)
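# A hypothetical grid for GradientBoost.optimize(); the keys follow
# sklearn's GradientBoostingClassifier parameters:
example_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 3],
}
# model = GradientBoost(); model.fit(raw_df); model.optimize(example_grid)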
Example #45
def cross_validate(featnames,
                   tasknames,
                   cvs,
                   classifiers,
                   gps,
                   logger,
                   n_jobs,
                   npy_suffix='',
                   mid_layer=4):
    '''featnames: list of string, ['mine', 'mfcc']

    - tasknames = list of stringm ['ballroom_extended', 'gtzan_genre', 'gtzan_speechmusic',
                                   'emoMusic', 'jamendo_vc', 'urbansound']
    - cvs: list of cv, 10 for rest, split arrays for urbansound and jamendo_vd

    - classifier: list of classifier class, e.g [KNeighborsClassifier, SVC]

    - gps: list of gp, e.g. [{"n_neighbors":[1, 2, 8, 12, 16]}, {"C":[0.1, 8.0], "kernel":['linear', 'rbf']}]

    - mid_layer: scalar, or list of scalar .

    '''

    np.random.seed(1209)

    if not isinstance(mid_layer, list):
        mid_layer = [mid_layer]
    logger.info('')
    logger.info('--- Cross-validation started for {} ---'.format(''.join(
        [str(i) for i in mid_layer])))
    for featname in featnames:
        logger.info(' * feat_name: {} ---'.format(featname))
        for classifier, gp in zip(classifiers, gps):
            clname = classifier.__name__
            logger.info('   - classifier: {} ---'.format(clname))
            for taskname, cv in zip(tasknames, cvs):
                logger.info('     . task: {} ---'.format(taskname))
                model_filename = 'clf_{}_{}_{}.cP'.format(
                    featname, taskname, clname)
                x, y = load_xy_many(taskname,
                                    featname,
                                    npy_suffix,
                                    logger,
                                    mid_layer=mid_layer)
                estimators = [('stdd', OptionalStandardScaler()),
                              ('clf', classifier())]
                pipe = Pipeline(estimators)

                if isinstance(gp, dict):  # k-nn or svm with single kernel
                    params = {'stdd__on': [True, False]}
                    params.update({
                        'clf__' + key: value
                        for (key, value) in gp.items()
                    })
                elif isinstance(
                        gp,
                        list):  # svm: grid param can be a list of dictionaries
                    params = []
                    for dct in gp:  # should be dict of list for e.g. svm
                        sub_params = {'stdd__on': [True, False]}
                        sub_params.update({
                            'clf__' + key: value
                            for (key, value) in dct.items()
                        })
                        params.append(sub_params)

                clf = GridSearchCV(pipe,
                                   params,
                                   cv=cv,
                                   n_jobs=n_jobs,
                                   pre_dispatch='8*n_jobs').fit(x, y)
                logger.info('     . best score {}'.format(clf.best_score_))
                logger.info(clf.best_params_)
                print('best score of {}, {}, {}: {}'.format(
                    featname, taskname, clname, clf.best_score_))
                print(clf.best_params_)
                cP.dump(clf, open(os.path.join(PATH_CLS, model_filename), 'wb'))
                featname_midlayer = '{}_{}'.format(
                    featname, ''.join([str(i) for i in mid_layer]))
                save_result(featname_midlayer, taskname, clname,
                            clf.best_score_)
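# A minimal invocation sketch based on the docstring above (the logger,
# features and task data are assumed to be prepared elsewhere):
# cross_validate(featnames=['mfcc'],
#                tasknames=['gtzan_genre'],
#                cvs=[10],
#                classifiers=[KNeighborsClassifier, SVC],
#                gps=[{"n_neighbors": [1, 2, 8, 12, 16]},
#                     {"C": [0.1, 8.0], "kernel": ['linear', 'rbf']}],
#                logger=logger,
#                n_jobs=4)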
Example #46
import pandas as pd

(X, y), _ = load_preproccessed_dataset(test_split=0.0, include_grades=True)

hyperparams = {
    'n_estimators': [
        100,
        500,
    ],
    'max_depth': [
        3,
        None,
    ],
    'min_samples_leaf': [
        1,
        0.05,
    ],
}

clf = GridSearchCV(estimator=RandomForestClassifier(),
                   param_grid=hyperparams,
                   cv=10)
clf.fit(X, y)

cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[[
    *(f'param_{p}' for p in hyperparams.keys()), 'mean_fit_time',
    'mean_test_score', 'std_test_score'
]])
# In[60]:


y_pred_c


# # GridSearchCV

# In[63]:


parameters = [{'gamma': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1],
               'C': [0.1, 0.2, 0.25, 0.5, 1, 1.5, 2]}]
               #'nu': [0.75, 0.8, 0.85, 0.9, 0.95, 0.97]}]
reg1 = GridSearchCV(SVR(kernel='rbf', tol=0.01), parameters, cv=5, scoring='neg_mean_absolute_error')
reg1.fit(x_train, y_train.flatten())
y_pred1 = reg1.predict(x_train)

print("Best CV score: {:.4f}".format(reg1.best_score_))
print(reg1.best_params_)
#print(y_pred1)


Example #48
# open a file where you want to store the data
file = open('regression_model.pkl', 'wb')

# dump information to that file
pickle.dump(reg, file)

########################## Ridge Regression #######################
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40]
}
ridge_regressor = GridSearchCV(ridge,
                               parameters,
                               scoring='neg_mean_squared_error',
                               cv=5)
ridge_regressor.fit(X, y)

print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)
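# Under scoring='neg_mean_squared_error', best_score_ is a negated MSE;
# a common follow-up is converting it into an RMSE:
import numpy as np
print("Ridge RMSE:", np.sqrt(-ridge_regressor.best_score_))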

########################## Lasso Regression #######################
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

lasso = Lasso()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40]
}
lasso_regressor = GridSearchCV(lasso,
Example #49
#                           random_state=100, topic_word_prior=None,
#                           total_samples=1000000.0, verbose=0)

# Log Likelihood: higher is better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())
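# Sketch of the perplexity relation quoted above, assuming data_vectorized
# is a document-term count matrix: score() returns the total log-likelihood,
# so perplexity ~= exp(-log_likelihood / total_word_count) (up to
# implementation details of sklearn's variational bound).
import numpy as np
log_likelihood = lda_model.score(data_vectorized)
total_words = data_vectorized.sum()
print(np.exp(-log_likelihood / total_words))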

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
# Echoed repr of the fitted GridSearchCV object (from an older scikit-learn
# version; note the grid shown there is keyed on 'n_topics'):
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                                                 evaluate_every=-1, learning_decay=0.7, learning_method=None,
                                                 learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
                                                 mean_change_tol=0.001, n_components=10, n_jobs=1,
                                                 n_topics=None, perp_tol=0.1, random_state=None,
                                                 topic_word_prior=None, total_samples=1000000.0, verbose=0),
             fit_params=None, iid=True, n_jobs=1,
             param_grid={'n_topics': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)

Example #50
def findClassifierParameters(clumpsImg,
                             classesIntCol,
                             variables,
                             preProcessor=None,
                             gridSearch=GridSearchCV(RandomForestClassifier(),
                                                     {})):
    """
Find the optimal parameters for a classifier using a grid search and return a classifier instance with those optimal parameters.

:param clumpsImg: is the clumps image on which the classification is to be performed
:param classesIntCol: is the column with the training data as int values
:param variables: is an array of column names which are to be used for the classification
:param preProcessor: is a scikit-learn processor such as sklearn.preprocessing.MaxAbsScaler() which can rescale the input variables independently as read in (Default: None; i.e., not in use).
:param gridSearch: is an instance of GridSearchCV parameterised with a classifier and parameters to be searched.

:return: Instance of the classifier with optimal parameters defined.

Example::

    from rsgislib.classification import classratutils
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MaxAbsScaler
    
    clumpsImg = "./LS8_20150621_lat10lon652_r67p233_clumps.kea"
    classesIntCol = 'ClassInt'
    
    classParameters = {'kernel':['linear', 'rbf',  'poly', 'sigmoid'], 'C':[1, 2, 3, 4, 5, 10, 100, 400, 500, 1e3, 5e3, 1e4, 5e4, 1e5], 'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 'auto'], 'degree':[2, 3, 4, 5, 6, 7, 8], 'class_weight':['', 'balanced'], 'decision_function_shape':['ovo', 'ovr', None]}
    variables = ['BlueRefl', 'GreenRefl', 'RedRefl', 'NIRRefl', 'SWIR1Refl', 'SWIR2Refl']
    
    gSearch = GridSearchCV(SVC(), classParameters)
    classifier = classratutils.findClassifierParameters(clumpsImg, classesIntCol, variables, preProcessor=MaxAbsScaler(), gridSearch=gSearch)

"""
    # Check gdal is available
    if not haveGDALPy:
        raise Exception(
            "The GDAL python bindings required for this function could not be imported\n\t"
            + gdalErr)
    # Check numpy is available
    if not haveNumpy:
        raise Exception(
            "The numpy module is required for this function could not be imported\n\t"
            + numErr)
    # Check rios rat is available
    if not haveRIOSRat:
        raise Exception(
            "The RIOS rat tools are required for this function could not be imported\n\t"
            + riosRatErr)
    # Check scikit-learn pre-processing is available
    if not haveSKLearnPreProcess:
        raise Exception(
            "The scikit-learn pre-processing tools are required for this function could not be imported\n\t"
            + sklearnPreProcessErr)
    # Check scikit-learn Grid Search is available
    if not haveSKLearnGS:
        raise Exception(
            "The scikit-learn grid search tools are required for this function could not be imported\n\t"
            + sklearnGSErr)

    ratDataset = gdal.Open(clumpsImg, gdal.GA_Update)
    numpyVars = []
    for var in variables:
        print("Reading " + var)
        tmpArr = rat.readColumn(ratDataset, var)
        if not preProcessor is None:
            tmpArr = tmpArr.reshape(-1, 1)
            tmpArr = preProcessor.fit_transform(tmpArr)
            tmpArr = tmpArr.reshape(-1)
        numpyVars.append(tmpArr)

    # Read in training classes
    classesInt = rat.readColumn(ratDataset, classesIntCol)

    xData = numpy.array(numpyVars)
    xData = xData.transpose()
    xData = numpy.where(numpy.isfinite(xData), xData, 0)

    print("Input data size: {} x {}".format(xData.shape[0], xData.shape[1]))

    trainingData = xData[numpy.isfinite(xData).all(axis=1)]
    classesInt = classesInt[numpy.isfinite(xData).all(axis=1)]

    trainingData = trainingData[classesInt > 0]
    classesInt = classesInt[classesInt > 0]

    print("Training data size: {} x {}".format(trainingData.shape[0],
                                               trainingData.shape[1]))
    print("Training data IDs size: {}".format(classesInt.shape[0]))

    classIDs = numpy.unique(classesInt)
    print("Class IDs: {}".format(classIDs))
    for class_id in classIDs:
        print("Class {} has {} samples.".format(
            class_id, classesInt[classesInt == class_id].shape[0]))

    gridSearch.fit(trainingData, classesInt)
    if not gridSearch.refit:
        raise Exception("Grid search did not refit a best estimator "
                        "(refit=False) and therefore failed.")

    print("Best score was {} and has parameters {}.".format(
        gridSearch.best_score_, gridSearch.best_params_))

    return gridSearch.best_estimator_
# Optional scaling step (commented out; `sc` would be a fitted scaler such as
# sklearn.preprocessing.StandardScaler):
# X = sc.fit_transform(X)
# X_test = sc.transform(X_test)

################################################################################
# logistic regression
################################################################################
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
import numpy as np

# liblinear supports both the 'l1' and 'l2' penalties searched below
logistic = linear_model.LogisticRegression(solver='liblinear')
# Create regularization penalty space
penalty = ['l1', 'l2']
# Create regularization hyperparameter space
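# np.logspace(0, 4, 10) yields 10 candidate C values from 1e0 to 1e4, evenly
# spaced on a log scale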
C = np.logspace(0, 4, 10)
# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)
# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0, n_jobs=-1)
# Fit grid search
best_model = clf.fit(X, y)
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])
# prediction on test data using the best model
# y_pred = best_model.predict(X_test)

################################################################################
# random forest regression
################################################################################
# =============================================================================
# from sklearn.ensemble import RandomForestRegressor
# =============================================================================

# --------------------------------
# (reconstructed opener -- the head of this snippet was truncated)
pipe_knn_params = {
    'knn__n_neighbors': [3, 5, 7],  # assumed values
    'knn__weights': ['uniform', 'distance']
}
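
# pipe_knn and the pipe_lr / pipe_rf / pipe_gbc definitions referenced by the
# grid searches below also did not survive in this listing; the sketches here
# are assumptions following the same single-step Pipeline pattern as pipe_ext,
# with illustrative stand-in grids rather than the originals.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.model_selection import GridSearchCV

pipe_knn = Pipeline([
    ('knn', KNeighborsClassifier())
])

pipe_lr = Pipeline([
    ('lr', LogisticRegression(max_iter=1000))
])
pipe_lr_params = {
    'lr__C': [0.1, 1.0, 10.0]  # assumed grid
}

pipe_rf = Pipeline([
    ('rf', RandomForestClassifier())
])
pipe_rf_params = {
    'rf__n_estimators': [100, 150, 200],  # assumed grid
    'rf__max_depth': [None, 2, 4]
}

pipe_gbc = Pipeline([
    ('gbc', GradientBoostingClassifier())
])
pipe_gbc_params = {
    'gbc__n_estimators': [100, 150],  # assumed grid
    'gbc__learning_rate': [0.05, 0.1]
}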

# --------------------------------
pipe_ext = Pipeline([
    ('ext', ExtraTreesClassifier())
])
pipe_ext_params = {
    'ext__n_estimators': [100, 150, 200],
    'ext__max_depth': [None, 1, 2, 3, 4],
    'ext__min_samples_split': [2, 3, 4],
    'ext__min_samples_leaf': [1, 2, 3, 4]
}


gs_lr = GridSearchCV(pipe_lr, pipe_lr_params, cv=3, verbose=0, n_jobs=-1)
gs_rf = GridSearchCV(pipe_rf, pipe_rf_params, cv=3, verbose=0, n_jobs=-1)
gs_gbc = GridSearchCV(pipe_gbc, pipe_gbc_params, cv=3, verbose=0, n_jobs=-1)
gs_knn = GridSearchCV(pipe_knn, pipe_knn_params, cv=3, verbose=0, n_jobs=-1)
gs_ext = GridSearchCV(pipe_ext, pipe_ext_params, cv=3, verbose=0, n_jobs=-1)



gs_lr.fit(X_t_train_sc, y_t_train)
gs_rf.fit(X_t_train_sc, y_t_train)
gs_gbc.fit(X_t_train_sc, y_t_train)
gs_knn.fit(X_t_train_sc, y_t_train)
gs_ext.fit(X_t_train_sc, y_t_train)


df = pd.DataFrame({'default.payment.next.month': next_month.index, 'values': next_month.values})
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so CJK labels render correctly
plt.figure(figsize=(6, 6))
plt.title('Credit card default clients\n (default: 1, non-default: 0)')
sns.set_color_codes("pastel")
sns.barplot(x='default.payment.next.month', y="values", data=df)
locs, labels = plt.xticks()
plt.show()
# Feature selection: drop the ID column and the target column
data.drop(['ID'], inplace=True, axis=1)  # the ID column has no predictive value
target = data['default.payment.next.month'].values
columns = data.columns.tolist()
columns.remove('default.payment.next.month')
features = data[columns].values
# Hold out 30% as the test set; the rest is used for training
train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.30, stratify=target, random_state=1)


# Classifier
ada = AdaBoostClassifier(random_state=1)
# Parameters to tune
parameters = {'n_estimators': [10, 50, 100]}

# Use GridSearchCV for hyperparameter tuning
clf = GridSearchCV(estimator=ada, param_grid=parameters, scoring='accuracy')

clf.fit(train_x, train_y)
print("GridSearch best parameters:", clf.best_params_)
print("GridSearch best score: %0.4lf" % clf.best_score_)
predict_y = clf.predict(test_x)
print("Accuracy: %0.4lf" % accuracy_score(test_y, predict_y))
Example #54
pipeline = Pipeline([
    ('bow',
     CountVectorizer(strip_accents='ascii',
                     stop_words='english',
                     lowercase=True)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier',
     MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
# this is where we define the values for GridSearchCV to iterate over
parameters = {
    'bow__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'classifier__alpha': (1e-2, 1e-3),
}
# do 10-fold cross validation for each of the 8 possible combinations of the above params
grid = GridSearchCV(pipeline, cv=10, param_grid=parameters, verbose=1)
grid.fit(X_train, y_train)
# summarize results
print("\nBest Model: %f using %s" % (grid.best_score_, grid.best_params_))
print('\n')
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("Mean: %f Stdev:(%f) with: %r" % (mean, stdev, param))

X_test = combi['tidy_tweet'][31962:]
# save best model to current working directory
joblib.dump(grid, "twitter_sentiment.pkl")
# load from file and predict using the best configs found in the CV step
model_NB = joblib.load("twitter_sentiment.pkl")
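# Minimal usage sketch (assumption: X_test holds raw tweet text as above; the
# pickled object is the fitted grid search over the bow -> tfidf -> classifier
# pipeline, so it accepts raw strings directly):
predictions_NB = model_NB.predict(X_test)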
Example #55
def get_predict_model():
    parent = root + '/type_err_feature/'
    types = os.listdir(parent)
    for t in types:
        print(t)
        if not os.path.exists(parent + t + '/gdbt.model'):
            #if True:
            data = pd.read_csv(parent + t + '/err_feature.csv',
                               encoding='utf-8',
                               usecols=[
                                   'result', 'YEAR_USE', 'dow', 'doy', 'month',
                                   'hour', 'result_before_1', 'result_before_7'
                               ])
            n_estimators = range(40, 81, 10)
            min_sample_split = range(20, 81, 5)
            if data.shape[0] > 1000000:
                n_estimators = range(250, 501, 50)
                min_sample_split = range(120, 241, 30)
            elif data.shape[0] > 500000:
                n_estimators = range(100, 351, 50)
                min_sample_split = range(60, 141, 20)
            col = [
                c for c in data.columns.tolist()
                if c not in ['TIME', 'PAR_ROOM', 'NE_OBJ_ID', 'result']
            ]
            trainX, testX, train_y, test_y = train_test_split(
                data[col],
                data['result'],
                test_size=0.3,
                random_state=80,
                stratify=data['result'])
            param_grid = {
                'n_estimators': n_estimators,
                'min_samples_split': min_sample_split
            }
            estimator = GradientBoostingClassifier(random_state=80,
                                                   max_depth=5,
                                                   learning_rate=0.005)
            cv = StratifiedShuffleSplit(n_splits=3,
                                        test_size=0.3,
                                        random_state=80)
            gbm = GridSearchCV(estimator=estimator,
                               param_grid=param_grid,
                               refit=True,
                               n_jobs=-1,
                               return_train_score=True,
                               scoring='roc_auc',
                               cv=cv)
            # Train and report results
            if train_y.values.tolist().count(1) >= 3:
                gbm.fit(trainX, train_y)
                # Predict
                y_pred = gbm.predict(testX)
                y_predprob = gbm.predict_proba(testX)[:, 1]
                print("型号 " + t + " roc_auc得分为:" + str(
                    metrics.roc_auc_score(
                        test_y, y_predprob, average='weighted')))
                # Write the results to csv
                clf = gbm.best_estimator_
                result = pd.DataFrame()
                result["BEST_PARAMS"] = [str(gbm.best_params_)]
                result["FEATURE_RANK"] = [str(clf.feature_importances_)]
                result["TRAIN_ROC_AUC"] = [str(gbm.best_score_)]
                result["ACCURACY"] = [
                    metrics.accuracy_score(test_y.values, y_pred)
                ]
                result["ROC_AUC"] = [
                    metrics.roc_auc_score(test_y,
                                          y_predprob,
                                          average='weighted')
                ]
                result['y_pred'] = [y_pred.tolist()]
                result['y_true'] = [test_y.tolist()]
                result.to_csv(parent + t + '/gdbt_result.csv',
                              header=True,
                              index=False,
                              encoding='utf-8')
                joblib.dump(clf, parent + t + '/gdbt.model')
            else:
                print(t + " has too few fault samples to train a model")
Example #56

X, y = load_data("Dataset")
F = extract_lbp_features(X)

# Apply grid search and randomized search for the SVC classifier

# Create hyperparameter options
hyperparams = {
    'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
    'gamma': [1, 0.1, 0.001, 0.0001],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [1, 2, 3, 4]
}

grid_svc = GridSearchCV(SVC(), hyperparams, cv=5)

grid_svc.fit(F, y)

print('the best parameters for SVC classifier using GridSearchCV are ' +
      str(grid_svc.best_params_))

searchcv_svc = RandomizedSearchCV(SVC(), hyperparams, n_iter=20, cv=5)

searchcv_svc.fit(F, y)

print('the best parameters for SVC classifier using RandomizedSearchCV are ' +
      str(searchcv_svc.best_params_))

# Apply grid search for the logreg classifier
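
# The listing is cut off here; a minimal sketch of the announced logistic
# regression search, following the SVC pattern above with an assumed grid:
from sklearn.linear_model import LogisticRegression

logreg_params = {
    'C': [0.01, 0.1, 1, 10, 100],  # assumed grid
    'penalty': ['l2'],             # 'l2' works with the default lbfgs solver
}
grid_logreg = GridSearchCV(LogisticRegression(max_iter=1000), logreg_params, cv=5)
grid_logreg.fit(F, y)
print('the best parameters for logreg classifier using GridSearchCV are ' +
      str(grid_logreg.best_params_))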
Example #57
# Confusion Matrix for model 2
xgb_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n", xgb_cm)
print('-' * 50)

# Classification Report for model 2
xgb_cr = classification_report(y_test, y_pred)
print("Classification Report :\n", xgb_cr)
print('-' * 50)

print(
    "*************Model3 (XGBoost Classifier using grid search)*************")

# Initialize Grid search model
xgb_clf = XGBClassifier(random_state=0)
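
# NOTE: `parameters` is referenced below but its definition did not survive in
# this listing; this grid is an assumed stand-in, not the original.
parameters = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
}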
clf_model = GridSearchCV(estimator=xgb_clf, param_grid=parameters)

# Fit the grid model
clf_model.fit(X_train, y_train)

# Prediction for the grid model
y_pred = clf_model.predict(X_test)

# Accuracy of the grid model
clf_score = accuracy_score(y_test, y_pred)
print("Accuracy score of model 3 :", round(clf_score, 2))
print('-' * 50)

# Confusion Matrix for the grid model
clf_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n", clf_cm)
def best_estimator(classifier,features_train,labels_train,features_test):

    param_mapping={

         'lor':{
             'reduce_dim__n_components':list(range(1,10)),
             'lor__C':[0.00000001,0.00001,1.0],
             'lor__tol':[1e-3,1e-1],
             'lor__penalty':['l1','l2'],
             'lor__random_state':[42]
              },

         'svc':{
                'svc__kernel':['rbf'],
                'svc__C':[10000,100000,1000],
                'svc__gamma':[0.001,0.0001,'auto'],
                'svc__random_state':[68]},

         'dtc':{

                'dtc__criterion':['entropy','gini'],
                'dtc__min_samples_split':[5,8,10,12],
                'dtc__random_state':[68],
                'dtc__min_samples_leaf':[4,6,8,10,12],
                },

         'knn':{
                'reduce_dim__n_components':list(range(1,10)),
                'knn__n_neighbors':[5,7,11],
                'knn__algorithm':['ball_tree','kd_tree','brute','auto'],
                'knn__leaf_size':[2,3,5,10,12]}

         }


    steps={
            'svc':[('scale',StandardScaler()),('svc',SVC())],
            'dtc':[('scale',StandardScaler()),('dtc',DecisionTreeClassifier())],
            'knn':[('scale',StandardScaler()),('reduce_dim',PCA()),('knn',KNeighborsClassifier())],
            'lor':[('scale',StandardScaler()),('reduce_dim',PCA()),('lor',LogisticRegression())]
           }


    pipe = Pipeline(steps[classifier])
    tune_params=param_mapping[classifier]
    sss = StratifiedShuffleSplit(n_splits=50, test_size=0.1, random_state=42)

    grid_search = GridSearchCV(estimator=pipe,
                               param_grid=tune_params,
                               scoring='f1',
                               error_score=0,
                               cv=sss
                               )
    grid_search.fit(features_train, labels_train)
    predictions = grid_search.predict(features_test)

    clf1 = grid_search.best_estimator_
    clf1_parm = grid_search.best_params_

    return clf1, clf1_parm
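
# Example call (assumed names; the feature/label split mirrors the other
# snippets in this listing):
# clf, best_params = best_estimator('svc', features_train, labels_train,
#                                   features_test)
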
def plot_cross_val_selection():
    iris = load_iris()
    X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data,
                                                              iris.target,
                                                              random_state=0)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
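    # 6 x 6 = 36 parameter combinations; with cv=5 that means 180 model fits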
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_trainval, y_trainval)
    results = pd.DataFrame(grid_search.cv_results_)[15:]

    best = np.argmax(results.mean_test_score.values)
    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(results))
    plt.ylim(0, 1.1)
    for i, (_, row) in enumerate(results.iterrows()):
        scores = row[['split%d_test_score' % i for i in range(5)]]
        marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5,
                              alpha=.5)
        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
                                markersize=10, markeredgecolor='k')
        if i == best:
            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                    fillstyle="none", alpha=1, markersize=20,
                                    markeredgewidth=3)

    plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
                                     in grid_search.cv_results_['params']],
               rotation=90)
    plt.ylabel("Validation accuracy")
    plt.xlabel("Parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
               ["cv accuracy", "mean accuracy", "best parameter setting"],
               loc=(1.05, .4))
def main():
    # 1. Inspect the training and test data
    train_data = pandas.read_csv('data/train.csv')
    test_data = pandas.read_csv('data/test.csv')
    print(train_data.info())
    print(test_data.info())
    # 2. Manually select features that are useful for prediction
    selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
    x_train = train_data[selected_features]
    x_test = test_data[selected_features]

    y_train = train_data['Survived']

    # 3. Fill in missing values
    # The Embarked feature has missing values that need to be filled
    print(x_train['Embarked'].value_counts())
    print(x_test['Embarked'].value_counts())

    # For categorical features, filling with the most frequent value is one way to limit the error introduced
    x_train['Embarked'].fillna('S', inplace=True)
    x_test['Embarked'].fillna('S', inplace=True)

    x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
    x_test['Age'].fillna(x_test['Age'].mean(), inplace=True)

    x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)
    print(x_train.info())
    print(x_test.info())

    # 4. Vectorize the features with DictVectorizer
    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    print(dict_vectorizer.feature_names_)
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    # 5. Train the models
    forest_classifier = RandomForestClassifier()
    xgb_classifier = XGBClassifier()

    # Evaluate performance with 5-fold cross-validation
    forest_mean_score = cross_val_score(forest_classifier, x_train, y_train, cv=5).mean()
    print(forest_mean_score)
    xgb_mean_score = cross_val_score(xgb_classifier, x_train, y_train, cv=5).mean()
    print(xgb_mean_score)

    # 6. Use a parallel grid search to find a better hyperparameter combination
    params = {
        'max_depth': range(2, 8), 'n_estimators': range(100, 1200, 200),
        'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
    }
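    # range(2, 8) -> 6 depths, range(100, 1200, 200) -> 6 estimator counts,
    # 5 learning rates: 6 * 6 * 5 = 180 combinations, i.e. 900 fits with cv=5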
    xgbc_best = XGBClassifier()
    grid_search_cv = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5)
    grid_search_cv.fit(x_train, y_train)
    print(grid_search_cv.best_score_)
    print(grid_search_cv.best_params_)

    # 7. Predict and write the results to a file
    predict_result = grid_search_cv.predict(x_test)
    submission_data = pandas.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predict_result})
    submission_data.to_csv('data/submission/titanic_submission.csv', index=False)