Beispiel #1
0
def xgb_params():
    """Return the hyper-parameter search space used for the XGBoost model.

    The result maps each parameter name to either a distribution object
    (``max_depth``, ``learning_rate``, ``n_estimators``) or a list of
    candidate values (``objective``).

    NOTE(review): ``rd`` and ``uni`` are defined outside this snippet --
    presumably ``scipy.stats.randint`` and ``scipy.stats.uniform``; confirm
    against the file's imports.
    """
    search_space = dict(
        max_depth=rd(3, 10),
        learning_rate=uni(loc=0, scale=1),
        n_estimators=rd(50, 200),
        objective=['reg:linear'],
    )
    return search_space
def uniform(lower, upper):
    """Build a ``uni`` distribution from a ``lower``/``upper`` pair.

    ``uni`` is called with ``lower`` as its first argument and the interval
    width ``upper - lower`` as its second.

    NOTE(review): ``uni`` is defined outside this snippet -- presumably
    ``scipy.stats.uniform`` (``loc``, ``scale``); confirm against the imports.
    """
    span = upper - lower
    return uni(lower, span)
Beispiel #3
0
    def __init__(self, data_set_file, lowercase=False, use_idf=False,
                 developers_dict_file=None,
                 developers_list_file=None):
        """Set up the pipelines, search spaces and output paths, then build
        the data set.

        Args:
            data_set_file: File name of the data set; it is joined with
                ``self._current_dir``.
            lowercase: Forwarded to ``CountVectorizer(lowercase=...)``.
            use_idf: Forwarded to ``TfidfTransformer(use_idf=...)``.
            developers_dict_file: Forwarded to the parent initializer.
            developers_list_file: Forwarded to the parent initializer.

        Side effects:
            * Seeds NumPy's global random generator with 0.
            * Calls ``logging.basicConfig`` with a log file opened in
              ``"w"`` mode and level ``DEBUG``.
            * Calls ``self._build_data_set()`` as the last step.

        NOTE(review): ``self._tscv`` and ``self._current_dir`` are read here
        but not assigned in this method -- presumably set by the parent
        initializer; confirm.
        """
        super().__init__(developers_dict_file, developers_list_file)
        np.random.seed(0)  # We set the seed
        self.lowercase = lowercase
        self.use_idf = use_idf

        # Feature-extraction steps placed in front of every classifier.
        # The token pattern is a raw string: "\S" inside a normal literal is
        # an invalid escape sequence (same runtime value, no warning).
        self._estimators = [
            ("count", CountVectorizer(
                lowercase=self.lowercase, token_pattern=r"(?u)\S+")),
            ("tf_idf", TfidfTransformer(
                use_idf=self.use_idf, smooth_idf=False)),
        ]

        # One step list per classifier family.
        self._nearest_centroid_estimators = (
            self._estimators + [("clf", NearestCentroid())])
        self._naive_bayes_estimators = (
            self._estimators + [("clf", MultinomialNB())])
        self._linear_svm_estimators = (
            self._estimators + [("clf", LinearSVC())])
        self._logistic_regression_estimators = (
            self._estimators + [("clf", LogisticRegression())])
        self._perceptron_estimators = (
            self._estimators + [("clf", Perceptron())])
        self._stochastic_gradient_descent_estimators = (
            self._estimators + [("clf", SGDClassifier())])

        # Parameter grids (grid search).
        self._nearest_centroid_estimators_params = {
            "clf__metric": ["manhattan", "euclidean"],
        }
        self._naive_bayes_estimators_params = {
            "clf__alpha": np.linspace(0, 1, 11),
            "clf__fit_prior": [True, False],
        }
        self._linear_svm_estimators_params = {
            "clf__C": np.logspace(-4, 4, 10),
            "clf__loss": ["squared_hinge", "hinge"],
            "clf__class_weight": ["balanced"],
        }
        self._primal_logistic_regression_estimators_params = {
            "clf__dual": [False],
            "clf__C": np.logspace(-4, 4, 10),
            "clf__class_weight": ["balanced"],
            "clf__solver": ["newton-cg", "sag", "lbfgs"],
            "clf__multi_class": ["multinomial"],
        }
        self._dual_logistic_regression_estimators_params = {
            "clf__dual": [True],
            "clf__C": np.logspace(-4, 4, 10),
            "clf__class_weight": ["balanced"],
            "clf__solver": ["liblinear"],
            "clf__multi_class": ["ovr"],
        }
        self._perceptron_with_penalty_estimators_params = {
            "clf__penalty": ["l2", "elasticnet"],
            "clf__alpha": 10.0**-np.arange(1, 7),
            "clf__class_weight": ["balanced"],
        }
        self._perceptron_without_penalty_estimators_params = {
            "clf__penalty": [None],
            "clf__class_weight": ["balanced"],
        }
        self._stochastic_gradient_descent_estimators_params = {
            "clf__loss": ["hinge", "log", "modified_huber",
                          "squared_hinge", "perceptron"],
            "clf__penalty": ["l2", "elasticnet"],
            "clf__alpha": 10.0**-np.arange(1, 7),
            "clf__class_weight": ["balanced"],
            "clf__average": [True, False],
        }

        # NOTE(review): the "iid" argument was removed from scikit-learn's
        # search classes in release 0.24 -- confirm the pinned version still
        # accepts it.
        def grid_entry(steps, param_grid, n_jobs):
            # Each entry gets its own Pipeline, kwargs dict and error_score
            # array so that no mutable object is shared between entries.
            return [GridSearchCV, {
                "estimator": Pipeline(steps),
                "param_grid": param_grid,
                "n_jobs": n_jobs,
                "iid": False,
                "cv": self._tscv,
                "verbose": 10,
                "error_score": np.array([-1, -1]),
                "scoring": accuracy_mrr_scoring_object,
            }, None]

        def random_entry(steps, param_distributions, n_iter, n_jobs):
            # Same layout as grid_entry, for the randomized search.
            return [RandomizedSearchCV, {
                "estimator": Pipeline(steps),
                "param_distributions": param_distributions,
                "n_iter": n_iter,
                "n_jobs": n_jobs,
                "iid": False,
                "cv": self._tscv,
                "verbose": 10,
                "random_state": 0,
                "error_score": np.array([-1, -1]),
                "scoring": accuracy_mrr_scoring_object,
            }, None]

        # Below, there is a dictionary to store the names, the
        # classifiers used, the parameters sent to the constructor of
        # the classifiers and the fitted classifiers (grid search).
        # Each value is [search class, constructor kwargs, None].
        self._models_cv = {
            "NearestCentroid": grid_entry(
                self._nearest_centroid_estimators,
                self._nearest_centroid_estimators_params, 8),
            "MultinomialNB": grid_entry(
                self._naive_bayes_estimators,
                self._naive_bayes_estimators_params, 1),
            "LinearSVC": grid_entry(
                self._linear_svm_estimators,
                self._linear_svm_estimators_params, 1),
            "Primal LogisticRegression": grid_entry(
                self._logistic_regression_estimators,
                self._primal_logistic_regression_estimators_params, 1),
            "Dual LogisticRegression": grid_entry(
                self._logistic_regression_estimators,
                self._dual_logistic_regression_estimators_params, 8),
            "PerceptronWithPenalty": grid_entry(
                self._perceptron_estimators,
                self._perceptron_with_penalty_estimators_params, 8),
            "PerceptronWithoutPenalty": grid_entry(
                self._perceptron_estimators,
                self._perceptron_without_penalty_estimators_params, 8),
            "SGDClassifier": grid_entry(
                self._stochastic_gradient_descent_estimators,
                self._stochastic_gradient_descent_estimators_params, 1),
        }

        # Parameter distributions (random search).
        self._nearest_centroid_estimators_random_params = {
            "clf__metric": ["manhattan", "euclidean"],
        }
        self._naive_bayes_estimators_random_params = {
            "clf__alpha": uni(loc=0, scale=1),
            "clf__fit_prior": [True, False],
        }
        self._linear_svm_estimators_random_params = {
            "clf__C": LogSpaceUniform(loc=-4, scale=8),
            "clf__loss": ["squared_hinge", "hinge"],
            "clf__class_weight": ["balanced"],
        }
        self._primal_logistic_regression_estimators_random_params = {
            "clf__dual": [False],
            "clf__C": LogSpaceUniform(loc=-4, scale=8),
            "clf__class_weight": ["balanced"],
            "clf__solver": ["newton-cg", "sag", "lbfgs"],
            "clf__multi_class": ["multinomial"],
        }
        self._dual_logistic_regression_estimators_random_params = {
            "clf__dual": [True],
            "clf__C": LogSpaceUniform(loc=-4, scale=8),
            "clf__class_weight": ["balanced"],
            "clf__solver": ["liblinear"],
            "clf__multi_class": ["ovr"],
        }
        self._perceptron_with_penalty_estimators_random_params = {
            "clf__penalty": ["l2", "elasticnet"],
            "clf__alpha": LogSpaceUniform(loc=-6, scale=5),
            "clf__class_weight": ["balanced"],
        }
        self._perceptron_without_penalty_estimators_random_params = {
            "clf__penalty": [None],
            "clf__class_weight": ["balanced"],
        }
        self._stochastic_gradient_descent_estimators_random_params = {
            "clf__loss": ["hinge", "log", "modified_huber",
                          "squared_hinge", "perceptron"],
            "clf__penalty": ["l2", "elasticnet"],
            "clf__alpha": LogSpaceUniform(loc=-6, scale=5),
            "clf__class_weight": ["balanced"],
            "clf__average": [True, False],
        }

        # Below, there is a dictionary to store the names, the
        # classifiers used, the parameters sent to the constructor of
        # the classifiers and the fitted classifiers (random search).
        # Each value is [search class, constructor kwargs, None].
        self._randomized_models_cv = {
            "NearestCentroid": random_entry(
                self._nearest_centroid_estimators,
                self._nearest_centroid_estimators_random_params, 2, 8),
            "MultinomialNB": random_entry(
                self._naive_bayes_estimators,
                self._naive_bayes_estimators_random_params, 22, 1),
            "LinearSVC": random_entry(
                self._linear_svm_estimators,
                self._linear_svm_estimators_random_params, 20, 1),
            "Primal LogisticRegression": random_entry(
                self._logistic_regression_estimators,
                self._primal_logistic_regression_estimators_random_params,
                30, 1),
            "Dual LogisticRegression": random_entry(
                self._logistic_regression_estimators,
                self._dual_logistic_regression_estimators_random_params,
                10, 8),
            "PerceptronWithPenalty": random_entry(
                self._perceptron_estimators,
                self._perceptron_with_penalty_estimators_random_params,
                12, 8),
            "PerceptronWithoutPenalty": random_entry(
                self._perceptron_estimators,
                self._perceptron_without_penalty_estimators_random_params,
                1, 8),
            "SGDClassifier": random_entry(
                self._stochastic_gradient_descent_estimators,
                self._stochastic_gradient_descent_estimators_random_params,
                120, 1),
        }

        # Below, there is a dictionary to store the accuracy of each
        # configuration on the test set (grid search)
        self._configurations_accuracies = {}
        # Below, there is a dictionary to store the MRR value of each
        # configuration on the test set (grid search)
        self._configurations_mrr_values = {}

        # Below, there is a dictionary to store the accuracy of each
        # configuration on the test set (random search)
        self._randomized_configurations_accuracies = {}
        # Below, there is a dictionary to store the MRR value of each
        # configuration on the test set (random search)
        self._randomized_configurations_mrr_values = {}

        cleaned_results_file_name = (
            "cleaned_tuning_"
            "individual_classifier_generic_experiment_results.json")
        self._cleaned_results_file_name = os.path.join(
            self._current_dir, cleaned_results_file_name)

        self._data_set_file = os.path.join(self._current_dir, data_set_file)

        # filemode="w" starts a fresh log file on every instantiation.
        log_file = os.path.join(
            self._current_dir,
            "tuning_individual_classifier_generic_experiment.log")
        logging.basicConfig(filename=log_file, filemode="w",
                            level=logging.DEBUG)

        self._build_data_set()