def xgb_params():
    """Return the XGBoost hyper-parameter search space.

    The values are scipy frozen distributions (for randomized search)
    or lists of candidates (for grid search).

    NOTE(review): 'reg:linear' is deprecated in recent XGBoost releases
    in favour of 'reg:squarederror' -- confirm the installed version
    before changing it.
    """
    return {
        "max_depth": rd(3, 10),
        "learning_rate": uni(loc=0, scale=1),
        "n_estimators": rd(50, 200),
        'objective': ['reg:linear'],
    }
def uniform(lower, upper):
    """Return a frozen continuous uniform distribution on [lower, upper].

    Adapts the natural (lower, upper) bounds to scipy's
    (loc, scale) parametrization, where scale is the interval width.
    """
    width = upper - lower
    return uni(lower, width)
def __init__(self, data_set_file, lowercase=False, use_idf=False,
             developers_dict_file=None, developers_list_file=None):
    """Set up the tuning experiment for several individual classifiers.

    Builds one text-feature pipeline per classifier, the parameter
    grids/distributions for grid search and randomized search, the
    result dictionaries, the output file paths and the logging
    configuration, then loads the data set.

    Parameters
    ----------
    data_set_file : str
        Path of the data set file, relative to ``self._current_dir``
        (attribute presumably set by the parent class -- TODO confirm).
    lowercase : bool
        Whether the ``CountVectorizer`` lowercases tokens.
    use_idf : bool
        Whether the ``TfidfTransformer`` applies idf weighting.
    developers_dict_file, developers_list_file : str or None
        Forwarded untouched to the parent class constructor.
    """
    super().__init__(developers_dict_file, developers_list_file)
    np.random.seed(0)  # We set the seed so runs are reproducible
    self.lowercase = lowercase
    self.use_idf = use_idf
    # Shared feature-extraction front end: token counts followed by a
    # (optionally idf-weighted) tf transformation. Tokens are any runs
    # of non-whitespace characters.
    # NOTE(review): "\S" in a non-raw string only works because Python
    # leaves unknown escapes intact; a raw string r"(?u)\S+" would be
    # the conventional spelling.
    self._estimators = [
        ("count", CountVectorizer(lowercase=self.lowercase,
                                  token_pattern=u"(?u)\S+")),
        ("tf_idf", TfidfTransformer(use_idf=self.use_idf,
                                    smooth_idf=False))]
    # One pipeline definition per classifier: the shared front end plus
    # a final "clf" step.
    self._nearest_centroid_estimators = self._estimators + \
        [("clf", NearestCentroid())]
    self._naive_bayes_estimators = self._estimators + \
        [("clf", MultinomialNB())]
    self._linear_svm_estimators = self._estimators + \
        [("clf", LinearSVC())]
    self._logistic_regression_estimators = self._estimators + \
        [("clf", LogisticRegression())]
    self._perceptron_estimators = self._estimators + \
        [("clf", Perceptron())]
    self._stochastic_gradient_descent_estimators = \
        self._estimators + [("clf", SGDClassifier())]
    # Exhaustive parameter grids (grid search). Keys use the
    # "clf__<param>" convention to address the final pipeline step.
    self._nearest_centroid_estimators_params = dict(
        clf__metric=["manhattan", "euclidean"])
    self._naive_bayes_estimators_params = dict(
        clf__alpha=np.linspace(0, 1, 11),
        clf__fit_prior=[True, False])
    self._linear_svm_estimators_params = dict(
        clf__C=np.logspace(-4, 4, 10),
        clf__loss=["squared_hinge", "hinge"],
        clf__class_weight=["balanced"])
    # Logistic regression is tuned twice: once with the primal
    # formulation (solvers that require dual=False) ...
    self._primal_logistic_regression_estimators_params = dict(
        clf__dual=[False],
        clf__C=np.logspace(-4, 4, 10),
        clf__class_weight=["balanced"],
        clf__solver=["newton-cg", "sag", "lbfgs"],
        clf__multi_class=["multinomial"])
    # ... and once with the dual formulation (liblinear, one-vs-rest).
    self._dual_logistic_regression_estimators_params = dict(
        clf__dual=[True],
        clf__C=np.logspace(-4, 4, 10),
        clf__class_weight=["balanced"],
        clf__solver=["liblinear"],
        clf__multi_class=["ovr"])
    # Perceptron is likewise tuned with and without regularization.
    self._perceptron_with_penalty_estimators_params = dict(
        clf__penalty=["l2", "elasticnet"],
        clf__alpha=10.0**-np.arange(1,7),  # 1e-1 .. 1e-6
        clf__class_weight=["balanced"])
    self._perceptron_without_penalty_estimators_params = dict(
        clf__penalty=[None],
        clf__class_weight=["balanced"])
    self._stochastic_gradient_descent_estimators_params = dict(
        clf__loss=["hinge", "log", "modified_huber",
                   "squared_hinge", "perceptron"],
        clf__penalty=["l2", "elasticnet"],
        clf__alpha=10.0**-np.arange(1,7),
        clf__class_weight=["balanced"],
        clf__average=[True, False])
    # Below, there is a dictionary to store the names, the
    # classifiers used, the parameters sent to the constructor of
    # the classifiers and the fitted classifiers (grid search).
    # Each value is [search class, constructor kwargs, fitted search
    # object (filled in later, None until then)].
    # NOTE(review): "iid" was removed from scikit-learn in 0.24, and an
    # array-valued "error_score" (one slot per metric, here accuracy
    # and MRR) is unconventional -- confirm the pinned sklearn version.
    self._models_cv = {
        "NearestCentroid": [GridSearchCV, {
            "estimator": Pipeline(self._nearest_centroid_estimators),
            "param_grid": self._nearest_centroid_estimators_params,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "MultinomialNB": [GridSearchCV, {
            "estimator": Pipeline(self._naive_bayes_estimators),
            "param_grid": self._naive_bayes_estimators_params,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "LinearSVC": [GridSearchCV, {
            "estimator": Pipeline(self._linear_svm_estimators),
            "param_grid": self._linear_svm_estimators_params,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "Primal LogisticRegression": [GridSearchCV, {
            "estimator": Pipeline(self._logistic_regression_estimators),
            "param_grid": self._primal_logistic_regression_estimators_params,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "Dual LogisticRegression": [GridSearchCV, {
            "estimator": Pipeline(self._logistic_regression_estimators),
            "param_grid": self._dual_logistic_regression_estimators_params,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "PerceptronWithPenalty": [GridSearchCV, {
            "estimator": Pipeline(self._perceptron_estimators),
            "param_grid": self._perceptron_with_penalty_estimators_params,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "PerceptronWithoutPenalty": [GridSearchCV, {
            "estimator": Pipeline(self._perceptron_estimators),
            "param_grid": self._perceptron_without_penalty_estimators_params,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "SGDClassifier": [GridSearchCV, {
            "estimator": Pipeline(self._stochastic_gradient_descent_estimators),
            "param_grid": self._stochastic_gradient_descent_estimators_params,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
    }
    # Parameter distributions for randomized search: same spaces as the
    # grids above, but continuous parameters are sampled from
    # distributions (uni, LogSpaceUniform) instead of fixed lists.
    self._nearest_centroid_estimators_random_params = dict(
        clf__metric=["manhattan", "euclidean"])
    self._naive_bayes_estimators_random_params = dict(
        clf__alpha=uni(loc=0,scale=1),
        clf__fit_prior=[True, False])
    self._linear_svm_estimators_random_params = dict(
        clf__C=LogSpaceUniform(loc=-4,scale=8),  # 10**U(-4, 4)
        clf__loss=["squared_hinge", "hinge"],
        clf__class_weight=["balanced"])
    self._primal_logistic_regression_estimators_random_params = dict(
        clf__dual=[False],
        clf__C=LogSpaceUniform(loc=-4,scale=8),
        clf__class_weight=["balanced"],
        clf__solver=["newton-cg", "sag", "lbfgs"],
        clf__multi_class=["multinomial"])
    self._dual_logistic_regression_estimators_random_params = dict(
        clf__dual=[True],
        clf__C=LogSpaceUniform(loc=-4,scale=8),
        clf__class_weight=["balanced"],
        clf__solver=["liblinear"],
        clf__multi_class=["ovr"])
    self._perceptron_with_penalty_estimators_random_params = dict(
        clf__penalty=["l2", "elasticnet"],
        clf__alpha=LogSpaceUniform(loc=-6,scale=5),  # 10**U(-6, -1)
        clf__class_weight=["balanced"])
    self._perceptron_without_penalty_estimators_random_params = dict(
        clf__penalty=[None],
        clf__class_weight=["balanced"])
    self._stochastic_gradient_descent_estimators_random_params = dict(
        clf__loss=["hinge", "log", "modified_huber",
                   "squared_hinge", "perceptron"],
        clf__penalty=["l2", "elasticnet"],
        clf__alpha=LogSpaceUniform(loc=-6,scale=5),
        clf__class_weight=["balanced"],
        clf__average=[True, False])
    # Below, there is a dictionary to store the names, the
    # classifiers used, the parameters sent to the constructor of
    # the classifiers and the fitted classifiers (random search).
    # Same [class, kwargs, fitted-or-None] layout as self._models_cv;
    # n_iter varies per model with the size of its search space.
    self._randomized_models_cv = {
        "NearestCentroid": [RandomizedSearchCV, {
            "estimator": Pipeline(self._nearest_centroid_estimators),
            "param_distributions": self._nearest_centroid_estimators_random_params,
            "n_iter": 2,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "MultinomialNB": [RandomizedSearchCV, {
            "estimator": Pipeline(self._naive_bayes_estimators),
            "param_distributions": self._naive_bayes_estimators_random_params,
            "n_iter": 22,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "LinearSVC": [RandomizedSearchCV, {
            "estimator": Pipeline(self._linear_svm_estimators),
            "param_distributions": self._linear_svm_estimators_random_params,
            "n_iter": 20,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "Primal LogisticRegression": [RandomizedSearchCV, {
            "estimator": Pipeline(self._logistic_regression_estimators),
            "param_distributions": self._primal_logistic_regression_estimators_random_params,
            "n_iter": 30,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "Dual LogisticRegression": [RandomizedSearchCV, {
            "estimator": Pipeline(self._logistic_regression_estimators),
            "param_distributions": self._dual_logistic_regression_estimators_random_params,
            "n_iter": 10,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "PerceptronWithPenalty": [RandomizedSearchCV, {
            "estimator": Pipeline(self._perceptron_estimators),
            "param_distributions": self._perceptron_with_penalty_estimators_random_params,
            "n_iter": 12,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "PerceptronWithoutPenalty": [RandomizedSearchCV, {
            "estimator": Pipeline(self._perceptron_estimators),
            "param_distributions": self._perceptron_without_penalty_estimators_random_params,
            "n_iter": 1,
            "n_jobs": 8,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
        "SGDClassifier": [RandomizedSearchCV, {
            "estimator": Pipeline(self._stochastic_gradient_descent_estimators),
            "param_distributions": self._stochastic_gradient_descent_estimators_random_params,
            "n_iter": 120,
            "n_jobs": 1,
            "iid": False,
            "cv": self._tscv,
            "verbose": 10,
            "random_state": 0,
            "error_score": np.array([-1, -1]),
            "scoring": accuracy_mrr_scoring_object
        }, None],
    }
    # Below, there is a dictionary to store the accuracy of each
    # configuration on the test set (grid search)
    self._configurations_accuracies = {}
    # Below, there is a dictionary to store the MRR value of each
    # configuration on the test set (grid search)
    self._configurations_mrr_values = {}
    # Below, there is a dictionary to store the accuracy of each
    # configuration on the test set (random search)
    self._randomized_configurations_accuracies = {}
    # Below, there is a dictionary to store the MRR value of each
    # configuration on the test set (random search)
    self._randomized_configurations_mrr_values = {}
    # Output and input paths, all relative to self._current_dir.
    cleaned_results_file_name = "cleaned_tuning_" + \
        "individual_classifier_generic_experiment_results.json"
    self._cleaned_results_file_name = os.path.join(
        self._current_dir, cleaned_results_file_name)
    self._data_set_file = os.path.join(self._current_dir,
        data_set_file)
    # Log everything (DEBUG and up) to a fresh file on each run
    # (filemode="w" truncates any previous log).
    log_file = os.path.join(self._current_dir,
        "tuning_individual_classifier_generic_experiment.log")
    logging.basicConfig(filename=log_file, filemode="w",
        level=logging.DEBUG)
    self._build_data_set()