def make_use_tfidf_with_results(self):
    """Evaluate a LabelPowerset(LogisticRegression) model on TF-IDF vectors.

    Delegates to Evaluator.multi_label_predict_proba_tfidf, which also
    receives the data source so predicted probabilities can be stored.
    """
    features = self.__vectors_provider.get_tfidf_vectors()
    labels = self.__data_source.get_y_multi_label()

    classifier = LabelPowerset(
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1))

    Evaluator.multi_label_predict_proba_tfidf(
        classifier, features, labels, data_source=self.__data_source)
def make_use_d2v(self):
    """Train and report OneVsRest multi-label classifiers on Doc2Vec DBOW vectors.

    Runs each base estimator through Evaluator.multi_label_report and saves
    the resulting report plus micro/macro/weighted scores via the visualizer.
    A failure for one estimator is logged and does not abort the others.
    """
    x_all = self.__vectors_provider.get_d2v_vectors_dbow()
    y_all = self.__data_source.get_y_multi_label()

    # TODO here grid search
    base_estimators = [
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1),
        LogisticRegression(n_jobs=-1),
        LinearSVC(),
        MLPClassifier(),
    ]
    # Human-readable labels kept in lockstep with base_estimators.
    model_params = [
        "LogisticRegression(C=1.0, solver='sag')",
        'LogisticRegression()',
        "LinearSVC()",
        "MLPClassifier()",
    ]

    # zip replaces the original error-prone manual index counter
    for base_estimator, params_label in zip(base_estimators, model_params):
        logging.warning(str(datetime.now()) + 'Start ' + params_label)
        try:
            model = OneVsRestClassifier(base_estimator, n_jobs=-1)
            report, micro, macro, weighted = Evaluator.multi_label_report(
                model, x_all, y_all)
            self.__visualizer.save_metrics(self.__CLASSIFIER_NAME, params_label,
                                           "Doc2Vec_DBOW", report,
                                           micro, macro, weighted)
        except Exception:
            # Narrowed from a bare `except:` (which also caught SystemExit and
            # KeyboardInterrupt); logging.exception records the traceback
            # instead of silently discarding it.
            logging.exception('Error on ' + params_label)
        logging.warning(str(datetime.now()) + 'End ' + params_label)
def make_use_w2v_fix(self):
    """Cross-validate LabelPowerset multi-label classifiers on fixed Word2Vec vectors.

    For each (currently single) base estimator, computes the cross-validated
    F1 via Evaluator.evaluate_only_cross_val and shows it briefly through the
    visualizer. A failure for one estimator is logged and does not abort the rest.
    """
    x_all = self.__vectors_provider.get_w2v_vectors_fix()
    y_all = self.__data_source.get_y_multi_label()

    # TODO here grid search — additional estimators (LogisticRegression(),
    # LinearSVC(), MLPClassifier()) were tried previously and can be re-added
    # in lockstep with model_params.
    base_estimators = [
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1),
    ]
    model_params = [
        "LogisticRegression(C=1.0, solver='sag')",
    ]

    # zip replaces the original error-prone manual index counter
    for base_estimator, params_label in zip(base_estimators, model_params):
        logging.warning(str(datetime.now()) + 'Start ' + params_label)
        try:
            model = LabelPowerset(base_estimator)
            cross_val_f1 = Evaluator.evaluate_only_cross_val(
                model, x_all, y_all)
            self.__visualizer.show_results_briefly(self.__CLASSIFIER_NAME,
                                                   params_label,
                                                   "Word2Vec_fix",
                                                   cross_val_f1)
        except Exception:
            # Narrowed from a bare `except:`; log the traceback rather than
            # swallowing every possible error (including KeyboardInterrupt).
            logging.exception('Error on ' + params_label)
        logging.warning(str(datetime.now()) + 'End ' + params_label)
def make_use_w2v_big(self):
    """Evaluate a voting ensemble (LogisticRegression + SVC + kNN) on big Word2Vec vectors."""
    x_all = self.__vectors_provider.get_w2v_big_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    voters = [
        ('logistic', LogisticRegression(C=0.5, solver='liblinear')),
        ('svc', SVC(C=10, kernel='rbf')),
        ('knn', KNeighborsClassifier(algorithm='auto', metric='minkowski',
                                     weights='distance')),
    ]
    ensemble = VotingClassifier(voters)

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        ensemble, x_all, y_all)

    self.__visualizer.show_results(self.__CLASSIFIER_NAME, "model1",
                                   "Word2VecBig",
                                   cross_val_accuracy, cross_val_f1,
                                   train_accuracy, train_f1,
                                   test_accuracy, test_f1,
                                   y_true, y_pred)
def make_use_w2v(self):
    """Train and report OneVsRest multi-label classifiers on Word2Vec CBOW vectors.

    Runs each base estimator through Evaluator.multi_label_report and saves
    the resulting report plus micro/macro/weighted scores via the visualizer.
    A failure for one estimator is logged and does not abort the others.
    """
    x_all = self.__vectors_provider.get_w2v_vectors_cbow()
    y_all = self.__data_source.get_y_multi_label()

    # TODO here grid search — explicit-parameter LogisticRegression and SVC
    # variants were tried previously and can be re-added in lockstep with
    # model_params.
    base_estimators = [
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1),
        LogisticRegression(n_jobs=-1),
        LinearSVC(),
        MLPClassifier(),
    ]
    model_params = [
        "LogisticRegression(C=1.0, solver='sag')",
        'LogisticRegression()',
        "LinearSVC()",
        "MLPClassifier()",
    ]

    # zip replaces the original error-prone manual index counter
    for base_estimator, params_label in zip(base_estimators, model_params):
        logging.warning(str(datetime.now()) + 'Start ' + params_label)
        try:
            model = OneVsRestClassifier(base_estimator, n_jobs=-1)
            report, micro, macro, weighted = Evaluator.multi_label_report(
                model, x_all, y_all)
            self.__visualizer.save_metrics(self.__CLASSIFIER_NAME, params_label,
                                           "Word2Vec_CBOW", report,
                                           micro, macro, weighted)
        except Exception:
            # Narrowed from a bare `except:`; log the traceback rather than
            # swallowing every possible error (including KeyboardInterrupt).
            logging.exception('Error on ' + params_label)
        logging.warning(str(datetime.now()) + 'End ' + params_label)
def make_use_w2v(self):
    """Evaluate LogisticRegression on Word2Vec vectors and dump cross-fold probabilities.

    First records per-fold probability predictions through
    Evaluator.cross_probabilities, then runs the full evaluation and
    forwards all metrics to the visualizer.
    """
    # NOTE(review): this redefines an earlier `make_use_w2v` in the same
    # class, so only this definition is bound — confirm which is intended.
    x_all = self.__vectors_provider.get_w2v_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    log_reg = LogisticRegression(C=1.0, solver='sag', n_jobs=-1)

    Evaluator.cross_probabilities(model=log_reg, x_all=x_all, y_all=y_all,
                                  data_source=self.__data_source,
                                  method='w2v')

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        log_reg, x_all, y_all)

    self.__visualizer.show_results(self.__CLASSIFIER_NAME,
                                   "(C=1.0, solver='sag')", "Word2Vec",
                                   cross_val_accuracy, cross_val_f1,
                                   train_accuracy, train_f1,
                                   test_accuracy, test_f1,
                                   y_true, y_pred)
def make_use_w2v_with_tfidf(self):
    """Evaluate a linear-kernel SVC on TF-IDF-weighted Word2Vec vectors."""
    x_all = self.__vectors_provider.get_w2v_tfidf_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    svc = SVC(C=1, kernel='linear', probability=True)

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        svc, x_all, y_all)

    self.__visualizer.show_results(
        self.__CLASSIFIER_NAME,
        "(C=1, kernel='linear', probability=True)",
        "Word2Vec&TF-IDF",
        cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
        test_accuracy, test_f1, y_true, y_pred)
def make_use_w2v_with_tfidf(self):
    """Evaluate LogisticRegression on TF-IDF-weighted Word2Vec vectors."""
    # NOTE(review): this redefines an earlier `make_use_w2v_with_tfidf` in
    # the same class, so only the last definition is bound — confirm intent.
    x_all = self.__vectors_provider.get_w2v_tfidf_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    log_reg = LogisticRegression(C=1.0, solver='sag', n_jobs=-1)

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        log_reg, x_all, y_all)

    self.__visualizer.show_results(self.__CLASSIFIER_NAME,
                                   "(C=1.0, solver='sag')",
                                   "Word2Vec&TF-IDF",
                                   cross_val_accuracy, cross_val_f1,
                                   train_accuracy, train_f1,
                                   test_accuracy, test_f1,
                                   y_true, y_pred)
def make_use_w2v_with_tfidf(self):
    """Evaluate a distance-weighted k-nearest-neighbours classifier on
    TF-IDF-weighted Word2Vec vectors."""
    # NOTE(review): this redefines an earlier `make_use_w2v_with_tfidf` in
    # the same class, so only the last definition is bound — confirm intent.
    x_all = self.__vectors_provider.get_w2v_tfidf_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    knn = KNeighborsClassifier(algorithm='auto', metric='minkowski',
                               weights='distance')

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        knn, x_all, y_all)

    self.__visualizer.show_results(
        self.__CLASSIFIER_NAME,
        "(algorithm='auto', metric='minkowski', weights='distance')",
        "Word2Vec&TF-IDF",
        cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
        test_accuracy, test_f1, y_true, y_pred)
def make_use_w2v_old(self):
    """Train on the old dataset using a w2v model trained on the new one.

    Evaluates a linear-kernel SVC and forwards all metrics to the visualizer.
    """
    x_all = self.__vectors_provider.get_w2v_old_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    svc = SVC(C=1, kernel='linear', probability=True)

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        svc, x_all, y_all)

    self.__visualizer.show_results(
        self.__CLASSIFIER_NAME,
        "(C=1, kernel='linear', probability=True)",
        "Word2VecNewOld",
        cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
        test_accuracy, test_f1, y_true, y_pred)
def make_use_w2v_old(self):
    """Train on the old dataset using a w2v model trained on the new one.

    Evaluates LogisticRegression and forwards all metrics to the visualizer.
    """
    # NOTE(review): this redefines an earlier `make_use_w2v_old` in the same
    # class, so only the last definition is bound — confirm intent.
    x_all = self.__vectors_provider.get_w2v_old_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    log_reg = LogisticRegression(C=1.0, solver='sag', n_jobs=-1)

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        log_reg, x_all, y_all)

    self.__visualizer.show_results(self.__CLASSIFIER_NAME,
                                   "(C=1.0, solver='sag')",
                                   "Word2VecNewOld",
                                   cross_val_accuracy, cross_val_f1,
                                   train_accuracy, train_f1,
                                   test_accuracy, test_f1,
                                   y_true, y_pred)
def make_use_w2v_old(self):
    """Train on the old dataset using a w2v model trained on the new one.

    Evaluates a distance-weighted k-nearest-neighbours classifier and
    forwards all metrics to the visualizer.
    """
    # NOTE(review): this redefines an earlier `make_use_w2v_old` in the same
    # class, so only the last definition is bound — confirm intent.
    x_all = self.__vectors_provider.get_w2v_old_vectors()
    y_all = self.__data_source.get_y()

    # TODO here grid search
    knn = KNeighborsClassifier(algorithm='auto', metric='minkowski',
                               weights='distance')

    (cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
     test_accuracy, test_f1, y_true, y_pred) = Evaluator.evaluate(
        knn, x_all, y_all)

    self.__visualizer.show_results(
        self.__CLASSIFIER_NAME,
        "(algorithm='auto', metric='minkowski', weights='distance')",
        "Word2VecNewOld",
        cross_val_accuracy, cross_val_f1, train_accuracy, train_f1,
        test_accuracy, test_f1, y_true, y_pred)
def make_use_tfidf(self):
    """Train and report LabelPowerset multi-label classifiers on TF-IDF vectors.

    Runs each base estimator through Evaluator.multi_label_report (with its
    fourth positional flag set to True, as in the original call) and saves
    the report plus micro/macro/weighted scores via the visualizer. A failure
    for one estimator is logged and does not abort the others.
    """
    x_all = self.__vectors_provider.get_tfidf_vectors()
    y_all = self.__data_source.get_y_multi_label()

    # TODO here grid search — a GridSearchCV sweep over SVC and
    # LogisticRegression (via MultiLabelBinarizer + LabelPowerset) was
    # prototyped here previously; re-introduce it when needed.
    base_estimators = [
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1),
    ]
    model_params = [
        "LogisticRegression(C=1.0, solver='sag')",
    ]

    # zip replaces the original error-prone manual index counter
    for base_estimator, params_label in zip(base_estimators, model_params):
        logging.warning(str(datetime.now()) + 'Start ' + params_label)
        try:
            model = LabelPowerset(base_estimator)
            report, micro, macro, weighted = Evaluator.multi_label_report(
                model, x_all, y_all, True)
            self.__visualizer.save_metrics(self.__CLASSIFIER_NAME, params_label,
                                           "tfidf", report,
                                           micro, macro, weighted)
        except Exception:
            # Narrowed from a bare `except:`; log the traceback rather than
            # swallowing every possible error (including KeyboardInterrupt).
            logging.exception('Error on ' + params_label)
        logging.warning(str(datetime.now()) + 'End ' + params_label)