def test_if_works_with_cross_validation(self):
    # scikit-learn does not support a sparse output matrix in CV
    classifier = RakelO(base_classifier=GaussianNB(),
                        base_classifier_require_dense=[True, True],
                        labelset_size=TEST_LABELSET_SIZE,
                        model_count=TEST_MODEL_COUNT,
                        require_dense=[False, True])
    self.assertClassifierWorksWithCV(classifier)
def RAkELO(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           base_clasif, num_labels, num_models):
    classifier = RakelO(base_classifier=base_clasif,
                        labelset_size=num_labels,
                        model_count=num_models)
    classifier.fit(dataset_train_x, dataset_train_y)
    predictions = classifier.predict(dataset_test_x)
    Metrics_Accuracy("RAkELO", predictions, dataset_test_y)
def RAkELO(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           base_clasif, num_labels, num_models):
    classifier = RakelO(base_classifier=base_clasif,
                        labelset_size=num_labels,
                        model_count=num_models)
    start_time = time.time()
    classifier.fit(dataset_train_x, dataset_train_y)
    stop_time = time.time()
    time_lapsed = stop_time - start_time
    predictions = classifier.predict(dataset_test_x)
    Metrics_Accuracy("RAkELO", predictions, dataset_test_y)
    print("Execution time: {}s".format(time_lapsed))
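# For reference, a self-contained sketch of how the timed helper above might
# be driven. This is an assumption, not the original harness: the repo's
# dataset loading and its Metrics_Accuracy helper are replaced here by a
# synthetic dataset and scikit-learn's subset accuracy.
import time

from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelO

X, y = make_multilabel_classification(n_samples=200, n_classes=8, n_labels=3,
                                      random_state=42)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)

classifier = RakelO(base_classifier=GaussianNB(), labelset_size=3, model_count=16)
start_time = time.time()
classifier.fit(train_x, train_y)
print("Execution time: {:.2f}s".format(time.time() - start_time))
predictions = classifier.predict(test_x)
print("Subset accuracy:", accuracy_score(test_y, predictions))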
def getClassifier(self):
    if self.classifierType.lower() == 'rakelo':
        classifier = RakelO(
            base_classifier=LabelPowerset(GaussianNB()),
            # base_classifier_require_dense=[True, True],
            model_count=10,
            labelset_size=2  # len(labelTypes) // 4
        )
    elif self.classifierType.lower() == 'mlknn':
        classifier = MLkNN(k=3)
    # elif self.classifierType.lower() == 'mltsvm':
    #     classifier = MLTSVM(c_k=2**-1)
    elif self.classifierType.lower() == 'mlaram':
        classifier = MLARAM()
    elif self.classifierType.lower() == 'labelpowerset':
        classifier = LabelPowerset(
            classifier=RandomForestClassifier(n_estimators=100),
            require_dense=[False, True])
    return classifier
def get_rakeld_with_nb(self):
    return RakelO(base_classifier=GaussianNB(),
                  base_classifier_require_dense=[True, True],
                  labelset_size=TEST_LABELSET_SIZE,
                  model_count=TEST_MODEL_COUNT)
def get_rakeld_with_svc(self):
    return RakelO(base_classifier=SVC(),
                  base_classifier_require_dense=[False, True],
                  labelset_size=TEST_LABELSET_SIZE,
                  model_count=TEST_MODEL_COUNT)
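# A minimal sketch (not part of the original test suite) of why the two
# factories above pass different base_classifier_require_dense flags:
# GaussianNB cannot consume scipy.sparse input, so both X and y must be
# densified ([True, True]), while SVC accepts sparse X and only needs y
# densified ([False, True]). TEST_LABELSET_SIZE and TEST_MODEL_COUNT are
# assumed module-level constants.
from scipy import sparse
from sklearn.datasets import make_multilabel_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from skmultilearn.ensemble import RakelO

TEST_LABELSET_SIZE = 3
TEST_MODEL_COUNT = 6

X, y = make_multilabel_classification(n_samples=100, n_classes=6, n_labels=2,
                                      random_state=0)
X_sparse = sparse.csr_matrix(X)

for base, flags in [(GaussianNB(), [True, True]), (SVC(), [False, True])]:
    clf = RakelO(base_classifier=base,
                 base_classifier_require_dense=flags,
                 labelset_size=TEST_LABELSET_SIZE,
                 model_count=TEST_MODEL_COUNT)
    clf.fit(X_sparse, y)
    print(type(base).__name__, clf.predict(X_sparse[:5]).shape)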
def Util_ClassifierMethods(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y):
    # BR
    Util_Title("Binary Relevance")
    base_classif = GaussianNB()
    BinaryRelevance(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                    base_classif, "GaussianNB")
    dict_res = FindBestSVCParams(skpt.BinaryRelevance(), dataset_train_x, dataset_train_y)
    base_classif = SVC(kernel=dict_res['classifier__kernel'],
                       degree=dict_res['classifier__degree'])
    BinaryRelevance(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                    base_classif, "SVC tuned")
    dict_res = FindBestMNBParams(skpt.BinaryRelevance(), dataset_train_x, dataset_train_y)
    base_classif = MultinomialNB(alpha=dict_res['classifier__alpha'])
    BinaryRelevance(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                    base_classif, "MNB tuned")

    # CC
    Util_Title("Classifier Chain")
    base_classif = GaussianNB()
    ClassifierChain(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                    base_classif, "GaussianNB")
    dict_res = FindBestSVCParams(skpt.ClassifierChain(), dataset_train_x, dataset_train_y)
    base_classif = SVC(kernel=dict_res['classifier__kernel'],
                       degree=dict_res['classifier__degree'])
    ClassifierChain(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                    base_classif, "SVC tuned")
    dict_res = FindBestMNBParams(skpt.ClassifierChain(), dataset_train_x, dataset_train_y)
    base_classif = MultinomialNB(alpha=dict_res['classifier__alpha'])
    ClassifierChain(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                    base_classif, "MNB tuned")

    # LP
    Util_Title("Label Powerset")
    base_classif = GaussianNB()
    LabelPowerset(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                  base_classif, "GaussianNB")
    dict_res = FindBestSVCParams(skpt.LabelPowerset(), dataset_train_x, dataset_train_y)
    base_classif = SVC(kernel=dict_res['classifier__kernel'],
                       degree=dict_res['classifier__degree'])
    LabelPowerset(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                  base_classif, "SVC tuned")
    dict_res = FindBestMNBParams(skpt.LabelPowerset(), dataset_train_x, dataset_train_y)
    base_classif = MultinomialNB(alpha=dict_res['classifier__alpha'])
    LabelPowerset(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
                  base_classif, "MNB tuned")

    # MLkNN
    Util_Title("MLkNN")
    dict_res = FindBestK(skadapt.MLkNN(), dataset_train_x, dataset_train_y)
    MLkNN(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
          dict_res['k'], dict_res['s'])

    # MLARAM
    Util_Title("MLARAM")
    dict_res = FindBestVT(dataset_train_x, dataset_train_y)
    MLARAM(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           dict_res['vigilance'], dict_res['threshold'])

    # BRkNNa
    Util_Title("BRkNNa")
    dict_res = FindBestK(skadapt.BRkNNaClassifier(), dataset_train_x, dataset_train_y)
    BRkNNa(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, dict_res['k'])

    # BRkNNb
    Util_Title("BRkNNb")
    dict_res = FindBestK(skadapt.BRkNNbClassifier(), dataset_train_x, dataset_train_y)
    BRkNNb(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, dict_res['k'])

    # RAkELd
    Util_Title("RAkELd")
    dict_res = GridSearchCV_baseRakel(RakelD(), dataset_train_x, dataset_train_y)
    RAkELd(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           dict_res['base_classifier'], dict_res['labelset_size'])

    # RAkELo
    Util_Title("RAkELo")
    dict_res = GridSearchCV_baseRakel(RakelO(), dataset_train_x, dataset_train_y)
    RAkELO(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           dict_res['base_classifier'], dict_res['labelset_size'],
           dict_res['model_count'])

    # MLTSVM
    Util_Title("MLTSVM")
    dict_res = FindCKParam(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y)
    TwinMLSVM(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
              dict_res['c_k'], dict_res['sor_omega'])
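# A hedged sketch of invoking the driver above. load_dataset is
# scikit-multilearn's bundled ARFF dataset loader; everything else assumes
# the repo's helper functions are importable from the same module.
from skmultilearn.dataset import load_dataset

train_x, train_y, _, _ = load_dataset('emotions', 'train')
test_x, test_y, _, _ = load_dataset('emotions', 'test')
Util_ClassifierMethods(train_x, train_y, test_x, test_y)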
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from skmultilearn.ensemble import RakelO


def GridSearchCV_baseRakel(classif, dataset_train_x, dataset_train_y):
    # labelset_size denotes the desired size of each label partition
    range_labelset_size = list(range(1, 11))
    rangefloat = [round(x * 0.1, 1) for x in range(1, 11)]
    parameters = [
        {
            'base_classifier': [GaussianNB()],
            'labelset_size': range_labelset_size,
        },
        {
            'base_classifier': [MultinomialNB()],
            # additive smoothing parameter of the naive Bayes model
            'base_classifier__alpha': rangefloat,
            'labelset_size': range_labelset_size,
        },
        {
            'base_classifier': [SVC()],
            'base_classifier__kernel': ['rbf', 'linear', 'sigmoid'],
            'labelset_size': range_labelset_size,
        },
    ]
    is_rakelo = isinstance(classif, RakelO)
    print(is_rakelo)
    if is_rakelo:
        # RAkELo additionally needs model_count; the heuristic below keeps
        # labelset_size below half the label count and fixes model_count at
        # twice the label count.
        n_labels = dataset_train_y.shape[1]
        end_range = n_labels // 2 if n_labels // 2 > (3 + 1) else n_labels
        range_labelset_size = list(range(3, end_range))
        range_model_count = list(range(2 * n_labels, 2 * n_labels + 1))
        print(n_labels)
        print(range_labelset_size)
        print(range_model_count)
        parameters = [
            {
                'base_classifier': [GaussianNB()],
                'labelset_size': range_labelset_size,
                'model_count': range_model_count,
            },
            {
                'base_classifier': [MultinomialNB()],
                'base_classifier__alpha': rangefloat,
                'labelset_size': range_labelset_size,
                'model_count': range_model_count,
            },
            {
                'base_classifier': [SVC()],
                'base_classifier__kernel': ['rbf', 'linear', 'sigmoid'],
                'labelset_size': range_labelset_size,
                'model_count': range_model_count,
            },
        ]
    classifier = GridSearchCV(classif,
                              parameters,
                              scoring=make_scorer(metrics.hamming_loss,
                                                  greater_is_better=False),
                              n_jobs=3)
    classifier.fit(dataset_train_x, dataset_train_y)
    return classifier.best_params_
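# A minimal usage sketch for the grid search above, on assumed synthetic
# data; the returned best_params_ dict is fed straight back into RakelO,
# mirroring what Util_ClassifierMethods does with dict_res.
from sklearn.datasets import make_multilabel_classification

X, y = make_multilabel_classification(n_samples=150, n_classes=10, n_labels=3,
                                      random_state=0)
best = GridSearchCV_baseRakel(RakelO(), X, y)
tuned = RakelO(base_classifier=best['base_classifier'],
               labelset_size=best['labelset_size'],
               model_count=best['model_count'])
tuned.fit(X, y)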
def build_clf(X, y, top_estimator, base_estimator):
    """
    Parameters
    ----------
    X : NumPy array
        Generated by concatenating the corpus and keyword vectors.
    y : NumPy array
        y.shape[0] should equal X.shape[0].
    top_estimator : str
        One of ['ClassifierChain', 'LabelPowerset', 'Rakelo'].
    base_estimator : str
        One of ['KNeighbors', 'MLP', 'RandomForest', 'LogisticRegression', 'SGD'].

    In scikit-learn, only the following classifiers support multilabel
    classification natively:

        sklearn.tree.DecisionTreeClassifier
        sklearn.tree.ExtraTreeClassifier
        sklearn.ensemble.ExtraTreesClassifier
        sklearn.neighbors.KNeighborsClassifier
        sklearn.neural_network.MLPClassifier
        sklearn.neighbors.RadiusNeighborsClassifier
        sklearn.ensemble.RandomForestClassifier
        sklearn.linear_model.RidgeClassifierCV

    We have to use the `multiclass` or `multioutput` modules if the base
    classifier is not among those listed above. In the multilabel learning
    literature, OvR is also known as the binary relevance method. An
    indicator matrix, i.e. a matrix of shape (n_samples, n_classes), turns
    on multilabel classification.

    Returns
    -------
    A fitted classifier object.
    """
    if base_estimator in ['KNeighbors', 'MLP', 'RandomForest']:
        # These three support multilabel y natively, so they are fitted directly.
        if base_estimator == 'KNeighbors':
            # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
            from sklearn.neighbors import KNeighborsClassifier
            n_neighbors = int(input('Enter the number of neighbors\n'))
            weights = input('Enter the weight function used in prediction\n')
            leaf_size = int(input('Enter the leaf size\n'))
            classifier = KNeighborsClassifier(n_neighbors=n_neighbors,
                                              weights=weights,
                                              leaf_size=leaf_size)
        elif base_estimator == 'MLP':
            # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
            from sklearn.neural_network import MLPClassifier
            hidden_layer_sizes = tuple(eval(input(
                'Enter the size of the hidden layers.\n'
                'e.g. (64, 64, 64) indicates a simple three-layer feedforward '
                'neural network with 64 neurons in each layer\n')))
            # use lbfgs because it's a small dataset
            classifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                                       solver='lbfgs')
        elif base_estimator == 'RandomForest':
            # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
            from sklearn.ensemble import RandomForestClassifier
            n_estimators = int(input('Enter the number of trees in the forest.\n'))
            max_depth = eval(input('Enter the maximum depth of the tree (int or None)\n'))
            min_samples_split = eval(input(
                'Enter the minimum number of samples required to split an internal node\n'))
            min_samples_leaf = eval(input(
                'Enter the minimum number of samples required to be at a leaf node\n'))
            classifier = RandomForestClassifier(n_estimators=n_estimators,
                                                max_depth=max_depth,
                                                min_samples_split=min_samples_split,
                                                min_samples_leaf=min_samples_leaf)
        classifier.fit(X, y)
        print('Training...\n')
        return classifier
    else:
        # Linear models are not natively multilabel, so they are wrapped in a
        # problem-transformation ensemble below.
        if base_estimator == 'LogisticRegression':
            # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
            from sklearn.linear_model import LogisticRegression
            penalty = input('Enter the norm used in the penalization. '
                            'One of {l1, l2, elasticnet, none}\n')
            tol = float(input('Enter the tolerance for stopping criteria; default=1e-4. '
                              'A bigger number means faster convergence\n'))
            C = float(input('Enter the regularization strength. '
                            'Smaller values specify stronger regularization. '
                            'Must be a positive number.\n'))
            solver = input('Enter the optimizer. '
                           "One of {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}\n")
            max_iter = int(input('Enter the maximum number of iterations.\n'))
            base_classifier = LogisticRegression(penalty=penalty,
                                                 tol=tol,
                                                 C=C,
                                                 solver=solver,
                                                 max_iter=max_iter)
        elif base_estimator == 'SGD':
            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
            from sklearn.linear_model import SGDClassifier
            loss = input('Enter the loss function. '
                         "One of {'hinge', 'log', 'modified_huber', 'squared_hinge'}\n")
            penalty = input("Enter the penalty. One of {'l2', 'l1', 'elasticnet'}\n")
            tol = float(input('Enter the tolerance for stopping criteria; default=1e-4. '
                              'A bigger number means faster convergence\n'))
            learning_rate = input("Enter the learning rate schedule: "
                                  "either 'optimal' or 'adaptive'\n")
            eta0 = eval(input('Enter the initial learning rate\n'))
            base_classifier = SGDClassifier(loss=loss,
                                            penalty=penalty,
                                            tol=tol,
                                            learning_rate=learning_rate,
                                            eta0=eta0,
                                            alpha=0.0008)
        if top_estimator == 'Rakelo':
            classifier = RakelO(base_classifier=base_classifier,
                                base_classifier_require_dense=[True, True],
                                labelset_size=y.shape[1] // 4,
                                model_count=6)
        else:
            # Resolves 'ClassifierChain' or 'LabelPowerset' from the module
            # namespace and wraps the base classifier in it.
            classifier = globals()[top_estimator](base_classifier)
        classifier.fit(X, y)
        print('Training...\n')
        return classifier
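# Hypothetical call of build_clf: the function is interactive, so every
# hyperparameter is typed at an input() prompt. X and y below are random
# stand-ins for the concatenated corpus/keyword vectors from the docstring.
import numpy as np

X = np.random.rand(50, 20)
y = np.random.randint(0, 2, size=(50, 8))
clf = build_clf(X, y, top_estimator='Rakelo', base_estimator='SGD')
print(clf.predict(X[:3]))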
def test_if_works_with_cross_validation(self):
    classifier = RakelO(classifier=self.get_labelpowerset_with_nb(),
                        model_count=20,
                        labelset_size=5)
    self.assertClassifierWorksWithCV(classifier)
def test_if_dense_classification_works_on_dense_base_classifier(self):
    classifier = RakelO(classifier=self.get_labelpowerset_with_nb(),
                        model_count=20,
                        labelset_size=5)
    self.assertClassifierWorksWithSparsity(classifier, 'dense')
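# These last two fixtures predate the current scikit-multilearn API, where
# the ensemble takes base_classifier (wrapped in LabelPowerset internally)
# rather than a pre-built classifier. A hedged equivalent against recent
# releases (an assumption about the installed version) would be:
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelO

classifier = RakelO(base_classifier=GaussianNB(),
                    base_classifier_require_dense=[True, True],
                    model_count=20,
                    labelset_size=5)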