from sklearn.svm import LinearSVC
from skmultilearn.ensemble import RakelD


def rakeld_ensemble(vec, label):
    # RakelD wraps the base classifier in a Label Powerset per label subset
    # internally, so LinearSVC is passed directly via base_classifier
    classifier = RakelD(base_classifier=LinearSVC(),
                        base_classifier_require_dense=[False, True],
                        labelset_size=5)
    classifier.fit(vec, label)
    return classifier
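# A hedged usage sketch for rakeld_ensemble: vec is a (typically sparse)
# feature matrix and label a binary indicator matrix. Everything below is
# illustrative toy data, not taken from the original project.
from scipy import sparse
from sklearn.datasets import make_multilabel_classification

X, y = make_multilabel_classification(n_samples=100, n_classes=10, random_state=0)
model = rakeld_ensemble(sparse.csr_matrix(X), y)
print(model.predict(sparse.csr_matrix(X[:5])).toarray())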
def RAkELd(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           base_clasif, num_labels):
    classifier = RakelD(base_classifier=base_clasif, labelset_size=num_labels)
    classifier.fit(dataset_train_x, dataset_train_y)
    predictions = classifier.predict(dataset_test_x)
    Metrics_Accuracy("RAkELd", predictions, dataset_test_y)
def build_Rake(X_train, y_train, X_test, y_test):
    classifier = RakelD(base_classifier=GaussianNB(),
                        base_classifier_require_dense=[True, True],
                        labelset_size=4)
    classifier.fit(X_train, y_train)
    prediction = classifier.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))
def RAkELd(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           base_clasif, num_labels):
    classifier = RakelD(base_classifier=base_clasif, labelset_size=num_labels)
    start_time = time.time()
    classifier.fit(dataset_train_x, dataset_train_y)
    stop_time = time.time()
    time_lapsed = stop_time - start_time
    predictions = classifier.predict(dataset_test_x)
    Metrics_Accuracy("RAkELd", predictions, dataset_test_y)
    print("Execution time: {}s".format(time_lapsed))
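# A usage sketch for the timed RAkELd wrapper above. Metrics_Accuracy is a
# project helper not shown here, so this only runs inside that project; the
# toy split below is purely illustrative.
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_multilabel_classification(n_samples=200, n_classes=6, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)
RAkELd(X_tr, y_tr, X_te, y_te, base_clasif=GaussianNB(), num_labels=3)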
def run(classifier, train_test_set):
    X_train, X_test, y_train, y_test = train_test_set
    # init model and fit to train data
    rakel = RakelD(base_classifier=classifier)
    rakel.fit(X_train, y_train)
    # make predictions
    y_pred = rakel.predict(X_test)
    print('\n--------Rakel with {:}'.format(rakel))
    return y_test, y_pred
def GridSearchCV_base(classif, dataset_train_x, dataset_train_y):
    # classif is unused here; the grid below supplies its own base classifiers
    rangefloat = [round(x * 0.1, 1) for x in range(1, 11)]
    parameters = [
        {
            'base_classifier': [GaussianNB()],
            #'labelset_size':
        },
        {
            'base_classifier': [MultinomialNB()],
            'base_classifier__alpha': rangefloat,  # additive smoothing parameter for NB
        },
        {
            'base_classifier': [SVC()],
            'base_classifier__kernel': ['rbf', 'linear', 'sigmoid'],
        },
    ]
    classifier = GridSearchCV(RakelD(), parameters,
                              scoring=make_scorer(metrics.hamming_loss,
                                                  greater_is_better=False),
                              n_jobs=3)
    classifier.fit(dataset_train_x, dataset_train_y)
    return classifier.best_params_
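# The 'base_classifier__alpha' entries above use scikit-learn's nested-parameter
# convention ('<component>__<param>'), which RakelD supports through its
# get_params/set_params. A small sanity check of that routing (the values
# here are arbitrary examples):
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.ensemble import RakelD

model = RakelD(base_classifier=MultinomialNB())
model.set_params(base_classifier__alpha=0.5, labelset_size=4)
print(model.get_params()['base_classifier__alpha'])  # -> 0.5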
def RAkEL_fit(clfs, steps, X_train, y_train, X_test, y_test):
    metrics = {}
    for key, clf in clfs.items():
        acc = []
        prec_micro = []
        prec_macro = []
        hamm_loss = []
        f1_micro = []
        f1_macro = []
        print('Fitting RAkEL with Base Classifier: %s' % key)
        for k in steps:
            classifier = RakelD(base_classifier=clf, labelset_size=k)
            classifier.fit(X_train, y_train)
            prediction = classifier.predict(X_test)
            acc.append(accuracy_score(y_test, prediction))
            prec_micro.append(precision_score(y_test, prediction, average='micro'))
            prec_macro.append(precision_score(y_test, prediction, average='macro'))
            hamm_loss.append(hamming_loss(y_test, prediction))
            f1_micro.append(f1_score(y_test, prediction, average='micro'))
            f1_macro.append(f1_score(y_test, prediction, average='macro'))
        metrics[key] = [acc, hamm_loss, f1_micro, f1_macro, prec_micro, prec_macro]
    return metrics
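# Driving RAkEL_fit: sweep labelset_size over `steps` for two base classifiers.
# A self-contained toy example; the data and classifier names are assumptions,
# not taken from the original project.
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

X, y = make_multilabel_classification(n_samples=300, n_classes=8, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
curves = RAkEL_fit({'GaussianNB': GaussianNB(), 'Tree': DecisionTreeClassifier()},
                   steps=[2, 3, 4], X_train=X_tr, y_train=y_tr,
                   X_test=X_te, y_test=y_te)
# curves['GaussianNB'] = [acc, hamming, f1_micro, f1_macro, prec_micro, prec_macro]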
def rakel_model(X_train, X_test, y_train, y_test, labels, seed):
    rakel = Pipeline([
        ('count_vectorizer', CountVectorizer()),
        ('tf-idf_log', TfidfTransformer(sublinear_tf=True)),
        ('rakel', RakelD(base_classifier=LinearSVC(C=1, class_weight='balanced',
                                                   random_state=seed),
                         base_classifier_require_dense=[True, True],
                         labelset_size=3))
    ])
    # train and predict model
    start_time = time.time()
    rakel.fit(X_train, y_train)
    prediction = rakel.predict(X_test)
    stop_time = time.time()
    # calculate scores (per-label F1 and Jaccard, the latter returned as "accuracy")
    f1 = f1_score(y_test, prediction, average=None)
    accuracy = jaccard_score(y_test, prediction, average=None)
    return f1, accuracy, stop_time - start_time
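# Because rakel_model vectorizes inside the pipeline, it can be fed raw text.
# A minimal sketch with toy documents and binary label rows; every name and
# value below is illustrative, not taken from the original project.
import numpy as np

docs_train = ['cheap flights to rome', 'learn python fast', 'rome travel tips']
docs_test = ['python travel scripts']
y_tr = np.array([[1, 0], [0, 1], [1, 0]])  # columns: travel, programming
y_te = np.array([[1, 1]])

f1, jac, seconds = rakel_model(docs_train, docs_test, y_tr, y_te,
                               labels=['travel', 'programming'], seed=7)
print(f1, jac, seconds)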
def Util_ClassifierMethods(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y):
    # BinaryRelevance/ClassifierChain/LabelPowerset etc. below are local
    # evaluation wrappers; library classes come in via the skpt/skadapt aliases
    # BR
    Util_Title("Binary Relevance")
    base_classif = GaussianNB()
    BinaryRelevance(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "GaussianNB")
    dict_res = FindBestSVCParams(skpt.BinaryRelevance(), dataset_train_x, dataset_train_y)
    base_classif = SVC(kernel=dict_res['classifier__kernel'], degree=dict_res['classifier__degree'])
    BinaryRelevance(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "SVC tuned")
    dict_res = FindBestMNBParams(skpt.BinaryRelevance(), dataset_train_x, dataset_train_y)
    base_classif = MultinomialNB(alpha=dict_res['classifier__alpha'])
    BinaryRelevance(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "MNB tuned")

    # CC
    Util_Title("Classifier Chain")
    base_classif = GaussianNB()
    ClassifierChain(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "GaussianNB")
    dict_res = FindBestSVCParams(skpt.ClassifierChain(), dataset_train_x, dataset_train_y)
    base_classif = SVC(kernel=dict_res['classifier__kernel'], degree=dict_res['classifier__degree'])
    ClassifierChain(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "SVC tuned")
    dict_res = FindBestMNBParams(skpt.ClassifierChain(), dataset_train_x, dataset_train_y)
    base_classif = MultinomialNB(alpha=dict_res['classifier__alpha'])
    ClassifierChain(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "MNB tuned")

    # LP
    Util_Title("Label Powerset")
    base_classif = GaussianNB()
    LabelPowerset(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "GaussianNB")
    dict_res = FindBestSVCParams(skpt.LabelPowerset(), dataset_train_x, dataset_train_y)
    base_classif = SVC(kernel=dict_res['classifier__kernel'], degree=dict_res['classifier__degree'])
    LabelPowerset(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "SVC tuned")
    dict_res = FindBestMNBParams(skpt.LabelPowerset(), dataset_train_x, dataset_train_y)
    base_classif = MultinomialNB(alpha=dict_res['classifier__alpha'])
    LabelPowerset(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, base_classif, "MNB tuned")

    # MLkNN
    Util_Title("MLkNN")
    dict_res = FindBestK(skadapt.MLkNN(), dataset_train_x, dataset_train_y)
    MLkNN(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, dict_res['k'], dict_res['s'])

    # MLARAM
    Util_Title("MLARAM")
    dict_res = FindBestVT(dataset_train_x, dataset_train_y)
    MLARAM(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, dict_res['vigilance'], dict_res['threshold'])

    # BRkNNa
    Util_Title("BRkNNa")
    dict_res = FindBestK(skadapt.BRkNNaClassifier(), dataset_train_x, dataset_train_y)
    BRkNNa(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, dict_res['k'])

    # BRkNNb
    Util_Title("BRkNNb")
    dict_res = FindBestK(skadapt.BRkNNbClassifier(), dataset_train_x, dataset_train_y)
    BRkNNb(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y, dict_res['k'])

    # RAkELd
    Util_Title("RAkELd")
    dict_res = GridSearchCV_baseRakel(RakelD(), dataset_train_x, dataset_train_y)
    RAkELd(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           dict_res['base_classifier'], dict_res['labelset_size'])

    # RAkELo
    Util_Title("RAkELo")
    dict_res = GridSearchCV_baseRakel(RakelO(), dataset_train_x, dataset_train_y)
    RAkELO(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           dict_res['base_classifier'], dict_res['labelset_size'], dict_res['model_count'])

    # MLTSVM
    Util_Title("MLTSVM")
    dict_res = FindCKParam(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y)
    TwinMLSVM(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
              dict_res['c_k'], dict_res['sor_omega'])
def get_rakeld_with_nb(self):
    return RakelD(base_classifier=GaussianNB(),
                  base_classifier_require_dense=[True, True],
                  labelset_size=TEST_LABELSET_SIZE)
# setup the ensemble metaclassifier
classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer)
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)  # all zero using SVC
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0.029049295774647887, the result is reasonable
mean_squared_error(t_test.toarray(), probabilities.toarray())
# trying different cluster methods: walktrap initially gives 0.043, greedy 0.029, infomap 0.051
# in a naive test of boosting with some parameter combinations, forest beats boosting

# Rakel
base_classifier = RandomForestClassifier()
problem_transform_classifier = LabelPowerset(classifier=base_classifier)
# setup the ensemble meta-classifier
classifier = RakelD(problem_transform_classifier, labelset_size=3)
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0.0079225352112676055, random partition is not good here
mean_squared_error(t_test.toarray(), probabilities.toarray())

# parameter tuning of space partitioning with clusterer
parameters = {
    'classifier': [LabelPowerset()],  # BinaryRelevance performs pretty bad here
    'clusterer': [
        IGraphLabelCooccurenceClusterer('infomap', weighted=True,
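# The tuning dict above is cut off in the source; a hedged sketch of how such a
# grid is typically handed to GridSearchCV over the partitioning classifier,
# reusing the names already defined above (scorer choice is an assumption):
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(LabelSpacePartitioningClassifier(), parameters, scoring='f1_micro')
search.fit(X_train, t_train)
print(search.best_params_, search.best_score_)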
def test_if_works_with_cross_validation(self):
    # use the RakelD helper built on the current base_classifier API
    classifier = self.get_rakeld_with_nb()
    self.assertClassifierWorksWithCV(classifier)
ft_OT.rename(mapper=lambda x: x + "_OT", axis=1, inplace=True)
X = np.concatenate((ft_FP, ft_OT), axis=1)

scoring_funcs = {
    "hamming loss": hamming_func,
    "aiming": aiming_func,
    "coverage": coverage_func,
    "accuracy": accuracy_func,
    "absolute true": absolute_true_func,
}

# Keep recorded
parameters = {'labelset_size': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
rakeld = GridSearchCV(RakelD(base_classifier=GaussianNB(),
                             base_classifier_require_dense=[True, True]),
                      param_grid=parameters,
                      n_jobs=-1,
                      cv=loocv,
                      scoring=scoring_funcs,
                      verbose=3,
                      refit="absolute true")
rakeld.fit(X, Y.values)
print(rakeld.best_score_)
mytuple = (rakeld, )
to_save = dump(mytuple, filename="rakeld.joblib")
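# With a dict of scorers, GridSearchCV selects the final model by the metric
# named in refit ("absolute true" here); the per-metric curves land in
# cv_results_ under sklearn's 'mean_test_<scorer-name>' keys:
print(rakeld.best_params_['labelset_size'])
print(rakeld.cv_results_['mean_test_absolute true'])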
def pipeline(method, X_train, y_train, scoring, params=None, search_r=True, best=None):
    if search_r:
        # Random search params
        r = np.random.uniform(-2, 2, size=5)
        C = np.array(10 ** r)
        alpha = np.random.uniform(0, 1, size=5)
        params_tree = {'__max_depth': sp.randint(1, 30),
                       '__max_features': sp.randint(1, X_train.shape[1]),
                       '__min_samples_split': sp.randint(2, X_train.shape[0] // 3),
                       '__criterion': ['gini', 'entropy']}
        params_lgr = {'__C': C}
        params_nb = {'__alpha': alpha}
        tree_k, tree_v = list(params_tree.keys()), list(params_tree.values())
        lgr_k, lgr_v = list(params_lgr.keys()), list(params_lgr.values())
        nb_k, nb_v = list(params_nb.keys()), list(params_nb.values())
    else:
        params_cc, params_rk, params_bn = params[0], params[1], params[2]

    if method == 'CC':
        base_str = 'base_estimator'
        if search_r:
            params_tree, params_lgr, params_nb = (redefine(base_str, tree_k, tree_v),
                                                  redefine(base_str, lgr_k, lgr_v),
                                                  redefine(base_str, nb_k, nb_v))
            params = [params_lgr, params_tree, params_nb]
        else:
            params = params_cc
            tree_k, tree_v = list(params[1].keys()), list(params[1].values())
            lgr_k, lgr_v = list(params[0].keys()), list(params[0].values())
            nb_k, nb_v = list(params[2].keys()), list(params[2].values())
            params_tree, params_lgr, params_nb = (redefine(base_str, tree_k, tree_v),
                                                  redefine(base_str, lgr_k, lgr_v),
                                                  redefine(base_str, nb_k, nb_v))
            params = [params_lgr, params_tree, params_nb]
        print(colored('Fitting Classifiers Chain pipeline...', 'green'))
        classifiers = {
            "Logistic Regression": ClassifierChain(
                LogisticRegression(random_state=0, solver='lbfgs', n_jobs=-1)),
            "Decision Tree Classifier": ClassifierChain(DecisionTreeClassifier()),
            "MultinomialNB": ClassifierChain(MultinomialNB())}
    elif method == 'RAkEL':
        base_str = 'base_classifier'
        if search_r:
            params_tree, params_lgr, params_nb = (redefine(base_str, tree_k, tree_v),
                                                  redefine(base_str, lgr_k, lgr_v),
                                                  redefine(base_str, nb_k, nb_v))
            params = [params_lgr, params_tree, params_nb]
        else:
            params = params_rk
            tree_k, tree_v = list(params[1].keys()), list(params[1].values())
            lgr_k, lgr_v = list(params[0].keys()), list(params[0].values())
            nb_k, nb_v = list(params[2].keys()), list(params[2].values())
            params_tree, params_lgr, params_nb = (redefine(base_str, tree_k, tree_v),
                                                  redefine(base_str, lgr_k, lgr_v),
                                                  redefine(base_str, nb_k, nb_v))
            params = [params_lgr, params_tree, params_nb]
        print(colored('Fitting RAkEL pipeline...', 'green'))
        classifiers = {
            "Logistic Regression": RakelD(
                LogisticRegression(random_state=0, solver='lbfgs', n_jobs=-1)),
            "Decision Tree Classifier": RakelD(DecisionTreeClassifier(), labelset_size=5),
            "MultinomialNB": RakelD(MultinomialNB(), labelset_size=5)}
    elif method == 'BinaryRelevance':
        base_str = 'classifier'
        if search_r:
            params_tree, params_lgr, params_nb = (redefine(base_str, tree_k, tree_v),
                                                  redefine(base_str, lgr_k, lgr_v),
                                                  redefine(base_str, nb_k, nb_v))
            params = [params_lgr, params_tree, params_nb]
        else:
            params = params_bn
            tree_k, tree_v = list(params[1].keys()), list(params[1].values())
            lgr_k, lgr_v = list(params[0].keys()), list(params[0].values())
            nb_k, nb_v = list(params[2].keys()), list(params[2].values())
            params_tree, params_lgr, params_nb = (redefine(base_str, tree_k, tree_v),
                                                  redefine(base_str, lgr_k, lgr_v),
                                                  redefine(base_str, nb_k, nb_v))
            params = [params_lgr, params_tree, params_nb]
        print(colored('Fitting BinaryRelevance pipeline...', 'green'))
        classifiers = {
            "Logistic Regression": BinaryRelevance(
                LogisticRegression(random_state=0, solver='lbfgs', n_jobs=-1)),
            "Decision Tree Classifier": BinaryRelevance(DecisionTreeClassifier()),
            "MultinomialNB": BinaryRelevance(MultinomialNB())}
    else:
        raise ValueError('Invalid method passed. Expected one of: "CC", "RAkEL", '
                         '"BinaryRelevance", got {} instead'.format(method))

    res = {}
    for (keys, classifier), par in zip(classifiers.items(), params):
        res[keys] = hyperparameters_search(classifier, par, X_train, y_train, best,
                                           scoring, keys, candidates=30,
                                           random_search=search_r)
    # return the per-classifier search results
    return res
def class_multi_label(x, Y, model, wekamodelname, value):
    # detect whether the classification task is a multi-label problem
    num_of_labels = Y.ndim
    print("\n\n-----------------------------------------------------------\n")
    if num_of_labels == 1:
        print("This is not a multi-label problem!!!!!!")
        return model
    javapath = "C:\\Program Files\\Java\\jre1.8.0_251\\bin\\javaw.exe"
    myclasspath = download_meka()
    print(myclasspath)
    try:
        while 1:
            if (value < 1) or (value > 9):
                print("This is a Multi label problem")
                print("Please select:")
                print("1. For binary relevance")
                print("2. For pairwise comparison")
                print("3. Calibrated label ranking")
                print("4. Chain classifier ")
                print("5. PowerSet no pruning ")
                print("6. PowerSet with pruning ")
                print("7. Random-k Labelsets ")
                print("8. Monte-Carlo Classifier Chains ")
                print("9. Multi Label knn ")
                value = int(input("Please enter a choice:\n"))
            if value == 1:
                print("Applying binary relevance")
                # clf = BinaryRelevance(classifier=model, require_dense=[False, True])
                if wekamodelname == "nothing":
                    print("WEKA does not support this classifier")
                    clf = 0
                    break
                clf = Meka(
                    meka_classifier="meka.classifiers.multilabel.BR",
                    weka_classifier=wekamodelname,
                    meka_classpath=myclasspath,
                    java_command=javapath  # path to java executable
                )
                break
            elif value == 2:
                print("Fourclass Pairwise")
                if wekamodelname == "nothing":
                    print("WEKA does not support this classifier")
                    clf = 0
                    break
                clf = Meka(
                    meka_classifier="meka.classifiers.multilabel.FW",
                    weka_classifier=wekamodelname,
                    meka_classpath=myclasspath,
                    java_command=javapath
                )
                break
            elif value == 3:
                print("Applying calibrated label ranking")
                if wekamodelname == "nothing":
                    print("WEKA does not support this classifier")
                    clf = 0
                    break
                clf = Meka(
                    meka_classifier="meka.classifiers.multilabel.MULAN",
                    weka_classifier=wekamodelname + " -S CLR",
                    meka_classpath=myclasspath,
                    java_command=javapath
                )
                break
            elif value == 4:
                print("Applying Chain Classifier")
                # clf = ClassifierChain(classifier=model, require_dense=[False, True])
                if wekamodelname == "nothing":
                    print("WEKA does not support this classifier")
                    clf = 0
                    break
                clf = Meka(
                    meka_classifier="meka.classifiers.multilabel.CC",
                    weka_classifier=wekamodelname,
                    meka_classpath=myclasspath,
                    java_command=javapath
                )
                break
            elif value == 5:
                print("Applying powerset NO pruning")
                clf = LabelPowerset(classifier=model, require_dense=[False, True])
                break
            elif value == 6:
                print("Applying powerset with pruning")
                if wekamodelname == "nothing":
                    print("WEKA does not support this classifier")
                    clf = 0
                    break
                clf = Meka(
                    meka_classifier="meka.classifiers.multilabel.PS",
                    weka_classifier=wekamodelname,
                    meka_classpath=myclasspath,
                    java_command=javapath
                )
                break
            elif value == 7:
                print("Applying Random-k Labelsets")
                try:
                    clf = RakelD(base_classifier=model,
                                 base_classifier_require_dense=[False, True],
                                 labelset_size=4)
                except Exception:
                    print("RakelD exception")
                break
            elif value == 8:
                print("Monte-Carlo Classifier Chains")
                if wekamodelname == "nothing":
                    print("WEKA does not support this classifier")
                    clf = 0
                    break
                clf = Meka(
                    meka_classifier="meka.classifiers.multilabel.MCC",
                    weka_classifier=wekamodelname,
                    meka_classpath=myclasspath,
                    java_command=javapath
                )
                break
            elif value == 9:
                print("Applying Multilabel k Nearest Neighbours")
                try:
                    clf = MLkNN(k=3)
                except Exception:
                    print("Multilabel k Nearest Neighbours exception")
                break
            else:
                print("Try again!!!!")
    except Exception:
        print("\nSomething went wrong, but continue\n")
    return clf
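# A hedged example of driving class_multi_label non-interactively: value=7
# selects the RAkELd branch, which builds a scikit-multilearn model rather
# than a MEKA wrapper (note the function still calls download_meka() up
# front). The toy data and arguments below are illustrative assumptions.
from sklearn.datasets import make_multilabel_classification
from sklearn.naive_bayes import GaussianNB

x, Y = make_multilabel_classification(n_samples=50, n_classes=4, random_state=1)
clf = class_multi_label(x, Y, GaussianNB(), "nothing", 7)
# clf is now RakelD(base_classifier=GaussianNB(), ...); train with clf.fit(x, Y)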
def test_if_dense_classification_works_on_dense_base_classifier(self):
    # GaussianNB needs dense input, so use the dense-requiring RakelD helper
    classifier = self.get_rakeld_with_nb()
    self.assertClassifierWorksWithSparsity(classifier, 'dense')
# Method 3: classifier chain
clf = ClassifierChain(classifier=RandomForestClassifier(max_depth=200),
                      require_dense=[False, True])
anova_clf = Pipeline([('anova', vare), ('chaine', clf)])
anova_clf.fit(Xtrain, Ytrain)
pred = anova_clf.predict(Xtest)
matrix = multilabel_confusion_matrix(Ytest, pred)
accuracy = accuracy_score(Ytest, pred)
print(accuracy)

# Method 4: one-vs-rest
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100))
anova_clf = Pipeline([('anova', vare), ('oneVSrest', clf)])
anova_clf.fit(Xtrain, Ytrain)
pred = anova_clf.predict(Xtest)
matrix = multilabel_confusion_matrix(Ytest, pred)
accuracy = accuracy_score(Ytest, pred)
print(accuracy)

# Method 5: RAkEL
clf = RakelD(labelset_size=2, base_classifier=RandomForestClassifier())
anova_clf = Pipeline([('anova', vare), ('rakel', clf)])
anova_clf.fit(Xtrain, Ytrain)
pred = anova_clf.predict(Xtest)
matrix = multilabel_confusion_matrix(Ytest, pred)
accuracy = accuracy_score(Ytest, pred)
print(accuracy)
ft_OT.rename(mapper=lambda x: x + "_OT", axis=1, inplace=True)
X = np.concatenate((ft_FP, ft_OT), axis=1)

scoring_funcs = {
    "hamming loss": hamming_func,
    "aiming": aiming_func,
    "coverage": coverage_func,
    "accuracy": accuracy_func,
    "absolute true": absolute_true_func,
}

# Keep recorded
parameters = {'labelset_size': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
rakeld = GridSearchCV(RakelD(base_classifier=RandomForestClassifier(),
                             base_classifier_require_dense=[True, True]),
                      param_grid=parameters,
                      n_jobs=-1,
                      cv=loocv,
                      scoring=scoring_funcs,
                      verbose=3,
                      refit="absolute true")
rakeld.fit(X, Y.values)
print(rakeld.best_score_)
mytuple = (rakeld, )
to_save = dump(mytuple, filename="rakeld-rf.joblib")
X = pca.transform(X)
svd = TruncatedSVD(dims)
Xpca = svd.fit_transform(BOW_right)
# For BOW, uncomment this line:
# X_shuf, y_hot_shuf = shuffle(Xpca, y_hot, random_state=7)
# For anything else, uncomment this line:
X_shuf, y_hot_shuf = shuffle(X, y_hot, random_state=7)

# Five folds
classifier = RakelD(
    base_classifier=GaussianNB(),
    base_classifier_require_dense=[True, True],
    labelset_size=3
)
# shuffle=True is required when passing random_state to KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
# There is randomness inherent in this, so these numbers will change
scores = cross_val_score(classifier, X_shuf, y_hot_shuf, cv=kfold, scoring='f1_micro')
print("Scores")
print(np.mean(scores))

kf = KFold(n_splits=5, shuffle=True, random_state=7)
kf.get_n_splits(X_shuf)
accs = []
h_scs = []
for train_index, test_index in kf.split(X_shuf):
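# The loop body is truncated in the source; given the accs/h_scs accumulators
# above, it plausibly fits per fold and collects accuracy and Hamming loss.
# A hedged sketch of that body (an assumption, not the original code):
#     X_tr, X_te = X_shuf[train_index], X_shuf[test_index]
#     y_tr, y_te = y_hot_shuf[train_index], y_hot_shuf[test_index]
#     classifier.fit(X_tr, y_tr)
#     pred = classifier.predict(X_te)
#     accs.append(accuracy_score(y_te, pred))
#     h_scs.append(hamming_loss(y_te, pred))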
# initialize the RAkELd multi-label classifier (Gaussian Naive Bayes base)
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import f1_score
# jaccard_similarity_score was removed in scikit-learn 0.23; use jaccard_score there
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelD
from datetime import timedelta
import time

start = time.time()
classifier = RakelD(base_classifier=GaussianNB(),
                    base_classifier_require_dense=[True, True],
                    labelset_size=4)
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")
print("F1 = ", f1_score(y_test, predictions, average='micro'))
print("\n")
print("Jaccard = ", jaccard_similarity_score(y_test, predictions))
print("\n")
print("Precision = ", precision_score(y_test, predictions, average='micro'))
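# The report stops mid-stream above; a hedged continuation that uses the
# imports already present (recall, Hamming loss, and the timer started earlier):
print("Recall = ", recall_score(y_test, predictions, average='micro'))
print("Hamming loss = ", hamming_loss(y_test, predictions))
print("Elapsed = ", timedelta(seconds=time.time() - start))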
def get_rakeld_with_svc(self):
    return RakelD(base_classifier=SVC(probability=True),
                  base_classifier_require_dense=[False, True],
                  labelset_size=TEST_LABELSET_SIZE)