コード例 #1
0
 def test_if_works_with_cross_validation(self):
     """Check that a RakelO ensemble over GaussianNB passes the CV helper."""
     # scikit does not support sparse output matrix in CV; presumably that is
     # why require_dense ends in True here -- TODO confirm against RakelO docs.
     rakel_kwargs = dict(
         base_classifier=GaussianNB(),
         base_classifier_require_dense=[True, True],
         labelset_size=TEST_LABELSET_SIZE,
         model_count=TEST_MODEL_COUNT,
         require_dense=[False, True],
     )
     ensemble = RakelO(**rakel_kwargs)
     self.assertClassifierWorksWithCV(ensemble)
def RAkELO(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           base_clasif, num_labels, num_models):
    """Train a RakelO ensemble on the training split and report its metrics.

    Args:
        dataset_train_x: Training features passed to ``fit``.
        dataset_train_y: Training labels passed to ``fit``.
        dataset_test_x: Test features passed to ``predict``.
        dataset_test_y: Test labels handed to ``Metrics_Accuracy`` together
            with the predictions.
        base_clasif: Estimator used as RakelO's ``base_classifier``.
        num_labels: Value used as RakelO's ``labelset_size``.
        num_models: Value used as RakelO's ``model_count``.
    """
    rakel_settings = {
        'base_classifier': base_clasif,
        'labelset_size': num_labels,
        'model_count': num_models,
    }
    ensemble = RakelO(**rakel_settings)
    ensemble.fit(dataset_train_x, dataset_train_y)

    test_predictions = ensemble.predict(dataset_test_x)
    Metrics_Accuracy("RAkELO", test_predictions, dataset_test_y)
def RAkELO(dataset_train_x, dataset_train_y, dataset_test_x, dataset_test_y,
           base_clasif, num_labels, num_models):
    """Train a RakelO ensemble, report its metrics and print the fit time.

    Only the call to ``fit`` is timed; prediction and metric reporting are
    not included in the printed duration.

    Args:
        dataset_train_x: Training features passed to ``fit``.
        dataset_train_y: Training labels passed to ``fit``.
        dataset_test_x: Test features passed to ``predict``.
        dataset_test_y: Test labels handed to ``Metrics_Accuracy`` together
            with the predictions.
        base_clasif: Estimator used as RakelO's ``base_classifier``.
        num_labels: Value used as RakelO's ``labelset_size``.
        num_models: Value used as RakelO's ``model_count``.
    """
    classifier = RakelO(base_classifier=base_clasif,
                        labelset_size=num_labels,
                        model_count=num_models)

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed (or go negative) if the system clock is
    # adjusted while training runs, unlike time.time().
    start_time = time.perf_counter()
    classifier.fit(dataset_train_x, dataset_train_y)
    time_lapsed = time.perf_counter() - start_time

    predictions = classifier.predict(dataset_test_x)

    Metrics_Accuracy("RAkELO", predictions, dataset_test_y)
    print("Execution time: {}s".format(time_lapsed))
コード例 #4
0
 def getClassifier(self):
     """Instantiate the multi-label classifier named by ``self.classifierType``.

     The name is matched case-insensitively against ``'rakelo'``,
     ``'mlknn'``, ``'mlaram'`` and ``'labelpowerset'``.

     Returns:
         A newly constructed classifier; this method does not fit it.

     Raises:
         ValueError: If ``self.classifierType`` is not one of the supported
             names. Previously this case fell through to the ``return``
             with the local variable unbound and raised UnboundLocalError.
     """
     # Normalise once instead of lower-casing in every comparison.
     kind = self.classifierType.lower()

     if kind == 'rakelo':
         # labelset_size is fixed at 2; an earlier experiment derived it
         # from the number of label types instead.
         return RakelO(
             base_classifier=LabelPowerset(GaussianNB()),
             model_count=10,
             labelset_size=2,
         )
     if kind == 'mlknn':
         return MLkNN(k=3)
     # NOTE: an 'mltsvm' option (MLTSVM) existed here but is disabled.
     if kind == 'mlaram':
         return MLARAM()
     if kind == 'labelpowerset':
         return LabelPowerset(
             classifier=RandomForestClassifier(n_estimators=100),
             require_dense=[False, True])

     raise ValueError(
         "Unsupported classifierType {!r}; expected one of "
         "'rakelo', 'mlknn', 'mlaram', 'labelpowerset'".format(
             self.classifierType))
コード例 #5
0
 def get_rakeld_with_nb(self):
     """Return a RakelO ensemble whose base classifier is GaussianNB.

     NOTE: despite the "rakeld" in the name, the object built is a RakelO.
     """
     rakel_kwargs = dict(
         base_classifier=GaussianNB(),
         base_classifier_require_dense=[True, True],
         labelset_size=TEST_LABELSET_SIZE,
         model_count=TEST_MODEL_COUNT,
     )
     return RakelO(**rakel_kwargs)
コード例 #6
0
 def get_rakeld_with_svc(self):
     """Return a RakelO ensemble whose base classifier is SVC.

     NOTE: despite the "rakeld" in the name, the object built is a RakelO.
     """
     rakel_kwargs = dict(
         base_classifier=SVC(),
         base_classifier_require_dense=[False, True],
         labelset_size=TEST_LABELSET_SIZE,
         model_count=TEST_MODEL_COUNT,
     )
     return RakelO(**rakel_kwargs)
def Util_ClassifierMethods(dataset_train_x, dataset_train_y, dataset_test_x,
                           dataset_test_y):
    """Tune and run every multi-label method on one train/test split.

    For each method a title is emitted through ``Util_Title``, the method's
    hyper-parameters are searched on the training split with the matching
    ``Find*`` / ``GridSearchCV_baseRakel`` helper, and the method's runner
    function is then called on the full split with the parameters found.

    Args:
        dataset_train_x: Training features.
        dataset_train_y: Training labels.
        dataset_test_x: Test features.
        dataset_test_y: Test labels.
    """
    train = (dataset_train_x, dataset_train_y)
    splits = (dataset_train_x, dataset_train_y, dataset_test_x,
              dataset_test_y)

    # BR, CC and LP follow the same recipe: a GaussianNB run, then a tuned
    # SVC run, then a tuned MultinomialNB run.
    transformation_methods = (
        ("Binary Relevance", BinaryRelevance, skpt.BinaryRelevance),
        ("Classifier Chain", ClassifierChain, skpt.ClassifierChain),
        ("Label Powerset", LabelPowerset, skpt.LabelPowerset),
    )
    for title, run_method, transformer_cls in transformation_methods:
        Util_Title(title)
        run_method(*splits, GaussianNB(), "GaussianNB")

        svc_params = FindBestSVCParams(transformer_cls(), *train)
        tuned_svc = SVC(kernel=svc_params['classifier__kernel'],
                        degree=svc_params['classifier__degree'])
        run_method(*splits, tuned_svc, "SVC tuned")

        mnb_params = FindBestMNBParams(transformer_cls(), *train)
        tuned_mnb = MultinomialNB(alpha=mnb_params['classifier__alpha'])
        run_method(*splits, tuned_mnb, "MNB tuned")

    # MLkNN
    Util_Title("MLkNN")
    mlknn_params = FindBestK(skadapt.MLkNN(), *train)
    MLkNN(*splits, mlknn_params['k'], mlknn_params['s'])

    # MLARAM
    Util_Title("MLARAM")
    mlaram_params = FindBestVT(*train)
    MLARAM(*splits, mlaram_params['vigilance'], mlaram_params['threshold'])

    # BRkNNa
    Util_Title("BRkNNa")
    brknna_params = FindBestK(skadapt.BRkNNaClassifier(), *train)
    BRkNNa(*splits, brknna_params['k'])

    # BRkNNb
    Util_Title("BRkNNb")
    brknnb_params = FindBestK(skadapt.BRkNNbClassifier(), *train)
    BRkNNb(*splits, brknnb_params['k'])

    # RAkELd
    Util_Title("RAkELd")
    rakeld_params = GridSearchCV_baseRakel(RakelD(), *train)
    RAkELd(*splits, rakeld_params['base_classifier'],
           rakeld_params['labelset_size'])

    # RAkELo
    Util_Title("RAkELo")
    rakelo_params = GridSearchCV_baseRakel(RakelO(), *train)
    RAkELO(*splits, rakelo_params['base_classifier'],
           rakelo_params['labelset_size'], rakelo_params['model_count'])

    # MLTSVM: its parameter search is the only one that also receives the
    # test split.
    Util_Title("MLTSVM")
    mltsvm_params = FindCKParam(*splits)
    TwinMLSVM(*splits, mltsvm_params['c_k'], mltsvm_params['sor_omega'])
def GridSearchCV_baseRakel(classif, dataset_train_x, dataset_train_y):
    """Grid-search a RAkEL ensemble's base classifier and label-set settings.

    Three base classifiers are tried: GaussianNB, MultinomialNB (with
    ``alpha`` in 0.1 .. 1.0) and SVC (with 'rbf', 'linear' and 'sigmoid'
    kernels).  Candidates are scored by Hamming loss.

    * For a ``RakelO`` instance, ``labelset_size`` ranges over
      ``3 .. end - 1`` where ``end`` is half the number of label columns
      when that half exceeds 4 and the full number of columns otherwise;
      ``model_count`` is fixed at twice the number of label columns.
    * For anything else, ``labelset_size`` ranges over 1 .. 10 and
      ``model_count`` is not searched.

    Args:
        classif: The ensemble instance to tune.
        dataset_train_x: Training features passed to ``GridSearchCV.fit``.
        dataset_train_y: Training labels; ``shape[1]`` is read as the number
            of label columns.

    Returns:
        The ``best_params_`` of the fitted ``GridSearchCV``.
    """
    n_labels = dataset_train_y.shape[1]
    # for smoothing {Additive smoothing parameter NB}
    alpha_values = [round(step * 0.1, 1) for step in range(1, 11)]

    # isinstance() avoids building throwaway RakelO() objects just to compare
    # types, and also recognises subclasses of RakelO.
    is_rakelo = isinstance(classif, RakelO)
    print(is_rakelo)  # diagnostic output kept from the original

    if is_rakelo:
        half = n_labels // 2
        end_range = half if half > (3 + 1) else n_labels
        shared_grid = {
            'labelset_size': list(range(3, end_range)),
            'model_count': [2 * n_labels],
        }
        print(n_labels)
        print(shared_grid['labelset_size'])
        print(shared_grid['model_count'])
    else:
        # labelset_size denotes the desired size of partition
        shared_grid = {'labelset_size': list(range(1, 11))}

    # One sub-grid per base classifier; each also carries the shared
    # ensemble-level settings chosen above.
    parameters = [
        dict(shared_grid, base_classifier=[GaussianNB()]),
        dict(shared_grid,
             base_classifier=[MultinomialNB()],
             base_classifier__alpha=alpha_values),
        dict(shared_grid,
             base_classifier=[SVC()],
             base_classifier__kernel=['rbf', 'linear', 'sigmoid']),
    ]

    # Hamming loss is a loss, so greater_is_better=False makes the search
    # prefer the smallest value.
    classifier = GridSearchCV(classif,
                              parameters,
                              scoring=make_scorer(metrics.hamming_loss,
                                                  greater_is_better=False),
                              n_jobs=3)
    classifier.fit(dataset_train_x, dataset_train_y)
    return classifier.best_params_
コード例 #9
0
def build_clf(X, y, top_estimator, base_estimator):
    """Interactively configure, train and return a multi-label classifier.

    Hyper-parameters for the chosen base estimator are read from standard
    input.  'KNeighbors', 'MLP' and 'RandomForest' are fitted on ``y``
    directly; 'LogisticRegression' and 'SGD' are first wrapped in the
    estimator named by ``top_estimator``.

    Parameters
    ----------
    X : NumPy Array
        The array is generated by concatenating corpus and keywords vectors.
    y : NumPy Array
        y.shape[0] should be equal to X.shape[0].  ``y.shape[1] // 4`` is
        used as the RakelO ``labelset_size``.
    top_estimator : str
        Only used for 'LogisticRegression' and 'SGD'.  'Rakelo' builds a
        RakelO; any other value is looked up by name in this module's
        globals and called with the base classifier
        (e.g. 'ClassifierChain', 'LabelPowerset').
    base_estimator : str
        One of ['KNeighbors', 'MLP', 'RandomForest', 'LogisticRegression',
        'SGD'].


    In Scikit-learn, only the following support multilabel classification:

    sklearn.tree.DecisionTreeClassifier
    sklearn.tree.ExtraTreeClassifier
    sklearn.ensemble.ExtraTreesClassifier
    sklearn.neighbors.KNeighborsClassifier
    sklearn.neural_network.MLPClassifier
    sklearn.neighbors.RadiusNeighborsClassifier
    sklearn.ensemble.RandomForestClassifier
    sklearn.linear_model.RidgeClassifierCV

    We have to use the `multiclass` or `multioutput` modules
    if the base classifier is not among the above listed.

    # In the multilabel learning literature,
    # OvR is also known as the binary relevance method.
    # An indicator matrix - a matrix of shape (n_samples, n_classes)
    # turns on multilabel classification.

    Returns
    -------
    A fitted classifier object.

    Raises
    ------
    ValueError
        If ``base_estimator`` is not one of the supported names (previously
        this surfaced as an UnboundLocalError), or if a value typed at a
        prompt cannot be parsed.
    KeyError
        If ``top_estimator`` is neither 'Rakelo' nor a module-level name.
    """
    # SECURITY: the original called eval() on raw console input, which runs
    # arbitrary code.  literal_eval() accepts only Python literals (ints,
    # floats, tuples, None, ...), which covers every value prompted for here.
    import ast

    native_multilabel = ('KNeighbors', 'MLP', 'RandomForest')
    needs_wrapper = ('LogisticRegression', 'SGD')
    if base_estimator not in native_multilabel + needs_wrapper:
        raise ValueError(
            'Unknown base_estimator {!r}; expected one of {}'.format(
                base_estimator, list(native_multilabel + needs_wrapper)))

    if base_estimator == 'KNeighbors':
        # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        from sklearn.neighbors import KNeighborsClassifier
        n_neighbors = int(input('Enter the number of neighbors\n'))
        weights = input('Enter the weight function used in prediction\n')
        leaf_size = int(input('Enter the leaf size\n'))
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors,
                                          weights=weights,
                                          leaf_size=leaf_size)

    elif base_estimator == 'MLP':
        # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
        from sklearn.neural_network import MLPClassifier
        hidden_layer_sizes = tuple(ast.literal_eval(input(
            'Enter the size of the hidden layers.\n e.g: (64,64,64)'
            'indicates a simple three-layer feedforward neural network with 64 neurons in each layer\n'
        )))
        classifier = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            solver='lbfgs')  # use lbfgs because it's a small dataset

    elif base_estimator == 'RandomForest':
        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        from sklearn.ensemble import RandomForestClassifier
        n_estimators = int(
            input('Enter the number of trees in the forest.\n'))
        max_depth = ast.literal_eval(
            input('Enter the maximum depth of the tree (int or None)\n'))
        min_samples_split = ast.literal_eval(
            input(
                'The minimum number of samples required to split an internal node\n'
            ))
        min_samples_leaf = ast.literal_eval(
            input(
                'The minimum number of samples required to be at a leaf node\n'
            ))
        classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf)

    else:
        if base_estimator == 'LogisticRegression':
            # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
            from sklearn.linear_model import LogisticRegression
            penalty = input(
                'Enter the norm used in the penalization. One of {l1, l2, elasticnet, none}\n'
            )
            tol = float(
                input(
                    'Enter the tolerance for stopping criteria; default=1e-4. Bigger number means faster convergence\n'
                ))
            C = float(input(
                'Enter the regularization strength. Smaller values specify stronger regularization.'
                'Must be a positive number.\n'))
            solver = input(
                'Enter the optimizer. One of {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}\n'
            )
            max_iter = int(input('Enter the maximum number of iterations.\n'))
            base_classifier = LogisticRegression(penalty=penalty,
                                                 tol=tol,
                                                 C=C,
                                                 solver=solver,
                                                 max_iter=max_iter)
        else:  # 'SGD' -- the only name left after the validation above
            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
            from sklearn.linear_model import SGDClassifier
            loss = input(
                'Enter the loss function. One of {‘hinge’, ‘log’, ‘modified_huber’, ‘squared_hinge’}\n'
            )
            penalty = input(
                'Enter the penalty. One of {‘l2’, ‘l1’, ‘elasticnet’}\n')
            tol = float(
                input(
                    'Enter the tolerance for stopping criteria; default=1e-4. Bigger number means faster convergence\n'
                ))
            learning_rate = input(
                'Enter the learning rate: Either ‘optimal’ or ‘adaptive’\n')
            eta0 = ast.literal_eval(input('Enter the initial learning rate\n'))
            base_classifier = SGDClassifier(loss=loss,
                                            penalty=penalty,
                                            tol=tol,
                                            learning_rate=learning_rate,
                                            eta0=eta0,
                                            alpha=0.0008)

        if top_estimator == 'Rakelo':
            # NOTE(review): y.shape[1] // 4 is 0 when y has fewer than four
            # columns -- confirm RakelO accepts that before relying on it.
            classifier = RakelO(base_classifier=base_classifier,
                                base_classifier_require_dense=[True, True],
                                labelset_size=y.shape[1] // 4,
                                model_count=6)
        else:
            # Resolve the wrapper class by name from this module's globals.
            classifier = globals()[top_estimator](base_classifier)

    # Announce training before it starts; the original printed this only
    # after fit() had already returned.
    print('Training...\n')
    classifier.fit(X, y)
    return classifier
コード例 #10
0
    def test_if_works_with_cross_validation(self):
        """Check that RakelO over ``get_labelpowerset_with_nb()`` passes the CV helper."""
        rakel_options = {
            'classifier': self.get_labelpowerset_with_nb(),
            'model_count': 20,
            'labelset_size': 5,
        }
        ensemble = RakelO(**rakel_options)

        self.assertClassifierWorksWithCV(ensemble)
コード例 #11
0
    def test_if_dense_classification_works_on_dense_base_classifier(self):
        """Check RakelO over ``get_labelpowerset_with_nb()`` with the 'dense' sparsity helper."""
        rakel_options = {
            'classifier': self.get_labelpowerset_with_nb(),
            'model_count': 20,
            'labelset_size': 5,
        }
        ensemble = RakelO(**rakel_options)

        self.assertClassifierWorksWithSparsity(ensemble, 'dense')