Example no. 1
    # Assumed imports for this excerpt: DataFrame (pandas), List (typing),
    # train_test_split, MultiLabelBinarizer, TfidfVectorizer and LinearSVC
    # (scikit-learn), LabelPowerset (skmultilearn), dump (pickle), plus the
    # project-local nlp_preprocess module.
    def train(self,
              data: DataFrame,
              X_column: str,
              y_columns: List[str] = None):
        if y_columns is None:
            # every column except the text column is a label column
            # (a list comprehension keeps the column order deterministic,
            # unlike the original set difference)
            y_columns = [c for c in data.columns if c != X_column]
        X = data[X_column]
        y: DataFrame = data.drop(X_column, axis=1)
        xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        test_size=0.2)
        mlb = MultiLabelBinarizer()
        train_labels = mlb.fit_transform(ytrain[y_columns].values)
        # test_labels not used when training; it would need the already fitted
        # binarizer: mlb.transform(ytest[y_columns].values)
        train_cleaned = xtrain.copy(deep=True).apply(
            nlp_preprocess.Preprocess().clean_text)
        # test_cleaned not used when training
        # test_cleaned = xtest.copy(deep=True).apply(clean_text)
        vectorizer = TfidfVectorizer()
        vectorised_train_documents = vectorizer.fit_transform(train_cleaned)
        powersetsvc = LabelPowerset(LinearSVC())
        powersetsvc.fit(vectorised_train_documents, train_labels)
        # use context managers so both pickle files are closed properly
        with open("powersetsvc.pickle", "wb") as f0:
            dump(powersetsvc, f0)
        with open('vec.pickle', 'wb') as f1:
            dump(vectorizer, f1)
        return powersetsvc, vectorizer
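The two pickles written above imply a matching inference step. A minimal consumer sketch, assuming only the file names from the snippet (the input text and variable names are illustrative):

from pickle import load

with open("powersetsvc.pickle", "rb") as f:
    model = load(f)
with open("vec.pickle", "rb") as f:
    vectorizer = load(f)

# NOTE: for consistent results, apply the same clean_text preprocessing
# used at training time before vectorising
docs = ["raw text to classify"]                           # illustrative input
predictions = model.predict(vectorizer.transform(docs))   # sparse 0/1 matrix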
Example no. 2
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset


class LP():
    '''
        Label Powerset Method
    '''

    h = None

    def __init__(self, h=None):
        # build the default estimator per instance, to avoid sharing one
        # mutable default across LP objects (skmultilearn fits the wrapped
        # classifier in place)
        self.h = LabelPowerset(LogisticRegression() if h is None else h)

    def fit(self, X, Y):
        '''
            Train the model on training data X,Y
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
            Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
            Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
            (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
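A minimal usage sketch for the LP wrapper above, on a synthetic multi-label dataset; the dataset parameters and the max_iter setting are illustrative assumptions, not part of the original class:

from sklearn.datasets import make_multilabel_classification

X, Y = make_multilabel_classification(n_samples=100, n_features=20,
                                      n_classes=5, random_state=42)
clf = LP(h=LogisticRegression(max_iter=1000))
clf.fit(X, Y)
P = clf.predict_proba(X)   # sparse matrix, P[i, j] = P(Y[i, j] = 1 | X[i])
Y_pred = clf.predict(X)    # sparse 0/1 indicator matrix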
Example no. 3
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import LabelPowerset


class MyLabelPowerSetFeatureSelect():

    def fit(self, X, y):

        # using a Gaussian Naive Bayes base classifier
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())

        # fitting the data
        self.LabelPowerSetObject.fit(X, y)

        # transformed y: the multi-label matrix collapsed to multi-class codes
        y_transformed = self.LabelPowerSetObject.transform(y)

        # instantiating a SelectKBest selector (k=2 keeps the two best features)
        self.X_new = SelectKBest(chi2, k=2)

        # the feature selection itself
        self.X_transformed = self.X_new.fit_transform(X, y_transformed)

        # save indices of the selected attributes
        self.selected_attributes_indices = self.X_new.get_support(indices=True)

        # print(self.selected_attributes_indices, 'the indices of the selected attributes')

        return self

    def transform(self, X):
        return X[:, self.selected_attributes_indices]

    def predict(self, X):
        return self.LabelPowerSetObject.predict(X)

    def predict_proba(self, X):
        return self.LabelPowerSetObject.predict_proba(X)
Example no. 4
from skmultilearn.problem_transform import LabelPowerset


def multi_classTo_multi_multi(Y, model):
    num_of_labels = Y.ndim
    if (num_of_labels >= 2):
        print("This is already a multi-label problem!")
        return Y
    # NOTE: inverse_transform relies on the class-to-labelset mapping built by
    # transform(), so it must be called on the same LabelPowerset instance that
    # produced the multi-class codes (see multi_labelTo_multi_class below).
    transclf = LabelPowerset(classifier=model, require_dense=[False, True])
    return transclf.inverse_transform(Y)
Example no. 5
    def filter_rare_classes(self, feature_matrix, target_matrix):
        '''
        In order to perform a stratified split between train and test, each
        class should have at least 2 instances in the data. Hence, filter out
        label combinations that occur only once in the entire dataset.
        Input :
            Feature Matrix : matrix of features
            Target Matrix : matrix containing the target labels
        Output :
            Feature Matrix : Filtered
            Target Matrix : Filtered

        '''
        lp = LabelPowerset()
        multi_class_target_labels = lp.transform(target_matrix)
        classes_vc = np.asarray(
            np.unique(multi_class_target_labels,
                      return_counts=True)).T  # 1635 unique classes in the authors' dataset
        class_to_keep = classes_vc[np.where(classes_vc[:, 1] > 1)][:, 0]
        mask = [
            multi_class_target_labels[i] in class_to_keep
            for i in range(len(multi_class_target_labels))
        ]
        feature_matrix = feature_matrix[mask]
        target_matrix = target_matrix[mask]

        return feature_matrix, target_matrix
Example no. 6
def reduce_dimension(data1, label1, dimension_num, estimators=100):
    # Reduce the dimensionality of the feature vectors by keeping only the
    # most important features, as ranked by a random forest trained through
    # a Label Powerset transformation.

    # NOTE: 85 labels and 4189 features are hard-coded for the authors' dataset
    # print('label1: ', label1.shape)
    y_train = sparse.lil_matrix((label1.shape[0], 85))
    y_train[:, :] = label1
    # print(y_train.shape)

    X_train = sparse.lil_matrix((label1.shape[0], 4189))
    X_train[:, :] = data1
    # print(X_train.shape)

    classifier5 = RandomForestClassifier(n_estimators=estimators,
                                         random_state=1)
    classifier1 = LabelPowerset(classifier=classifier5,
                                require_dense=[False, True])
    classifier1.fit(X_train, y_train)

    # LabelPowerset fits the wrapped estimator in place, so the forest's
    # importances are available on classifier5 after fit
    importances = classifier5.feature_importances_
    # print('importances1: ', importances)
    indices = np.argsort(importances)[::-1]
    # print('indices', indices)
    features_importances = importances[indices]
    # plot_feature_importances(importances, 'Features Importance(Random Forest)', name1)

    return indices[:dimension_num], indices[
        dimension_num:], features_importances
Example no. 7
    def train_test_split(self, feature_matrix, target_matrix, test_size=0.2):
        '''
        The Stratified Shuffle Split technique is used to split train and test,
        so that classes appear in equal proportion in both sets.

        Input:
            feature_matrix : Feature matrix with rare classes filtered out
            target_matrix : Target matrix with rare classes filtered out
            test_size: default is 20%

        Output:
            train_x, train_y, test_x, test_y
        '''
        lp = LabelPowerset()
        # modern sklearn API: configure the splitter, then call split();
        # also honour the test_size argument instead of a hard-coded 0.2
        sss_level_1 = StratifiedShuffleSplit(n_splits=1,
                                             test_size=test_size,
                                             random_state=123)
        for train_ix, test_ix in sss_level_1.split(
                feature_matrix, lp.transform(target_matrix)):

            train_x = feature_matrix.iloc[train_ix, :]
            train_y = target_matrix.iloc[train_ix, :]

            test_x = feature_matrix.iloc[test_ix, :]
            test_y = target_matrix.iloc[test_ix, :]

        return train_x, train_y, test_x, test_y
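Examples no. 5 and no. 7 appear to be two methods of the same preprocessing class: dropping singleton label combinations is exactly what makes the stratified split well-defined, since a stratum with one member cannot appear in both train and test. A standalone sketch of the same idea for plain numpy arrays (the function name and seed handling are assumptions, not from the original source):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from skmultilearn.problem_transform import LabelPowerset

def stratified_multilabel_split(X, Y, test_size=0.2, seed=123):
    lp = LabelPowerset()
    y_mc = lp.transform(Y)            # multi-label rows -> multi-class codes
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=seed)
    train_ix, test_ix = next(sss.split(X, y_mc))
    return X[train_ix], Y[train_ix], X[test_ix], Y[test_ix]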
Example no. 8
    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        self.lm = LabelPowerset(MultinomialNB())
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        self.output_dim = 0
        if filename is not None:
            self.load(filename)
Example no. 9
from skmultilearn.problem_transform import LabelPowerset


def multi_labelTo_multi_class(Y, model):
    num_of_labels = Y.ndim
    if (num_of_labels == 1):
        print("This is not a multi-label problem!")
        return Y
    # LabelPowerset is used here because it provides the transform function
    # that actually does the multi_label to multi_class transformation.
    transclf = LabelPowerset(classifier=model, require_dense=[False, True])
    return [transclf, transclf.transform(Y)]
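Examples no. 4 and no. 9 are the two halves of one round trip: transform collapses a label matrix to multi-class codes, and inverse_transform expands the codes back, but only on the same LabelPowerset instance, because the code-to-labelset mapping is built during transform. A small sketch of the pairing (the toy matrix is illustrative):

import numpy as np
from sklearn.naive_bayes import GaussianNB

Y = np.array([[1, 0, 1],
              [0, 1, 0],
              [1, 0, 1]])                    # two distinct label combinations

transclf, Y_mc = multi_labelTo_multi_class(Y, GaussianNB())
print(Y_mc)                                  # one class id per combination, e.g. [0 1 0]
print(transclf.inverse_transform(Y_mc).toarray())  # recovers Y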
Example no. 10
    def labelSet(self):
        classifier = LabelPowerset(GaussianNB())

        classifier.fit(self.X_train, self.y_train)

        # predict; accuracy_score on multi-label indicators computes subset accuracy
        predictions = classifier.predict(self.X_test)
        result = accuracy_score(self.y_test, predictions)
        print(result)
Example no. 11
def logistic_regression_classifier(train_x, train_y):
    from sklearn.linear_model import LogisticRegression
    from skmultilearn.problem_transform import LabelPowerset

    # the default lbfgs solver does not support the L1 penalty; liblinear does
    model = LabelPowerset(LogisticRegression(penalty='l1',
                                             solver='liblinear'))
    model.fit(train_x, train_y)
    return model
Example no. 12
def generat_model():
    try:
        logging.info("Generating mlb.pkl model file in 'pkl' folder")
        mlb = read_input()
        # Loading processed data pickle
        annotated_reviews_df = pd.read_pickle("./pkl/annotated_reviews_df.pkl")

        # Convert the multi-labels into arrays
        y = mlb.fit_transform(annotated_reviews_df.aspects)
        X = annotated_reviews_df.text

        # Split data into train and test set
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=0)

        # save the fitted binarizer
        # This is important: it records how the multi-labels were binarized, so it
        # must be loaded downstream to undo the transformation and recover the correct labels.
        filename = "./pkl/mlb.pkl"
        pickle.dump(mlb, open(filename, 'wb'))
        logging.info(
            "Successfully generated and saved mlb.pkl model file in 'pkl' folder"
        )

        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words="english",
                                     ngram_range=(1, 1))),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
        ])
        text_clf = text_clf.fit(X_train, y_train)
        predicted = text_clf.predict(X_test)

        # Calculate element-wise label accuracy (the value is not stored or logged)
        np.mean(predicted == y_test)

        # Test if SVM performs better

        text_clf_svm = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf-svm',
                                  LabelPowerset(
                                      SGDClassifier(loss='hinge',
                                                    penalty='l2',
                                                    alpha=1e-3,
                                                    max_iter=6,
                                                    random_state=42)))])
        _ = text_clf_svm.fit(X_train, y_train)
        predicted_svm = text_clf_svm.predict(X_test)
    except Exception:
        logging.exception("Error in Generating mlb.pkl model file in 'pkl' folder")
        # re-raise: returning here would reference names that were never bound
        raise
    return predicted_svm, y_test, X, y
Example no. 13
def naive_bayes_classifier(train_x, train_y):
    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import LabelPowerset
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    classifier = LabelPowerset(GaussianNB())
    #    classifier = ClassifierChain(GaussianNB())
    #    classifier = BinaryRelevance(GaussianNB())
    classifier.fit(train_x, train_y)

    return classifier
Example no. 14
import numpy as np
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import LabelPowerset


def buildLBClassifier(xTrain, yTrain):
    # initialize Label Powerset multi-label classifier
    # with a Gaussian Naive Bayes base classifier
    classifier = LabelPowerset(GaussianNB())

    # train (contiguous copies avoid stride issues with some estimators)
    xTrain = np.ascontiguousarray(xTrain)
    yTrain = np.ascontiguousarray(yTrain)
    classifier.fit(xTrain, yTrain)

    return classifier
Example no. 15
    def powerset(self):

        classifier = LabelPowerset(LogisticRegression())
        classifier.fit(self.x_data, self.y_data)

        predictions = classifier.predict(self.x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example no. 16
def getClassifier(classifierType):
    if classifierType in (OVR_SVC_UNIGRAM_3, OVR_SVC_BiGRAM_3):
        return OneVsRestClassifier(LinearSVC())
    elif classifierType in (OVR_MNB_UNIGRAM, OVR_MNB_BiGRAM):
        return OneVsRestClassifier(MultinomialNB(alpha=0.7))
    elif classifierType in (OVR_SGD_UNIGRAM, OVR_SGD_BiGRAM):
        return OneVsRestClassifier(linear_model.SGDClassifier())
    elif classifierType in (LP_SVC_UNIGRAM, LP_SVC_BIGRAM):
        return LabelPowerset(LinearSVC())
    elif classifierType in (LP_MNB_UNIGRAM, LP_MNB_BIGRAM):
        return LabelPowerset(MultinomialNB(alpha=0.7))
    elif classifierType in (LP_SGD_UNIGRAM, LP_SGD_BiGRAM):
        return LabelPowerset(linear_model.SGDClassifier())
Example no. 17
    def resampling_data(self, X, y):

        # Import a dataset with X and multi-label y
        lp = LabelPowerset()
        ros = RandomOverSampler(random_state=42)

        # Applies the above-stated multi-label (ML) to multi-class (MC) transformation.
        yt = lp.transform(y)
        # fit_sample was renamed fit_resample in imbalanced-learn 0.4+
        X_resampled, y_resampled = ros.fit_resample(X, yt)
        # Inverts the ML-MC transformation to recreate the ML set
        y_resampled = lp.inverse_transform(y_resampled)

        return X_resampled, y_resampled
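This is the standard LabelPowerset-based oversampling recipe: imbalanced-learn's samplers expect a single class column, so the label sets are collapsed first and re-expanded afterwards. A self-contained sketch, assuming a toy dataset with one majority and two rare label combinations:

import numpy as np
from imblearn.over_sampling import RandomOverSampler
from skmultilearn.problem_transform import LabelPowerset

X = np.arange(12).reshape(6, 2)
y = np.array([[1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])

lp = LabelPowerset()
ros = RandomOverSampler(random_state=42)
X_res, y_mc = ros.fit_resample(X, lp.transform(y))
y_res = lp.inverse_transform(y_mc)     # back to a (sparse) multi-label matrix
print(X_res.shape[0], y_res.shape[0])  # 12 rows: each combination oversampled to 4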
Example no. 18
def chooseClassifier(classifier, X_train, y_train):
    if (classifier == "XGBoost"):
        model = LabelPowerset(XGBClassifier(random_state=0)).fit(
            X_train, y_train)
    elif (classifier == "RandomForest"):
        model = LabelPowerset(
            RandomForestClassifier(n_estimators=1000,
                                   criterion='entropy',
                                   random_state=0)).fit(X_train, y_train)
    elif (classifier == "SVM"):
        model = LabelPowerset(LinearSVC(random_state=0)).fit(X_train, y_train)
    elif (classifier == "LogisticRegression"):
        model = LabelPowerset(LogisticRegression(random_state=0)).fit(
            X_train, y_train)
    return model
Example no. 19
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import (BinaryRelevance, ClassifierChain,
                                            LabelPowerset)


def classifiers(X_train, Y_train, X_test):

    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())

    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)

    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)

    return predictions1, predictions2, predictions3
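The three wrappers compared above make different independence assumptions: Binary Relevance trains one classifier per label, Classifier Chain feeds earlier label predictions into later classifiers, and Label Powerset treats each observed label combination as a single class. A hedged driver for the function, with an invented synthetic dataset:

from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, Y = make_multilabel_classification(n_samples=200, n_classes=4,
                                      random_state=0)
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, random_state=0)
p1, p2, p3 = classifiers(X_tr, Y_tr, X_te)
for name, p in [("BinaryRelevance", p1), ("ClassifierChain", p2),
                ("LabelPowerset", p3)]:
    print(name, accuracy_score(Y_te, p))   # subset accuracy per strategy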
Example no. 20
def rakeld_ensemble(vec, label):
    problem_transform_classifier = LabelPowerset(classifier=LinearSVC(),
                                                 require_dense=[False, True])
    # NOTE: this targets the old skmultilearn API; current releases take the
    # base estimator directly, e.g. RakelD(base_classifier=LinearSVC(),
    # labelset_size=5), and wrap it in a LabelPowerset internally.
    classifier = RakelD(classifier=problem_transform_classifier,
                        labelset_size=5)
    classifier.fit(vec, label)
    return classifier
Example no. 21
    def make_use_w2v_fix(self):
        x_all = self.__vectors_provider.get_w2v_vectors_fix()
        y_all = self.__data_source.get_y_multi_label()

        # TODO here grid search

        base_estimators = [
            LogisticRegression(C=1.0, solver='sag', n_jobs=-1),
            # LogisticRegression(n_jobs=-1),
            # LinearSVC(),
            # MLPClassifier()
        ]

        model_params = [
            "LogisticRegression(C=1.0, solver='sag')",
            # "LogisticRegression()",
            # "LinearSVC()",
            # "MLPClassifier()"
        ]

        for base_estimator, model_param in zip(base_estimators, model_params):
            logging.warning(str(datetime.now()) + 'Start ' + model_param)
            try:
                model = LabelPowerset(base_estimator)
                cross_val_f1 = Evaluator.evaluate_only_cross_val(
                    model, x_all, y_all)
                self.__visualizer.show_results_briefly(self.__CLASSIFIER_NAME,
                                                       model_param,
                                                       "Word2Vec_fix",
                                                       cross_val_f1)
            except Exception:
                logging.exception('Error on ' + model_param)
            logging.warning(str(datetime.now()) + 'End ' + model_param)
Example no. 22
# assumed imports for this excerpt
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain, LabelPowerset


def train(classifier, X_train, X_test, y_train, y_test, strategy):
    """Computes a multi-label classification.

    This approach is used by the `one-vs-the-rest`, `classifier-chains`, and
    `label-powerset` strategies. Each strategy fits the classes either at the
    same time or in sequence, and since all classes are handled by one and
    only one model, knowledge about the classes can be gained by inspecting
    that single model.

    Args:
        classifier: An instance of a scikit-learn classifier.
        X_train: A matrix containing features for training.
        X_test: A matrix containing features for testing.
        y_train: A dataframe with one binary column per label, for training.
        y_test: A dataframe with one binary column per label, for testing.
        strategy: A string selecting one of the three strategies.

    Returns:
        A classification model and its performance report
    """
    if strategy == 'one-vs-the-rest':
        model = OneVsRestClassifier(classifier)
    elif strategy == 'classifier-chains':
        model = ClassifierChain(classifier)
    elif strategy == 'label-powerset':
        model = LabelPowerset(classifier)
    else:
        raise ValueError('unknown strategy: ' + strategy)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,
                                   y_pred,
                                   output_dict=True,
                                   target_names=y_train.columns)

    return model, report
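A quick driver for train above; the dataset and label column names are invented for illustration, and the one-vs-the-rest strategy is used here because plain scikit-learn readily accepts the pandas targets that target_names=y_train.columns presupposes:

import pandas as pd
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, Y = make_multilabel_classification(n_samples=300, n_classes=3,
                                      random_state=1)
Y = pd.DataFrame(Y, columns=['sports', 'politics', 'tech'])   # invented labels
X_tr, X_te, y_tr, y_te = train_test_split(X, Y, random_state=1)
model, report = train(LogisticRegression(max_iter=1000),
                      X_tr, X_te, y_tr, y_te, strategy='one-vs-the-rest')
print(report['micro avg']['f1-score'])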
Example no. 23
def calculate_accuracy():
    # Calculate accuracy
    try:
        logging.info("Generating naive_model1.pkl model file in 'pkl' folder")
        predicted_svm, y_test, X, y = generat_model()
        # element-wise label accuracy of the SVM (the value is not stored or logged)
        np.mean(predicted_svm == y_test)

        # Train naive bayes on full dataset and save model
        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words="english",
                                     ngram_range=(1, 1))),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
        ])
        text_clf = text_clf.fit(X, y)

        # save the model to disk
        filename = './pkl/naive_model1.pkl'
        pickle.dump(text_clf, open(filename, 'wb'))
        logging.info(
            "Successfully Generated naive_model1.pkl model file in 'pkl' folder"
        )
    except Exception:
        logging.exception(
            "Error in Generating naive_model1.pkl model file in 'pkl' folder")
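The two pickles written by Examples no. 12 and no. 23 pair up at inference time exactly as the comment about undoing the binarization says. A hedged consumer sketch (the review text is invented; the recovered labels depend on the training data):

import pickle

with open('./pkl/naive_model1.pkl', 'rb') as f:
    text_clf = pickle.load(f)
with open('./pkl/mlb.pkl', 'rb') as f:
    mlb = pickle.load(f)

pred = text_clf.predict(["the battery life is great"])   # invented review
print(mlb.inverse_transform(pred))   # tuples of aspect labels, one per review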
Example no. 24
    def fit(self, X, y):
        """Fit classifier to multi-label data

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse
            input features, can be a dense or sparse matrix of size
            :code:`(n_samples, n_features)`
        y : numpy.ndarray or scipy.sparse {0,1}
            binary indicator matrix with label assignments, shape
            :code:`(n_samples, n_labels)`

        Returns
        -------
        fitted instance of self
        """
        self._label_count = y.shape[1]
        self.model_count_ = int(np.ceil(self._label_count /
                                        self.labelset_size))
        self.classifier_ = LabelSpacePartitioningClassifier(
            classifier=LabelPowerset(
                classifier=self.base_classifier,
                require_dense=self.base_classifier_require_dense),
            clusterer=GreedyLabelSpaceClusterer(
                cluster_size=self.labelset_size,
                cluster_count=self.model_count_,
                allow_overlap=False),
            require_dense=[False, False])
        return self.classifier_.fit(X, y)
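This fit is essentially skmultilearn's RakelD: partition the label space into ceil(label_count / labelset_size) disjoint clusters and train one Label Powerset per cluster. A usage sketch of the public class, with assumed toy parameters:

from sklearn.datasets import make_multilabel_classification
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelD

X, Y = make_multilabel_classification(n_samples=100, n_classes=10,
                                      random_state=0)
clf = RakelD(base_classifier=GaussianNB(), labelset_size=3)
clf.fit(X, Y)               # ceil(10 / 3) = 4 Label Powerset partitions
Y_pred = clf.predict(X)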
Example no. 25
    def __init__(
            self,
            rdm_state=84,
            params=None,
            niterations=5):
        # avoid a mutable default argument for the parameter grid
        if params is None:
            params = {
                "classifier__C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
            }
        self.model = LabelPowerset(LogisticRegression(random_state=rdm_state))
        self.params = params
        self.niterations = niterations
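The classifier__C keys above address the wrapped LogisticRegression through LabelPowerset's nested-parameter interface, which is what lets scikit-learn's search utilities tune the base estimator. A sketch of the grid search this __init__ appears to prepare (the dataset and grid values are assumptions):

from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from skmultilearn.problem_transform import LabelPowerset

X, Y = make_multilabel_classification(n_samples=120, n_classes=4,
                                      random_state=84)
search = GridSearchCV(
    LabelPowerset(LogisticRegression(random_state=84, max_iter=1000)),
    param_grid={'classifier__C': [0.1, 1.0, 10.0]},
    scoring='f1_micro', cv=3)
search.fit(X, Y)
print(search.best_params_)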
Example no. 26
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    # assumed imports, following the function-local import style of the original
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.model_selection import StratifiedShuffleSplit
    from skmultilearn.problem_transform import LabelPowerset

    micro = []
    macro = []
    sss = StratifiedShuffleSplit(
        n_splits=number_shuffles,
        test_size=1 - train_perc)
    for train_index, test_index in sss.split(embedding, labels):
        X_train, X_test = embedding[train_index], embedding[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf = LabelPowerset(LogisticRegression())
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        micro.append(f1_score(y_test, preds, average='micro'))
        macro.append(f1_score(y_test, preds, average='macro'))
    return (micro, macro)
Example no. 27
def LabelPowerset_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> Label Powerset method
    :param X_train: input data
    :param y_train: corresponding label data
    :return: the fitted classifier, or None on error
    """
    try:
        classifier = LabelPowerset(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        classifier.fit(X_train, y_train)
        return classifier
    except Exception as e:
        print("warning----LabelPowerset|LabelPowerset_method----" + str(e))

    return None
Example no. 28
    def make_use_tfidf_with_results(self):
        x_all = self.__vectors_provider.get_tfidf_vectors()
        y_all = self.__data_source.get_y_multi_label()

        model1 = LabelPowerset(
            LogisticRegression(C=1.0, solver='sag', n_jobs=-1))
        Evaluator.multi_label_predict_proba_tfidf(
            model1, x_all, y_all, data_source=self.__data_source)
Example no. 29
def runSet(model, x, y):
    mse = []
    accuracy = []
    # splitNo is assumed to be a module-level constant in the source project
    kf = KFold(n_splits=splitNo)
    for train, test in kf.split(x):
        classifier = LabelPowerset(model)
        classifier.fit(x[train], y[train])
        predictions = classifier.predict(x[test])
        accuracy.append(accuracy_score(y[test], predictions))
        mse.append(mean_squared_error(y[test], predictions.toarray()))
    mse = np.array(mse)
    accuracy = np.array(accuracy)

    mse = np.mean(mse)
    accuracy = np.mean(accuracy)

    return accuracy, mse
Example no. 30
def get_train_test_lda(topic):
    model = VGG16(include_top=False, pooling='avg')

    x_train, y_train, x_test, y_test = load()

    x_train = x_train.astype('float32')
    x_train /= 255

    y_train = y_train.astype('int64')

    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')

    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)

    for k in topic:
        X_iter = X_train

        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)          # topic model over the label matrix
        x2 = model_label.doc_topic_       # document-topic distribution

        x = discretization_doc_topic(x2)  # helper defined elsewhere in the source
        X_train = np.hstack((X_train, x))

        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)

        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))

    # the -28 slice keeps only the topic columns appended in the loop above
    # (hard-coded to the authors' topic configuration)
    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)