def build_model(category_names):
    """
    Builds a multilabel classifier using X and y from load_data()

    Args:
        None
    Returns:
        cv: (sklearn.model_selection.GridSearchCV) estimator containing the model and the parameters to be trained
    """
    try:
        # initialise columns to be used for data preparation purposes in the model pipeline
        message_col = 0

        # build a pipeline containing the feature transformations and estimator
        pipeline = Pipeline([
            (
                'features',
                ColumnTransformer(
                    [
                        # apply message transformations
                        ('text_pipeline',
                         Pipeline(
                             [('vect',
                               CountVectorizer(tokenizer=partial(tokenize))),
                              ('tfidf', TfidfTransformer())]), message_col),
                        ('starting_verb', StartingVerbExtractor(),
                         message_col),
                        ('category_terms',
                         CategoryTermExtractor(category_names=category_names),
                         message_col),
                    ],
                    remainder='drop')),

            # specify the estimator
            ('clf', LabelPowerset(MultinomialNB(fit_prior=True)))
        ])

        # parameter grid to be used for grid search
        parameters = {
            'features__text_pipeline__vect__max_features': [10000],
            'features__text_pipeline__tfidf__sublinear_tf': [True],
            'features__text_pipeline__vect__ngram_range': [(1, 1), (1, 2)],
            'features__text_pipeline__vect__min_df': [1],
            'features__text_pipeline__vect__max_df': [.95],
            'features__text_pipeline__tfidf__smooth_idf': [True],
            'features__text_pipeline__tfidf__norm': ['l2'],
            'clf__classifier__alpha': [0.01, 1.]
        }

        # perform cross validation using grid search on the pipeline described above
        cv = GridSearchCV(pipeline, param_grid=parameters, cv=5, verbose=2)
        return cv
    except Exception as e:
        raise Exception("Could not build model.") from e
    def powerset(self):
        """Fit LabelPowerset(LogisticRegression()) on the training split and report accuracy and micro F1 on the test split."""
        classifier = LabelPowerset(LogisticRegression())
        classifier.fit(self.x_data, self.y_data)

        predictions = classifier.predict(self.x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
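
# A hedged usage sketch for build_model() above, assuming the project's
# load_data() helper (referenced in the docstring) returns the message
# column X, the label indicator matrix y and the category names; the names
# X, y and category_names here are illustrative, not part of the original module.
X, y, category_names = load_data()
model = build_model(category_names)      # GridSearchCV wrapping the pipeline
model.fit(X, y)                          # runs 5-fold CV over the parameter grid
print(model.best_params_, model.best_score_)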
def naive_bayes_classifier(train_x, train_y):
    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import LabelPowerset
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    classifier = LabelPowerset(GaussianNB())
    #    classifier = ClassifierChain(GaussianNB())
    #    classifier = BinaryRelevance(GaussianNB())
    classifier.fit(train_x, train_y)

    return classifier
def buildLBClassifier(xTrain, yTrain):
    # initialize Label Powerset multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = LabelPowerset(GaussianNB())

    # train
    xTrain = np.ascontiguousarray(xTrain)
    yTrain = np.ascontiguousarray(yTrain)
    classifier.fit(xTrain, yTrain)

    return classifier
 def __init__(
         self,
         random_state=84,
         params={
             'classifier__C': [1, 10, 100, 1000],
             'classifier__gamma': [0.001, 0.0001],
             'classifier__kernel': ['rbf', 'linear']
         },
         niterations=10):
     self.model = LabelPowerset(SVC(random_state=random_state))
     self.params = params
     self.niterations = niterations
def train_test_svm(dataset, return_predictions=True):
    train_in, train_out, test_in, test_out = dataset

    classifier = LabelPowerset(LinearSVC())
    time_func(classifier.fit)(train_in, train_out)

    acc, predictions = time_func(validate)(classifier, test_in, test_out,
                                           return_predictions)

    if return_predictions:
        return acc, predictions
    else:
        return acc
    def resampling_data(self, X, y):

        # Import a dataset with X and multi-label y
        lp = LabelPowerset()
        ros = RandomOverSampler(random_state=42)

        # Applies the above stated multi-label (ML) to multi-class (MC) transformation.
        yt = lp.transform(y)
        X_resampled, y_resampled = ros.fit_resample(X, yt)
        # Inverts the ML-MC transformation to recreate the ML set
        y_resampled = lp.inverse_transform(y_resampled)

        return X_resampled, y_resampled
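
# A self-contained sketch of the multi-label -> multi-class -> multi-label
# resampling trick used in resampling_data() above, on a small synthetic
# dataset (all names below are illustrative only).
import numpy as np
from sklearn.datasets import make_multilabel_classification
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import RandomOverSampler

X_demo, y_demo = make_multilabel_classification(n_samples=200, n_classes=4,
                                                allow_unlabeled=False,
                                                random_state=42)

lp = LabelPowerset()
yt = lp.transform(y_demo)                      # each label combination becomes one class
ros = RandomOverSampler(random_state=42)
X_res, yt_res = ros.fit_resample(X_demo, yt)   # balance the combination classes
y_res = lp.inverse_transform(yt_res)           # back to a (sparse) multi-label matrix
print(X_res.shape, y_res.shape)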
 def __init__(
         self,
         random_state=84,
         n_estimators=20,
         params={
             'classifier__n_estimators': [250, 500, 1000, 1500],
             'classifier__min_samples_split': [2, 4, 8]
         },
         niterations=10):
     self.model = LabelPowerset(
         ExtraTreesClassifier(random_state=random_state,
                              n_estimators=n_estimators))
     self.params = params
     self.niterations = niterations
def build_Mklnn(X_train, y_train):

    parameters = {
        'classifier': [LabelPowerset(), ClassifierChain()],
        'classifier__classifier': [RandomForestClassifier()],
        'classifier__classifier__n_estimators': [10, 20, 50],
    }

    clf = GridSearchCV(LabelSpacePartitioningClassifier(),
                       parameters,
                       scoring='f1_macro')
    clf.fit(X_train, y_train)

    print(clf.best_params_, clf.best_score_)
    return clf
def classifiers(X_train, Y_train, X_test):

    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())

    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)

    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)

    return predictions1, predictions2, predictions3
 def __init__(
         self,
         random_state=84,
         n_estimators=20,
         params={
             "classifier__max_depth": [3, None],
             "classifier__max_features": [1, 3, 10],
             "classifier__min_samples_leaf": [1, 3, 10]
         },
         niterations=10):
     self.model = LabelPowerset(
         GradientBoostingClassifier(random_state=random_state,
                                    n_estimators=n_estimators))
     self.params = params
     self.niterations = niterations
 def __init__(
         self,
         random_state=84,
         params={
             'classifier__activation':
             ['identity', 'logistic', 'tanh', 'relu'],
             'classifier__solver': ['lbfgs', 'sgd', 'adam'],
             'classifier__alpha': sp_randint(0.0001, 1),
             'classifier__learning_rate':
             ['constant', 'invscaling', 'adaptive'],
             'classifier__momentum': [0.9, 0.95, 0.99]
         },
         niterations=5):
     self.model = LabelPowerset(MLPClassifier(random_state=random_state))
     self.params = params
     self.niterations = niterations
 def __init__(
         self,
         random_state=84,
         n_estimators=20,
         params={
             "classifier__max_depth": [3, None],
             "classifier__max_features": [1, 3, 10],
             "classifier__min_samples_leaf": [1, 3, 10],
             "classifier__bootstrap": [True, False],
             "classifier__criterion": ["gini", "entropy"]
         },
         niterations=10):
     self.model = LabelPowerset(
         RandomForestClassifier(random_state=random_state,
                                n_estimators=n_estimators))
     self.params = params
     self.niterations = niterations
def runSet(model, x, y):
    mse = []
    accuracy = []
    kf = KFold(n_splits=splitNo)
    for train, test in kf.split(x):
        classifier = LabelPowerset(model)
        classifier.fit(x[train], y[train])
        predictions = classifier.predict(x[test])
        accuracy.append(accuracy_score(y[test], predictions))
        mse.append(mean_squared_error(y[test], predictions.toarray()))
    mse = np.array(mse)
    accuracy = np.array(accuracy)

    mse = np.mean(mse)
    accuracy = np.mean(accuracy)

    return accuracy, mse
def LabelPowerset_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> Label Powerset method
    :param X_train: input data
    :param y_train: corresponding label data
    :param samples_leaf: minimum samples per leaf for the decision tree
    :param samples_split: minimum samples required to split a node
    :return: the fitted classifier, or None on failure
    """
    try:
        classifier = LabelPowerset(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        classifier.fit(X_train, y_train)
        return classifier
    except Exception as e:
        print("warning----Label Powerset|LabelPowerset_method----" + str(e))

    return None
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    from skmultilearn.problem_transform import LabelPowerset

    micro = []
    macro = []
    sss = StratifiedShuffleSplit(
        n_splits=number_shuffles,
        test_size=1 - train_perc)
    for train_index, test_index in sss.split(embedding, labels):
        X_train, X_test = embedding[train_index], embedding[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf = LabelPowerset(LogisticRegression())
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        micro.append(f1_score(y_test, preds, average='micro'))
        macro.append(f1_score(y_test, preds, average='macro'))
    return (micro, macro)
    def test_model_selection_works(self):
        x, y = make_multilabel_classification(sparse=True, n_labels=5,
                                              return_indicator='sparse', allow_unlabeled=False)

        parameters = {
            'labelset_size': list(range(2, 3)),
            'classifier': [LabelPowerset(), BinaryRelevance()],
            'classifier__classifier': [MultinomialNB()],
            'classifier__classifier__alpha': [0.7, 1.0],
        }

        clf = GridSearchCV(RakelD(), parameters, scoring='f1_macro')
        clf.fit(x, y)

        for p in list(parameters.keys()):
            self.assertIn(p, clf.best_params_)

        self.assertIsNotNone(clf.best_score_)
def get_train_test_lda(topic):
    model = VGG16(include_top=False, pooling='avg')

    x_train, y_train, x_test, y_test = load()

    x_train = x_train.astype('float32')
    x_train /= 255

    y_train = y_train.astype('int64')

    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')

    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    # X_train = model.predict(x_train)
    # X_test = model.predict(x_test)

    for k in topic:
        X_iter = X_train

        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic

        x = x2
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))

        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)

        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))

    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)
def load_ucmerced_dataset():

    dataset = scipy.io.loadmat(
        '/home/rishabh/Downloads/multi-label-analysis-master/src/graphcnn/setup/dataset/dataset1.mat'
    )
    dataset = dataset['dataset1']
    edges = np.squeeze(dataset['edges'])  # adjacency matrix
    index = np.squeeze(dataset['index'])  # image index to keep track
    classes = np.squeeze(dataset['class'])  # image class number to keep track

    #loading features in which NaN values have been replaced
    features = scipy.io.loadmat(
        '/home/rishabh/Downloads/multi-label-analysis-master/src/graphcnn/setup/dataset/features.mat'
    )
    features = features['features']
    features = features['val']
    features = features[0]
    for i in range(0, len(features)):
        if np.isnan(features[i]).any():
            print('features %d have NaN:' % i, np.isnan(features[i]).any())

    # loading multi-labels
    labels = scipy.io.loadmat(
        '/home/rishabh/Downloads/multi-label-analysis-master/src/graphcnn/setup/dataset/LandUse_multilabels.mat'
    )
    labels = labels['labels']
    labels = np.transpose(labels, (1, 0))

    # Calculating class weights
    lp = LabelPowerset()
    trans_labels = lp.transform(labels)
    unique, counts = np.unique(trans_labels, return_counts=True)
    class_freq = 1.0 / counts
    weight_mat = np.zeros((np.shape(trans_labels)))
    for i in range(len(weight_mat)):
        weight_mat[i] = class_freq[np.where(trans_labels[i] == unique)]

    # Calculating label weights
    sum_labels = np.sum(labels, axis=0, dtype=np.float32)
    sum_tot = np.sum(sum_labels, dtype=np.float32)
    label_freq = np.true_divide(sum_labels, sum_tot)

    return features, edges, labels, weight_mat, label_freq, index, classes
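
# An illustrative sketch of the per-sample weight computation in
# load_ucmerced_dataset() above: each unique label combination (via the
# LabelPowerset transform) gets weight 1 / its frequency. The toy label
# matrix below is made up for demonstration.
import numpy as np
from skmultilearn.problem_transform import LabelPowerset

toy_labels = np.array([[1, 0, 1],
                       [1, 0, 1],
                       [0, 1, 0],
                       [1, 1, 0]])

lp = LabelPowerset()
trans = lp.transform(toy_labels)                     # one class id per label combination
unique, counts = np.unique(trans, return_counts=True)
class_freq = 1.0 / counts
weights = np.zeros(np.shape(trans))
for i in range(len(weights)):
    weights[i] = class_freq[np.where(trans[i] == unique)]
print(weights)   # the duplicated combination gets the smallest weight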
def multiple_smote(X, y):
    """
    为multi-label样本过采样
    """

    # Import a dataset with X and multi-label y
    y = np.array(y)
    lp = LabelPowerset()
    # oversampler = ADASYN(random_state=1994, n_neighbors=2)
    oversampler = SMOTE(k_neighbors=2)

    # Applies the above stated multi-label (ML) to multi-class (MC) transformation.
    yt = lp.transform(y)

    X_resampled, y_resampled = oversampler.fit_resample(X, yt)

    # Inverts the ML-MC transformation to recreate the ML set
    y_resampled = lp.inverse_transform(y_resampled) # return a sparse matrix

    return X_resampled, y_resampled.toarray()
 def label_fscore(self, subset=False):
     if subset:
         true_list, pred_list = self.true_list, self.pred_list
         if self._is_binarized:
             # transform multilabel to multiclass for subset measurement
             lp = LabelPowerset()
             transformed = lp.transform(np.concatenate((true_list,
                                                        pred_list)))
             true_list, pred_list = np.split(transformed, 2)
     else:
         true_list, pred_list = self._binarized_labels()
     prec, rec, fscore, count = skm.precision_recall_fscore_support(
         true_list, pred_list)
     fscores_dict = {}
     for c, f in zip(count, fscore):
         # label = class_by_count[c]
         # for when remove_multi_labeled is used
         if c == 0:
             continue
         fscores_dict[c] = f
     return fscores_dict
    def LabelPowerset(self):
        print("")
        print(
            "Starting LabelPowerset Classifier of skmultilearn.problem_transform..."
        )
        print("")
        start = datetime.now()

        parameters = [
            {
                'classifier': [BernoulliNB()],
                'classifier__alpha': [0.7, 1.0],
            },
            # {
            #    'classifier': [SVC()],
            #    'classifier__kernel': ['rbf', 'linear'],
            #    'classifier__C': [1, 0.8],
            #    'classifier__class_weight': ['dict', 'balanced'],
            # },
            # {
            #    'classifier': [Perceptron()],
            #    'classifier__penalty': ['l2', 'l1'],
            #    'classifier__alpha': [0.7, 1.0],
            #    'classifier__max_iter': [1000, 10000],
            # },
        ]

        grid_search_cv = GridSearchCV(LabelPowerset(),
                                      parameters,
                                      scoring='f1_macro',
                                      verbose=2,
                                      n_jobs=-1)
        grid_search_cv.fit(self.x_train, self.y_train)
        clf = grid_search_cv.best_estimator_

        print('Finished training in : ', datetime.now() - start)

        y_pred = clf.predict(self.x_test)
        return self.multilabel_evaluation(y_pred, self.y_test)
    def test_model_selection_works(self):
        for x, y in self.get_multilabel_data_for_tests('dense'):
            parameters = {
                'classifier': [LabelPowerset(),
                               BinaryRelevance()],
                'clusterer': [RandomLabelSpaceClusterer(None, None, False)],
                'clusterer__cluster_size': list(range(2, 3)),
                'clusterer__cluster_count': [3],
                'clusterer__allow_overlap': [False],
                'classifier__classifier': [MultinomialNB()],
                'classifier__classifier__alpha': [0.7, 1.0],
            }

            clf = GridSearchCV(LabelSpacePartitioningClassifier(),
                               parameters,
                               scoring='f1_macro')
            clf.fit(x, y)

            for p in list(parameters.keys()):
                self.assertIn(p, clf.best_params_)

            self.assertIsNotNone(clf.best_score_)
def aspectBasedMining(request):
  basepath = os.path.dirname(os.getcwd())
  annotated_reviews_df = pd.read_csv(os.path.join(basepath, 'aspectLabelled.csv'))
  annotated_reviews_df = annotated_reviews_df.sample(frac=1).reset_index(drop=True)
  annotated_reviews_df = annotated_reviews_df[annotated_reviews_df['Label'].notna()]
  def aspect_list(row):
    temp = str(row['Aspect'])
    temp2 = list(temp.split(",")) 
    return temp2
  annotated_reviews_df['Aspect'] = annotated_reviews_df.apply(lambda x: aspect_list(x), axis=1)
  nlp = spacy.load('en_core_web_lg')
  neuralcoref.add_to_pipe(nlp)
  def replace_pronouns(text):
    text = text['Description']
    doc = nlp(text)
    resolved_text = doc._.coref_resolved
    return resolved_text
  annotated_reviews_df["text_pro"] = annotated_reviews_df.apply(lambda x: replace_pronouns(x), axis=1)
  # Convert the multi-labels into arrays
  mlb = MultiLabelBinarizer()
  y = mlb.fit_transform(annotated_reviews_df.Aspect)
  X = annotated_reviews_df.text_pro

  # Split data into train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

  # save the fitted binarizer
  # This is important: it records how the multi-labels were binarized, so you need to
  # load this in the next folder in order to undo the transformation and recover the correct labels.
  filename = 'mlb.sav'
  pickle.dump(mlb, open(filename, 'wb'))
  text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
  text_clf = text_clf.fit(X_train, y_train)
  filename = 'aspect_clf.sav'
  pickle.dump(text_clf, open(filename, 'wb'))
  data = {"data": {"code": 200, "message": "Classifier Updated"}}
  return Response(data)
def problemTransformation(data):

    # Compare three problem-transformation strategies:
    # Binary Relevance, Classifier Chains, Label Powerset

    # initialize each multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    classifier.fit(X_train, y_train)  # train
    predictions = classifier.predict(X_test)  # predict
    accuracy_br = accuracy_score(y_test, predictions)

    classifier = ClassifierChain(GaussianNB())
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy_cc = accuracy_score(y_test, predictions)

    classifier = LabelPowerset(GaussianNB())
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy_lp = accuracy_score(y_test, predictions)

    return accuracy_br, accuracy_cc, accuracy_lp
 def fit(self, X, y):

     # using a gaussian naive bayes base classifier
     self.LabelPowerSetObject = LabelPowerset(GaussianNB())

     # fitting the data
     self.LabelPowerSetObject.fit(X, y)

     # transform the multi-label targets into a single multi-class target
     y_transformed = self.LabelPowerSetObject.transform(y)

     # instantiating the SelectKBest object
     self.X_new = SelectKBest(chi2, k=2)

     # perform the feature selection
     self.X_transformed = self.X_new.fit_transform(X, y_transformed)

     # save the indices of the selected attributes
     self.selected_attributes_indices = self.X_new.get_support(indices=True)

     return self
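
# A standalone sketch of the same feature-selection idea as the fit()
# method above: chi2-based SelectKBest on the LabelPowerset-transformed
# (multi-class) target. The dataset and names below are illustrative only;
# make_multilabel_classification yields non-negative counts, as chi2 requires.
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import LabelPowerset

X_fs, y_fs = make_multilabel_classification(n_samples=100, n_features=10,
                                            n_classes=3, random_state=0)

lp = LabelPowerset(GaussianNB())
lp.fit(X_fs, y_fs)
y_single = lp.transform(y_fs)              # multi-label -> single multi-class target

selector = SelectKBest(chi2, k=2)
X_selected = selector.fit_transform(X_fs, y_single)
print(selector.get_support(indices=True))  # indices of the kept features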
def cross_validation_fold(index, splits_in, splits_out):
    """
    k-fold cross-validation "fold": performs validation using exactly
    one of the splits as validation set and the rest of the dataset
    as training data.
    :param index: Index of the split to use as validation data
    :param splits_in: List of splits of the original dataset inputs
    :param splits_out: List of splits of the original dataset outputs
    :return: The accuracy score for a LinearSVC trained on all the
    splits except <index> and then validated on split <index>
    """
    validation_in = splits_in[index]
    validation_out = splits_out[index]
    cf = LabelPowerset(LinearSVC())

    # train on all splits except split <index>
    cf.fit(np.vstack(splits_in[:index] + splits_in[index + 1:]),
           sparse_vstack(splits_out[:index] + splits_out[index + 1:]))

    # validate on split <index>
    return validate(cf,
                    validation_in,
                    validation_out,
                    return_predictions=False)
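
# A hedged driver sketch for cross_validation_fold() above: split the data
# into k contiguous folds and average the per-fold accuracies. np.array_split
# and the function name cross_validate are assumptions; validate is expected
# to come from the same module as cross_validation_fold.
import numpy as np

def cross_validate(inputs, outputs, k=5):
    # split row indices into k contiguous folds
    index_splits = np.array_split(np.arange(inputs.shape[0]), k)
    splits_in = [inputs[idx] for idx in index_splits]
    splits_out = [outputs[idx] for idx in index_splits]
    scores = [cross_validation_fold(i, splits_in, splits_out) for i in range(k)]
    return float(np.mean(scores))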
# The matrices are initially in lil_matrix format
# Converting X to compressed sparse row format and y to dense format

X_train = X_train.tocsr()
y_train = y_train.todense()
X_test = X_test.tocsr()
y_test = y_test.todense()

label_set = set([0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21])
label_list = [0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21]

y_train = y_train[:, label_list]
y_test = y_test[:, label_list]

start_time = time.process_time()
# classifier = LabelPowerset(RandomForestClassifier(random_state=0, n_estimators=10, n_jobs=-1))

# classifier = RandomForestClassifier(random_state=0, n_estimators=10)
# classifier = BinaryRelevance(classifier = LinearSVC(), require_dense = [False, True])
classifier = LabelPowerset(SGDClassifier(penalty='l2', alpha=0.01))
classifier.fit(X_train, y_train)
y_predicted = classifier.predict(X_test)
total_time = time.process_time() - start_time

print("Total time taken is : " + str(total_time))

print("Jaccard Similarity Score is : " +
      str(jaccard_similarity_score(y_test, y_predicted)))
print("Hamming Loss is : " + str(hamming_loss(y_test, y_predicted)))
# print("F1_Similarity score is : "+str(f1_score(y_test,y_predicted,average='macro')))
    elif li==4:
        index_4.append(i)
    elif li==5:
        index_5.append(i)
print(len(index_0))
print(len(index_1))
print(len(index_2))
print(len(index_3))
print(len(index_4))
print(len(index_5))
index=index_5

y1=y1.toarray()
y=y1[:,index_label]
y=csc_matrix(y)
classifier= LabelPowerset(MultinomialNB(), require_dense=[True, True])
clf=RakelO(classifier,labelset_size=6,model_count=800)
clf.fit(X[index[100:]], y[index[100:]])
# joblib.dump(clf, 'F:/medical_result/cure_after/filename.pkl')
# clf = joblib.load('F:/medical_result/cure_after/filename.pkl')
# print("X[0]:", X[0])
predictions = clf.predict(X[index[:100]])
# compare the predicted values with the true values
pre=predictions.tocsr()
for i in range(100):
    print("--------------------------")
    print(pre[i])
    print()
    print(y[index_0[i]])
    print("--------------------------")
# train for Classifier Chains
classifier_cc.fit(X_train, y_train)

# predict for Classifier Chains
predictions_cc = classifier_cc.predict(X_test)

#Hamming Loss for Classifier Chains
hamm_loss_cc = hamming_loss(y_test, predictions_cc)

print("Hamming Loss:", hamm_loss_cc)

print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes")

#initialize Label Powerset multi-label classifier
#with a gaussian naive bayes base classifier
classifier_lp = LabelPowerset(GaussianNB())

# train for Label Powerset
classifier_lp.fit(X_train, y_train)

# predict for Label Powerset
predictions_lp = classifier_lp.predict(X_test)

#Hamming Loss for Label Powerset
hamm_loss_lp = hamming_loss(y_test, predictions_lp)

print("Hamming Loss:", hamm_loss_lp)

print("\n\n\nAll hamming loss:")
print("Binary Relevance:\n", hamm_loss_binary)
print("Classifier Chains:\n", hamm_loss_cc)