def build_model(category_names):
    """Build the grid-search estimator for the multilabel classifier.

    The pipeline runs three feature extractors on input column 0 inside a
    ``ColumnTransformer`` (all other columns are dropped) and passes the
    result to a ``LabelPowerset``-wrapped ``MultinomialNB``.

    Args:
        category_names: Forwarded to ``CategoryTermExtractor``.

    Returns:
        sklearn.model_selection.GridSearchCV: Unfitted 5-fold grid search
        over the pipeline and the parameter grid defined below.

    Raises:
        Exception: If the pipeline or grid search cannot be constructed.
            The underlying error is chained as ``__cause__``.
    """
    try:
        # Column selector handed to every transformer below.
        message_col = 0

        text_pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=partial(tokenize))),
            ('tfidf', TfidfTransformer()),
        ])

        pipeline = Pipeline([
            (
                'features',
                ColumnTransformer(
                    [
                        ('text_pipeline', text_pipeline, message_col),
                        ('starting_verb', StartingVerbExtractor(),
                         message_col),
                        ('category_terms',
                         CategoryTermExtractor(category_names=category_names),
                         message_col),
                    ],
                    remainder='drop',
                ),
            ),
            ('clf', LabelPowerset(MultinomialNB(fit_prior=True))),
        ])

        # Grid explored by the search; only ngram_range and alpha vary.
        parameters = {
            'features__text_pipeline__vect__max_features': [10000],
            'features__text_pipeline__tfidf__sublinear_tf': [True],
            'features__text_pipeline__vect__ngram_range': [(1, 1), (1, 2)],
            'features__text_pipeline__vect__min_df': [1],
            'features__text_pipeline__vect__max_df': [.95],
            'features__text_pipeline__tfidf__smooth_idf': [True],
            'features__text_pipeline__tfidf__norm': ['l2'],
            'clf__classifier__alpha': [0.01, 1.],
        }

        cv = GridSearchCV(pipeline, param_grid=parameters, cv=5, verbose=2)
        return cv
    except Exception as err:
        # Same exception type and message as before, but the root cause is
        # chained and KeyboardInterrupt/SystemExit are no longer intercepted.
        raise Exception("Could not build model.") from err
def powerset(self):
    """Fit a label-powerset logistic regression and score it.

    Trains on ``self.x_data`` / ``self.y_data`` and predicts ``self.x_test``.

    Returns:
        dict: ``'accuracy'`` from ``accuracy_score`` and ``'f1_score'``
        (micro-averaged) against ``self.y_test``.
    """
    model = LabelPowerset(LogisticRegression())
    model.fit(self.x_data, self.y_data)
    predicted = model.predict(self.x_test)

    accuracy = accuracy_score(self.y_test, predicted)
    micro_f1 = f1_score(self.y_test, predicted, average='micro')
    return {'accuracy': accuracy, 'f1_score': micro_f1}
def naive_bayes_classifier(train_x, train_y):
    """Train a label-powerset Gaussian naive Bayes classifier.

    Args:
        train_x: Training features passed to ``fit``.
        train_y: Multi-label targets passed to ``fit``.

    Returns:
        The fitted ``LabelPowerset(GaussianNB())`` instance.
    """
    # Function-scope imports, as in the original; only the names that are
    # actually used are imported.
    from skmultilearn.problem_transform import LabelPowerset
    from sklearn.naive_bayes import GaussianNB

    classifier = LabelPowerset(GaussianNB())
    classifier.fit(train_x, train_y)
    return classifier
def buildLBClassifier(xTrain, yTrain):
    """Fit a label-powerset classifier backed by Gaussian naive Bayes.

    Both inputs are passed through ``np.ascontiguousarray`` before fitting.

    Returns:
        The fitted ``LabelPowerset`` instance.
    """
    features = np.ascontiguousarray(xTrain)
    targets = np.ascontiguousarray(yTrain)

    model = LabelPowerset(GaussianNB())
    model.fit(features, targets)
    return model
def __init__(self, random_state=84, params=None, niterations=10):
    """Set up a label-powerset SVC and its hyper-parameter search space.

    Args:
        random_state: Seed forwarded to ``SVC``.
        params: Search space stored on ``self.params``. When omitted, a
            fresh default grid over ``C``, ``gamma`` and ``kernel`` is built
            for this instance.
        niterations: Stored on ``self.niterations``; not used here
            (presumably the number of search iterations, to be confirmed).
    """
    if params is None:
        # Built per call: a dict literal in the signature would be a single
        # object shared (and mutable) across every instance.
        params = {
            'classifier__C': [1, 10, 100, 1000],
            'classifier__gamma': [0.001, 0.0001],
            'classifier__kernel': ['rbf', 'linear'],
        }
    self.model = LabelPowerset(SVC(random_state=random_state))
    self.params = params
    self.niterations = niterations
def train_test_svm(dataset, return_predictions=True):
    """Train a label-powerset LinearSVC and validate it.

    Args:
        dataset: 4-tuple unpacked as
            ``(train_in, train_out, test_in, test_out)``.
        return_predictions: Forwarded to ``validate`` and also selects the
            return shape.

    Returns:
        ``(acc, predictions)`` when ``return_predictions`` is truthy,
        otherwise ``acc`` alone.
    """
    train_in, train_out, test_in, test_out = dataset
    model = LabelPowerset(LinearSVC())

    # Both steps are wrapped by time_func, exactly as before.
    timed_fit = time_func(model.fit)
    timed_fit(train_in, train_out)

    timed_validate = time_func(validate)
    acc, predictions = timed_validate(model, test_in, test_out,
                                      return_predictions)

    if not return_predictions:
        return acc
    return acc, predictions
def resampling_data(self, X, y):
    """Randomly oversample a multi-label dataset.

    The multi-label targets are mapped to multi-class with
    ``LabelPowerset.transform``, oversampled with ``RandomOverSampler``
    (``random_state=42``), and mapped back with ``inverse_transform``.

    Args:
        X: Feature matrix.
        y: Multi-label target matrix.

    Returns:
        tuple: ``(X_resampled, y_resampled)``, where ``y_resampled`` is
        whatever ``LabelPowerset.inverse_transform`` returns.
    """
    lp = LabelPowerset()
    ros = RandomOverSampler(random_state=42)

    # Multi-label (ML) -> multi-class (MC) transformation.
    yt = lp.transform(y)

    # ``fit_sample`` was deprecated and later removed from imbalanced-learn;
    # prefer ``fit_resample`` and fall back only on old releases.
    resample = getattr(ros, 'fit_resample', None)
    if resample is None:
        resample = ros.fit_sample
    X_resampled, yt_resampled = resample(X, yt)

    # Invert the ML -> MC transformation to recreate the multi-label set.
    y_resampled = lp.inverse_transform(yt_resampled)
    return X_resampled, y_resampled
def __init__(self, random_state=84, n_estimators=20, params=None,
             niterations=10):
    """Set up a label-powerset extra-trees model and its search space.

    Args:
        random_state: Seed forwarded to ``ExtraTreesClassifier``.
        n_estimators: Initial number of trees for the base estimator.
        params: Search space stored on ``self.params``. When omitted, a
            fresh default grid over ``n_estimators`` and
            ``min_samples_split`` is built for this instance.
        niterations: Stored on ``self.niterations``; not used here
            (presumably the number of search iterations, to be confirmed).
    """
    if params is None:
        # Built per call: a dict literal in the signature would be a single
        # object shared (and mutable) across every instance.
        params = {
            'classifier__n_estimators': [250, 500, 1000, 1500],
            'classifier__min_samples_split': [2, 4, 8],
        }
    self.model = LabelPowerset(
        ExtraTreesClassifier(random_state=random_state,
                             n_estimators=n_estimators))
    self.params = params
    self.niterations = niterations
def build_Mklnn(X_train, y_train):
    """Grid-search a label-space-partitioning classifier and print the winner.

    The grid varies the problem-transformation wrapper (``LabelPowerset`` or
    ``ClassifierChain``) and the ``n_estimators`` of a random-forest base
    estimator, scored with ``f1_macro``. Nothing is returned; the best
    parameters and best score are printed.
    """
    search_space = {
        'classifier': [LabelPowerset(), ClassifierChain()],
        'classifier__classifier': [RandomForestClassifier()],
        'classifier__classifier__n_estimators': [10, 20, 50],
    }
    search = GridSearchCV(
        LabelSpacePartitioningClassifier(),
        search_space,
        scoring='f1_macro',
    )
    search.fit(X_train, y_train)
    print(search.best_params_, search.best_score_)
def classifiers(X_train, Y_train, X_test):
    """Fit three problem-transformation models and predict with each.

    The models are ``BinaryRelevance``, ``ClassifierChain`` and
    ``LabelPowerset``, each wrapping its own ``GaussianNB``.

    Returns:
        tuple: Predictions on ``X_test`` in the order
        (binary relevance, classifier chain, label powerset).
    """
    models = (
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    )
    # Fit everything first, then predict, as the original did.
    for model in models:
        model.fit(X_train, Y_train)
    return tuple(model.predict(X_test) for model in models)
def __init__(self, random_state=84, n_estimators=20, params=None,
             niterations=10):
    """Set up a label-powerset gradient-boosting model and its search space.

    Args:
        random_state: Seed forwarded to ``GradientBoostingClassifier``.
        n_estimators: Number of boosting stages for the base estimator.
        params: Search space stored on ``self.params``. When omitted, a
            fresh default grid over ``max_depth``, ``max_features`` and
            ``min_samples_leaf`` is built for this instance.
        niterations: Stored on ``self.niterations``; not used here
            (presumably the number of search iterations, to be confirmed).
    """
    if params is None:
        # Built per call: a dict literal in the signature would be a single
        # object shared (and mutable) across every instance.
        params = {
            "classifier__max_depth": [3, None],
            "classifier__max_features": [1, 3, 10],
            "classifier__min_samples_leaf": [1, 3, 10],
        }
    self.model = LabelPowerset(
        GradientBoostingClassifier(random_state=random_state,
                                   n_estimators=n_estimators))
    self.params = params
    self.niterations = niterations
def __init__(self, random_state=84, params=None, niterations=5):
    """Set up a label-powerset MLP and its hyper-parameter search space.

    Args:
        random_state: Seed forwarded to ``MLPClassifier``.
        params: Search space stored on ``self.params``. When omitted, a
            fresh default space is built for this instance (see below).
        niterations: Stored on ``self.niterations``; not used here
            (presumably the number of search iterations, to be confirmed).
    """
    if params is None:
        # Function-scope import: the default space is now built in the body,
        # so its distribution no longer has to exist at definition time.
        from scipy.stats import uniform as sp_uniform

        # Built per call: a dict literal in the signature would be a single
        # object shared (and mutable) across every instance.
        params = {
            'classifier__activation': ['identity', 'logistic', 'tanh',
                                       'relu'],
            'classifier__solver': ['lbfgs', 'sgd', 'adam'],
            # ``alpha`` is a continuous regularisation strength. The former
            # ``randint(0.0001, 1)`` is an integer distribution and cannot
            # sample the open range between its bounds, so draw uniformly
            # from [0.0001, 1] instead.
            'classifier__alpha': sp_uniform(loc=0.0001, scale=1 - 0.0001),
            'classifier__learning_rate': ['constant', 'invscaling',
                                          'adaptive'],
            'classifier__momentum': [0.9, 0.95, 0.99],
        }
    self.model = LabelPowerset(MLPClassifier(random_state=random_state))
    self.params = params
    self.niterations = niterations
def __init__(self, random_state=84, n_estimators=20, params=None,
             niterations=10):
    """Set up a label-powerset random forest and its search space.

    Args:
        random_state: Seed forwarded to ``RandomForestClassifier``.
        n_estimators: Number of trees for the base estimator.
        params: Search space stored on ``self.params``. When omitted, a
            fresh default grid over depth, feature count, leaf size,
            bootstrap and split criterion is built for this instance.
        niterations: Stored on ``self.niterations``; not used here
            (presumably the number of search iterations, to be confirmed).
    """
    if params is None:
        # Built per call: a dict literal in the signature would be a single
        # object shared (and mutable) across every instance.
        params = {
            "classifier__max_depth": [3, None],
            "classifier__max_features": [1, 3, 10],
            "classifier__min_samples_leaf": [1, 3, 10],
            "classifier__bootstrap": [True, False],
            "classifier__criterion": ["gini", "entropy"],
        }
    self.model = LabelPowerset(
        RandomForestClassifier(random_state=random_state,
                               n_estimators=n_estimators))
    self.params = params
    self.niterations = niterations
def runSet(model, x, y):
    """Cross-validate ``model`` wrapped in ``LabelPowerset``.

    Uses ``KFold`` with ``splitNo`` splits (a name resolved outside this
    function). For every fold the wrapped model is fitted on the training
    indices, and accuracy and mean squared error are computed on the
    held-out indices.

    Returns:
        tuple: ``(mean accuracy, mean MSE)`` over all folds.
    """
    fold_accuracy = []
    fold_mse = []
    folds = KFold(n_splits=splitNo)
    for train_idx, test_idx in folds.split(x):
        wrapped = LabelPowerset(model)
        wrapped.fit(x[train_idx], y[train_idx])
        predicted = wrapped.predict(x[test_idx])

        expected = y[test_idx]
        fold_accuracy.append(accuracy_score(expected, predicted))
        # The MSE call is given a dense array, hence toarray().
        fold_mse.append(mean_squared_error(expected, predicted.toarray()))

    mean_mse = np.mean(np.array(fold_mse))
    mean_accuracy = np.mean(np.array(fold_accuracy))
    return mean_accuracy, mean_mse
def LabelPowerset_method(X_train, y_train, samples_leaf, samples_split):
    """Problem transformation: train a label-powerset decision tree.

    :param X_train: input data
    :param y_train: corresponding label data
    :param samples_leaf: cast with ``int()`` and used as ``min_samples_leaf``
    :param samples_split: cast with ``int()`` and used as
        ``min_samples_split``
    :return: the fitted classifier, or ``None`` when building or fitting
        raised (a warning is printed in that case)
    """
    try:
        tree = DecisionTreeClassifier(
            min_samples_leaf=int(samples_leaf),
            min_samples_split=int(samples_split),
        )
        model = LabelPowerset(tree)
        model.fit(X_train, y_train)
    except Exception as err:
        # Best-effort by design: report the problem and signal with None.
        print("warning----标签Powerset|LabelPowerset_method----" + str(err))
        return None
    return model
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    """Score an embedding with repeated shuffle splits.

    For each of ``number_shuffles`` ``StratifiedShuffleSplit`` splits
    (``test_size = 1 - train_perc``), a label-powerset logistic regression
    is fitted on the training part and its micro and macro F1 are measured
    on the test part.

    Returns:
        tuple: ``(micro, macro)``, two lists with one F1 value per split.
    """
    from skmultilearn.problem_transform import LabelPowerset

    splitter = StratifiedShuffleSplit(
        n_splits=number_shuffles,
        test_size=1 - train_perc,
    )

    micro, macro = [], []
    for train_index, test_index in splitter.split(embedding, labels):
        model = LabelPowerset(LogisticRegression())
        model.fit(embedding[train_index], labels[train_index])

        expected = labels[test_index]
        predicted = model.predict(embedding[test_index])
        micro.append(f1_score(expected, predicted, average='micro'))
        macro.append(f1_score(expected, predicted, average='macro'))
    return (micro, macro)
def test_model_selection_works(self):
    """RakelD can be tuned with GridSearchCV.

    Every searched parameter must appear in ``best_params_`` and a best
    score must have been recorded.
    """
    x, y = make_multilabel_classification(
        sparse=True,
        n_labels=5,
        return_indicator='sparse',
        allow_unlabeled=False,
    )

    parameters = {
        'labelset_size': list(range(2, 3)),
        'classifier': [LabelPowerset(), BinaryRelevance()],
        'classifier__classifier': [MultinomialNB()],
        'classifier__classifier__alpha': [0.7, 1.0],
    }

    search = GridSearchCV(RakelD(), parameters, scoring='f1_macro')
    search.fit(x, y)

    for name in parameters:
        self.assertIn(name, search.best_params_)
    self.assertIsNotNone(search.best_score_)
def get_train_test_lda(topic):
    """Build LDA-topic features for the train and test sets.

    Images returned by ``load()`` are scaled by 1/255 and embedded with a
    headless, average-pooled ``VGG16``. Then, for every topic count ``k`` in
    ``topic``:

    * an LDA model with ``k`` topics is fitted on ``y_train`` and its
      document-topic matrix is passed through ``discretization_doc_topic``;
    * the result is appended as extra columns to ``X_train``;
    * a ``LabelPowerset(RandomForestClassifier())`` is trained to predict
      those columns from the features as they were at the start of the
      iteration, and its predictions for ``X_test`` are appended to
      ``X_test``.

    Args:
        topic: Iterable of topic counts, one LDA model per entry.

    Returns:
        tuple: ``(X_train[:, -28:], y_train, X_test[:, -28:], y_test)`` as
        NumPy arrays.
    """
    # Feature extractor: VGG16 without the classification head.
    model = VGG16(include_top=False, pooling='avg')
    x_train, y_train, x_test, y_test = load()
    x_train = x_train.astype('float32')
    x_train /= 255
    # NOTE(review): y_train is cast to int64 but y_test to float32;
    # presumably because lda.LDA needs integer counts - confirm.
    y_train = y_train.astype('int64')
    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')
    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    for k in topic:
        # Snapshot of the training features BEFORE this iteration's topic
        # columns are appended; used to train the predictor below.
        X_iter = X_train
        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic
        x = x2
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))
        # Learn to predict the discretized topic columns from the features,
        # then apply that predictor to the test set.
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)
        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        X_test = np.hstack((X_test, x))
    # NOTE(review): only the last 28 columns are kept; presumably the total
    # number of appended topic columns - confirm against the callers.
    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)
def load_ucmerced_dataset():
    """Load graph data, features and multi-labels from three .mat files.

    Besides the raw arrays, two weightings are derived from the labels:
    a per-sample weight equal to the inverse frequency of the sample's
    label combination (via ``LabelPowerset.transform``), and a per-label
    frequency normalised by the total number of label assignments.

    Returns:
        tuple: ``(features, edges, labels, weight_mat, label_freq, index,
        classes)``.
    """
    graph_mat = scipy.io.loadmat(
        '/home/rishabh/Downloads/multi-label-analysis-master/src/graphcnn/setup/dataset/dataset1.mat'
    )
    dataset = graph_mat['dataset1']
    edges = np.squeeze(dataset['edges'])    # adjacency matrix
    index = np.squeeze(dataset['index'])    # image index, kept for tracking
    classes = np.squeeze(dataset['class'])  # image class number

    # Features file in which NaN values have been replaced; report any
    # entry that still contains one.
    feature_mat = scipy.io.loadmat(
        '/home/rishabh/Downloads/multi-label-analysis-master/src/graphcnn/setup/dataset/features.mat'
    )
    features = feature_mat['features']['val'][0]
    for i, feature in enumerate(features):
        has_nan = np.isnan(feature).any()
        if has_nan:
            print('features %d have NaN:' % i, has_nan)

    # Multi-labels, transposed after loading.
    label_mat = scipy.io.loadmat(
        '/home/rishabh/Downloads/multi-label-analysis-master/src/graphcnn/setup/dataset/LandUse_multilabels.mat'
    )
    labels = np.transpose(label_mat['labels'], (1, 0))

    # Class weights: inverse frequency of each label combination.
    lp = LabelPowerset()
    trans_labels = lp.transform(labels)
    unique, counts = np.unique(trans_labels, return_counts=True)
    class_freq = 1.0 / counts
    weight_mat = np.zeros(np.shape(trans_labels))
    for row, combination in enumerate(trans_labels):
        weight_mat[row] = class_freq[np.where(combination == unique)]

    # Label weights: share of each label among all label assignments.
    sum_labels = np.sum(labels, axis=0, dtype=np.float32)
    sum_tot = np.sum(sum_labels, dtype=np.float32)
    label_freq = np.true_divide(sum_labels, sum_tot)

    return features, edges, labels, weight_mat, label_freq, index, classes
def multiple_smote(X, y):
    """Oversample multi-label samples with SMOTE.

    ``y`` is converted to an array, mapped to multi-class with
    ``LabelPowerset.transform``, oversampled with ``SMOTE(k_neighbors=2)``
    and mapped back with ``inverse_transform``.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` where ``y_resampled`` is the
        result of calling ``toarray()`` on the inverse-transformed labels.
    """
    labels = np.array(y)
    powerset = LabelPowerset()
    oversampler = SMOTE(k_neighbors=2)

    # Multi-label -> multi-class, resample, then back to multi-label.
    multiclass = powerset.transform(labels)
    X_resampled, multiclass_resampled = oversampler.fit_resample(X, multiclass)
    multilabel_resampled = powerset.inverse_transform(multiclass_resampled)

    return X_resampled, multilabel_resampled.toarray()
def label_fscore(self, subset=False):
    """Return F-scores from ``precision_recall_fscore_support``.

    Args:
        subset: When true, score ``self.true_list`` / ``self.pred_list``
            directly; if ``self._is_binarized`` is set they are first
            mapped from multi-label to multi-class with ``LabelPowerset``.
            When false, the lists come from ``self._binarized_labels()``.

    Returns:
        dict: Maps each non-zero support count to its F-score.
    """
    if not subset:
        true_list, pred_list = self._binarized_labels()
    else:
        true_list, pred_list = self.true_list, self.pred_list
        if self._is_binarized:
            # Transform multilabel to multiclass for subset measurement;
            # both halves go through a single transform call.
            stacked = np.concatenate((true_list, pred_list))
            transformed = LabelPowerset().transform(stacked)
            true_list, pred_list = np.split(transformed, 2)

    _, _, fscore, count = skm.precision_recall_fscore_support(
        true_list, pred_list)

    # NOTE(review): keys are support counts, so two classes with the same
    # support overwrite each other; confirm this is intended.
    return {c: f for c, f in zip(count, fscore) if c != 0}
def LabelPowerset(self):
    """Grid-search a LabelPowerset classifier and evaluate the best model.

    The search covers a ``BernoulliNB`` base classifier with two ``alpha``
    values, scored with ``f1_macro`` on ``self.x_train`` / ``self.y_train``.

    Returns:
        Whatever ``self.multilabel_evaluation`` returns for the predictions
        on ``self.x_test`` versus ``self.y_test``.
    """
    print("")
    print(
        "Starting LabelPowerset Classifier of skmultilearn.problem_transform..."
    )
    print("")

    started_at = datetime.now()

    param_grid = [
        {
            'classifier': [BernoulliNB()],
            'classifier__alpha': [0.7, 1.0],
        },
    ]

    # Inside the body the bare name ``LabelPowerset`` is looked up in the
    # module namespace, not on the class, so it is not this method.
    search = GridSearchCV(LabelPowerset(),
                          param_grid,
                          scoring='f1_macro',
                          verbose=2,
                          n_jobs=-1)
    search.fit(self.x_train, self.y_train)
    best_model = search.best_estimator_

    print('Finished training in : ', datetime.now() - started_at)

    predicted = best_model.predict(self.x_test)
    return self.multilabel_evaluation(predicted, self.y_test)
def test_model_selection_works(self):
    """LabelSpacePartitioningClassifier can be tuned with GridSearchCV.

    Runs once per dense dataset from ``get_multilabel_data_for_tests``;
    every searched parameter must appear in ``best_params_`` and a best
    score must have been recorded.
    """
    for x, y in self.get_multilabel_data_for_tests('dense'):
        parameters = {
            'classifier': [LabelPowerset(), BinaryRelevance()],
            'clusterer': [RandomLabelSpaceClusterer(None, None, False)],
            'clusterer__cluster_size': list(range(2, 3)),
            'clusterer__cluster_count': [3],
            'clusterer__allow_overlap': [False],
            'classifier__classifier': [MultinomialNB()],
            'classifier__classifier__alpha': [0.7, 1.0],
        }

        search = GridSearchCV(
            LabelSpacePartitioningClassifier(),
            parameters,
            scoring='f1_macro',
        )
        search.fit(x, y)

        for name in parameters:
            self.assertIn(name, search.best_params_)
        self.assertIsNotNone(search.best_score_)
def aspectBasedMining(request):
    """Retrain the aspect classifier and persist it together with its binarizer.

    Reads ``aspectLabelled.csv`` from the parent of the current working
    directory, shuffles it, drops rows without a ``Label``, splits the
    comma-separated ``Aspect`` column into lists, resolves coreferences in
    ``Description`` with spaCy + neuralcoref, and trains a bag-of-words /
    TF / label-powerset naive Bayes pipeline on a 75 % split.

    Side effects:
        Writes ``mlb.sav`` (the fitted ``MultiLabelBinarizer``) and
        ``aspect_clf.sav`` (the fitted pipeline) to the current working
        directory.

    Args:
        request: Incoming request object; not read by this function.

    Returns:
        Response: Payload ``{"data": {"code": 200, "message": "Classifier
        Updated"}}``.
    """
    basepath = os.path.dirname(os.getcwd())
    annotated_reviews_df = pd.read_csv(
        os.path.join(basepath, 'aspectLabelled.csv'))
    annotated_reviews_df = annotated_reviews_df.sample(frac=1).reset_index(
        drop=True)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered selection of the shuffled one.
    annotated_reviews_df = annotated_reviews_df[
        annotated_reviews_df['Label'].notna()].copy()

    def aspect_list(row):
        # "a,b,c" -> ["a", "b", "c"]
        return str(row['Aspect']).split(",")

    annotated_reviews_df['Aspect'] = annotated_reviews_df.apply(
        aspect_list, axis=1)

    nlp = spacy.load('en_core_web_lg')
    neuralcoref.add_to_pipe(nlp)

    def replace_pronouns(row):
        # Replace pronouns by the mentions they refer to.
        doc = nlp(row['Description'])
        return doc._.coref_resolved

    annotated_reviews_df["text_pro"] = annotated_reviews_df.apply(
        replace_pronouns, axis=1)

    # Convert the multi-labels into arrays.
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(annotated_reviews_df.Aspect)
    X = annotated_reviews_df.text_pro

    # The test split is produced but not used in this function.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0)

    # Persist the fitted binarizer: it records how the multi-labels were
    # binarized and is needed later to map predictions back to labels.
    filename = 'mlb.sav'
    with open(filename, 'wb') as handle:
        pickle.dump(mlb, handle)

    text_clf = Pipeline([
        ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
    ])
    text_clf = text_clf.fit(X_train, y_train)

    # Context managers guarantee the files are flushed and closed even if
    # pickling fails part-way.
    filename = 'aspect_clf.sav'
    with open(filename, 'wb') as handle:
        pickle.dump(text_clf, handle)

    data = {"data": {"code": 200, "message": "Classifier Updated"}}
    return Response(data)
def problemTransformation(data):
    """Run binary relevance, classifier chains and label powerset in turn.

    Each strategy wraps a Gaussian naive Bayes base classifier, is fitted
    on ``X_train`` / ``y_train`` and scored with ``accuracy_score`` on
    ``X_test`` / ``y_test``. Those four names are not defined here and are
    resolved from an outer scope; ``data`` is not read. The scores are
    computed but neither stored nor returned.

    Returns:
        None
    """
    strategies = (BinaryRelevance, ClassifierChain, LabelPowerset)
    for strategy in strategies:
        model = strategy(GaussianNB())
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        accuracyScore = accuracy_score(y_test, predicted)
    return None
def fit(self, X, y):
    """Select the two best features for a multi-label target.

    A ``LabelPowerset(GaussianNB())`` is fitted on ``(X, y)`` and used to
    transform ``y``; ``SelectKBest(chi2, k=2)`` is then fitted on ``X``
    against the transformed target.

    Sets:
        LabelPowerSetObject: the fitted label-powerset model.
        X_new: the ``SelectKBest`` selector.
        X_transformed: ``X`` reduced to the selected features.
        selected_attributes_indices: indices of the selected features.

    Returns:
        self
    """
    powerset = LabelPowerset(GaussianNB())
    self.LabelPowerSetObject = powerset
    powerset.fit(X, y)
    y_multiclass = powerset.transform(y)

    selector = SelectKBest(chi2, k=2)
    self.X_new = selector
    self.X_transformed = selector.fit_transform(X, y_multiclass)
    self.selected_attributes_indices = selector.get_support(indices=True)
    return self
def cross_validation_fold(index, splits_in, splits_out):
    """Run one fold of k-fold cross-validation.

    Split ``index`` is held out for validation and a label-powerset
    ``LinearSVC`` is trained on all remaining splits.

    :param index: Index of the split to use as validation data
    :param splits_in: List of splits of the original dataset inputs
    :param splits_out: List of splits of the original dataset outputs
    :return: The result of ``validate`` (called with
        ``return_predictions=False``) for the model on split ``index``
    """
    held_out_in = splits_in[index]
    held_out_out = splits_out[index]

    # Everything except split <index>: inputs are stacked with np.vstack,
    # outputs with sparse_vstack.
    training_in = np.vstack(splits_in[:index] + splits_in[index + 1:])
    training_out = sparse_vstack(splits_out[:index] + splits_out[index + 1:])

    model = LabelPowerset(LinearSVC())
    model.fit(training_in, training_out)

    return validate(model, held_out_in, held_out_out,
                    return_predictions=False)
# The matrices are initially in lil_matrix format # Converting them to compressed row matrix format X_train = X_train.tocsr() y_train = y_train.todense() X_test = X_test.tocsr() y_test = y_test.todense() label_set = set([0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21]) label_list = [0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21] y_train = y_train[:, label_list] y_test = y_test[:, label_list] start_time = time.process_time() # classifier = LabelPowerset(RandomForestClassifier(random_state=0, n_estimators=10, n_jobs=-1)) # classifier = RandomForestClassifier(random_state=0, n_estimators=10) # classifier = BinaryRelevance(classifier = LinearSVC(), require_dense = [False, True]) classifier = LabelPowerset(SGDClassifier(penalty='l2', alpha=0.01)) classifier.fit(X_train, y_train) y_predicted = classifier.predict(X_test) total_time = time.process_time() - start_time print("Total time taken is : " + str(total_time)) print("Jaccard Similarity Score is : " + str(jaccard_similarity_score(y_test, y_predicted))) print("Hamming Loss is : " + str(hamming_loss(y_test, y_predicted))) # print("F1_Similarity score is : "+str(f1_score(y_test,y_predicted,average='macro')))
elif li==4: index_4.append(i) elif li==5: index_5.append(i) print len(index_0) print len(index_1) print len(index_2) print len(index_3) print len(index_4) print len(index_5) index=index_5 y1=y1.toarray() y=y1[:,index_label] y=csc_matrix(y) classifier= LabelPowerset(MultinomialNB(), require_dense=[True, True]) clf=RakelO(classifier,labelset_size=6,model_count=800) clf.fit(X[index[100:]], y[index[100:]]) # joblib.dump(clf, 'F:/medical_result/cure_after/filename.pkl') # clf = joblib.load('F:/medical_result/cure_after/filename.pkl') #print "X[0]:",X[0] predictions = clf.predict(X[index[:100]]) #将预测值和真实值作对比观察 pre=predictions.tocsr() for i in range(100): print "--------------------------" print pre[i] print print y[index_0[i]] print "--------------------------"
# train for Classifier Chaines classifier_cc.fit(X_train, y_train) # predict for Classifier Chains predictions_cc = classifier_cc.predict(X_test) #Hamming Loss for Classifier Chaines hamm_loss_cc = hamming_loss(y_test, predictions_cc) print("Hamming Loss:", hamm_loss_cc) print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes") #initialize Label Powerset multi-label classifier #with a gaussian naive bayes base classifier classifier_lp = LabelPowerset(GaussianNB()) # train for Label Powerset classifier_lp.fit(X_train, y_train) # predict for Label Powerset predictions_lp = classifier_lp.predict(X_test) #Hamming Loss for Label PowerSet hamm_loss_lp = hamming_loss(y_test, predictions_lp) print("Hamming Loss:", hamm_loss_lp) print("\n\n\nAll hamming loss:") print("Binary Relevance:\n", hamm_loss_binary) print("Classifier Chains:\n", hamm_loss_cc)