def train_linearsvc(X_train, Y_train):
    """Fit a binary-relevance multi-label model with LinearSVC bases.

    One LinearSVC (C=0.5, tol=0.2) is trained per label column of Y_train.
    Returns the fitted BinaryRelevance wrapper.
    """
    br_model = BinaryRelevance(classifier=LinearSVC(C=0.5, tol=0.2))
    br_model.fit(X_train, Y_train)
    return br_model
def __init__(self, task, modelName):
    """Select and instantiate the underlying estimator.

    Args:
        task: 0 selects a classification model; any other value selects
            a regression model.
        modelName: short name of the estimator to build ("NB", "KNN",
            "SVM", "C45", "MultiLabelNB", "MultiLabelSVM" for
            classification; "LR", "M5", "KNN" for regression).
    """
    self.task = task
    # create the model
    if task == 0:
        if modelName == "NB":
            self.model = GaussianNB()
        elif modelName == "KNN":
            self.model = KNeighborsClassifier()
        elif modelName == "SVM":
            self.model = SVC()
        elif modelName == "C45":
            # BUG FIX: instantiate the estimator — the original assigned
            # the class object itself (missing parentheses).
            self.model = tree.DecisionTreeClassifier()
        # add Bayes variants suitable for multi-label targets
        elif modelName == "MultiLabelNB":
            self.model = BinaryRelevance(GaussianNB())
        elif modelName == "MultiLabelSVM":
            self.model = BinaryRelevance(SVC())
        else:
            # NOTE: self.model is left unset in this branch (original behavior).
            print("YOU CHOSE WRONG MODEL FOR CLASSIFICATION!")
    else:
        if modelName == "LR":
            self.model = linear_model.LinearRegression()
        elif modelName == "M5":
            # BUG FIX: instantiate — the original assigned the class object.
            self.model = tree.DecisionTreeRegressor()
        elif modelName == "KNN":
            self.model = KNeighborsRegressor()
        else:
            # Fall back to plain linear regression for unknown names.
            print("YOU CHOSE WRONG MODEL FOR REGRESSION!")
            self.model = linear_model.LinearRegression()
def binary(X_train, X_test, y_train, y_test):
    """Train a Binary Relevance SVC model and report multi-label metrics.

    Prints each metric and returns them as a tuple:
    (hamming, subset_accuracy, recall, precision, f1, coverage, aps,
    rankingloss).
    """
    print("Binary Relevance")
    clf = BinaryRelevance(classifier=SVC(), require_dense=[True, True]).fit(
        X_train, y_train)
    y_pred = clf.predict(X_test)
    # Ranking-based metrics need a dense label matrix.
    y_pred_dense = y_pred.toarray()

    hamming = hamming_loss(y_test, y_pred)
    subset_accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='micro')
    precision = precision_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')
    coverage = coverage_error(y_test, y_pred_dense)
    aps = label_ranking_average_precision_score(y_test, y_pred_dense)
    rankingloss = label_ranking_loss(y_test, y_pred_dense)

    print("Hamming: " + str(hamming))
    print("Subset Accuracy: " + str(subset_accuracy))
    print("Recall: " + str(recall))
    print("Precision: " + str(precision))
    print("F1: " + str(f1))
    print("Coverage error: " + str(coverage))
    print("Average Precision Score: " + str(aps))
    print("Ranking Loss: " + str(rankingloss))
    print("\n")
    return (hamming, subset_accuracy, recall, precision, f1, coverage, aps,
            rankingloss)
def select_features(data_train, data_test):
    """Reduce both frames to the union of per-label important features.

    Fits one ExtraTrees ensemble per label (binary relevance), keeps the
    top 10% of features by importance for each label, and drops every
    feature column outside the union of those selections.

    Returns (reduced_train, reduced_test, number_of_selected_features).
    """
    x_train = data_train.iloc[:, :NO_FEATURES]
    # NOTE(review): the slice stops at -1, so the last column is excluded
    # from the label block — confirm that is intentional.
    y_train = data_train.iloc[:, NO_FEATURES:-1]
    x_train_sp = lil_matrix(x_train).toarray()
    y_train_sp = lil_matrix(y_train).toarray()

    forest = ExtraTreesClassifier(n_estimators=100, random_state=SEED_NUMBER)
    classifier = BinaryRelevance(forest)
    classifier.fit(x_train_sp, y_train_sp)

    # One importance vector per label-specific ensemble.
    feature_scores = [
        clf.feature_importances_ for clf in classifier.classifiers_
    ]
    indices = [argsort(importance)[::-1] for importance in feature_scores]
    # Top 10% of features per label, then the union across all labels.
    selected_per_class = [
        index[:int(0.1 * NO_FEATURES)].tolist() for index in indices
    ]
    selected_union = list(set().union(*selected_per_class))
    # Removed dead code: an averaged-importance ranking (mean/argsort) was
    # computed here but its result was never used.

    drop_col = [idx for idx in range(NO_FEATURES) if idx not in selected_union]
    train_red = data_train.drop(data_train.columns[drop_col], axis=1)
    test_red = data_test.drop(data_test.columns[drop_col], axis=1)
    return train_red, test_red, len(selected_union)
class BinaryRelevancesSimple:
    """Thin binary-relevance wrapper around an LGBM or RandomForest base."""

    def __init__(self, model):
        """Pick the base estimator: 'RF' -> RandomForest, anything else -> LightGBM."""
        if model == 'RF':
            self.model = BinaryRelevance(
                RandomForestClassifier(n_estimators=200, max_depth=12))
        else:
            self.model = BinaryRelevance(LGBMClassifier())

    def fit(self, X_train, y_train):
        """Fit the wrapped multi-label model, logging elapsed wall time."""
        print ('###start trainging...')
        t0 = time.time()
        self.model.fit(X_train, y_train)
        print ('####training time:', time.time() - t0)

    def predict_proba(self, X_test):
        """Return per-label probabilities as a dense ndarray."""
        return self.model.predict_proba(X_test).A
def buildBRClassifier(xTrain, yTrain):
    """Train and return a binary-relevance classifier with Gaussian NB bases.

    One independent GaussianNB model is fitted per label column of yTrain.
    """
    br = BinaryRelevance(GaussianNB())
    br.fit(xTrain, yTrain)
    return br
def train(X, y):
    """Split X/y, fit a binary-relevance SVC, and print validation accuracy.

    Accuracy is reported as 1 - Hamming loss on the held-out third.
    """
    br = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    print("before train")
    br.fit(X_train, y_train)
    print("train over begin predict")
    preds = br.predict(X_test)
    score = 1 - hamming_loss(y_test, np.stack(preds))
    print("validate loss accuracy: {}".format(score))
def train(self):
    """Fit Gaussian-NB binary relevance on the training data and score the test data.

    Returns a dict with subset 'accuracy' and micro-averaged 'f1_score'.
    """
    br = BinaryRelevance(GaussianNB())
    br.fit(self.x_data, self.y_data)
    preds = br.predict(self.x_test)
    results = {
        'accuracy': accuracy_score(self.y_test, preds),
        'f1_score': f1_score(self.y_test, preds, average='micro'),
    }
    return results
def binRel(X_train, X_test, y_test, y_train):
    """Fit a binary-relevance Gaussian NB model and print its Hamming loss.

    NOTE: the parameter order (y_test before y_train) is part of the
    existing interface and is preserved.
    """
    br = BinaryRelevance(GaussianNB())
    br.fit(X_train, y_train)
    preds = br.predict(X_test)
    loss = sklearn.metrics.hamming_loss(y_test, preds)
    print('Hamming loss: {0}'.format(loss))
def BN_fit(clfs, X_train, y_train, X_test, y_test, evaluate):
    """Wrap each base classifier in BinaryRelevance, fit, and score it.

    Args:
        clfs: mapping of name -> base estimator.
        evaluate: iterable of metric names understood by `scores`.

    Returns:
        Dict mapping "<classifier name> <metric name>" to its score.
    """
    metrics_lb = {}
    # Idiom fix: iterate items() directly instead of
    # zip(clfs.keys(), clfs.values()).
    for key, base_clf in clfs.items():
        print('Fitting BinaryRelevance with Classifier : %s' % key)
        clf = BinaryRelevance(base_clf)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        for m in evaluate:
            metrics_lb[key + ' ' + m] = scores(m, y_test, preds)
    return metrics_lb
def __init__(self):
    """Initialize with a binary-relevance LightGBM model using default
    hyper-parameters (an explicit params dict was previously sketched in
    comments but never applied)."""
    self.model = BinaryRelevance(LGBMClassifier())
def formDataMultiLabel(sampData, df):
    """Build and evaluate a multi-label GaussianNB pipeline over TF-IDF text.

    Binarizes `df.sec_label_num` as the multi-label target, trains a
    binary-relevance GaussianNB model on TF-IDF features of `sampData`,
    prints the subset accuracy, and returns the factorized integer codes
    of `df.inp_Data`.

    NOTE(review): `tfdLabelNum` and `tfdInpRPAData` are computed but only
    consumed by the commented-out concatenation below — presumably kept
    for a planned multi-source target; confirm before removing.
    """
    ####### Convert Response Category into Labelized Binarizer ################
    manActData = df.label_num.unique()
    lb = preprocessing.LabelBinarizer()
    lb.fit(manActData)
    tfdLabelNum = lb.transform(df.label_num)  # unused below (see docstring)
    ####### Convert Next Action Category into Labelized Binarizer #############
    nxtActData = df.sec_label_num.unique()
    lb = preprocessing.LabelBinarizer()
    lb.fit(nxtActData)
    tfdSecLabelNum = lb.transform(df.sec_label_num)
    ####### Convert Response Category into Labelized Binarizer ################
    inpRPAData = (df.inp_Data).astype(str)
    # Keep only the first whitespace-separated token of each entry.
    inpRPAData = inpRPAData.apply(lambda x: x.split()[0])
    lab, lev = pd.factorize(inpRPAData)
    lb = preprocessing.LabelBinarizer()
    lb.fit(np.unique(lab))
    tfdInpRPAData = lb.transform(lab)  # unused below (see docstring)
    #print (np.unique(tfdInpRPAData))
    # This concatenation is the actual (currently disabled) process:
    #conCatData = np.concatenate((tfdLabelNum, tfdSecLabelNum, tfdInpRPAData), axis=1)
    ####### Build Multi-Label Prediction Model ###############################
    respTrain, respTest, labTrain, labTest = train_test_split(sampData, tfdSecLabelNum, random_state=1)
    # TR and RF are alternative base estimators for the commented-out
    # ClassifierChain / LabelPowerset variants below.
    TR = tree.DecisionTreeClassifier(criterion = "gini", max_depth=100, min_samples_leaf=2)
    GNB = GaussianNB()
    RF = RandomForestClassifier(n_estimators = 100)
    classifier = BinaryRelevance(GNB)
    #classifier = ClassifierChain(TR)
    #classifier = LabelPowerset(RF)
    vect = TfidfVectorizer(min_df=1, max_df=1.0, stop_words='english')
    # Fit the vectorizer on the training split only; transform the test split.
    respTrainVec = vect.fit_transform(respTrain)
    respTestVec = vect.transform(respTest)
    classifier.fit(respTrainVec, labTrain)
    predictions = classifier.predict(respTestVec)
    acc = metrics.accuracy_score(labTest, predictions)
    print (acc)
    return lab
def main():
    """Compare penalized binary-relevance baselines with CSFS and SMILE.

    Loads the 'bibtex' and 'medical' .mat datasets from hard-coded local
    Windows paths, min-max scales the features, evaluates L1/L2-penalized
    logistic-regression and linear-SVC binary-relevance models on
    'medical' (trained and evaluated on the same data), then runs the
    project's CSFS and SMILE methods under a partial-label setting and
    prints sample-averaged F1 scores.
    """
    # NOTE(review): hard-coded local paths — data must exist on this machine.
    bibtex = sci.loadmat('D:\课程作业\机器学习\机器学习课程设计\dataset\\bibtex.mat')
    medical = sci.loadmat('D:\课程作业\机器学习\机器学习课程设计\dataset\medical.mat')
    bib_X = bibtex['data']  #7395,1836
    bib_y = bibtex['target']  #159,7395
    med_X = medical['data']  #978,1449
    med_y = medical['target']  #45,978  (labels x samples, hence the .T below)
    scaler = MinMaxScaler()
    scaler.fit(bib_X)
    bib_X = scaler.transform(bib_X)
    scaler = MinMaxScaler()
    scaler.fit(med_X)
    med_X = scaler.transform(med_X)
    f1_scores = []
    # Penalty names to sweep; the loop variable is (confusingly) named l2.
    l2_s = ['l1', 'l2']
    for l2 in l2_s:
        clf = BinaryRelevance(
            LogisticRegression(penalty=l2, solver='liblinear', dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    for l2 in l2_s:
        clf = BinaryRelevance(LinearSVC(penalty=l2, dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    # Rows: penalty; columns: loss (logistic vs hinge).
    tabel = PrettyTable(["", "log", "hinge"])
    tabel.padding_width = 1
    tabel.add_row(["l1", f1_scores[0], f1_scores[2]])
    tabel.add_row(["l2", f1_scores[1], f1_scores[3]])
    csfs = CSFS(u=0.1)
    W, b = csfs.fit(med_X.T, med_y.T, u=0.1)
    pred = csfs.predict(med_X.T, W, b)
    # Zero out annotations for the last 30% of columns — presumably to
    # simulate missing labels; confirm the intended axis (target is
    # labels x samples here).
    new_y = np.zeros(med_y.shape)
    size = int(med_y.shape[1] * 0.7)
    new_y[:, :size] = med_y[:, :size]
    smile = SMILE(alpha=0.1)
    smile.fit(med_X.T, new_y)
    pred_s = smile.predict(med_X.T)
    # NOTE(review): csfs2 is constructed but csfs is refit below — likely
    # csfs2 was intended; confirm.
    csfs2 = CSFS(u=0.1)
    W, b = csfs.fit(med_X.T, new_y.T, u=0.1)
    pred = csfs.predict(med_X.T, W, b)
    print('large mult_score:', metrics.f1_score(med_y.T, pred, average='samples'))
    # NOTE(review): the two labels below look swapped — pred_s comes from
    # SMILE but is printed as 'CSFSf1_scores', while pred (CSFS) is printed
    # as 'SMILE_score'; confirm before relying on this output.
    print('CSFSf1_scores:', metrics.f1_score(med_y.T, pred_s, average='samples'))
    print('SMILE_score:', metrics.f1_score(med_y.T, pred, average='samples'))
    print(tabel)
def fit(self, X, y):
    """Fit one probabilistic SVC per label via binary relevance.

    Stores the fitted BinaryRelevance object and its per-label
    classifiers on self, and returns the fitted wrapper.
    """
    self.BinaryRelevanceObject = BinaryRelevance(
        classifier=SVC(gamma='auto', probability=True),
        require_dense=[True, True])
    # BUG FIX: the original trained twice (once here and once inside the
    # return statement); a single fit is sufficient and halves training cost.
    self.BinaryRelevanceObject.fit(X, y)
    # the classifiers for each label
    self.classifiers = self.BinaryRelevanceObject.classifiers_
    return self.BinaryRelevanceObject
def train_model(self, train):
    """Vectorize the training samples and fit a balanced probabilistic SVC.

    Each element of `train` is converted via self.get_value into a
    (features, labels) pair; a binary-relevance SVC is fitted on the
    stacked result and returned.
    """
    pairs = [self.get_value(sample, False) for sample in train]
    X_train = np.array([features for features, _ in pairs])
    y_train = np.array([labels for _, labels in pairs])
    base = SVC(probability=True, class_weight='balanced', break_ties=True)
    model = BinaryRelevance(classifier=base, require_dense=[False, True])
    model.fit(X_train, y_train)
    return model
def classifiers(X_train, Y_train, X_test):
    """Fit BinaryRelevance, ClassifierChain and LabelPowerset GaussianNB models.

    Returns their predictions for X_test as a 3-tuple, in that order.
    """
    models = [
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    ]
    for model in models:
        model.fit(X_train, Y_train)
    preds = [model.predict(X_test) for model in models]
    return preds[0], preds[1], preds[2]
def __init__(self, model):
    """Choose the wrapped estimator: 'RF' selects a RandomForest base,
    anything else selects LightGBM (previous params dict left as history)."""
    if model == 'RF':
        self.model = BinaryRelevance(
            RandomForestClassifier(n_estimators=200, max_depth=12))
    else:
        self.model = BinaryRelevance(LGBMClassifier())
def test_if_dense_classification_works_on_non_dense_base_classifier(
        self):
    """Dense-converting BinaryRelevance should work with a non-dense Keras base."""
    keras_base = Keras(create_model_single_class, False, KERAS_PARAMS)
    br = BinaryRelevance(classifier=keras_base,
                         require_dense=[True, True])
    self.assertClassifierWorksWithSparsity(br, 'dense')
def multiLabel_SKLearn_GaussianNBayes(rData, lData, sData):
    """Train a two-label binary-relevance GaussianNB model; return subset accuracy.

    Args:
        rData: feature DataFrame (one row per sample).
        lData, sData: per-sample label Series forming the two label columns.
    """
    xData = rData.values
    # BUG FIX: stack the two label vectors as columns so the target has
    # shape (n_samples, 2). The original np.array([lData.values,
    # sData.values]) built a (2, n_samples) array, which train_test_split
    # rejects as an inconsistent sample count.
    yData = np.column_stack((lData.values, sData.values))
    respTrain, respTest, labTrain, labTest = train_test_split(
        xData, yData, random_state=1)
    classifier = BinaryRelevance(GaussianNB())
    #classifier = ClassifierChain(GaussianNB())
    #classifier = LabelPowerset(GaussianNB())
    classifier.fit(respTrain, labTrain)
    predictions = classifier.predict(respTest)
    acc = accuracy_score(labTest, predictions)
    return acc
def RecommendByBinaryRelevance(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Recommend using the multi-label binary relevance transformation.

    Fits a random-forest BR model, ranks labels by predicted probability
    via DataProcessUtils.getListFromProbable, and returns
    [recommendList, answerList].
    """
    base = RandomForestClassifier(oob_score=True, max_depth=10,
                                  min_samples_split=20)
    classifier = BinaryRelevance(base)
    classifier.fit(train_data, train_data_y)
    # Convert the sparse probability matrix to a plain ndarray.
    predictions = classifier.predict_proba(test_data).todense().getA()
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(
        predictions, label_ids, recommendNum)
    answerList = test_data_y
    print(predictions)
    print(test_data_y)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def binary_relevance(self):
    '''Name: Binary Relevance
       Main Idea: decompose the multi-label problem into independent
       per-label binary classifiers (GaussianNB bases).
       Evaluation Metric: subset accuracy (accuracy_score), printed.
    '''
    print(self.X_train)
    print(self.y_train)
    br = BinaryRelevance(GaussianNB())
    br.fit(self.X_train, self.y_train)
    preds = br.predict(self.X_test)
    print(preds)
    score = accuracy_score(self.y_test, preds)
    print(score)
def __init__(
        self,
        rdm_state=84,
        params={"estimator__C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
        niterations=5):
    """Configure a binary-relevance logistic-regression model and its grid.

    NOTE: `params` uses a shared mutable default (kept for interface
    compatibility); callers must not mutate it in place.
    """
    base = LogisticRegression(random_state=rdm_state)
    self.model = BinaryRelevance(base)
    self.params = params
    self.niterations = niterations
def BinaryRelevance_method(X_train, y_train, samples_leaf, samples_split):
    """Problem transformation -> binary relevance with decision-tree bases.

    :param X_train: input feature data
    :param y_train: corresponding label data
    :param samples_leaf: min_samples_leaf for each tree (coerced to int)
    :param samples_split: min_samples_split for each tree (coerced to int)
    :return: the fitted classifier, or None if training failed
    """
    try:
        base = DecisionTreeClassifier(
            min_samples_leaf=int(samples_leaf),
            min_samples_split=int(samples_split))
        br = BinaryRelevance(base)
        br.fit(X_train, y_train)
        return br
    except Exception as e:
        # Best-effort: log the failure and signal it with None
        # (original behavior, message text preserved).
        print("warning----二元关联|BinaryRelevance_method----" + str(e))
        return None
class MyBinaryRelevanceFeatureSelect():
    """Binary-relevance SVC wrapper exposing the per-label classifiers.

    Fits one probabilistic SVC per label and delegates prediction to the
    underlying skmultilearn BinaryRelevance object stored on the instance.
    """

    def fit(self, X, y):
        """Fit the binary-relevance model and cache its per-label classifiers.

        Returns the fitted BinaryRelevance object.
        """
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])
        # BUG FIX: fit exactly once — the original also re-fitted inside
        # the return statement, doubling training time for no benefit.
        self.BinaryRelevanceObject.fit(X, y)
        # the classifiers for each label
        self.classifiers = self.BinaryRelevanceObject.classifiers_
        return self.BinaryRelevanceObject

    def predict(self, X, y=None):
        """Predict the label matrix for X (`y` accepted for API symmetry)."""
        return self.BinaryRelevanceObject.predict(X)

    def predict_proba(self, X):
        """Predict per-label probabilities for X."""
        return self.BinaryRelevanceObject.predict_proba(X)
def get_train_test_lda(self, topic):
    """Augment the medical train/test splits with discretized LDA topic features.

    For each topic count k in `topic`, fits LDA over the training features
    and (separately) the training labels, appends the discretized topic
    mixtures to X_train, and trains a binary-relevance random forest to
    predict the same topic columns for X_test.

    Returns (X_train, y_train, X_test, y_test) as numpy arrays.

    NOTE(review): relies on module-level `dir` and `num_label` globals;
    the arff files are opened without closing the handles. With more than
    one k, X_iter/X_test column counts drift between iterations — confirm
    the multi-k case is exercised and correct.
    """
    # get training set
    dataset = arff.load(open(os.path.join(dir, "medical-train.arff")), encode_nominal=True)
    dataset = np.array(dataset.get("data"))
    # The last num_label columns are the labels; the rest are features.
    X_train = dataset[:, :-num_label]
    y_train = dataset[:, -num_label:]
    # get test set
    dataset = arff.load(open(os.path.join(dir, "medical-test.arff")), encode_nominal=True)
    dataset = np.array(dataset.get("data"))
    X_test = dataset[:, :-num_label]
    y_test = dataset[:, -num_label:]
    for k in topic:
        # lda.LDA expects integer count features — presumably the raw
        # features are counts; verify.
        X_iter = X_train.astype(np.int64)
        # get training_data feature topics
        model = lda.LDA(n_topics=k, n_iter=1000)
        model.fit(X_iter)
        doc_topic_x = model.doc_topic_
        # get training data label topics
        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic_y = model_label.doc_topic_
        # concat feature-topic and label topic
        x = np.hstack((doc_topic_x, doc_topic_y))
        # discretize the topics so they can serve as multi-label targets
        x = self.discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))
        # multi-label learning to get test_data label topics and feature topics
        classifier = BinaryRelevance(RandomForestClassifier())
        classifier.fit(X_iter, x)
        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        X_test = np.hstack((X_test, x))
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
def gaussianNaiveBayesBinary():
    """Train, persist, reload and evaluate a Gaussian-NB binary-relevance model.

    Uses the module-level train_x/train_y/test_x/test_y splits and the
    accuracy() helper; the model is round-tripped through a pickle file.
    """
    print("Gaussian naive bayes binary")
    start = time.time()
    classifier = BinaryRelevance(GaussianNB())
    filename = "gaussianNaiveBayes"
    classifier.fit(train_x, train_y)
    # save — BUG FIX: use context managers so the pickle file handles are
    # closed deterministically instead of leaking until GC.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)
    # load the model from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def supportVectorMachine():
    """Train, persist, reload and evaluate an SVC binary-relevance model.

    Uses the module-level train_x/train_y/test_x/test_y splits and the
    accuracy() helper; the model is round-tripped through a pickle file.
    """
    print("Support vector machine")
    start = time.time()
    classifier = BinaryRelevance(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"
    classifier.fit(train_x, train_y)
    # save — BUG FIX: context managers close the pickle file handles
    # deterministically instead of leaking until GC.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)
    # load the model from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def randomForest():
    """Train, persist, reload and evaluate a random-forest BR model.

    Uses the module-level train_x/train_y/test_x/test_y splits and the
    accuracy() helper; the model is round-tripped through a pickle file.
    """
    print("Random forest classifier")
    start = time.time()
    classifier = BinaryRelevance(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForest"
    classifier.fit(train_x, train_y)
    # save — BUG FIX: context managers close the pickle file handles
    # deterministically instead of leaking until GC.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)
    # load the model from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def knnBinary(m):
    """Train, persist, reload and evaluate a k-NN binary-relevance model.

    Args:
        m: number of neighbors for the KNeighborsClassifier base.

    Uses the module-level train_x/train_y/test_x/test_y splits and the
    accuracy() helper; the model is round-tripped through a pickle file.
    """
    print("knn binary")
    start = time.time()
    classifier = BinaryRelevance(KNeighborsClassifier(n_neighbors=m))
    filename = "knnBinary"
    classifier.fit(train_x, train_y)
    # save — BUG FIX: context managers close the pickle file handles
    # deterministically instead of leaking until GC.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)
    # load the model from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def __init__(
        self,
        random_state=84,
        params={
            'estimator__C': [1, 10, 100, 1000],
            'estimator__gamma': [0.001, 0.0001],
            'estimator__kernel': ['rbf', 'linear']
        },
        niterations=10):
    """Set up a binary-relevance SVC model and its hyper-parameter grid.

    NOTE: `params` uses a shared mutable default (kept for interface
    compatibility); callers must not mutate it in place.
    """
    svc = SVC(random_state=random_state)
    self.model = BinaryRelevance(svc)
    self.params = params
    self.niterations = niterations