def main():
    """Compare Binary Relevance (logistic / hinge, l1 / l2) and the CSFS and
    SMILE weak-label learners on the medical dataset; print an F1 table.

    Relies on module-level imports: sci (scipy.io), MinMaxScaler,
    BinaryRelevance, LogisticRegression, LinearSVC, metrics, PrettyTable,
    CSFS, SMILE, np.
    """
    # Raw strings keep the Windows backslashes literal ('\b' would otherwise
    # be a backspace escape; '\m', '\d' are deprecated invalid escapes).
    bibtex = sci.loadmat(r'D:\课程作业\机器学习\机器学习课程设计\dataset\bibtex.mat')
    medical = sci.loadmat(r'D:\课程作业\机器学习\机器学习课程设计\dataset\medical.mat')
    bib_X = bibtex['data']     # 7395,1836
    bib_y = bibtex['target']   # 159,7395
    med_X = medical['data']    # 978,1449
    med_y = medical['target']  # 45,978
    # Scale each dataset's features to [0, 1].
    scaler = MinMaxScaler()
    scaler.fit(bib_X)
    bib_X = scaler.transform(bib_X)
    scaler = MinMaxScaler()
    scaler.fit(med_X)
    med_X = scaler.transform(med_X)

    f1_scores = []
    penalties = ['l1', 'l2']
    # Binary Relevance with logistic loss, one run per penalty.
    for penalty in penalties:
        clf = BinaryRelevance(
            LogisticRegression(penalty=penalty, solver='liblinear', dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    # Binary Relevance with hinge loss, one run per penalty.
    for penalty in penalties:
        clf = BinaryRelevance(LinearSVC(penalty=penalty, dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    tabel = PrettyTable(["", "log", "hinge"])
    tabel.padding_width = 1
    tabel.add_row(["l1", f1_scores[0], f1_scores[2]])
    tabel.add_row(["l2", f1_scores[1], f1_scores[3]])

    # CSFS trained on the complete label matrix.
    csfs = CSFS(u=0.1)
    W, b = csfs.fit(med_X.T, med_y.T, u=0.1)
    pred_full = csfs.predict(med_X.T, W, b)

    # Hide the last 30% of the label columns to simulate missing labels.
    new_y = np.zeros(med_y.shape)
    size = int(med_y.shape[1] * 0.7)
    new_y[:, :size] = med_y[:, :size]
    smile = SMILE(alpha=0.1)
    smile.fit(med_X.T, new_y)
    pred_s = smile.predict(med_X.T)

    # CSFS trained on the partially observed labels.
    # Bug fix: the original refit `csfs` (csfs2 was created but never used)
    # and overwrote `pred`, so the three printed scores were mislabelled
    # (the SMILE line printed a CSFS prediction and vice versa).
    csfs2 = CSFS(u=0.1)
    W2, b2 = csfs2.fit(med_X.T, new_y.T, u=0.1)
    pred_partial = csfs2.predict(med_X.T, W2, b2)

    print('large mult_score:', metrics.f1_score(med_y.T, pred_full, average='samples'))
    print('CSFSf1_scores:', metrics.f1_score(med_y.T, pred_partial, average='samples'))
    print('SMILE_score:', metrics.f1_score(med_y.T, pred_s, average='samples'))
    print(tabel)
def binary(X_train, X_test, y_train, y_test):
    """Train a Binary Relevance SVC model, print its multi-label metrics,
    and return them as a tuple."""
    print("Binary Relevance")
    # One independent binary SVC per label; inputs densified for SVC.
    clf = BinaryRelevance(classifier=SVC(),
                          require_dense=[True, True]).fit(X_train, y_train)
    predicted = clf.predict(X_test)
    predicted_dense = predicted.toarray()  # ranking metrics need a dense array
    ham = hamming_loss(y_test, predicted)
    subset_acc = accuracy_score(y_test, predicted)
    rec = recall_score(y_test, predicted, average='micro')
    prec = precision_score(y_test, predicted, average='micro')
    f1_micro = f1_score(y_test, predicted, average='micro')
    cov = coverage_error(y_test, predicted_dense)
    avg_prec = label_ranking_average_precision_score(y_test, predicted_dense)
    rank_loss = label_ranking_loss(y_test, predicted_dense)
    print("Hamming: " + str(ham))
    print("Subset Accuracy: " + str(subset_acc))
    print("Recall: " + str(rec))
    print("Precision: " + str(prec))
    print("F1: " + str(f1_micro))
    print("Coverage error: " + str(cov))
    print("Average Precision Score: " + str(avg_prec))
    print("Ranking Loss: " + str(rank_loss))
    print("\n")
    return ham, subset_acc, rec, prec, f1_micro, cov, avg_prec, rank_loss
def train(X, y):
    """Fit a Binary Relevance SVC on a 67/33 split and print the
    validation accuracy (1 - Hamming loss)."""
    model = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.33)
    print("before train")
    model.fit(X_tr, y_tr)
    print("train over begin predict")
    preds = model.predict(X_val)
    score = 1 - hamming_loss(y_val, np.stack(preds))
    print("validate loss accuracy: {}".format(score))
def train(self):
    """Fit a Binary Relevance Gaussian NB model on the training split and
    return accuracy and micro-averaged F1 on the test split."""
    model = BinaryRelevance(GaussianNB())
    model.fit(self.x_data, self.y_data)
    preds = model.predict(self.x_test)
    results = {
        'accuracy': accuracy_score(self.y_test, preds),
        'f1_score': f1_score(self.y_test, preds, average='micro'),
    }
    return results
def BN_fit(clfs, X_train, y_train, X_test, y_test, evaluate):
    """Wrap each base classifier in BinaryRelevance, fit, and score it.

    Parameters
    ----------
    clfs : dict mapping a display name to an (unwrapped) base classifier.
    evaluate : iterable of metric names understood by the module-level
        ``scores`` helper.

    Returns
    -------
    dict mapping "<classifier name> <metric name>" to the metric value.
    """
    metrics_lb = {}
    # dict.items() replaces the zip(keys(), values()) round-trip, and a
    # distinct name avoids rebinding the loop's base classifier.
    for key, base_clf in clfs.items():
        print('Fitting BinaryRelevance with Classifier : %s' % key)
        wrapped = BinaryRelevance(base_clf)
        wrapped.fit(X_train, y_train)
        preds = wrapped.predict(X_test)
        for m in evaluate:
            metrics_lb[key + ' ' + m] = scores(m, y_test, preds)
    return metrics_lb
def binRel(X_train, X_test, y_test, y_train):
    """Binary Relevance with a Gaussian naive Bayes base learner; prints
    the Hamming loss on the test split.

    NOTE(review): the parameter order (y_test before y_train) is unusual —
    kept as-is because callers depend on it; confirm call sites.
    """
    # one binary NB classifier per label
    model = BinaryRelevance(GaussianNB())
    # train
    model.fit(X_train, y_train)
    # predict
    preds = model.predict(X_test)
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, preds)))
def formDataMultiLabel(sampData, df):
    """Train a multi-label model on ``sampData`` (text) against the
    binarized ``sec_label_num`` targets and return the factorized
    ``inp_Data`` codes.

    NOTE(review): only tfdSecLabelNum is used as the training target;
    tfdLabelNum and tfdInpRPAData are computed but unused — the intended
    concatenation of all three is commented out below. Confirm intent.
    """
    ####### Convert Response Category into Labelized Binarizer ################
    manActData = df.label_num.unique()
    lb = preprocessing.LabelBinarizer()
    lb.fit(manActData)
    tfdLabelNum = lb.transform(df.label_num)
    ####### Convert Next Action Category into Labelized Binarizer #############
    nxtActData = df.sec_label_num.unique()
    lb = preprocessing.LabelBinarizer()
    lb.fit(nxtActData)
    tfdSecLabelNum = lb.transform(df.sec_label_num)
    ####### Convert Input RPA Data into Labelized Binarizer ###################
    inpRPAData = (df.inp_Data).astype(str)
    # keep only the first whitespace-separated token of each entry
    inpRPAData = inpRPAData.apply(lambda x: x.split()[0])
    # factorize: `lab` are integer codes, `lev` the distinct values
    lab, lev = pd.factorize(inpRPAData)
    lb = preprocessing.LabelBinarizer()
    lb.fit(np.unique(lab))
    tfdInpRPAData = lb.transform(lab)
    #print (np.unique(tfdInpRPAData))
    #This concatenation is the actual process
    #conCatData = np.concatenate((tfdLabelNum, tfdSecLabelNum, tfdInpRPAData), axis=1)
    ####### Build Multi-Label Prediction Model ###############################
    respTrain, respTest, labTrain, labTest = train_test_split(sampData, tfdSecLabelNum, random_state=1)
    # Candidate base learners; only GNB is actually wrapped below.
    TR = tree.DecisionTreeClassifier(criterion = "gini", max_depth=100, min_samples_leaf=2)
    GNB = GaussianNB()
    RF = RandomForestClassifier(n_estimators = 100)
    classifier = BinaryRelevance(GNB)
    #classifier = ClassifierChain(TR)
    #classifier = LabelPowerset(RF)
    # TF-IDF features are fit on the training text only.
    vect = TfidfVectorizer(min_df=1, max_df=1.0, stop_words='english')
    respTrainVec = vect.fit_transform(respTrain)
    respTestVec = vect.transform(respTest)
    classifier.fit(respTrainVec, labTrain)
    predictions = classifier.predict(respTestVec)
    acc = metrics.accuracy_score(labTest, predictions)
    print (acc)
    return lab
def problemTransformation(data):
    """Run the three problem-transformation strategies (Binary Relevance,
    Classifier Chains, Label Powerset) with a Gaussian NB base learner
    and return their subset accuracies.

    NOTE(review): ``data`` is unused; the function reads the module-level
    X_train / X_test / y_train / y_test splits — confirm that is intended.

    Returns
    -------
    dict mapping strategy name to its accuracy_score. (The original
    overwrote a single local with each score and returned None, so every
    result was discarded.)
    """
    results = {}
    for name, model in (
        ('binary_relevance', BinaryRelevance(GaussianNB())),
        ('classifier_chain', ClassifierChain(GaussianNB())),
        ('label_powerset', LabelPowerset(GaussianNB())),
    ):
        model.fit(X_train, y_train)          # train
        predictions = model.predict(X_test)  # predict
        results[name] = accuracy_score(y_test, predictions)
    return results
def classifiers(X_train, Y_train, X_test):
    """Fit Binary Relevance, Classifier Chain, and Label Powerset wrappers
    (all around Gaussian NB) and return their three prediction matrices."""
    models = [
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    ]
    for model in models:
        model.fit(X_train, Y_train)
    br_pred, cc_pred, lp_pred = (m.predict(X_test) for m in models)
    return br_pred, cc_pred, lp_pred
def multiLabel_SKLearn_GaussianNBayes(rData, lData, sData):
    """Binary Relevance + Gaussian NB over two label columns.

    Parameters
    ----------
    rData : DataFrame/Series of features.
    lData, sData : Series holding the two label columns.

    Returns
    -------
    Subset accuracy on the held-out split.
    """
    xData = rData.values
    # Bug fix: np.array([lData.values, sData.values]) built a
    # (2, n_samples) matrix, so label rows did not line up with feature
    # rows in train_test_split; stack the two labels as columns instead.
    yData = np.column_stack((lData.values, sData.values))
    respTrain, respTest, labTrain, labTest = train_test_split(xData, yData, random_state=1)
    classifier = BinaryRelevance(GaussianNB())
    #classifier = ClassifierChain(GaussianNB())
    #classifier = LabelPowerset(GaussianNB())
    classifier.fit(respTrain, labTrain)
    predictions = classifier.predict(respTest)
    acc = accuracy_score(labTest, predictions)
    return acc
def binary_relevance(self):
    """Binary Relevance: decompose the multi-label task into one binary
    Gaussian NB classifier per label; prints the subset accuracy_score."""
    print(self.X_train)
    print(self.y_train)
    model = BinaryRelevance(GaussianNB())
    model.fit(self.X_train, self.y_train)
    preds = model.predict(self.X_test)
    print(preds)
    score = accuracy_score(self.y_test, preds)
    print(score)
class MyBinaryRelevanceFeatureSelect():
    """Thin wrapper around skmultilearn's BinaryRelevance with a
    probability-enabled SVC base learner, exposing the fitted per-label
    classifiers for downstream feature selection."""

    def fit(self, X, y):
        """Fit one SVC per label and return the fitted BinaryRelevance object.

        Bug fix: the original fitted the ensemble a second time inside the
        return statement, doubling training cost for no benefit
        (BinaryRelevance.fit returns the estimator itself).
        """
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])
        self.BinaryRelevanceObject.fit(X, y)
        # the classifiers for each label, available only after fitting
        self.classifiers = self.BinaryRelevanceObject.classifiers_
        return self.BinaryRelevanceObject

    def predict(self, X, y=None):
        """Predict the label matrix for X (y accepted for API symmetry, unused)."""
        return self.BinaryRelevanceObject.predict(X)

    def predict_proba(self, X):
        """Per-label probability estimates for X."""
        return self.BinaryRelevanceObject.predict_proba(X)
def get_train_test_lda(self, topic):
    """Load the medical train/test ARFF splits and, for each topic count
    k in ``topic``, append discretized LDA topic features.

    For the training set the topics come from fitting LDA on the features
    and on the labels; for the test set the same augmented columns are
    predicted with a Binary Relevance random forest (labels are not
    available at test time).

    Relies on module-level names: dir, num_label, arff, lda, sp.
    Returns (X_train, y_train, X_test, y_test) as numpy arrays.
    """
    # get training set: last num_label columns are the labels
    dataset = arff.load(open(os.path.join(dir, "medical-train.arff")), encode_nominal=True)
    dataset = np.array(dataset.get("data"))
    X_train = dataset[:, :-num_label]
    y_train = dataset[:, -num_label:]
    # get test set
    dataset = arff.load(open(os.path.join(dir, "medical-test.arff")), encode_nominal=True)
    dataset = np.array(dataset.get("data"))
    X_test = dataset[:, :-num_label]
    y_test = dataset[:, -num_label:]
    for k in topic:
        # NOTE(review): X_iter is recomputed from the ORIGINAL columns of the
        # growing X_train each pass only via astype; with multiple k values the
        # augmented columns are included — confirm that is intended.
        X_iter = X_train.astype(np.int64)
        # get training_data feature topics
        model = lda.LDA(n_topics=k, n_iter=1000)
        model.fit(X_iter)
        doc_topic_x = model.doc_topic_
        # get training data label topics
        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic_y = model_label.doc_topic_
        # concat feature-topic and label topic
        x = np.hstack((doc_topic_x, doc_topic_y))
        # discretize the topics (helper defined elsewhere on this class)
        x = self.discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))
        # multi-label learning to get test_data label topics and feature topics:
        # learn a mapping features -> discretized topic columns, apply to test set
        classifier = BinaryRelevance(RandomForestClassifier())
        classifier.fit(X_iter, x)
        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        X_test = np.hstack((X_test, x))
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
def gaussianNaiveBayesBinary():
    """Train a Binary Relevance Gaussian NB model on the module-level
    train_x/train_y, persist it, reload it, and score it on test_x/test_y."""
    print("Gaussian naive bayes binary")
    start = time.time()
    classifier = BinaryRelevance(GaussianNB())
    filename = "gaussianNaiveBayes"
    classifier.fit(train_x, train_y)
    # save — the with-block closes the handle (the original leaked an open
    # file from pickle.dump(..., open(filename, 'wb')))
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)
    # load the model from disk
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def knnBinary(m):
    """Train a Binary Relevance k-NN model (k = m) on the module-level
    train_x/train_y, persist it, reload it, and score it on test_x/test_y."""
    print("knn binary")
    start = time.time()
    classifier = BinaryRelevance(KNeighborsClassifier(n_neighbors=m))
    filename = "knnBinary"
    classifier.fit(train_x, train_y)
    # save — the with-block closes the handle (the original leaked an open
    # file from pickle.dump(..., open(filename, 'wb')))
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)
    # load the model from disk
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def supportVectorMachine():
    """Train a Binary Relevance SVC model on the module-level
    train_x/train_y, persist it, reload it, and score it on test_x/test_y."""
    print("Support vector machine")
    start = time.time()
    classifier = BinaryRelevance(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"
    classifier.fit(train_x, train_y)
    # save — the with-block closes the handle (the original leaked an open
    # file from pickle.dump(..., open(filename, 'wb')))
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)
    # load the model from disk
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def randomForest():
    """Train a Binary Relevance random forest model on the module-level
    train_x/train_y, persist it, reload it, and score it on test_x/test_y."""
    print("Random forest classifier")
    start = time.time()
    classifier = BinaryRelevance(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForest"
    classifier.fit(train_x, train_y)
    # save — the with-block closes the handle (the original leaked an open
    # file from pickle.dump(..., open(filename, 'wb')))
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)
    # load the model from disk
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def BinaryRelevence ():
    """Split the module-level `df` (text + label columns), build TF-IDF
    features, and score a Binary Relevance Gaussian NB classifier."""
    # Train-Test Split =======================================================
    print("setting up a neural network...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    train_text = train['Book_Text']
    test_text = test['Book_Text']
    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                 ngram_range=(1,3), norm='l2')
    # Bug fix: fit the vocabulary on the TRAINING text only. The original
    # also called fit() on the test text, which overwrote the training
    # vocabulary/IDF weights and leaked test data into the features.
    vectorizer.fit(train_text)
    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels = ['Book_Text'], axis=1)
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels = ['Book_Text'], axis=1)
    # using binary relevance
    from skmultilearn.problem_transform import BinaryRelevance
    from sklearn.naive_bayes import GaussianNB
    # initialize binary relevance multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    # train
    classifier.fit(x_train, y_train)
    # predict
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    print("\n")
def multinomialLogisticRegressionChain():
    """Train a Binary Relevance multinomial logistic regression model on
    the module-level train_x/train_y, persist it, reload it, and score it."""
    # Train multi-classification model with logistic regression
    print("Logisticka regresija chain")
    start = time.time()
    classifier = BinaryRelevance(classifier=linear_model.LogisticRegression(
        multi_class='multinomial', solver='newton-cg'),
        require_dense=[False, True])
    filename = "logistickaRegresija.sav"
    classifier.fit(train_x, train_y)
    # save — the with-block closes the handle (the original leaked an open
    # file from pickle.dump(..., open(filename, 'wb')))
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)
    # load the model from disk
    with open(filename, 'rb') as model_file:
        classifier = pickle.load(model_file)
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def multil_labels_binary_relevance(train_data, test_data):
    """
    Runs correctly. Uses Binary Relevance for a multi-class, multi-label problem.
    :param train_data: frame whose column 0 is text, columns 1..20 are labels
    :param test_data: same layout as train_data
    :return: None (prints the accuracy)
    """
    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    from xgboost import XGBClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    # NOTE(review): xgt_param is defined but unused in this function.
    xgt_param = {'max_depth': 6, 'eta': 0.5, 'eval_metric': 'merror', 'silent': 1, 'objective': 'multi:softmax', 'num_class': 20}
    # use a Gaussian naive-Bayes base classifier
    classifier = BinaryRelevance(GaussianNB())# initialize the binary-relevance multi-label classifier
    # classifier = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4))
    X_train, y_train = train_data.iloc[:, [0]], train_data.iloc[:, list(range(1, 21))]
    X_test, y_test = test_data.iloc[:, [0]], test_data.iloc[:, list(range(1, 21))]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # training
    temp = X_train.values.tolist()
    X = []
    for i in range(len(temp)):
        X.append(temp[i][0])
    # NOTE(review): `tfidf` is a module-level vectorizer — confirm it is fitted before use.
    x = tfidf.transform(X)
    y = y_train.values.tolist()
    # Y = [0]*20#a matrix of length 20
    # for j in range(len(y)):
    #     if "1" in y[j]:
    #         indexs = y[j].index("1")
    #         Y.append(indexs+1)
    #     else:
    #         # print("0")
    #         Y.append(0)#actually 21 classes, since empty is possible
    Y = np.array(y)
    Y = Y.astype(np.int32)  # label values must be numeric, not object dtype
    classifier.fit(x, Y)# fit directly on the numeric label matrix
    # Original author's debugging note, kept verbatim below:
    """ 报错:raise TypeError('no supported conversion for types: %r' % (args,)) TypeError: no supported conversion for types: (dtype('O'),) 难道是?? 
    """
    # prediction
    temp = X_test.values.tolist()
    X_ts = []
    for i in range(len(temp)):
        X_ts.append(temp[i][0])
    x_test = tfidf.transform(X_ts)
    y_test = y_test.values.tolist()
    # Y_test = []
    # for j in range(len(y_test)):
    #     if "1" in y_test[j]:
    #         indexs = y_test[j].index("1")
    #         Y_test.append(indexs + 1)
    #     else:
    #         # print("0")
    #         Y_test.append(0)  # actually 21 classes, since empty is possible
    Y_test = np.array(y_test)# form a matrix
    Y_test = Y_test.astype(np.int32)
    unique_test, counts_test = np.unique(Y_test, return_counts=True)
    print("truth=", dict(zip(unique_test, counts_test)))
    predictions = classifier.predict(x_test)# a csr_matrix at this point
    predictions = predictions.toarray()  # does it contain zeros??
    unique, counts = np.unique(predictions, return_counts=True)
    print("preditions=", dict(zip(unique, counts)))
    from sklearn.metrics import accuracy_score
    score = accuracy_score(Y_test, predictions)
    print(score)
print(y_train.shape) print(y_test.shape) ''' print("\n\nTraining data with Binary Relevance using Gaussian Naive Bayes") #initialize binary relevance multi-label classifier #with a gaussian naive bayes base classifier classifier_binary = BinaryRelevance(GaussianNB()) # train for Binary Relevance classifier_binary.fit(X_train, y_train) # predict for Binary Relevance predictions_binary = classifier_binary.predict(X_test) #Hamming Loss for Binary Relevance hamm_loss_binary = hamming_loss(y_test, predictions_binary) print("Hamming Loss:", hamm_loss_binary) print("\n\n\nTraining data with Classifier Chains using Gaussian Naive Bayes") #initialize Classifier Chains multi-label classifier #with a gaussian naive bayes base classifier classifier_cc = ClassifierChain(GaussianNB()) # train for Classifier Chaines classifier_cc.fit(X_train, y_train)
# In[6]: # 1. using binary relevance # The multi-label problem is broken into some different single class classification problems from skmultilearn.problem_transform import BinaryRelevance from sklearn.naive_bayes import GaussianNB # initialize binary relevance multi-label classifier # with a gaussian naive bayes base classifier classifier = BinaryRelevance(GaussianNB()) # train classifier.fit(X_train, y_train) # predict predictions = classifier.predict(X_test) # In[7]: from sklearn.metrics import accuracy_score accuracy_score(y_test, predictions) # In[8]: # 2. using classifier chains # The problem would be transformed into some different single label problems. # Different from the previous method, it forms chains in order to preserve label correlation. from skmultilearn.problem_transform import ClassifierChain from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score
decision_function_shape='ovr', degree=1, gamma=100, kernel='linear', max_iter=-1, probability=False, random_state=6, shrinking=True, tol=0.001, verbose=False), require_dense = [False, True]) i = 0 j = 0 for i in range(0, 47): X_copy = X_orig[(i):(i+1)] #Slice the ith element from the numpy array y_copy = y_orig[(i):(i+1)] X_model = X_orig y_model = y_orig X_model = np.delete(X_model, i, axis = 0) #Create a new array to train the model with slicing out the ith item for LOOCV y_model = np.delete(y_model, i, axis = 0) classifier.fit(X_model, y_model) prediction = classifier.predict(X_copy) equal = prediction.toarray() print(equal, y_copy) if np.array_equal(y_copy, equal): j = j + 1 #print(y_copy, equal) if np.not_equal: #print(y_copy, equal) pass print(j/48)
fold_auc = [] for s in dataset: fp = open(timeStamp('./datasets/' + s + '/' + s), 'w') for i in range(0, nfolds): X_train, y_train = readDataFromFile('./datasets/' + s + '/' + s + str(i) + '.train') print('Reading: ./datasets/' + s + '/' + s + str(i) + '.train') X_test, y_test = readDataFromFile('./datasets/' + s + '/' + s + str(i) + '.test') print('Reading: ./datasets/' + s + '/' + s + str(i) + '.test') classif = BinaryRelevance( classifier=RandomForestClassifier(n_estimators=10), require_dense=[False, True]) classif.fit(X_train, y_train) y_score = classif.predict(X_test) #y_prob = classif.predict_proba(X_test) #-----------------------------------------# #Medidas: sklearn.metrics...(true,predict,..) acc = sklearn.metrics.accuracy_score(y_test, y_score) fold_accuracy.append(acc) #-----------------------------------------# hl = sklearn.metrics.hamming_loss(y_test, y_score) fold_hamming.append(hl) #-----------------------------------------# #Mean average precision m = sklearn.metrics.average_precision_score(y_test, y_score.toarray(), average='macro',
# Convert the splits to CSR sparse matrices; the 'user' id column is
# dropped from the feature matrices.
t_train = sc.sparse.csr_matrix(t_train.values)
X_test = sc.sparse.csr_matrix(X_test.drop('user', axis=1).values)
t_test = sc.sparse.csr_matrix(t_test.values)
X_train_scale = scale(X_train.toarray(
))  # scaling does not work well for many methods, since it offsets the similarities
X_test_scale = scale(X_test.toarray())
X_sparse = sc.sparse.csr_matrix(X.drop('user', axis=1).values)
t_sparse = sc.sparse.csr_matrix(t.values)
# firstly test the transformations with a simple naive-bayes classifier, roughly conclude that BR suits the most
# intuitively the hotels shouldn't have correlation based on userID, for its randomness
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
# NOTE: bare expressions — values shown in a notebook, discarded in a script.
accuracy_score(t_test, predictions)  # 0
mean_squared_error(t_test.toarray(), probabilities.toarray())  # 0.063299324514418692
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0
mean_squared_error(t_test.toarray(), probabilities.toarray())  # 0.084135897849476421
classifier = LabelPowerset(GaussianNB())
classifier.fit(X_train, t_train)
# y_pred = cross_val_predict(clf, X_train, y_train, cv=args.kfolds, n_jobs=-1, verbose=1) # 交叉验证 rs = KFold(n_splits=args.kfolds, shuffle=True, random_state=args.randomseed) # 生成 k-fold 训练集、测试集索引 cv_index_set = rs.split(X_train) k_fold_step = 1 # 初始化折数 # 暂存每次选中的测试集索引和对应预测结果 test_idx_cache = np.array([], dtype=np.int) pred_cache = np.empty((1, args.nclass)) # 迭代训练 k-fold 交叉验证 for train_index, test_index in cv_index_set: clf.fit(X_train[train_index], y_train[train_index]) # 测试集验证 y_pred = clf.predict(X_train[test_index]) # 暂存每次选中的测试集和预测结果 test_idx_cache = np.concatenate((test_idx_cache, test_index)) pred_cache = np.concatenate((pred_cache, y_pred.toarray()), axis=0) # 每个fold训练结束后次数 +1 k_fold_step += 1 # 删除第一行空白元素 pred_cache = np.delete(pred_cache, 0, axis=0) # 输出交叉验证后的预测结果 df_pred = pd.DataFrame(index=test_idx_cache, data=pred_cache, columns=df.columns.values[0:args.nclass]) df_pred.to_csv('{0}-pred.csv'.format(args.output), index_label='Sample ID') print('\nThe prediction saved to:`{0}-pred.csv`'.format(args.output)) end_time = time.time() # 程序结束时间 print("\n[Finished in: {0:.6f} mins = {1:.6f} seconds]".format(
import pandas as pd import numpy as np import pm_telemetry as pmt from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.preprocessing import LabelEncoder from skmultilearn.problem_transform import BinaryRelevance pd.set_option("display.max_columns", None) # prepare data tele = pmt.PmTelemetry() tele.merge_failures() tele.merge_errors() tele.merge_maint() tele.merge_machines() tele.gen_past_hr_rolling_telemetry(24, "_24", np.mean) print("Make datetime ordinal") tele.data["dt_ordinal"] = tele.data["datetime"].apply(lambda x: x.toordinal()) print("Label encode model") label_encoder = LabelEncoder() tele.data["model"] = label_encoder.fit_transform(tele.data["model"]) print(tele.data.info()) print(tele.data.head()) # start modelling print("Start modeling") features = [ "dt_ordinal", "machineID", "volt", "pressure", "vibration", "errorID_error1", "errorID_error2", "errorID_error3", "errorID_error4", "errorID_error5", "comp_comp1", "comp_comp2", "comp_comp3", "comp_comp4", "model", "age", "volt_24", "rotate_24", "vibration_24", "pressure_24"
roc_auc_score(y_test, pd.DataFrame(pred))) # # BinaryRelevance # * This one is an ensemble of single-class (Yes/No) binary classifier # * If there are n number of different labels it will create n datasets and train for each label and will result the union of all predicted labels. # * Here the correlation b/w the labels is not taken into account # In[65]: classifier = BinaryRelevance(LogisticRegression()) # In[66]: classifier.fit(x_train, y_train) print('Accuracy_score using BinaryRelevance is ', round(accuracy_score(y_test, classifier.predict(x_test)) * 100, 1), '%') print('-------------------------------------------------') print('roc_auc_score using BinaryRelevance is ', roc_auc_score(y_test, classifier.predict_proba(x_test).toarray())) # # Label Powerset # * Label Powerset creates a unique class for every possible label combination that is present in the training set, this way it makes use of label correlation # * Only problem with this method is as the no of classes increases its computational complexity also increases. # In[67]: log_classifier = LabelPowerset(LogisticRegression()) # In[68]:
# Tokenize + POS-tag the raw comment text, then build the feature
# datasets and their TF-IDF representations (helpers defined elsewhere).
train_data = tokenization_pos_tag(train_data["comment_text"])
test_data = tokenization_pos_tag(test_data["comment_text"])
X_train = dataset_creation(train_data)
X_test = dataset_creation(test_data)
X_train_tfidf, X_test_tfidf = function_tfidf(X_train, X_test)

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import LinearSVC

# One independent linear SVM per label via Binary Relevance.
classifier = BinaryRelevance(LinearSVC())
classifier.fit(X_train_tfidf, train_labels)
# predict
predictions = classifier.predict(X_test_tfidf)
# accuracy
from sklearn.metrics import accuracy_score
print("Accuracy = ", accuracy_score(test_labels, predictions))
# confusion matix
pred = predictions.toarray()  # densify for the per-label confusion matrices
import sklearn.metrics as skm
cm = skm.multilabel_confusion_matrix(test_labels, pred)
print(cm)
print(skm.classification_report(test_labels, pred))
# Bug fix: sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection has been the home of cross_val_predict since 0.18.
from sklearn.model_selection import cross_val_predict

data = pd.read_csv("your_csv_file.csv", dtype={'sentence': np.str_})
y = data[['isA', 'isB', 'isC', 'isD', 'isE']]
to_drop = ['id', 'isA', 'isB', 'isC', 'isD', 'isE']
X = data.drop(to_drop, axis=1)
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X['sentence'].values)
X_train, X_test, y_train, y_test = train_test_split(counts, y, test_size=0.33)
clf = BinaryRelevance(RandomForestClassifier())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Print individual label-wise F1, precision and recall scores.
# Bug fix: classification_report expects (y_true, y_pred); the original
# passed them reversed, silently swapping precision and recall. Also pass
# the previously-unused target_names so labels are named in the report.
target_names = ['A', 'B', 'C', 'D', 'E']
print(classification_report(y_test, y_pred, target_names=target_names))
# Extract the values of one label (column index 1), repeat for other labels.
# NOTE(review): the original comment said "label A" but column 1 is the
# second label ('B' here) — confirm the intended column.
extracted_pred = y_pred[:, [1]]
z = scipy.sparse.csr_matrix(y_test)
extracted_true = z[:, [1]]
# Bug fix: the original assigned the y_pred slice to `actual` and the
# y_test slice to `predicted`; accuracy_score is symmetric so the number
# was unchanged, but the names were reversed and misleading.
predicted = extracted_pred.toarray().flatten()
actual = extracted_true.toarray().flatten()
print(accuracy_score(actual, predicted))