class BinaryRelevancesSimple:
    """Binary-relevance multi-label model with a selectable base estimator.

    One independent binary classifier is trained per label column by
    wrapping the base estimator in ``BinaryRelevance``.
    """

    def __init__(self, model):
        """Build the wrapped model.

        Args:
            model: ``'RF'`` selects a random forest base estimator; any other
                value selects the default ``LGBMClassifier``.
        """
        # Build only the estimator that is actually used instead of always
        # constructing the LightGBM wrapper first and then discarding it.
        if model == 'RF':
            base_estimator = RandomForestClassifier(n_estimators=200, max_depth=12)
        else:
            base_estimator = LGBMClassifier()
        self.model = BinaryRelevance(base_estimator)

    def fit(self, X_train, y_train):
        """Fit the wrapped model and print the elapsed wall-clock time."""
        print('###start trainging...')
        start = time.time()
        self.model.fit(X_train, y_train)
        print('####training time:', time.time() - start)

    def predict_proba(self, X_test):
        """Return the wrapped model's per-label probabilities as a dense array.

        ``toarray()`` is used instead of the ``.A`` shorthand, which has been
        deprecated and removed from SciPy sparse containers.
        """
        return self.model.predict_proba(X_test).toarray()
def RecommendByBinaryRelevance(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Recommend labels using binary relevance for the multi-label problem.

    A ``BinaryRelevance`` wrapper around a random forest is fitted on the
    training data, per-label probabilities are predicted for ``test_data``,
    and ``DataProcessUtils.getListFromProbable`` turns them into a
    recommendation list (presumably the top ``recommendNum`` labels per
    sample -- verify against that helper). Labels are numbered from 1 to the
    number of columns in ``train_data_y``.

    Returns:
        A two-element list ``[recommendList, answerList]`` where
        ``answerList`` is ``test_data_y`` unchanged.
    """
    base_forest = RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20)
    classifier = BinaryRelevance(base_forest)
    classifier.fit(train_data, train_data_y)

    # predict_proba yields a sparse result; densify it to a plain ndarray.
    sparse_scores = classifier.predict_proba(test_data)
    predictions = sparse_scores.todense().getA()

    label_count = train_data_y.shape[1]
    candidate_labels = range(1, label_count + 1)
    recommendList = DataProcessUtils.getListFromProbable(predictions, candidate_labels, recommendNum)
    answerList = test_data_y

    # Debug output, in the same order as before.
    for item in (predictions, test_data_y, recommendList, answerList):
        print(item)

    return [recommendList, answerList]
class MyBinaryRelevanceFeatureSelect():
    """Binary-relevance classifier using a probabilistic SVC per label.

    After ``fit`` the per-label estimators are exposed as
    ``self.classifiers`` and the underlying wrapper as
    ``self.BinaryRelevanceObject``.
    """

    def fit(self, X, y):
        """Fit one SVC per label and return the fitted wrapper's ``fit`` result.

        The model is fitted exactly once. Fitting twice doubled the training
        cost and left ``self.classifiers`` pointing at the estimators of the
        first, discarded fit rather than the ones used for prediction.
        """
        # Base classifier is an SVC with probability estimates enabled;
        # both X and y are passed to it in dense form.
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])

        fitted = self.BinaryRelevanceObject.fit(X, y)

        # The classifiers for each label, taken from the fit that is kept.
        self.classifiers = self.BinaryRelevanceObject.classifiers_
        return fitted

    def predict(self, X, y=None):
        """Return label predictions for ``X``; ``y`` is accepted but ignored."""
        return self.BinaryRelevanceObject.predict(X)

    def predict_proba(self, X):
        """Return per-label probability estimates for ``X``."""
        return self.BinaryRelevanceObject.predict_proba(X)
# Convert the held-out features (without the 'user' column) and the targets
# to CSR sparse matrices.
X_test = sc.sparse.csr_matrix(X_test.drop('user', axis=1).values)
t_test = sc.sparse.csr_matrix(t_test.values)

# Scaled dense copies of the train/test features. Original note: scaling does
# not work well for many methods because it offsets the similarities.
# NOTE(review): train and test are scaled independently of each other here.
X_train_scale = scale(X_train.toarray(
))
X_test_scale = scale(X_test.toarray())

# Sparse versions of the full feature table (without 'user') and full targets.
X_sparse = sc.sparse.csr_matrix(X.drop('user', axis=1).values)
t_sparse = sc.sparse.csr_matrix(t.values)

# First, test the problem transformations with a simple naive Bayes
# classifier; the rough conclusion was that binary relevance (BR) suits best.
# Intuitively the hotels should not be correlated based on user ID, because
# of its randomness.
# NOTE(review): the metric calls below are bare expressions, so their values
# are only visible when run interactively; the trailing comments record the
# values the author observed.

# --- Binary relevance ---
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # recorded result: 0
mean_squared_error(t_test.toarray(),
                   probabilities.toarray())  # recorded result: 0.063299324514418692

# --- Classifier chain ---
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # recorded result: 0
mean_squared_error(t_test.toarray(),
                   probabilities.toarray())  # recorded result: 0.084135897849476421

# --- Label powerset ---
classifier = LabelPowerset(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
# * If there are n number of different labels it will create n datasets and train for each label and will result the union of all predicted labels. # * Here the correlation b/w the labels is not taken into account # In[65]: classifier = BinaryRelevance(LogisticRegression()) # In[66]: classifier.fit(x_train, y_train) print('Accuracy_score using BinaryRelevance is ', round(accuracy_score(y_test, classifier.predict(x_test)) * 100, 1), '%') print('-------------------------------------------------') print('roc_auc_score using BinaryRelevance is ', roc_auc_score(y_test, classifier.predict_proba(x_test).toarray())) # # Label Powerset # * Label Powerset creates a unique class for every possible label combination that is present in the training set, this way it makes use of label correlation # * Only problem with this method is as the no of classes increases its computational complexity also increases. # In[67]: log_classifier = LabelPowerset(LogisticRegression()) # In[68]: log_classifier.fit(x_train, y_train) print('Accuracy_score using LabelPowerset is ', round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1), '%')
def predict():
    """Classify the submitted comment and render ``result.html``.

    On POST, the ``message`` form field is vectorized with a TF-IDF model and
    classified by a binary-relevance multinomial naive Bayes model; the
    rendered ``output`` lists the labels predicted as 1, or a thank-you
    message when none is. Any other request method renders the template with
    a short notice, instead of failing because ``output`` was never assigned.

    Changes that do not affect the prediction: ``test.csv`` is no longer
    loaded and cleaned, the TF-IDF fit on the full training corpus and the
    hold-out predictions are gone (their results were never used), and the
    regexes and lemmatizer are created once rather than per call or per word.

    NOTE(review): the model is still retrained from ``train.csv`` on every
    POST; moving training to application start-up needs module-level state
    and is left to a follow-up change.
    """
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # Guard clause: only a POST carries the form field to classify.
    if request.method != "POST":
        return render_template("result.html", output="No comment was submitted.")

    # Patterns are compiled once instead of on every clean_text call.
    non_letters = re.compile(r"[^A-Za-z\s']")
    line_breaks = re.compile(r"[\n\r\t]")
    extra_space = re.compile(r'\s{2,}')

    def clean_text(text):
        """Lower-case, keep only letters/apostrophes, and normalize whitespace."""
        text = non_letters.sub(" ", text.lower())
        text = text.rstrip()
        text = line_breaks.sub(" ", text)
        return extra_space.sub(" ", text)

    eng_stopwords = set(stopwords.words('english'))
    excluded_contractions = {"i'm", "can't"}
    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        """Drop short words, stop words and two contractions; lemmatize the rest."""
        kept = (
            word for word in text.split()
            if len(word) > 2
            and word not in eng_stopwords
            and word not in excluded_contractions
        )
        return " ".join(lemmatizer.lemmatize(word) for word in kept)

    # Load and clean the training comments.
    train = pd.read_csv("train.csv")
    train["cleaned_comments"] = train["comment_text"].map(clean_text)
    train["final_cleaned_comments"] = train["cleaned_comments"].map(preprocess_text)
    train_target = train[label_names]

    # Same 70/30 split and seed as before, so the fitted vocabulary and model
    # are unchanged; the hold-out part is not used here.
    X_train_w, _, y_train_w, _ = train_test_split(
        train["final_cleaned_comments"], train_target, test_size=0.3, random_state=0)

    tfidfVectClean = TfidfVectorizer(
        min_df=100, strip_accents="unicode", stop_words="english", smooth_idf=True
    ).fit(X_train_w)
    nbModel_w = BinaryRelevance(naive_bayes.MultinomialNB())
    nbModel_w.fit(tfidfVectClean.transform(X_train_w), y_train_w)

    # Classify the submitted message.
    message = request.form["message"]
    vect = tfidfVectClean.transform([message]).toarray()
    class_prediction = nbModel_w.predict(vect)

    values = class_prediction.toarray()
    t_pred = dict(zip(label_names, values[0]))
    tox_labels = [label for label, flag in t_pred.items() if flag == 1]

    if not tox_labels:
        output = "Thank you for keeping your comment respectful to all"
    else:
        output = "Your comment has been flagged as: " + ', '.join(tox_labels) + "."

    return render_template("result.html", output=output)
# Train a binary-relevance model with multinomial naive Bayes per label.
binary_rel_clf = BinaryRelevance(MultinomialNB())
binary_rel_clf.fit(X_train, y_train)

# Predictions
X_test_n1 = np.array([X_test[0]])  # sample size of one (not used in the lines below)
print("X_test -----------------")
print(X_test)
print("y_test -----------------")
print(y_test)
print("------------------------")
# Predicted labels for the whole test set.
br_prediction = binary_rel_clf.predict(X_test)
print(br_prediction)
print("Pred Prob ------------------------")
# Predicted per-label probabilities for the whole test set.
br_prediction_prob = binary_rel_clf.predict_proba(X_test)
print(br_prediction_prob)

# Accuracy, printed as a percentage.
print("Accuracy Score: " + str(accuracy_score(y_test, br_prediction) * 100) + " %")

# Save the trained model with joblib.
import joblib
# binary_rel_clf_file = open("beep_boop_stonks.pkl","wb")
joblib.dump(binary_rel_clf, 'beep_boop_stonks.joblib')

# Load the trained model back from disk.
model = joblib.load('beep_boop_stonks.joblib')
def _first_single_class_label(y, folds):
    """Return the first label index whose training slice has < 2 classes, else None."""
    for train_index, _ in folds:
        for label in range(y.shape[1]):
            if len(np.unique(y[train_index, label])) < 2:
                return label
    return None


def evaluate_model_svm(x, y, learn_path, k=10, thresh=0.5):
    """Cross-validate a binary-relevance linear SVC and print averaged metrics.

    Random shuffled K-fold splits are drawn until every label column has at
    least two distinct values in every training part. The accepted folds are
    pickled to ``learn_path + 'topic_classifier-folds.pkl'``. For each fold a
    ``BinaryRelevance`` model around a probabilistic linear ``SVC`` is fitted,
    its probabilities are thresholded at ``thresh`` and scored; the
    per-column means of all collected scores are printed at the end.

    Args:
        x: Feature matrix, indexable by an array of row indices.
        y: 2-D label indicator array (rows are samples, columns are labels).
        learn_path: Prefix prepended to the name of the folds pickle file.
        k: Number of folds.
        thresh: A probability strictly above this value counts as a positive.

    NOTE(review): the fold search has no retry limit, so it never terminates
    if some label has a single class in the whole data set.
    NOTE(review): ``label_ranking_loss`` receives the thresholded predictions,
    not the probability scores -- confirm that this is intended.
    """
    print(len(y), len(y[0]))

    # Draw a k-fold split in which no training part has a single-class label.
    count = 0
    while True:
        count += 1
        kf = list(
            KFold(n_splits=k, shuffle=True,
                  random_state=randint(0, 100000)).split(x))
        bad_label = _first_single_class_label(y, kf)
        if bad_label is None:
            break
        print(bad_label)
    print('Found a good KF in', count, 'try!')

    with open(learn_path + 'topic_classifier-folds.pkl', 'wb') as out_file:
        pickle.dump(kf, out_file)

    stats = QuickDataFrame([
        'Jaccard (normalised)', 'Accuracy (normalised)', 'Accuracy',
        'F1_score (micro averaged)', 'F1_score (macro averaged by labels)',
        'F1_score (averaged by samples)', 'Hamming loss', 'Label Ranking loss:'
    ])

    prog = Progresser(k)
    for train_index, test_index in kf:
        print('___________________________________________________')
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        cls = SVC(kernel='linear', probability=True, tol=1e-5)
        topic_classifier = BinaryRelevance(classifier=cls, require_dense=[True, True])

        try:
            topic_classifier.fit(x_train, y_train)
        except Exception as e:
            # Best effort: report the failure and move on to the next fold.
            print('\nfit error!:', e)
            continue

        try:
            preds = topic_classifier.predict_proba(x_test)
            # Threshold every probability in one vectorized step instead of
            # indexing the (typically sparse) result element by element.
            scores = preds.toarray() if hasattr(preds, 'toarray') else np.asarray(preds)
            predictions = (scores > thresh).astype(float)

            s = [
                jaccard_similarity_score(y_test, predictions, normalize=True),
                accuracy_score(y_test, predictions, normalize=True),
                accuracy_score(y_test, predictions, normalize=False),
                f1_score(y_test, predictions, average='micro'),
                f1_score(y_test, predictions, average='macro'),
                f1_score(y_test, predictions, average='samples'),
                hamming_loss(y_test, predictions),
                label_ranking_loss(y_test, predictions)
            ]

            stats.append(s)
            print(stats[stats.length - 1])
        except Exception as e:
            print('Eval error!:', e)

        prog.count()

    for col in stats.cols:
        print(col, np.mean(stats[col]))