def classifier_chain(self):
    # initialize a Classifier Chains multi-label classifier
    # with a random forest base classifier
    print("build classifier...")
    classifier = ClassifierChain(RandomForestClassifier())
    # classifier = LabelPowerset(RandomForestClassifier())
    print("end...")

    print("start training...")
    classifier.fit(self.X_train, self.y_train)
    print("end...")

    # predict
    print("start test...")
    predictions = classifier.predict(self.X_test)
    print("end...")

    print("result as following:")
    result = hamming_loss(self.y_test, predictions)
    print("hamming_loss: ", result)
    print("accuracy score: ", accuracy_score(self.y_test, predictions))
    result = f1_score(self.y_test, predictions, average='micro')
    print("micro-f1_score: ", result)
def buildCCClassifier(xTrain, yTrain):
    # initialize a Classifier Chains multi-label classifier
    # with a Gaussian naive Bayes base classifier
    classifier = ClassifierChain(GaussianNB())
    # train
    classifier.fit(xTrain, yTrain)
    return classifier
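A minimal usage sketch for buildCCClassifier, assuming xTest and yTest are held-out feature/label matrices prepared the same way as the training data (hypothetical names):

from sklearn.metrics import hamming_loss

# Hypothetical held-out data; xTest and yTest are assumed to exist.
cc = buildCCClassifier(xTrain, yTrain)
predictions = cc.predict(xTest)          # sparse multi-label predictions
print("hamming loss:", hamming_loss(yTest, predictions))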
def check(request):
    vect = TfidfVectorizer(max_features=40000, stop_words='english')
    target = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    data = pd.read_csv('train.csv')
    test_data = pd.read_csv('D:/T.Y.BTECH/BML/Project/test.csv')
    X = data.comment_text
    test_X = test_data.comment_text
    xt = vect.fit_transform(X)
    yt = vect.transform(test_X)
    y_trans = data.iloc[:, 2:8]
    X_train, X_test, y_train, y_test = train_test_split(xt, y_trans, test_size=0.3)

    input_comment = ''
    output_class = None
    toxic = None
    severe_toxic = None
    obscene = None
    threat = None
    insult = None
    identity_hate = None

    posts = Post.objects.all()
    for post in posts:
        cmnt = post
        input_comment1 = str(cmnt)
        input_comment1 = [input_comment1]
        input_comment1 = vect.transform(input_comment1)

        from skmultilearn.problem_transform import ClassifierChain
        classifier = ClassifierChain(LogisticRegression(), require_dense=[False, True])
        classifier.fit(X_train, y_train)
        output_class = classifier.predict_proba(input_comment1).toarray()
        # load_model = joblib.load('knn.pkl')
        # load_model = joblib.load('lr.pkl')
        # output_class = load_model.predict_proba(input_comment1).toarray()
        # output_class = output_class.tolist()
        output_class = list(chain.from_iterable(output_class))
        toxic = output_class[0]
        severe_toxic = output_class[1]
        obscene = output_class[2]
        threat = output_class[3]
        insult = output_class[4]
        identity_hate = output_class[5]
        print(output_class)

    context = dict()
    context['input_comment'] = input_comment
    context['output_class1'] = toxic
    context['output_class2'] = severe_toxic
    context['output_class3'] = obscene
    context['output_class4'] = threat
    context['output_class5'] = insult
    context['output_class6'] = identity_hate
    return render(request, 'polls/comment_details.html', context)
def classify(self):
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.svm import SVC, LinearSVC
    import sklearn.metrics as metrics

    # =============================
    # ClassifierChain
    # =============================
    from sklearn.multiclass import OneVsRestClassifier
    # from sklearn.multioutput import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    # cc = ClassifierChain(LogisticRegression())
    self.cc = ClassifierChain(LinearSVC())
    self.cc.fit(self.train_data, self.train_labels)
    # y_pred = self.cc.predict(self.test_data)
    # cc_art_f1 = metrics.f1_score(self.test_labels, y_pred, average='micro')

    # # initialize Classifier Chain multi-label classifier
    # # with an SVM classifier
    # # SVM in scikit only supports the X matrix in sparse representation
    # classifier = ClassifierChain(
    #     classifier=SVC(),
    #     require_dense=[False, True]
    # )
    # # train
    # classifier.fit(self.train_data, self.train_labels)
    # # predict
    # predictions = classifier.predict(self.test_data)
    # print(predictions)
    # art_f1 = metrics.f1_score(self.test_labels, predictions, average='macro')
    # return art_f1

    # =============================
    # KNeighborsClassifier
    # =============================
    from sklearn.neighbors import KNeighborsClassifier
    knc = KNeighborsClassifier()
    knc.fit(self.train_data, self.train_labels)
    # Y_pred = knc.predict(self.test_data)
    # knc_art_f1 = metrics.f1_score(self.test_labels, Y_pred, average='micro')

    # =============================
    # SGDClassifier
    # =============================
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                        random_state=0, max_iter=6, tol=None)
    clf = OneVsRestClassifier(sgd)
    clf.fit(self.train_data, self.train_labels)
def test_if_order_is_set(self):
    classifier = ClassifierChain(
        classifier=GaussianNB(),
        require_dense=[True, True],
        order=None
    )
    X, y = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')[0]
    classifier.fit(X, y)
    self.assertEqual(classifier._order(), list(range(y.shape[1])))
def train(self):
    classifier = ClassifierChain(LogisticRegression())
    classifier.fit(self.x_data, self.y_data)
    predictions = classifier.predict(self.x_test)
    return {
        'accuracy': accuracy_score(self.y_test, predictions),
        'f1_score': f1_score(self.y_test, predictions, average='micro')
    }
def test_if_order_is_set_when_explicitly_given(self):
    X, y = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')[0]
    reversed_chain = list(reversed(range(y.shape[1])))
    classifier = ClassifierChain(
        classifier=GaussianNB(),
        require_dense=[True, True],
        order=reversed_chain
    )
    classifier.fit(X, y)
    self.assertEqual(classifier._order(), reversed_chain)
class ClassifierChains:
    def __init__(self):
        self.model = ClassifierChain(LGBMClassifier())

    def set_grow_step(self, new_step):
        self.grow_boost_round = new_step

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        # .A converts the sparse prediction matrix to a dense ndarray
        return self.model.predict(X_test).A
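A brief usage sketch for ClassifierChains, assuming X_train, y_train, and X_test are prepared multi-label matrices (hypothetical names):

# Hypothetical usage; X_train, y_train, X_test are assumed to exist.
chains = ClassifierChains()
chains.fit(X_train, y_train)
dense_predictions = chains.predict(X_test)   # dense ndarray of 0/1 labels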
def classifiers(X_train, Y_train, X_test):
    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())
    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)
    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)
    return predictions1, predictions2, predictions3
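A sketch of how the three prediction sets returned by classifiers might be compared, assuming Y_test holds the true label matrix for X_test (hypothetical name):

from sklearn.metrics import hamming_loss

# Hypothetical comparison of the three problem-transformation strategies.
p_br, p_cc, p_lp = classifiers(X_train, Y_train, X_test)
for name, preds in [("binary relevance", p_br),
                    ("classifier chain", p_cc),
                    ("label powerset", p_lp)]:
    print(name, hamming_loss(Y_test, preds))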
def majority_voting_multilabel_classification(train_filename, dev_filename, test_filename, attribute):
    df_train = pd.read_csv(train_filename)
    df_dev = pd.read_csv(dev_filename)
    df_test = pd.read_csv(test_filename)

    mlb = MultiLabelBinarizer()
    X_train = df_train.tweet.apply(clean_text)
    y_train_text = df_train[attribute].apply(lambda x: x.split('_'))
    y_train = mlb.fit_transform(y_train_text)

    X_dev = df_dev.tweet.apply(clean_text)
    y_dev_text = df_dev[attribute].apply(lambda x: x.split('_'))
    y_dev = mlb.transform(y_dev_text)  # reuse the binarizer fitted on the training labels

    X_test = df_test.tweet.apply(clean_text)
    y_test_text = df_test[attribute].apply(lambda x: x.split('_'))
    y_test = mlb.transform(y_test_text)

    # NOTE: the counts/tfidf below are unused; the Pipeline refits its own vectorizer
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    Y = mlb.fit_transform(y_train_text)

    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', ClassifierChain(DummyClassifier()))])
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print('Accuracy %s' % accuracy_score(y_test, y_pred))
    print('Test macro F1 score is %s' % f1_score(y_test, y_pred, average='macro'))
    print('Test micro F1 score is %s' % f1_score(y_test, y_pred, average='micro'))
def train(classifier, X_train, X_test, y_train, y_test, strategy):
    """Computes a multi-label classification.

    This approach is used by the `one-vs-the-rest`, `classifier-chains`,
    and `label-powerset` strategies. For each classifier, the classes are
    fitted at the same time or in sequence. Since all the classes are
    represented by one and only one classifier, it is possible to gain
    knowledge about the classes by inspecting this unique classifier.

    Args:
        classifier: An instance of a scikit-learn classifier.
        X_train: A matrix containing features for training.
        X_test: A matrix containing features for testing.
        y_train: A dataframe containing labels for training.
        y_test: A dataframe containing labels for testing.
        strategy: A string defining which of the three strategies will be used.

    Returns:
        A classification model and its performance report.
    """
    if strategy == 'one-vs-the-rest':
        model = OneVsRestClassifier(classifier)
    elif strategy == 'classifier-chains':
        model = ClassifierChain(classifier)
    elif strategy == 'label-powerset':
        model = LabelPowerset(classifier)
    else:
        raise ValueError("Unknown strategy: %s" % strategy)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True,
                                   target_names=y_train.columns)
    return model, report
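A short usage sketch for train, assuming LogisticRegression as the base classifier and pre-split data where y_train and y_test are label DataFrames (hypothetical variable names):

from sklearn.linear_model import LogisticRegression

# Hypothetical call; X_train, X_test, y_train, y_test are assumed to exist.
model, report = train(LogisticRegression(), X_train, X_test, y_train, y_test,
                      strategy='classifier-chains')
print(report['micro avg'])   # micro-averaged precision/recall/f1 from the report dict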
def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Classifier chain recommender."""
    classifier = ClassifierChain(RandomForestClassifier(oob_score=True, max_depth=10,
                                                        min_samples_split=20))
    classifier.fit(train_data, train_data_y)
    predictions = classifier.predict_proba(test_data)
    predictions = predictions.todense().getA()
    recommendList = DataProcessUtils.getListFromProbable(predictions,
                                                         range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(predictions)
    print(test_data_y)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def Classifier_Chain(ytrain, yvalid, ytest, base_model):
    """
    Fits a Classifier Chain model with the given base classifier (e.g. LinearSVC),
    specifying either themes or subthemes for Y.
    Appends a row of results to results_dict with train, validation, and test accuracy,
    and recall, precision, and f1 scores for the validation and test data.
    """
    classifier_chain = ClassifierChain(base_model)
    model = classifier_chain.fit(X_train, ytrain)
    train = model.score(X_train, np.array(ytrain))
    valid = model.score(X_valid, np.array(yvalid))
    test = model.score(X_test, np.array(ytest))

    # validation scores
    predictions = model.predict(X_valid)
    recall = recall_score(np.array(yvalid), predictions, average='micro')
    precision = precision_score(np.array(yvalid), predictions, average='micro')
    f1 = f1_score(np.array(yvalid), predictions, average='micro')

    # test scores
    predictions_test = model.predict(X_test)
    recall_test = recall_score(np.array(ytest), predictions_test, average='micro')
    precision_test = precision_score(np.array(ytest), predictions_test, average='micro')
    f1_test = f1_score(np.array(ytest), predictions_test, average='micro')

    # all rounded to 3 decimal places
    case = {
        'Model': "TF-IDF + LinearSVC",
        'Train Accuracy': round(train, 3),
        'Validation Accuracy': round(valid, 3),
        'Test Accuracy': round(test, 3),
        'Valid Recall': round(recall, 3),
        'Valid Precision': round(precision, 3),
        'Valid F1': round(f1, 3),
        'Test Recall': round(recall_test, 3),
        'Test Precision': round(precision_test, 3),
        'Test F1': round(f1_test, 3)
    }
    results_dict.append(case)
def fit(self, X, y):
    """
    Fit the model to the data and train the classifier.

    Note: You should use the zodiac.classifier.cleaner on all the texts
    before you fit the data.

    :param X: (list) list of clean text (you can use zodiac.cleaner.TextCleaner)
    :param y: (numpy.array) array of labels
    """
    self.x_vec_ = self.vectorizer_.fit_transform(X)
    # initialize a classifier chains multi-label classifier
    self.classifier_ = ClassifierChain(SVC(probability=True))
    # train the SVC-based chain on the vectorized training data
    self.classifier_.fit(self.x_vec_, y)
def ClassifierChain_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> classifier chain method.
    :param X_train: input data
    :param y_train: corresponding label data
    :return: the fitted classifier, or None on failure
    """
    try:
        classifier = ClassifierChain(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        classifier.fit(X_train, y_train)
        return classifier
    except Exception as e:
        print("warning----classifier chain|ClassifierChain_method----" + str(e))
        return None
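A usage sketch for ClassifierChain_method, assuming X_train and y_train are prepared matrices and X_test is held-out data (hypothetical names and leaf/split values):

# Hypothetical call with a decision tree constrained by leaf/split sizes.
clf = ClassifierChain_method(X_train, y_train, samples_leaf=5, samples_split=10)
if clf is not None:
    predictions = clf.predict(X_test)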
def build_MajorityVoting(X_train, y_train, X_test, y_test):
    classifier = MajorityVotingClassifier(
        clusterer=FixedLabelSpaceClusterer(
            clusters=[[1, 2, 3], [0, 2, 5], [4, 5]]),
        classifier=ClassifierChain(classifier=GaussianNB()))
    classifier.fit(X_train, y_train)
    prediction = classifier.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))
def __init__(
        self,
        rdm_state=84,
        params={"classifier__C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
        niterations=5):
    self.model = ClassifierChain(
        LogisticRegression(random_state=rdm_state))
    self.params = params
    self.niterations = niterations
def train_model(X, y, strategy):
    X = np.array(X)
    y = np.array(y)
    clf = lightgbm.sklearn.LGBMClassifier(max_depth=9, num_leaves=500,
                                          n_estimators=50, n_jobs=-1)  # 0.8
    print(clf)
    if strategy == 'ovr':
        # OneVsRest strategy, also known as the BinaryRelevance strategy
        ovr = OneVsRestClassifier(clf)
        ovr.fit(X, y)
        save_model(ovr, "model/flow/ovr")
        return ovr
    elif strategy == 'classifier_chains':
        cc = ClassifierChain(clf)
        cc.fit(X, y)
        save_model(cc, "model/flow/cc")
        return cc
    else:
        raise Exception("Correct strategies: ovr or classifier_chains")
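A minimal usage sketch for train_model, assuming X is a list of feature rows, y a 0/1 label matrix, and X_new a hypothetical held-out feature matrix:

# Hypothetical call; returns the fitted chain after saving it under model/flow/cc.
cc_model = train_model(X, y, strategy='classifier_chains')
new_predictions = cc_model.predict(X_new)   # X_new: assumed held-out feature matrix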
def __init__(self):
    self.total_data_df = pd.read_csv(os.path.join("data", "cleaned_data.csv"),
                                     encoding="ISO-8859-1")
    self.data_df = self.total_data_df[~self.total_data_df.Tags.isnull()]
    self.total_records = len(self.data_df.index)
    self.train_df = self.data_df.tail(int(self.total_records * .67))
    self.test_df = self.data_df.head(int(self.total_records * .23))
    self.total_tag_list = self.get_tag_list()
    self.total_word_list = self.get_word_list()
    self.modified_train_df = pd.DataFrame()
    self.modified_test_df = pd.DataFrame()
    self.classifier = BernoulliNB()
    self.classifier_multilabel = ClassifierChain(BernoulliNB())
    self.classifier_dt = DecisionTreeRegressor(max_depth=2000)
    self.classifier_random_forest = RandomForestRegressor(max_depth=100)
    self.classifier_svm = svm.SVC(kernel='linear')
    self.test_tags = pd.DataFrame()
def main():
    print("Welcome to SVM text classifier. Please choose a dataset: \n"
          "Press 'b' for BBC news dataset\n"
          "Press 'r' for Reuters-21578 dataset\n"
          "Press 'g' for 20 News group\n"
          "Press 'q' for exit\n \n"
          "Enter your decision: ")
    model = input()
    if model == 'b':
        train_X, train_Y, test_X, test_Y = bbc()
    elif model == 'r':
        train_X, train_Y, test_X, test_Y = reut()
    elif model == 'g':
        train_X, train_Y, test_X, test_Y = tng()
    elif model == 'q':
        print("Program is closing...")
        sys.exit(0)
    else:
        print("Please choose one of described options")
        return

    # OVO
    print("\n--------------\nOVO")
    if (model == 'b') or (model == 'g'):
        classifier = OneVsOneClassifier(LinearSVC(random_state=42))
        classifier.fit(train_X, train_Y)
        predictions_SVM = classifier.predict(test_X)
        evaluate(test_Y, predictions_SVM)
        print_confm(test_Y, predictions_SVM, model)

        # OVA
        print("\n--------------\nOVA")
        classifier = OneVsRestClassifier(LinearSVC(random_state=42))
        classifier.fit(train_X, train_Y)
        predictions_SVM = classifier.predict(test_X)
        evaluate(test_Y, predictions_SVM)
        print_confm(test_Y, predictions_SVM, model)

    if model == 'r':
        # OVA (one-vs-all via a classifier chain over LinearSVC)
        print("\n--------------\nOVA")
        # classifier = OneVsRestClassifier(LinearSVC(random_state=42))
        classifier = ClassifierChain(classifier=LinearSVC(),
                                     require_dense=[False, True])
        classifier.fit(train_X, train_Y)
        predictions_SVM = classifier.predict(test_X)
        evaluate(test_Y, predictions_SVM)
        print_confm(test_Y, predictions_SVM, model)
def randomForestClassifierChain():
    print("Random forest classifier chain")
    start = time.time()
    classifier = ClassifierChain(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForestClassifierChain"
    # classifier.fit(train_x, train_y)
    # save
    # pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def knnClassifierChain():
    print("knn classifier chain")
    start = time.time()
    classifier = ClassifierChain(KNeighborsClassifier())
    filename = "knnChain"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def __init__(
        self,
        random_state=84,
        params={
            'classifier__C': [1, 10, 100, 1000],
            'classifier__gamma': [0.001, 0.0001],
            'classifier__kernel': ['rbf', 'linear']
        },
        niterations=10):
    self.model = ClassifierChain(SVC(random_state=random_state))
    self.params = params
    self.niterations = niterations
def gaussianNaiveBayes():
    print("Gaussian naive bayes")
    start = time.time()
    classifier = ClassifierChain(GaussianNB())
    filename = "gaussianNaiveBayes"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def supportVectorMachineChain():
    print("Support vector machine")
    start = time.time()
    classifier = ClassifierChain(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def train_model(X, y, strategy):
    X = np.array(X)
    y = np.array(y)
    # clf = SVC(C=1, kernel='rbf', probability=True, gamma='scale')  # svc without class_weight
    # clf = SVC(C=10, kernel='rbf', class_weight='balanced', probability=True, gamma='scale')  # svc with class_weight
    clf = XGBClassifier(subsample=0.8, colsample_bytree=0.8)
    # clf = XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=5,
    #                     min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
    #                     objective='binary:logistic', nthread=4, scale_pos_weight=1)
    print(clf)
    if strategy == 'ovr':
        # OneVsRest strategy, also known as the BinaryRelevance strategy
        ovr = OneVsRestClassifier(clf)
        ovr.fit(X, y)
        save_model(ovr, "model/ovr")
        return ovr
    elif strategy == 'classifier_chains':
        cc = ClassifierChain(clf)
        cc.fit(X, y)
        save_model(cc, "model/cc")
        return cc
    else:
        raise Exception("Correct strategies: ovr or classifier_chains")
def __init__(
        self,
        random_state=84,
        n_estimators=20,
        params={
            'classifier__n_estimators': [250, 500, 1000, 1500],
            'classifier__min_samples_split': [2, 4, 8]
        },
        niterations=10):
    self.model = ClassifierChain(
        ExtraTreesClassifier(random_state=random_state,
                             n_estimators=n_estimators))
    self.params = params
    self.niterations = niterations
def build_Mklnn(X_train, y_train):
    parameters = {
        'classifier': [LabelPowerset(), ClassifierChain()],
        'classifier__classifier': [RandomForestClassifier()],
        'classifier__classifier__n_estimators': [10, 20, 50],
    }
    clf = GridSearchCV(LabelSpacePartitioningClassifier(), parameters, scoring='f1_macro')
    clf.fit(X_train, y_train)
    print(clf.best_params_, clf.best_score_)
def ClassifierChain():
    # Train-Test Split =======================================================
    print("setting up a classifier chain model...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    train_text = train['Book_Text']
    test_text = test['Book_Text']

    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                 ngram_range=(1, 3), norm='l2')
    # fit the vectorizer on the training text only, to avoid leaking test data
    vectorizer.fit(train_text)
    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels=['Book_Text'], axis=1)
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels=['Book_Text'], axis=1)

    # using classifier chains
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())
    # training logistic regression chain on train data
    classifier.fit(x_train, y_train)
    # predict
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ", accuracy_score(y_test, predictions))
    print("\n")
def __init__(
        self,
        random_state=84,
        n_estimators=20,
        params={
            "classifier__max_depth": [3, None],
            "classifier__max_features": [1, 3, 10],
            "classifier__min_samples_leaf": [1, 3, 10]
        },
        niterations=10):
    self.model = ClassifierChain(
        GradientBoostingClassifier(random_state=random_state,
                                   n_estimators=n_estimators))
    self.params = params
    self.niterations = niterations