def check(request):
    """Score the most recent ``Post`` for toxicity and render the details page.

    When at least one post exists, the view loads ``train.csv``, fits a TF-IDF
    vectorizer on its ``comment_text`` column, trains a classifier chain of
    logistic regressions on a 70% split of columns 2..7 (the six toxicity
    labels), and predicts the label probabilities for the last object yielded
    by ``Post.objects.all()``.

    :param request: the incoming Django request.
    :return: the rendered ``polls/comment_details.html`` response. The
        ``output_class1`` .. ``output_class6`` context entries hold the
        probabilities for toxic, severe_toxic, obscene, threat, insult and
        identity_hate (in that order), or ``None`` when there is no post.
    """
    # NOTE(review): 'input_comment' is always rendered as an empty string,
    # exactly as before; confirm whether the template should show the post.
    input_comment = ''
    toxic = severe_toxic = obscene = threat = insult = identity_hate = None

    # Only the last post in the queryset's own order is scored.
    latest_post = None
    for post in Post.objects.all():
        latest_post = post

    # Guard: without a post there is nothing to score, so skip the expensive
    # training step instead of failing on an unbound comment.
    if latest_post is not None:
        from skmultilearn.problem_transform import ClassifierChain

        data = pd.read_csv('train.csv')
        vect = TfidfVectorizer(max_features=40000, stop_words='english')
        features = vect.fit_transform(data.comment_text)
        labels = data.iloc[:, 2:8]
        X_train, _, y_train, _ = train_test_split(features,
                                                  labels,
                                                  test_size=0.3)

        classifier = ClassifierChain(LogisticRegression(),
                                     require_dense=[False, True])
        classifier.fit(X_train, y_train)

        comment_vector = vect.transform([str(latest_post)])
        output_class = classifier.predict_proba(comment_vector).toarray()
        # Flatten the single-row probability matrix into a plain list.
        output_class = list(chain.from_iterable(output_class))
        (toxic, severe_toxic, obscene,
         threat, insult, identity_hate) = output_class[:6]
        print(output_class)

    context = {
        'input_comment': input_comment,
        'output_class1': toxic,
        'output_class2': severe_toxic,
        'output_class3': obscene,
        'output_class4': threat,
        'output_class5': insult,
        'output_class6': identity_hate,
    }
    return render(request, 'polls/comment_details.html', context)
def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Recommend labels using a classifier chain of random forests.

    A ``ClassifierChain`` wrapping a ``RandomForestClassifier`` is fitted on
    the training data, the label probabilities of ``test_data`` are converted
    to a dense array, and ``DataProcessUtils.getListFromProbable`` turns them
    into a recommendation list (presumably the ``recommendNum`` most probable
    labels per row; confirm against that helper).

    :param train_data: training feature matrix.
    :param train_data_y: training label matrix; its column count defines the
        candidate label ids ``1 .. n_labels``.
    :param test_data: feature matrix to predict on.
    :param test_data_y: ground-truth labels, returned unchanged as the answers.
    :param recommendNum: forwarded to ``getListFromProbable``.
    :return: ``[recommendList, answerList]``.
    """
    base_forest = RandomForestClassifier(oob_score=True,
                                         max_depth=10,
                                         min_samples_split=20)
    chain_model = ClassifierChain(base_forest)
    chain_model.fit(train_data, train_data_y)

    # Sparse probabilities -> dense matrix -> plain ndarray.
    probabilities = chain_model.predict_proba(test_data).todense().getA()

    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(
        probabilities, label_ids, recommendNum)
    answerList = test_data_y

    # Debug output, in the same order as before.
    for item in (probabilities, test_data_y, recommendList, answerList):
        print(item)

    return [recommendList, answerList]
# Strip "<digits>" markers from the first column of each frame.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the markers in place; the raw string
# avoids the invalid "\d" escape in a normal string literal.
train_data = train_data.iloc[:, 0].str.replace(r'<\d+>', '', regex=True)
test_data = test_data.iloc[:, 0].str.replace(r'<\d+>', '', regex=True)

# count the frequency of every word in vocabulary in each document
vectorizer = CountVectorizer()
train_data_vector = vectorizer.fit_transform(train_data)
test_data_vector = vectorizer.transform(test_data)

# train the classifier
model = ClassifierChain(RandomForestClassifier(n_jobs=-1, verbose=1))
model.fit(train_data_vector, train_labels)

# test the classifier
predicted_labels = model.predict(test_data_vector)
predicted_labels_train = model.predict(train_data_vector)
predicted_probabilities = model.predict_proba(test_data_vector)
# The ranking metrics below need a dense array; convert once and reuse it.
predicted_probabilities_dense = predicted_probabilities.toarray()

# test accuracy
# ~7% with random forest and binary relevance
# ~7% with random forest and classifier chain
# ~5% with random forest and label powerset
# ~4% with multilabel knn
test_acc = accuracy_score(test_labels, predicted_labels)
train_acc = accuracy_score(train_labels, predicted_labels_train)
test_hamm_loss = hamming_loss(test_labels, predicted_labels)
test_cov_err = coverage_error(test_labels, predicted_probabilities_dense)
test_rank_loss = label_ranking_loss(test_labels, predicted_probabilities_dense)
test_avr_prec = label_ranking_average_precision_score(
    test_labels, predicted_probabilities_dense)
# In[68]:

# Fit the LabelPowerset model, then report exact-match accuracy and ROC AUC
# on the held-out split.
log_classifier.fit(x_train, y_train)
lp_accuracy_pct = round(
    accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1)
print('Accuracy_score using LabelPowerset is ', lp_accuracy_pct, '%')
print('-------------------------------------------------')
lp_probabilities = log_classifier.predict_proba(x_test).toarray()
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test, lp_probabilities))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new classifier uses the predictions of all previous classifiers
# * This way the correlation between labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

# Same report as above, for the classifier chain.
chain.fit(x_train, y_train)
cc_accuracy_pct = round(
    accuracy_score(y_test, chain.predict(x_test)) * 100, 1)
print('Accuracy_score using ClassifierChain is ', cc_accuracy_pct, '%')
print('-------------------------------------------------')
cc_probabilities = chain.predict_proba(x_test).toarray()
print('roc_auc_score using ClassifierChain is ',
      roc_auc_score(y_test, cc_probabilities))
class ArticleClassifier(ClassifierMixin):
    """Multi-label text classifier: TF-IDF features fed to a classifier chain of SVCs."""

    def __init__(self, ngram=(1, 3), tokenizer=prepareText, max_feature=20000):
        """
        This classifier is a multi-label classifier.
        It has been trained on the octo-articles dataset.
        You can train it using the fit function

        :parameter
        ----------
        :param ngram {tuple}: default '(1,3)' ngram_range for the TfidfVectorizer
        :param tokenizer {func}: tokenizer used by the TfidfVectorizer to prepare the data
        :param max_feature {int}: passed to the vectorizer as ``max_features``
            to cap the vocabulary size
        """
        self.vectorizer_ = TfidfVectorizer(strip_accents='unicode',
                                           analyzer='word',
                                           ngram_range=ngram,
                                           norm='l2',
                                           tokenizer=tokenizer,
                                           max_features=max_feature)

    def fit(self, X, y):
        """
        Fit the vectorizer and train the classifier on the data.
        Note: You should use the zodiac.classifier.cleaner on all the texts before you fit the data

        :parameter
        ----------
        :param X: (list) list of clean text (you can use zodiac.cleaner.TextCleaner)
        :param y: (numpy.array) array of labels
        :return: self, so calls can be chained
        """
        self.x_vec_ = self.vectorizer_.fit_transform(X)

        # Classifier-chain multi-label model over SVCs; probability=True is
        # needed because score/show_stats threshold predict_proba output.
        self.classifier_ = ClassifierChain(SVC(probability=True))
        self.classifier_.fit(self.x_vec_, y)
        return self

    def score(self, X, y, average='samples', threshold=0.5):
        """
        Compute the jaccard score using the given parameters

        :parameter
        -----------
        :param X (list): list of text
        :param y (list): texts labels
        :param average: default 'samples'. Forwarded to ``jaccard_score``.
        :param threshold: default 0.5. A label is predicted when its
            probability is greater than or equal to this value.

        :return:
        -------
        score : float
            jaccard score
        """
        self.x_test_vec_ = self.vectorizer_.transform(X)
        predictions = self.classifier_.predict_proba(self.x_test_vec_)
        return jaccard_score(y, predictions >= threshold, average=average)

    def show_stats(self, x_test, y):
        """
        Compute the jaccard score for different thresholds and build a plotly
        scatter plot of the scores.

        The scores are also stored in ``self.jaccard_scores_threshold_df_``.

        :parameter
        ----------
        :param x_test: (list) text list
        :param y: list of label
        :return: the plotly figure (call ``.show()`` on it outside a notebook)
        """
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

        x_test_vec = self.vectorizer_.transform(x_test)
        predictions_probas = self.classifier_.predict_proba(x_test_vec)

        jaccard_scores = [
            jaccard_score(y, predictions_probas >= threshold, average='samples')
            for threshold in thresholds
        ]

        self.jaccard_scores_threshold_df_ = pd.DataFrame({
            'threshold': thresholds,
            'jaccard_score': jaccard_scores
        })

        # The plot is built here, right after the data frame it reads, and
        # returned so the caller can actually display it.
        return px.scatter(self.jaccard_scores_threshold_df_,
                          x='threshold',
                          y='jaccard_score',
                          color='threshold',
                          title='Jaccard score depending on threshold')

    def load_weights(self, path):
        """
        Load the trained classifier from path into ``self.classifier_``.

        Only the classifier is restored; ``save_weights`` does not persist the
        fitted vectorizer, so it must be fitted or restored separately.

        :parameter
        ---
        :param path {str}: path to the model weights
        """
        # joblib.load unpickles the file: only load files from trusted sources.
        self.classifier_ = joblib.load(path)

    def save_weights(self, path):
        """
        Save the trained classifier locally

        :parameter
        ----------
        :param path {str}: path of the file to store the classifier weights in
        """
        joblib.dump(self.classifier_, path)
# Every column except the raw text is treated as a label column.
y = df.drop("Utterance", axis=1)

# vect = CountVectorizer()
vect = TfidfVectorizer(preprocessor=preprocess, tokenizer=Lemmatizer())

# learn the vocabulary and transform it to a document-term-matrix
X_dtm = vect.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# show all the features after they have been vectorized
# (only displayed when this is the last expression of a notebook cell)
pd.DataFrame(X_dtm.toarray(), columns=feature_names)

# show all the labels
print(list(y))

# classifier = BinaryRelevance(MultinomialNB())
classifier = ClassifierChain(MultinomialNB())

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# train on the full data set (no hold-out split is made here)
classifier.fit(X_dtm, y)

# Read one utterance from the user and vectorize it with the fitted vocabulary.
userInput = input("Text to classify: ")
simple_test = [userInput]
simple_test_dtm = vect.transform(simple_test)

# predict per-label probabilities for the utterance
predictions = classifier.predict_proba(simple_test_dtm)
print(predictions)

# accuracy_score(y_test, predictions)
# The train/test split is seeded (random_state=42); the random forest below is
# not, so the reported scores can still vary between runs.
X_train, X_test, y_train, y_test = train_test_split(d,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# Classifier chain with a RandomForestClassifier as the base estimator
clf_cc = ClassifierChain(
    RandomForestClassifier(n_estimators=100, max_depth=200))

# fitting the model for the classification into the labels
clf_cc.fit(X_train, y_train.astype(float))

# predictions (hard labels and per-label probabilities)
predictions_cc = clf_cc.predict(X_test)
pred_prob = clf_cc.predict_proba(X_test)

# Finding the evaluation metrics
# micro recall, macro recall, micro precision, macro precision
# micro f1, macro f1, hamming loss
r1 = recall_score(y_true=y_test, y_pred=predictions_cc, average='micro')
r2 = recall_score(y_true=y_test, y_pred=predictions_cc, average='macro')
p1 = precision_score(y_true=y_test, y_pred=predictions_cc, average='micro')
p2 = precision_score(y_true=y_test, y_pred=predictions_cc, average='macro')
f1 = f1_score(y_true=y_test, y_pred=predictions_cc, average='micro')
f2 = f1_score(y_true=y_test, y_pred=predictions_cc, average='macro')
Score_cc_ham = hamming_loss(y_test, predictions_cc)

# Printing the evaluation metrics.
# A single pre-formatted argument keeps the output text identical on Python 2
# and makes the statement valid on Python 3.
print("Hamming Loss for classifier chains %s" % (Score_cc_ham,))
print("The micro recall is %s" % (r1,))
def classification_model(input_path="dataset_with_labels.csv",
                         output_path="dataset_output.csv",
                         n_labeled=102,
                         threshold=0.47):
    """Label the unlabeled reviews of a CSV using a classifier chain.

    The first ``n_labeled`` rows of the input file are the training set: their
    ``Label`` column holds comma-separated label names and their ``Review``
    column the text. The remaining rows are predicted and their ``Label``
    column is overwritten with the list of predicted label names before the
    whole frame is written to ``output_path``.

    Training and prediction text go through the same cleaning and stemming,
    and the TF-IDF vocabulary is learned from the training text only.

    :param input_path: CSV file with ``Label`` and ``Review`` columns.
    :param output_path: where the completed frame is written.
    :param n_labeled: number of leading rows that carry labels.
    :param threshold: a label is predicted when its probability is greater
        than or equal to this value.
    :return: None; the result is the CSV written to ``output_path``.
    """
    replace_by_space_re = re.compile(r'[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile(r'[^0-9a-z #+_]')
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")

    def clean_text(text):
        """Strip HTML, lowercase, drop punctuation and English stop words."""
        text = BeautifulSoup(text, "lxml").text  # HTML decoding
        text = text.lower()
        text = replace_by_space_re.sub(' ', text)
        text = bad_symbols_re.sub('', text)
        return ' '.join(word for word in text.split()
                        if word not in stop_words)

    def stemming(sentence):
        """Return the sentence with every word replaced by its stem."""
        return " ".join(stemmer.stem(word) for word in sentence.split())

    def preprocess(text):
        """Apply the cleaning and stemming used for every review."""
        return stemming(clean_text(text))

    dataset_full = pd.read_csv(input_path)

    # Labeled part: copy so the column assignments below never touch a view.
    labeled = dataset_full[0:n_labeled][['Label', 'Review']]
    labeled = labeled[pd.notnull(labeled['Review'])].copy()
    labeled['Label'] = [labels.split(",") for labels in labeled['Label']]
    labeled['Review'] = labeled['Review'].apply(preprocess)

    # One indicator column per label name.
    multilabel_binarizer = MultiLabelBinarizer()
    indicators = multilabel_binarizer.fit_transform(labeled['Label'])
    y_train = pd.DataFrame(indicators,
                           columns=multilabel_binarizer.classes_,
                           index=labeled.index)
    selected_labels = y_train.columns[
        y_train.sum(axis=0, skipna=True) > 0].tolist()
    y_train = y_train.filter(selected_labels, axis=1)

    rest_dataset = dataset_full[n_labeled:]
    if len(rest_dataset) == 0:
        # Nothing to predict: write the input back unchanged.
        dataset_full.to_csv(output_path)
        return

    train_text = labeled['Review'].values.astype('U')
    # astype('U') turns missing reviews into the string 'nan' so that the
    # text pipeline never receives a float.
    test_text = [
        preprocess(text) for text in rest_dataset['Review'].values.astype('U')
    ]

    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 ngram_range=(1, 3),
                                 norm='l2',
                                 max_features=10000)
    # Fit once, on the training text; a second fit() would discard the first.
    x_train = vectorizer.fit_transform(train_text)
    x_test = vectorizer.transform(test_text)

    cc_classifier = ClassifierChain(LogisticRegression(solver='lbfgs'))
    cc_classifier.fit(x_train, y_train)
    cc_predictions_proba = cc_classifier.predict_proba(x_test)
    y_pred = lil_matrix(cc_predictions_proba).toarray() >= threshold

    # Display names by label column position; positions without an entry fall
    # back to the name learned from the data.
    label_nums = {
        0: "Compatibility Issue",
        1: "Feature Request",
        2: "Functional Complaint",
        3: "Network Problem",
        4: "Resource Heavy",
        5: "Uninteresting Comment",
        6: "Update Issue",
        7: "User Interface"
    }
    predicted_labels = [[
        label_nums.get(position, selected_labels[position])
        for position, is_set in enumerate(row) if is_set
    ] for row in y_pred]

    # Write the predictions into the frame that is saved; assigning to the
    # `rest_dataset` slice would not reliably reach `dataset_full`.
    label_column = dataset_full['Label'].tolist()
    label_column[n_labeled:] = predicted_labels
    dataset_full['Label'] = label_column

    dataset_full.to_csv(output_path)
def index(request):
    """Render the comment form and, on a valid POST, score the comment.

    On a valid POST the view loads ``train.csv``, fits a TF-IDF vectorizer on
    its ``comment_text`` column and obtains a model according to the form's
    ``algo_field``:

    * ``"logistic regression"``: a classifier chain of logistic regressions
      trained on a 70% split of label columns 2..7;
    * ``"KNN"``: binary relevance over logistic regression, trained the same
      way;
    * anything else: the pickled model in ``br_builtin.pkl``.

    Other requests only render the form, so no data is loaded for them.

    :param request: the incoming Django request.
    :return: the rendered ``polls/index.html`` response. ``output_class1`` ..
        ``output_class6`` hold the probabilities for toxic, severe_toxic,
        obscene, threat, insult and identity_hate, or ``None`` when nothing
        was scored.
    """
    input_comment = ''
    toxic = severe_toxic = obscene = threat = insult = identity_hate = None

    if request.method == 'POST':
        form = ContactForm(request.POST)
        if form.is_valid():
            input_comment = form.cleaned_data['comment']
            algo_sel = form.cleaned_data['algo_field']
            print(algo_sel, input_comment)

            # Data loading and vectorizer fitting are expensive, so they run
            # only once there is a comment to score.
            data = pd.read_csv('train.csv')
            vect = TfidfVectorizer(max_features=40000, stop_words='english')
            features = vect.fit_transform(data.comment_text)
            comment_vector = vect.transform([str(input_comment)])

            if algo_sel in ("logistic regression", "KNN"):
                labels = data.iloc[:, 2:8]
                X_train, _, y_train, _ = train_test_split(features,
                                                          labels,
                                                          test_size=0.3)
                if algo_sel == "logistic regression":
                    from skmultilearn.problem_transform import ClassifierChain
                    model = ClassifierChain(LogisticRegression(),
                                            require_dense=[False, True])
                else:
                    # NOTE(review): the "KNN" choice trains binary relevance
                    # over logistic regression, not a nearest-neighbour model.
                    model = BinaryRelevance(LogisticRegression(),
                                            require_dense=[False, True])
                model.fit(X_train, y_train)
            else:
                # Pre-trained model (described as the SVM classifier).
                # NOTE(review): it is fed features from the vectorizer fitted
                # above; confirm it was trained with the same vocabulary.
                # joblib.load unpickles the file: keep it a trusted local file.
                model = joblib.load('br_builtin.pkl')

            output_class = model.predict_proba(comment_vector).toarray()
            print(output_class)

            # Flatten the single-row probability matrix into a plain list.
            output_class = list(chain.from_iterable(output_class))
            (toxic, severe_toxic, obscene,
             threat, insult, identity_hate) = output_class[:6]
            print(output_class)
    else:
        form = ContactForm()

    context = {
        'form': form,
        'input_comment': input_comment,
        'output_class1': toxic,
        'output_class2': severe_toxic,
        'output_class3': obscene,
        'output_class4': threat,
        'output_class5': insult,
        'output_class6': identity_hate,
    }
    return render(request, 'polls/index.html', context)