Example 1
import pandas as pd
from itertools import chain

from django.shortcuts import render
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import ClassifierChain

from .models import Post  # the Django model holding the comments


def check(request):
    vect = TfidfVectorizer(max_features=40000, stop_words='english')
    target = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    data = pd.read_csv('train.csv')
    test_data = pd.read_csv('D:/T.Y.BTECH/BML/Project/test.csv')
    X = data.comment_text
    test_X = test_data.comment_text
    xt = vect.fit_transform(X)
    yt = vect.transform(test_X)  # transformed test set (unused below)
    y_trans = data.iloc[:, 2:8]  # the six label columns
    X_train, X_test, y_train, y_test = train_test_split(xt,
                                                        y_trans,
                                                        test_size=0.3)
    input_comment = ''
    output_class = None
    toxic = None
    severe_toxic = None
    obscene = None
    threat = None
    insult = None
    identity_hate = None
    # iterate the queryset and keep the last post as the comment to classify
    posts = Post.objects.all()
    cmnt = None
    for post in posts:
        cmnt = post
    input_comment1 = str(cmnt)
    input_comment1 = [input_comment1]
    input_comment1 = vect.transform(input_comment1)
    classifier = ClassifierChain(LogisticRegression(),
                                 require_dense=[False, True])
    classifier.fit(X_train, y_train)
    output_class = classifier.predict_proba(input_comment1).toarray()

    #load_model = joblib.load('knn.pkl')
    #load_model = joblib.load('lr.pkl')
    #output_class = load_model.predict_proba(input_comment1).toarray()
    # flatten the 1x6 probability matrix into a plain list of six floats
    output_class = list(chain.from_iterable(output_class))
    toxic = output_class[0]
    severe_toxic = output_class[1]
    obscene = output_class[2]
    threat = output_class[3]
    insult = output_class[4]
    identity_hate = output_class[5]
    print(output_class)

    context = dict()
    context['input_comment'] = input_comment
    context['output_class1'] = toxic
    context['output_class2'] = severe_toxic
    context['output_class3'] = obscene
    context['output_class4'] = threat
    context['output_class5'] = insult
    context['output_class6'] = identity_hate
    return render(request, 'polls/comment_details.html', context)
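The view above retrains the chain on the full training set for every request. A minimal sketch of the pattern hinted at by the commented-out joblib lines, assuming a one-off training script and that the vectorizer is persisted alongside the classifier (file names are illustrative):

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import ClassifierChain

# one-off training script: fit once, persist, and let the view just load
data = pd.read_csv('train.csv')
vect = TfidfVectorizer(max_features=40000, stop_words='english')
xt = vect.fit_transform(data.comment_text)
y = data.iloc[:, 2:8]

classifier = ClassifierChain(LogisticRegression(), require_dense=[False, True])
classifier.fit(xt, y)

joblib.dump(vect, 'vect.pkl')        # illustrative file names
joblib.dump(classifier, 'lr.pkl')

# inside the view, the retraining block then shrinks to:
# vect = joblib.load('vect.pkl')
# classifier = joblib.load('lr.pkl')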
Example 2
    def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Classifier chain."""
        classifier = ClassifierChain(RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20))
        classifier.fit(train_data, train_data_y)

        predictions = classifier.predict_proba(test_data)
        predictions = predictions.toarray()  # sparse matrix -> dense ndarray

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
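DataProcessUtils.getListFromProbable is project-specific and not shown here; a plausible reading, given the range(1, n_labels + 1) argument, is "top-N label ids per row by probability". A hypothetical stand-in (the name and the tie-breaking are assumptions, not the project's actual helper):

import numpy as np

def get_list_from_probable(prob_matrix, label_ids, recommend_num):
    """For each row, return the recommend_num label ids with the
    highest predicted probability (hypothetical stand-in)."""
    label_ids = np.asarray(label_ids)
    result = []
    for row in np.asarray(prob_matrix):
        top = np.argsort(row)[::-1][:recommend_num]
        result.append(label_ids[top].tolist())
    return result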
Example 3
# strip the <id> tokens; regex=True keeps the pre-pandas-1.4 behaviour explicit
train_data = train_data.iloc[:, 0].str.replace(r'<\d+>', '', regex=True)
test_data = test_data.iloc[:, 0].str.replace(r'<\d+>', '', regex=True)

#count the frequency of every word in vocabulary in each document
vectorizer = CountVectorizer()
train_data_vector = vectorizer.fit_transform(train_data)
test_data_vector = vectorizer.transform(test_data)

#train the classifier
model = ClassifierChain(RandomForestClassifier(n_jobs=-1, verbose=1))
model.fit(train_data_vector, train_labels)

#test the classifier
predicted_labels = model.predict(test_data_vector)
predicted_labels_train = model.predict(train_data_vector)
predicted_probabilities = model.predict_proba(test_data_vector)

#test accuracy (subset accuracy: the whole label set must match exactly)
#~7% with random forest and binary relevance
#~7% with random forest and classifier chain
#~5% with random forest and label powerset
#~4% with multilabel knn
test_acc = accuracy_score(test_labels, predicted_labels)
train_acc = accuracy_score(train_labels, predicted_labels_train)
test_hamm_loss = hamming_loss(test_labels, predicted_labels)
test_cov_err = coverage_error(test_labels, predicted_probabilities.toarray())
test_rank_loss = label_ranking_loss(test_labels,
                                    predicted_probabilities.toarray())
test_avr_prec = label_ranking_average_precision_score(
    test_labels, predicted_probabilities.toarray())
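The ~4-7% figures in the comments are subset accuracy: a prediction only counts when every label in the row matches exactly, which is why they sit far below per-label agreement. A toy illustration of the difference against hamming loss:

import numpy as np
from sklearn.metrics import accuracy_score, hamming_loss

y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])
y_pred = np.array([[1, 0, 0],   # 2 of 3 labels right, but not an exact match
                   [0, 1, 0]])  # exact match

print(accuracy_score(y_true, y_pred))  # 0.5   (subset accuracy)
print(hamming_loss(y_true, y_pred))    # ~0.167 (1 wrong label out of 6)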
Example 4

# In[68]:

# log_classifier is assumed from an earlier cell, e.g.:
# log_classifier = LabelPowerset(LogisticRegression())
log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test,
                    log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new Classifier uses the predictions of all previous classifiers
# * This way the correlation b/w labels is taken into account (see the minimal sketch below)

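# To make the chaining mechanism concrete, a minimal hand-rolled sketch of a
# two-label chain on toy data (skmultilearn's ClassifierChain wraps this idea):
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([[0, 0], [0, 1], [1, 1], [1, 1]])   # two binary labels

clf0 = LogisticRegression().fit(X, y[:, 0])        # first label: features only
X_aug = np.hstack([X, clf0.predict(X)[:, None]])   # append its predictions
clf1 = LogisticRegression().fit(X_aug, y[:, 1])    # second label sees them too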
# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

chain.fit(x_train, y_train)
print('Accuracy_score using ClassifierChain is ',
      round(accuracy_score(y_test, chain.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using ClassifierChain is ',
      roc_auc_score(y_test,
                    chain.predict_proba(x_test).toarray()))
Example 5
import joblib
import pandas as pd
import plotly.express as px
from sklearn.base import ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.svm import SVC
from skmultilearn.problem_transform import ClassifierChain

# prepareText is assumed to be a project-specific tokenizer defined elsewhere


class ArticleClassifier(ClassifierMixin):
    def __init__(self, ngram=(1, 3), tokenizer=prepareText, max_feature=20000):
        """
        This classifier is a multi-label classifier. It have been trained on octo-articles dataset.
        You can train it using the fit function
        :parameter
        ----------
            :param ngram {tuple}:
                    default '(1,3)'  ngram_range for the tfidfVectorizer
            :param tokenizer {func}:
                    tokenizer used by tfidfvectorizer to prepapre the Data
            :param max_feature {int}:
                    limit the matrix composition to the 'max_feature' most important element
        """
        self.vectorizer_ = TfidfVectorizer(strip_accents='unicode',
                                           analyzer='word',
                                           ngram_range=ngram,
                                           norm='l2',
                                           tokenizer=tokenizer,
                                           max_features=max_feature)

    def fit(self, X, y):
        """
        fit the model to the data. Train the classifier
        Note: You should use the zodiac.classifier.cleaner on all the texts before you fit the data

        :parameter
        ----------
            :param X: (list)
                list of clean text (you can use zodiac.cleaner.TextCleaner)
            :param y: (numpy.array)
                array of labels
        """
        self.x_vec_ = self.vectorizer_.fit_transform(X)
        # initialize the classifier-chain multi-label classifier
        self.classifier_ = ClassifierChain(SVC(probability=True))
        # train the SVC-based chain on the vectorized training data
        self.classifier_.fit(self.x_vec_, y)

    def score(self, X, y, average='samples', threshold=0.5):
        """
        Compute the jaccard score using the given parameters
        :parameter
        -----------
            :param x_test(list):
                list of text
            :param y_true (list):
                texts labels
            :param average:
                default 'average'.
        :return:
        -------
            score : float
                jaccard score
        """
        self.x_test_vec_ = self.vectorizer_.transform(X)
        predictions = self.classifier_.predict_proba(self.x_test_vec_).toarray()
        score = jaccard_score(y, predictions >= threshold, average=average)
        return score

    def show_stats(self, x_test, y):
        """
        compute the jaccard score for differents threshold and display the jaccard scores using plotly scatter method

        :parameter
        ----------
            :param x_test: (list)
                text list
            :param y:
                list of label
        """
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        x_test_vec = self.vectorizer_.transform(x_test)
        predictions_probas = self.classifier_.predict_proba(x_test_vec)
        jaccard_scores = []
        for threshold in thresholds:
            # print("For threshold: ", val)
            pred = predictions_probas.copy()
            ensemble_jaccard_score = jaccard_score(
                y, predictions_probas >= threshold, average='samples')
            jaccard_scores.append(ensemble_jaccard_score)
        self.jaccard_scores_threshold_df_ = pd.DataFrame({
            'threshold':
            thresholds,
            'jaccard_score':
            jaccard_scores
        })

    def load_weights(self, path):
        """
        Load the weights of the model from path
        :parameter
        ---
        :param path {str}:
            path to the model weights
        """
        joblib.load(path)

    def save_weights(self, path):
        """
        Save the model weights locally
        :parameter
        ----------
            :param path {str}:
                    path to the directory to store the classifier wieghts
        """
        joblib.dump(self.classifier_, path)
        px.scatter(self.jaccard_scores_threshold_df_,
                   x='threshold',
                   y='jaccard_score',
                   color='threshold',
                   title='Jaccard score depending on threshold')
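A usage sketch for the class above on toy data (the corpus, labels, and file name are illustrative; the sketch passes tokenizer=None to avoid depending on the project's prepareText):

import numpy as np

# toy corpus; SVC's probability calibration needs >= 5 samples per class
texts = ["python article number %d" % i for i in range(5)] + \
        ["django article number %d" % i for i in range(5)]
labels = np.array([[1, 0]] * 5 + [[0, 1]] * 5)

clf = ArticleClassifier(tokenizer=None)  # fall back to sklearn's tokenizer
clf.fit(texts, labels)
print(clf.score(texts, labels, threshold=0.5))
clf.save_weights("article_classifier.pkl")  # illustrative path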
Example 6
X = df["Utterance"]  # the text column (X was missing in the original)
y = df.drop("Utterance", axis=1)
#vect = CountVectorizer()
vect = TfidfVectorizer(preprocessor=preprocess, tokenizer=Lemmatizer())

# learn the vocabulary and transform it to a document-term-matrix
X_dtm = vect.fit_transform(X)

# show all the features after they have been vectorized
# (get_feature_names_out replaces the deprecated get_feature_names in sklearn >= 1.0)
pd.DataFrame(X_dtm.toarray(), columns=vect.get_feature_names_out())

# show all the labels
print(list(y))

#classifier = BinaryRelevance(MultinomialNB())
classifier = ClassifierChain(MultinomialNB())
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# train
classifier.fit(X_dtm, y)

userInput = input("Text to classify: ")
simple_test = [userInput]
simple_test_dtm = vect.transform(simple_test)

# predict
predictions = classifier.predict_proba(simple_test_dtm)
print(predictions)

#accuracy_score(y_test, predictions)
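The raw print above shows a sparse 1 x n_labels matrix; a small follow-up (label names taken from the columns of y) makes the output readable:

# map each label to its predicted probability for the entered text
probs = predictions.toarray()[0]
for label, p in zip(y.columns, probs):
    print(f"{label}: {p:.3f}")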
Example 7
#The results might vary due to the usage of random state with train and test split
X_train, X_test, y_train, y_test = train_test_split(d,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# The classifier instance with the classifier as
# RandomForestClassifier
clf_cc = ClassifierChain(
    RandomForestClassifier(n_estimators=100, max_depth=200))

#fitting the model for the classification into the labels
clf_cc.fit(X_train, y_train.astype(float))
#predictions
predictions_cc = clf_cc.predict(X_test)
pred_prob = clf_cc.predict_proba(X_test)

#Finding the evaluation metrics
# micro recall, macro recall, micro precision, macro precision
# micro f1, macro f1, hamming loss
r1 = recall_score(y_true=y_test, y_pred=predictions_cc, average='micro')
r2 = recall_score(y_true=y_test, y_pred=predictions_cc, average='macro')
p1 = precision_score(y_true=y_test, y_pred=predictions_cc, average='micro')
p2 = precision_score(y_true=y_test, y_pred=predictions_cc, average='macro')
f1 = f1_score(y_true=y_test, y_pred=predictions_cc, average='micro')
f2 = f1_score(y_true=y_test, y_pred=predictions_cc, average='macro')
Score_cc_ham = hamming_loss(y_test, predictions_cc)

# Printing the evaluation metrics
print "Hamming Loss for classifier chains", Score_cc_ham
print "The micro recall is", r1
Example 8
import re

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import ClassifierChain


def classification_model():
    dataset_full = pd.read_csv("dataset_with_labels.csv")
    dataset = dataset_full[0:102]
    col = ['Label', 'Review']
    dataset = dataset[col]
    dataset = dataset[pd.notnull(dataset['Review'])]
    # split the comma-separated label strings into lists
    ll = []
    for s in dataset['Label']:
        ll.append(s.split(","))
    dataset['Label'] = ll

    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english'))

    def clean_text(text):
        text = BeautifulSoup(text, "lxml").text  # HTML decoding
        text = text.lower()  # lowercase text
        text = REPLACE_BY_SPACE_RE.sub(
            ' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
        text = BAD_SYMBOLS_RE.sub(
            '', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
        text = ' '.join(word for word in text.split()
                        if word not in STOPWORDS)  # delete stopwords from text
        return text

    stemmer = SnowballStemmer("english")

    def stemming(sentence):
        stemSentence = ""
        for word in sentence.split():
            stem = stemmer.stem(word)
            stemSentence += stem
            stemSentence += " "
        stemSentence = stemSentence.strip()
        return stemSentence

    dataset['Review'] = dataset['Review'].apply(clean_text)
    dataset['Review'] = dataset['Review'].apply(stemming)
    multilabel_binarizer = MultiLabelBinarizer()
    y = multilabel_binarizer.fit_transform(dataset['Label'])
    for idx, label in enumerate(multilabel_binarizer.classes_):
        dataset[label] = y[:, idx]
    rest_dataset = dataset_full[102:]
    train_text = dataset['Review'].values.astype('U')
    test_text = rest_dataset['Review'].values.astype('U')
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 ngram_range=(1, 3),
                                 norm='l2',
                                 max_features=10000)
    # fit the vocabulary on the training text only; the original code refit on
    # test_text, which silently overwrote the training vocabulary
    vectorizer.fit(train_text)
    x_train = vectorizer.transform(train_text)
    y_train = dataset.drop(labels=['Label', 'Review'], axis=1)
    x_test = vectorizer.transform(test_text)
    selected_labels = y_train.columns[
        y_train.sum(axis=0, skipna=True) > 0].tolist()
    y_train = y_train.filter(selected_labels, axis=1)
    cc_classifier = ClassifierChain(LogisticRegression(solver='lbfgs'))
    cc_classifier.fit(x_train, y_train)
    cc_predictions_proba = cc_classifier.predict_proba(x_test)
    t = 47
    # binarize the probabilities at a threshold of t / 100 = 0.47
    y_pred_new = (cc_predictions_proba >= t / 100).astype(int)
    label_nums = {
        0: "Compatibility Issue",
        1: "Feature Request",
        2: "Functional Complaint",
        3: "Network Problem",
        4: "Resource Heavy",
        5: "Uninteresting Comment",
        6: "Update Issue",
        7: "User Interface"
    }
    y_pred = lil_matrix(y_pred_new).toarray()
    predicted_labels = []
    for row in y_pred:
        row_labels = [label_nums[j] for j in range(y_pred.shape[1]) if row[j] == 1]
        predicted_labels.append(row_labels)
    # join the predictions back into the comma-separated format of the input
    # file and assign through dataset_full so the values actually propagate
    # (assigning to the rest_dataset slice would not)
    dataset_full.loc[rest_dataset.index, 'Label'] = [
        ", ".join(labels) for labels in predicted_labels
    ]
    dataset_full.to_csv("dataset_output.csv")
Example 9
# imports as in Example 1, plus:
# import joblib
# from skmultilearn.problem_transform import BinaryRelevance
# from .forms import ContactForm  # the app's comment form (assumed location)


def index(request):
    vect = TfidfVectorizer(max_features=40000, stop_words='english')
    target = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    data = pd.read_csv('train.csv')
    test_data = pd.read_csv('D:/T.Y.BTECH/BML/Project/test.csv')
    X = data.comment_text
    test_X = test_data.comment_text
    xt = vect.fit_transform(X)
    yt = vect.transform(test_X)
    y_trans = data.iloc[:, 2:8]
    X_train, X_test, y_train, y_test = train_test_split(xt,
                                                        y_trans,
                                                        test_size=0.3)
    input_comment = ''
    output_class = None
    toxic = None
    severe_toxic = None
    obscene = None
    threat = None
    insult = None
    identity_hate = None
    # if request.method == 'GET':
    #     if request.GET['dropdown'] == 'KNN':
    #         load_model = joblib.load('knn.pkl')
    #     if request.GET['dropdown'] == 'SVM':
    #         load_model = joblib.load('lr.pkl')
    if request.method == 'POST':
        form = ContactForm(request.POST)
        if form.is_valid():
            input_comment = form.cleaned_data['comment']
            algo_sel = form.cleaned_data['algo_field']
            print(algo_sel, input_comment)

        #output_class = dict_[input_comment]
        #output_class = [ 'violence', 'obscene', 'insult']
        #print( input_comment )
        #print( output_class )

        input_comment1 = str(input_comment)
        input_comment1 = [input_comment1]
        input_comment1 = vect.transform(input_comment1)

        if (algo_sel == "logistic regression"):
            #load_model = joblib.load('D:/T.Y.BTECH/BML/Project/lr.pkl')
            classifier = ClassifierChain(LogisticRegression(),
                                         require_dense=[False, True])
            classifier.fit(X_train, y_train)
            output_class = classifier.predict_proba(input_comment1).toarray()

        elif (algo_sel == "KNN"):
            #load_model = joblib.load('knn.pkl')
            # note: despite the "KNN" branch name, this fits Binary Relevance
            # over LogisticRegression rather than a KNN model
            classifier = BinaryRelevance(LogisticRegression(),
                                         require_dense=[False, True])
            classifier.fit(X_train, y_train)
            output_class = classifier.predict_proba(input_comment1).toarray()
        else:
            load_model = joblib.load('br_builtin.pkl')  # SVM Classifier
            output_class = load_model.predict_proba(input_comment1).toarray()

        #output_class = load_model.predict_proba(input_comment1).toarray()
        print(output_class)
        # flatten the 1x6 probability matrix into a plain list of six floats
        output_class = list(chain.from_iterable(output_class))
        toxic = output_class[0]
        severe_toxic = output_class[1]
        obscene = output_class[2]
        threat = output_class[3]
        insult = output_class[4]
        identity_hate = output_class[5]
        print(output_class)

        #return HttpResponseRedirect('/thanks/')
    else:
        form = ContactForm()

    context = dict()
    context['form'] = form
    context['input_comment'] = input_comment
    context['output_class1'] = toxic
    context['output_class2'] = severe_toxic
    context['output_class3'] = obscene
    context['output_class4'] = threat
    context['output_class5'] = insult
    context['output_class6'] = identity_hate
    return render(request, 'polls/index.html', context)
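Like Example 1, this view rebuilds the TF-IDF matrix and refits the chain on every request. A minimal sketch of moving the expensive work to module import time, so each request only vectorizes the submitted comment (assuming the same train.csv layout; names are illustrative):

# module level (runs once per process, not once per request)
_DATA = pd.read_csv('train.csv')
_VECT = TfidfVectorizer(max_features=40000, stop_words='english')
_XT = _VECT.fit_transform(_DATA.comment_text)
_CLF = ClassifierChain(LogisticRegression(), require_dense=[False, True])
_CLF.fit(_XT, _DATA.iloc[:, 2:8])

def classify_comment(text):
    """Return the six per-label probabilities for one comment."""
    return _CLF.predict_proba(_VECT.transform([text])).toarray()[0]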