コード例 #1
0
    def classifer_chain(self):
        """Train a ClassifierChain (RandomForest base) on the stored
        train split, predict on the test split, and print hamming loss,
        accuracy and micro-F1.

        Note: the method name keeps its original (misspelled) form so
        existing callers are not broken.
        """
        # initialize classifier chains multi-label classifier
        # with a random forest base classifier
        print("build classifier...")
        classifier = ClassifierChain(RandomForestClassifier())
        #classifier = LabelPowerset(RandomForestClassifier())
        print("end...")

        print("start training...")
        classifier.fit(self.X_train, self.y_train)
        print("end...")

        # predict
        print("start test...")
        predictions = classifier.predict(self.X_test)
        print("end...")

        print("result as following:")

        result = hamming_loss(self.y_test, predictions)
        # BUG FIX: output message previously misspelled as "hanming_loss".
        print("hamming_loss: ", result)

        # BUG FIX: original referenced the bare name `y_test` (NameError);
        # the test labels live on the instance.
        print("accuracy score: ", accuracy_score(self.y_test, predictions))

        result = f1_score(self.y_test, predictions, average='micro')
        print("micro-f1_score: ", result)
コード例 #2
0
def buildCCClassifier(xTrain, yTrain):
    """Fit and return a ClassifierChain multi-label model.

    Uses a Gaussian naive Bayes estimator as the per-label base
    classifier.
    """
    chain_model = ClassifierChain(GaussianNB())
    chain_model.fit(xTrain, yTrain)
    return chain_model
コード例 #3
0
def check(request):
    """Django view: train a TF-IDF + ClassifierChain(LogisticRegression)
    toxicity model on the fly, score the most recent Post, and render
    the six per-label probabilities.

    :param request: Django HttpRequest
    :return: HttpResponse rendering polls/comment_details.html
    """
    vect = TfidfVectorizer(max_features=40000, stop_words='english')
    target = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    data = pd.read_csv('train.csv')
    # NOTE(review): hard-coded absolute Windows path — move to settings/env.
    test_data = pd.read_csv('D:/T.Y.BTECH/BML/Project/test.csv')
    X = data.comment_text
    test_X = test_data.comment_text
    xt = vect.fit_transform(X)
    yt = vect.transform(test_X)  # unused; kept to preserve the original flow
    y_trans = data.iloc[:, 2:8]  # the six label columns
    X_train, X_test, y_train, y_test = train_test_split(xt,
                                                        y_trans,
                                                        test_size=0.3)
    input_comment = ''
    output_class = None
    toxic = None
    severe_toxic = None
    obscene = None
    threat = None
    insult = None
    identity_hate = None
    posts = Post.objects.all()
    # BUG FIX: `cmnt` was unbound (NameError) when there were no posts.
    cmnt = None
    for post in posts:
        cmnt = post  # keep the last post in iteration order
    input_comment1 = str(cmnt) if cmnt is not None else ''
    input_comment1 = [input_comment1]
    input_comment1 = vect.transform(input_comment1)
    from skmultilearn.problem_transform import ClassifierChain

    classifier = ClassifierChain(LogisticRegression(),
                                 require_dense=[False, True])
    classifier.fit(X_train, y_train)
    output_class = classifier.predict_proba(input_comment1).toarray()

    # Flatten the single-row probability matrix to a flat list of six floats.
    output_class = list(chain.from_iterable(output_class))
    toxic = output_class[0]
    severe_toxic = output_class[1]
    obscene = output_class[2]
    threat = output_class[3]
    insult = output_class[4]
    identity_hate = output_class[5]
    print(output_class)

    context = dict()
    context['input_comment'] = input_comment
    context['output_class1'] = toxic
    context['output_class2'] = severe_toxic
    context['output_class3'] = obscene
    context['output_class4'] = threat
    context['output_class5'] = insult
    context['output_class6'] = identity_hate
    return render(request, 'polls/comment_details.html', context)
コード例 #4
0
    def test_if_order_is_set(self):
        """With order=None, fitting must yield the natural label order."""
        X, y = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')[0]

        clf = ClassifierChain(
            classifier=GaussianNB(), require_dense=[True, True], order=None
        )
        clf.fit(X, y)

        expected_order = list(range(y.shape[1]))
        self.assertEqual(clf._order(), expected_order)
コード例 #5
0
    def test_if_order_is_set(self):
        """order=None must resolve to 0..n-1 label order after fitting."""
        data = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')
        X, y = data[0]

        chain = ClassifierChain(classifier=GaussianNB(),
                                require_dense=[True, True],
                                order=None)
        chain.fit(X, y)

        self.assertEqual(chain._order(), [i for i in range(y.shape[1])])
コード例 #6
0
ファイル: classifierChain.py プロジェクト: monk1337/MultiLab
    def train(self):
        """Fit a LogisticRegression-based classifier chain on the stored
        training data and report accuracy and micro-F1 on the test split.
        """
        chain = ClassifierChain(LogisticRegression())
        chain.fit(self.x_data, self.y_data)

        preds = chain.predict(self.x_test)

        metrics = {
            'accuracy': accuracy_score(self.y_test, preds),
            'f1_score': f1_score(self.y_test, preds, average='micro'),
        }
        return metrics
コード例 #7
0
    def test_if_order_is_set_when_explicitly_given(self):
        """An explicitly supplied chain order must be kept verbatim after fit."""
        X, y = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')[0]

        custom_order = list(range(y.shape[1]))[::-1]
        chain = ClassifierChain(
            classifier=GaussianNB(), require_dense=[True, True], order=custom_order
        )
        chain.fit(X, y)

        self.assertEqual(chain._order(), custom_order)
コード例 #8
0
    def test_if_order_is_set_when_explicitly_given(self):
        """A user-supplied chain order must survive fitting unchanged."""
        X, y = self.get_multilabel_data_for_tests(
            sparsity_indicator='sparse')[0]

        wanted = [label for label in reversed(range(y.shape[1]))]
        model = ClassifierChain(classifier=GaussianNB(),
                                require_dense=[True, True],
                                order=wanted)
        model.fit(X, y)

        self.assertEqual(model._order(), wanted)
コード例 #9
0
ファイル: model_manger.py プロジェクト: solversa/AutoDL-1
class ClassifierChains:
    """Thin wrapper around a LightGBM-based skmultilearn ClassifierChain."""

    def __init__(self):
        # One LightGBM binary classifier per label, chained.
        self.model = ClassifierChain(LGBMClassifier())

    def set_grow_step(self, new_step):
        """Remember the boosting-round growth step for later use."""
        self.grow_boost_round = new_step

    def fit(self, X_train, y_train):
        """Train the underlying classifier chain."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Predict labels; `.A` densifies the sparse prediction matrix."""
        return self.model.predict(X_test).A
コード例 #10
0
def classifiers(X_train, Y_train, X_test):
    """Fit BinaryRelevance, ClassifierChain and LabelPowerset (each with
    a GaussianNB base) on the same data and return their predictions,
    in that order.
    """
    models = (
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    )
    for model in models:
        model.fit(X_train, Y_train)

    preds = [model.predict(X_test) for model in models]
    return preds[0], preds[1], preds[2]
コード例 #11
0
    def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Classifier-chain recommender: fit a RandomForest-based chain,
        rank labels by predicted probability and return
        [recommendList, answerList].
        """
        base = RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20)
        chain = ClassifierChain(base)
        chain.fit(train_data, train_data_y)

        proba = chain.predict_proba(test_data)
        proba = proba.todense().getA()  # sparse -> dense ndarray

        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids,
                                                             recommendNum)
        answerList = test_data_y
        print(proba)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
コード例 #12
0
def ClassifierChain_method(X_train, y_train, samples_leaf, samples_split):
    """Problem transformation -> classifier chain.

    :param X_train: training features
    :param y_train: training labels
    :param samples_leaf: min_samples_leaf for the decision-tree base
    :param samples_split: min_samples_split for the decision-tree base
    :return: fitted classifier, or None on failure
    """
    try:
        base = DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                      min_samples_split=int(samples_split))
        model = ClassifierChain(base)
        model.fit(X_train, y_train)
        return model
    except Exception as e:
        # Best-effort: log the failure and fall through to None.
        print("warning----分类器链|ClassifierChain_method----" + str(e))

    return None
コード例 #13
0
def train_model(X, y, strategy):
    """Train a LightGBM multi-label model with the requested problem
    transformation, persist it under model/flow/, and return it.

    Raises for any strategy other than 'ovr' / 'classifier_chains'.
    """
    X = np.array(X)
    y = np.array(y)
    clf = lightgbm.sklearn.LGBMClassifier(max_depth=9, num_leaves=500,
                                          n_estimators=50, n_jobs=-1)  # 0.8
    print(clf)
    if strategy == 'ovr':  # OneVsRest strategy, aka BinaryRelevance
        model, path = OneVsRestClassifier(clf), "model/flow/ovr"
    elif strategy == 'classifier_chains':
        model, path = ClassifierChain(clf), "model/flow/cc"
    else:
        raise Exception("Correct strategies:ovr or classifier_chains")
    model.fit(X, y)
    save_model(model, path)
    return model
コード例 #14
0
ファイル: main.py プロジェクト: DraganSkiljevic/siap
def randomForestClassifierChain():
    """Train, pickle, reload and evaluate a RandomForest classifier
    chain on the module-level train/test split.
    """
    print("Random forest classifier chain")

    start = time.time()
    classifier = ClassifierChain(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForestClassifierChain"

    classifier.fit(train_x, train_y)

    # BUG FIX: open() handles were never closed; use context managers so
    # the pickle files are flushed and released deterministically.
    # save
    with open(filename, 'wb') as fh:
        pickle.dump(classifier, fh)

    # load the model from disk (round-trip mimics deployment)
    with open(filename, 'rb') as fh:
        classifier = pickle.load(fh)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
コード例 #15
0
ファイル: main.py プロジェクト: DraganSkiljevic/siap
def gaussianNaiveBayes():
    """Train, pickle, reload and evaluate a GaussianNB classifier chain
    on the module-level train/test split.
    """
    print("Gaussian naive bayes")

    start = time.time()
    classifier = ClassifierChain(GaussianNB())

    filename = "gaussianNaiveBayes"

    classifier.fit(train_x, train_y)

    # BUG FIX: open() handles were never closed; use context managers.
    # save
    with open(filename, 'wb') as fh:
        pickle.dump(classifier, fh)

    # load the model from disk
    with open(filename, 'rb') as fh:
        classifier = pickle.load(fh)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
コード例 #16
0
ファイル: main.py プロジェクト: DraganSkiljevic/siap
def knnClassifierChain():
    """Train, pickle, reload and evaluate a KNN classifier chain on the
    module-level train/test split.
    """
    print("knn classifier chain")

    start = time.time()
    classifier = ClassifierChain(KNeighborsClassifier())

    filename = "knnChain"

    classifier.fit(train_x, train_y)

    # BUG FIX: open() handles were never closed; use context managers.
    # save
    with open(filename, 'wb') as fh:
        pickle.dump(classifier, fh)

    # load the model from disk
    with open(filename, 'rb') as fh:
        classifier = pickle.load(fh)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
コード例 #17
0
ファイル: main.py プロジェクト: DraganSkiljevic/siap
def supportVectorMachineChain():
    """Train, pickle, reload and evaluate an SVC classifier chain on the
    module-level train/test split.
    """
    print("Support vector machine")

    start = time.time()
    classifier = ClassifierChain(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"

    classifier.fit(train_x, train_y)

    # BUG FIX: open() handles were never closed; use context managers.
    # save
    with open(filename, 'wb') as fh:
        pickle.dump(classifier, fh)

    # load the model from disk
    with open(filename, 'rb') as fh:
        classifier = pickle.load(fh)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
コード例 #18
0
def train_model(X, y, strategy):
    """Fit an XGBoost model wrapped in OneVsRest or ClassifierChain,
    save it under model/, and return it.

    Raises for any strategy other than 'ovr' / 'classifier_chains'.
    """
    X = np.array(X)
    y = np.array(y)
    clf = XGBClassifier(subsample=0.8, colsample_bytree=0.8)
    print(clf)

    if strategy == 'ovr':  # OneVsRest strategy, aka BinaryRelevance
        wrapper = OneVsRestClassifier(clf)
        wrapper.fit(X, y)
        save_model(wrapper, "model/ovr")
        return wrapper
    if strategy == 'classifier_chains':
        wrapper = ClassifierChain(clf)
        wrapper.fit(X, y)
        save_model(wrapper, "model/cc")
        return wrapper
    raise Exception("Correct strategies:ovr or classifier_chains")
コード例 #19
0
ファイル: Attempt 4.py プロジェクト: ElijahWilde/THR
def ClassifierChain ():
    """Split `df` into train/test, TF-IDF the book text, train a
    LogisticRegression classifier chain and print its subset accuracy.

    NOTE: this function shadows skmultilearn's ClassifierChain class at
    module level; the local import below restores it inside the body.
    """
    # Train-Test Split =======================================================
    print("setting up a neural network...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    
    train_text = train['Book_Text']
    test_text = test['Book_Text']
    
    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
    # BUG FIX: the vectorizer was fit twice (train, then test); the second
    # fit discarded the training vocabulary and leaked test data into it.
    # Fit on the training text only; test text is only transformed.
    vectorizer.fit(train_text)
    
    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels = ['Book_Text'], axis=1)
    
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels = ['Book_Text'], axis=1)
    
    # using classifier chains
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.linear_model import LogisticRegression

    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())
    
    # Training logistic regression model on train data
    classifier.fit(x_train, y_train)
    
    # predict
    predictions = classifier.predict(x_test)
    
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    print("\n")
コード例 #20
0
def train_model(X, y, strategy):
    """Train a LightGBM multi-label model (labels cast to int) using the
    requested problem transformation, persist it under model/flow/, and
    return it.
    """
    features = np.array(X)
    labels = np.array(y, dtype=int)
    base = lightgbm.sklearn.LGBMClassifier(max_depth=9, num_leaves=500,
                                           n_estimators=50, n_jobs=-1)
    print(base)

    if strategy == 'ovr':  # OneVsRest strategy, aka BinaryRelevance
        wrapper = OneVsRestClassifier(base)
        wrapper.fit(features, labels)
        save_model(wrapper, "model/flow/ovr")
        return wrapper
    if strategy == 'classifier_chains':
        wrapper = ClassifierChain(base)
        wrapper.fit(features, labels)
        save_model(wrapper, "model/flow/cc")
        return wrapper
    raise Exception("Correct strategies:ovr or classifier_chains")
コード例 #21
0
    def Classifier_Chain(ytrain, yvalid, ytest, base_model):
        """Fit a Classifier Chain model with the given base estimator on
        the enclosing scope's X_train, scoring themes or subthemes.

        Appends a result row (train/valid/test accuracy plus valid/test
        recall, precision and F1, all rounded to 3 dp) to results_dict.
        """
        chain = ClassifierChain(base_model)
        model = chain.fit(X_train, ytrain)

        y_valid_arr = np.array(yvalid)
        y_test_arr = np.array(ytest)

        # Accuracy on all three splits first (same order as before).
        scores = {
            'Model': "TF-IDF + LinearSVC",
            'Train Accuracy': round(model.score(X_train, np.array(ytrain)), 3),
            'Validation Accuracy': round(model.score(X_valid, y_valid_arr), 3),
            'Test Accuracy': round(model.score(X_test, y_test_arr), 3),
        }

        # Validation recall / precision / F1 (micro-averaged).
        preds_valid = model.predict(X_valid)
        scores['Valid Recall'] = round(
            recall_score(y_valid_arr, preds_valid, average='micro'), 3)
        scores['Valid Precision'] = round(
            precision_score(y_valid_arr, preds_valid, average='micro'), 3)
        scores['Valid F1'] = round(
            f1_score(y_valid_arr, preds_valid, average='micro'), 3)

        # Test recall / precision / F1 (micro-averaged).
        preds_test = model.predict(X_test)
        scores['Test Recall'] = round(
            recall_score(y_test_arr, preds_test, average='micro'), 3)
        scores['Test Precision'] = round(
            precision_score(y_test_arr, preds_test, average='micro'), 3)
        scores['Test F1'] = round(
            f1_score(y_test_arr, preds_test, average='micro'), 3)

        results_dict.append(scores)
コード例 #22
0
def binary_relevance(train_data, test_data):
    """
    Runs and predicts end to end.
    Named for binary relevance, but note it actually fits a ClassifierChain below.
    Only one label per sample is kept, i.e. the multi-label problem is reduced
    to single-label multi-class — the real task is multi-label.
    :param train_data: DataFrame; column 0 = text, columns 1..20 = "0"/"1" label flags
    :param test_data: DataFrame with the same layout
    :return: None (prints the accuracy score)
    """

    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    # use a Gaussian naive Bayes base classifier
    # classifier = BinaryRelevance(GaussianNB())  # initialize a binary-relevance multi-label classifier
    classifier = ClassifierChain(GaussianNB())
    #X_train = train
    X_train, y_train = train_data.iloc[:, [0]], train_data.iloc[:, list(range(1, 21))]
    X_test, y_test = test_data.iloc[:, [0]], test_data.iloc[:, list(range(1, 21))]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # training: extract the raw text column and TF-IDF it
    temp = X_train.values.tolist()
    X = []
    for i in range(len(temp)):
        X.append(temp[i][0])
    x = tfidf.transform(X)
    y = y_train.values.tolist()
    Y = []  # collapse the 20 label columns to a single class index
    for j in range(len(y)):
        if "1" in y[j]:
            indexs = y[j].index("1")
            Y.append(indexs+1)
        else:
            # print("0")
            Y.append(0)  # really 21 classes, because "no label" is one of them
    Y = np.array(Y)
    # NOTE(review): original author's open question — Y cannot be multi-valued??
    classifier.fit(x, Y)  # fit on the class index directly?
    """
    报错:raise TypeError('no supported conversion for types: %r' % (args,))
    TypeError: no supported conversion for types: (dtype('O'),)
    难道是??
    """

    # prediction: same text -> TF-IDF pipeline for the test split
    temp = X_test.values.tolist()
    X_ts = []
    for i in range(len(temp)):
        X_ts.append(temp[i][0])
    x_test = tfidf.transform(X_ts)

    y_test = y_test.values.tolist()
    Y_test = []
    for j in range(len(y_test)):
        if "1" in y_test[j]:
            indexs = y_test[j].index("1")
            Y_test.append(indexs + 1)
        else:
            # print("0")
            Y_test.append(0)  # really 21 classes, because "no label" is one of them
    Y_test = np.array(Y_test)  # build the ground-truth array
    unique_test, counts_test = np.unique(Y_test, return_counts=True)
    print("truth=", dict(zip(unique_test, counts_test)))

    predictions = classifier.predict(x_test)  # csr_matrix at this point
    predictions = predictions.toarray()
    # does it contain zeros??
    unique, counts = np.unique(predictions, return_counts=True)
    print("preditions=", dict(zip(unique, counts)))
    from sklearn.metrics import accuracy_score
    score = accuracy_score(Y_test, predictions)
    print(score)
# NOTE(review): this fragment continues a script where classifier_binary,
# X_train/X_test and y_train/y_test are defined earlier (not shown here).
# predict for Binary Relevance
predictions_binary = classifier_binary.predict(X_test)

#Hamming Loss for Binary Relevance
hamm_loss_binary = hamming_loss(y_test, predictions_binary)

print("Hamming Loss:", hamm_loss_binary)

print("\n\n\nTraining data with Classifier Chains using Gaussian Naive Bayes")

#initialize Classifier Chains multi-label classifier
#with a gaussian naive bayes base classifier
classifier_cc = ClassifierChain(GaussianNB())

# train for Classifier Chaines
classifier_cc.fit(X_train, y_train)

# predict for Classifier Chains
predictions_cc = classifier_cc.predict(X_test)

#Hamming Loss for Classifier Chaines (lower is better)
hamm_loss_cc = hamming_loss(y_test, predictions_cc)

print("Hamming Loss:", hamm_loss_cc)

print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes")

#initialize Label Powerset multi-label classifier
#with a gaussian naive bayes base classifier
classifier_lp = LabelPowerset(GaussianNB())
コード例 #24
0
ファイル: main.py プロジェクト: rsmnnit/TaggingSOQuestions-1
# NOTE(review): `train`, `test`, X_train_idf and X_test_idf are built
# earlier in this script (not shown here); labels start at column 4.
Y_train = train.iloc[:,4:].values
Y_test = test.iloc[:,4:].values

print (Y_test)




"""
Naive Bayes Classifier
"""
#naiveBayes = GaussianNB()

# Classifier chain with a Gaussian NB base; trained on the TF-IDF matrix.
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train_idf,Y_train)
predictions = classifier.predict(X_test_idf)
# Subset accuracy: a sample counts only if ALL its labels are correct.
print (accuracy_score(Y_test,predictions))


"""
Get training and test dataset
"""

"""
naiveBayes.fit(X_train_idf,Y_train[:,97:98].flatten())
y_pred = naiveBayes.predict(X_test_idf)
"""


コード例 #25
0
ファイル: script2.py プロジェクト: resendevinicius/Mestrado
# NOTE(review): Python 2 syntax (print statements). The commented block
# below is an earlier MLkNN grid-search over k, kept for reference.
#id = 0.0
#for i in range (1, 20) :
#	classifier = MLkNN(k = i)
#	prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
#	if (1 - metrics.hamming_loss(prediction, test['y']) > best) :
#		best = 1 - metrics.hamming_loss(prediction, test['y'])
#		id = i;
#
#classifier = MLkNN(k = id)
#prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
#print classifier
#print 'Subset Accuracy: ', metrics.accuracy_score(prediction, test['y'])
#print 'Hamming Loss: ', metrics.hamming_loss(prediction, test['y'])
#print 'Accuracy: ', 1 - metrics.hamming_loss(prediction, test['y'])

# Classifier chain with an SVC base: fit on the training dict, evaluate on test.
classifier = ClassifierChain(SVC())
prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
print '------------------------------------------'
print classifier
print 'Subset Accuracy: ', metrics.accuracy_score(prediction, test['y'])
print 'Hamming Loss: ', metrics.hamming_loss(prediction, test['y'])
print 'Accuracy: ', 1 - metrics.hamming_loss(prediction, test['y'])

# Binary relevance with the same SVC base, for comparison.
classifier = BinaryRelevance(SVC())
prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
print '------------------------------------------'
print classifier
print 'Subset Accuracy: ', metrics.accuracy_score(prediction, test['y'])
print 'Hamming Loss: ', metrics.hamming_loss(prediction, test['y'])
print 'Accuracy: ', 1 - metrics.hamming_loss(prediction, test['y'])
コード例 #26
0
class MultiLabelClassifier(object):
    """Multi-label tag prediction over binary bag-of-words features.

    Loads data/cleaned_data.csv, builds 0/1 word- and tag-indicator
    frames, and exposes several back-ends: per-tag BernoulliNB, a
    skmultilearn ClassifierChain, decision-tree / random-forest
    regressors, and a per-tag linear SVM.
    """

    def __init__(self):
        """Load the cleaned CSV, split train/test, and build estimators."""
        self.total_data_df = pd.read_csv(os.path.join("data",
                                                      "cleaned_data.csv"),
                                         encoding="ISO-8859-1")
        # Drop rows with no tags — they cannot supervise training.
        self.data_df = self.total_data_df[~self.total_data_df.Tags.isnull()]
        self.total_records = len(self.data_df.index)
        # NOTE(review): tail 67% + head 23% leaves ~10% of rows unused and
        # is not a random split — confirm this is intentional.
        self.train_df = self.data_df.tail(int(self.total_records * .67))
        self.test_df = self.data_df.head(int(self.total_records * .23))
        self.total_tag_list = self.get_tag_list()
        self.total_word_list = self.get_word_list()
        self.modified_train_df = pd.DataFrame()
        self.modified_test_df = pd.DataFrame()
        self.classifier = BernoulliNB()
        self.classifier_multilabel = ClassifierChain(BernoulliNB())
        self.classifier_dt = DecisionTreeRegressor(max_depth=2000)
        self.classifier_random_forest = RandomForestRegressor(max_depth=100)
        self.classifier_svm = svm.SVC(kernel='linear')

        # Ground-truth tag indicators for the test split (filled later).
        self.test_tags = pd.DataFrame()

    def get_tag_list(self):
        """Return the sorted set of tags seen in the training rows."""
        tag_set = set()
        for tags in self.train_df.Tags:
            if tags is not nan:
                tag_set.update(tags.split(','))
        return sorted(list(tag_set))

    def get_word_list(self):
        """Return the sorted vocabulary of stemmed words in training rows."""
        word_set = set()
        for words in self.train_df.stemmed_words:
            if words is not nan:
                word_set.update(words.split(' '))
        return sorted(list(word_set))

    def setup_data_frame(self):
        """Build 0/1 indicator columns for every word (train and test)
        and every tag (train and test_tags); returns the train frame."""
        for each in self.total_word_list:
            self.modified_train_df[each] = pd.Series([
                1 if each in words.split(' ') else 0
                for words in self.train_df.stemmed_words
            ],
                                                     index=self.train_df.index)
            self.modified_test_df[each] = pd.Series([
                1 if each in words.split(' ') else 0
                for words in self.test_df.stemmed_words
            ],
                                                    index=self.test_df.index)
        for tag in self.total_tag_list:
            self.modified_train_df[tag] = pd.Series([
                1 if tag in tags.split(',') else 0
                for tags in self.train_df.Tags
            ],
                                                    index=self.train_df.index)
            self.test_tags[tag] = pd.Series([
                1 if tag in tags.split(',') else 0
                for tags in self.test_df.Tags
            ],
                                            index=self.test_df.index)
        # NOTE(review): the PCA is fit but its result is discarded.
        pca = PCA(n_components=966)
        principal = pca.fit(self.modified_train_df)
        # self.modified_train_df = principal
        return self.modified_train_df

    def multi_label_naive_bayes_classifier(self):
        """Fit one BernoulliNB per tag and accumulate predicted labels
        into modified_test_df['predicted_labels']."""
        test_rows = self.modified_test_df.values
        self.modified_test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.classifier.fit(
                self.modified_train_df[self.total_word_list].values,
                self.modified_train_df[tag].tolist())
            self.modified_test_df[tag] = pd.Series(
                self.classifier.predict(test_rows),
                index=self.modified_test_df.index)
            self.modified_test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each
                    for each, value in zip(
                        self.modified_test_df.predicted_labels,
                        # NOTE(review): `.tag` looks up a literal column
                        # named "tag"; this likely should be
                        # self.modified_test_df[tag] — verify.
                        self.modified_test_df.tag)
                ],
                index=self.modified_test_df.index)

    def multi_label_naive_bayes_classifier_sklearn(self):
        """Fit the ClassifierChain(BernoulliNB) on all tags at once and
        print the subset-accuracy against test_tags."""
        test_rows = self.modified_test_df.values
        self.classifier_multilabel.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        c = self.classifier_multilabel.predict(test_rows)

        print(c.shape)
        print(sps.csc_matrix(self.test_tags.values).shape)
        print(accuracy_score(sps.csc_matrix(self.test_tags.values), c))

    def multi_label_decision_tree_regressor(self):
        """Regress all tag indicators with a decision tree and write the
        1-valued predictions to data/decision_tree_result.csv."""
        test_rows = self.modified_test_df.values
        self.classifier_dt.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_dt.predict(test_rows)
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each for each, value in
                    zip(self.test_df.predicted_labels, temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data",
                                             "decision_tree_result.csv"),
                                index=False)

    def multi_label_random_forest(self):
        """Same as the decision-tree variant but with a random forest;
        writes data/random_forest_result.csv."""
        test_rows = self.modified_test_df.values
        self.classifier_random_forest.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_random_forest.predict(test_rows)
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each for each, value in
                    zip(self.test_df.predicted_labels, temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data",
                                             "random_forest_result.csv"),
                                index=False)

    def multi_label_svm(self):
        """Fit one linear SVC per tag column and write the accumulated
        predicted labels to data/linear_svm.csv."""
        test_rows = self.modified_test_df.values
        tags = array(self.modified_train_df[self.total_tag_list])
        temp_df = pd.DataFrame()
        for col in range(tags.shape[1]):
            self.classifier_svm.fit(
                self.modified_train_df[self.total_word_list].values, tags[:,
                                                                          col])
            predictions = self.classifier_svm.predict(test_rows)
            # NOTE(review): this Series has a default 0..n index while
            # test_df uses the original index — confirm alignment below.
            temp_df[self.total_tag_list[col]] = pd.Series(predictions)
        #temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each for each, value in
                    zip(self.test_df.predicted_labels, temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels'
                      ]].to_csv(os.path.join("data", "linear_svm.csv"),
                                index=False)
コード例 #27
0
# NOTE(review): train_data is read earlier in this script (not shown here).
test_data = pd.read_csv('.data/test-data.dat', delimiter='\n', header=None)
train_labels = pd.read_csv('.data/train-label.dat', sep=' ', header=None)
test_labels = pd.read_csv('.data/test-label.dat', sep=' ', header=None)

#replace <D> with nothing from data
# NOTE(review): relies on the old pandas default of regex=True for
# str.replace — pass regex=True explicitly on modern pandas.
train_data = train_data.iloc[:, 0].str.replace('<\d+>', '')
test_data = test_data.iloc[:, 0].str.replace('<\d+>', '')

#count the frequency of every word in vocabulary in each document
vectorizer = CountVectorizer()
train_data_vector = vectorizer.fit_transform(train_data)
test_data_vector = vectorizer.transform(test_data)

#train the classifier
model = ClassifierChain(RandomForestClassifier(n_jobs=-1, verbose=1))
model.fit(train_data_vector, train_labels)

#test the classifier
predicted_labels = model.predict(test_data_vector)
predicted_labels_train = model.predict(train_data_vector)
predicted_probabilities = model.predict_proba(test_data_vector)

#test accuracy (subset accuracy: every label must match)
#~7% with random forest and binary relevance
#~7% with random forest and classifier chain
#~5% with random forest and label powerset
#~4% with multilabel knn
test_acc = accuracy_score(test_labels, predicted_labels)
train_acc = accuracy_score(train_labels, predicted_labels_train)
test_hamm_loss = hamming_loss(test_labels, predicted_labels)
test_cov_err = coverage_error(test_labels, predicted_probabilities.toarray())
# In[68]:

# NOTE(review): notebook export; log_classifier, x_train/x_test and
# y_train/y_test are defined in earlier cells (not shown here).
log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test,
                    log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new Classifier uses the predictions of all previous classifiers
# * This way the correlation b/w labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

chain.fit(x_train, y_train)
print('Accuracy_score using ClassifierChain is ',
      round(accuracy_score(y_test, chain.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using ClassifierChain is ',
      roc_auc_score(y_test,
                    chain.predict_proba(x_test).toarray()))
コード例 #29
0
    res = res / y_pred.shape[0]

    return np.round(res, 2)


# Hand-rolled classifier chain: one logistic regression per label,
# each trained on the features plus all preceding label columns.
# NOTE(review): X, Y, `data` and `logs` come from earlier in this script.
for i in range(5):
    log = LogisticRegression()
    log.fit(np.hstack((X, Y[:, 0:i])), Y[:, i])  # append the previous labels as extra features
    logs.append(log)

results = []
for i in range(5):
    # NOTE(review): prediction feeds the TRUE previous labels, not the
    # predicted ones — confirm this is the intended teacher-forcing.
    res = logs[i].predict(np.hstack((X, Y[:, 0:i])))
    results.append(res)

fres = []
for i in range(len(results[0])):
    a = [
        results[0][i], results[1][i], results[2][i], results[3][i],
        results[4][i]
    ]
    fres.append(a)

fres = np.matrix(fres)
print(accuracy_score(fres, Y))
test = datasets.make_multilabel_classification()
# cross-check against the library's ready-made ClassifierChain
cl = ClassifierChain(LogisticRegression())
cl.fit(data[0], data[1])
pred = cl.predict(test[0])
print(accuracy_score(pred, test[1]))
コード例 #30
0
class Multi_labeling:
    """Train a multi-label classifier chain and predict with it.

    Constructor arguments are stored verbatim; the model itself is built
    lazily by :meth:`classify` and used by :meth:`pred_all_other`.
    """

    def __init__(self, label_dict, train_labels, train_data, test_labels, test_data):
        # label_dict: mapping of label metadata (kept for callers; unused here)
        self.label_dict = label_dict
        self.train_labels = train_labels
        self.train_data = train_data
        self.test_labels = test_labels
        self.test_data = test_data

    def classify(self):
        """Fit a ClassifierChain(LinearSVC) on the training split.

        Stores the fitted chain on ``self.cc`` for :meth:`pred_all_other`.
        Returns ``None``.

        NOTE: the previous version also trained a KNeighborsClassifier and a
        OneVsRestClassifier(SGDClassifier) whose fitted models were local
        variables that were never stored or used -- that dead computation has
        been removed; observable behavior is unchanged.
        """
        # Imported lazily so constructing the object does not require
        # skmultilearn/sklearn to be installed.
        from skmultilearn.problem_transform import ClassifierChain
        from sklearn.svm import LinearSVC

        self.cc = ClassifierChain(LinearSVC())
        self.cc.fit(self.train_data, self.train_labels)

    def pred_all_other(self, input_data):
        """Predict labels for *input_data* using the chain fitted by classify()."""
        return self.cc.predict(input_data)
コード例 #31
0
        # NOTE(review): fragment of a larger function -- the enclosing `def`,
        # plus x_train/x_test (DataFrames), y_train/y_test, `classifier`,
        # `categories` and `category`, are defined before this excerpt.
        x_train = x_train['Documentation Text'].tolist()
        x_test = x_test['Documentation Text'].tolist()
        y_train = y_train.values
        y_test = y_test.values

        #n-gram
        #tfidf = TfidfVectorizer(ngram_range = (1,1), stop_words = 'english')
        #tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,1), norm='l2')
        # Bag-of-words counts; the TfidfVectorizer variants above were alternatives.
        tfidf = CountVectorizer()
        tfidf.fit(x_train)
        x_train = tfidf.transform(x_train)
        x_test = tfidf.transform(x_test)

        # train
        #classifier = BinaryRelevance(GaussianNB())
        classifier.fit(x_train, y_train)

        # predict
        predictions = classifier.predict(x_test)

        # `predictions` is a sparse matrix; .A densifies each row to a flat list
        y_pred = []
        for i in predictions:
            y_pred.append(list(i.A[0]))

        #print(y_pred)
        y_test = y_test.tolist()
        #print(len(y_test))
        # Wrap predictions and ground truth as DataFrames keyed by label name
        y_pred_dataframe = pd.DataFrame(y_pred, columns=categories)
        y_test_dataframe = pd.DataFrame(y_test, columns=categories)
        #print(len(y_pred_dataframe))
        this_pred_list = y_pred_dataframe[category].tolist()
コード例 #32
0
ファイル: articles_classifier.py プロジェクト: x0rzkov/zodiac
class ArticleClassifier(ClassifierMixin):
    """Multi-label article classifier: TF-IDF features + a chain of SVCs.

    It has been trained on the octo-articles dataset; retrain via :meth:`fit`.
    """

    def __init__(self, ngram=(1, 3), tokenizer=prepareText, max_feature=20000):
        """
        :parameter
        ----------
            :param ngram {tuple}:
                    default '(1,3)'  ngram_range for the TfidfVectorizer
            :param tokenizer {func}:
                    tokenizer used by the TfidfVectorizer to prepare the data
            :param max_feature {int}:
                    limit the vocabulary to the 'max_feature' most important terms
        """
        self.vectorizer_ = TfidfVectorizer(strip_accents='unicode',
                                           analyzer='word',
                                           ngram_range=ngram,
                                           norm='l2',
                                           tokenizer=tokenizer,
                                           max_features=max_feature)

    def fit(self, X, y):
        """
        Fit the model to the data (train the classifier).
        Note: you should clean all texts (e.g. zodiac.cleaner.TextCleaner)
        before fitting.

        :parameter
        ----------
            :param X: (list)
                list of clean texts
            :param y: (numpy.array)
                array of labels
        """
        self.x_vec_ = self.vectorizer_.fit_transform(X)
        # one binary SVC per label; probability=True so predict_proba works
        self.classifier_ = ClassifierChain(SVC(probability=True))
        self.classifier_.fit(self.x_vec_, y)

    def score(self, X, y, average='samples', threshold=0.5):
        """
        Compute the Jaccard score of the thresholded probability predictions.

        :parameter
        -----------
            :param X (list):
                list of texts
            :param y (list):
                true labels
            :param average:
                default 'samples'; passed through to jaccard_score
            :param threshold (float):
                probability cutoff for a positive label
        :return:
        -------
            score : float
                Jaccard score
        """
        self.x_test_vec_ = self.vectorizer_.transform(X)
        predictions = self.classifier_.predict_proba(self.x_test_vec_)
        return jaccard_score(y, predictions >= threshold, average=average)

    def show_stats(self, x_test, y):
        """
        Compute the Jaccard score for several thresholds, store them in
        ``self.jaccard_scores_threshold_df_`` and build the plotly scatter
        of score vs. threshold.

        :parameter
        ----------
            :param x_test: (list)
                list of texts
            :param y:
                list of labels
        """
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        x_test_vec = self.vectorizer_.transform(x_test)
        predictions_probas = self.classifier_.predict_proba(x_test_vec)
        jaccard_scores = []
        for threshold in thresholds:
            ensemble_jaccard_score = jaccard_score(
                y, predictions_probas >= threshold, average='samples')
            jaccard_scores.append(ensemble_jaccard_score)
        self.jaccard_scores_threshold_df_ = pd.DataFrame({
            'threshold': thresholds,
            'jaccard_score': jaccard_scores
        })
        # Plot moved here from save_weights(): it belongs with the stats and
        # previously crashed save_weights() when show_stats() had not run.
        px.scatter(self.jaccard_scores_threshold_df_,
                   x='threshold',
                   y='jaccard_score',
                   color='threshold',
                   title='Jaccard score depending on threshold')

    def load_weights(self, path):
        """
        Load the model weights from *path*.

        BUGFIX: the loaded estimator is now assigned to ``self.classifier_``;
        previously the return value of joblib.load was discarded, making this
        method a no-op.

        :parameter
        ---
        :param path {str}:
            path to the model weights
        """
        self.classifier_ = joblib.load(path)

    def save_weights(self, path):
        """
        Persist the fitted classifier to *path*.

        :parameter
        ----------
            :param path {str}:
                    path where the classifier weights are stored
        """
        joblib.dump(self.classifier_, path)