Example #1
def train_linearsvc(X_train, Y_train):

    model_bow = BinaryRelevance(classifier=LinearSVC(C=0.5, tol=0.2))

    model_bow.fit(X_train, Y_train)

    return model_bow
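A minimal usage sketch for the helper above, assuming the imports it relies on; the toy arrays are illustrative only:

import numpy as np
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import BinaryRelevance

X_train = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
Y_train = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])  # one indicator column per label

model = train_linearsvc(X_train, Y_train)
print(model.predict(X_train).toarray())  # skmultilearn predictions come back sparse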
Example #2
 def __init__(self, task, modelName):
     self.task = task
     # create the model
     if task == 0:
         if modelName == "NB":
             self.model = GaussianNB()
         elif modelName == "KNN":
             self.model = KNeighborsClassifier()
         elif modelName == "SVM":
             self.model = SVC()
         elif modelName == "C45":
             self.model = tree.DecisionTreeClassifier()
         # add a naive Bayes variant suited to multi-label data
         elif modelName == "MultiLabelNB":
             self.model = BinaryRelevance(GaussianNB())
         elif modelName == "MultiLabelSVM":
             self.model = BinaryRelevance(SVC())
         else:
             print("YOU CHOSE WRONG MODEL FOR CLASSIFICATION!")
             self.model = GaussianNB()  # fall back to a default, mirroring the regression branch
     else:
         if modelName == "LR":
             self.model = linear_model.LinearRegression()
         elif modelName == "M5":
             self.model = tree.DecisionTreeRegressor()
         elif modelName == "KNN":
             self.model = KNeighborsRegressor()
         else:
             print("YOU CHOSE WRONG MODEL FOR REGRESSION!")
             self.model = linear_model.LinearRegression()
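The branches above rely on several imports; a sketch of the module header they imply (module paths per scikit-learn and scikit-multilearn):

from sklearn import linear_model, tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance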
Example #3
def binary(X_train, X_test, y_train, y_test):

    print("Binary Relevance")
    model = BinaryRelevance(classifier=SVC(),
                            require_dense=[True, True]).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    hamming = hamming_loss(y_test, y_pred)
    subset_accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='micro')
    precision = precision_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')
    coverage = coverage_error(y_test, y_pred.toarray())
    aps = label_ranking_average_precision_score(y_test, y_pred.toarray())
    rankingloss = label_ranking_loss(y_test, y_pred.toarray())
    print("Hamming: " + str(hamming))
    print("Subset Accuracy: " + str(subset_accuracy))
    print("Recall: " + str(recall))
    print("Precision: " + str(precision))
    print("F1: " + str(f1))
    print("Coverage error: " + str(coverage))
    print("Average Precision Score: " + str(aps))
    print("Ranking Loss: " + str(rankingloss))
    print("\n")

    return hamming, subset_accuracy, recall, precision, f1, coverage, aps, rankingloss
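Note that coverage_error, label_ranking_average_precision_score, and label_ranking_loss are defined over continuous scores, so feeding them hard 0/1 predictions (as above) collapses the ranking. A sketch of the score-based variant, assuming the same data and an SVC refit with probability=True so that predict_proba is available:

from sklearn.svm import SVC
from sklearn.metrics import (coverage_error,
                             label_ranking_average_precision_score,
                             label_ranking_loss)
from skmultilearn.problem_transform import BinaryRelevance

scorer = BinaryRelevance(classifier=SVC(probability=True),
                         require_dense=[True, True]).fit(X_train, y_train)
y_score = scorer.predict_proba(X_test).toarray()  # dense (n_samples, n_labels) scores

print("Coverage error:", coverage_error(y_test, y_score))
print("Ranking AP:", label_ranking_average_precision_score(y_test, y_score))
print("Ranking loss:", label_ranking_loss(y_test, y_score))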
Example #4
def select_features(data_train, data_test):
    x_train = data_train.iloc[:, :NO_FEATURES]
    y_train = data_train.iloc[:, NO_FEATURES:-1]
    # densify via a sparse round-trip (equivalent to converting the frames to arrays)
    x_train_sp = lil_matrix(x_train).toarray()
    y_train_sp = lil_matrix(y_train).toarray()

    forest = ExtraTreesClassifier(n_estimators=100, random_state=SEED_NUMBER)
    classifier = BinaryRelevance(forest)
    classifier.fit(x_train_sp, y_train_sp)
    feature_scores = [
        forest.feature_importances_ for forest in classifier.classifiers_
    ]

    indices = [argsort(importance)[::-1] for importance in feature_scores]
    selected_per_class = [
        index[:int(0.1 * NO_FEATURES)].tolist() for index in indices
    ]
    selected_union = list(set().union(*selected_per_class))

    # ranking by importance averaged across the per-label forests (computed but unused below)
    avg_feature_scores = mean(feature_scores, axis=0)
    avg_indices = argsort(avg_feature_scores)[::-1]
    selected_avg = avg_indices[:int(0.1 * NO_FEATURES)]
    drop_col = [idx for idx in range(NO_FEATURES) if idx not in selected_union]

    train_red = data_train.drop(data_train.columns[drop_col], axis=1)
    test_red = data_test.drop(data_test.columns[drop_col], axis=1)
    return train_red, test_red, len(selected_union)
Example #5
class BinaryRelevancesSimple:
    def __init__(self, model):
        # self.params = {
        #     # 'num_class': num_class,
        #     # "boosting_type": "gbdt",
        #     "objective": "binary",
        #     "metric": 'None',
        #     "learning_rate": 0.05,
        #     "verbosity": 1,
        #     "seed": 888,
        #     "num_threads": NUM_THREAD
        # }

        self.model = BinaryRelevance(LGBMClassifier())
        if model == 'RF':
            self.model = BinaryRelevance(RandomForestClassifier(n_estimators=200, max_depth=12))

#     def set_grow_step(self, new_step):
#         self.grow_boost_round = new_step
     
    def fit(self, X_train, y_train):
        print('### start training...')
        start = time.time()
        self.model.fit(X_train, y_train)
        print('#### training time:', time.time() - start)
 
    def predict_proba(self, X_test):
        return self.model.predict_proba(X_test).A  # .A densifies the sparse probability matrix
Example #6
def buildBRClassifier(xTrain, yTrain):
    # initialize binary relevance multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())

    # train
    classifier.fit(xTrain, yTrain)
    return classifier
Example #7
def train(X, y):
    classifier = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    print("before train")
    classifier.fit(X_train, y_train)
    print("train over begin predict")
    predictions = classifier.predict(X_test)
    # hamming_loss accepts the sparse prediction matrix directly
    print("validation accuracy (1 - Hamming loss): {}".format(
        1 - hamming_loss(y_test, predictions)))
Example #8
    def train(self):
        classifier = BinaryRelevance(GaussianNB())
        classifier.fit(self.x_data, self.y_data)
        predictions = classifier.predict(self.x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #9
def binRel(X_train, X_test, y_test, y_train):
    # initialize binary relevance multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    # train
    classifier.fit(X_train, y_train)
    # predict
    predictions = classifier.predict(X_test)
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))
Example #10
def BN_fit(clfs, X_train, y_train, X_test, y_test, evaluate):
    metrics_lb = {}
    for key, clf in clfs.items():
        print('Fitting BinaryRelevance with Classifier : %s' % key)
        clf = BinaryRelevance(clf)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        for m in evaluate:
            metrics_lb[key + ' ' + m] = scores(m, y_test, preds)
    return metrics_lb
Example #11
 def __init__(self):
     # self.params = {
     #     # 'num_class': num_class,
     #     # "boosting_type": "gbdt",
     #     "objective": "binary",
     #     "metric": 'None',
     #     "learning_rate": 0.05,
     #     "verbosity": 1,
     #     "seed": 888,
     #     "num_threads": NUM_THREAD
     # }
     self.model = BinaryRelevance(LGBMClassifier())
Example #12
def formDataMultiLabel(sampData, df):
    
    ####### Convert Response Category into Labelized Binarizer ################
    manActData = df.label_num.unique()
    lb = preprocessing.LabelBinarizer()    
    lb.fit(manActData)                       
    
    tfdLabelNum = lb.transform(df.label_num)
    
    ####### Convert Next Action Category into Labelized Binarizer #############
    nxtActData = df.sec_label_num.unique()
    lb = preprocessing.LabelBinarizer()    
    lb.fit(nxtActData)
    
    tfdSecLabelNum = lb.transform(df.sec_label_num)
       
    ####### Convert RPA Input Data into Labelized Binarizer ##################
    inpRPAData     = (df.inp_Data).astype(str)
    inpRPAData     = inpRPAData.apply(lambda x: x.split()[0])
    lab, lev = pd.factorize(inpRPAData)
    
    lb = preprocessing.LabelBinarizer()    
    lb.fit(np.unique(lab))
    
    tfdInpRPAData = lb.transform(lab)
    #print (np.unique(tfdInpRPAData))
    
    #This concatenation is the actual process
    #conCatData    = np.concatenate((tfdLabelNum, tfdSecLabelNum, tfdInpRPAData), axis=1)
    
    ####### Build Multi-Label Prediction Model  ###############################
    respTrain, respTest, labTrain, labTest = train_test_split(sampData, tfdSecLabelNum, random_state=1)

    TR  = tree.DecisionTreeClassifier(criterion = "gini", max_depth=100, min_samples_leaf=2) 
    GNB = GaussianNB()
    RF  = RandomForestClassifier(n_estimators = 100)
    
    classifier = BinaryRelevance(GNB)
    #classifier = ClassifierChain(TR)
    #classifier = LabelPowerset(RF)
    
    vect = TfidfVectorizer(min_df=1, max_df=1.0, stop_words='english')
    respTrainVec = vect.fit_transform(respTrain)
    
    respTestVec = vect.transform(respTest)
    
    classifier.fit(respTrainVec, labTrain)
    predictions = classifier.predict(respTestVec)
    acc = metrics.accuracy_score(labTest, predictions)
    print(acc)
    
    return lab
Example #13
def main():
    bibtex = sci.loadmat(r'D:\课程作业\机器学习\机器学习课程设计\dataset\bibtex.mat')
    medical = sci.loadmat(r'D:\课程作业\机器学习\机器学习课程设计\dataset\medical.mat')
    bib_X = bibtex['data']  # 7395 x 1836
    bib_y = bibtex['target']  # 159 x 7395
    med_X = medical['data']  # 978 x 1449
    med_y = medical['target']  # 45 x 978
    scaler = MinMaxScaler()
    scaler.fit(bib_X)
    bib_X = scaler.transform(bib_X)
    scaler = MinMaxScaler()
    scaler.fit(med_X)
    med_X = scaler.transform(med_X)

    f1_scores = []
    penalties = ['l1', 'l2']
    for penalty in penalties:
        clf = BinaryRelevance(
            LogisticRegression(penalty=penalty, solver='liblinear', dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    for penalty in penalties:
        clf = BinaryRelevance(LinearSVC(penalty=penalty, dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    table = PrettyTable(["", "log", "hinge"])
    table.padding_width = 1
    table.add_row(["l1", f1_scores[0], f1_scores[2]])
    table.add_row(["l2", f1_scores[1], f1_scores[3]])
    csfs = CSFS(u=0.1)
    W, b = csfs.fit(med_X.T, med_y.T, u=0.1)
    pred_full = csfs.predict(med_X.T, W, b)
    new_y = np.zeros(med_y.shape)
    size = int(med_y.shape[1] * 0.7)
    new_y[:, :size] = med_y[:, :size]
    smile = SMILE(alpha=0.1)
    smile.fit(med_X.T, new_y)
    pred_smile = smile.predict(med_X.T)
    csfs2 = CSFS(u=0.1)
    W2, b2 = csfs2.fit(med_X.T, new_y.T, u=0.1)
    pred_partial = csfs2.predict(med_X.T, W2, b2)
    print('CSFS (full labels) f1:',
          metrics.f1_score(med_y.T, pred_full, average='samples'))
    print('CSFS (partial labels) f1:',
          metrics.f1_score(med_y.T, pred_partial, average='samples'))
    print('SMILE f1:',
          metrics.f1_score(med_y.T, pred_smile, average='samples'))
    print(table)
Example #14
    def fit(self, X, y):

        # using an SVC base classifier with probability estimates enabled
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])
        #self.BinaryRelevanceObject = BinaryRelevance()

        # fit the data once
        self.BinaryRelevanceObject.fit(X, y)

        # the classifiers for each label
        self.classifiers = self.BinaryRelevanceObject.classifiers_

        return self.BinaryRelevanceObject
Example #15
    def train_model(self, train):
        data = [self.get_value(s, False) for s in train]

        X_train = np.array([d[0] for d in data])
        y_train = np.array([d[1] for d in data])

        model = BinaryRelevance(classifier=SVC(probability=True,
                                               class_weight='balanced',
                                               break_ties=True),
                                require_dense=[False, True])
        # model = BinaryRelevance(classifier=LogisticRegression(class_weight='balanced', solver='lbfgs'), require_dense=[False, True])

        model.fit(X_train, y_train)

        return model
Example #16
def classifiers(X_train, Y_train, X_test):

    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())

    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)

    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)

    return predictions1, predictions2, predictions3
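A follow-up evaluation sketch comparing the three transformation strategies; Y_test is assumed to be available alongside X_test (it is not part of the function's signature above):

from sklearn.metrics import accuracy_score, hamming_loss

predictions1, predictions2, predictions3 = classifiers(X_train, Y_train, X_test)
for name, preds in [("Binary Relevance", predictions1),
                    ("Classifier Chain", predictions2),
                    ("Label Powerset", predictions3)]:
    print(name,
          "| subset accuracy:", accuracy_score(Y_test, preds),
          "| Hamming loss:", hamming_loss(Y_test, preds))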
Example #17
    def __init__(self, model):
        # self.params = {
        #     # 'num_class': num_class,
        #     # "boosting_type": "gbdt",
        #     "objective": "binary",
        #     "metric": 'None',
        #     "learning_rate": 0.05,
        #     "verbosity": 1,
        #     "seed": 888,
        #     "num_threads": NUM_THREAD
        # }

        self.model = BinaryRelevance(LGBMClassifier())
        if model == 'RF':
            self.model = BinaryRelevance(RandomForestClassifier(n_estimators=200, max_depth=12))
Example #18
        def test_if_dense_classification_works_on_non_dense_base_classifier(
                self):
            classifier = BinaryRelevance(classifier=Keras(
                create_model_single_class, False, KERAS_PARAMS),
                                         require_dense=[True, True])

            self.assertClassifierWorksWithSparsity(classifier, 'dense')
Example #19
def multiLabel_SKLearn_GaussianNBayes(rData, lData, sData):

    xData = rData.values
    # stack the two label columns side by side: shape (n_samples, 2)
    yData = np.column_stack([lData.values, sData.values])
        
    respTrain, respTest, labTrain, labTest = train_test_split(xData, yData, random_state=1)    
    
    classifier = BinaryRelevance(GaussianNB())
    #classifier = ClassifierChain(GaussianNB())
    #classifier = LabelPowerset(GaussianNB())
    
    classifier.fit(respTrain, labTrain)
    predictions = classifier.predict(respTest)
    acc = accuracy_score(labTest, predictions)
    
    return acc
Example #20
    def RecommendByBinaryRelevance(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """使用多标签问题的 二值相关 """
        classifier = BinaryRelevance(RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20))
        classifier.fit(train_data, train_data_y)

        predictions = classifier.predict_proba(test_data)
        predictions = predictions.toarray()  # densify the sparse probability matrix

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
Example #21
    def binary_relevance(self):
        '''Name: Binary Relevance
           Main Idea: Divide multi-classify into multi binary classfier
           Evaluation Metric: accuracy_score
        '''
        print(self.X_train)
        print(self.y_train)
        classifier = BinaryRelevance(GaussianNB())
        classifier.fit(self.X_train, self.y_train)

        predictions = classifier.predict(self.X_test)
        print(predictions)
        #print(y_test)
        #print("predictions:\n",predictions)

        result = accuracy_score(self.y_test, predictions)

        print(result)
Example #22
 def __init__(
         self,
         rdm_state=84,
         params={"estimator__C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
         niterations=5):
     self.model = BinaryRelevance(
         LogisticRegression(random_state=rdm_state))
     self.params = params
     self.niterations = niterations
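Note that skmultilearn's BinaryRelevance names its base-estimator parameter classifier, so with this wrapper the grid keys would take the classifier__ prefix (the estimator__ prefix above matches wrappers such as sklearn's OneVsRestClassifier). A tuning sketch under that assumption, with X_train/y_train taken from the surrounding pipeline:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from skmultilearn.problem_transform import BinaryRelevance

grid = GridSearchCV(
    BinaryRelevance(classifier=LogisticRegression(random_state=84)),
    param_grid={"classifier__C": [0.1, 1.0, 10.0, 100.0]},
    scoring="f1_micro",
    cv=3,
)
grid.fit(X_train, y_train)
print(grid.best_params_)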
Example #23
def BinaryRelevance_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> Binary Relevance method
    :param X_train: input data
    :param y_train: corresponding label data
    :param samples_leaf: min_samples_leaf for the base decision tree
    :param samples_split: min_samples_split for the base decision tree
    :return: fitted classifier, or None on failure
    """
    try:
        classifier = BinaryRelevance(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        classifier.fit(X_train, y_train)

        return classifier
    except Exception as e:
        print("warning----二元关联|BinaryRelevance_method----" + str(e))

    return None
Example #24
class MyBinaryRelevanceFeatureSelect():
    def fit(self, X, y):

        # using an SVC base classifier with probability estimates enabled
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])
        #self.BinaryRelevanceObject = BinaryRelevance()

        # fit the data once
        self.BinaryRelevanceObject.fit(X, y)

        # the classifiers for each label
        self.classifiers = self.BinaryRelevanceObject.classifiers_

        return self.BinaryRelevanceObject

#     def partition(self):
#         return self.BinaryRelevanceObject.partition_#BinaryRelevanceObject

#     def model_count(self):
#         return self.BinaryRelevanceObject.model_count_

    def predict(self, X, y=None):
        return self.BinaryRelevanceObject.predict(X)

    def predict_proba(self, X):
        return self.BinaryRelevanceObject.predict_proba(X)


#    def feature_select(self, X, y, transformer):
#        transformer.fit(X, y)
#        selected_attributes_indices = transformer.get_support(indices = True)
#
#        return selected_attributes_indices
#
#    def sets_of_selected_features(self, X, predictions, classifier, transformer ): #X is the df with the predictions
#        selected_features_array = []
#
#        for i in predictions:
#            indices_features_selected = classifier.feature_select(X, predictions[i], transformer)
#            selected_features_array.append(indices_features_selected)
#
#        return selected_features_array
Example #25
    def get_train_test_lda(self, topic):

        # get training set
        dataset = arff.load(open(os.path.join(dir, "medical-train.arff")), encode_nominal=True)
        dataset = np.array(dataset.get("data"))

        X_train = dataset[:, :-num_label]
        y_train = dataset[:, -num_label:]

        # get test set
        dataset = arff.load(open(os.path.join(dir, "medical-test.arff")), encode_nominal=True)
        dataset = np.array(dataset.get("data"))

        X_test = dataset[:, :-num_label]
        y_test = dataset[:, -num_label:]

        for k in topic:
            X_iter = X_train.astype(np.int64)

            # get training_data feature topics
            model = lda.LDA(n_topics=k, n_iter=1000)
            model.fit(X_iter)
            doc_topic_x = model.doc_topic_

            # get training data label topics
            model_label = lda.LDA(n_topics=k, n_iter=1000)
            model_label.fit(y_train)
            doc_topic_y = model_label.doc_topic_

            # concat feature-topic and label topic
            x = np.hstack((doc_topic_x, doc_topic_y))

            # discretize the topics
            x = self.discretization_doc_topic(x)
            X_train = np.hstack((X_train, x))

            # multi-label learning to get test_data label topics and feature topics
            classifier = BinaryRelevance(RandomForestClassifier())
            classifier.fit(X_iter, x)
            x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())

            X_test = np.hstack((X_test, x))

        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
Example #26
def gaussianNaiveBayesBinary():
    print("Gaussian naive bayes binary")

    start = time.time()
    classifier = BinaryRelevance(GaussianNB())

    filename = "gaussianNaiveBayes"

    classifier.fit(train_x, train_y)

    # save
    pickle.dump(classifier, open(filename, 'wb'))

    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
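The open() calls above are never closed explicitly; a sketch of the same save/load round trip with context managers (the same applies to the three variants below):

import pickle

with open(filename, 'wb') as f:
    pickle.dump(classifier, f)

with open(filename, 'rb') as f:
    classifier = pickle.load(f)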
Example #27
def supportVectorMachine():
    print("Support vector machine")

    start = time.time()
    classifier = BinaryRelevance(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"

    classifier.fit(train_x, train_y)

    # save
    pickle.dump(classifier, open(filename, 'wb'))

    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #28
def randomForest():
    print("Random forest classifier")

    start = time.time()
    classifier = BinaryRelevance(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForest"

    classifier.fit(train_x, train_y)

    # save
    pickle.dump(classifier, open(filename, 'wb'))

    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #29
def knnBinary(m):
    print("knn binary")

    start = time.time()
    classifier = BinaryRelevance(KNeighborsClassifier(n_neighbors=m))

    filename = "knnBinary"

    classifier.fit(train_x, train_y)

    # save
    pickle.dump(classifier, open(filename, 'wb'))

    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #30
 def __init__(
         self,
         random_state=84,
         params={
             'estimator__C': [1, 10, 100, 1000],
             'estimator__gamma': [0.001, 0.0001],
             'estimator__kernel': ['rbf', 'linear']
         },
         niterations=10):
     self.model = BinaryRelevance(SVC(random_state=random_state))
     self.params = params
     self.niterations = niterations