Example #1
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset


class LP():
    '''
        Label Powerset Method
    '''

    h = None

    def __init__(self, h=LogisticRegression()):
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        '''
            Train the model on training data X,Y
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
            Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
            Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
            (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
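
# --- Hedged usage sketch (added illustration; the synthetic data generator
# below is a stand-in, not part of the original example):
from sklearn.datasets import make_multilabel_classification

X_demo, Y_demo = make_multilabel_classification(n_samples=100, n_labels=3,
                                                random_state=0)
lp = LP()                        # wraps LabelPowerset(LogisticRegression())
lp.fit(X_demo, Y_demo)
print(lp.predict(X_demo).shape)  # sparse label-indicator predictions
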
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB


class MyLabelPowerSetFeatureSelect():
   
    def fit(self, X, y):
        
        # I'm using a gaussian naive bayes base classifier
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())
        
        # fitting the data
        self.LabelPowerSetObject.fit(X, y)
        
        # transform the label matrix into label powerset classes
        y_transformed = self.LabelPowerSetObject.transform(y)

        # instantiating a SelectKBest object
        self.X_new = SelectKBest(chi2, k=2)

        # perform the feature selection
        self.X_transformed = self.X_new.fit_transform(X, y_transformed)

        # save indices of the selected attributes
        self.selected_attributes_indices = self.X_new.get_support(indices=True)

        #print(self.selected_attributes_indices, 'the indices of the selected attributes')
        
        return self
        
    
    def transform(self, X):    
        return X[:,self.selected_attributes_indices]
    
    def predict(self, X):
        return self.LabelPowerSetObject.predict(X)
    
    def predict_proba(self, X):
        return self.LabelPowerSetObject.predict_proba(X)
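
# --- Hedged usage sketch (added illustration): chi2 needs non-negative
# features, so the demo matrix is made non-negative; all names are stand-ins.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.abs(rng.randn(60, 10))
Y_demo = rng.randint(0, 2, size=(60, 4))
fs = MyLabelPowerSetFeatureSelect().fit(X_demo, Y_demo)
print(fs.selected_attributes_indices)  # the k=2 columns kept by chi2
print(fs.transform(X_demo).shape)      # (60, 2)
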
Example #3
    def labelSet(self):
        classifier = LabelPowerset(GaussianNB())

        classifier.fit(self.X_train, self.y_train)

        # predict
        predictions = classifier.predict(self.X_test)
        result = accuracy_score(self.y_test, predictions)
        print(result)

    def powerset(self):

        classifier = LabelPowerset(LogisticRegression())
        classifier.fit(self.x_data, self.y_data)

        predictions = classifier.predict(self.x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #5
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset


def classifiers(X_train, Y_train, X_test):

    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())

    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)

    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)

    return predictions1, predictions2, predictions3
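
# --- Hedged demo of the three strategies above (added illustration; the
# synthetic data stands in for the original X_train/Y_train/X_test):
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split

X_all, Y_all = make_multilabel_classification(n_samples=200, random_state=0)
Xtr, Xte, Ytr, Yte = train_test_split(X_all, Y_all, test_size=0.3,
                                      random_state=0)
p_br, p_cc, p_lp = classifiers(Xtr, Ytr, Xte)
print(p_br.shape, p_cc.shape, p_lp.shape)  # one prediction matrix per method
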
Example #6
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    from skmultilearn.problem_transform import LabelPowerset
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import f1_score

    micro = []
    macro = []
    sss = StratifiedShuffleSplit(
        n_splits=number_shuffles,
        test_size=1 - train_perc)
    for train_index, test_index in sss.split(embedding, labels):
        X_train, X_test = embedding[train_index], embedding[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf = LabelPowerset(LogisticRegression())
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        micro.append(f1_score(y_test, preds, average='micro'))
        macro.append(f1_score(y_test, preds, average='macro'))
    return (micro, macro)
Example #7
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold
from skmultilearn.problem_transform import LabelPowerset


def runSet(model, x, y):
    mse = []
    accuracy = []
    kf = KFold(n_splits=splitNo)
    for train, test in kf.split(x):
        classifier = LabelPowerset(model)
        classifier.fit(x[train], y[train])
        predictions = classifier.predict(x[test])
        accuracy.append(accuracy_score(y[test], predictions))
        mse.append(mean_squared_error(y[test], predictions.toarray()))
    mse = np.array(mse)
    accuracy = np.array(accuracy)

    mse = np.mean(mse)
    accuracy = np.mean(accuracy)

    return accuracy, mse
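
# --- Hedged usage sketch (added illustration): ``splitNo`` is assumed to be
# a module-level global in the original source.
from sklearn.datasets import make_multilabel_classification
from sklearn.naive_bayes import GaussianNB

splitNo = 5
X_demo, Y_demo = make_multilabel_classification(n_samples=150, random_state=0)
acc, mse = runSet(GaussianNB(), X_demo, Y_demo)
print('mean accuracy:', acc, 'mean MSE:', mse)
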
def get_train_test_lda(topic):
    model = VGG16(include_top=False, pooling='avg')

    x_train, y_train, x_test, y_test = load()

    x_train = x_train.astype('float32')
    x_train /= 255

    y_train = y_train.astype('int64')

    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')

    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    # X_train = model.predict(x_train)
    # X_test = model.predict(x_test)

    for k in topic:
        X_iter = X_train

        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic

        x = x2
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))

        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)

        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))

    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)
Example #9
                  metrics=['accuracy'])
    return model


def create_model_multiclass(input_dim, output_dim):
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


KERAS_PARAMS = dict(epochs=10, batch_size=100, verbose=0)

# clf = BinaryRelevance(classifier=Keras(create_model_single_class, False, KERAS_PARAMS), require_dense=[True,True])
# clf.fit(X_train, y_train)
# result = clf.predict(X_test)
# print(result)

clf = LabelPowerset(classifier=Keras(create_model_multiclass, True,
                                     KERAS_PARAMS),
                    require_dense=[True, True])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = %0.3f\n' % accuracy)
print(type(accuracy))
Example #10
accuracy_score(y_test, predicted)

accuracy_score(y_test, dtree_predictions)

from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# initialize label powerset multi-label classifier
# with a linear SVC base classifier
classifier = LabelPowerset(LinearSVC())

# train
classifier.fit(train_X, train_label)

# predict
predictions = classifier.predict(valid_X)
accuracy_score(valid_label, predictions)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

##
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)

from sklearn.multiclass import OutputCodeClassifier

classifier_new = OutputCodeClassifier(GradientBoostingClassifier(max_depth=5,
                                                                 n_estimators=14),
                                      code_size=2)
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))
print("\n")

# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
Example #12
        X_copy = X_orig[p:p + 1]  # slice the p-th element from the numpy array
        y_copy = y_orig[p:p + 1]
        X_model = X_orig
        y_model = y_orig  # set X and y equal to samples and labels

        # create new training arrays with the p-th item sliced out (LOOCV)
        X_model = np.delete(X_model, p, axis=0)
        y_model = np.delete(y_model, p, axis=0)

        # combine numpy matrices
        train_set = np.concatenate((X_model, y_model), axis=1)

        classifier.fit(X_model, y_model)
        prediction = classifier.predict(X_copy)
        #print(prediction.toarray(), y_copy)
        results = np.append(results, np.array(prediction.toarray()), axis=0)
        if np.array_equal(y_copy, prediction.toarray()):
            j = j + 1
            #print(y_copy, prediction.toarray())
        else:
            #print(y_copy, prediction.toarray())
            pass
    print(j / 49)

att = results[:, 0]
esc = results[:, 1:2]
ns = results[:, 2:3]
tang = results[:, 3:4]
multiclassifier = MultinomialNB()
multiclassifier.fit(x_train_vect, Y_LP_train)
multiclass_predict = multiclassifier.predict(x_test_vect)
print(Y_LP_test)
print(multiclass_predict)
print('Test accuracy {}'.format(accuracy_score(Y_LP_test, multiclass_predict)))
##############################Label Powerset classifier####################### 
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

LP_classifier = LabelPowerset(GaussianNB())

LP_classifier.fit(x_train_vect, Y_train)

LP_predictions = LP_classifier.predict(x_test_vect)

print('Test accuracy {}'.format(accuracy_score(Y_test, LP_predictions)))



#########################################################################################################




############################ using LinearSVC one-vs-rest pipeline ########################################
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
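
# --- Hedged mini-example of the same pipeline shape (added illustration; the
# toy documents and two-column label-indicator rows are invented stand-ins):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

docs = ["cheap meds online", "meeting notes attached", "win cheap prizes now"]
labels = [[1, 0], [0, 1], [1, 0]]
demo_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
demo_pipeline.fit(docs, labels)
print(demo_pipeline.predict(["cheap prizes attached"]))
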
Example #14
# In[4]:

X_test = weather[weather['Year'] > 2016]
X_test = X_test[X_test['Month'] > 8].iloc[:, 5:12]
X_train = weather[~weather.index.isin(X_test.index)].iloc[:, 5:12]
y_test = y[y.index.isin(X_test.index)]
y_train = y[~y.index.isin(X_test.index)]

# In[5]:

# model = LabelPowerset(GaussianNB())
model = LabelPowerset(KNeighborsClassifier(n_neighbors=20))
# model = MLkNN(k=18)
model.fit(X_train.values, y_train.values)
predictions = model.predict(X_test)
result = predictions.toarray()
predicted = pd.DataFrame(result, columns=description)
print(accuracy_score(y_test, predictions))

# In[6]:


def columnName(row):
    name = ''
    idx = row[row == 1].index
    for i in range(len(idx)):
        name += (idx[i] + ' ')
    return name
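
# Hedged usage of columnName (added illustration): label each row of the
# `predicted` frame built above with the names of its active columns.
predicted['labels'] = predicted.apply(columnName, axis=1)
print(predicted['labels'].head())
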

Example #15
def training_phase(filename):
    cue_ground_truths = []
    fp = open(filename, 'r')
    data = fp.readlines()
    corpus = []
    for i in data:
        i = i.replace('\n', '')
        corpus.append(i.split('\t'))

    postag = []
    # feature extraction
    for line in range(len(corpus)):
        if (len(corpus[line]) > 8):
            count = int((len(corpus[line]) - 7) / 3)
            pakodi = []
            for j in range(count):
                word = corpus[line][7 + (j * 3)]
                pakodi.append(word)
            if (corpus[line][3] in pakodi):
                cue_ground_truths.append(1)
                postag.append(corpus[line][5].lower())

            else:
                cue_ground_truths.append(0)
                postag.append(corpus[line][5].lower())

        elif (len(corpus[line]) == 8):
            cue_ground_truths.append(0)
            postag.append(corpus[line][5].lower())

    pos_tags = []
    for i in postag:
        if i not in pos_tags:
            pos_tags.append(i)

    # one-hot-encoding the postags
    one_hot_postag = []

    for i in range(len(postag)):
        seq = []
        if (postag[i] in pos_tags):
            req = pos_tags.index(postag[i])
            for j in range(len(pos_tags)):
                if (j == req):
                    seq.append(1.0)
                else:
                    seq.append(0.0)
        remaining = 100 - len(pos_tags)
        for j in range(remaining):
            seq.append(0.0)
        one_hot_postag.append(seq)

    zero_list = []
    for i in range(100):
        zero_list.append(0.0)

    sent_index = []
    for i in range(len(corpus)):
        if (len(corpus[i]) == 1):
            sent_index.append(corpus[i - 1][2])

    temp = corpus
    corpus = []
    for i in range(len(temp)):
        if (len(temp[i]) == 1):
            continue
        else:
            corpus.append(temp[i])

    # uncomment the below lines to take only before 5 and next 6 postags as features
    # temp_corpus = corpus
    # temp_cue_ground_truths = cue_ground_truths
    # temp_one_hot_postag = one_hot_postag
    # features = []
    # cue_postag_features = []
    # for i in sent_index:
    # 	i = int(i)
    # 	target_sentence = temp_corpus[:i+1]
    # 	target_cues = temp_cue_ground_truths[:i+1]
    # 	target_pos = temp_one_hot_postag[:i+1]
    # 	temp_corpus = temp_corpus[i+1:]
    # 	temp_cue_ground_truths = temp_cue_ground_truths[i+1:]
    # 	temp_one_hot_postag = temp_one_hot_postag[i+1:]
    # 	for j in range(len(target_cues)):
    # 		if (target_cues[j] == 1):
    # 			missing = 6 - j
    # 			if(missing>0):
    # 				for k in range(missing):
    # 					features.append(zero_list)
    # 				n = 0
    # 				while n<=j:
    # 					features.append(target_pos[n])
    # 					n = n + 1
    # 			else:
    # 				n = j-6
    # 				while n <= j:
    # 					features.append(target_pos[n])
    # 					n = n +1
    # 			missing = 7 - len(target_sentence) + j
    # 			if missing>0:
    # 				n = j + 1
    # 				while n < len(target_sentence):
    # 					features.append(target_pos[n])
    # 					n = n+1
    # 				for n in range(missing):
    # 					features.append(zero_list)
    # 			else:
    # 				n = j+1
    # 				while n < (j+7):
    # 					features.append(target_pos[n])
    # 					n = n+1
    # 			cue_postag_features.append(features)
    # 			features = []

    # for i in range(len(cue_postag_features)):
    # 	if(len(cue_postag_features[i]) != 13):
    # 		print(len(cue_postag_features[i]), end= " ")
    # 		print(i)
    # for j in range(len(cue_postag_features[0])):
    # 	for k in range(len(cue_postag_features[0][j])):
    # 		if cue_postag_features[0][j][k] == 1:
    # 			print(pos_tags[k], end= " ")

    temp_corpus = corpus
    temp_cue_ground_truths = cue_ground_truths
    temp_one_hot_postag = one_hot_postag
    features = []
    feature1 = []
    cue_postag_features = []
    for i in sent_index:
        i = int(i)
        target_sentence = temp_corpus[:i + 1]
        target_cues = temp_cue_ground_truths[:i + 1]
        target_pos = temp_one_hot_postag[:i + 1]
        temp_corpus = temp_corpus[i + 1:]
        temp_cue_ground_truths = temp_cue_ground_truths[i + 1:]
        temp_one_hot_postag = temp_one_hot_postag[i + 1:]
        for j in range(len(target_cues)):
            if (target_cues[j] == 1):
                for k in range(len(target_pos)):
                    features.append(target_pos[j])
                    for l in range(100):
                        if l == j:
                            feature1.append(1.0)
                        else:
                            feature1.append(0.0)
                    features.append(feature1)
                    feature1 = []
                cue_postag_features.append(features)
                features = []

    cue_postag_features = keras.preprocessing.sequence.pad_sequences(
        cue_postag_features, maxlen=100)

    cue_postag_features = np.array(cue_postag_features)
    print(cue_postag_features.shape)

    temp_corpus = corpus
    temp_cue_ground_truths = cue_ground_truths
    temp_one_hot_postag = one_hot_postag
    features = []
    ground_scope = []
    for i in sent_index:
        i = int(i)
        target_sentence = temp_corpus[:i + 1]
        target_cues = temp_cue_ground_truths[:i + 1]
        target_pos = temp_one_hot_postag[:i + 1]
        temp_corpus = temp_corpus[i + 1:]
        temp_cue_ground_truths = temp_cue_ground_truths[i + 1:]
        temp_one_hot_postag = temp_one_hot_postag[i + 1:]
        for j in range(len(target_cues)):
            if (target_cues[j] == 1):
                cue_count = int((len(target_sentence[j]) - 7) / 3)
                paks = []
                for k in range(cue_count):
                    word = target_sentence[j][7 + (k * 3)]
                    paks.append(word)
                indi = 0
                for k in range(cue_count):
                    if (paks[k] != '_'):
                        indi = k
                for k in range(len(target_sentence)):
                    thing = target_sentence[k][7 + (indi * 3) + 1]
                    if (thing != '_'):
                        features.append(1.0)
                    else:
                        features.append(0.0)
                ground_scope.append(features)
                features = []

    ground_scope = keras.preprocessing.sequence.pad_sequences(ground_scope,
                                                              maxlen=100)
    ground_scope = np.array(ground_scope)
    print(ground_scope.shape)

    # X_train = cue_postag_features
    # y_train = ground_scope

    # nsamples, nx, ny = X_train.shape
    # X_train_2d_bef = X_train.reshape((nsamples,nx*ny))

    # y_train = y_train.reshape(nsamples*nx)

    # svm = SVC(kernel="linear", C=0.0025, random_state = 101)
    # svm.fit(X_train_2d_bef, y_train)
    # pickle.dump(svm, open("scope_detector.sav", 'wb'))

    X_train = cue_postag_features
    # y_train = ground_scope

    nsamples, col, vec = X_train.shape

    X_train_2d_bef = X_train.reshape((nsamples, col * vec))
    y_train_2d_bef = np.array(ground_scope, dtype=float)

    X_train_2d, X_validate_2d, y_train, y_validate = train_test_split(
        X_train_2d_bef, y_train_2d_bef, test_size=0.4, random_state=101)

    # Alternatives tried, annotated with the validation score each achieved:
    # classifier = LabelPowerset(GaussianNB())  # 0.49
    # classifier = LabelPowerset(RandomForestClassifier(n_estimators=25))  # 0.577
    classifier = LabelPowerset(RandomForestClassifier(n_estimators=50))  # 0.586
    # classifier = BinaryRelevance(MLPClassifier(hidden_layer_sizes=200, verbose=True, max_iter=15, learning_rate_init=0.0035))
    # classifier = LabelPowerset(MLPClassifier())  # 0.58
    # classifier = LabelPowerset(MLPClassifier(hidden_layer_sizes=200, verbose=True))  # 0.589
    # classifier = LabelPowerset(MLPClassifier(hidden_layer_sizes=200, verbose=True, max_iter=400))  # 0.584
    # classifier = LabelPowerset(MLPClassifier(hidden_layer_sizes=400, verbose=True, learning_rate_init=0.0035))
    # classifier = LabelPowerset(GaussianNB())  # 0.18
    # classifier = ClassifierChain(DecisionTreeClassifier())  # 0.17
    # classifier = ClassifierChain(GaussianNB())  # 0.02
    # classifier = ClassifierChain(GaussianNB())  # 0.2
    # classifier = BinaryRelevance(GaussianNB())  # 0.35 without cue features
    # classifier = BinaryRelevance(DecisionTreeClassifier())  # 0.15
    # classifier = BinaryRelevance(OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1))  # 0.41
    # classifier = BinaryRelevance(OneClassSVM(nu=0.5, kernel="rbf", gamma=0.5))
    # classifier = BinaryRelevance(RandomForestClassifier(n_estimators=25))  # 0.29
    # classifier = BinaryRelevance(RandomForestClassifier(n_estimators=100))  # 0.24
    # classifier = BinaryRelevance(RandomForestClassifier(n_estimators=3))  # 0.2900
    # classifier = BinaryRelevance(MLPClassifier())  # 0.22
    classifier.fit(X_train_2d, y_train)
    y_predict = classifier.predict(X_validate_2d)
    f1_measure = f1_score(
        y_validate,
        y_predict,
        average="weighted",
    )
    print(f1_measure)
    pickle.dump(classifier, open("scope_detector.sav", 'wb'))
# In[35]:

classifier = LabelPowerset(GaussianNB())

# In[36]:

# train
classifier.fit(X_train, y_train)
#y_train.head()
#y_train.isnull().sum()
#cols.isnull().sum()

# In[38]:

# predict
predictions1 = classifier.predict(X_test)

#predictions = classifier.predict(X_test)

# In[49]:

from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_test, predictions1)

# In[50]:

print(classification_report(y_test, predictions1))

# # Accuracy is too low, so first drop the columns with many NAs (i.e., more than 20% NAs)

# In[7]:
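# (Hedged completion, added for continuity: the original excerpt omits the
# classifier construction; inferred by analogy with the Label Powerset block
# below and the print message at the end of this block.)
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import MultinomialNB
ClassifierChainMultinomialNB_classifier = ClassifierChain(MultinomialNB())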
ClassifierChainMultinomialNB_classifier.fit(X_train, Y_train)

# Predictions
predictions = ClassifierChainMultinomialNB_classifier.predict(X_test)

# Accuracy
print("Accuracy : {}".format(accuracy_score(Y_test,predict)*100))
# Create and save with pickle
save_mydocuments = open("pickled_algos/MultilabelClassifierchainWithMultinomialNB.pickle","wb")
pickle.dump(ClassifierChainMultinomialNB_classifier, save_mydocuments)
save_mydocuments.close()
print("Classifier chain with MultinomialNB classifier is done, time--- %s seconds ---" % (time.time() - start_time))

# 6. Label Powerset with MultinomialNB classifier (from scikit-multilearn)
# create and fit classifier
from skmultilearn.problem_transform import LabelPowerset
LabelPowersetMultinomialNB_classifier = LabelPowerset(MultinomialNB())
LabelPowersetMultinomialNB_classifier.fit(X_train, Y_train)

# Predictions
predictions = LabelPowersetMultinomialNB_classifier.predict(X_test)

# Accuracy
print("Accuracy : {}".format(accuracy_score(Y_test,predict)*100))
# Create and save with pickle
save_mydocuments = open("pickled_algos/MultilabelPowersetWithMultinomialNB.pickle","wb")
pickle.dump(LabelPowersetMultinomialNB_classifier, save_mydocuments)
save_mydocuments.close()
print("LabelPowersetMultinomialNB_classifier is done, time--- %s seconds ---" % (time.time() - start_time))

print("Done")
Example #18
Score_tree = dtree_model.score(features_test, labels_test)
#------------------------------------------------------------------------------
# Using naive Bayes

from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier_nb = LabelPowerset(GaussianNB())

# train
classifier_nb.fit(features_train, labels_train)

# predict
predictions_nb = classifier_nb.predict(features_test)
score_nb= accuracy_score(labels_test,predictions_nb)
#------------------------------------------------------------------------------
#using random forest classifier
from sklearn.ensemble import RandomForestClassifier
classifier1 = RandomForestClassifier(n_estimators =10, criterion = 'entropy', random_state = 0)
classifier1.fit(features_train,labels_train)

forest_pred = classifier1.predict(features_test)

Score_forest = classifier1.score(features_test, labels_test)
#------------------------------------------------------------------------------
# Using Binary Relevance with a single-class SVM base classifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
class LMWrapper(Model):
    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        self.lm = LabelPowerset(MultinomialNB())
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        self.output_dim = 0
        if filename is not None: self.load(filename)

    def build_representation(self, x, y=None, fit=False):
        auxX = [
            ' \n '.join([
                ' '.join(['w_' + str(token) for token in field if token != 0])
                for field in instance
            ]) for instance in x
        ]
        if fit: self.vect1.fit(auxX)
        auxX = self.vect1.transform(auxX)
        if fit: self.selector.fit(auxX, np.array([np.argmax(i) for i in y]))
        auxX = self.selector.transform(auxX)
        return auxX.todense()

    def fit(self, x, y, validation_data=None):
        auxY = y
        print('Build representation...')
        auxX = self.build_representation(x, auxY, fit=True)
        print('auxX shape:', auxX.shape)
        print('Fit model...')
        self.lm.fit(auxX, auxY)
        self.output_dim = auxY.shape[1]
        if validation_data is None: return None
        res = self.evaluate(validation_data[0], validation_data[1])
        print("Accuracy in validation data =", res)
        return None

    def predict(self, x):
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict(auxX)
        #auxY = to_categorical(auxY)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def predict_prob(self, x):
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict_proba(auxX)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def evaluate(self, x, y):
        auxX = self.build_representation(x, fit=False)
        auxY = y
        auxY = np.array([np.argmax(i) for i in auxY])
        return sklearn.metrics.accuracy_score(y_true=auxY,
                                              y_pred=self.lm.predict(auxX))

    def save(self, filename):
        f = open(filename, "wb")
        pickle.dump(self.lm, f, protocol=4)
        pickle.dump(self.vect1, f, protocol=4)
        pickle.dump(self.selector, f, protocol=4)
        pickle.dump(self.output_dim, f, protocol=4)
        f.close()

    def load(self, filename):
        f = open(filename, "rb")
        self.lm = pickle.load(f)
        self.vect1 = pickle.load(f)
        self.selector = pickle.load(f)
        self.output_dim = pickle.load(f)
        f.close()
Example #20
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import LabelPowerset


def labelpowerset(x_train, y_train, x_test, y_test):
    classifier = LabelPowerset(RandomForestClassifier(n_estimators=estimators))
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print("Accuracy = {}".format(accuracy_score(y_test, predictions)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#classifier = BinaryRelevance(GaussianNB())
#classifier = BinaryRelevance(tree.DecisionTreeClassifier())
#classifier = ClassifierChain(tree.DecisionTreeClassifier())
classifier = LabelPowerset(tree.DecisionTreeClassifier())
#classifier = MLkNN(k=5)
#classifier = BRkNNaClassifier(k=5)
#ptclassifier = LabelPowerset(tree.DecisionTreeClassifier())
#clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True, include_self_edges=True)
#classifier = LabelSpacePartitioningClassifier(ptclassifier, clusterer)

# .values replaces the removed pandas DataFrame.as_matrix() API
classifier.fit(X_train.values, y_train.values)

predictions = classifier.predict(X_test.values)

loss = hamming_loss(y_test.values, predictions)
print('Hamming loss: ', loss)

#acc = accuracy_score(y_test.values, predictions)
#print('accuracy: ', acc)

#lrloss = label_ranking_loss(y_test, predictions.toarray())
#lrap = label_ranking_average_precision_score(y_test, predictions.toarray())
#print("LRLOSS: best value 0: ", lrloss)
#print("LRAP: best value 1: ", lrap)

#macro_score = f1_score(y_test, predictions.toarray(), average='macro')
micro_score = f1_score(y_test, predictions.toarray(), average='micro')
weighted_score = f1_score(y_test, predictions.toarray(), average='weighted')
Example #22
X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['"']]
X_train = [X_train[index] for index in range(0,len(X_train)) if X_train[index] != ['']]
test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['"']]
test_data = [test_data[index] for index in range(0,len(test_data)) if test_data[index] != ['']]

with open(train, 'r') as file:
    for line in file.readlines():
        index = line.index(',')'''

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(X, y)

# predict
predictions = classifier.predict(data)

accuracy_score(meta, predictions)
'''classifier = MLkNN(k=20)

# train
classifier.fit(data, meta)

# predict
predictions = classifier.predict(X)

accuracy_score(Y,predictions)'''
Example #23
hdrs = ['Release Year', 'Origin/Ethnicity', 'Director', 'Cast', 'Title']
for hd in hdrs:
    if hd == 'Title':
        cv = TfidfVectorizer(analyzer='word',
                             stop_words=stopwords)  # TF-IDF for titles
    else:
        cv = CountVectorizer(analyzer='word',
                             min_df=2)  # determine whether or not a specific
        # director (actor, year, country) relates to a specific film
    c = cv.fit_transform(x[hd].map(lambda pl: str(pl)))
    p = pandas.DataFrame(c.todense(),
                         index=x.index,
                         columns=cv.get_feature_names())
    plot = pandas.concat([plot, p], axis=1)
x_train, x_test, y_train, y_test = train_test_split(plot,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
classif = LabelPowerset(LogisticRegression())
classif.fit(x_train, y_train)
pr = classif.predict(x_test)
print("Accuracy = ", accuracy_score(
    y_test, pr))  # доля фильмов, у которых жанры предсказаны абсолюно точно
print("Jaccard similarity score =", jaccard_similarity_score(
    y_test, pr))  # мера схожести исходных и предсказанных наборов жанров
# precision - доля правильных присвоений данного жанра
# recall - способность находить фильмы данного жанра
# f1 - среднее гармоническое precision и recall
# support - количество фильмов каждого жанра в y_test
print(classification_report(y_test, pr, target_names=list(y.head())))
Example #24
	for line in f.readlines():
		tag_list.append(line[:-1])  # Py3: lines are already str; just strip the newline


#classifier = MLkNN(k=100)
#classifier = MLARAM()
#classifier = LabelPowerset(classifier = SVC(), require_dense = [False, True])
#classifier = ClassifierChain(GaussianNB())
#classifier = ClassifierChain(SGDClassifier())
classifier = LabelPowerset(tree.DecisionTreeClassifier(),require_dense = [False, False])
#classifier = ClassifierChain(tree.DecisionTreeClassifier())
#classifier = BinaryRelevance(classifier = SVC(), require_dense = [False, True])

print "Start Training"
classifier.fit(X_train, Y_train)
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)

movie_tag_dict = dict()

for i in range(len(train_id)):
	movie_tag_dict[train_id[i]] = []
	for j in range(2015):
		if y_train_pred[i,j] == 1:
			movie_tag_dict[train_id[i]].append(tag_list[j])

for i in range(len(test_id)):
	movie_tag_dict[test_id[i]] = []
	for j in range(2015):
		if y_test_pred[i,j] == 1:
			movie_tag_dict[test_id[i]].append(tag_list[j]) 

# # Label Powerset
# * Label Powerset creates a unique class for every label combination present in the training set; this way it makes use of label correlation (a small sketch follows below)
# * The only drawback of this method is that as the number of label combinations grows, its computational complexity grows as well.
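
# A small sketch of the transformation described above (added illustration):
# LabelPowerset maps each distinct label combination in y to one class id.
import numpy as np
from skmultilearn.problem_transform import LabelPowerset

y_demo = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1]])
print(LabelPowerset().transform(y_demo))  # rows 0 and 2 get the same class id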

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:

log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test,
                    log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new classifier uses the predictions of all previous classifiers
# * This way the correlation between labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())
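# (Hedged completion, added for continuity: the original excerpt omits the
# fit/predict step that produces ``predictions_cc`` used just below.)
chain.fit(x_train, y_train)
predictions_cc = chain.predict(x_test)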
#Hamming Loss for Classifier Chains
hamm_loss_cc = hamming_loss(y_test, predictions_cc)

print("Hamming Loss:", hamm_loss_cc)

print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes")

#initialize Label Powerset multi-label classifier
#with a gaussian naive bayes base classifier
classifier_lp = LabelPowerset(GaussianNB())

# train for Label Powerset
classifier_lp.fit(X_train, y_train)

# predict for Label Powerset
predictions_lp = classifier_lp.predict(X_test)

#Hamming Loss for Label PowerSet
hamm_loss_lp = hamming_loss(y_test, predictions_lp)

print("Hamming Loss:", hamm_loss_lp)

print("\n\n\nAll hamming loss:")
print("Binary Relevance:\n", hamm_loss_binary)
print("Classifier Chains:\n", hamm_loss_cc)
print("Label Powerset:\n", hamm_loss_lp)

objects = ('BinaryRelevance', 'ClassifierChain', 'LabelPowerset')
y_pos = np.arange(len(objects))
performance = [hamm_loss_binary, hamm_loss_cc, hamm_loss_lp]
Example #27
    i = 5
    while i > 0:
        i = i - 1
        t = int(np.mod(a, 2))
        a = int(np.floor(a / 2))
        res[i] = t
    return res


test = datasets.make_multilabel_classification()
data = pickle.load(open('datasets.pickle', 'rb'))
X = data[0]
Y = data[1]

logs = []
yt = []
for i in range(Y.shape[0]):
    yt.append(transfer(Y[i, :]))
log = LogisticRegression()
log.fit(X, yt)
p = log.predict(X)
res = []
for i in range(len(p)):
    rt = transfer1(p[i])
    res.append(rt)
print(accuracy_score(np.matrix(res), Y))

lb = LabelPowerset(LogisticRegression())
lb.fit(X, Y)
pred = lb.predict(X)
print(accuracy_score(pred, Y))
# The matrices are initially in lil_matrix format:
# convert X to compressed sparse row format and y to dense

X_train = X_train.tocsr()
y_train = y_train.todense()
X_test = X_test.tocsr()
y_test = y_test.todense()

label_set = set([0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21])
label_list = [0, 3, 4, 7, 8, 9, 12, 15, 17, 19, 20, 21]

y_train = y_train[:, label_list]
y_test = y_test[:, label_list]

start_time = time.process_time()
# classifier = LabelPowerset(RandomForestClassifier(random_state=0, n_estimators=10, n_jobs=-1))

# classifier = RandomForestClassifier(random_state=0, n_estimators=10)
# classifier = BinaryRelevance(classifier = LinearSVC(), require_dense = [False, True])
classifier = LabelPowerset(SGDClassifier(penalty='l2', alpha=0.01))
classifier.fit(X_train, y_train)
y_predicted = classifier.predict(X_test)
total_time = time.process_time() - start_time

print("Total time taken is : " + str(total_time))

print("Jaccard Similarity Score is : " +
      str(jaccard_similarity_score(y_test, y_predicted)))
print("Hamming Loss is : " + str(hamming_loss(y_test, y_predicted)))
# print("F1_Similarity score is : "+str(f1_score(y_test,y_predicted,average='macro')))
print(
    '\n Multi-Class OCSVM: Precision = %2.2f, Recall = %2.2f, FalseAlarm = %2.2f'
    % (precision, recall, falsealarm))

# step 2. perform multi-label learning approaches
# using label powerset

# initialize label powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(sample_train, label_train[:, :5])

# predict
predictions = classifier.predict(sample_test)

# approach 1: one-vs-all (review the approach)
classif = OneVsRestClassifier(linear_model.LogisticRegression())
classif.fit(sample_train, label_train[:, :4])
# explain output label_score
label_score = classif.decision_function(sample_test)
# explain decision strategy here
idxsort = np.argsort(label_score)
label_pred = idxsort[:, -1] + 1
print('\n one-vs-all: %.2f' % accuracy_score(label_test[:, :4], label_pred))

## approach 2: one-vs-one (review the approach)
#classif = OneVsOneClassifier(linear_model.LogisticRegression())
#classif.fit(sample_train, label_train)
## explain similar output and decision strategy