from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import LabelPowerset


class MyLabelPowerSetFeatureSelect():

    def fit(self, X, y):

        # use a Gaussian naive Bayes base classifier inside Label Powerset
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())

        # fit the transformed problem on the training data
        self.LabelPowerSetObject.fit(X, y)

        # transform the multi-label targets into a single multi-class vector
        y_transformed = self.LabelPowerSetObject.transform(y)

        # instantiate a SelectKBest object with the chi-squared score function
        self.X_new = SelectKBest(chi2, k=2)

        # select the k best features against the transformed targets
        self.X_transformed = self.X_new.fit_transform(X, y_transformed)

        # save the indices of the selected attributes
        self.selected_attributes_indices = self.X_new.get_support(indices=True)

        return self

    def transform(self, X):
        return X[:, self.selected_attributes_indices]

    def predict(self, X):
        return self.LabelPowerSetObject.predict(X)

    def predict_proba(self, X):
        return self.LabelPowerSetObject.predict_proba(X)
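
# A minimal usage sketch for the class above (an illustration added here, not
# part of the original example); the synthetic data and variable names are
# assumptions, just to show how fit/transform/predict work together.
from sklearn.datasets import make_multilabel_classification

X_demo, y_demo = make_multilabel_classification(n_samples=100, n_features=6,
                                                n_classes=3, random_state=0)
fs = MyLabelPowerSetFeatureSelect().fit(X_demo, y_demo)
X_reduced = fs.transform(X_demo)   # only the 2 columns kept by SelectKBest
y_pred = fs.predict(X_demo)        # multi-label predictions (sparse matrix)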
Example #2
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset


class LP():
    '''
        Label Powerset Method
    '''

    h = None

    def __init__(self, h=LogisticRegression()):
        # wrap the given base classifier in a Label Powerset transformation
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        '''
            Train the model on training data X,Y
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
            Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
            Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
            (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
# * The only drawback of this method is that its computational complexity grows as the number of classes (distinct label combinations) increases, as the sketch below illustrates.
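
# A small sketch of that drawback (an illustration, not from the original
# notebook): LabelPowerset maps every distinct label combination to one
# multi-class label, so the base classifier has to separate as many classes
# as there are unique label sets in the training data.
import numpy as np
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset

y_demo = np.array([[1, 0, 0],
                   [1, 1, 0],
                   [0, 1, 1],
                   [1, 0, 1]])
lp_demo = LabelPowerset(LogisticRegression())
y_multiclass = lp_demo.transform(y_demo)   # one class id per label set
print(len(np.unique(y_multiclass)))        # 4 distinct combinations here;
                                           # with q labels this can grow up to 2**q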

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:

log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test,
                    log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new Classifier uses the predictions of all previous classifiers
# * This way the correlation between labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

chain.fit(x_train, y_train)
print('Accuracy_score using ClassifierChain is ',
      round(accuracy_score(y_test, chain.predict(x_test)) * 100, 1), '%')
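
# In the same spirit as the LabelPowerset cell above, the ranking quality of
# the chain could be checked as well (a sketch, assuming the skmultilearn
# ClassifierChain, whose predict_proba returns a sparse matrix):
print('-------------------------------------------------')
print('roc_auc_score using ClassifierChain is ',
      roc_auc_score(y_test, chain.predict_proba(x_test).toarray()))
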
class LMWrapper(Model):
    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        # Label Powerset over a multinomial naive Bayes base classifier;
        # C and **kwargs are kept for interface compatibility and are unused here
        self.lm = LabelPowerset(MultinomialNB())
        # bag-of-words representation (optionally TF-IDF weighted)
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        # feature selector; keeps all features by default, fitted in build_representation()
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        self.output_dim = 0
        if filename is not None: self.load(filename)

    def build_representation(self, x, y=None, fit=False):
        # turn each instance (a sequence of fields of integer token ids) into a
        # pseudo-text string so it can be fed to the TF-IDF vectorizer
        auxX = [
            ' \n '.join([
                ' '.join(['w_' + str(token) for token in field if token != 0])
                for field in instance
            ]) for instance in x
        ]
        if fit: self.vect1.fit(auxX)
        auxX = self.vect1.transform(auxX)
        # fit the feature selector against the most likely label of each instance
        if fit: self.selector.fit(auxX, np.array([np.argmax(i) for i in y]))
        auxX = self.selector.transform(auxX)
        return auxX.todense()

    def fit(self, x, y, validation_data=None):
        auxY = y
        print('Build representation...')
        auxX = self.build_representation(x, auxY, fit=True)
        print('auxX shape:', auxX.shape)
        print('Fit model...')
        self.lm.fit(auxX, auxY)
        self.output_dim = auxY.shape[1]
        if validation_data is None: return None
        res = self.evaluate(validation_data[0], validation_data[1])
        print("Accuracy in validation data =", res)
        return None

    def predict(self, x):
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict(auxX)
        #auxY = to_categorical(auxY)
        # pad with zero columns if the prediction matrix is narrower than the
        # number of label columns seen during training
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def predict_prob(self, x):
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict_proba(auxX)
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY,
                          pad_width=npad,
                          mode='constant',
                          constant_values=0)
        return [auxY, [], []]

    def evaluate(self, x, y):
        auxX = self.build_representation(x, fit=False)
        # compare the most likely gold label of each instance against the most
        # likely predicted label (predictions come back as a sparse indicator matrix)
        auxY = np.array([np.argmax(i) for i in y])
        preds = np.asarray(self.lm.predict(auxX).todense())
        preds = np.array([np.argmax(i) for i in preds])
        return sklearn.metrics.accuracy_score(y_true=auxY, y_pred=preds)

    def save(self, filename):
        # persist the classifier, vectorizer, selector and output dimensionality
        with open(filename, "wb") as f:
            pickle.dump(self.lm, f, protocol=4)
            pickle.dump(self.vect1, f, protocol=4)
            pickle.dump(self.selector, f, protocol=4)
            pickle.dump(self.output_dim, f, protocol=4)

    def load(self, filename):
        # restore the objects in the same order they were saved
        with open(filename, "rb") as f:
            self.lm = pickle.load(f)
            self.vect1 = pickle.load(f)
            self.selector = pickle.load(f)
            self.output_dim = pickle.load(f)