Ejemplo n.º 1
0
 def choice(self, X, pool):
     y_probas = self.model.predict_proba(X[pool])
     doc_id = pool[np.argsort(np.absolute(y_probas[:, 1] - 0.5))[0]]
     
     if self.Debug:
         print '\n'
         print '=' * 50
         # print 'Feature model thus far:'
         # print '*' * 50
         # print 'Negative features (class 0):'
         # print ', '.join(self.feature_names[self.model.class0_features])
         # print 'Positive features (class 1):'
         # print ', '.join(self.feature_names[self.model.class1_features])
         # print '=' * 50
         print_all_features(self.feature_names, self.feature_expert, self.top_n, doc_id, self.X_pool, self.y_pool, self.X_pool_docs)
         doc_prob = self.model.predict_proba(self.X_pool[doc_id])
         print 'feature model predict_probability class0 = %0.5f, class1 = %0.5f' % (doc_prob[0, 0], doc_prob[0, 1])
         
         feature = self.feature_expert.most_informative_feature(self.X_pool[doc_id], self.y_pool[doc_id])
         print 'feature to be added to the model = (%d, %s)' % (feature, self.feature_names[feature])
         print 'label to be added to the model = %d' % self.y_pool[doc_id]
         print
         
         print 'making sure that X_pool and X are indeed the same:'
         print 'label according to y: %d' % self.y[doc_id]
         x_feature = self.feature_expert.most_informative_feature(X[doc_id], self.y[doc_id])
         print 'feature according to X: (%d, %s)' % (x_feature, self.feature_names[x_feature])
         
         ch = raw_input('Press Enter to continue...  ')
         print 
         
         if ch == 'n':
             sys.exit(1)
     
     return doc_id
Ejemplo n.º 2
0
    def KLD(self, X, pool):
        '''
        Compute average KL Divergence between instance and feature model,
        the larger the value, the more KLD says that they disagree
        
        avg_KLD(IM, FM) = (KLD(IM, FM) + KLD(FM, IM)) / 2
        '''
        y_IM_probas = self.instance_model.predict_proba(X[pool])
        y_FM_probas = self.feature_model.predict_proba(X[pool])
        
        log_ratio = np.log(y_IM_probas) - np.log(y_FM_probas)
        KLD_IM_FM = np.sum(y_IM_probas *  log_ratio, 1)
        KLD_FM_IM = np.sum(y_FM_probas * -log_ratio, 1)
        KLD = (KLD_IM_FM + KLD_FM_IM) / 2
        
        num = np.argsort(KLD)[-1]
        doc_id = pool[num]
        
        if self.Debug:
            print '\n'
            print '=' * 50
            print 'Feature model thus far:'
            print '*' * 50
            print 'Negative features (class 0):'
            print ', '.join(self.feature_names[self.feature_model.class0_features])
            print 'Positive features (class 1):'
            print ', '.join(self.feature_names[self.feature_model.class1_features])
            print '=' * 50
            print_all_features(self.feature_names, self.feature_expert, self.top_n, doc_id, self.X_pool, self.y_pool, self.X_pool_docs)
            
            IM_prob = self.instance_model.predict_proba(self.X_pool[doc_id])
            print 'instance model predict_probability: class0 = %0.5f, class1 = %0.5f' % (IM_prob[0, 0], IM_prob[0, 1])
            
            FM_prob = self.feature_model.predict_proba(self.X_pool[doc_id])
            print 'feature model predict_probability:  class0 = %0.5f, class1 = %0.5f' % (FM_prob[0, 0], FM_prob[0, 1])

            print 'top 10 KLDs:'
            sorted_KLD = np.argsort(KLD)
            for i in range(1, self.top_n + 1):
                print 'Rank %d: doc#%d, KLD=%10.5f' % (i, pool[sorted_KLD[-i]], KLD[sorted_KLD[-i]])
            
            print 'this doc\'s KLD = ', KLD[num]
            
            feature = self.feature_expert.most_informative_feature(self.X_pool[doc_id], self.y_pool[doc_id])
            print 'feature to be added to the model = (%d, %s)' % (feature, self.feature_names[feature])
            print 'label to be added to the model = %d' % self.y_pool[doc_id]
            print
            
            print 'making sure that X_pool and X are indeed the same:'
            print 'label according to y: %d' % self.y[doc_id]
            x_feature = self.feature_expert.most_informative_feature(X[doc_id], self.y[doc_id])
            print 'feature according to X: (%d, %s)' % (x_feature, self.feature_names[x_feature])
            
            ch = raw_input('Press Enter to continue...  ')
            print 
            
            if ch == 'n':
                sys.exit(1)
        
        return doc_id
Ejemplo n.º 3
0
 def choice(self, X, pool):
     # if len(annotated_features) is even, choose a positive document
     # if len(annotated_features) is odd, choose a negative document
     
     label = len(self.annotated_features) % 2 # label for the document
     rank = len(self.annotated_features) / 2 # rank of the feature in the list
     feature = self.feature_expert.feature_rank[label][rank]
     
     # find all documents with next feature present
     X_csc = X.tocsc()
     docs_with_feature = X_csc.getcol(feature).indices
     
     # find the docs with no annotated features
     doc_with_no_annotated_features = np.nonzero(self.docs_feature_count == 0)[0]
     
     # Find documents without any annotated features but has the next feature
     potential_docs = set(docs_with_feature).intersection(set(doc_with_no_annotated_features))
     
     # Find indices of all labels that is the current label
     correct_label_indices = np.nonzero(self.y == label)[0]
     
     # Find the intersection between the result from above and the pool
     sampling_pool = list((set(pool).intersection(potential_docs)).intersection(correct_label_indices))
     
     if len(sampling_pool) == 0:
         doc_id = None
     else:
         doc_id = self.rgen.permutation(sampling_pool)[0]
     
     if self.Debug and doc_id != None:
         print 'size of overall pool: %d' % len(pool)
         print 'number of samples with feature present: %d' % len(docs_with_feature)
         print 'number of samples with no annotated_features: %d' % len(doc_with_no_annotated_features)
         print 'number of samples with label=%d: %d' % (label, correct_label_indices.shape[0])
         print 'size of the sampling pool: %d' % len(sampling_pool)
         
         print 'Annotated Features(%d): ' % len(self.annotated_features)
         print ', '.join([str((f, self.feature_names[f])) for f in self.annotated_features])
         print 'Cheating Approach: rank = %d, feature# = %d, feature name = %s' % (rank, feature, self.feature_names[feature])
         
         print_all_features(self.feature_names, self.feature_expert, self.top_n, doc_id, self.X_pool, self.y_pool, self.X_pool_docs)
         
         feature = self.feature_expert.most_informative_feature(self.X_pool[doc_id], self.y_pool[doc_id])
         print 'feature to be added to the model = (%d, %s)' % (feature, self.feature_names[feature])
         print 'label to be added to the model = %d' % self.y_pool[doc_id]
         print
         
         print 'making sure that X_pool and X are indeed the same:'
         print 'label according to y: %d' % self.y[doc_id]
         x_feature = self.feature_expert.most_informative_feature(X[doc_id], self.y[doc_id])
         print 'feature according to X: (%d, %s)' % (x_feature, self.feature_names[x_feature])
         
         ch = raw_input('Press Enter to continue...  ')
         print '-' * 50
         
         if ch == 'n':
             sys.exit(1)
     
     return doc_id
Ejemplo n.º 4
0
    def euclidean(self, X, pool):
        y_IM_probas = self.instance_model.predict_proba(X[pool])
        y_FM_probas = self.feature_model.predict_proba(X[pool])

        dist = np.sum(np.multiply(y_IM_probas - y_FM_probas,
                                  y_IM_probas - y_FM_probas),
                      axis=1)
        # select the document with the largest euclidean distance
        doc_id = np.array(pool)[np.argsort(dist)[-1]]

        if self.Debug:
            print '\n'
            print '=' * 50
            print 'Feature model thus far:'
            print '*' * 50
            print 'Negative features (class 0):'
            print ', '.join(
                self.feature_names[self.feature_model.class0_features])
            print 'Positive features (class 1):'
            print ', '.join(
                self.feature_names[self.feature_model.class1_features])
            print '=' * 50
            print_all_features(self.feature_names, self.feature_expert,
                               self.top_n, doc_id, self.X_pool, self.y_pool,
                               self.X_pool_docs)

            IM_prob = self.instance_model.predict_proba(self.X_pool[doc_id])
            print 'instance model predict_probability: class0 = %0.5f, class1 = %0.5f' % (
                IM_prob[0, 0], IM_prob[0, 1])

            FM_prob = self.feature_model.predict_proba(self.X_pool[doc_id])
            print 'feature model predict_probability:  class0 = %0.5f, class1 = %0.5f' % (
                FM_prob[0, 0], FM_prob[0, 1])

            sorted_dist = np.argsort(dist)
            print 'top 10 Euclidean Distances:'
            print dist[sorted_dist[-10:]]

            print sorted_dist[:10]
            for i in range(1, self.top_n + 1):
                print 'Rank %d: doc#%d, distance=%10.5f' % (
                    i, pool[sorted_dist[-i]], dist[sorted_dist[-i]])

            print 'this doc\'s distance = ', dist[sorted_dist[-1]]

            ch = raw_input('Press Enter to continue...  ')
            print

            if ch == 'n':
                sys.exit(1)

        return doc_id
Ejemplo n.º 5
0
    def choice(self, X, pool, certainClass):
        y_probas = self.model.predict_proba(X[pool])
        if certainClass == 0:
            doc_id = pool[np.argsort((y_probas[:, 0]))[::-1][0]]
        else:
            doc_id = pool[np.argsort((y_probas[:, 1]))[::-1][0]]

        if self.Debug:
            print '\n'
            print '=' * 50
            # print 'Feature model thus far:'
            # print '*' * 50
            # print 'Negative features (class 0):'
            # print ', '.join(self.feature_names[self.model.class0_features])
            # print 'Positive features (class 1):'
            # print ', '.join(self.feature_names[self.model.class1_features])
            # print '=' * 50
            print_all_features(self.feature_names, self.feature_expert,
                               self.top_n, doc_id, self.X_pool, self.y_pool,
                               self.X_pool_docs)
            doc_prob = self.model.predict_proba(self.X_pool[doc_id])
            print 'feature model predict_probability class0 = %0.5f, class1 = %0.5f' % (
                doc_prob[0, 0], doc_prob[0, 1])

            feature = self.feature_expert.most_informative_feature(
                self.X_pool[doc_id], self.y_pool[doc_id])
            print 'feature to be added to the model = (%d, %s)' % (
                feature, self.feature_names[feature])
            print 'label to be added to the model = %d' % self.y_pool[doc_id]
            print

            print 'making sure that X_pool and X are indeed the same:'
            print 'label according to y: %d' % self.y[doc_id]
            x_feature = self.feature_expert.most_informative_feature(
                X[doc_id], self.y[doc_id])
            print 'feature according to X: (%d, %s)' % (
                x_feature, self.feature_names[x_feature])

            ch = raw_input('Press Enter to continue...  ')
            print

            if ch == 'n':
                sys.exit(1)

        return doc_id
Ejemplo n.º 6
0
    def euclidean(self, X, pool):
        y_IM_probas = self.instance_model.predict_proba(X[pool])
        y_FM_probas = self.feature_model.predict_proba(X[pool])

        dist = np.sum(np.multiply(y_IM_probas - y_FM_probas, y_IM_probas - y_FM_probas), axis=1)
        # select the document with the largest euclidean distance
        doc_id = np.array(pool)[np.argsort(dist)[-1]]

        if self.Debug:
            print '\n'
            print '=' * 50
            print 'Feature model thus far:'
            print '*' * 50
            print 'Negative features (class 0):'
            print ', '.join(self.feature_names[self.feature_model.class0_features])
            print 'Positive features (class 1):'
            print ', '.join(self.feature_names[self.feature_model.class1_features])
            print '=' * 50
            print_all_features(self.feature_names, self.feature_expert, self.top_n, doc_id, self.X_pool, self.y_pool, self.X_pool_docs)
            
            IM_prob = self.instance_model.predict_proba(self.X_pool[doc_id])
            print 'instance model predict_probability: class0 = %0.5f, class1 = %0.5f' % (IM_prob[0, 0], IM_prob[0, 1])
            
            FM_prob = self.feature_model.predict_proba(self.X_pool[doc_id])
            print 'feature model predict_probability:  class0 = %0.5f, class1 = %0.5f' % (FM_prob[0, 0], FM_prob[0, 1])

            sorted_dist = np.argsort(dist)
            print 'top 10 Euclidean Distances:'
            print dist[sorted_dist[-10:]]

            print sorted_dist[:10]
            for i in range(1, self.top_n + 1):
                print 'Rank %d: doc#%d, distance=%10.5f' % (i, pool[sorted_dist[-i]], dist[sorted_dist[-i]])
            
            print 'this doc\'s distance = ', dist[sorted_dist[-1]]

            ch = raw_input('Press Enter to continue...  ')
            print 
            
            if ch == 'n':
                sys.exit(1)
            
        return doc_id
Ejemplo n.º 7
0
 # Dump the feature expert's ranked features (with their L1 weights)
 # for both classes, for manual inspection.
 print 'class 0 features (ranked):'
 print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class0_features_by_rank()])
 print '-' * 50
 
 print 'class 1 features (ranked):'
 print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class1_features_by_rank()])
 print '-' * 50
 
 # Walk the pool documents in random order, showing the top_n features
 # of each until the user types 'n'.
 doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))
 top_n = 20
 
 print '\n'
 print '=' * 50
 
 for doc in doc_ids:
     print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool, X_pool_docs)
     
     print '=' * 50
     ch = raw_input('Display the next document? Press Enter to continue or type \'n\' to exit...  ')
     
     if ch == 'n':
         break
 
 # Build a feature-only MNB model from the ranked features and evaluate it
 # on the held-out test set.
 feature_model = FeatureMNB(fe.class0_features_by_rank(), fe.class1_features_by_rank(), \
     num_feat=X_pool.shape[1], smoothing=1e-6, class_prior = [0.5, 0.5], r=100.)
 
 print 'Feature Model(MNB): accu = %f, auc = %f' % evaluate_model(feature_model, X_test, y_test)
 
 # Baseline: L1-regularized logistic regression on the labeled pool.
 logit = linear_model.LogisticRegression(C=args.c, penalty='l1')
 logit.fit(X_pool, y_pool)
 
Ejemplo n.º 8
0
    # Dump the feature expert's ranked class-1 features (with their L1
    # weights) for manual inspection.
    print 'class 1 features (ranked):'
    print ', '.join([
        str((f, feature_names[f], fe.L1_weights[f]))
        for f in fe.class1_features_by_rank()
    ])
    print '-' * 50

    # Walk the pool documents in random order, showing the top_n features
    # of each until the user types 'n'.
    doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))
    top_n = 20

    print '\n'
    print '=' * 50

    for doc in doc_ids:
        print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool,
                           X_pool_docs)

        print '=' * 50
        ch = raw_input(
            'Display the next document? Press Enter to continue or type \'n\' to exit...  '
        )

        if ch == 'n':
            break

    # Build a feature-only MNB model from the ranked features and evaluate
    # it on the held-out test set.
    feature_model = FeatureMNB(fe.class0_features_by_rank(), fe.class1_features_by_rank(), \
        num_feat=X_pool.shape[1], smoothing=1e-6, class_prior = [0.5, 0.5], r=100.)

    print 'Feature Model(MNB): accu = %f, auc = %f' % evaluate_model(
        feature_model, X_test, y_test)
Ejemplo n.º 9
0
    def choice(self, X, pool):
        # if len(annotated_features) is even, choose a positive document
        # if len(annotated_features) is odd, choose a negative document

        label = len(self.annotated_features) % 2  # label for the document
        rank = len(
            self.annotated_features) / 2  # rank of the feature in the list
        feature = self.feature_expert.feature_rank[label][rank]

        # find all documents with next feature present
        X_csc = X.tocsc()
        docs_with_feature = X_csc.getcol(feature).indices

        # find the docs with no annotated features
        doc_with_no_annotated_features = np.nonzero(
            self.docs_feature_count == 0)[0]

        # Find documents without any annotated features but has the next feature
        potential_docs = set(docs_with_feature).intersection(
            set(doc_with_no_annotated_features))

        # Find indices of all labels that is the current label
        correct_label_indices = np.nonzero(self.y == label)[0]

        # Find the intersection between the result from above and the pool
        sampling_pool = list((set(pool).intersection(potential_docs)
                              ).intersection(correct_label_indices))

        if len(sampling_pool) == 0:
            doc_id = None
        else:
            doc_id = self.rgen.permutation(sampling_pool)[0]

        if self.Debug and doc_id != None:
            print 'size of overall pool: %d' % len(pool)
            print 'number of samples with feature present: %d' % len(
                docs_with_feature)
            print 'number of samples with no annotated_features: %d' % len(
                doc_with_no_annotated_features)
            print 'number of samples with label=%d: %d' % (
                label, correct_label_indices.shape[0])
            print 'size of the sampling pool: %d' % len(sampling_pool)

            print 'Annotated Features(%d): ' % len(self.annotated_features)
            print ', '.join([
                str((f, self.feature_names[f]))
                for f in self.annotated_features
            ])
            print 'Cheating Approach: rank = %d, feature# = %d, feature name = %s' % (
                rank, feature, self.feature_names[feature])

            print_all_features(self.feature_names, self.feature_expert,
                               self.top_n, doc_id, self.X_pool, self.y_pool,
                               self.X_pool_docs)

            feature = self.feature_expert.most_informative_feature(
                self.X_pool[doc_id], self.y_pool[doc_id])
            print 'feature to be added to the model = (%d, %s)' % (
                feature, self.feature_names[feature])
            print 'label to be added to the model = %d' % self.y_pool[doc_id]
            print

            print 'making sure that X_pool and X are indeed the same:'
            print 'label according to y: %d' % self.y[doc_id]
            x_feature = self.feature_expert.most_informative_feature(
                X[doc_id], self.y[doc_id])
            print 'feature according to X: (%d, %s)' % (
                x_feature, self.feature_names[x_feature])

            ch = raw_input('Press Enter to continue...  ')
            print '-' * 50

            if ch == 'n':
                sys.exit(1)

        return doc_id
Ejemplo n.º 10
0
    def KLD(self, X, pool):
        '''
        Compute average KL Divergence between instance and feature model,
        the larger the value, the more KLD says that they disagree
        
        avg_KLD(IM, FM) = (KLD(IM, FM) + KLD(FM, IM)) / 2
        '''
        y_IM_probas = self.instance_model.predict_proba(X[pool])
        y_FM_probas = self.feature_model.predict_proba(X[pool])

        log_ratio = np.log(y_IM_probas) - np.log(y_FM_probas)
        KLD_IM_FM = np.sum(y_IM_probas * log_ratio, 1)
        KLD_FM_IM = np.sum(y_FM_probas * -log_ratio, 1)
        KLD = (KLD_IM_FM + KLD_FM_IM) / 2

        num = np.argsort(KLD)[-1]
        doc_id = pool[num]

        if self.Debug:
            print '\n'
            print '=' * 50
            print 'Feature model thus far:'
            print '*' * 50
            print 'Negative features (class 0):'
            print ', '.join(
                self.feature_names[self.feature_model.class0_features])
            print 'Positive features (class 1):'
            print ', '.join(
                self.feature_names[self.feature_model.class1_features])
            print '=' * 50
            print_all_features(self.feature_names, self.feature_expert,
                               self.top_n, doc_id, self.X_pool, self.y_pool,
                               self.X_pool_docs)

            IM_prob = self.instance_model.predict_proba(self.X_pool[doc_id])
            print 'instance model predict_probability: class0 = %0.5f, class1 = %0.5f' % (
                IM_prob[0, 0], IM_prob[0, 1])

            FM_prob = self.feature_model.predict_proba(self.X_pool[doc_id])
            print 'feature model predict_probability:  class0 = %0.5f, class1 = %0.5f' % (
                FM_prob[0, 0], FM_prob[0, 1])

            print 'top 10 KLDs:'
            sorted_KLD = np.argsort(KLD)
            for i in range(1, self.top_n + 1):
                print 'Rank %d: doc#%d, KLD=%10.5f' % (i, pool[sorted_KLD[-i]],
                                                       KLD[sorted_KLD[-i]])

            print 'this doc\'s KLD = ', KLD[num]

            feature = self.feature_expert.most_informative_feature(
                self.X_pool[doc_id], self.y_pool[doc_id])
            print 'feature to be added to the model = (%d, %s)' % (
                feature, self.feature_names[feature])
            print 'label to be added to the model = %d' % self.y_pool[doc_id]
            print

            print 'making sure that X_pool and X are indeed the same:'
            print 'label according to y: %d' % self.y[doc_id]
            x_feature = self.feature_expert.most_informative_feature(
                X[doc_id], self.y[doc_id])
            print 'feature according to X: (%d, %s)' % (
                x_feature, self.feature_names[x_feature])

            ch = raw_input('Press Enter to continue...  ')
            print

            if ch == 'n':
                sys.exit(1)

        return doc_id