Example #1
0
    def test_iter(self):
        """Check that iSelectKBest agrees with SelectKBest on self.X / self.y.

        The batch selector is fitted on the arrays directly; the iterative
        selector is fitted on a stream of (row, target) pairs together with
        the column count of self.X.  Scores and p-values of the iterative
        selector are compared against pinned values (8 decimals) and then
        against the batch selector (exact equality).
        """
        # Reference result from the batch implementation.
        reference = SelectKBest(f_classif, k=1)
        reference.fit(self.X, self.y)

        # Same data, streamed pair by pair into the iterative implementation.
        pairs = izip(self.X, self.y)
        streamed = iSelectKBest(if_classif, k=1)
        streamed.fit(pairs, self.X.shape[1])

        # Pinned values for the fixture data.
        expected_scores = [0.05882353, 0.03846154, 0.17241379]
        expected_pvalues = [0.83096915, 0.86263944, 0.71828192]
        np.testing.assert_array_almost_equal(expected_scores,
                                             streamed.scores_,
                                             decimal=8)
        np.testing.assert_array_almost_equal(expected_pvalues,
                                             streamed.pvalues_,
                                             decimal=8)

        # Both implementations must produce identical arrays.
        np.testing.assert_array_equal(reference.scores_, streamed.scores_)
        np.testing.assert_array_equal(reference.pvalues_, streamed.pvalues_)
Example #2
0
    def __init__(self, corpus, document_titles,
                 test_corpus, test_corpus_targets, num_test_corpus,
                 num_best_features=1000,
                 num_features=None,
                 tmp_path='complete_similarity'):
        """
        Build a similarity index over the most significant corpus documents.

        The similarity between a document and each document of the corpus is
        the feature created.

        The corpus is filtered for significant features: every document of
        the test corpus is transformed into its similarities against the
        complete corpus, a k-best selector is fitted on those vectors and
        their targets, and only the corpus documents (and their titles) kept
        by the selector are retained.

        Parameters
        ----------
        corpus : The corpus which contains all concepts. E.g. English Wikipedia
                 in TF-IDF space. It is handed to the complete similarity
                 index and iterated again afterwards to pick the selected
                 documents, so a one-shot generator is presumably not
                 suitable.
        document_titles : The names of each concept (doc) in corpus. Its
                          length is passed to the selector as the second
                          argument of fit, so it is expected to hold one
                          title per corpus document.
        test_corpus : The test corpus is used to select features.
                      All documents in this corpus should be classified.
        test_corpus_targets : The target classes of each document in the
                              test corpus.
        num_test_corpus : Number of documents in the test corpus. Only used
                          for the log message.
        num_best_features : Number of features which should be selected for
                            the cESA model. To use all concepts as features,
                            set num_best_features to the size of corpus.
        num_features : The number of features of corpus. If None, it is
                       determined as 1 + utils.get_max_id(corpus).
        tmp_path : Passed as output_prefix to the similarity index built over
                   the complete corpus.

        Attributes set
        --------------
        num_features, selector, similarity_index, document_titles
        """
        if num_features is None:
            logger.info("scanning corpus to determine the number of features")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features

        # Similarity index over the complete corpus; only needed to transform
        # the test corpus for feature selection.
        complete_similarity_index = Similarity(output_prefix=tmp_path,
                                               corpus=corpus,
                                               num_features=self.num_features)

        # Reduce the concept count by feature selection.
        self.selector = iSelectKBest(if_classif, k=num_best_features)

        # Lazy logging arguments: formatting happens only if the record is
        # actually emitted.
        logger.info("Test corpus of %d documents...", num_test_corpus)

        # Generator: each test document is transformed on demand while the
        # selector consumes the stream.
        transformed_test_corpus = (complete_similarity_index[doc]
                                   for doc in test_corpus)

        logger.info("Select best features...")
        X_y = izip(transformed_test_corpus, test_corpus_targets)
        self.selector.fit(X_y, len(document_titles))

        logger.info("Done selecting.")

        # Compute the selection mask once and reuse it for both the documents
        # and their titles.
        support = self.selector.get_support()

        # Reduced similarity index containing only the selected documents.
        selected_documents = [doc
                              for doc, keep in izip(corpus, support)
                              if keep]
        self.similarity_index = MatrixSimilarity(corpus=selected_documents,
                                                 num_features=self.num_features)

        # Titles of the selected documents, in corpus order.
        self.document_titles = DocumentTitles()
        for doc_title, keep in izip(document_titles, support):
            if keep:
                self.document_titles.append(doc_title)

        # Log the selected titles at debug level.
        for title in self.document_titles:
            logger.debug("%s", title)