def test_iter(self):
    """Check that the iterative selector reproduces sklearn's SelectKBest.

    Fits both implementations on the same fixture data and asserts that
    scores and p-values match known values and each other.
    """
    # Reference result from the stock sklearn implementation.
    baseline = SelectKBest(f_classif, k=1)
    baseline.fit(self.X, self.y)

    # The iterative implementation consumes (sample, target) pairs lazily
    # and is told the feature count up front instead of inferring it.
    paired = izip(self.X, self.y)
    incremental = iSelectKBest(if_classif, k=1)
    incremental.fit(paired, self.X.shape[1])

    # Scores and p-values must match the precomputed values to 8 decimals...
    np.testing.assert_array_almost_equal(
        [0.05882353, 0.03846154, 0.17241379], incremental.scores_, 8)
    np.testing.assert_array_almost_equal(
        [0.83096915, 0.86263944, 0.71828192], incremental.pvalues_, 8)

    # ...and agree exactly with the sklearn reference selector.
    np.testing.assert_array_equal(baseline.scores_, incremental.scores_)
    np.testing.assert_array_equal(baseline.pvalues_, incremental.pvalues_)
def __init__(self, corpus, document_titles, test_corpus, test_corpus_targets, num_test_corpus, num_best_features = 1000, num_features = None, tmp_path = 'complete_similarity'):
    """
    Build a cESA-style model: each feature is the similarity between a
    document and one concept (document) of the corpus, with the concept
    set reduced by univariate feature selection on a labeled test corpus.

    Parameters
    ----------
    corpus :
        The corpus which contains all concepts. E.g. English Wikipedia
        in TF-IDF space.
    test_corpus :
        The test corpus is used to select features. All documents in
        this corpus should be classified.
    test_corpus_targets :
        The target classes of each document in the test corpus.
    num_test_corpus :
        Number of documents in the test corpus.
    document_titles :
        The names of each concept (doc) in corpus.
    num_features :
        The number of features of corpus. If None, the corpus is scanned
        to determine it.
    num_best_features :
        Number of features which should be selected for the cESA model.
        If one wants to use all concepts as features then she has to set
        num_best_features to the size of corpus.
    """
    if num_features is None:
        # One full pass over the corpus to find the highest term id;
        # ids are 0-based, hence the +1.
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features

    # create similarity index of complete corpus (shard files are written
    # under tmp_path by the Similarity class)
    complete_similarity_index = Similarity(output_prefix = tmp_path, corpus = corpus, num_features = self.num_features)

    # reduce concept count by feature selection
    self.selector = iSelectKBest(if_classif, k = num_best_features)

    # transform each document of test_corpus into similarity space.
    # NOTE: this is a one-shot generator; it is consumed exactly once by
    # selector.fit() below.
    logger.info("Test corpus of %d documents..." % num_test_corpus)
    transformed_test_corpus = (complete_similarity_index[doc] for doc in test_corpus)
    logger.info("Select best features...")
    # Pair each transformed document with its target class lazily; the
    # selector is given the feature count explicitly (one per concept).
    X_y = izip(transformed_test_corpus, test_corpus_targets)
    self.selector.fit(X_y, len(document_titles))
    logger.info("Done selecting.")

    # reduce similarity index: keep only the concepts whose boolean mask
    # from get_support() is True, then rebuild an in-memory index over them
    selected_documents = [doc for doc, mask in itertools.izip(corpus, self.selector.get_support()) if mask]
    self.similarity_index = MatrixSimilarity(corpus = selected_documents, num_features = self.num_features)

    # reduce document titles with the same support mask so titles stay
    # aligned with the retained concepts
    self.document_titles = DocumentTitles()
    for doc_title, mask in itertools.izip(document_titles, self.selector.get_support()):
        if mask:
            self.document_titles.append(doc_title)

    # print doc titles (debug level only)
    for title in self.document_titles:
        logger.debug("%s" % title)