import numpy as np

from nose import SkipTest
from nose.tools import assert_equal, assert_true

from sklearn import datasets


def test_20news():
    try:
        data = datasets.fetch_20newsgroups(subset='all',
                                           download_if_missing=False,
                                           shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)

    # Check that the filenames are available too
    assert_true(data.filenames[0].endswith(
        "20news_home/20news-bydate-test/talk.politics.mideast/76560"))
import numpy as np

import nose.tools

from sklearn import datasets


def test_20news():
    try:
        data = datasets.fetch_20newsgroups(subset='all',
                                           download_if_missing=False,
                                           shuffle=False)
    except IOError:
        # Data not there: nothing to test
        return

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    nose.tools.assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    nose.tools.assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    nose.tools.assert_true(entry1 == entry2)
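Both variants above bail out when the corpus is absent. A minimal sketch of pre-fetching the data so they can run; this assumes network access and writes into the default scikit-learn data home:

from sklearn.datasets import fetch_20newsgroups

# One-off download into the local cache; afterwards, calls with
# download_if_missing=False (as in the tests above) will find the data.
fetch_20newsgroups(subset='all', download_if_missing=True)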
def _load_docs(self, training_set):
    if training_set == "newsgroup":
        self.info("extract the 20 newsgroup dataset")
        wide_dataset = fetch_20newsgroups()
        docs = [open(f).read() for f in wide_dataset.filenames]
    elif training_set == "docs":
        docs = [res['content'] for res in db.resources.find(
            {'blacklisted': False, 'processed': True})]
    else:
        # Avoid returning an unbound name for unknown training sets
        raise ValueError("unknown training set: %r" % training_set)
    return docs
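Worth noting: fetch_20newsgroups already returns the text of each posting, so the "newsgroup" branch can avoid re-reading files from disk. A minimal equivalent sketch:

from sklearn.datasets import fetch_20newsgroups

# .data holds the raw text of every posting, in the same order as
# .filenames, so the open()/read() round-trip above can be skipped.
docs = fetch_20newsgroups().data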
from sklearn.datasets import fetch_20newsgroups

###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories if categories else "all"

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target
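A minimal sketch of the feature-extraction step that typically follows this preamble; the vectorizer and its parameters are illustrative choices, not prescribed by the snippet:

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary on the training split only, then reuse it on the test
# split so both matrices share the same feature columns.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)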
from time import time

from sklearn.datasets import fetch_20newsgroups

################################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories if categories else "all"

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)

print "%d documents (training set)" % len(data_train.filenames)
print "%d documents (testing set)" % len(data_test.filenames)
print "%d categories" % len(data_train.target_names)
print

# split a training set and a test set
filenames_train, filenames_test = data_train.filenames, data_test.filenames
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
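The snippet stops right after starting the timer; a sketch of how the extraction and timing report might continue, assuming a TfidfVectorizer as the sparse vectorizer:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5)
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape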
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

################################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data = fetch_20newsgroups(subset='train', categories=categories)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print

################################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
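The snippet breaks off inside the parameters dict. A minimal sketch of how it might be completed and wired into a grid search; the grid keys match the pipeline step names above, but the value ranges are illustrative assumptions:

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases

parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(data.data, data.target)
print "Best score: %0.3f" % grid_search.best_score_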