logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

################################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data = load_20newsgroups(subset='train', categories=categories)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print

################################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
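    # --- hypothetical completion: the excerpt breaks off inside this dict ---
    # The entries below are illustrative guesses. The ``step__param`` naming
    # follows the Pipeline convention, and max_df / alpha / penalty are real
    # parameters of CountVectorizer and SGDClassifier, but the specific values
    # searched here are not from the original script.
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}

################################################################################
# Hypothetical continuation (not in the original excerpt): exhaustively search
# the parameter grid over the pipeline. GridSearchCV is scikit-learn's grid
# search estimator (sklearn.grid_search in old releases, sklearn.model_selection
# in current ones); the exact reporting attributes varied across old releases.
# Reading the raw documents from ``data.filenames`` matches how the bunch is
# used above.
from sklearn.grid_search import GridSearchCV

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

docs = [open(f).read() for f in data.filenames]
grid_search.fit(docs, data.target)
print "Best score: %0.3f" % grid_search.best_score_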
################################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data_train = load_20newsgroups(subset='train', categories=categories,
                               shuffle=True, rng=42)
data_test = load_20newsgroups(subset='test', categories=categories,
                              shuffle=True, rng=42)

print "%d documents (training set)" % len(data_train.filenames)
print "%d documents (testing set)" % len(data_test.filenames)
print "%d categories" % len(data_train.target_names)
print

# split a training set and a test set
filenames_train, filenames_test = data_train.filenames, data_test.filenames
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
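################################################################################
# Hypothetical continuation (not in the original excerpt): the script breaks
# off right after starting the timer. A sparse vectorization step along these
# lines would typically follow, reusing the CountVectorizer / TfidfTransformer
# combination named earlier; reading the raw text from the filenames matches
# how the bunch is used above in this excerpt.
vectorizer = CountVectorizer()
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(
    vectorizer.fit_transform(open(f).read() for f in filenames_train))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = tfidf.transform(
    vectorizer.transform(open(f).read() for f in filenames_test))
print "done in %fs" % (time() - t0)
print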
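################################################################################
# Hypothetical continuation (not in the original excerpt): train the linear
# classifier named above on the extracted features and evaluate it on the
# held-out test split. SGDClassifier and metrics.classification_report are
# real scikit-learn APIs; the surrounding step is an illustrative sketch.
from sklearn import metrics

clf = SGDClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print metrics.classification_report(y_test, pred,
                                    target_names=data_test.target_names)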