print

################################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data = load_files('20news-18828', categories=categories)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print

################################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
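    # increase processing time in a combinatorial way
    # (the grid below is a plausible reconstruction, patterned on the grid
    # search text feature extraction example of the same scikits.learn era;
    # the exact parameter values are assumptions, not part of the original)
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
}

# find the best parameters for both the feature extraction and the classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(data.data, data.target)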
import sys

from scikits.learn.feature_extraction.text import CountVectorizer
from scikits.learn.feature_extraction.text import TfidfTransformer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.datasets import load_files
from scikits.learn import metrics

#
# The real code starts here
#

# the training data folder must be passed as first argument
movie_reviews_data_folder = sys.argv[1]
dataset = load_files(movie_reviews_data_folder, shuffle=True, random_state=42)

# split the dataset into a training and a test set:
n_samples_total = dataset.filenames.shape[0]
split = (n_samples_total * 3) / 4

docs_train = dataset.data[:split]
docs_test = dataset.data[split:]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
# TODO
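# A minimal sketch of one way to fill in the TODO above, assuming the
# scikits.learn 0.8-era API; the C value and the evaluation step are
# illustrative choices, not prescribed by the exercise:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

# fit the pipeline on the training documents and evaluate on the test set
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)

print metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names)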
"""Simple preprocessor that should be available by default""" def preprocess(self, unicode_content): return unicode_content.lower() def __repr__(self): return "LowerCasePreprocessor()" # # The real code starts here # # the training data folder must be passed as first argument languages_data_folder = sys.argv[1] dataset = load_files(languages_data_folder) # split the dataset in training and test set: n_samples_total = dataset.filenames.shape[0] docs_train = [open(f).read() for f in dataset.filenames[:n_samples_total/2]] docs_test = [open(f).read() for f in dataset.filenames[n_samples_total/2:]] y_train = dataset.target[:n_samples_total/2] y_test = dataset.target[n_samples_total/2:] # Build a an analyzer that split strings into sequence of 1 to 3 characters
import sys

from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.datasets import load_files
from scikits.learn import metrics

#
# The real code starts here
#

# the training data folder must be passed as first argument
movie_reviews_data_folder = sys.argv[1]
dataset = load_files(movie_reviews_data_folder)

# split the dataset into a training and a test set:
n_samples_total = dataset.filenames.shape[0]
split = (n_samples_total * 3) / 4

docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
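    # A plausible completion of the truncated Pipeline, assuming the
    # scikits.learn 0.8-era API; the classifier settings are illustrative:
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

# fit the pipeline on the training documents and evaluate on the test set
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)

print metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names)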
"""Simple preprocessor that should be available by default""" def preprocess(self, unicode_content): return unicode_content.lower() def __repr__(self): return "LowerCasePreprocessor()" # # The real code starts here # # the training data folder must be passed as first argument languages_data_folder = sys.argv[1] dataset = load_files(languages_data_folder, shuffle=True, random_state=42) # split the dataset in training and test set: n_samples_total = dataset.filenames.shape[0] split = n_samples_total / 2 docs_train = dataset.data[:split] docs_test = dataset.data[split:] y_train = dataset.target[:split] y_test = dataset.target[split:] # Build a an analyzer that split strings into sequence of 1 to 3 characters # after using the previous preprocessor analyzer = CharNGramAnalyzer(
################################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data = load_files('20news-18828', categories=categories,
                  shuffle=True, random_state=42)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print

# split a training set and a test set
filenames = data.filenames
y = data.target

n = filenames.shape[0]
filenames_train, filenames_test = filenames[:-n/2], filenames[-n/2:]
y_train, y_test = y[:-n/2], y[-n/2:]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
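# A plausible continuation, following the 20 newsgroups document
# classification example of the same era, where Vectorizer combined count
# and tf-idf extraction; the exact calls below are an assumption:
X_train = vectorizer.fit_transform((open(f).read() for f in filenames_train))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape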