# Imports assumed from the enclosing test module (mirroring scikit-learn's
# layout; IS_PYPY and the directory fixtures are defined elsewhere in the
# test suite):
import os

import pytest

from mrex.datasets import load_files
from mrex.utils import IS_PYPY


def test_load_files_wo_load_content(test_category_dir_1, test_category_dir_2,
                                    load_files_root):
    res = load_files(load_files_root, load_content=False)
    assert len(res.filenames) == 1
    assert len(res.target_names) == 2
    assert res.DESCR is None
    assert res.get('data') is None
def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    res = load_files(load_files_root)
    assert len(res.filenames) == 1
    assert len(res.target_names) == 2
    assert res.DESCR is None
    assert res.data == [b"Hello World!\n"]
def test_load_files_w_categories_desc_and_encoding(test_category_dir_1,
                                                   test_category_dir_2,
                                                   load_files_root):
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    category = os.path.abspath(test_category_dir_1).split('/').pop()
    res = load_files(load_files_root, description="test",
                     categories=category, encoding="utf-8")
    assert len(res.filenames) == 1
    assert len(res.target_names) == 1
    assert res.DESCR == "test"
    assert res.data == ["Hello World!\n"]
def test_default_empty_load_files(load_files_root):
    res = load_files(load_files_root)
    assert len(res.filenames) == 0
    assert len(res.target_names) == 0
    assert res.DESCR is None
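# For context on the fixtures above: load_files expects a container folder
# with one subfolder per category, each holding that category's documents.
# A hypothetical layout matching the two-category fixtures would be:
#
#     load_files_root/
#         category_dir_1/   -> target 0
#             sample.txt
#         category_dir_2/   -> target 1
#             sample.txt
#
# Subfolder names become res.target_names; file contents become res.data.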
""" # Author: Olivier Grisel <*****@*****.**> # License: Simplified BSD import sys from mrex.feature_extraction.text import TfidfVectorizer from mrex.linear_model import Perceptron from mrex.pipeline import Pipeline from mrex.datasets import load_files from mrex.model_selection import train_test_split from mrex import metrics # The training data folder must be passed as first argument languages_data_folder = sys.argv[1] dataset = load_files(languages_data_folder) # Split the dataset in training and test set: docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5) # TASK: Build a vectorizer that splits strings into sequence of 1 to 3 # characters instead of word tokens vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', use_idf=False) # TASK: Build a vectorizer / classifier pipeline using the previous analyzer # the pipeline instance should stored in a variable named clf clf = Pipeline([
import sys

from mrex.model_selection import GridSearchCV
from mrex.datasets import load_files
from mrex.model_selection import train_test_split
from mrex import metrics

if __name__ == "__main__":
    # NOTE: we put the following in an 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel,
    # which is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as first argument
    movie_reviews_data_folder = sys.argv[1]
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset into training and test sets:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters

    # TASK: print the cross-validated scores for each parameter set
    # explored by the grid search
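    # A minimal sketch of one way to complete the TASKs above (assumed
    # components: a TfidfVectorizer with document-frequency cutoffs and a
    # LinearSVC classifier, assuming mrex mirrors scikit-learn's module
    # layout; not necessarily the exercise's intended solution):
    from mrex.feature_extraction.text import TfidfVectorizer
    from mrex.pipeline import Pipeline
    from mrex.svm import LinearSVC

    pipeline = Pipeline([
        # min_df/max_df filter out tokens that are too rare or too frequent
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # Compare unigrams against unigrams + bigrams via grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    # Print the mean cross-validated score of each explored parameter set
    for params, mean_score in zip(grid_search.cv_results_['params'],
                                  grid_search.cv_results_['mean_test_score']):
        print("%s: %0.3f" % (params, mean_score))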