Code Example #1
from mrex.datasets import load_files

# Fixtures (test_category_dir_1, test_category_dir_2, load_files_root) come
# from the surrounding test suite: they create one file under two category
# folders before each test.
def test_load_files_wo_load_content(test_category_dir_1, test_category_dir_2,
                                    load_files_root):
    res = load_files(load_files_root, load_content=False)
    assert len(res.filenames) == 1
    assert len(res.target_names) == 2
    assert res.DESCR is None
    assert res.get('data') is None  # no 'data' key when load_content=False
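For context, load_files treats each subfolder of the root as one category. A minimal, self-contained sketch (hypothetical folder and file names; assumes mrex mirrors scikit-learn's load_files API) that reproduces the layout the fixtures build:

import tempfile
from pathlib import Path

from mrex.datasets import load_files

# Hypothetical layout: two category folders, a single file overall.
root = Path(tempfile.mkdtemp())
(root / "category_a").mkdir()
(root / "category_b").mkdir()
(root / "category_a" / "sample.txt").write_bytes(b"Hello World!\n")

# load_content=False collects only paths and labels; the returned Bunch
# has no 'data' key, so res.get('data') yields None.
res = load_files(str(root), load_content=False)
print(len(res.filenames))   # 1
print(res.target_names)     # ['category_a', 'category_b']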
Code Example #2
import pytest

from mrex.datasets import load_files
from mrex.utils import IS_PYPY  # assumed location; mirrors sklearn.utils.IS_PYPY


def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    res = load_files(load_files_root)
    assert len(res.filenames) == 1
    assert len(res.target_names) == 2
    assert res.DESCR is None
    # without an encoding argument, file contents are returned as bytes
    assert res.data == [b"Hello World!\n"]
Code Example #3
import os

import pytest

from mrex.datasets import load_files
from mrex.utils import IS_PYPY  # assumed location; mirrors sklearn.utils.IS_PYPY


def test_load_files_w_categories_desc_and_encoding(test_category_dir_1,
                                                   test_category_dir_2,
                                                   load_files_root):
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    # take the last path component as the category name (assumes POSIX '/'
    # separators, as in the original test)
    category = os.path.abspath(test_category_dir_1).split('/').pop()
    res = load_files(load_files_root,
                     description="test",
                     categories=category,
                     encoding="utf-8")
    assert len(res.filenames) == 1
    assert len(res.target_names) == 1  # only the requested category remains
    assert res.DESCR == "test"
    assert res.data == ["Hello World!\n"]  # decoded to str via encoding
Code Example #4
from mrex.datasets import load_files


def test_default_empty_load_files(load_files_root):
    # with no category folders under the root, everything comes back empty
    res = load_files(load_files_root)
    assert len(res.filenames) == 0
    assert len(res.target_names) == 0
    assert res.DESCR is None
Code Example #5
"""
# Author: Olivier Grisel <*****@*****.**>
# License: Simplified BSD

import sys

from mrex.feature_extraction.text import TfidfVectorizer
from mrex.linear_model import Perceptron
from mrex.pipeline import Pipeline
from mrex.datasets import load_files
from mrex.model_selection import train_test_split
from mrex import metrics

# The training data folder must be passed as the first argument
languages_data_folder = sys.argv[1]
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                          dataset.target,
                                                          test_size=0.5)

# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# characters instead of word tokens
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             analyzer='char',
                             use_idf=False)

# TASK: Build a vectorizer / classifier pipeline using the previous analyzer;
# the pipeline instance should be stored in a variable named clf
clf = Pipeline([
    # completed from the truncated snippet using the two names imported above
    ('vec', vectorizer),
    ('clf', Perceptron()),
])
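The snippet stops at the pipeline definition. A plausible continuation in the spirit of the exercise, using only names already in scope above (clf, docs_train, docs_test, y_train, y_test, dataset, metrics):

# Fit the vectorizer / classifier pipeline on the training documents.
clf.fit(docs_train, y_train)

# Predict on the held-out half and print per-class precision/recall/F1.
y_predicted = clf.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))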
Code Example #6
import sys

from mrex.model_selection import GridSearchCV
from mrex.datasets import load_files
from mrex.model_selection import train_test_split
from mrex import metrics


if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as the first argument
    movie_reviews_data_folder = sys.argv[1]
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters

    # TASK: print the cross-validated scores for each parameter set
    # explored by the grid search
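The TASK comments above are left unimplemented. A minimal sketch, assuming mrex mirrors scikit-learn's API (TfidfVectorizer, Pipeline, and LinearSVC are extra imports not shown in the snippet, with assumed module paths), indented to sit inside the __main__ block:

    from mrex.feature_extraction.text import TfidfVectorizer
    from mrex.pipeline import Pipeline
    from mrex.svm import LinearSVC  # assumed path; mirrors sklearn.svm.LinearSVC

    # vectorizer / classifier pipeline; min_df / max_df filter out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # grid search over unigrams vs. unigrams + bigrams
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    # mean cross-validated score for each parameter set explored
    for mean, params in zip(grid_search.cv_results_['mean_test_score'],
                            grid_search.cv_results_['params']):
        print("%0.3f for %r" % (mean, params))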