Example 1
def test_character_tokenizer():
    tokenizer = CharacterTokenizer()
    assert tokenizer.tokenize("fox can't") == [
        "fox ",
        "ox c",
        "x ca",
        " can",
        "can'",
        "an't",
    ]
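
The expected output is just a 4-character sliding window over the input: "fox can't" is 9 characters long, so there are 9 - 4 + 1 = 6 windows. A minimal pure-Python sketch of that behaviour, for illustration only (not the library's implementation):

def char_ngrams(text, window_size=4):
    # Slide a fixed-size window over the string, one character at a time.
    return [text[i:i + window_size] for i in range(len(text) - window_size + 1)]

assert char_ngrams("fox can't") == ["fox ", "ox c", "x ca", " can", "can'", "an't"]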
Example 2
    assert tokenizer.tokenize("fox can't") == [
        "fox ",
        "ox c",
        "x ca",
        " can",
        "can'",
        "an't",
    ]


@hypothesis.given(st.text())
@pytest.mark.parametrize(
    "tokenizer",
    [
        RegexpTokenizer(),
        CharacterTokenizer(),
        UnicodeSegmentTokenizer(),
        VTextTokenizer("en"),
        VTextTokenizer("fr"),
    ],
    ids=_pytest_ids,
)
def test_tokenize_edge_cases(tokenizer, txt):
    tokenizer.tokenize(txt)


@pytest.mark.parametrize(
    "tokenizer, expected",
    [
        (RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}),
Example 3
def test_pickle_non_default_params():
    # check that pickling correctly stores estimator parameters
    est = CharacterTokenizer(window_size=10)
    est2 = pickle.loads(pickle.dumps(est))
    assert est2.get_params()["window_size"] == 10
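
The round trip works because the estimator exposes its constructor arguments through get_params(), scikit-learn style. A minimal sketch of the pattern being exercised (ToyTokenizer is hypothetical, not part of the library):

import pickle

class ToyTokenizer:
    # Hypothetical stand-in: keeps its constructor argument as an attribute,
    # which is all pickle needs to restore the object.
    def __init__(self, window_size=4):
        self.window_size = window_size

    def get_params(self):
        return {"window_size": self.window_size}

est = ToyTokenizer(window_size=10)
est2 = pickle.loads(pickle.dumps(est))
assert est2.get_params()["window_size"] == 10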
Example 4
    db = [
        (r"Python re.findall(r'\b\w\w+\b', ...)", pyre_tokenizer),
        (
            r"RegexpTokenizer(r'\b\w\w+\b')",
            RegexpTokenizer(pattern=token_regexp).tokenize,
        ),
        (
            "UnicodeSegmentTokenizer(word_bounds=False)",
            UnicodeSegmentTokenizer(word_bounds=False).tokenize,
        ),
        (
            "UnicodeSegmentTokenizer(word_bounds=True)",
            UnicodeSegmentTokenizer(word_bounds=True).tokenize,
        ),
        ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
        ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
    ]

    if sacremoses is not None:
        db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))
    if spacy is not None:
        from spacy.lang.en import English

        db.append(("Spacy en", English().tokenizer))

    if blingfire is not None:
        db.append(
            ("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

    for label, func in db:
        t0 = time()
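
The excerpt stops right after the timer starts. A plausible rest of the benchmark loop, shown only as a sketch: `data` (the list of input documents) is assumed to be defined earlier in the script, and `from time import time` is assumed for the `time()` call.

for label, func in db:
    t0 = time()
    n_tokens = 0
    for doc in data:  # `data` is assumed: a list of input strings
        n_tokens += len(func(doc))
    dt = time() - t0
    print(f"{label:<45} {dt:.3f}s ({n_tokens / dt:.0f} tokens/s)")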
Example 5
    assert tokenizer.tokenize("fox can't") == [
        "fox ",
        "ox c",
        "x ca",
        " can",
        "can'",
        "an't",
    ]


@hypothesis.given(st.text())
@pytest.mark.parametrize(
    "tokenizer",
    [
        RegexpTokenizer(),
        CharacterTokenizer(),
        UnicodeSegmentTokenizer(),
        VTextTokenizer("en"),
        VTextTokenizer("fr"),
    ],
    ids=_pytest_ids,
)
def test_tokenize_edge_cases(tokenizer, txt):
    tokenizer.tokenize(txt)
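
If this property test ever surfaces a crashing input, it can be pinned with hypothesis.example so it is re-checked on every run. A sketch (the pinned string is made up; it reuses the hypothesis/st/CharacterTokenizer imports from the test module):

@hypothesis.given(st.text())
@hypothesis.example("can't\x00")  # hypothetical regression input, now always re-tested
def test_character_tokenizer_never_raises(txt):
    CharacterTokenizer().tokenize(txt)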


@pytest.mark.parametrize(
    "tokenizer, expected",
    [
        (RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}),
        (CharacterTokenizer(), {"window_size": 4}),
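
The table presumably feeds a test comparing each tokenizer's defaults against the expected dict; the excerpt cuts off before it, but it would look roughly like this (a sketch, not the original test):

def test_get_params(tokenizer, expected):
    # Each tokenizer should report exactly its default constructor parameters.
    assert tokenizer.get_params() == expected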