Example #1
def test_query_cache_has_the_correct_format(kwik_e_mart_app_path):
    # Look up a query expected to be pre-cached for the Kwik-E-Mart test app and
    # check that it round-trips as a ProcessedQuery with the right labels.
    text_prep_pipeline = TextPreparationPipelineFactory.create_from_app_path(
        kwik_e_mart_app_path)
    cache = QueryCache(kwik_e_mart_app_path, text_prep_pipeline.get_hashid())
    key = QueryCache.get_key("store_info", "help", "User manual")
    row_id = cache.key_to_row_id(key)
    assert row_id is not None
    pq = cache.get(row_id)
    assert type(pq) == ProcessedQuery
    assert pq.domain == "store_info"
    assert pq.intent == "help"
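
QueryCache.get_key builds the lookup key from the domain, intent, and raw query text, so the same query text under a different domain or intent presumably maps to a different row. A hypothetical sketch of such a key function follows; the field order, separator, and hash choice are assumptions for illustration, not the real implementation.

# Hypothetical key builder; not QueryCache.get_key's actual implementation.
import hashlib

def make_key(domain, intent, query_text):
    payload = "|".join((domain, intent, query_text))
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()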
Example #2
def test_disk_query_cache(processed_queries):
    environ = {"MM_QUERY_CACHE_IN_MEMORY": "0"}

    with TemporaryDirectory() as tmpdir, patch.dict(os.environ, environ):
        text_prep_pipeline = TextPreparationPipelineFactory.create_from_app_path(
            tmpdir)
        cache = QueryCache(tmpdir, text_prep_pipeline.get_hashid())

        # Verify that there is no in-memory caching
        assert cache.memory_connection is None
        for q in processed_queries:
            key = QueryCache.get_key(q.domain, q.intent, q.query.text)
            cache.put(key, q)
            # Verify that queries are written to disk immediately
            assert get_query_from_disk(tmpdir, key) == (q.domain, q.intent,
                                                        q.query.text)
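
Examples #2 and #6 both rely on a get_query_from_disk helper that is not shown here. Below is a minimal sketch of what such a helper could look like, assuming the cache persists to a SQLite file under the app directory; the file path, table name, and column names are guesses for illustration, not the library's actual schema.

# Hypothetical helper; the cache file location and schema are assumptions.
import os
import sqlite3

def get_query_from_disk(app_path, key):
    db_path = os.path.join(app_path, ".generated", "query_cache.db")  # assumed path
    if not os.path.exists(db_path):
        return None
    connection = sqlite3.connect(db_path)
    try:
        row = connection.execute(
            "SELECT domain, intent, query_text FROM queries WHERE key = ?",  # assumed schema
            (key,),
        ).fetchone()
    finally:
        connection.close()
    return tuple(row) if row else None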
Example #3
def test_tokenize_around_annotations():
    # The tokenizer should skip entity annotation markup ({...|ENTITY}) while
    # keeping token start offsets relative to the original annotated string.
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_default_text_preparation_pipeline()
    )
    sentence = "HELLO {LUCIEN|PERSON_NAME}, HOW ARE YOU?"
    raw_tokens = text_preparation_pipeline.tokenize(sentence)

    expected_raw_tokens = [
        {"start": 0, "text": "HELLO"},
        {"start": 7, "text": "LUCIEN"},
        {"start": 26, "text": ","},
        {"start": 28, "text": "HOW"},
        {"start": 32, "text": "ARE"},
        {"start": 36, "text": "YOU?"},
    ]
    assert raw_tokens == expected_raw_tokens
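
The expected start offsets are worth unpacking: they index into the original annotated string rather than a cleaned copy, which is why the offset jumps from 7 (inside the annotation) to 26 (the comma just past the closing brace). A standalone check of that reading:

# Standalone check that the expected offsets index into the original
# annotated sentence, markup included.
sentence = "HELLO {LUCIEN|PERSON_NAME}, HOW ARE YOU?"
assert sentence[0:5] == "HELLO"
assert sentence[7:13] == "LUCIEN"
assert sentence[26] == ","
assert sentence[36:40] == "YOU?"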
Example #4
def test_create_text_preparation_pipeline():
    # Components given by name should be instantiated; regex normalization rules
    # should be placed ahead of the named normalizers, and stemmer=None should
    # fall back to the language default (EnglishNLTKStemmer for English).
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            language=ENGLISH_LANGUAGE_CODE,
            preprocessors=[],
            regex_norm_rules=[{"pattern": ".*", "replacement": "cisco"}],
            normalizers=["Lowercase", "ASCIIFold"],
            tokenizer="WhiteSpaceTokenizer",
            stemmer=None,
        )
    )

    assert text_preparation_pipeline.language == ENGLISH_LANGUAGE_CODE
    assert isinstance(text_preparation_pipeline.preprocessors[0], NoOpPreprocessor)
    assert isinstance(text_preparation_pipeline.normalizers[0], RegexNormalizerRule)
    assert isinstance(text_preparation_pipeline.normalizers[1], Lowercase)
    assert isinstance(text_preparation_pipeline.normalizers[2], ASCIIFold)
    assert isinstance(text_preparation_pipeline.tokenizer, WhiteSpaceTokenizer)
    assert isinstance(text_preparation_pipeline.stemmer, EnglishNLTKStemmer)
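
A minimal usage sketch of the pipeline created above, assuming only that the tokenize() call from Example #3 works on any pipeline instance; the whitespace-collapsing regex rule is made up for illustration.

# Usage sketch; the regex rule is hypothetical and only the tokenize() call
# is taken from Example #3.
pipeline = TextPreparationPipelineFactory.create_text_preparation_pipeline(
    language=ENGLISH_LANGUAGE_CODE,
    preprocessors=[],
    regex_norm_rules=[{"pattern": r"\s+", "replacement": " "}],
    normalizers=["Lowercase"],
    tokenizer="WhiteSpaceTokenizer",
    stemmer=None,
)
tokens = pipeline.tokenize("Hello   World")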
Example #5
def test_construct_pipeline_components_valid_input():
    # Pipeline components may be given as a mix of registered names and instances.
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=("NoOpPreprocessor", NoOpPreprocessor()),
            normalizers=(
                "RemoveBeginningSpace",
                NoOpNormalizer(),
                "ReplaceSpacesWithSpace",
                Lowercase(),
            ),
            tokenizer="SpacyTokenizer",
            stemmer=None,
        ))

    assert text_preparation_pipeline.language == ENGLISH_LANGUAGE_CODE
    for preprocessor in text_preparation_pipeline.preprocessors:
        assert isinstance(preprocessor, Preprocessor)
    for normalizer in text_preparation_pipeline.normalizers:
        assert isinstance(normalizer, Normalizer)
    assert isinstance(text_preparation_pipeline.tokenizer, SpacyTokenizer)
    assert isinstance(text_preparation_pipeline.stemmer, EnglishNLTKStemmer)
Example #6
def test_memory_query_cache(processed_queries):
    environ = {
        "MM_QUERY_CACHE_IN_MEMORY": "1",
        "MM_QUERY_CACHE_WRITE_SIZE": "10"
    }

    with TemporaryDirectory() as tmpdir, patch.dict(os.environ, environ):
        text_prep_pipeline = TextPreparationPipelineFactory.create_from_app_path(
            tmpdir)
        cache = QueryCache(tmpdir, text_prep_pipeline.get_hashid())

        # Verify that there is in-memory caching
        assert cache.memory_connection is not None
        for q in processed_queries[:9]:
            key = QueryCache.get_key(q.domain, q.intent, q.query.text)
            cache.put(key, q)
            # Verify that the first 9 queries are not written to disk
            assert not get_query_from_disk(tmpdir, key)

        # Verify that the 10th query triggers a flush to disk
        q = processed_queries[9]
        key = QueryCache.get_key(q.domain, q.intent, q.query.text)
        cache.put(key, q)
        for q in processed_queries[:10]:
            key = QueryCache.get_key(q.domain, q.intent, q.query.text)
            assert get_query_from_disk(tmpdir, key) == (q.domain, q.intent,
                                                        q.query.text)

        # Verify that garbage collection of the cache object triggers a flush to disk
        for q in processed_queries[10:15]:
            key = QueryCache.get_key(q.domain, q.intent, q.query.text)
            cache.put(key, q)
            assert not get_query_from_disk(tmpdir, key)
        cache = None
        for q in processed_queries[10:15]:
            key = QueryCache.get_key(q.domain, q.intent, q.query.text)
            assert get_query_from_disk(tmpdir, key)
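
The test above pins down the write policy: with MM_QUERY_CACHE_WRITE_SIZE set to 10, puts accumulate in memory and only reach the disk once ten of them are pending, and dropping the last reference to the cache (cache = None) flushes whatever remains. A generic, self-contained sketch of that batched-write pattern, not the library's actual implementation:

# Generic batched-write pattern; not the actual QueryCache implementation.
class BatchedCache:
    def __init__(self, flush_fn, batch_size=10):
        self._flush_fn = flush_fn      # callable that persists a dict of entries
        self._batch_size = batch_size
        self._pending = {}

    def put(self, key, value):
        self._pending[key] = value
        if len(self._pending) >= self._batch_size:
            self.flush()

    def flush(self):
        if self._pending:
            self._flush_fn(self._pending)
            self._pending = {}

    def __del__(self):
        # Mirrors the "cache = None" step in the test: finalization flushes
        # whatever has not been written yet.
        self.flush()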
Example #7
def test_construct_pipeline_components_invalid_input():
    # Each call below passes a component name or instance from the wrong category
    # (e.g. a tokenizer where a normalizer is expected), which should raise a
    # TypeError.
    with pytest.raises(TypeError):
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=None,
            normalizers=("SpacyTokenizer"),
            tokenizer=None,
            stemmer=None,
        )

    with pytest.raises(TypeError):
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=None,
            normalizers=(NoOpTokenizer(), "NoOpTokenizer"),
            tokenizer=None,
            stemmer=None,
        )

    with pytest.raises(TypeError):
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=None,
            normalizers=None,
            tokenizer="NoOpNormalizer",
            stemmer=None,
        )

    with pytest.raises(TypeError):
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=None,
            normalizers=None,
            tokenizer=None,
            stemmer="NoOpPreprocessor",
        )

    with pytest.raises(TypeError):
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=("NoOpNormalizer"),
            normalizers=None,
            tokenizer=None,
            stemmer=None,
        )

    with pytest.raises(TypeError):
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            preprocessors=(NoOpNormalizer(), NoOpTokenizer()),
            normalizers=None,
            tokenizer=None,
            stemmer=None,
        )
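
Every failing call above passes a component name or instance from the wrong category. A self-contained sketch of the name-or-instance validation this behavior implies, using stand-in classes rather than the library's real ones:

# Self-contained sketch of name-or-instance validation; the classes and
# registry below are stand-ins, not the library's actual types.
class Normalizer:
    pass

class Lowercase(Normalizer):
    pass

NORMALIZER_REGISTRY = {"Lowercase": Lowercase}

def resolve_normalizer(candidate):
    if isinstance(candidate, str):
        if candidate not in NORMALIZER_REGISTRY:
            raise TypeError(f"{candidate!r} is not a registered normalizer")
        return NORMALIZER_REGISTRY[candidate]()
    if isinstance(candidate, Normalizer):
        return candidate
    raise TypeError(f"{candidate!r} is not a Normalizer name or instance")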
Example #8
def test_text_preparation_pipeline_hash():
    # get_hashid() should change whenever any part of the pipeline configuration
    # changes: normalizer order, regex rules, or the tokenizer type.
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            language=ENGLISH_LANGUAGE_CODE,
            preprocessors=["NoOpPreprocessor"],
            regex_norm_rules=[{
                "pattern": ".*",
                "replacement": "cisco"
            }],
            normalizers=["Lowercase", "ASCIIFold"],
            tokenizer="WhiteSpaceTokenizer",
            stemmer=None,
        ))

    original_hash = text_preparation_pipeline.get_hashid()

    # Change order of normalizers
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            language=ENGLISH_LANGUAGE_CODE,
            preprocessors=["NoOpPreprocessor"],
            regex_norm_rules=[{
                "pattern": ".*",
                "replacement": "cisco"
            }],
            normalizers=["ASCIIFold", "Lowercase"],
            tokenizer="WhiteSpaceTokenizer",
            stemmer=None,
        ))
    order_changed_hash = text_preparation_pipeline.get_hashid()

    # Change RegexNormalizer pattern
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            language=ENGLISH_LANGUAGE_CODE,
            preprocessors=["NoOpPreprocessor"],
            regex_norm_rules=[{
                "pattern": ".*",
                "replacement": "cisc0"
            }],
            normalizers=["ASCIIFold", "Lowercase"],
            tokenizer="WhiteSpaceTokenizer",
            stemmer=None,
        ))

    regex_changed_hash = text_preparation_pipeline.get_hashid()

    # Change Tokenizer type
    text_preparation_pipeline = (
        TextPreparationPipelineFactory.create_text_preparation_pipeline(
            language=ENGLISH_LANGUAGE_CODE,
            preprocessors=["NoOpPreprocessor"],
            regex_norm_rules=[{
                "pattern": ".*",
                "replacement": "cisco"
            }],
            normalizers=["ASCIIFold", "Lowercase"],
            tokenizer="LetterTokenizer",
            stemmer=None,
        ))

    tokenizer_changed_hash = text_preparation_pipeline.get_hashid()

    assert original_hash != order_changed_hash
    assert original_hash != regex_changed_hash
    assert original_hash != tokenizer_changed_hash
    assert tokenizer_changed_hash != regex_changed_hash
    assert tokenizer_changed_hash != order_changed_hash
    assert regex_changed_hash != order_changed_hash
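
Read together with Examples #1, #2, and #6, where get_hashid() is passed to the QueryCache constructor, these inequalities suggest the hash is what ties cached queries to one specific pipeline configuration: change any component and previously cached entries are keyed under a different hash, so they are effectively invalidated.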