Exemple #1
0
def test_complex_tokens():
    """Expect six annotations from patterns built on nested tokens.

    The ``feet`` token refers to the ``sep``, ``feet_symbol`` and
    ``feet_text`` tokens by name, and ``feet_symbol`` in turn refers to
    the two quote tokens.
    """
    sample = '''
    40ft
    40'ft
    40"ft
    20ft
    20 feet
    20'feet
    '''

    token_table = {
        '"': '"',
        "'": "'",
        'feet_symbol': '/({"}|{\'})/',
        'feet_text': '/f(ee)?t/',
        'sep': '/ ?/',
        'feet': '/{sep}{feet_symbol}?{sep}{feet_text}/',
    }
    container_annotator = Annotator(
        'Container',
        tokens=token_table,
        patterns=('40{feet}', '20{feet}'),
    )

    matches = list(container_annotator.annotate(sample))
    assert len(matches) == 6
Exemple #2
0
def test_word_boundary():
    """Expect two annotations when ``do_word_boundary_end`` is disabled."""
    sample = '1XHello and 2x World Nothing3x'
    amount_pattern = r'/(?P<amount>\d+)x/'
    boundary_settings = {'do_word_boundary_end': False}

    multiply_annotator = Annotator(
        'Multiply',
        patterns=amount_pattern,
        # The parameter name mirrors the named group in the pattern.
        transform=lambda amount: int(amount),
        settings=boundary_settings,
    )

    matches = list(multiply_annotator.annotate(sample))
    assert len(matches) == 2
Exemple #3
0
def test_regex_case_sensitivity():
    """Expect a lowercase regex pattern to annotate words of any casing."""
    sample = 'HELLO Bello cello'
    thing_annotator = Annotator('Thing', patterns='/[hbc]ello/')

    expected = [
        Annotation('HELLO', (0, 5), type='Thing'),
        Annotation('Bello', (6, 11), type='Thing'),
        Annotation('cello', (12, 17), type='Thing'),
    ]
    found = list(thing_annotator.annotate(sample))

    assert found == expected
Exemple #4
0
def test_representation():
    """Expect dict patterns to attach their mapped value as ``data``."""
    sample = 'hello world'
    representations = {
        'hello': 'Hello',
        'world': 'World',
    }
    hello_world = Annotator('HelloWorld', patterns=representations)

    expected = [
        Annotation('hello', (0, 5), type='HelloWorld', data='Hello'),
        Annotation('world', (6, 11), type='HelloWorld', data='World'),
    ]
    found = list(hello_world.annotate(sample))

    assert found == expected
Exemple #5
0
def test_tokens():
    """Expect a pattern using the ``sep`` token to annotate the whole text.

    ``sep`` is itself defined in terms of the ``comma`` and ``dot`` tokens.
    """
    sample = 'hello, world!'
    token_table = {
        'comma': ',',
        'dot': '.',
        'sep': '/({comma}|{dot})? /',
    }
    thing_annotator = Annotator(
        'Thing',
        tokens=token_table,
        patterns='hello{sep}world!',
    )

    expected = [Annotation('hello, world!', (0, 13), type='Thing')]
    found = list(thing_annotator.annotate(sample))

    assert found == expected
Exemple #6
0
def test_fuzziness():
    """Expect four annotations from misspelled text under fuzzy settings."""
    sample = '''
    h3llo w0rld h311o w0r1d
    H3LLO W0RLD
    '''
    fuzzy_settings = {
        'fuzzy_error_rate': 0.1,
        'fuzzy_min_errors_allowed': 1,
    }

    hello_world = Annotator(
        'HelloWorld',
        patterns=('hello', 'world'),
        settings=fuzzy_settings,
    )

    matches = list(hello_world.annotate(sample))
    assert len(matches) == 4
Exemple #7
0
def test_text_case_sensitivity():
    """Compare plain-text matching with and without ``case_sensitive``.

    Without the setting, every casing of the word is expected to be
    annotated; with ``case_sensitive`` set to True, only the exact
    casing of the pattern is expected.
    """
    sample = 'hello Hello HELLO'

    # Default settings: all three casings are expected.
    relaxed = Annotator('Hello', patterns=('hello', ))
    relaxed_expected = [
        Annotation('hello', (0, 5), type='Hello'),
        Annotation('Hello', (6, 11), type='Hello'),
        Annotation('HELLO', (12, 17), type='Hello'),
    ]
    relaxed_found = list(relaxed.annotate(sample))
    assert relaxed_found == relaxed_expected

    # Case-sensitive: only the capitalised occurrence is expected.
    strict = Annotator(
        'AlternativeHello',
        patterns=('Hello', ),
        settings={'case_sensitive': True},
    )
    strict_expected = [
        Annotation('Hello', (6, 11), type='AlternativeHello'),
    ]
    strict_found = list(strict.annotate(sample))
    assert strict_found == strict_expected
Exemple #8
0
def test_agent():
    """Annotate multi-line text through an ``Agent`` and inspect its lines.

    Expects four lines in the result, with the first annotation of the
    first cell on lines 1 and 2 being ``hello`` and ``world``.
    """
    text = '''
        hello
        world
    '''

    agent = Agent()
    hello_world = Annotator('HelloWorld', patterns=('hello', 'world'))
    agent.add_annotator(hello_world)

    annotated_text = agent.annotate(text)
    assert len(annotated_text.lines) == 4

    # Expected spans start at 0 even though the source lines are indented.
    hello_hit = annotated_text.lines[1].cells[0][0]
    assert hello_hit == Annotation('hello', (0, 5), type='HelloWorld')

    world_hit = annotated_text.lines[2].cells[0][0]
    assert world_hit == Annotation('world', (0, 5), type='HelloWorld')
Exemple #9
0
def test_increase_density():
    """Expect ``IncreaseDensityStrategy`` to join adjacent greeting lines.

    The two runs of greetings, separated by an empty line in the input,
    are expected to come back as one line of text each.
    """
    agent = Agent()
    density_strategy = IncreaseDensityStrategy(10)
    agent.add_strategy(density_strategy)
    greeting_annotator = Annotator(
        'Greeting',
        patterns=('hello', 'hi', 'greetings'),
    )
    agent.add_annotator(greeting_annotator)

    text = '''
    hello
    hi
    hello

    hello
    greetings
    greetings
    '''

    result_lines = agent.annotate(text).lines

    assert result_lines[0].text == ''
    assert result_lines[1].text == 'hello hi hello'
    assert result_lines[2].text == 'hello greetings greetings'