Ejemplo n.º 1
0
def test_encode_decode_unknown_huffman():
    """Round-trip a pattern containing a token missing from the vocabulary.

    The encoder is built with the 'form' entries 'fox', 'quick' and 'brown'
    plus an ``unknown`` placeholder. 'The' is not among those entries, so
    the decoded pattern is expected to hold the placeholder at its position.
    """
    unknown_token = "__unknown__"

    test = HuffmanEncoder({'form': {
        'fox': 1,
        'quick': 2,
        'brown': 3
    }},
                          SNGram,
                          unknown=unknown_token)

    pattern_list = [
        PatternElement('fox', 'form'), SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'), SNGram.COMMA,
        PatternElement('quick', 'form'), SNGram.COMMA,
        PatternElement('brown', 'form'), SNGram.RIGHT_BRACKET
    ]
    pattern = SNGram.from_element_list(pattern_list)

    # Work on a copy: a plain assignment would alias ``pattern_list`` and
    # the substitution below would silently rewrite the input elements too.
    expected_pattern_list = list(pattern_list)
    expected_pattern_list[2] = PatternElement(unknown_token, 'form')
    expected_pattern = SNGram.from_element_list(expected_pattern_list)

    assert test.decode(test.encode(pattern)) == expected_pattern
Ejemplo n.º 2
0
def test_sngram_from_element_list(sngram, expected):
    """Compare the first 'form' pattern with one rebuilt from ``expected['repr']``.

    The rebuilt pattern uses the bracket and comma markers exposed by
    ``sngram`` itself.
    """
    first_form_pattern = sngram.get_pattern_list(['form'])[0]
    rebuilt = SNGram.from_element_list(
        expected['repr'],
        left_bracket=sngram.LEFT_BRACKET,
        right_bracket=sngram.RIGHT_BRACKET,
        comma=sngram.COMMA,
    )

    assert first_form_pattern == rebuilt
Ejemplo n.º 3
0
def test_encode_decode(encoder):
    """Encoding and then decoding a 'form' pattern must give the pattern back."""
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    original = SNGram.from_element_list(elements)

    encoded = encoder.encode(original)
    assert encoder.decode(encoded) == original
Ejemplo n.º 4
0
def conversion_function(tree, tags):
    """Convert ``tree`` into a childless ``SNGram.Tree`` if its UPOS is in ``tags``.

    The new node stores the token's 'deprel' under 'np_function' together
    with the token's 'id'. Returns ``None`` when ``tree.token['upos']`` is
    not contained in ``tags``.
    """
    token = tree.token
    if token['upos'] not in tags:
        return None

    payload = {'np_function': token['deprel'], 'id': token['id']}
    return SNGram.Tree(payload, [])
Ejemplo n.º 5
0
def conversion_function(tree):
    """Convert ``tree`` into a childless ``SNGram.Tree`` if its token is a NOUN.

    The new node stores the token's 'deprel' under 'function' together with
    the token's 'id'. Returns ``None`` when ``tree.token['upostag']`` is
    anything other than "NOUN".
    """
    token = tree.token
    if token['upostag'] != "NOUN":
        return None

    payload = {'function': token['deprel'], 'id': token['id']}
    return SNGram.Tree(payload, [])
Ejemplo n.º 6
0
def test_encode_decode_different_levels(encoder_dict):
    """Round-trip a pattern that mixes a 'pos' element with 'form' elements.

    A ``BitEncoder`` is built from ``encoder_dict``; decoding the encoded
    pattern must reproduce the original pattern.
    """
    bit_encoder = BitEncoder(encoder_dict, SNGram)

    elements = [
        PatternElement('Noun', 'pos'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    original = SNGram.from_element_list(elements)

    encoded = bit_encoder.encode(original)
    assert bit_encoder.decode(encoded) == original
Ejemplo n.º 7
0
def test_encode_decode_with_full_token(encoder):
    """Round-trip a pattern whose first and last words are plain token dicts.

    The element list mixes ``{'form': ...}`` dictionaries with
    ``PatternElement`` instances; decoding the encoded pattern must
    reproduce the original pattern.
    """
    elements = [
        {'form': 'fox'},
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        {'form': 'brown'},
        SNGram.RIGHT_BRACKET,
    ]
    original = SNGram.from_element_list(elements)

    encoded = encoder.encode(original)
    assert encoder.decode(encoded) == original
Ejemplo n.º 8
0
def test_encode_unknown_not_set_huffman():
    """Encoding must raise ``EncodeError`` when no ``unknown`` token is given.

    The ``HuffmanEncoder`` only has 'form' entries for 'fox', 'quick' and
    'brown', while the pattern also contains 'The'.
    """
    vocabulary = {'form': {'fox': 1, 'quick': 2, 'brown': 3}}
    huffman = HuffmanEncoder(vocabulary, SNGram)

    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    pattern = SNGram.from_element_list(elements)

    with pytest.raises(EncodeError):
        huffman.encode(pattern)
Ejemplo n.º 9
0
def test_encode_unknown_not_set_bitencoder():
    """Encoding must raise ``EncodeError`` when no ``unknown`` token is given.

    The ``BitEncoder`` only has the 'form' values 'fox', 'quick' and
    'brown', while the pattern also contains 'The'.
    """
    vocabulary = {'form': {'fox', 'quick', 'brown'}}
    bit_encoder = BitEncoder(vocabulary, SNGram)

    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    pattern = SNGram.from_element_list(elements)

    with pytest.raises(EncodeError):
        bit_encoder.encode(pattern)
Ejemplo n.º 10
0
def test_append(encoder):
    """Appending item encodings one by one must decode to the full pattern.

    Each element is encoded with ``encode_item`` and appended to a buffer
    that starts as empty bytes; decoding the buffer must equal the pattern
    built directly from the same element list.
    """
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    expected = SNGram.from_element_list(elements)

    buffer = b''
    for item in elements:
        buffer = encoder.append(buffer, encoder.encode_item(item))

    decoded = encoder.decode(buffer)
    assert decoded == expected
Ejemplo n.º 11
0
def test_encode_decode_item_with_full_token(encoder):
    """A single token dict encoded with ``encode_item`` decodes to a one-element pattern."""
    token = {'form': 'fox'}
    expected = SNGram.from_element_list([token])

    encoded_item = encoder.encode_item(token)
    assert encoder.decode(encoded_item) == expected
Ejemplo n.º 12
0
def test_encode_item(encoder):
    """A single ``PatternElement`` encoded with ``encode_item`` decodes to a one-element pattern."""
    item = PatternElement('fox', 'form')
    expected = SNGram.from_element_list([item])

    encoded_item = encoder.encode_item(item)
    assert encoder.decode(encoded_item) == expected
Ejemplo n.º 13
0
def case_jumps_phrases():
    """Build the 'jumps' phrase case: a ``TokenSNGram`` plus its expected values.

    The root node 'jumps' has three children: 'nsubj', 'nmod' and '.'.
    The 'nsubj' and 'nmod' nodes have empty child lists and receive the
    'fox' and 'dog' subtrees respectively as a third positional argument
    to ``SNGram.Tree``.

    Returns:
        A tuple ``(TokenSNGram(tree), expected)`` where ``expected`` is a
        dict with the keys 'length', 'str', 'repr', 'repr_full' and
        'profiles'.
    """

    def leaf(form, token_id):
        # Node with no children, carrying only a form and an id.
        return SNGram.Tree({'form': form, 'id': token_id}, [])

    def elem(form, token_id):
        # Pattern element on the 'form' level with the given id.
        return PatternElement(form, 'form', token_id)

    fox = SNGram.Tree(
        {'form': 'fox', 'id': 4},
        [leaf('The', 1), leaf('quick', 2), leaf('brown', 3)])
    subject = SNGram.Tree({'form': 'nsubj', 'id': 4}, [], fox)

    dog = SNGram.Tree(
        {'form': 'dog', 'id': 9},
        [leaf('over', 6), leaf('the', 7), leaf('lazy', 8)])
    modifier = SNGram.Tree({'form': 'nmod', 'id': 9}, [], dog)

    tree = SNGram.Tree(
        {'form': 'jumps', 'id': 5},
        [subject, modifier, leaf('.', 10)])

    sngram = TokenSNGram(tree)

    # Short aliases for the structural markers used in the expected lists.
    left = SNGram.LEFT_BRACKET
    right = SNGram.RIGHT_BRACKET
    comma = SNGram.COMMA

    expected = {
        "length": 4,
        "str": "jumps [nsubj, nmod, .]",
        "repr": [
            elem('jumps', 5), left,
            elem('nsubj', 4), comma,
            elem('nmod', 9), comma,
            elem('.', 10), right,
        ],
        "repr_full": [
            elem('jumps', 5), left,
            elem('fox', 4), left,
            elem('The', 1), comma,
            elem('quick', 2), comma,
            elem('brown', 3), right, comma,
            elem('dog', 9), left,
            elem('over', 6), comma,
            elem('the', 7), comma,
            elem('lazy', 8), right, comma,
            elem('.', 10), right,
        ],
        "profiles": {"form [ form , form , form ]"},
    }

    return sngram, expected