Beispiel #1
0
def test_both_enonly_and_nosplit():
    """EN_ONLY='U' combined with SPLIT='0' must be rejected with ValueError.

    Both the config construction and the ``to_repr`` call stay inside the
    ``raises`` block, so the error may originate from either of them.
    """
    conflicting_params = {
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    }
    with pytest.raises(ValueError):
        config = PrepConfig(conflicting_params)
        to_repr(config, [], BpeData())
Beispiel #2
0
def test_to_repr_0():
    """Check ``to_repr`` output for SPLIT='0', EN_ONLY='u', CASE='u'.

    The expected result keeps identifiers and the number unsplit and in their
    original case; every output token is expected to be a word of its own
    (word boundaries 0..16).
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u',
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        ['1.1', "*", 'übersetzen']
        + ['"', 'AWirklicä', '"']
        + ['/', '*', 'ц', 'blanco_english', '*', '/']
        + ['/', '/', "DIESELBE8", pl['olc_end']]
    )
    expected_types = (
        [Number, Operator, SplitContainer]
        + [StringLiteral] * 3
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'"', "*", "/"},
        word_boundaries=list(range(17)),
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #3
0
def test_to_repr_no_nosep():
    """Check ``to_repr`` output for SPLIT='2', EN_ONLY='U', CASE='l'.

    The expected result has the number split into characters wrapped in
    word_start/word_end placeholders, and every non-English token replaced
    by the ``non_eng`` placeholder.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", pl['non_eng']]
        + ['"', pl['non_eng'], '"']
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    # The split number occupies output positions 0..4; every later token is one word.
    expected_boundaries = [0] + list(range(5, 21))
    expected_types = (
        [Number, Operator, NonEng]
        + [StringLiteral] * 3
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=expected_boundaries,
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #4
0
def test_to_repr_no_no_sep_with_bpe_no_merges():
    """Check ``to_repr`` output for SPLIT='4' with an empty merge list and cache.

    With no merges available the expected output is one entry per character,
    with ``cwe`` marking the end of each word. Only the token list is
    asserted; the returned metadata is ignored.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u',
    })
    empty_bpe_data = BpeData(merges_cache={}, merges=MergeList())

    actual, _ = to_repr(config, tokens, empty_bpe_data)

    expected = (
        ['1', '.', '1', cwe]
        + ["*" + cwe]
        + ['÷'] + list('bersetzen') + ['</t>']
        + ['"'] + list('AWirklic') + ['\xf7', '\xa0', '"', cwe]
        + ['/' + cwe, '*' + cwe, '\xf7', cwe]
        + list('blanco_english') + [cwe, '*' + cwe, '/' + cwe]
        + ['/' + cwe, '/' + cwe] + list('DIESELBE8') + [cwe]
        + [pl['olc_end'] + cwe]
    )

    assert expected == actual
Beispiel #5
0
def test_to_repr_no_str_no_com():
    """Check ``to_repr`` output for COM='0' and STR='0'.

    The expected result contains a single ``string_literal`` placeholder in
    place of the string and a ``comment`` placeholder for each of the four
    comment tokens.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: '0',
        PrepParam.STR: '0',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", pl['non_eng'], pl["string_literal"]]
        + [pl["comment"]] * 4
    )

    expected_types = (
        [Number, Operator, NonEng, StringLiteral]
        + [MultilineComment] * 3
        + [OneLineComment]
    )
    expected_metadata = PreprocessingMetadata(
        {'*'},
        word_boundaries=[0] + list(range(5, 13)),
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #6
0
def test_to_repr_with_non_eng():
    """Check ``to_repr`` output for SPLIT='2', EN_ONLY='u', CASE='l'.

    Non-English words are expected to be kept (not replaced by a placeholder),
    while compound identifiers are expected to be split, lower-cased and
    annotated with ``capital``/``capitals`` placeholders.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", 'übersetzen']
        + ['"', pl['word_start'], pl['capitals'], 'a', pl['capital'], 'wirklicä', pl['word_end'], '"']
        + ['/', '*', 'ц', pl['word_start'], 'blanco', '_', 'english', pl['word_end'], '*', '/']
        + ['/', '/', pl['word_start'], pl['capitals'], 'dieselbe', "8", pl['word_end'], pl['olc_end']]
    )

    expected_boundaries = [
        0, 5, 6, 7, 8, 14, 15, 16, 17, 18,
        23, 24, 25, 26, 27, 32, 33,
    ]
    expected_types = (
        [Number, Operator, SplitContainer]
        + [StringLiteral] * 3
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=expected_boundaries,
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #7
0
def test_to_repr_1_nosep():
    """Check ``to_repr`` output for SPLIT='1', EN_ONLY='U', CASE='l'.

    The number is expected to stay unsplit, every non-English token is
    expected to become the ``non_eng`` placeholder, and every output token is
    expected to be a word of its own (word boundaries 0..16).
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        ['1.1', "*", pl['non_eng']]
        + ['"', pl['non_eng'], '"']
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    expected_types = (
        [Number, Operator, NonEng]
        + [StringLiteral] * 3
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    # The set literal previously listed "*" twice; a set keeps one copy anyway.
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=list(range(17)),
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #8
0
def test_to_repr_0_max_str_length_7():
    """Check ``to_repr`` output for STR='7' with SPLIT='0'.

    The expected result keeps only the two quote characters of the string
    literal (its content is absent), and they are expected to form a single
    word (boundaries jump from 3 to 5).
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '7',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u',
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        ['1.1', "*", 'übersetzen']
        + ['"', '"']
        + ['/', '*', 'ц', 'blanco_english', '*', '/']
        + ['/', '/', "DIESELBE8", pl['olc_end']]
    )
    expected_types = (
        [Number, Operator, SplitContainer, StringLiteral]
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'"', "*", "/"},
        word_boundaries=[0, 1, 2, 3] + list(range(5, 16)),
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #9
0
def test_bpe_string_literal_performance():
    """Running ``to_repr`` on a 10000-character string literal must take under a second.

    A single ('a', 'a') merge is registered, and the string literal consists
    of one chunk of repeated 'a' characters.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    })

    length = 10000
    literal_tokens = [StringLiteral(['a' * length], length)]

    merges = MergeList()
    merges.append(Merge(('a', 'a'), 10))
    bpe_cache = {'Whi@@le@': ['Whi@@le@']}

    started = time.perf_counter()
    to_repr(config, literal_tokens, BpeData(merges=merges, merges_cache=bpe_cache))
    elapsed = time.perf_counter() - started

    assert elapsed < 1
Beispiel #10
0
def preprocess(text: str, config: PrepConfig, bpe_codes_id: Optional[str] = None, extension: Optional[str] = None,
               return_metadata: bool = False, force_reinit_bpe_data: bool = True, append_eof: bool = False) \
        -> Union[List[str], Tuple[List[str], PreprocessingMetadata]]:
    """Parse ``text`` and convert it to its preprocessed token representation.

    The text is parsed with ``convert_text``, a trailing newline is removed
    via ``remove_trailing_newline`` and the result is passed to ``to_repr``.

    :param text: source text to preprocess.
    :param config: preprocessing configuration passed to ``to_repr``.
    :param bpe_codes_id: id of the BPE codes; must be truthy when
        ``config.is_bpe()`` holds (checked with ``assert``).
    :param extension: forwarded to ``convert_text`` together with the text.
    :param return_metadata: if true, the metadata produced by ``to_repr`` is
        returned alongside the tokens.
    :param force_reinit_bpe_data: forwarded to ``init_bpe_data``.
    :param append_eof: if true, a ``SpecialToken`` built from
        ``placeholders['ect']`` is appended to the parsed tokens.
    :return: the preprocessed tokens, or a ``(tokens, metadata)`` tuple when
        ``return_metadata`` is set.
    """
    parsed = remove_trailing_newline(list(convert_text(text, extension)))

    if append_eof:
        parsed.append(SpecialToken(placeholders['ect']))

    if config.is_bpe():
        assert bpe_codes_id
        # Predefined ids need no custom config; any other id is resolved to one.
        if is_predefined_id(bpe_codes_id):
            custom_bpe_config = None
        else:
            custom_bpe_config = CustomBpeConfig.from_id(bpe_codes_id)
        # NOTE(review): to_repr below receives no BPE data explicitly, so this
        # call presumably prepares shared state it relies on -- confirm.
        init_bpe_data(config, custom_bpe_config, force_reinit_bpe_data)

    prep_tokens, metadata = to_repr(config, parsed)
    if not return_metadata:
        return prep_tokens
    return prep_tokens, metadata
Beispiel #11
0
def test_1():
    """A token present in the BPE merges cache is expected to come out as one piece.

    The cache maps 'Whi@@le@' to itself; the expected output is the original
    token "Whi@le" followed by the ``compound_word_end`` placeholder.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    })

    input_tokens = [SplitContainer.from_single_token("Whi@le")]
    bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(config, input_tokens, bpe_data)

    expected = ["Whi@le" + placeholders['compound_word_end']]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 1],
        token_types=[SplitContainer],
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #12
0
def test_merges_no_cache():
    """With an empty cache and one ('W', 'h') merge only "Wh" is expected to be merged.

    All remaining characters of the token are expected as separate entries,
    followed by the ``compound_word_end`` placeholder, all within one word.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    })

    input_tokens = [SplitContainer.from_single_token("Whi@l@@e@")]

    # NOTE(review): relies on MergeList.append returning the merge list -- confirm.
    single_merge = MergeList().append(Merge(('W', 'h'), 10))
    bpe_data = BpeData(merges=single_merge, merges_cache={})

    actual, actual_metadata = to_repr(config, input_tokens, bpe_data)

    expected = ["Wh", "i", '@', "l", '@', '@', "e", '@', pl["compound_word_end"]]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 9],
        token_types=[SplitContainer],
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
Beispiel #13
0
def test_to_repr_with_enonlycontents1():
    """Check ``to_repr`` on tokens whose words are all wrapped in ``NonEng``.

    With EN_ONLY='U' every ``NonEng`` token (top level, inside the string
    literal and inside the comments) is expected to be replaced by the
    ``non_eng`` placeholder, while the number is expected to be split.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    def non_eng_word(text):
        # Single-word container marked as non-English.
        return NonEng(SplitContainer([Word.from_(text)]))

    # String literal contents: opening quote, words separated by spaces, closing quote.
    literal_words = ["ich", "weiss", "nicht", "was", "soll", "es",
                     "bedeuten", "dass", "ich", "so", "traurig", "bin"]
    literal_contents = [NonCodeChar('"')]
    for position, word in enumerate(literal_words):
        if position:
            literal_contents.append(SpaceInString())
        literal_contents.append(non_eng_word(word))
    literal_contents.append(NonCodeChar('"'))

    input_tokens = [
        Number("1.1"),
        Operator("*"),
        non_eng_word("dinero"),
        StringLiteral(literal_contents, 62),
        NewLine(),
        MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
        MultilineComment([
            non_eng_word('ц'),
            NonEng(SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ])),
        ]),
        MultilineComment([NonCodeChar('*'), NonCodeChar('/')]),
        NewLine(), Tab(),
        OneLineComment([
            NonCodeChar('/'), NonCodeChar('/'),
            NonEng(SplitContainer([
                Word.from_("DIESELBE"),
                Word.from_("8"),
            ])),
        ]),
    ]

    actual, actual_metadata = to_repr(config, input_tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", pl['non_eng']]
        + ['"'] + [pl["non_eng"]] * 12 + ['"']
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    expected_types = (
        [Number, Operator, NonEng]
        + [StringLiteral] * 14
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    # The set literal previously listed "*" twice; a set keeps one copy anyway.
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=expected_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata