Python NonCodeChar 예제들, codeprep.tokens.word.NonCodeChar Python 예제들

예제 #1

0

파일 보기

def test_multi_line_comment():
    """Verify the tokens ``convert_text`` yields for a Java multi-line comment.

    The input holds a ``/* ... */`` comment spanning two lines, immediately
    followed by a ``/`` and then an identifier on the next line.
    """
    source = '''
/*multi-line MyComment_
*//
_operations
'''

    # Tokens expected inside the comment, delimiters included.
    comment_tokens = [
        NonCodeChar('/'),
        NonCodeChar('*'),
        SplitContainer.from_single_token('multi'),
        NonCodeChar('-'),
        SplitContainer.from_single_token('line'),
        SplitContainer([Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
        NonCodeChar('*'),
        NonCodeChar('/'),
    ]
    # Tokens expected after the closing delimiter.
    trailing_tokens = [
        Operator('/'),
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
    ]
    expected_result = [MultilineComment(comment_tokens)] + trailing_tokens

    parsed = list(convert_text(source, 'java'))

    assert expected_result == parsed

예제 #2

0

파일 보기

def test_one_line_comment():
    """Verify the tokens ``convert_text`` yields for a Java ``//`` comment.

    The whole input is one comment line; the apostrophe in the middle is
    expected as a ``NonCodeChar`` between two single-word containers.
    """
    source = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    single = SplitContainer.from_single_token
    words_after_apostrophe = (
        't', 'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    )

    comment_tokens = [NonCodeChar('/'), NonCodeChar('/')]
    comment_tokens += [single(word) for word in ('this', 'code', 'won')]
    comment_tokens.append(NonCodeChar("'"))
    comment_tokens += [single(word) for word in words_after_apostrophe]
    comment_tokens.append(NewLine())

    expected_result = [OneLineComment(comment_tokens)]

    parsed = list(convert_text(source, 'java'))

    assert expected_result == parsed

예제 #3

0

파일 보기

def to_parsed_token(token: str) -> ParsedToken:
    """Convert a raw string fragment into the matching parsed-token object.

    The checks run in this order, and the first one that applies wins:
    a newline character, a tab character, a number (as decided by
    ``is_number``), a run of word characters (handed to
    ``split_identifier``). Anything else is wrapped in ``NonCodeChar``.
    """
    if token == '\n':
        return NewLine()
    if token == '\t':
        return Tab()
    if is_number(token):
        return Number(token)
    if regex.fullmatch("\\w+", token):
        return split_identifier(token)
    # Fallback: whatever did not match any of the categories above.
    return NonCodeChar(token)

예제 #4

0

파일 보기

def test_string_with_spaces():
    """Verify that runs of spaces inside a string literal become ``SpaceInString`` tokens."""
    source = '''"hi   dear     world    !"'''

    single = SplitContainer.from_single_token
    literal_contents = [
        NonCodeChar('"'),
        single('hi'), SpaceInString(3),
        single('dear'), SpaceInString(5),
        single('world'), SpaceInString(4),
        NonCodeChar('!'),
        NonCodeChar('"'),
    ]
    expected = [StringLiteral(literal_contents, 26), NewLine()]

    parsed = list(convert_text(source, 'java'))

    assert expected == parsed

예제 #5

0

파일 보기

def split_into_words(token: str) -> List[ParsedToken]:
    """
    Split ``token`` into parsed tokens, mapping each group of four spaces to a ``Tab``.

    The text is scanned for runs of word characters, single non-space
    characters, and groups of four consecutive spaces. A four-space group
    becomes a ``Tab``; every other match goes through ``to_parsed_token``.
    Spaces that are not part of a four-space group match nothing and are
    therefore left out of the result.

    >>> split_into_words("    var = 9.4\\t\\n")
    [<Tab>, SplitContainer[Word(('var', none))], NonCodeChar(=), <Number>(9), \
NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
    """
    indent_unit = " " * 4
    pattern = f"(\\w+|[^ ]|{indent_unit})"
    return [
        Tab() if found[0] == indent_unit else to_parsed_token(found[0])
        for found in regex.finditer(pattern, token)
    ]

예제 #6

0

파일 보기

def split_string(token: str) -> List[ParsedToken]:
    """
    Split ``token`` into parsed tokens, keeping every run of spaces as a ``SpaceInString``.

    The text is scanned for runs of word characters, single non-space
    characters, and runs of one or more spaces. A run of spaces becomes a
    ``SpaceInString`` carrying the run length; every other match goes
    through ``to_parsed_token``.

    >>> split_string("    var = 9.4\\t\\n")
    [<SpaceInString> (n_chars=4), SplitContainer[Word(('var', none))], \
<SpaceInString> (n_chars=1), NonCodeChar(=), <SpaceInString> (n_chars=1), <Number>(9), \
NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
    """
    space_run = "( )+"
    pattern = f"(\\w+|[^ ]|{space_run})"
    return [
        SpaceInString(n_chars=len(found[0]))
        if regex.fullmatch(space_run, found[0])
        else to_parsed_token(found[0])
        for found in regex.finditer(pattern, token)
    ]

예제 #7

0

파일 보기

def test_string_literal_double():
    """Verify the tokens ``convert_text`` yields for a double-quoted string in ``'py'`` mode.

    The expected output has three separate ``StringLiteral`` tokens:
    the opening quote, the string contents, and the closing quote.
    """
    source = '''a = "some_text".split()'''

    opening_quote = StringLiteral([NonCodeChar('"')], 1)
    literal_body = StringLiteral(
        [SplitContainer([Word.from_("some"), Underscore(), Word.from_("text")])],
        9,
    )
    closing_quote = StringLiteral([NonCodeChar('"')], 1)

    expected_result = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        opening_quote,
        literal_body,
        closing_quote,
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(),
        ClosingBracket(),
        NewLine(),
    ]

    parsed = list(convert_text(source, 'py'))

    assert expected_result == parsed

예제 #8

0

파일 보기

def test_spaces_in_strings():
    """Verify parsing of a Java statement with two string literals.

    The first literal contains a run of four spaces; the second contains
    an escaped double quote.
    """
    source = '''BigAWESOMEString[] a2y = "a    bc".doSplit("\\"");'''

    # Declaration part: ``BigAWESOMEString[] a2y =``
    declaration = [
        SplitContainer([Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')]),
        Operator('['),
        Operator(']'),
        SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]),
        Operator('='),
    ]
    # First literal: ``"a    bc"``
    spaced_literal = StringLiteral(
        [
            NonCodeChar('"'),
            SplitContainer.from_single_token('a'),
            SpaceInString(n_chars=4),
            SplitContainer.from_single_token('bc'),
            NonCodeChar('"'),
        ],
        9,
    )
    # Second literal: a string holding one escaped double quote.
    escaped_quote_literal = StringLiteral(
        [NonCodeChar('"'), NonCodeChar('\\'), NonCodeChar('"'), NonCodeChar('"')],
        4,
    )
    # Call part: ``.doSplit(<literal>);``
    call = [
        Operator('.'),
        SplitContainer([Word.from_('do'), Word.from_('Split')]),
        OpeningBracket(),
        escaped_quote_literal,
        ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]
    expected_result = declaration + [spaced_literal] + call

    parsed = list(convert_text(source, 'java'))

    assert expected_result == parsed

예제 #9

0

파일 보기

파일: test_to_repr.py 프로젝트: mir-am/codeprep

def test_to_repr_with_enonlycontents1():
    """Verify ``to_repr`` output and metadata for a token stream full of ``NonEng`` tokens.

    The input mixes a number, an operator, a string literal, a multi-line
    comment and a one-line comment; every word in it is wrapped in
    ``NonEng``. Each such word is expected as the ``non_eng`` placeholder
    in the output.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    # Words of the string literal; consecutive words are separated by a
    # single SpaceInString() token.
    literal_words = ["ich", "weiss", "nicht", "was", "soll", "es",
                     "bedeuten", "dass", "ich", "so", "traurig", "bin"]
    literal_contents = [NonCodeChar('"')]
    for position, word in enumerate(literal_words):
        if position > 0:
            literal_contents.append(SpaceInString())
        literal_contents.append(NonEng(SplitContainer([Word.from_(word)])))
    literal_contents.append(NonCodeChar('"'))

    multiline_comment_body = MultilineComment([
        NonEng(SplitContainer([Word.from_('ц')])),
        NonEng(SplitContainer([Word.from_("blanco"), Underscore(), Word.from_("english")])),
    ])
    one_line_comment = OneLineComment([
        NonCodeChar('/'),
        NonCodeChar('/'),
        NonEng(SplitContainer([Word.from_("DIESELBE"), Word.from_("8")])),
    ])

    tokens = [
        Number("1.1"),
        Operator("*"),
        NonEng(SplitContainer([Word.from_("dinero")])),
        StringLiteral(literal_contents, 62),
        NewLine(),
        MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
        multiline_comment_body,
        MultilineComment([NonCodeChar('*'), NonCodeChar('/')]),
        NewLine(),
        Tab(),
        one_line_comment,
    ]

    actual, actual_metadata = to_repr(prep_config, tokens)

    non_eng = pl['non_eng']
    expected = (
        # number 1.1
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        # operator and the first non-English word
        + ["*", non_eng]
        # string literal: one placeholder per word between the quotes
        + ['"'] + [non_eng] * len(literal_words) + ['"']
        # multi-line comment
        + ['/', '*', non_eng, non_eng, '*', '/']
        # one-line comment
        + ['/', '/', non_eng, pl['olc_end']]
    )

    expected_token_types = (
        [Number, Operator, NonEng]
        + [StringLiteral] * 14
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/", "*"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=expected_token_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata

예제 #10

0

파일 보기

파일: test_to_repr.py 프로젝트: mir-am/codeprep

from codeprep.tokens.numeric import Number
from codeprep.preprocess.placeholders import placeholders
from codeprep.tokens.whitespace import Tab, NewLine, SpaceInString
from codeprep.tokens.word import Word, Underscore, NonCodeChar, Operator
from codeprep.prepconfig import PrepParam, PrepConfig
from codeprep.pipeline.to_repr import to_repr

# Short alias for the imported ``placeholders`` mapping.
pl = placeholders
# Value stored under the 'compound_word_end' key of ``placeholders``.
cwe = placeholders['compound_word_end']

tokens = [
    Number('1.1'),
    Operator("*"),
    NonEng(SplitContainer([Word.from_("übersetzen")])),
    StringLiteral([
        NonCodeChar('"'),
        NonEng(
            SplitContainer([
                Word.from_("A"),
                Word.from_("Wirklicä")
            ])
        ),
        SpaceInString(1),
        NonCodeChar('"')
    ], 11),
    NewLine(),
    MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
    MultilineComment([
        NonEng(
            SplitContainer([Word.from_('ц')]),
        ),

예제 #11

0

파일 보기

def test_special_characters():
    """Verify the tokens ``convert_text`` yields for operators, numbers and other special characters.

    The expected tokens below are grouped so that each group corresponds
    to one line of the input text.
    """
    source = '''
abc1
~-0xFFFFFL=
.0E+5
|=
?
==
!=
**
++
--
+=
-=
/=
*=
%=
$
<=
>=
@
    ^=
    &=
    #
                                                                                 >>
<<
&&
||
+*!/><\t\n
{}[],.-:();&|\\'~%^
'''

    expected_result = [
        SplitContainer([Word.from_('abc'), Word.from_('1')]), NewLine(),
        Operator('~'), Operator('-'), Number("0xFFFFFL"), Operator('='), NewLine(),
        Number(".0E+5"), NewLine(),
        Operator('|'), Operator('='), NewLine(),
        Operator('?'), NewLine(),
        Operator('='), Operator('='), NewLine(),
        Operator('!'), Operator('='), NewLine(),
        Operator('*'), Operator('*'), NewLine(),
        Operator('+'), Operator('+'), NewLine(),
        Operator('-'), Operator('-'), NewLine(),
        Operator('+'), Operator('='), NewLine(),
        Operator('-'), Operator('='), NewLine(),
        Operator('/'), Operator('='), NewLine(),
        Operator('*'), Operator('='), NewLine(),
        Operator('%'), Operator('='), NewLine(),
        NonCodeChar('$'), NewLine(),
        Operator('<'), Operator('='), NewLine(),
        Operator('>'), Operator('='), NewLine(),
        NonCodeChar('@'), NewLine(),
        # Lines indented by four spaces start with one Tab.
        Tab(), Operator('^'), Operator('='), NewLine(),
        Tab(), Operator('&'), Operator('='), NewLine(),
        Tab(), NonCodeChar('#'), NewLine(),
        # The deeply indented line is expected to start with twenty Tabs.
        *[Tab() for _ in range(20)], Operator('>'), Operator('>'), NewLine(),
        Operator('<'), Operator('<'), NewLine(),
        Operator('&'), Operator('&'), NewLine(),
        Operator('|'), Operator('|'), NewLine(),
        # This input line ends with an escaped tab and an escaped newline
        # in addition to the literal line break.
        Operator('+'), Operator('*'), Operator('!'), Operator('/'),
        Operator('>'), Operator('<'), Tab(), NewLine(), NewLine(),
        OpeningCurlyBracket(), ClosingCurlyBracket(),
        Operator('['), Operator(']'),
        Operator(','), Operator('.'), Operator('-'), Operator(':'),
        OpeningBracket(), ClosingBracket(), Semicolon(),
        Operator('&'), Operator('|'),
        NonCodeChar('\\'), NonCodeChar("'"),
        Operator('~'), Operator('%'), Operator('^'), NewLine(),
    ]

    parsed = list(convert_text(source, 'java'))

    assert expected_result == parsed