Ejemplo n.º 1
0
def test_one_line_comment():
    """A ``//`` comment must be parsed into a single OneLineComment token.

    The comment body is expected to hold the two slashes, every word as its
    own SplitContainer, the apostrophe as a NonCodeChar and a closing NewLine.
    """
    source = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    words_before_apostrophe = ('this', 'code', 'won')
    words_after_apostrophe = (
        't', 'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    )

    comment_body = [NonCodeChar('/'), NonCodeChar('/')]
    comment_body += [SplitContainer.from_single_token(word) for word in words_before_apostrophe]
    comment_body.append(NonCodeChar("'"))
    comment_body += [SplitContainer.from_single_token(word) for word in words_after_apostrophe]
    comment_body.append(NewLine())

    expected_tokens = [OneLineComment(comment_body)]

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 2
0
def test_floats():
    """Float literals (exponents, f/F/d/D suffixes) must become Number tokens.

    A leading minus sign is expected as a separate Operator token rather than
    as part of the Number.
    """
    source = '''float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8};'''

    # Tokens for ``float[] floats = {``
    declaration = [
        KeyWord('float'),
        Operator('['),
        Operator(']'),
        SplitContainer.from_single_token('floats'),
        Operator('='),
        OpeningCurlyBracket(),
    ]
    # Tokens for the comma-separated literals inside the braces.
    initializer = [
        Operator('-'), Number("0.43E4f"), Operator(','),
        Number(".58F"), Operator(','),
        Number("0.d"), Operator(','),
        Operator('-'), Number('9.63e+2D'), Operator(','),
        Number('0.E-8'),
    ]
    # Tokens for ``};`` plus the trailing NewLine.
    closing = [ClosingCurlyBracket(), Semicolon(), NewLine()]

    expected_tokens = declaration + initializer + closing

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 3
0
def test_multi_line_comment():
    """A ``/* ... */`` comment spanning two lines must become one MultilineComment.

    Everything after the closing ``*/`` (here a stray ``/`` and an identifier on
    the next line) is expected as ordinary code tokens.
    """
    source = '''
/*multi-line MyComment_
*//
_operations
'''

    comment_tokens = [
        NonCodeChar('/'),
        NonCodeChar('*'),
        SplitContainer.from_single_token('multi'),
        NonCodeChar('-'),
        SplitContainer.from_single_token('line'),
        SplitContainer([Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
        NonCodeChar('*'),
        NonCodeChar('/'),
    ]
    identifier_after_comment = SplitContainer([Underscore(), Word.from_('operations')])

    expected_tokens = [
        MultilineComment(comment_tokens),
        Operator('/'),
        NewLine(),
        identifier_after_comment,
        NewLine(),
    ]

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 4
0
def test_longs():
    """Hexadecimal long literals (``L``/``l`` suffix) must become Number tokens.

    The runs of spaces between the literals are expected to show up as Tab
    tokens (one after the first comma, two before the second one).
    """
    source = '''long[] lovely_longs = {0x34a35EL,     0x88bc96fl           , -0x34L};'''

    variable_name = SplitContainer([Word.from_('lovely'), Underscore(), Word.from_('longs')])

    # Tokens for ``long[] lovely_longs = {``
    declaration = [
        KeyWord('long'),
        Operator('['),
        Operator(']'),
        variable_name,
        Operator('='),
        OpeningCurlyBracket(),
    ]
    # Tokens for the three literals together with the whitespace between them.
    initializer = [
        Number("0x34a35EL"), Operator(','),
        Tab(), Number("0x88bc96fl"), Tab(), Tab(), Operator(','),
        Operator('-'), Number("0x34L"),
    ]
    closing = [ClosingCurlyBracket(), Semicolon(), NewLine()]

    expected_tokens = declaration + initializer + closing

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 5
0
def preprocess_and_write(params: Tuple[bytes, bytes]) -> None:
    """Parse one source file and store the result as a gzipped pickle.

    The parsed tokens are first written to ``dest_file_path`` plus the
    ``NOT_FINISHED_EXTENSION`` suffix and only renamed to ``dest_file_path``
    once the dump has completed.

    :param params: ``(src_file_path, dest_file_path)`` pair of bytes paths,
        packed into one tuple.
    :return: ``None``. The function returns early, writing nothing, when the
        destination already exists and ``REWRITE_PARSED_FILE`` is falsy, or
        when the source file cannot be read (``FileNotFoundError``).
    """
    src_file_path, dest_file_path = params

    dest_dirname = os.path.dirname(dest_file_path)
    if not os.path.exists(dest_dirname):
        os.makedirs(dest_dirname, exist_ok=True)

    if not REWRITE_PARSED_FILE and os.path.exists(dest_file_path):
        logger.warning(f"File {dest_file_path} already exists! Doing nothing.")
        return

    # Read the source BEFORE creating the temporary output file: otherwise an
    # unreadable source would leave an empty "not finished" file behind.
    try:
        lines_from_file, _ = read_file_contents(src_file_path)
    except FileNotFoundError:
        logger.error(
            f"File was found when scanning the directory, but cannot be read: {src_file_path}. "
            f"Invalid symlink? Ignoring ...")
        return

    # File extension without the leading dot, passed to convert_text.
    extension_bin = os.path.splitext(src_file_path)[1].decode()[1:]

    not_finished_dest_file_path = dest_file_path + NOT_FINISHED_EXTENSION.encode()
    with gzip.GzipFile(not_finished_dest_file_path, 'wb') as f:
        parsed = list(convert_text("\n".join(lines_from_file), extension_bin))
        pickle.dump(parsed, f, pickle.HIGHEST_PROTOCOL)

    # Rename only after the dump has completed.
    os.rename(not_finished_dest_file_path, dest_file_path)
Ejemplo n.º 6
0
def preprocess(text: str, config: PrepConfig, bpe_codes_id: Optional[str] = None, extension: Optional[str] = None,
               return_metadata: bool = False, force_reinit_bpe_data: bool = True, append_eof: bool = False) \
        -> Union[List[str], Tuple[List[str], PreprocessingMetadata]]:
    """Parse ``text`` and turn the parsed tokens into their representation for ``config``.

    :param text: text to be preprocessed.
    :param config: preprocessing configuration handed to ``to_repr``; when
        ``config.is_bpe()`` is true, BPE data is initialised first.
    :param bpe_codes_id: id of the BPE codes; asserted to be truthy when the
        config is a BPE one.
    :param extension: passed through to ``convert_text`` together with ``text``.
    :param return_metadata: if true, the metadata produced by ``to_repr`` is
        returned alongside the tokens.
    :param force_reinit_bpe_data: forwarded to ``init_bpe_data``.
    :param append_eof: if true, a ``SpecialToken`` built from
        ``placeholders['ect']`` is appended after the parsed tokens.
    :return: the preprocessed tokens, or a ``(tokens, metadata)`` tuple when
        ``return_metadata`` is set.
    """
    tokens = remove_trailing_newline(list(convert_text(text, extension)))
    if append_eof:
        tokens.append(SpecialToken(placeholders['ect']))

    if config.is_bpe():
        assert bpe_codes_id
        # Predefined ids need no custom config; any other id is resolved to one.
        if is_predefined_id(bpe_codes_id):
            custom_bpe_config = None
        else:
            custom_bpe_config = CustomBpeConfig.from_id(bpe_codes_id)
        init_bpe_data(config, custom_bpe_config, force_reinit_bpe_data)

    prep_tokens, metadata = to_repr(config, tokens)
    if not return_metadata:
        return prep_tokens
    return prep_tokens, metadata
Ejemplo n.º 7
0
def test_string_with_spaces():
    """Runs of spaces inside a string literal must become SpaceInString tokens.

    The whole literal is expected as one StringLiteral whose second argument
    is 26, followed by a NewLine.
    """
    source = '''"hi   dear     world    !"'''

    literal_parts = [
        NonCodeChar('"'),
        SplitContainer.from_single_token('hi'),
        SpaceInString(3),
        SplitContainer.from_single_token('dear'),
        SpaceInString(5),
        SplitContainer.from_single_token('world'),
        SpaceInString(4),
        NonCodeChar('!'),
        NonCodeChar('"'),
    ]
    expected_tokens = [StringLiteral(literal_parts, 26), NewLine()]

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 8
0
def test_ints():
    """Decimal and hexadecimal int literals must become Number tokens.

    Also covers the splitting of a mixed snake_case/camelCase identifier with
    leading and trailing underscores into Word and Underscore parts.
    """
    source = '''int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf};'''

    variable_name = SplitContainer([
        Underscore(),
        Word.from_('my'),
        Underscore(),
        Word.from_('favo'),
        Word.from_('Rite'),
        Underscore(),
        Word.from_('ints'),
        Underscore(),
    ])

    # Tokens for ``int[] _my_favoRite_ints_ = {``
    declaration = [
        KeyWord('int'),
        Operator('['),
        Operator(']'),
        variable_name,
        Operator('='),
        OpeningCurlyBracket(),
    ]
    # Tokens for the literals; a minus sign is a separate Operator.
    initializer = [
        Number("0x12"), Operator(','),
        Number("0x1fE"), Operator(','),
        Number("441"), Operator(','),
        Operator('-'), Number("81"), Operator(','),
        Operator('-'), Number("0xfFf"),
    ]
    closing = [ClosingCurlyBracket(), Semicolon(), NewLine()]

    expected_tokens = declaration + initializer + closing

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 9
0
def test_spaces_in_strings():
    """String literals with inner spaces and an escaped quote must be parsed whole.

    The first literal contains a run of four spaces; the second one is an
    escaped double quote, expected as backslash + quote NonCodeChars.
    """
    source = '''BigAWESOMEString[] a2y = "a    bc".doSplit("\\"");'''

    type_name = SplitContainer(
        [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')], )
    variable_name = SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')])
    spaced_literal = StringLiteral([
        NonCodeChar('"'),
        SplitContainer.from_single_token('a'),
        SpaceInString(n_chars=4),
        SplitContainer.from_single_token('bc'),
        NonCodeChar('"'),
    ], 9)
    method_name = SplitContainer([Word.from_('do'), Word.from_('Split')])
    escaped_quote_literal = StringLiteral([
        NonCodeChar('"'),
        NonCodeChar('\\'),
        NonCodeChar('"'),
        NonCodeChar('"'),
    ], 4)

    expected_tokens = [
        type_name,
        Operator('['),
        Operator(']'),
        variable_name,
        Operator('='),
        spaced_literal,
        Operator('.'),
        method_name,
        OpeningBracket(),
        escaped_quote_literal,
        ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 10
0
def test_capitals():
    """Identifiers are split on case changes and underscores.

    A capitalised word and an all-caps word are each expected to stay a single
    token, while ``MyClass`` and ``VAR_WITH_UNDERSCORES`` are split into parts.
    """
    source = '''
MyClass Class CONSTANT VAR_WITH_UNDERSCORES
'''

    camel_case = SplitContainer([Word.from_("My"), Word.from_("Class")])
    capitalised = SplitContainer.from_single_token("Class")
    all_caps = SplitContainer.from_single_token("CONSTANT")
    upper_snake_case = SplitContainer([
        Word.from_("VAR"),
        Underscore(),
        Word.from_("WITH"),
        Underscore(),
        Word.from_("UNDERSCORES"),
    ])

    expected_tokens = [camel_case, capitalised, all_caps, upper_snake_case, NewLine()]

    parsed_tokens = list(convert_text(source, 'java'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 11
0
def test_string_literal_double():
    """A double-quoted string parsed with the ``py`` extension.

    The expected stream holds three separate StringLiteral tokens: the opening
    quote, the string content, and the closing quote.
    """
    source = '''a = "some_text".split()'''

    def quote_literal():
        # A fresh StringLiteral wrapping one double-quote character.
        return StringLiteral([NonCodeChar('"')], 1)

    string_content = StringLiteral([
        SplitContainer([Word.from_("some"), Underscore(), Word.from_("text")])
    ], 9)

    expected_tokens = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        quote_literal(),
        string_content,
        quote_literal(),
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(),
        ClosingBracket(),
        NewLine(),
    ]

    parsed_tokens = list(convert_text(source, 'py'))

    assert expected_tokens == parsed_tokens
Ejemplo n.º 12
0
def test_special_characters():
    """Operators, brackets and other special characters are tokenised one by one.

    Multi-character operators such as ``==`` or ``>>`` are expected as two
    separate Operator tokens; ``$``, ``@``, ``#``, the backslash and the
    apostrophe are expected as NonCodeChar tokens; leading indentation is
    expected as Tab tokens.
    """
    text = '''
abc1
~-0xFFFFFL=
.0E+5
|=
?
==
!=
**
++
--
+=
-=
/=
*=
%=
$
<=
>=
@
    ^=
    &=
    #
                                                                                 >>
<<
&&
||
+*!/><\t\n
{}[],.-:();&|\\'~%^
'''

    def ops(symbols):
        # One Operator token per character of ``symbols``, in order.
        return [Operator(symbol) for symbol in symbols]

    expected_tokens = [
        SplitContainer([Word.from_('abc'), Word.from_('1')]), NewLine(),
        *ops('~-'), Number("0xFFFFFL"), Operator('='), NewLine(),
        Number(".0E+5"), NewLine(),
    ]

    # Lines made up solely of operator characters.
    for symbols in ('|=', '?', '==', '!=', '**', '++', '--', '+=', '-=', '/=', '*=', '%='):
        expected_tokens += ops(symbols) + [NewLine()]

    expected_tokens += [NonCodeChar('$'), NewLine()]
    expected_tokens += ops('<=') + [NewLine()]
    expected_tokens += ops('>=') + [NewLine()]
    expected_tokens += [NonCodeChar('@'), NewLine()]

    # Indented lines: each one starts with a single Tab token.
    expected_tokens += [Tab(), *ops('^='), NewLine()]
    expected_tokens += [Tab(), *ops('&='), NewLine()]
    expected_tokens += [Tab(), NonCodeChar('#'), NewLine()]

    # The long run of leading spaces before ``>>`` is expected as 20 Tab tokens.
    expected_tokens += [Tab() for _ in range(20)]
    for symbols in ('>>', '<<', '&&', '||'):
        expected_tokens += ops(symbols) + [NewLine()]

    # ``+*!/><`` followed by the escaped tab and newline, then the line's own newline.
    expected_tokens += ops('+*!/><') + [Tab(), NewLine(), NewLine()]

    # Last line: brackets, punctuation and the remaining special characters.
    expected_tokens += [
        OpeningCurlyBracket(),
        ClosingCurlyBracket(),
        *ops('[],.-:'),
        OpeningBracket(),
        ClosingBracket(),
        Semicolon(),
        *ops('&|'),
        NonCodeChar('\\'),
        NonCodeChar("'"),
        *ops('~%^'),
        NewLine(),
    ]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens