def test_multi_line_comment(): text = ''' /*multi-line MyComment_ *// _operations ''' expected_result = [ MultilineComment([ NonCodeChar('/'), NonCodeChar('*'), SplitContainer.from_single_token('multi'), NonCodeChar('-'), SplitContainer.from_single_token('line'), SplitContainer( [Word.from_('My'), Word.from_('Comment'), Underscore()]), NewLine(), NonCodeChar('*'), NonCodeChar('/') ]), Operator('/'), NewLine(), SplitContainer([Underscore(), Word.from_('operations')]), NewLine() ] actual = [t for t in convert_text(text, 'java')] assert expected_result == actual
def test_longs(): text = '''long[] lovely_longs = {0x34a35EL, 0x88bc96fl , -0x34L};''' expected_result = [ KeyWord('long'), Operator('['), Operator(']'), SplitContainer( [Word.from_('lovely'), Underscore(), Word.from_('longs')]), Operator('='), OpeningCurlyBracket(), Number("0x34a35EL"), Operator(','), Tab(), Number("0x88bc96fl"), Tab(), Tab(), Operator(','), Operator('-'), Number("0x34L"), ClosingCurlyBracket(), Semicolon(), NewLine() ] actual = [t for t in convert_text(text, 'java')] assert expected_result == actual
def test_split_string(): actual = split_string("123\nAb2cd34Ef000GG j_89_J") expected = [ Number('123'), NewLine(), SplitContainer([ Word.from_('Ab'), Word.from_('2'), Word.from_('cd'), Word.from_('34'), Word.from_('Ef'), Word.from_('000'), Word.from_('GG') ]), SpaceInString(5), SplitContainer([ Word.from_('j'), Underscore(), Word.from_('89'), Underscore(), Word.from_('J') ]) ] assert expected == actual
def split_identifier(token: str) -> SplitContainer: parts = [ m[0] for m in regex.finditer( '(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]])|[^ ])', token) ] processable_tokens = [ Word.from_(p) if p != '_' else Underscore() for p in parts ] split_container = SplitContainer(processable_tokens) return NonEng(split_container) if is_non_eng(token) else split_container
def test_ints(): text = '''int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf};''' expected_result = [ KeyWord('int'), Operator('['), Operator(']'), SplitContainer([ Underscore(), Word.from_('my'), Underscore(), Word.from_('favo'), Word.from_('Rite'), Underscore(), Word.from_('ints'), Underscore() ]), Operator('='), OpeningCurlyBracket(), Number("0x12"), Operator(','), Number("0x1fE"), Operator(','), Number("441"), Operator(','), Operator('-'), Number("81"), Operator(','), Operator('-'), Number("0xfFf"), ClosingCurlyBracket(), Semicolon(), NewLine() ] actual = [t for t in convert_text(text, 'java')] assert expected_result == actual
def test_capitals(): text = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES ''' expected_result = [ SplitContainer([Word.from_("My"), Word.from_("Class")]), SplitContainer.from_single_token("Class"), SplitContainer.from_single_token("CONSTANT"), SplitContainer([ Word.from_("VAR"), Underscore(), Word.from_("WITH"), Underscore(), Word.from_("UNDERSCORES") ]), NewLine() ] actual = [t for t in convert_text(text, 'java')] assert expected_result == actual
def test_string_literal_double(): text = '''a = "some_text".split()''' expected_result = [ SplitContainer.from_single_token("a"), Operator('='), StringLiteral([NonCodeChar('"')], 1), StringLiteral([ SplitContainer( [Word.from_("some"), Underscore(), Word.from_("text")]) ], 9), StringLiteral([NonCodeChar('"')], 1), Operator('.'), SplitContainer.from_single_token("split"), OpeningBracket(), ClosingBracket(), NewLine() ] actual = [t for t in convert_text(text, 'py')] assert expected_result == actual
def test_spaces_in_strings(): text = '''BigAWESOMEString[] a2y = "a bc".doSplit("\\"");''' expected_result = [ SplitContainer( [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')], ), Operator('['), Operator(']'), SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]), Operator('='), StringLiteral([ NonCodeChar('"'), SplitContainer.from_single_token('a'), SpaceInString(n_chars=4), SplitContainer.from_single_token('bc'), NonCodeChar('"') ], 9), Operator('.'), SplitContainer([Word.from_('do'), Word.from_('Split')]), OpeningBracket(), StringLiteral([ NonCodeChar('"'), NonCodeChar('\\'), NonCodeChar('"'), NonCodeChar('"') ], 4), ClosingBracket(), Semicolon(), NewLine() ] actual = [t for t in convert_text(text, 'java')] assert expected_result == actual
def test_to_repr_with_enonlycontents1(): prep_config = PrepConfig({ PrepParam.EN_ONLY: 'U', PrepParam.COM: 'c', PrepParam.STR: '1', PrepParam.SPLIT: '2', PrepParam.TABS_NEWLINES: '0', PrepParam.CASE: 'l' }) tokens = [ Number("1.1"), Operator("*"), NonEng(SplitContainer([Word.from_("dinero")])), StringLiteral([ NonCodeChar('"'), NonEng(SplitContainer([Word.from_("ich")])), SpaceInString(), NonEng(SplitContainer([Word.from_("weiss")])), SpaceInString(), NonEng(SplitContainer([Word.from_("nicht")])), SpaceInString(), NonEng(SplitContainer([Word.from_("was")])), SpaceInString(), NonEng(SplitContainer([Word.from_("soll")])), SpaceInString(), NonEng(SplitContainer([Word.from_("es")])), SpaceInString(), NonEng(SplitContainer([Word.from_("bedeuten")])), SpaceInString(), NonEng(SplitContainer([Word.from_("dass")])), SpaceInString(), NonEng(SplitContainer([Word.from_("ich")])), SpaceInString(), NonEng(SplitContainer([Word.from_("so")])), SpaceInString(), NonEng(SplitContainer([Word.from_("traurig")])), SpaceInString(), NonEng(SplitContainer([Word.from_("bin")])), NonCodeChar('"'), ], 62), NewLine(), MultilineComment([NonCodeChar('/'), NonCodeChar('*')]), MultilineComment([ NonEng(SplitContainer([Word.from_('ц')])), NonEng( SplitContainer([ Word.from_("blanco"), Underscore(), Word.from_("english") ]) ), ]), MultilineComment([NonCodeChar('*'), NonCodeChar('/')]), NewLine(), Tab(), OneLineComment([NonCodeChar('/'), NonCodeChar('/'), NonEng( SplitContainer([ Word.from_("DIESELBE"), Word.from_("8") ]) ) ]) ] actual, actual_metadata = to_repr(prep_config, tokens) expected = [ pl['word_start'], '1', '.', '1', pl['word_end'], "*", pl['non_eng'], '"', pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"], '"', '/', '*', pl['non_eng'], pl['non_eng'], '*', '/', '/', '/', pl['non_eng'], pl['olc_end'] ] expected_metadata = PreprocessingMetadata({'*', '"', "/", "*"}, word_boundaries=[0] + list(range(5, 32)), token_types=[Number, Operator, NonEng] + [StringLiteral] * 14 + [MultilineComment] * 6 + [OneLineComment] * 4) assert expected == actual assert expected_metadata == actual_metadata
from codeprep.preprocess.metadata import PreprocessingMetadata from codeprep.tokens.noneng import NonEng from codeprep.tokens.numeric import Number from codeprep.preprocess.placeholders import placeholders from codeprep.tokens.whitespace import Tab, NewLine, SpaceInString from codeprep.tokens.word import Word, Underscore, NonCodeChar, Operator from codeprep.prepconfig import PrepParam, PrepConfig from codeprep.pipeline.to_repr import to_repr pl = placeholders cwe = placeholders['compound_word_end'] tokens = [ Number('1.1'), Operator("*"), NonEng(SplitContainer([Word.from_("übersetzen")])), StringLiteral([ NonCodeChar('"'), NonEng( SplitContainer([ Word.from_("A"), Word.from_("Wirklicä") ]) ), SpaceInString(1), NonCodeChar('"') ], 11), NewLine(), MultilineComment([NonCodeChar('/'), NonCodeChar('*')]), MultilineComment([ NonEng(
def test_special_characters(): text = ''' abc1 ~-0xFFFFFL= .0E+5 |= ? == != ** ++ -- += -= /= *= %= $ <= >= @ ^= &= # >> << && || +*!/><\t\n {}[],.-:();&|\\'~%^ ''' expected_result = [ SplitContainer([Word.from_('abc'), Word.from_('1')]), NewLine(), Operator('~'), Operator('-'), Number("0xFFFFFL"), Operator('='), NewLine(), Number(".0E+5"), NewLine(), Operator('|'), Operator('='), NewLine(), Operator('?'), NewLine(), Operator('='), Operator('='), NewLine(), Operator('!'), Operator('='), NewLine(), Operator('*'), Operator('*'), NewLine(), Operator('+'), Operator('+'), NewLine(), Operator('-'), Operator('-'), NewLine(), Operator('+'), Operator('='), NewLine(), Operator('-'), Operator('='), NewLine(), Operator('/'), Operator('='), NewLine(), Operator('*'), Operator('='), NewLine(), Operator('%'), Operator('='), NewLine(), NonCodeChar('$'), NewLine(), Operator('<'), Operator('='), NewLine(), Operator('>'), Operator('='), NewLine(), NonCodeChar('@'), NewLine(), Tab(), Operator('^'), Operator('='), NewLine(), Tab(), Operator('&'), Operator('='), NewLine(), Tab(), NonCodeChar('#'), NewLine(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Tab(), Operator('>'), Operator('>'), NewLine(), Operator('<'), Operator('<'), NewLine(), Operator('&'), Operator('&'), NewLine(), Operator('|'), Operator('|'), NewLine(), Operator('+'), Operator('*'), Operator('!'), Operator('/'), Operator('>'), Operator('<'), Tab(), NewLine(), NewLine(), OpeningCurlyBracket(), ClosingCurlyBracket(), Operator('['), Operator(']'), Operator(','), Operator('.'), Operator('-'), Operator(':'), OpeningBracket(), ClosingBracket(), Semicolon(), Operator('&'), Operator('|'), NonCodeChar('\\'), NonCodeChar("'"), Operator('~'), Operator('%'), Operator('^'), NewLine() ] actual = [t for t in convert_text(text, 'java')] assert expected_result == actual
def from_single_token(cls, token: str): return cls([Word.from_(token)])