def test_multi_line_comment():
    """Check tokenization of a Java block comment that spans two lines.

    The comment is expected as a single MultilineComment token containing
    the line break; the extra '/' after the closing '*/' is expected as a
    separate Operator, followed by the '_operations' identifier.
    """
    # The line breaks in this literal are significant: the expected tokens
    # contain a NewLine inside the comment and one after each later line.
    # The expected list starts directly with the comment, so the leading
    # blank line is presumably dropped by convert_text -- TODO confirm.
    text = '''
/*multi-line MyComment_
*//
_operations
'''

    expected_result = [
        MultilineComment([
            NonCodeChar('/'),
            NonCodeChar('*'),
            SplitContainer.from_single_token('multi'),
            NonCodeChar('-'),
            SplitContainer.from_single_token('line'),
            SplitContainer(
                [Word.from_('My'), Word.from_('Comment'), Underscore()]),
            NewLine(),
            NonCodeChar('*'),
            NonCodeChar('/'),
        ]),
        Operator('/'),
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected_result == actual
def test_one_line_comment():
    """Check that a whole '//' line is returned as one OneLineComment token.

    Every word of the comment is expected as its own single-token
    SplitContainer, the apostrophe as a NonCodeChar, and the terminating
    NewLine as the last element inside the comment.
    """
    source = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    word = SplitContainer.from_single_token
    comment_body = [NonCodeChar('/'), NonCodeChar('/')]
    comment_body += [word('this'), word('code'), word('won')]
    comment_body.append(NonCodeChar("'"))
    comment_body += [
        word('t'),
        word('compile'),
        word('but'),
        word('the'),
        word('preprocessing'),
        word('still'),
        word('has'),
        word('to'),
        word('be'),
        word('done'),
        word('corrrectly'),
    ]
    comment_body.append(NewLine())
    expected_result = [OneLineComment(comment_body)]

    actual = list(convert_text(source, 'java'))

    assert expected_result == actual
def to_parsed_token(token: str) -> ParsedToken:
    """Convert one raw token string into the matching parsed-token object.

    The checks run in this order, and the first that applies wins:
    a newline character gives ``NewLine``, a tab character gives ``Tab``,
    anything ``is_number`` accepts gives ``Number``, a string consisting
    only of word characters is passed to ``split_identifier``, and
    everything else is wrapped in ``NonCodeChar``.
    """
    whitespace_tokens = {'\n': NewLine, '\t': Tab}
    if token in whitespace_tokens:
        return whitespace_tokens[token]()

    # Numbers are tested before the identifier pattern because digits are
    # word characters as well.
    if is_number(token):
        return Number(token)

    if regex.fullmatch("\\w+", token):
        return split_identifier(token)

    return NonCodeChar(token)
def test_string_with_spaces():
    """Check that runs of spaces inside a string literal keep their length.

    Each run is expected as one SpaceInString carrying the number of
    spaces, and the StringLiteral is expected to record the total length
    of the literal (26 characters including both quotes).
    """
    # The space runs are spelled out explicitly so that their widths
    # (3, 5 and 4) cannot be lost to whitespace normalisation; they must
    # agree with the SpaceInString values and the length 26 below.
    text = '"hi' + ' ' * 3 + 'dear' + ' ' * 5 + 'world' + ' ' * 4 + '!"'

    expected = [
        StringLiteral([
            NonCodeChar('"'),
            SplitContainer.from_single_token('hi'),
            SpaceInString(3),
            SplitContainer.from_single_token('dear'),
            SpaceInString(5),
            SplitContainer.from_single_token('world'),
            SpaceInString(4),
            NonCodeChar('!'),
            NonCodeChar('"'),
        ], 26),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected == actual
def split_into_words(token: str) -> List[ParsedToken]:
    """Split *token* into parsed tokens, mapping each four-space run to a Tab.

    Runs of word characters and single non-space characters are converted
    with ``to_parsed_token``. Spaces that are not part of a run of exactly
    four match nothing in the pattern and are therefore dropped.

    >>> split_into_words("    var = 9.4\\t\\n")  # doctest: +NORMALIZE_WHITESPACE
    [<Tab>, SplitContainer[Word(('var', none))], NonCodeChar(=), <Number>(9),
     NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
    """
    four_char_whitespace = " " * 4
    pattern = f"(\\w+|[^ ]|{four_char_whitespace})"
    return [
        Tab() if m[0] == four_char_whitespace else to_parsed_token(m[0])
        for m in regex.finditer(pattern, token)
    ]
def split_string(token: str) -> List[ParsedToken]:
    """Split the text of a string literal into parsed tokens, keeping spaces.

    Every run of one or more spaces becomes a single ``SpaceInString``
    whose ``n_chars`` is the length of the run. Runs of word characters
    and single non-space characters are converted with
    ``to_parsed_token``.

    >>> split_string("    var = 9.4\\t\\n")  # doctest: +NORMALIZE_WHITESPACE
    [<SpaceInString> (n_chars=4), SplitContainer[Word(('var', none))],
     <SpaceInString> (n_chars=1), NonCodeChar(=), <SpaceInString> (n_chars=1),
     <Number>(9), NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
    """
    arbitrary_whitespace = "( )+"
    pattern = f"(\\w+|[^ ]|{arbitrary_whitespace})"
    return [
        SpaceInString(n_chars=len(m[0]))
        if regex.fullmatch(arbitrary_whitespace, m[0])
        else to_parsed_token(m[0])
        for m in regex.finditer(pattern, token)
    ]
def test_string_literal_double():
    """Check tokenization of a double-quoted string literal in Python source.

    For the 'py' extension the opening quote, the string contents and the
    closing quote are each expected as a separate StringLiteral token,
    with lengths 1, 9 and 1.
    """
    source = '''a = "some_text".split()'''

    quote = [NonCodeChar('"')]
    contents = [
        SplitContainer(
            [Word.from_("some"), Underscore(), Word.from_("text")])
    ]
    expected_result = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        StringLiteral(quote, 1),
        StringLiteral(contents, 9),
        StringLiteral([NonCodeChar('"')], 1),
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(),
        ClosingBracket(),
        NewLine(),
    ]

    actual = list(convert_text(source, 'py'))

    assert expected_result == actual
def test_spaces_in_strings():
    """Check a Java statement with a spaced string and an escaped quote.

    The first literal is expected to keep its run of four spaces as one
    SpaceInString and to have length 9; the second literal holds an
    escaped double quote and is expected as four NonCodeChar items.
    """
    # The four-space run is written explicitly so that it cannot be lost
    # to whitespace normalisation; it must agree with
    # SpaceInString(n_chars=4) and the literal length 9 below.
    text = 'BigAWESOMEString[] a2y = "a' + ' ' * 4 + 'bc".doSplit("\\"");'

    expected_result = [
        SplitContainer(
            [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')],
        ),
        Operator('['),
        Operator(']'),
        SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]),
        Operator('='),
        StringLiteral([
            NonCodeChar('"'),
            SplitContainer.from_single_token('a'),
            SpaceInString(n_chars=4),
            SplitContainer.from_single_token('bc'),
            NonCodeChar('"'),
        ], 9),
        Operator('.'),
        SplitContainer([Word.from_('do'), Word.from_('Split')]),
        OpeningBracket(),
        StringLiteral([
            NonCodeChar('"'),
            NonCodeChar('\\'),
            NonCodeChar('"'),
            NonCodeChar('"'),
        ], 4),
        ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected_result == actual
def test_to_repr_with_enonlycontents1():
    """Check ``to_repr`` output and metadata for a stream full of NonEng tokens.

    With the configuration below, every NonEng-wrapped item is expected to
    be rendered as the ``non_eng`` placeholder, the NewLine and Tab tokens
    are expected to contribute nothing to the output, and the one-line
    comment is expected to be closed by the ``olc_end`` placeholder.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    def non_eng(*subtokens):
        # Wrap the given sub-tokens in a SplitContainer marked as NonEng.
        return NonEng(SplitContainer(list(subtokens)))

    # Contents of the string literal: the words separated by single
    # SpaceInString tokens, enclosed in double quotes.
    string_words = [
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    ]
    string_body = [NonCodeChar('"')]
    for position, string_word in enumerate(string_words):
        if position > 0:
            string_body.append(SpaceInString())
        string_body.append(non_eng(Word.from_(string_word)))
    string_body.append(NonCodeChar('"'))

    tokens = [
        Number("1.1"),
        Operator("*"),
        non_eng(Word.from_("dinero")),
        StringLiteral(string_body, 62),
        NewLine(),
        MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
        MultilineComment([
            non_eng(Word.from_('ц')),
            non_eng(
                Word.from_("blanco"), Underscore(), Word.from_("english")
            ),
        ]),
        MultilineComment([NonCodeChar('*'), NonCodeChar('/')]),
        NewLine(),
        Tab(),
        OneLineComment([
            NonCodeChar('/'),
            NonCodeChar('/'),
            non_eng(Word.from_("DIESELBE"), Word.from_("8")),
        ]),
    ]

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", pl['non_eng']]
        # One placeholder per word of the string literal.
        + ['"'] + [pl["non_eng"]] * len(string_words) + ['"']
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    expected_token_types = (
        [Number, Operator, NonEng]
        + [StringLiteral] * 14
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=expected_token_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
from codeprep.tokens.numeric import Number from codeprep.preprocess.placeholders import placeholders from codeprep.tokens.whitespace import Tab, NewLine, SpaceInString from codeprep.tokens.word import Word, Underscore, NonCodeChar, Operator from codeprep.prepconfig import PrepParam, PrepConfig from codeprep.pipeline.to_repr import to_repr pl = placeholders cwe = placeholders['compound_word_end'] tokens = [ Number('1.1'), Operator("*"), NonEng(SplitContainer([Word.from_("übersetzen")])), StringLiteral([ NonCodeChar('"'), NonEng( SplitContainer([ Word.from_("A"), Word.from_("Wirklicä") ]) ), SpaceInString(1), NonCodeChar('"') ], 11), NewLine(), MultilineComment([NonCodeChar('/'), NonCodeChar('*')]), MultilineComment([ NonEng( SplitContainer([Word.from_('ц')]), ),
def test_special_characters():
    """Check tokenization of operators, brackets and other special characters.

    The input holds one group of characters per line. Each line is
    expected to end in a NewLine token, and indentation is expected as one
    Tab token per four spaces.
    """
    # The input is assembled from explicit lines so that the line breaks
    # and indentation widths the expected tokens depend on are visible and
    # cannot be lost to whitespace normalisation.
    indent = " " * 4
    source_lines = [
        "abc1",
        "~-0xFFFFFL=",
        ".0E+5",
        "|=",
        "?",
        "==",
        "!=",
        "**",
        "++",
        "--",
        "+=",
        "-=",
        "/=",
        "*=",
        "%=",
        "$",
        "<=",
        ">=",
        "@",
        indent + "^=",
        indent + "&=",
        indent + "#",
        # 80 spaces, matching the 20 Tab tokens expected before '>>'.
        " " * 80 + ">>",
        "<<",
        "&&",
        "||",
        # A real tab and a real newline character, in addition to the line
        # break added by the join below.
        "+*!/><\t\n",
        "{}[],.-:();&|\\'~%^",
    ]
    # The expected list starts directly with 'abc1', so the leading blank
    # line is presumably dropped by convert_text -- TODO confirm.
    text = "\n" + "\n".join(source_lines) + "\n"

    expected_result = [
        SplitContainer([Word.from_('abc'), Word.from_('1')]),
        NewLine(),
        Operator('~'), Operator('-'), Number("0xFFFFFL"), Operator('='),
        NewLine(),
        Number(".0E+5"),
        NewLine(),
        Operator('|'), Operator('='),
        NewLine(),
        Operator('?'),
        NewLine(),
        Operator('='), Operator('='),
        NewLine(),
        Operator('!'), Operator('='),
        NewLine(),
        Operator('*'), Operator('*'),
        NewLine(),
        Operator('+'), Operator('+'),
        NewLine(),
        Operator('-'), Operator('-'),
        NewLine(),
        Operator('+'), Operator('='),
        NewLine(),
        Operator('-'), Operator('='),
        NewLine(),
        Operator('/'), Operator('='),
        NewLine(),
        Operator('*'), Operator('='),
        NewLine(),
        Operator('%'), Operator('='),
        NewLine(),
        NonCodeChar('$'),
        NewLine(),
        Operator('<'), Operator('='),
        NewLine(),
        Operator('>'), Operator('='),
        NewLine(),
        NonCodeChar('@'),
        NewLine(),
        Tab(), Operator('^'), Operator('='),
        NewLine(),
        Tab(), Operator('&'), Operator('='),
        NewLine(),
        Tab(), NonCodeChar('#'),
        NewLine(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Operator('>'), Operator('>'),
        NewLine(),
        Operator('<'), Operator('<'),
        NewLine(),
        Operator('&'), Operator('&'),
        NewLine(),
        Operator('|'), Operator('|'),
        NewLine(),
        Operator('+'), Operator('*'), Operator('!'), Operator('/'),
        Operator('>'), Operator('<'),
        Tab(),
        NewLine(),
        NewLine(),
        OpeningCurlyBracket(), ClosingCurlyBracket(),
        Operator('['), Operator(']'),
        Operator(','), Operator('.'), Operator('-'), Operator(':'),
        OpeningBracket(), ClosingBracket(),
        Semicolon(),
        Operator('&'), Operator('|'),
        NonCodeChar('\\'), NonCodeChar("'"),
        Operator('~'), Operator('%'), Operator('^'),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected_result == actual