def test_split_string():
    """split_string yields numbers, newlines, sub-word containers and space runs."""
    # The gap between 'GG' and 'j' is five spaces wide so that it matches
    # the SpaceInString(5) expected below; a single space cannot produce it.
    actual = split_string("123\nAb2cd34Ef000GG     j_89_J")

    expected = [
        Number('123'),
        NewLine(),
        SplitContainer([
            Word.from_('Ab'),
            Word.from_('2'),
            Word.from_('cd'),
            Word.from_('34'),
            Word.from_('Ef'),
            Word.from_('000'),
            Word.from_('GG'),
        ]),
        SpaceInString(5),
        SplitContainer([
            Word.from_('j'),
            Underscore(),
            Word.from_('89'),
            Underscore(),
            Word.from_('J'),
        ]),
    ]

    assert expected == actual
def test_multi_line_comment():
    """A Java /* ... */ comment becomes one MultilineComment; code after it is parsed normally."""
    # NOTE(review): the line breaks in this literal are reconstructed from the
    # NewLine() tokens expected below (one inside the comment, one after the
    # trailing '/') — confirm against the intended fixture.
    text = '''/*multi-line MyComment_
*//
_operations'''

    expected_result = [
        MultilineComment([
            NonCodeChar('/'),
            NonCodeChar('*'),
            SplitContainer.from_single_token('multi'),
            NonCodeChar('-'),
            SplitContainer.from_single_token('line'),
            SplitContainer(
                [Word.from_('My'), Word.from_('Comment'), Underscore()]),
            NewLine(),
            NonCodeChar('*'),
            NonCodeChar('/'),
        ]),
        Operator('/'),
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
    ]

    actual = [t for t in convert_text(text, 'java')]

    assert expected_result == actual
def test_floats():
    """Java float literals (exponents, type suffixes, bare leading/trailing dot) parse as Number."""
    text = '''float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8};'''

    # Declaration part: `float[] floats = {`
    declaration = [
        KeyWord('float'),
        Operator('['),
        Operator(']'),
        SplitContainer.from_single_token('floats'),
        Operator('='),
        OpeningCurlyBracket(),
    ]
    # Initializer values; a leading minus is a separate Operator token.
    values = [
        Operator('-'), Number("0.43E4f"), Operator(','),
        Number(".58F"), Operator(','),
        Number("0.d"), Operator(','),
        Operator('-'), Number('9.63e+2D'), Operator(','),
        Number('0.E-8'),
    ]
    closing = [ClosingCurlyBracket(), Semicolon(), NewLine()]
    expected_tokens = declaration + values + closing

    parsed = list(convert_text(text, 'java'))

    assert expected_tokens == parsed
def test_longs():
    """Hexadecimal long literals parse as Number; tabs between them are kept as Tab tokens."""
    # NOTE(review): the tabs (written as \t escapes) are reconstructed from the
    # Tab() tokens expected below — one before 0x88bc96fl and two after it.
    # A literal containing only single spaces cannot yield those tokens;
    # confirm against the intended fixture.
    text = '''long[] lovely_longs = {0x34a35EL,\t0x88bc96fl\t\t, -0x34L};'''

    expected_result = [
        KeyWord('long'),
        Operator('['),
        Operator(']'),
        SplitContainer(
            [Word.from_('lovely'), Underscore(), Word.from_('longs')]),
        Operator('='),
        OpeningCurlyBracket(),
        Number("0x34a35EL"),
        Operator(','),
        Tab(),
        Number("0x88bc96fl"),
        Tab(),
        Tab(),
        Operator(','),
        Operator('-'),
        Number("0x34L"),
        ClosingCurlyBracket(),
        Semicolon(),
        NewLine(),
    ]

    actual = [t for t in convert_text(text, 'java')]

    assert expected_result == actual
def preprocessed_repr(
    self, repr_config: ReprConfig
) -> Tuple[List[str], PreprocessingMetadata]:
    """Build the preprocessed representation of this token.

    Without BPE data in ``repr_config`` the whole word is represented by the
    ``non_eng`` placeholder. With BPE data, the string form of the wrapped
    token has its non-ASCII sequences replaced (using the ``non_ascii_seq``
    placeholder) and the result is rendered as a single-token SplitContainer.
    """
    if not repr_config.bpe_data:
        return self.wrap_in_metadata_for_full_word([placeholders['non_eng']])

    ascii_only = replace_non_ascii_seqs(
        str(self.processable_token), placeholders['non_ascii_seq'])
    return torepr(SplitContainer.from_single_token(ascii_only), repr_config)
def test_string_with_spaces():
    """Runs of spaces inside a string literal are reported with their exact width."""
    # The space runs are 3, 5 and 4 characters wide so that the literal matches
    # the SpaceInString widths and the total length of 26 expected below.
    text = '''"hi   dear     world    !"'''

    expected = [
        StringLiteral([
            NonCodeChar('"'),
            SplitContainer.from_single_token('hi'),
            SpaceInString(3),
            SplitContainer.from_single_token('dear'),
            SpaceInString(5),
            SplitContainer.from_single_token('world'),
            SpaceInString(4),
            NonCodeChar('!'),
            NonCodeChar('"'),
        ], 26),
        NewLine(),
    ]

    actual = [t for t in convert_text(text, 'java')]

    assert expected == actual
def split_identifier(token: str) -> SplitContainer:
    """Split an identifier into sub-words and wrap them in a SplitContainer.

    The pattern picks, in order of preference: a single underscore, a run of
    digits, a run of lowercase letters with an optional leading uppercase
    letter, a run of uppercase letters not followed by a lowercase letter,
    or any other single non-space character.

    Underscores become ``Underscore`` tokens, every other piece a ``Word``.
    When ``is_non_eng(token)`` is true the container is returned wrapped in
    ``NonEng``.
    """
    pattern = '(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]])|[^ ])'

    sub_tokens = []
    for match in regex.finditer(pattern, token):
        piece = match.group(0)
        sub_tokens.append(Underscore() if piece == '_' else Word.from_(piece))

    container = SplitContainer(sub_tokens)
    if is_non_eng(token):
        return NonEng(container)
    return container
def test_spaces_in_strings():
    """Space runs and escaped quotes inside Java string literals are tokenized precisely."""
    # "a    bc" carries four spaces so that it matches SpaceInString(n_chars=4)
    # and the literal length of 9 expected below.
    text = '''BigAWESOMEString[] a2y = "a    bc".doSplit("\\"");'''

    expected_result = [
        SplitContainer(
            [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')],
        ),
        Operator('['),
        Operator(']'),
        SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]),
        Operator('='),
        StringLiteral([
            NonCodeChar('"'),
            SplitContainer.from_single_token('a'),
            SpaceInString(n_chars=4),
            SplitContainer.from_single_token('bc'),
            NonCodeChar('"'),
        ], 9),
        Operator('.'),
        SplitContainer([Word.from_('do'), Word.from_('Split')]),
        OpeningBracket(),
        StringLiteral([
            NonCodeChar('"'),
            NonCodeChar('\\'),
            NonCodeChar('"'),
            NonCodeChar('"'),
        ], 4),
        ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]

    actual = [t for t in convert_text(text, 'java')]

    assert expected_result == actual
def test_capitals():
    """CamelCase, capitalized, all-caps and UPPER_SNAKE identifiers are split as expected."""
    text = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES '''

    camel_case = SplitContainer([Word.from_("My"), Word.from_("Class")])
    capitalized = SplitContainer.from_single_token("Class")
    all_caps = SplitContainer.from_single_token("CONSTANT")
    upper_snake = SplitContainer([
        Word.from_("VAR"),
        Underscore(),
        Word.from_("WITH"),
        Underscore(),
        Word.from_("UNDERSCORES"),
    ])
    expected_tokens = [camel_case, capitalized, all_caps, upper_snake, NewLine()]

    parsed = list(convert_text(text, 'java'))

    assert expected_tokens == parsed
def test_string_literal_double():
    """For Python input, a double-quoted string yields three StringLiteral tokens: quote, body, quote."""
    text = '''a = "some_text".split()'''

    quote = [NonCodeChar('"')]
    body = [
        SplitContainer(
            [Word.from_("some"), Underscore(), Word.from_("text")])
    ]
    expected_tokens = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        StringLiteral(quote, 1),
        StringLiteral(body, 9),
        StringLiteral([NonCodeChar('"')], 1),
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(),
        ClosingBracket(),
        NewLine(),
    ]

    parsed = list(convert_text(text, 'py'))

    assert expected_tokens == parsed
def test_1():
    """A token found in the BPE merges cache is rendered from the cached entry."""
    settings = {
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    }
    prep_config = PrepConfig(settings)
    tokens = [SplitContainer.from_single_token("Whi@le")]
    bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(prep_config, tokens, bpe_data)

    expected = ["Whi@le" + placeholders['compound_word_end']]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 1],
        token_types=[SplitContainer],
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_merges_no_cache():
    """With an empty merges cache, the single ('W', 'h') merge is applied to the token."""
    settings = {
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    }
    prep_config = PrepConfig(settings)
    tokens = [SplitContainer.from_single_token("Whi@l@@e@")]
    bpe_data = BpeData(
        merges=MergeList().append(Merge(('W', 'h'), 10)),
        merges_cache={},
    )

    actual, actual_metadata = to_repr(prep_config, tokens, bpe_data)

    # Only 'W' + 'h' are merged; every other character stays a separate sub-token.
    expected = ["Wh", "i", '@', "l", '@', '@', "e", '@', pl["compound_word_end"]]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 9],
        token_types=[SplitContainer],
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_ints():
    """Decimal and hexadecimal int literals parse as Number; the identifier is split on '_' and case."""
    text = '''int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf};'''

    identifier = SplitContainer([
        Underscore(),
        Word.from_('my'),
        Underscore(),
        Word.from_('favo'),
        Word.from_('Rite'),
        Underscore(),
        Word.from_('ints'),
        Underscore(),
    ])
    # Initializer values; a leading minus is a separate Operator token.
    values = [
        Number("0x12"), Operator(','),
        Number("0x1fE"), Operator(','),
        Number("441"), Operator(','),
        Operator('-'), Number("81"), Operator(','),
        Operator('-'), Number("0xfFf"),
    ]
    expected_tokens = (
        [KeyWord('int'), Operator('['), Operator(']'), identifier,
         Operator('='), OpeningCurlyBracket()]
        + values
        + [ClosingCurlyBracket(), Semicolon(), NewLine()]
    )

    parsed = list(convert_text(text, 'java'))

    assert expected_tokens == parsed
def test_to_repr_with_enonlycontents1():
    """Every NonEng token is rendered as the non_eng placeholder, in code, strings and comments."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    def non_eng_word(word):
        # One non-English word wrapped the way the parser would emit it.
        return NonEng(SplitContainer([Word.from_(word)]))

    # Words of the string literal; consecutive words are separated by one SpaceInString.
    string_words = [
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    ]
    string_contents = [NonCodeChar('"')]
    for position, word in enumerate(string_words):
        if position > 0:
            string_contents.append(SpaceInString())
        string_contents.append(non_eng_word(word))
    string_contents.append(NonCodeChar('"'))

    tokens = [
        Number("1.1"),
        Operator("*"),
        non_eng_word("dinero"),
        StringLiteral(string_contents, 62),
        NewLine(),
        MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
        MultilineComment([
            non_eng_word('ц'),
            NonEng(
                SplitContainer([
                    Word.from_("blanco"),
                    Underscore(),
                    Word.from_("english")
                ])
            ),
        ]),
        MultilineComment([NonCodeChar('*'), NonCodeChar('/')]),
        NewLine(),
        Tab(),
        OneLineComment([
            NonCodeChar('/'),
            NonCodeChar('/'),
            NonEng(
                SplitContainer([
                    Word.from_("DIESELBE"),
                    Word.from_("8")
                ])
            ),
        ]),
    ]

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end'], "*", pl['non_eng']]
        # String literal: opening quote, twelve non-English words, closing quote.
        + ['"'] + [pl["non_eng"]] * 12 + ['"']
        # Multi-line comment: '/*', two non-English words, '*/'.
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        # One-line comment: '//', one non-English word, end marker.
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    # The set previously listed '*' twice; a set literal ignores the repeat,
    # so the deduplicated form is the same value.
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=[Number, Operator, NonEng]
        + [StringLiteral] * 14
        + [MultilineComment] * 6
        + [OneLineComment] * 4)

    assert expected == actual
    assert expected_metadata == actual_metadata
from codeprep.preprocess.metadata import PreprocessingMetadata from codeprep.tokens.noneng import NonEng from codeprep.tokens.numeric import Number from codeprep.preprocess.placeholders import placeholders from codeprep.tokens.whitespace import Tab, NewLine, SpaceInString from codeprep.tokens.word import Word, Underscore, NonCodeChar, Operator from codeprep.prepconfig import PrepParam, PrepConfig from codeprep.pipeline.to_repr import to_repr pl = placeholders cwe = placeholders['compound_word_end'] tokens = [ Number('1.1'), Operator("*"), NonEng(SplitContainer([Word.from_("übersetzen")])), StringLiteral([ NonCodeChar('"'), NonEng( SplitContainer([ Word.from_("A"), Word.from_("Wirklicä") ]) ), SpaceInString(1), NonCodeChar('"') ], 11), NewLine(), MultilineComment([NonCodeChar('/'), NonCodeChar('*')]), MultilineComment([ NonEng(
def test_one_line_comment():
    """A // comment becomes one OneLineComment holding its words and the closing NewLine."""
    text = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    # Plain words that follow the apostrophe-split "won't".
    trailing_words = [
        'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    ]
    comment_contents = [
        NonCodeChar('/'),
        NonCodeChar('/'),
        SplitContainer.from_single_token('this'),
        SplitContainer.from_single_token('code'),
        SplitContainer.from_single_token('won'),
        NonCodeChar("'"),
        SplitContainer.from_single_token('t'),
    ]
    for word in trailing_words:
        comment_contents.append(SplitContainer.from_single_token(word))
    comment_contents.append(NewLine())

    expected_tokens = [OneLineComment(comment_contents)]

    parsed = list(convert_text(text, 'java'))

    assert expected_tokens == parsed
def test_special_characters():
    """Operators and special characters are emitted one token per character, line by line."""
    # NOTE(review): the line breaks and leading tabs (written as \t escapes) in
    # this literal are reconstructed from the NewLine()/Tab() tokens expected
    # below: one line per operator group, a single tab before '^=', '&=' and
    # '#', and twenty tabs before '>>'. A one-line, space-separated literal
    # cannot produce those tokens — confirm against the intended fixture.
    text = '''abc1
~-0xFFFFFL=
.0E+5
|=
?
==
!=
**
++
--
+=
-=
/=
*=
%=
$
<=
>=
@
\t^=
\t&=
\t#
\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t>>
<<
&&
||
+*!/><\t\n
{}[],.-:();&|\\'~%^'''

    expected_result = [
        SplitContainer([Word.from_('abc'), Word.from_('1')]), NewLine(),
        Operator('~'), Operator('-'), Number("0xFFFFFL"), Operator('='), NewLine(),
        Number(".0E+5"), NewLine(),
        Operator('|'), Operator('='), NewLine(),
        Operator('?'), NewLine(),
        Operator('='), Operator('='), NewLine(),
        Operator('!'), Operator('='), NewLine(),
        Operator('*'), Operator('*'), NewLine(),
        Operator('+'), Operator('+'), NewLine(),
        Operator('-'), Operator('-'), NewLine(),
        Operator('+'), Operator('='), NewLine(),
        Operator('-'), Operator('='), NewLine(),
        Operator('/'), Operator('='), NewLine(),
        Operator('*'), Operator('='), NewLine(),
        Operator('%'), Operator('='), NewLine(),
        NonCodeChar('$'), NewLine(),
        Operator('<'), Operator('='), NewLine(),
        Operator('>'), Operator('='), NewLine(),
        NonCodeChar('@'), NewLine(),
        Tab(), Operator('^'), Operator('='), NewLine(),
        Tab(), Operator('&'), Operator('='), NewLine(),
        Tab(), NonCodeChar('#'), NewLine(),
        # Twenty tabs of indentation before '>>'.
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Operator('>'), Operator('>'), NewLine(),
        Operator('<'), Operator('<'), NewLine(),
        Operator('&'), Operator('&'), NewLine(),
        Operator('|'), Operator('|'), NewLine(),
        # '+*!/><' is followed by an escaped tab, an escaped newline and the real line break.
        Operator('+'), Operator('*'), Operator('!'), Operator('/'),
        Operator('>'), Operator('<'), Tab(), NewLine(), NewLine(),
        OpeningCurlyBracket(), ClosingCurlyBracket(),
        Operator('['), Operator(']'),
        Operator(','), Operator('.'), Operator('-'), Operator(':'),
        OpeningBracket(), ClosingBracket(), Semicolon(),
        Operator('&'), Operator('|'),
        NonCodeChar('\\'), NonCodeChar("'"),
        Operator('~'), Operator('%'), Operator('^'),
        NewLine(),
    ]

    actual = [t for t in convert_text(text, 'java')]

    assert expected_result == actual