def test_multi_line_comment():
    """Check tokenization of a Java ``/* ... */`` comment that spans two lines.

    The comment, including the line break inside it, is expected to come back
    as a single ``MultilineComment``; the extra ``/`` right after the closing
    ``*/`` is expected as a separate ``Operator``.
    """
    # The expected tokens contain a NewLine inside the comment and another one
    # after the stray '/', so the input must contain real line breaks there.
    # NOTE(review): the leading line break is assumed to yield no token (the
    # expected list has none for it) - confirm against convert_text.
    text = '''
/*multi-line MyComment_
*//
_operations
'''

    comment = MultilineComment([
        NonCodeChar('/'),
        NonCodeChar('*'),
        SplitContainer.from_single_token('multi'),
        NonCodeChar('-'),
        SplitContainer.from_single_token('line'),
        SplitContainer([Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
        NonCodeChar('*'),
        NonCodeChar('/'),
    ])
    expected_result = [
        comment,
        Operator('/'),
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected_result == actual
def test_floats():
    """Check that Java float literals are each parsed as one ``Number`` token.

    Covers exponents, type suffixes (f/F/d/D), a literal without an integer
    part and literals without a fractional part. A leading minus sign is
    expected as a separate ``Operator``.
    """
    text = '''float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8};'''

    declaration = [
        KeyWord('float'),
        Operator('['),
        Operator(']'),
        SplitContainer.from_single_token('floats'),
        Operator('='),
    ]
    initializer = [
        OpeningCurlyBracket(),
        Operator('-'), Number("0.43E4f"), Operator(','),
        Number(".58F"), Operator(','),
        Number("0.d"), Operator(','),
        Operator('-'), Number('9.63e+2D'), Operator(','),
        Number('0.E-8'),
        ClosingCurlyBracket(),
    ]
    expected = declaration + initializer + [Semicolon(), NewLine()]

    tokens = list(convert_text(text, 'java'))

    assert expected == tokens
def preprocessed_repr(
        self, repr_config: ReprConfig
) -> Tuple[List[str], PreprocessingMetadata]:
    """Build the preprocessed representation of this token.

    Without BPE data in ``repr_config`` the token is represented by the
    ``non_eng`` placeholder, wrapped by ``wrap_in_metadata_for_full_word``.

    With BPE data, the string form of ``self.processable_token`` is passed
    through ``replace_non_ascii_seqs`` (using the ``non_ascii_seq``
    placeholder), wrapped into a single-token ``SplitContainer`` and handed
    to ``torepr`` together with ``repr_config``.

    :param repr_config: representation settings; only ``bpe_data`` is read
        directly here.
    :return: whatever the chosen helper returns; annotated as a pair of
        sub-token strings and their preprocessing metadata.
    """
    if not repr_config.bpe_data:
        return self.wrap_in_metadata_for_full_word([placeholders['non_eng']])

    replaced = replace_non_ascii_seqs(
        str(self.processable_token), placeholders['non_ascii_seq'])
    container = SplitContainer.from_single_token(replaced)
    return torepr(container, repr_config)
def test_string_with_spaces():
    """Check that runs of spaces inside a string literal are counted.

    Each run of spaces between words in the literal is expected as one
    ``SpaceInString`` carrying the length of the run, and the
    ``StringLiteral`` is expected to carry the total literal length.
    """
    # The runs of 3, 5 and 4 spaces must match the SpaceInString counts below;
    # together with the quotes and words the literal is 26 characters long
    # (1 + 2 + 3 + 4 + 5 + 5 + 4 + 1 + 1).
    text = '''"hi   dear     world    !"'''

    expected = [
        StringLiteral([
            NonCodeChar('"'),
            SplitContainer.from_single_token('hi'),
            SpaceInString(3),
            SplitContainer.from_single_token('dear'),
            SpaceInString(5),
            SplitContainer.from_single_token('world'),
            SpaceInString(4),
            NonCodeChar('!'),
            NonCodeChar('"'),
        ], 26),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected == actual
def test_spaces_in_strings():
    """Check string literals with inner spaces and an escaped quote in Java.

    The first literal contains a run of four spaces, expected as a single
    ``SpaceInString(n_chars=4)``. The second literal is an escaped double
    quote, expected as four ``NonCodeChar`` tokens (quote, backslash, quote,
    quote).
    """
    # "a    bc" has to contain four spaces: that is what SpaceInString asserts
    # and what makes the literal 9 characters long (1 + 1 + 4 + 2 + 1).
    # '\\"' in this non-raw string is a backslash followed by a quote.
    text = '''BigAWESOMEString[] a2y = "a    bc".doSplit("\\"");'''

    spaced_literal = StringLiteral([
        NonCodeChar('"'),
        SplitContainer.from_single_token('a'),
        SpaceInString(n_chars=4),
        SplitContainer.from_single_token('bc'),
        NonCodeChar('"'),
    ], 9)
    escaped_quote_literal = StringLiteral([
        NonCodeChar('"'),
        NonCodeChar('\\'),
        NonCodeChar('"'),
        NonCodeChar('"'),
    ], 4)
    expected_result = [
        SplitContainer(
            [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')],
        ),
        Operator('['),
        Operator(']'),
        SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]),
        Operator('='),
        spaced_literal,
        Operator('.'),
        SplitContainer([Word.from_('do'), Word.from_('Split')]),
        OpeningBracket(),
        escaped_quote_literal,
        ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]

    actual = list(convert_text(text, 'java'))

    assert expected_result == actual
def test_capitals():
    """Check how identifiers with different capitalization are split.

    A camel-case name is expected as two words, capitalized and all-caps
    names as single tokens, and an upper-case name with underscores as words
    separated by ``Underscore`` tokens.
    """
    # NOTE(review): the whitespace surrounding the identifiers may originally
    # have been line breaks - confirm against the upstream test data.
    text = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES '''

    camel_case = SplitContainer([Word.from_("My"), Word.from_("Class")])
    capitalized = SplitContainer.from_single_token("Class")
    all_caps = SplitContainer.from_single_token("CONSTANT")
    with_underscores = SplitContainer([
        Word.from_("VAR"),
        Underscore(),
        Word.from_("WITH"),
        Underscore(),
        Word.from_("UNDERSCORES"),
    ])
    expected = [camel_case, capitalized, all_caps, with_underscores, NewLine()]

    tokens = list(convert_text(text, 'java'))

    assert expected == tokens
def test_string_literal_double():
    """Check tokenization of a double-quoted string literal in Python code.

    For the ``'py'`` extension the opening quote, the string body and the
    closing quote are each expected as a separate ``StringLiteral``.
    """
    text = '''a = "some_text".split()'''

    opening_quote = StringLiteral([NonCodeChar('"')], 1)
    body = StringLiteral([
        SplitContainer([Word.from_("some"), Underscore(), Word.from_("text")]),
    ], 9)
    closing_quote = StringLiteral([NonCodeChar('"')], 1)
    expected = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        opening_quote,
        body,
        closing_quote,
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(),
        ClosingBracket(),
        NewLine(),
    ]

    tokens = list(convert_text(text, 'py'))

    assert expected == tokens
def test_1():
    """Check ``to_repr`` for a token whose entry is present in the merges cache.

    The single token ``Whi@le`` with a cache entry for ``Whi@@le@`` is
    expected to come back as one string: the token followed by the
    ``compound_word_end`` placeholder, with word boundaries ``[0, 1]``.
    """
    params = {
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    }
    prep_config = PrepConfig(params)
    tokens = [SplitContainer.from_single_token("Whi@le")]
    bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(prep_config, tokens, bpe_data)

    expected = ["Whi@le" + placeholders['compound_word_end']]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 1],
        token_types=[SplitContainer],
    )
    assert expected == actual
    assert expected_metadata == actual_metadata
def test_merges_no_cache():
    """Check ``to_repr`` when the merges cache is empty and one merge is given.

    With only the ``('W', 'h')`` merge available, the token ``Whi@l@@e@`` is
    expected as ``Wh`` followed by the remaining characters one by one and
    the ``compound_word_end`` placeholder: nine sub-tokens forming one word.
    """
    params = {
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    }
    prep_config = PrepConfig(params)
    tokens = [SplitContainer.from_single_token("Whi@l@@e@")]
    merges = MergeList().append(Merge(('W', 'h'), 10))
    bpe_data = BpeData(merges=merges, merges_cache={})

    actual, actual_metadata = to_repr(prep_config, tokens, bpe_data)

    expected = [
        "Wh", "i", '@', "l", '@', '@', "e", '@',
        pl["compound_word_end"],
    ]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 9],
        token_types=[SplitContainer],
    )
    assert expected == actual
    assert expected_metadata == actual_metadata
def test_one_line_comment():
    """Check tokenization of a Java ``//`` comment.

    The whole line, including the trailing ``NewLine``, is expected inside a
    single ``OneLineComment``; the apostrophe in ``won't`` is expected as a
    ``NonCodeChar`` between the ``won`` and ``t`` word tokens.
    """
    # The misspelled 'corrrectly' appears in both the input and the expected
    # tokens, so it is kept as is.
    text = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    words_before_apostrophe = ['this', 'code', 'won']
    words_after_apostrophe = [
        't', 'compile', 'but', 'the', 'preprocessing', 'still', 'has', 'to',
        'be', 'done', 'corrrectly',
    ]

    comment_tokens = [NonCodeChar('/'), NonCodeChar('/')]
    comment_tokens += [
        SplitContainer.from_single_token(word)
        for word in words_before_apostrophe
    ]
    comment_tokens.append(NonCodeChar("'"))
    comment_tokens += [
        SplitContainer.from_single_token(word)
        for word in words_after_apostrophe
    ]
    comment_tokens.append(NewLine())
    expected = [OneLineComment(comment_tokens)]

    tokens = list(convert_text(text, 'java'))

    assert expected == tokens