def test_7(self):
    """Multiline comment, a stray '/', then an underscore-prefixed name."""
    # NOTE(review): the expected tokens contain NewLine() entries while this
    # literal shows only spaces -- confirm its whitespace is intact.
    text = ''' /*multi-line MyComment_ *// _operations '''

    comment_body = [
        SplitContainer.from_single_token('multi'),
        '-',
        SplitContainer.from_single_token('line'),
        SplitContainer([Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
    ]
    identifier = SplitContainer([Underscore(), Word.from_('operations')])

    expected_result = [
        NewLine(),
        MultilineComment(comment_body),
        '/',
        NewLine(),
        identifier,
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_mark_all_eng(self):
    """All words are English: the result of ``mark`` must equal the input."""
    quoted_comment_start = StringLiteral([
        OneLineCommentStart(),
        SplitContainer([
            Word.from_("test"),
            Word.from_("my"),
            Word.from_("class"),
        ]),
    ])
    one_line_comment = OneLineComment([
        MultilineCommentEnd(),
        SplitContainer.from_single_token("lifeisgood"),
    ])
    quoted_multiline_start = StringLiteral([
        MultilineCommentStart(),
        SplitContainer.from_single_token("!"),
    ])

    tokens = [
        quoted_comment_start,
        NewLine(),
        one_line_comment,
        NewLine(),
        quoted_multiline_start,
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]

    actual = mark(tokens, {})

    self.assertEqual(actual, tokens)
def test_log_no_mark_logs(self):
    """With MARK_LOGS set to 0 a LogStatement maps to the flat tokens below."""
    params = {
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 0,
        PrepParam.CAPS: 1,
    }
    prep_config = PrepConfig(params)
    split_config = NgramSplitConfig()

    log_statement = LogStatement(
        SplitContainer.from_single_token('LOGGER'),
        SplitContainer.from_single_token('Info'),
        INFO,
        [StringLiteral([SplitContainer.from_single_token("Hi")])],
    )

    actual = to_repr(prep_config, [log_statement], split_config)

    expected = [
        pl['capitals'], 'logger',
        '.',
        pl['capital'], 'info',
        '(',
        '"', pl['capital'], 'hi', '"',
        ')',
        ';',
    ]
    self.assertEqual(expected, actual)
def test_class_class(self):
    """Run ``loggable.mark`` on 'class A class' (a second 'class' keyword)."""
    tokens = [
        SplitContainer.from_single_token(name)
        for name in ('class', 'A', 'class')
    ]

    # NOTE(review): nothing is asserted about the result, so this only checks
    # that the call does not raise -- confirm whether an assertion is missing.
    loggable.mark(tokens, None)
def test_no_dot(self):
    """No '.' between the two identifiers: the tokens must come back as-is."""
    string_argument = StringLiteral([SplitContainer.from_single_token("Hi")])
    trailing_number = Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])

    tokens = [
        NewLine(),
        SplitContainer.from_single_token('log'),
        SplitContainer.from_single_token('infooooo'),
        '(',
        string_argument,
        ')',
        ';',
        trailing_number,
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_2(self):
    """Array of decimal and hex integers assigned to a mixed-case identifier."""
    # NOTE(review): the expected tokens contain NewLine() entries while this
    # literal shows only spaces -- confirm its whitespace is intact.
    text = ''' int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf}; '''

    identifier = SplitContainer([
        Underscore(),
        Word.from_('my'),
        Underscore(),
        NonEng(Word.from_('favo')),
        Word.from_('Rite'),
        Underscore(),
        Word.from_('ints'),
        Underscore(),
    ])
    numbers = [
        Number([HexStart(), '1', '2']),
        Number([HexStart(), '1', 'f', 'E']),
        Number(['4', '4', '1']),
        Number(['-', '8', '1']),
        Number(['-', HexStart(), 'f', 'F', 'f']),
    ]

    # Interleave the numbers with the ',' separators of the initializer.
    initializer = []
    for number in numbers:
        if initializer:
            initializer.append(',')
        initializer.append(number)

    expected_result = (
        [NewLine(), SplitContainer.from_single_token('int'), '[', ']']
        + [identifier, '=', '{']
        + initializer
        + ['}', ';', NewLine(), NewLine()]
    )

    self.__test_apply_preprocessors(text, expected_result)
def test_merges_no_cache(self):
    """BPE split of 'While' with one merge ('w', 'h') and an empty cache."""
    params = {
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    }
    prep_config = PrepConfig(params)
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges={('w', 'h'): 0},
        merges_cache={},
    )
    word = SplitContainer.from_single_token("While")

    actual = to_repr(prep_config, [word], split_config)

    subwords = ["wh", "i", "l", "e"]
    expected = (
        [pl['word_start'], pl['capital']] + subwords + [pl["word_end"]]
    )
    self.assertEqual(expected, actual)
def test_content_length_over_limit(self):
    """A ``log.info(...)`` call with a very long argument stays unmarked.

    The argument is ``1 * 3`` wrapped in 32 levels of parentheses.  The
    test asserts that ``logs.mark`` returns the token list unchanged;
    presumably this is because the content exceeds the length limit that
    ``logs.mark`` enforces (the limit itself is not visible here).

    The previous token list was missing three commas, so adjacent string
    literals were silently concatenated into ``'))'`` and ``');'`` tokens.
    The parentheses then never balanced and no standalone ``';'`` token
    existed, so the input was not a well-formed log call at all.  The
    list is now built from an explicit nesting depth.
    """
    nesting_depth = 32
    opening = ['('] * nesting_depth
    closing = [')'] * nesting_depth

    tokens = (
        [
            NewLine(),
            SplitContainer.from_single_token('log'),
            '.',
            SplitContainer.from_single_token('info'),
        ]
        + opening
        + ['1', '*', '3']
        + closing
        + [
            ';',
            Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ]
    )

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_capitals(self):
    """Camel-case, capitalized, all-caps and upper snake-case identifiers."""
    # NOTE(review): the expected tokens contain NewLine() entries while this
    # literal shows only spaces -- confirm its whitespace is intact.
    text = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES '''

    camel_case = SplitContainer([Word.from_("My"), Word.from_("Class")])
    capitalized = SplitContainer.from_single_token("Class")
    all_caps = SplitContainer.from_single_token("CONSTANT")
    upper_snake_case = SplitContainer([
        Word.from_("VAR"),
        Underscore(),
        Word.from_("WITH"),
        Underscore(),
        Word.from_("UNDERSCORES"),
    ])

    expected_result = [
        NewLine(),
        camel_case,
        capitalized,
        all_caps,
        upper_snake_case,
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_no_logs(self):
    """A token stream without any log call must come back unchanged."""
    variable_name = SplitContainer([
        Word.from_('lovely'),
        Underscore(),
        Word.from_('longs'),
    ])
    hex_literal = Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])

    tokens = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        variable_name,
        '=',
        '{',
        hex_literal,
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_3(self):
    """Float literals with exponents, suffixes and leading/trailing dots."""
    # NOTE(review): the expected tokens contain NewLine() entries while this
    # literal shows only spaces -- confirm its whitespace is intact.
    text = ''' float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8}; '''

    numbers = [
        Number(['-', '0', DecimalPoint(), '4', '3', E(), '4', F()]),
        Number([DecimalPoint(), '5', '8', F()]),
        Number(['0', DecimalPoint(), D()]),
        Number(['-', '9', DecimalPoint(), '6', '3', E(), '+', '2', D()]),
        Number(['0', DecimalPoint(), E(), '-', '8']),
    ]

    # Interleave the numbers with the ',' separators of the initializer.
    initializer = []
    for number in numbers:
        if initializer:
            initializer.append(',')
        initializer.append(number)

    declaration = [
        NewLine(),
        SplitContainer.from_single_token('float'),
        '[',
        ']',
        SplitContainer.from_single_token('floats'),
        '=',
        '{',
    ]
    expected_result = (
        declaration + initializer + ['}', ';', NewLine(), NewLine()]
    )

    self.__test_apply_preprocessors(text, expected_result)
def test_mark_with_noneng(self):
    """Words expected to be wrapped in NonEng inside literals and comments."""
    tokens = [
        StringLiteral([
            SplitContainer([Word.from_("A"), Word.from_("Wirklich")]),
        ]),
        MultilineComment([
            SplitContainer.from_single_token('ц'),
            SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        OneLineComment([
            SplitContainer([Word.from_("DIESELBE"), Word.from_("8")]),
        ]),
    ]

    actual = mark(tokens, {})

    # The SplitContainer constructor is called directly below because
    # (per the original author's note) a split container cannot set the
    # wordStart prefix when its first subword is wrapped in NonEng.
    expected_string_literal = StringLiteral([
        SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))]),
    ])
    expected_multiline_comment = MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english"),
        ]),
    ])
    expected_one_line_comment = OneLineComment([
        SplitContainer([
            NonEng(Word.from_("DIESELBE")),
            Word.from_("8"),
        ]),
    ])
    expected = [
        expected_string_literal,
        expected_multiline_comment,
        expected_one_line_comment,
    ]

    self.assertEqual(expected, actual)
def test_4(self):
    """Camel-case names, a letter-digit-letter name and an escaped quote."""
    # NOTE(review): the expected tokens contain NewLine() entries while this
    # literal shows only spaces -- confirm its whitespace is intact.
    text = ''' BigAWESOMEString[] a2y = "abc".doSplit("\\""); '''

    def words(*parts):
        # Build a SplitContainer holding one Word per given part.
        return SplitContainer([Word.from_(part) for part in parts])

    expected_result = [
        NewLine(),
        words('Big', 'AWESOME', 'String'),
        '[',
        ']',
        words('a', '2', 'y'),
        '=',
        StringLiteral([SplitContainer.from_single_token('abc')]),
        '.',
        words('do', 'Split'),
        '(',
        StringLiteral([Backslash(), Quote()]),
        ')',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_simple_log(self):
    """``log.info("Hi");`` is expected to collapse into one INFO LogStatement."""
    def greeting():
        # Fresh string-literal argument for each use.
        return StringLiteral([SplitContainer.from_single_token("Hi")])

    def trailing_number():
        # Fresh hex number that follows the log statement.
        return Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])

    tokens = [
        NewLine(),
        SplitContainer.from_single_token('log'),
        '.',
        SplitContainer.from_single_token('info'),
        '(',
        greeting(),
        ')',
        ';',
        trailing_number(),
    ]

    actual = logs.mark(tokens, None)

    log_statement = LogStatement(
        SplitContainer.from_single_token('log'),
        SplitContainer.from_single_token('info'),
        INFO,
        [greeting()],
    )
    expected = [NewLine(), log_statement, trailing_number()]
    self.assertEqual(expected, actual)
def test_1(self):
    """BPE split of 'While' when the merges cache already holds 'while'."""
    params = {
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    }
    prep_config = PrepConfig(params)
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges_cache={'while': ['while']},
    )
    word = SplitContainer.from_single_token("While")

    actual = to_repr(prep_config, [word], split_config)

    expected = [pl['capital'], "while"]
    self.assertEqual(expected, actual)
def test_1(self):
    """Array of hex long literals with 'L'/'l' suffixes."""
    # NOTE(review): the expected tokens contain NewLine() and Tab() entries
    # while this literal shows only spaces -- confirm its whitespace is intact.
    text = ''' long[] lovely_longs = {0x34a35EL, 0x88bc96fl , -0x34L}; '''

    variable_name = SplitContainer([
        Word.from_('lovely'),
        Underscore(),
        Word.from_('longs'),
    ])
    first_number = Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])
    second_number = Number(
        [HexStart(), '8', '8', 'b', 'c', '9', '6', 'f', L()])
    third_number = Number(['-', HexStart(), '3', '4', L()])

    expected_result = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        variable_name,
        '=',
        '{',
        first_number,
        ',',
        Tab(),
        second_number,
        Tab(),
        Tab(),
        ',',
        third_number,
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_tabs_and_newlines_before_semicolon(self):
    """Whitespace between ')' and ';' is expected inside the LogStatement."""
    def greeting():
        # Fresh string-literal argument for each use.
        return StringLiteral([SplitContainer.from_single_token("Hi")])

    def trailing_number():
        # Fresh hex number that follows the log statement.
        return Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])

    def whitespace():
        # Fresh newline/tab tokens sitting between ')' and ';'.
        return [NewLine(), NewLine(), Tab(), Tab()]

    tokens = (
        [
            NewLine(),
            SplitContainer.from_single_token('log'),
            '.',
            SplitContainer.from_single_token('d'),
            '(',
            greeting(),
            ')',
        ]
        + whitespace()
        + [';', trailing_number()]
    )

    actual = logs.mark(tokens, None)

    log_statement = LogStatement(
        SplitContainer.from_single_token('log'),
        SplitContainer.from_single_token('d'),
        DEBUG,
        [greeting()],
        whitespace(),
    )
    expected = [NewLine(), log_statement, trailing_number()]
    self.assertEqual(expected, actual)
def test_class_closing_bracket(self):
    """Run ``loggable.mark`` on 'class' immediately followed by '}'."""
    tokens = [SplitContainer.from_single_token('class'), '}']

    # NOTE(review): nothing is asserted about the result, so this only checks
    # that the call does not raise -- confirm whether an assertion is missing.
    loggable.mark(tokens, None)
def test_process_comments_and_str_literals(self):
    """Positive scenario for grouping comment and string-literal tokens.

    The input token stream, line by line:
      1. a quoted string holding a one-line-comment start and an identifier
      2. a one-line-comment start followed by a multiline-comment end
      3. a quoted string holding a multiline-comment start and '!'
      4. a multiline-comment start, a newline and a multiline-comment end
    """
    def identifier():
        # Fresh container for the identifier inside the first string.
        return SplitContainer([
            Word.from_("test"),
            Underscore(),
            Word.from_("my"),
            Word.from_("Class"),
        ])

    first_line = [Quote(), OneLineCommentStart(), identifier(), Quote()]
    second_line = [OneLineCommentStart(), MultilineCommentEnd()]
    third_line = [
        Quote(),
        MultilineCommentStart(),
        SplitContainer.from_single_token("!"),
        Quote(),
    ]
    multiline_comment = [
        MultilineCommentStart(),
        NewLine(),
        MultilineCommentEnd(),
    ]
    tokens = (
        first_line + [NewLine()]
        + second_line + [NewLine()]
        + third_line + [NewLine()]
        + multiline_comment + [NewLine()]
    )

    actual = process_comments_and_str_literals(tokens, {})

    expected = [
        StringLiteral([OneLineCommentStart(), identifier()]),
        NewLine(),
        OneLineComment([MultilineCommentEnd()]),
        NewLine(),
        StringLiteral([
            MultilineCommentStart(),
            SplitContainer.from_single_token("!"),
        ]),
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]
    self.assertEqual(expected, actual)
def test_2_logs(self):
    """Two consecutive log calls are each expected to become a LogStatement."""
    def greeting():
        # Fresh string-literal argument for each use.
        return StringLiteral([SplitContainer.from_single_token("Hi")])

    def call_tokens(receiver, method):
        # Raw tokens of `receiver.method("Hi");`.
        return [
            SplitContainer.from_single_token(receiver),
            '.',
            SplitContainer.from_single_token(method),
            '(',
            greeting(),
            ')',
            ';',
        ]

    tokens = (
        [NewLine()] + call_tokens('log', 't')
        + [NewLine()] + call_tokens('Logger', 'SEVERE')
    )

    actual = logs.mark(tokens, None)

    trace_statement = LogStatement(
        SplitContainer.from_single_token('log'),
        SplitContainer.from_single_token('t'),
        TRACE,
        [greeting()],
    )
    fatal_statement = LogStatement(
        SplitContainer.from_single_token('Logger'),
        SplitContainer.from_single_token('SEVERE'),
        FATAL,
        [greeting()],
    )
    expected = [NewLine(), trace_statement, NewLine(), fatal_statement]
    self.assertEqual(expected, actual)
def test_5(self):
    """One-line comment of plain words with an apostrophe in the middle."""
    # NOTE(review): the expected tokens contain NewLine() entries while this
    # literal shows only spaces -- confirm its whitespace is intact.
    text = ''' // this code won't compile but the preprocessing still has to be done corrrectly '''

    words_before_apostrophe = ('this', 'code', 'won')
    words_after_apostrophe = (
        't', 'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    )

    comment_body = [
        SplitContainer.from_single_token(word)
        for word in words_before_apostrophe
    ]
    comment_body.append("'")
    comment_body.extend(
        SplitContainer.from_single_token(word)
        for word in words_after_apostrophe
    )

    expected_result = [
        NewLine(),
        OneLineComment(comment_body),
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_nested_data_class(self):
    """Method, constructor and static-initializer bodies become LoggableBlocks.

    The input and the expected output are the same token stream, except
    that in the expected output the bodies of ``print1``, the ``B``
    constructor, the static initializer and ``print`` are each wrapped
    in a LoggableBlock.  Both lists come from one builder, parameterised
    by how a body is embedded.
    """
    tok = SplitContainer.from_single_token

    def if_body():
        # `{ if (True) { } }`
        return ['{', tok('if'), '(', tok('True'), ')', '{', '}', '}']

    def static_body():
        # `{ c = "class".class }`
        return [
            '{',
            tok('c'),
            '=',
            StringLiteral([tok('class')]),
            '.',
            tok('class'),
            '}',
        ]

    def build(embed):
        # `embed` turns a body's token list into the tokens placed in the stream.
        return (
            [
                '{', '}',
                MultilineComment([tok("class")]),
                tok('import'), tok("a"), NewLine(),
                tok('class'), tok('A'), '{',
                tok('void'), tok('print1'), '(', ')',
            ]
            + embed(if_body())
            + [
                tok('static'), tok('private'), tok('class'), tok('B'),
                tok('extends'), tok('D'), '{',
                tok('private'), tok('String'), tok('b'), ';',
                tok('B'), '(', ')',
            ]
            + embed(['{', '}'])
            + [tok('static')]
            + embed(static_body())
            + [
                '}',
                tok('void'), tok('print'), '(', ')',
            ]
            + embed(if_body())
            + [tok('int'), tok('a'), ';', '}']
        )

    tokens = build(lambda body: body)

    actual = loggable.mark(tokens, None)

    expected = build(lambda body: [LoggableBlock(body)])
    self.assertEqual(expected, actual)
from logrec.dataprep.model.noneng import NonEng from logrec.dataprep.preprocessors.preprocessor_list import pp_params from logrec.dataprep.preprocessors import apply_preprocessors from logrec.dataprep.preprocessors.general import from_string from logrec.dataprep.model.containers import SplitContainer, StringLiteral from logrec.dataprep.model.logging import LogStatement, INFO from logrec.dataprep.model.numeric import Number, DecimalPoint, E from logrec.dataprep.model.placeholders import placeholders from logrec.dataprep.model.word import Word, Underscore from logrec.dataprep.prepconfig import PrepConfig from logrec.dataprep.split.ngram import NgramSplitConfig, NgramSplittingType from logrec.dataprep.to_repr import to_repr test_cases = { "create": ( [SplitContainer.from_single_token("create")], ["create"], ), "Vector": ( [SplitContainer.from_single_token("Vector")], [placeholders["capital"], "vector"], ), "players": ( [SplitContainer.from_single_token("players")], [placeholders["word_start"], 'play', 'er', 's', placeholders["word_end"]] ), "0.345e+4": ( [Number(["0", DecimalPoint(), "3", "4", "5", E(), "+", "4"])], [placeholders["word_start"], "0.", "3", "4", "5", "e+", "4", placeholders["word_end"]] ), "bestPlayers": (