def test_mark_all_eng(self):
    """All words are English, so mark() must return the tokens unchanged."""
    english_words = SplitContainer(
        [Word.from_(w) for w in ("test", "my", "class")])
    source_tokens = [
        StringLiteral([OneLineCommentStart(), english_words]),
        NewLine(),
        OneLineComment([
            MultilineCommentEnd(),
            SplitContainer.from_single_token("lifeisgood"),
        ]),
        NewLine(),
        StringLiteral([
            MultilineCommentStart(),
            SplitContainer.from_single_token("!"),
        ]),
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]

    marked = mark(source_tokens, {})

    self.assertEqual(marked, source_tokens)
def test_to_repr_with_enonlycontents(self):
    """Check to_repr() output for a token stream full of NonEng words.

    Uses EN_ONLY=2 and compares the result against a hand-written
    list of placeholders and literal sub-tokens.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    ngramSplittingConfig = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={})

    german_words = (
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    )
    tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(Word.from_("dinero"))]),
        StringLiteral([NonEng(Word.from_(w)) for w in german_words]),
        NewLine(),
        MultilineComment([
            SplitContainer([NonEng(Word.from_('ц'))]),
            SplitContainer([
                NonEng(Word.from_("blanco")),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        NewLine(),
        Tab(),
        OneLineComment([
            SplitContainer(
                [NonEng(Word.from_("DIESELBE")), Word.from_("8")]),
        ]),
    ]

    actual = to_repr(prep_config, tokens, ngramSplittingConfig)

    # Expected output, grouped by the input token that produces it.
    number_repr = [pl['word_start'], '1', '.', '1', pl['word_end']]
    string_literal_repr = ['"', pl["non_eng_content"], '"']
    multiline_comment_repr = [
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
    ]
    one_line_comment_repr = [
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], "8",
        pl['word_end'],
        pl['olc_end'],
    ]
    expected = (
        number_repr
        + ["*", pl['non_eng']]
        + string_literal_repr
        + multiline_comment_repr
        + one_line_comment_repr
    )

    self.assertEqual(expected, actual)
def test_log_no_mark_logs(self):
    """With MARK_LOGS=0 a LogStatement is rendered as a plain call."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 0,
        PrepParam.CAPS: 1,
    })
    ngramSplittingConfig = NgramSplitConfig()

    single = SplitContainer.from_single_token
    log_statement = LogStatement(
        single('LOGGER'),
        single('Info'),
        INFO,
        [StringLiteral([single("Hi")])])

    actual = to_repr(prep_config, [log_statement], ngramSplittingConfig)

    expected = [
        pl['capitals'], 'logger',
        '.',
        pl['capital'], 'info',
        '(',
        '"', pl['capital'], 'hi', '"',
        ')',
        ';',
    ]
    self.assertEqual(expected, actual)
def test_7(self):
    """A multi-line comment followed by a stray '/' and an identifier."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine() tokens expected below suggest the
    # fixture originally spans several lines -- confirm.
    text = ''' /*multi-line MyComment_ *// _operations '''

    single = SplitContainer.from_single_token
    comment = MultilineComment([
        single('multi'),
        '-',
        single('line'),
        SplitContainer(
            [Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
    ])
    expected_result = [
        NewLine(),
        comment,
        '/',
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_2(self):
    """Integer literals (decimal, hex, negative) in an array initializer."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine() tokens expected below suggest the
    # fixture originally spans several lines -- confirm.
    text = ''' int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf}; '''

    identifier = SplitContainer([
        Underscore(),
        Word.from_('my'),
        Underscore(),
        NonEng(Word.from_('favo')),
        Word.from_('Rite'),
        Underscore(),
        Word.from_('ints'),
        Underscore(),
    ])
    expected_result = [
        NewLine(),
        SplitContainer.from_single_token('int'), '[', ']',
        identifier,
        '=',
        '{',
        Number([HexStart(), '1', '2']),
        ',',
        Number([HexStart(), '1', 'f', 'E']),
        ',',
        Number(['4', '4', '1']),
        ',',
        Number(['-', '8', '1']),
        ',',
        Number(['-', HexStart(), 'f', 'F', 'f']),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_class_class(self):
    """Run loggable.mark() on the token sequence `class A class`."""
    single = SplitContainer.from_single_token
    tokens = [single('class'), single('A'), single('class')]

    # NOTE(review): nothing is asserted here, so this only checks that
    # mark() does not raise on this input -- confirm whether an expected
    # result should be compared as well.
    actual = loggable.mark(tokens, None)
def test_no_dot(self):
    """`log infooooo("Hi");` has no '.' and must be returned unchanged."""
    single = SplitContainer.from_single_token
    tokens = [
        NewLine(),
        single('log'),
        single('infooooo'),
        '(',
        StringLiteral([single("Hi")]),
        ')',
        ';',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    marked = logs.mark(tokens, None)

    self.assertEqual(tokens, marked)
def test_no_logs(self):
    """Tokens without any log call must come back from mark() unchanged."""
    variable_name = SplitContainer(
        [Word.from_('lovely'), Underscore(), Word.from_('longs')])
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        variable_name,
        '=',
        '{',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    marked = logs.mark(tokens, None)

    self.assertEqual(tokens, marked)
def test_merges_no_cache(self):
    """BPE splitting with one merge ('w', 'h') and an empty merges cache."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    ngramSplittingConfig = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges={('w', 'h'): 0},
        merges_cache={})
    tokens = [SplitContainer.from_single_token("While")]

    actual = to_repr(prep_config, tokens, ngramSplittingConfig)

    # Only 'w' + 'h' is merged; the remaining letters stay separate.
    subwords = ["wh", "i", "l", "e"]
    expected = (
        [pl['word_start'], pl['capital']] + subwords + [pl["word_end"]]
    )
    self.assertEqual(expected, actual)
def is_class_like_declaration(token, new_tokens):
    """Tell whether `token` is a `class`/`enum`/`interface` keyword.

    The keyword does not count as a declaration when the previously
    collected token is a '.', or is a LoggableBlock whose last sub-token
    is a '.' (e.g. the `.class` in `Foo.class`).

    :param token: the token being examined
    :param new_tokens: tokens collected so far; may be empty
    :return: True if `token` looks like a declaration keyword
    """
    declaration_keywords = [
        SplitContainer([Word.from_(keyword)])
        for keyword in ('class', 'enum', 'interface')
    ]
    if token not in declaration_keywords:
        return False

    # Nothing precedes the keyword, so it cannot be a member access.
    if not new_tokens:
        return True

    previous = new_tokens[-1]
    if previous == '.':
        return False

    block_ends_with_dot = (
        isinstance(previous, LoggableBlock)
        and previous.get_subtokens()[-1] == '.'
    )
    return not block_ends_with_dot
def test_content_length_over_limit(self):
    """A log call with a very long argument list must be left unmarked.

    The argument is `1*3` wrapped in deeply nested parentheses; per the
    test name this is meant to exceed the content length limit, so
    mark() is expected to return the tokens unchanged.
    """
    # Every parenthesis has to be its own token. Adjacent string literals
    # without a comma between them are silently concatenated by Python
    # (')' ')' becomes '))', ')' ';' becomes ');'), which would leave the
    # statement without a ';' token and with unbalanced parentheses.
    paren_count = 32
    tokens = (
        [
            NewLine(),
            SplitContainer.from_single_token('log'),
            '.',
            SplitContainer.from_single_token('info'),
        ]
        + ['('] * paren_count
        + ['1', '*', '3']
        + [')'] * paren_count
        + [
            ';',
            Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ]
    )

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_mark_with_noneng(self):
    """Non-English words inside literals and comments get wrapped in NonEng."""
    tokens = [
        StringLiteral(
            [SplitContainer([Word.from_("A"), Word.from_("Wirklich")])]),
        MultilineComment([
            SplitContainer.from_single_token('ц'),
            SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        OneLineComment(
            [SplitContainer([Word.from_("DIESELBE"), Word.from_("8")])]),
    ]

    actual = mark(tokens, {})

    # The SplitContainer constructor is called directly in the expected
    # values below: per the original author's note, the container cannot
    # set the word-start prefix when its first sub-word is wrapped in
    # NonEng.
    expected_string_literal = StringLiteral([
        SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))]),
    ])
    expected_multiline_comment = MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english"),
        ]),
    ])
    expected_one_line_comment = OneLineComment([
        SplitContainer([
            NonEng(Word.from_("DIESELBE")),
            Word.from_("8"),
        ]),
    ])
    expected = [
        expected_string_literal,
        expected_multiline_comment,
        expected_one_line_comment,
    ]

    self.assertEqual(expected, actual)
def test_capitals(self):
    """Camel-case, capitalized, all-caps and underscore-separated names."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine() tokens expected below suggest the
    # fixture originally spans several lines -- confirm.
    text = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES '''

    single = SplitContainer.from_single_token
    constant_with_underscores = SplitContainer([
        Word.from_("VAR"),
        Underscore(),
        Word.from_("WITH"),
        Underscore(),
        Word.from_("UNDERSCORES"),
    ])
    expected_result = [
        NewLine(),
        SplitContainer([Word.from_("My"), Word.from_("Class")]),
        single("Class"),
        single("CONSTANT"),
        constant_with_underscores,
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_4(self):
    """String literals, including one holding an escaped quote."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine() tokens expected below suggest the
    # fixture originally spans several lines -- confirm.
    text = ''' BigAWESOMEString[] a2y = "abc".doSplit("\\""); '''

    type_name = SplitContainer([
        Word.from_('Big'),
        Word.from_('AWESOME'),
        Word.from_('String'),
    ])
    variable_name = SplitContainer(
        [Word.from_('a'), Word.from_('2'), Word.from_('y')])
    method_name = SplitContainer([Word.from_('do'), Word.from_('Split')])

    expected_result = [
        NewLine(),
        type_name, '[', ']',
        variable_name,
        '=',
        StringLiteral([SplitContainer.from_single_token('abc')]),
        '.',
        method_name,
        '(',
        StringLiteral([Backslash(), Quote()]),
        ')',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_3(self):
    """Floating-point literals with exponents and f/F/d/D suffixes."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine() tokens expected below suggest the
    # fixture originally spans several lines -- confirm.
    text = ''' float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8}; '''

    single = SplitContainer.from_single_token
    expected_result = [
        NewLine(),
        single('float'), '[', ']',
        single('floats'),
        '=',
        '{',
        # -0.43E4f
        Number(['-', '0', DecimalPoint(), '4', '3', E(), '4', F()]),
        ',',
        # .58F
        Number([DecimalPoint(), '5', '8', F()]),
        ',',
        # 0.d
        Number(['0', DecimalPoint(), D()]),
        ',',
        # -9.63e+2D
        Number(['-', '9', DecimalPoint(), '6', '3', E(), '+', '2', D()]),
        ',',
        # 0.E-8
        Number(['0', DecimalPoint(), E(), '-', '8']),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_with_numbers_split(self):
    """A ParseableToken inside a StringLiteral is split into sub-words."""
    token = [StringLiteral([":", ParseableToken("_test_my123GmyClass_")])]

    actual = simple_split(token, {})

    split_identifier = SplitContainer([
        Underscore(),
        Word.from_("test"),
        Underscore(),
        Word.from_("my"),
        Word.from_("123"),
        Word.from_("Gmy"),
        Word.from_("Class"),
        Underscore(),
    ])
    expected = [StringLiteral([":", split_identifier])]
    self.assertEqual(actual, expected)
def test_1(self):
    """Hex long literals (L/l suffix) separated by tabs in an initializer."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine()/Tab() tokens expected below suggest the
    # fixture originally contains newlines and tabs -- confirm.
    text = ''' long[] lovely_longs = {0x34a35EL, 0x88bc96fl , -0x34L}; '''

    variable_name = SplitContainer(
        [Word.from_('lovely'), Underscore(), Word.from_('longs')])
    expected_result = [
        NewLine(),
        SplitContainer.from_single_token('long'), '[', ']',
        variable_name,
        '=',
        '{',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ',',
        Tab(),
        Number([HexStart(), '8', '8', 'b', 'c', '9', '6', 'f', L()]),
        Tab(),
        Tab(),
        ',',
        Number(['-', HexStart(), '3', '4', L()]),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def simple_split_token(token):
    """Split a token into underscore / number / case-delimited sub-words.

    * A ParseableToken is converted to a SplitContainer: its string form
      is scanned for underscores, digit runs, runs of lowercase letters
      with an optional leading uppercase letter, and runs of uppercase
      letters not followed by a lowercase one. Underscores become
      Underscore(), everything else becomes a Word.
    * A ProcessableTokenContainer is rebuilt as the same type with every
      sub-token split recursively.
    * Any other token is returned as is.
    """
    if isinstance(token, ParseableToken):
        subwords = []
        matches = regex.finditer(
            '(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]]))',
            str(token))
        for match in matches:
            piece = match[0]
            if piece == '_':
                subwords.append(Underscore())
            else:
                subwords.append(Word.from_(piece))
        return SplitContainer(subwords)

    if isinstance(token, ProcessableTokenContainer):
        container_type = type(token)
        split_children = [
            simple_split_token(child) for child in token.get_subtokens()
        ]
        return container_type(split_children)

    return token
def test_simple_log(self):
    """`log.info("Hi");` is collapsed into a single INFO LogStatement."""
    single = SplitContainer.from_single_token

    def hex_literal():
        # Fresh instance on every call so input and expected never share
        # token objects.
        return Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])

    tokens = [
        NewLine(),
        single('log'),
        '.',
        single('info'),
        '(',
        StringLiteral([single("Hi")]),
        ')',
        ';',
        hex_literal(),
    ]

    actual = logs.mark(tokens, None)

    log_statement = LogStatement(
        single('log'),
        single('info'),
        INFO,
        [StringLiteral([single("Hi")])])
    expected = [NewLine(), log_statement, hex_literal()]
    self.assertEqual(expected, actual)
def test_1(self):
    """BPE splitting where the merges cache already holds the whole word."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    cached_merges = {'while': ['while']}
    ngramSplittingConfig = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges_cache=cached_merges)
    tokens = [SplitContainer.from_single_token("While")]

    actual = to_repr(prep_config, tokens, ngramSplittingConfig)

    # Expected: a capital marker followed by the lowercased cached word.
    expected = [pl['capital'], "while"]
    self.assertEqual(expected, actual)
def test_tabs_and_newlines_before_semicolon(self):
    """Whitespace tokens between ')' and ';' are kept in the LogStatement."""
    single = SplitContainer.from_single_token

    def hex_literal():
        # Fresh instance on every call so input and expected never share
        # token objects.
        return Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])

    tokens = [
        NewLine(),
        single('log'),
        '.',
        single('d'),
        '(',
        StringLiteral([single("Hi")]),
        ')',
        NewLine(),
        NewLine(),
        Tab(),
        Tab(),
        ';',
        hex_literal(),
    ]

    actual = logs.mark(tokens, None)

    log_statement = LogStatement(
        single('log'),
        single('d'),
        DEBUG,
        [StringLiteral([single("Hi")])],
        [NewLine(), NewLine(), Tab(), Tab()])
    expected = [NewLine(), log_statement, hex_literal()]
    self.assertEqual(expected, actual)
# TODO write explanations with normal strings from logrec.dataprep.model.containers import SplitContainer, OneLineComment, MultilineComment, StringLiteral from logrec.dataprep.model.logging import INFO, LogStatement, LoggableBlock from logrec.dataprep.model.noneng import NonEng from logrec.dataprep.model.numeric import DecimalPoint, Number from logrec.dataprep.model.placeholders import placeholders from logrec.dataprep.model.word import Word, Underscore from logrec.dataprep.prepconfig import PrepParam, PrepConfig from logrec.dataprep.split.ngram import NgramSplittingType, NgramSplitConfig from logrec.dataprep.to_repr import to_repr pl = placeholders tokens = [ Number([1, DecimalPoint(), 1]), "*", SplitContainer([NonEng(Word.from_("dinero"))]), StringLiteral( [SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))])]), NewLine(), MultilineComment([ SplitContainer([NonEng(Word.from_('ц'))]), SplitContainer([ NonEng(Word.from_("blanco")), Underscore(), Word.from_("english") ]) ]), NewLine(), Tab(), OneLineComment(
def test_nested_data_class(self):
    """Method, constructor and static-initializer bodies become LoggableBlocks.

    The input models class A with two methods and a nested class B that
    has a constructor and a static initializer. Class bodies themselves
    are expected to stay unwrapped.
    """
    word = SplitContainer.from_single_token

    # Each helper builds a fresh token list, so the input and the expected
    # value never share token objects.
    def file_prefix():
        return [
            '{', '}',
            MultilineComment([word("class")]),
            word('import'), word("a"), NewLine(),
            word('class'), word('A'), '{',
        ]

    def method_signature(name):
        return [word('void'), word(name), '(', ')']

    def if_true_body():
        return [
            '{',
            word('if'), '(', word('True'), ')', '{', '}',
            '}',
        ]

    def class_b_prefix():
        return [
            word('static'), word('private'), word('class'), word('B'),
            word('extends'), word('D'), '{',
            word('private'), word('String'), word('b'), ';',
            word('B'), '(', ')',
        ]

    def static_init_body():
        return [
            '{',
            word('c'), '=', StringLiteral([word('class')]),
            '.', word('class'),
            '}',
        ]

    def class_a_suffix():
        return [word('int'), word('a'), ';', '}']

    tokens = (
        file_prefix()
        + method_signature('print1') + if_true_body()
        + class_b_prefix() + ['{', '}']
        + [word('static')] + static_init_body()
        + ['}']
        + method_signature('print') + if_true_body()
        + class_a_suffix()
    )

    actual = loggable.mark(tokens, None)

    expected = (
        file_prefix()
        + method_signature('print1') + [LoggableBlock(if_true_body())]
        + class_b_prefix() + [LoggableBlock(['{', '}'])]
        + [word('static')] + [LoggableBlock(static_init_body())]
        + ['}']
        + method_signature('print') + [LoggableBlock(if_true_body())]
        + class_a_suffix()
    )
    self.assertEqual(expected, actual)
def test_process_comments_and_str_literals(self):
    """Positive scenario: quotes and comment markers are grouped.

    The input is the flat token form of four lines:
      1. a quoted string that contains a one-line-comment start,
      2. a one-line comment that contains a multiline-comment end,
      3. a quoted string that contains a multiline-comment start,
      4. a multiline comment that spans a newline.
    """
    def identifier():
        return SplitContainer([
            Word.from_("test"),
            Underscore(),
            Word.from_("my"),
            Word.from_("Class"),
        ])

    tokens = [
        # line 1
        Quote(), OneLineCommentStart(), identifier(), Quote(), NewLine(),
        # line 2
        OneLineCommentStart(), MultilineCommentEnd(), NewLine(),
        # line 3
        Quote(), MultilineCommentStart(),
        SplitContainer.from_single_token("!"), Quote(), NewLine(),
        # line 4
        MultilineCommentStart(), NewLine(), MultilineCommentEnd(), NewLine(),
    ]

    actual = process_comments_and_str_literals(tokens, {})

    expected = [
        StringLiteral([OneLineCommentStart(), identifier()]),
        NewLine(),
        OneLineComment([MultilineCommentEnd()]),
        NewLine(),
        StringLiteral([
            MultilineCommentStart(),
            SplitContainer.from_single_token("!"),
        ]),
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]
    self.assertEqual(expected, actual)
def test_class_closing_bracket(self):
    """Run loggable.mark() on a `class` keyword directly followed by '}'."""
    tokens = [SplitContainer.from_single_token('class'), '}']

    # NOTE(review): nothing is asserted here, so this only checks that
    # mark() does not raise on this input -- confirm whether an expected
    # result should be compared as well.
    actual = loggable.mark(tokens, None)
def test_5(self):
    """A one-line comment made of plain words and an apostrophe."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine() tokens expected below suggest the
    # fixture originally spans several lines -- confirm.
    text = ''' // this code won't compile but the preprocessing still has to be done corrrectly '''

    words_before_apostrophe = ('this', 'code', 'won')
    words_after_apostrophe = (
        't', 'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    )
    comment_tokens = [
        SplitContainer.from_single_token(w) for w in words_before_apostrophe
    ]
    comment_tokens.append("'")
    comment_tokens.extend(
        SplitContainer.from_single_token(w) for w in words_after_apostrophe)

    expected_result = [
        NewLine(),
        OneLineComment(comment_tokens),
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_6(self):
    """Operators, punctuation, tabs and newlines are tokenized one by one."""
    # NOTE(review): whitespace inside this literal looks collapsed in the
    # reviewed copy; the NewLine()/Tab() tokens expected below suggest the
    # fixture originally contains newlines and tabs -- confirm.
    text = ''' 9a abc1 ~-0xFFFFFL= .0E+5 |= ? == != ** ++ -- += -= /= *= %= $ <= >= @ ^= &= # >> << && || +*!/><\t\n {}[],.-:();&|\\'~%^ '''

    expected_result = [
        NewLine(),
        SplitContainer([Word.from_('9'), Word.from_('a')]),
        SplitContainer([Word.from_('abc'), Word.from_('1')]),
        NewLine(),
        '~',
        Number(['-', HexStart(), 'F', 'F', 'F', 'F', 'F', L()]),
        '=',
        NewLine(),
        Number([DecimalPoint(), '0', E(), '+', '5']),
        NewLine(),
    ]

    # Operators that sit alone on their line.
    for operator in ('|=', '?', '==', '!=', '**', '++', '--', '+=', '-=',
                     '/=', '*=', '%=', '$', '<=', '>=', '@'):
        expected_result += [operator, NewLine()]

    # Operators preceded by a single tab.
    for operator in ('^=', '&=', '#'):
        expected_result += [Tab(), operator, NewLine()]

    # A run of twenty tabs in front of '>>'.
    expected_result += [Tab() for _ in range(20)]
    for operator in ('>>', '<<', '&&', '||'):
        expected_result += [operator, NewLine()]

    expected_result += [
        '+', '*', '!', '/', '>', '<', Tab(), NewLine(), NewLine(),
    ]
    expected_result += [
        '{', '}', '[', ']', ',', '.', '-', ':', '(', ')', ';', '&', '|',
        Backslash(), "'", '~', '%', '^',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
from logrec.dataprep.model.noneng import NonEng from logrec.dataprep.preprocessors.preprocessor_list import pp_params from logrec.dataprep.preprocessors import apply_preprocessors from logrec.dataprep.preprocessors.general import from_string from logrec.dataprep.model.containers import SplitContainer, StringLiteral from logrec.dataprep.model.logging import LogStatement, INFO from logrec.dataprep.model.numeric import Number, DecimalPoint, E from logrec.dataprep.model.placeholders import placeholders from logrec.dataprep.model.word import Word, Underscore from logrec.dataprep.prepconfig import PrepConfig from logrec.dataprep.split.ngram import NgramSplitConfig, NgramSplittingType from logrec.dataprep.to_repr import to_repr test_cases = { "create": ( [SplitContainer.from_single_token("create")], ["create"], ), "Vector": ( [SplitContainer.from_single_token("Vector")], [placeholders["capital"], "vector"], ), "players": ( [SplitContainer.from_single_token("players")], [placeholders["word_start"], 'play', 'er', 's', placeholders["word_end"]] ), "0.345e+4": ( [Number(["0", DecimalPoint(), "3", "4", "5", E(), "+", "4"])], [placeholders["word_start"], "0.", "3", "4", "5", "e+", "4", placeholders["word_end"]] ), "bestPlayers": (
def test_2_logs(self):
    """Two consecutive log calls are each turned into a LogStatement."""
    single = SplitContainer.from_single_token

    def hi_literal():
        # Fresh instance on every call so input and expected never share
        # token objects.
        return StringLiteral([single("Hi")])

    tokens = [
        NewLine(),
        single('log'), '.', single('t'), '(', hi_literal(), ')', ';',
        NewLine(),
        single('Logger'), '.', single('SEVERE'), '(', hi_literal(), ')', ';',
    ]

    actual = logs.mark(tokens, None)

    trace_statement = LogStatement(
        single('log'), single('t'), TRACE, [hi_literal()])
    fatal_statement = LogStatement(
        single('Logger'), single('SEVERE'), FATAL, [hi_literal()])
    expected = [NewLine(), trace_statement, NewLine(), fatal_statement]
    self.assertEqual(expected, actual)