def test_2(self):
    """Check tokenization of an int array with hex and decimal literals.

    The identifier mixes underscores, a non-English fragment and a
    capitalised word; the numbers cover plain, negative and hex forms.
    """
    # NOTE(review): the expected tokens contain NewLine() entries, so the
    # exact whitespace inside this literal is presumably significant --
    # confirm it against the intended layout.
    source = ''' int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf}; '''

    identifier = SplitContainer([
        Underscore(),
        Word.from_('my'),
        Underscore(),
        NonEng(Word.from_('favo')),
        Word.from_('Rite'),
        Underscore(),
        Word.from_('ints'),
        Underscore(),
    ])

    expected = [
        NewLine(),
        SplitContainer.from_single_token('int'),
        '[',
        ']',
        identifier,
        '=',
        '{',
        Number([HexStart(), '1', '2']),
        ',',
        Number([HexStart(), '1', 'f', 'E']),
        ',',
        Number(['4', '4', '1']),
        ',',
        Number(['-', '8', '1']),
        ',',
        Number(['-', HexStart(), 'f', 'F', 'f']),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(source, expected)
def test_to_repr_with_enonlycontents(self):
    """Check to_repr output when PrepParam.EN_ONLY is set to 2.

    The token stream mixes a number, a non-English identifier, a string
    literal made only of non-English words, a multiline comment and a
    one-line comment; the expected list pins the placeholders emitted.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    ngram_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={},
    )

    # Contents of the string literal, each wrapped as a NonEng word below.
    literal_words = (
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    )

    tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(Word.from_("dinero"))]),
        StringLiteral([NonEng(Word.from_(word)) for word in literal_words]),
        NewLine(),
        MultilineComment([
            SplitContainer([NonEng(Word.from_('ц'))]),
            SplitContainer([
                NonEng(Word.from_("blanco")),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        NewLine(),
        Tab(),
        OneLineComment([
            SplitContainer([
                NonEng(Word.from_("DIESELBE")),
                Word.from_("8"),
            ]),
        ]),
    ]

    expected = [
        # number 1.1
        pl['word_start'], '1', '.', '1', pl['word_end'],
        "*",
        # the "dinero" identifier
        pl['non_eng'],
        # string literal
        '"', pl["non_eng_content"], '"',
        # multiline comment
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        # one-line comment
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], "8", pl['word_end'],
        pl['olc_end'],
    ]

    actual = to_repr(config, tokens, ngram_config)

    self.assertEqual(expected, actual)
def process_number_literal(possible_number):
    """Turn a numeric literal into a ``Number`` built from its parts.

    If ``possible_number`` does not pass ``is_number`` or is one of
    ``tabs``, it is returned wrapped in a ``ParseableToken`` instead.

    Otherwise the literal is decomposed as follows:

    * a leading ``-`` is kept as the string ``'-'``;
    * a following ``0x`` prefix becomes ``HexStart()``;
    * ``.`` becomes ``DecimalPoint()``;
    * ``l`` / ``L`` become ``L()``;
    * ``f`` / ``F``, ``d`` / ``D`` and ``e`` / ``E`` become ``F()``,
      ``D()`` and ``E()``, but only outside hex literals, where these
      characters are digits;
    * every other character is appended unchanged.
    """
    if not is_number(possible_number) or possible_number in tabs:
        return ParseableToken(possible_number)

    remainder = possible_number
    parts = []

    if remainder.startswith('-'):
        parts.append('-')
        remainder = remainder[1:]

    is_hex = remainder.startswith("0x")
    if is_hex:
        parts.append(HexStart())
        remainder = remainder[2:]

    for ch in remainder:
        if ch == '.':
            parts.append(DecimalPoint())
        elif ch in ('l', 'L'):
            parts.append(L())
        elif not is_hex and ch in ('f', 'F'):
            parts.append(F())
        elif not is_hex and ch in ('d', 'D'):
            parts.append(D())
        elif not is_hex and ch in ('e', 'E'):
            parts.append(E())
        else:
            # Plain digits, hex digits, and exponent signs are kept verbatim.
            parts.append(ch)

    return Number(parts)
def test_simple_log(self):
    """``log.info("Hi");`` is folded into one LogStatement.

    The number token that follows the statement must be left as is.
    """
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('log'),
        '.',
        SplitContainer.from_single_token('info'),
        '(',
        StringLiteral([SplitContainer.from_single_token("Hi")]),
        ')',
        ';',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    expected = [
        NewLine(),
        LogStatement(
            SplitContainer.from_single_token('log'),
            SplitContainer.from_single_token('info'),
            INFO,
            [StringLiteral([SplitContainer.from_single_token("Hi")])],
        ),
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(expected, actual)
def test_tabs_and_newlines_before_semicolon(self):
    """Whitespace tokens between ``)`` and ``;`` end up in the LogStatement.

    The newlines and tabs preceding the semicolon are expected as the
    last argument of the resulting LogStatement.
    """
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('log'),
        '.',
        SplitContainer.from_single_token('d'),
        '(',
        StringLiteral([SplitContainer.from_single_token("Hi")]),
        ')',
        NewLine(),
        NewLine(),
        Tab(),
        Tab(),
        ';',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    expected = [
        NewLine(),
        LogStatement(
            SplitContainer.from_single_token('log'),
            SplitContainer.from_single_token('d'),
            DEBUG,
            [StringLiteral([SplitContainer.from_single_token("Hi")])],
            [NewLine(), NewLine(), Tab(), Tab()],
        ),
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(expected, actual)
def test_no_dot(self):
    """Tokens with no ``.`` after ``log`` must come back unchanged."""
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('log'),
        SplitContainer.from_single_token('infooooo'),
        '(',
        StringLiteral([SplitContainer.from_single_token("Hi")]),
        ')',
        ';',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_no_logs(self):
    """A token stream without any log call must come back unchanged."""
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        SplitContainer([
            Word.from_('lovely'),
            Underscore(),
            Word.from_('longs'),
        ]),
        '=',
        '{',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_3(self):
    """Check tokenization of floating-point literals.

    Covers exponents (with and without a sign), the ``f``/``F`` and
    ``d``/``D`` suffixes, and literals that start or end with the
    decimal point.
    """
    # NOTE(review): the expected tokens contain NewLine() entries, so the
    # exact whitespace inside this literal is presumably significant --
    # confirm it against the intended layout.
    source = ''' float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8}; '''

    expected = [
        NewLine(),
        SplitContainer.from_single_token('float'),
        '[',
        ']',
        SplitContainer.from_single_token('floats'),
        '=',
        '{',
        Number(['-', '0', DecimalPoint(), '4', '3', E(), '4', F()]),
        ',',
        Number([DecimalPoint(), '5', '8', F()]),
        ',',
        Number(['0', DecimalPoint(), D()]),
        ',',
        Number(['-', '9', DecimalPoint(), '6', '3', E(), '+', '2', D()]),
        ',',
        Number(['0', DecimalPoint(), E(), '-', '8']),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(source, expected)
def test_1(self):
    """Check tokenization of hex long literals with ``l``/``L`` suffixes."""
    # NOTE(review): the expected tokens contain NewLine() and Tab() entries,
    # so the exact whitespace inside this literal is presumably significant
    # -- confirm it against the intended layout.
    source = ''' long[] lovely_longs = {0x34a35EL, 0x88bc96fl , -0x34L}; '''

    array_name = SplitContainer([
        Word.from_('lovely'),
        Underscore(),
        Word.from_('longs'),
    ])

    expected = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        array_name,
        '=',
        '{',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ',',
        Tab(),
        Number([HexStart(), '8', '8', 'b', 'c', '9', '6', 'f', L()]),
        Tab(),
        Tab(),
        ',',
        Number(['-', HexStart(), '3', '4', L()]),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(source, expected)
def test_content_length_over_limit(self):
    """A log call with a very long argument list must be left unmarked.

    The input is ``log.info(`` followed by a deeply parenthesised
    ``1 * 3`` expression, the matching closing parentheses, ``;`` and a
    trailing number. ``logs.mark`` is expected to return the tokens
    unchanged (presumably because the statement exceeds its length
    limit -- confirm against ``logs.mark``).

    The closing parentheses and the semicolon used to be written with
    missing commas, so Python's implicit string concatenation produced
    the tokens ``'))'`` and ``');'``. The statement therefore never
    contained balanced parentheses or a ``';'`` token, and the test could
    pass without exercising the length limit at all. Each token is now a
    separate list element.
    """
    # Number of '(' tokens after the method name (and of matching ')').
    paren_count = 32

    tokens = (
        [
            NewLine(),
            SplitContainer.from_single_token('log'),
            '.',
            SplitContainer.from_single_token('info'),
        ]
        + ['('] * paren_count
        + ['1', '*', '3']
        + [')'] * paren_count
        + [
            ';',
            Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ]
    )

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_6(self):
    """Check tokenization of operators, punctuation and odd identifiers.

    Covers digit/letter boundaries inside identifiers, a negative hex
    long, a float starting with the decimal point, two-character
    operators, single-character punctuation, and runs of tabs.
    """
    # NOTE(review): the expected tokens contain many NewLine() and Tab()
    # entries, so the exact whitespace inside this literal is presumably
    # significant -- confirm it against the intended layout.
    source = ''' 9a abc1 ~-0xFFFFFL= .0E+5 |= ? == != ** ++ -- += -= /= *= %= $ <= >= @ ^= &= # >> << && || +*!/><\t\n {}[],.-:();&|\\'~%^ '''

    expected = [
        NewLine(),
        SplitContainer([Word.from_('9'), Word.from_('a')]),
        SplitContainer([Word.from_('abc'), Word.from_('1')]),
        NewLine(),
        '~',
        Number(['-', HexStart(), 'F', 'F', 'F', 'F', 'F', L()]),
        '=',
        NewLine(),
        Number([DecimalPoint(), '0', E(), '+', '5']),
        NewLine(),
        '|=', NewLine(),
        '?', NewLine(),
        '==', NewLine(),
        '!=', NewLine(),
        '**', NewLine(),
        '++', NewLine(),
        '--', NewLine(),
        '+=', NewLine(),
        '-=', NewLine(),
        '/=', NewLine(),
        '*=', NewLine(),
        '%=', NewLine(),
        '$', NewLine(),
        '<=', NewLine(),
        '>=', NewLine(),
        '@', NewLine(),
        Tab(), '^=', NewLine(),
        Tab(), '&=', NewLine(),
        Tab(), '#', NewLine(),
        # twenty tabs in a row before '>>'
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        '>>', NewLine(),
        '<<', NewLine(),
        '&&', NewLine(),
        '||', NewLine(),
        '+', '*', '!', '/', '>', '<',
        Tab(),
        NewLine(),
        NewLine(),
        '{', '}', '[', ']', ',', '.', '-', ':', '(', ')', ';', '&', '|',
        Backslash(),
        "'", '~', '%', '^',
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(source, expected)
from logrec.dataprep.model.chars import NewLine, Tab # TODO write explanations with normal strings from logrec.dataprep.model.containers import SplitContainer, OneLineComment, MultilineComment, StringLiteral from logrec.dataprep.model.logging import INFO, LogStatement, LoggableBlock from logrec.dataprep.model.noneng import NonEng from logrec.dataprep.model.numeric import DecimalPoint, Number from logrec.dataprep.model.placeholders import placeholders from logrec.dataprep.model.word import Word, Underscore from logrec.dataprep.prepconfig import PrepParam, PrepConfig from logrec.dataprep.split.ngram import NgramSplittingType, NgramSplitConfig from logrec.dataprep.to_repr import to_repr pl = placeholders tokens = [ Number([1, DecimalPoint(), 1]), "*", SplitContainer([NonEng(Word.from_("dinero"))]), StringLiteral( [SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))])]), NewLine(), MultilineComment([ SplitContainer([NonEng(Word.from_('ц'))]), SplitContainer([ NonEng(Word.from_("blanco")), Underscore(), Word.from_("english") ]) ]), NewLine(), Tab(),
test_cases = { "create": ( [SplitContainer.from_single_token("create")], ["create"], ), "Vector": ( [SplitContainer.from_single_token("Vector")], [placeholders["capital"], "vector"], ), "players": ( [SplitContainer.from_single_token("players")], [placeholders["word_start"], 'play', 'er', 's', placeholders["word_end"]] ), "0.345e+4": ( [Number(["0", DecimalPoint(), "3", "4", "5", E(), "+", "4"])], [placeholders["word_start"], "0.", "3", "4", "5", "e+", "4", placeholders["word_end"]] ), "bestPlayers": ( [SplitContainer([Word.from_("best"), Word.from_("Players")])], [placeholders["word_start"], "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]] ), "test_BestPlayers": ( [SplitContainer([Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players")])], [placeholders["word_start"], "test", '_', placeholders["capital"], "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]] ), "test_BestPlayers_modified": ( [SplitContainer( [Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players"), Underscore(), Word.from_("modified")]