def _get_parse_from_strings(feature_table, constraint_set_string, words):
    """Build a grammar from plain inputs and return all of its outputs.

    :param feature_table: feature table shared by the lexicon and the
        constraint set.
    :param constraint_set_string: printed string representation of a
        constraint set, as accepted by
        ``ConstraintSet.load_from_printed_string_representation``.
    :param words: words used to build the lexicon.
    :return: whatever ``Grammar.get_all_outputs_grammar()`` returns.
    """
    word_lexicon = Lexicon(words, feature_table)
    parsed_constraints = ConstraintSet.load_from_printed_string_representation(
        constraint_set_string,
        feature_table,
    )
    return Grammar(
        feature_table, parsed_constraints, word_lexicon
    ).get_all_outputs_grammar()
class TestingParserSuite(unittest.TestCase):
    """Tests for the parser and its related functions: get_range and generate.

    See: https://taucompling.atlassian.net/wiki/display/OTML/Testing+parser+suite
    """

    def setUp(self):
        """Load the fixtures and build the grammar and the words under test."""
        table = FeatureTable.load(
            get_feature_table_fixture("a_b_and_son_feature_table.json")
        )
        self.feature_table = table
        self.constraint_set = ConstraintSet.load(
            get_constraint_set_fixture("no_bb_Max_Dep_constraint_set.json"),
            table,
        )
        self.corpus = Corpus.load(
            get_corpus_fixture("testing_parser_suite_corpus.txt")
        )
        self.lexicon = Lexicon(self.corpus.get_words(), table)
        self.grammar = Grammar(table, self.constraint_set, self.lexicon)
        self.bb = Word("bb", table)
        self.bab = Word("bab", table)
        self.abba = Word("abba", table)
        self.ababa = Word("ababa", table)

    def test_generate(self):
        """Each input word must generate exactly the expected output set."""
        expectations = (
            (self.bb, {"bab"}),
            (self.bab, {"bab"}),
            (self.abba, {"ababa"}),
            (self.ababa, {"ababa"}),
        )
        for word, expected_outputs in expectations:
            self.assertEqual(self.grammar.generate(word), expected_outputs)

    def test_parser(self):
        """Parsing "bb" yields no parses; "ababa" yields two candidate words."""
        hypothesis = TraversableGrammarHypothesis(self.grammar, ["bb"])
        self.assertEqual(hypothesis.parse_data(), {'bb': set()})

        hypothesis = TraversableGrammarHypothesis(self.grammar, ["ababa"])
        parsed = hypothesis.parse_data()
        self.assertEqual(
            parsed,
            {"ababa": {(self.abba, 1), (self.ababa, 1)}},
        )
def factorize(G: Grammar):
    """Factor out common production prefixes of ``G`` in place.

    For every non-terminal present when the function is called, the longest
    prefix shared by at least two of its production bodies is looked up.  If
    one exists, a fresh non-terminal ``<Name>^`` is created, the production
    ``sym -> prefix <Name>^`` is added, and each production that started with
    the prefix is removed from ``G.Productions`` and re-attached to the new
    non-terminal with the prefix stripped (``G.Epsilon`` when nothing is left).

    :param G: grammar to modify; nothing is returned.
    """

    def lcp(sym):
        """Return ``(prefix, productions)`` for ``sym``.

        ``prefix`` is the longest common prefix found (``None`` if there is
        none) and ``productions`` is the set of productions of ``sym`` whose
        body starts with it.
        """
        prefix_len, prefix = 0, None
        t = Trie([x for x in (G.terminals + G.nonTerminals)])
        for production in sym.productions:
            _, body = production
            # presumably returns the longest prefix of `body` that is already
            # stored in the trie -- confirm against Trie.prefix_query
            p = t.prefix_query(body)
            if p and len(p) > prefix_len:
                prefix = p
                prefix_len = len(p)
            t.insert(body)

        d = set()
        if prefix:
            for production in sym.productions:
                right = production.Right
                # A body shorter than the prefix cannot start with it; the
                # length check also keeps the indexing below in range.
                if len(right) >= prefix_len and all(
                        prefix[i] == right[i] for i in range(prefix_len)):
                    d.add(production)
        return prefix, d

    stack = [x for x in G.nonTerminals]
    while stack:
        sym = stack.pop()
        prefix, productions = lcp(sym)
        if prefix is None:
            continue

        new_t = G.NonTerminal(f'{sym.Name}^')
        sym %= prefix + new_t
        for p in productions:
            remainder = p.Right[len(prefix)::]
            remainder = Sentence(*remainder) if remainder else G.Epsilon
            new_t %= remainder
            G.Productions.remove(p)
        # NOTE(review): `new_t` is not pushed back on the stack, so prefixes
        # shared by the remainders are not factored further -- confirm that a
        # single pass per original non-terminal is intended.
def remove_left_recursion(G: Grammar):
    """Rewrite the immediately left-recursive productions of ``G`` in place.

    For each non-terminal ``A`` that has productions of the form
    ``A -> A alpha`` a new non-terminal ``A^`` is created.  Every other
    production ``A -> beta`` is replaced by ``A -> beta A^`` and every
    recursive one by ``A^ -> alpha A^`` plus ``A^ -> epsilon``.  Duplicate
    productions created along the way are removed at the end.

    :param G: grammar to modify; nothing is returned.
    """

    def find_recursive_production(t: NonTerminal):
        """Split the productions of ``t`` into (recursive, non-recursive)."""
        recursives, non_recursives = [], []
        for production in t.productions:
            head, body = production
            if len(body) > 1 and head == body[0]:
                recursives.append(production)
            else:
                non_recursives.append(production)
        return recursives, non_recursives

    def remove_redundant_productions(G: Grammar):
        """Drop repeated productions, keeping one occurrence of each."""
        # Iterate over a snapshot: removing from the list that is being
        # iterated skips elements and can leave duplicates behind.
        for production in list(G.Productions):
            while G.Productions.count(production) > 1:
                G.Productions.remove(production)

    # Snapshot the symbols: G.NonTerminal() below registers new ones while we
    # loop, and the freshly created symbols never need processing themselves.
    for nt in list(G.nonTerminals):
        recursives, non_recursives = find_recursive_production(nt)
        if not recursives:
            continue

        new_symbol = G.NonTerminal(f'{nt}^')
        for production in non_recursives:
            G.Productions.remove(production)
            body = production.Right
            nt %= body + new_symbol
        for production in recursives:
            alpha = Sentence(*production.Right[1::])
            G.Productions.remove(production)
            # `+` binds tighter than `|`: (alpha new_symbol) | epsilon
            new_symbol %= alpha + new_symbol | G.Epsilon

    remove_redundant_productions(G)
def clear_modules_caching(self):
    """Clear the caches of the grammar-related classes.

    Calls ``clear_caching()`` on ``Grammar``, ``ConstraintSet``,
    ``Constraint`` and ``Word``.  A diagnostics section that logs the size of
    the module-level caches is kept behind ``diagnostics_flag``, which is
    hard-coded to ``False``.
    """
    Grammar.clear_caching()
    ConstraintSet.clear_caching()
    Constraint.clear_caching()
    Word.clear_caching()

    diagnostics_flag = False
    if diagnostics_flag:
        def object_size_in_mb(object_):
            from pympler.asizeof import asizeof
            size_in_bytes = asizeof(object_)
            return int(size_in_bytes / (1024 ** 2))

        import grammar.grammar
        import grammar.constraint_set
        import grammar.constraint
        import grammar.lexicon

        # Measure every cache first, then report, as the sizes are summed
        # at the end.
        outputs_mb = object_size_in_mb(
            grammar.grammar.outputs_by_constraint_set_and_word)
        grammar_mb = object_size_in_mb(grammar.grammar.grammar_transducers)
        constraint_set_mb = object_size_in_mb(
            grammar.constraint_set.constraint_set_transducers)
        constraint_mb = object_size_in_mb(
            grammar.constraint.constraint_transducers)
        word_mb = object_size_in_mb(grammar.lexicon.word_transducers)

        logger.info(
            "asizeof outputs_by_constraint_set_and_word: {} MB".format(outputs_mb))
        logger.info("length outputs_by_constraint_set_and_word: {}".format(
            len(grammar.grammar.outputs_by_constraint_set_and_word)))

        logger.info("asizeof grammar_transducers: {} MB".format(grammar_mb))
        logger.info("length grammar_transducers: {}".format(
            len(grammar.grammar.grammar_transducers)))

        logger.info(
            "asizeof constraint_set_transducers: {} MB".format(constraint_set_mb))
        logger.info("asizeof constraint_transducers: {} MB".format(constraint_mb))

        logger.info("asizeof word_transducers: {} MB".format(word_mb))
        logger.info("length word_transducers: {}".format(
            len(grammar.lexicon.word_transducers)))

        sum_asizeof = (outputs_mb + grammar_mb + constraint_set_mb
                       + constraint_mb + word_mb)
        logger.info("sum asizeof: {} MB".format(sum_asizeof))

        # NOTE(review): assumed to be part of the diagnostics branch -- confirm.
        logger.info("Memory usage: {} MB".format(self._get_memory_usage()))
class TestObjectCaching(unittest.TestCase):
    """Tests for the caching of transducers and generated outputs."""

    def setUp(self):
        """Load the fixtures and build the objects shared by the tests."""
        table = FeatureTable.load(
            get_feature_table_fixture("a_b_and_son_feature_table.json")
        )
        self.feature_table = table
        self.constraint_set_filename = get_constraint_set_fixture(
            "no_bb_Max_Dep_constraint_set.json"
        )
        self.corpus = Corpus.load(get_corpus_fixture("small_ab_corpus.txt"))
        self.word = Word("abababa", table)
        self.constraint = PhonotacticConstraint(
            [{'son': '+'}, {'son': '+'}], table
        )
        self.constraint_set = ConstraintSet.load(
            self.constraint_set_filename, table
        )
        self.lexicon = Lexicon(self.corpus.get_words(), table)
        self.grammar = Grammar(table, self.constraint_set, self.lexicon)

    def test_constraint_transducer_caching(self):
        """The same transducer object is returned before and after
        ``augment_feature_bundle()``."""
        max_constraint = MaxConstraint([{'son': '+'}], self.feature_table)
        transducer_before = max_constraint.get_transducer()
        max_constraint.augment_feature_bundle()
        transducer_after = max_constraint.get_transducer()
        self.assertEqual(id(transducer_before), id(transducer_after))

    # TODO: word / constraint / constraint-set / grammar caching tests were
    # only sketched (three repeated get_transducer(...) calls on self.word,
    # self.constraint, self.constraint_set and self.grammar) and never enabled.

    def test_generate_caching(self):
        """Outputs returned by ``generate`` match the module-level cache entry
        keyed by ``str(constraint_set) + str(word)``."""
        word = Word("bbb", self.feature_table)
        word_outputs = self.grammar.generate(word)

        from grammar.grammar import outputs_by_constraint_set_and_word
        cache_key = str(self.grammar.constraint_set) + str(word)
        cached_outputs = outputs_by_constraint_set_and_word[cache_key]
        self.assertEqual(set(cached_outputs), set(word_outputs))

    def test_parse_data_caching(self):
        """Placeholder: not implemented."""
        pass

    def test_start_from_middle(self):
        """Print the pickled ``configurations`` object."""
        # NOTE(review): `configurations` is not defined in this block; it is
        # presumably a module-level name -- confirm.
        print()
        print(pickle.dumps(configurations))
def train(args):
    """Train an ``ASNParser`` and save the best model seen on the dev set.

    Reads from ``args``: ``train_file``, ``dev_file`` (optional), ``vocab``,
    ``asdl_file``, ``lr``, ``log_every``, ``max_epoch``, ``batch_size``,
    ``clip_grad``, ``run_val_after`` and ``save_to``.

    After every epoch whose number is greater than ``args.run_val_after`` the
    parser is evaluated on the dev set and saved to ``args.save_to`` when its
    accuracy is at least as good as the best so far.  When there are no dev
    examples the evaluation is skipped and the current model is saved instead.
    """
    train_set = Dataset.from_bin_file(args.train_file)
    if args.dev_file:
        dev_set = Dataset.from_bin_file(args.dev_file)
    else:
        dev_set = Dataset(examples=[])

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(args.vocab, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    with open(args.asdl_file) as asdl_file:
        grammar = Grammar.from_text(asdl_file.read())
    # transition_system = Registrable.by_name(args.transition_system)(grammar)
    transition_system = TurkTransitionSystem(grammar)

    parser = ASNParser(args, transition_system, vocab)
    nn_utils.glorot_init(parser.parameters())

    optimizer = optim.Adam(parser.parameters(), lr=args.lr)
    best_acc = 0.0
    log_every = args.log_every

    train_begin = time.time()
    for epoch in range(1, args.max_epoch + 1):
        train_iter = 0
        loss_val = 0.
        epoch_loss = 0.

        parser.train()

        epoch_begin = time.time()
        for batch_example in train_set.batch_iter(batch_size=args.batch_size, shuffle=False):
            optimizer.zero_grad()
            loss = parser.score(batch_example)
            batch_loss = torch.sum(loss).data.item()
            loss_val += batch_loss
            epoch_loss += batch_loss
            loss = torch.mean(loss)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(parser.parameters(), args.clip_grad)

            optimizer.step()
            train_iter += 1
            if train_iter % log_every == 0:
                print("[epoch {}, step {}] loss: {:.3f}".format(
                    epoch, train_iter, loss_val / (log_every * args.batch_size)))
                loss_val = 0.

        print('[epoch {}] train loss {:.3f}, epoch time {:.0f}, total time {:.0f}'.format(
            epoch, epoch_loss / len(train_set),
            time.time() - epoch_begin, time.time() - train_begin))

        if epoch > args.run_val_after:
            eval_begin = time.time()
            parser.eval()
            with torch.no_grad():
                parse_results = [parser.naive_parse(ex) for ex in dev_set]
            match_results = [transition_system.compare_ast(e.tgt_ast, r)
                             for e, r in zip(dev_set, parse_results)]
            if not match_results:
                # No dev examples: an accuracy would divide by zero, so keep
                # the most recent model instead of selecting on accuracy.
                print('[epoch {}] no dev examples, skipping evaluation'.format(epoch))
                parser.save(args.save_to)
                continue

            match_acc = sum(match_results) * 1. / len(match_results)
            print('[epoch {}] eval acc {:.3f}, eval time {:.0f}'.format(
                epoch, match_acc, time.time() - eval_begin))

            if match_acc >= best_acc:
                best_acc = match_acc
                parser.save(args.save_to)
def setUp(self):
    """Load the fixtures and build the word, constraints, lexicon and grammar."""
    table = FeatureTable.load(
        get_feature_table_fixture("a_b_and_son_feature_table.json")
    )
    self.feature_table = table
    self.constraint_set_filename = get_constraint_set_fixture(
        "no_bb_Max_Dep_constraint_set.json"
    )
    self.corpus = Corpus.load(get_corpus_fixture("small_ab_corpus.txt"))
    self.word = Word("abababa", table)
    self.constraint = PhonotacticConstraint(
        [{'son': '+'}, {'son': '+'}], table
    )
    self.constraint_set = ConstraintSet.load(
        self.constraint_set_filename, table
    )
    self.lexicon = Lexicon(self.corpus.get_words(), table)
    self.grammar = Grammar(table, self.constraint_set, self.lexicon)
def save_words(gram: Grammar, output_path: str):
    """
    Write every word of the language described by `gram` to a text file.

    The file starts with a two-line header (grammar name, word count),
    followed by an empty line and then the words in sorted order, one per
    line.  The empty word is written as ``#eps``.

    :param gram: the grammar of a language
    :param output_path: path to the output file
    """
    words = gram.generate_words()

    with open(output_path, 'w') as sink:
        title = 'Words in language of {grammar_name}'.format(
            grammar_name=repr(gram.get_name()))
        print(title, file=sink)

        # The trailing '\n' plus print's own newline leaves a blank line.
        summary = 'It has {words_count} words:\n'.format(words_count=len(words))
        print(summary, file=sink)

        for entry in sorted(words):
            if entry == '':
                print('#eps', file=sink)
            else:
                print(entry, file=sink)
def __init__(self, segmentLength=20, paaSize=5, alphabetSize=3, upperBound=100, lowerBound=-100):
    """Store the discretization settings and create the helper objects.

    :param segmentLength: stored as ``self.segmentLength``.
    :param paaSize: stored and passed to ``SAX`` as ``wordSize``.
    :param alphabetSize: stored and passed to ``SAX``.
    :param upperBound: stored and passed to ``SAX``.
    :param lowerBound: stored and passed to ``SAX``.
    """
    # Plain configuration values.
    self.segmentLength = segmentLength
    self.paaSize = paaSize
    self.alphabetSize = alphabetSize
    self.upperBound = upperBound
    self.lowerBound = lowerBound

    # Helper objects built from the configuration.
    sax_settings = dict(
        wordSize=paaSize,
        alphabetSize=alphabetSize,
        lowerBound=lowerBound,
        upperBound=upperBound,
        epsilon=1e-6,
    )
    self.sax = SAX(**sax_settings)
    self.grammar = Grammar()

    # State that is filled in later.
    self.segmentIndexes = []
    self.rule_set = []
    self.tsCount = 0
def setUp(self):
    """Load the fixtures, build the grammar and the four words under test."""
    table = FeatureTable.load(
        get_feature_table_fixture("a_b_and_son_feature_table.json")
    )
    self.feature_table = table
    self.constraint_set = ConstraintSet.load(
        get_constraint_set_fixture("no_bb_Max_Dep_constraint_set.json"),
        table,
    )
    self.corpus = Corpus.load(
        get_corpus_fixture("testing_parser_suite_corpus.txt")
    )
    self.lexicon = Lexicon(self.corpus.get_words(), table)
    self.grammar = Grammar(table, self.constraint_set, self.lexicon)

    self.bb = Word("bb", table)
    self.bab = Word("bab", table)
    self.abba = Word("abba", table)
    self.ababa = Word("ababa", table)
def regex_to_grammar(regex: Regex):
    """Build a grammar from the automaton of ``regex``.

    One non-terminal ``A<i>`` is created per automaton state (the start state
    becomes the grammar's start symbol) and one terminal per vocabulary
    symbol.  Each transition adds a production to the source state's
    non-terminal, and each final state gets an epsilon production.

    :param regex: object exposing an ``automaton`` attribute.
    :return: the new ``Grammar``.
    """
    dfa = regex.automaton
    G = Grammar()

    state_symbols = {dfa.start: G.NonTerminal(f'A{dfa.start}', True)}
    for state in range(dfa.states):
        if state == dfa.start:
            continue
        state_symbols[state] = G.NonTerminal(f'A{state}')

    terminals = {}
    for sym in dfa.vocabulary:
        terminals[sym] = G.Terminal(sym)

    for src, edges in dfa.transitions.items():
        for sym, dest in edges.items():
            head = state_symbols[src]
            target = state_symbols[dest[0]]
            terminal = terminals[sym]
            # NOTE(review): a self-loop only adds `head -> terminal`, without
            # the trailing non-terminal -- confirm this is intended.
            if target != head:
                head %= terminal + target
            else:
                head %= terminal

    for final_state in dfa.finals:
        head = state_symbols[final_state]
        head %= G.Epsilon

    return G
def __init__(self, regex, ignore_white_space=True):
    """Define the regex grammar and build the automaton for ``regex``.

    :param regex: regular expression text.
    :param ignore_white_space: forwarded to ``_build_automaton``.
    """
    self.regex = regex
    self.G = Grammar()
    grammar = self.G

    # Non-terminals (E is the start symbol).
    E = grammar.NonTerminal('E', True)
    T, F, A, X, Y, Z = grammar.NonTerminals('T F A X Y Z')

    # Terminals; '-', '[' and ']' are declared but no production uses them.
    (pipe, star, opar, cpar, symbol, epsilon,
     plus, _minus, _obrack, _cbrack, question) = grammar.Terminals(
        '| * ( ) symbol ε + - [ ] ?')

    # Union level.
    E %= T + X, lambda h, s: s[2], None, lambda h, s: s[1]
    X %= pipe + E, lambda h, s: UnionNode(h[0], s[2])
    X %= grammar.Epsilon, lambda h, s: h[0]

    # Concatenation level.
    T %= F + Y, lambda h, s: s[2], None, lambda h, s: s[1]
    Y %= T, lambda h, s: ConcatNode(h[0], s[1])
    Y %= grammar.Epsilon, lambda h, s: h[0]

    # Postfix operators.
    F %= A + Z, lambda h, s: s[2], None, lambda h, s: s[1]
    Z %= star, lambda h, s: ClosureNode(h[0])
    Z %= plus, lambda h, s: PositiveClosureNode(h[0])
    Z %= question, lambda h, s: QuestionNode(h[0])
    Z %= grammar.Epsilon, lambda h, s: h[0]

    # Atoms.
    A %= symbol, lambda h, s: SymbolNode(s[1])
    A %= epsilon, lambda h, s: EpsilonNode(s[1])
    A %= opar + E + cpar, lambda h, s: s[2]

    self.automaton = self._build_automaton(regex, ignore_white_space)
def make_dataset():
    """Build the Turk train/dev/test sets and vocabulary and pickle them.

    Reads the ASDL grammar from ``data/turk/turk_asdl.txt``, loads the
    ``train``, ``val`` and ``test`` splits, builds the vocabulary from the
    training set (``src_cutoff=2``) and writes the four objects to
    ``data/turk/{train,dev,test,vocab}.bin``.
    """
    with open('data/turk/turk_asdl.txt') as asdl_file:
        grammar = Grammar.from_text(asdl_file.read())
    transition_system = TurkTransitionSystem(grammar)

    train_set = load_dataset("train", transition_system)
    dev_set = load_dataset("val", transition_system)
    test_set = load_dataset("test", transition_system)

    # get vocab from actions
    vocab = build_dataset_vocab(train_set, transition_system, src_cutoff=2)

    # cache decision using vocab can be done in train
    outputs = (
        (train_set, 'data/turk/train.bin'),
        (dev_set, 'data/turk/dev.bin'),
        (test_set, 'data/turk/test.bin'),
        (vocab, 'data/turk/vocab.bin'),
    )
    for obj, path in outputs:
        # Close each file explicitly so the pickle is flushed to disk.
        with open(path, 'wb') as sink:
            pickle.dump(obj, sink)
def __init__(self, filename):
    """
    Initialise the grammar used to parse each line of the log file, then
    parse the file.

    :param filename: path of the log file to parse.
    """
    # Dispatch table: grammar type -> method that parses a chunk of lines.
    handlers = {}
    handlers['octeon'] = self.parse_line_octeon
    handlers['others'] = self.parse_line_other
    self.parse_line = handlers

    self.log = Logger("parser", "info")  # TODO: not the best implementation yet

    self.__filepath = filename

    # The grammar depends on the file type derived from the file name.
    grammar_type = self.get_grammar_type(filename)
    self.__grammar_type = grammar_type
    self.__current_grammar = Grammar(grammar_type).grammar

    self.parse_file()
def from_xml(xml_path) -> Grammar:
    """
    Read a language grammar from an xml file and return it as a Grammar.

    The root element's children are dispatched by tag: ``terminals``,
    ``non-terminals``, ``transitions`` and ``restrictions``; any other tag is
    ignored.  The grammar name is the root's ``name`` attribute, or
    ``'unknown'`` when it is missing or empty.

    :param xml_path: source xml-file
    :return: Grammar object
    """
    terminals = {''}
    non_terminals = set()
    starting_non_terminal = None
    transitions = {}
    restrictions = {}
    restrictions_names = {'max-word-length'}

    def read_terminals(section):
        for node in section:
            terminals.add(node.attrib['value'])

    def read_non_terminals(section):
        nonlocal starting_non_terminal
        for node in section:
            if node.attrib.get('starting') == 'true':
                starting_non_terminal = node.attrib['value']
            non_terminals.add(node.attrib['value'])

    def read_transitions(section):
        for node in section:
            source = node.attrib['from']
            target_sequence = node.attrib['to']
            transitions.setdefault(source, []).append(target_sequence)

    def read_restrictions(section):
        for node in section:
            # Only known restriction tags are kept.
            if node.tag not in restrictions_names:
                continue
            restrictions[node.tag] = node.attrib['value']

    root = ElementTree.parse(xml_path).getroot()

    handlers = {
        'terminals': read_terminals,
        'non-terminals': read_non_terminals,
        'transitions': read_transitions,
        'restrictions': read_restrictions,
    }
    for section in root:
        handler = handlers.get(section.tag)
        if handler is None:
            continue
        handler(section)

    grammar_name = root.attrib.get('name', '') or 'unknown'

    return Grammar(terminals=terminals,
                   non_terminals=non_terminals,
                   starting_non_terminal=starting_non_terminal,
                   transitions=transitions,
                   restrictions=restrictions,
                   name=grammar_name)
class Regex(object):
    """A regular expression compiled to an automaton.

    The constructor defines the grammar of the regex language, and
    ``_build_automaton`` tokenizes the expression, parses it with that
    grammar, evaluates the resulting tree and converts the result with
    ``nfa_to_deterministic``.  Calling the instance with a string returns
    ``self.automaton.recognize(w)``.

    In the expression text ``@`` marks the following character as a plain
    symbol even if it is one of the operator characters.
    """

    def __init__(self, regex, ignore_white_space=True):
        """Define the regex grammar and build the automaton for ``regex``.

        :param regex: regular expression text.
        :param ignore_white_space: when true, whitespace characters of
            ``regex`` are skipped by the tokenizer.
        """
        self.regex = regex
        self.G = Grammar()

        E = self.G.NonTerminal('E', True)
        T, F, A, X, Y, Z = self.G.NonTerminals('T F A X Y Z')
        pipe, star, opar, cpar, symbol, epsilon, plus, minus, obrack, cbrack, question = self.G.Terminals(
            '| * ( ) symbol ε + - [ ] ?')

        E %= T + X, lambda h, s: s[2], None, lambda h, s: s[1]

        X %= pipe + E, lambda h, s: UnionNode(h[0], s[2])
        X %= self.G.Epsilon, lambda h, s: h[0]

        T %= F + Y, lambda h, s: s[2], None, lambda h, s: s[1]

        Y %= T, lambda h, s: ConcatNode(h[0], s[1])
        Y %= self.G.Epsilon, lambda h, s: h[0]

        F %= A + Z, lambda h, s: s[2], None, lambda h, s: s[1]

        Z %= star, lambda h, s: ClosureNode(h[0])
        Z %= plus, lambda h, s: PositiveClosureNode(h[0])
        Z %= question, lambda h, s: QuestionNode(h[0])
        Z %= self.G.Epsilon, lambda h, s: h[0]

        A %= symbol, lambda h, s: SymbolNode(s[1])
        A %= epsilon, lambda h, s: EpsilonNode(s[1])
        A %= opar + E + cpar, lambda h, s: s[2]

        self.automaton = self._build_automaton(regex, ignore_white_space)

    def _build_automaton(self, regex, ignore_white_space):
        """Tokenize, parse and evaluate ``regex``; return the resulting
        automaton after ``nfa_to_deterministic``."""

        def regex_tokenizer(regex, ignore_white_space):
            """Turn the expression text into a list of tokens ending in EOF."""
            d = {term.Name: term for term in self.G.terminals}
            symbol_term = d['symbol']
            operator_names = ('|', '*', '+', '?', '(', ')', '[', ']', '-', 'ε')
            fixed_tokens = {
                name: Token(name, d[name]) for name in operator_names
            }

            tokens = []
            for i, c in enumerate(regex):
                if c == '@' or (ignore_white_space and c.isspace()):
                    continue
                # `i > 0` matters: at position 0, regex[i - 1] would wrap
                # around to the LAST character of the expression.
                escaped = i > 0 and regex[i - 1] == '@'
                if not escaped and c in fixed_tokens:
                    token = fixed_tokens[c]
                else:
                    token = Token(c, symbol_term)
                tokens.append(token)
            tokens.append(Token('$', self.G.EOF))
            return tokens

        toks = regex_tokenizer(regex, ignore_white_space)
        parser = build_ll1_parser(self.G)
        left_parse = parser(toks)
        tree = evaluate_parse(left_parse, toks)
        nfa = tree.evaluate()
        return nfa_to_deterministic(nfa)

    def __call__(self, w: str):
        """Return ``self.automaton.recognize(w)``."""
        return self.automaton.recognize(w)
class SymbolicClustering(object):
    """Discretize time series into segments, induce a grammar over them and
    derive windows and initial clusters from the rule frequencies."""

    def __init__(self, segmentLength=20, paaSize=5, alphabetSize=3, upperBound=100, lowerBound=-100):
        """Store the discretization settings and create the SAX and Grammar
        helpers; ``paaSize`` is passed to ``SAX`` as ``wordSize``."""
        self.segmentLength = segmentLength
        self.paaSize = paaSize
        self.alphabetSize = alphabetSize
        self.upperBound = upperBound
        self.lowerBound = lowerBound
        self.sax = SAX(wordSize=paaSize, alphabetSize=alphabetSize,
                       lowerBound=lowerBound, upperBound=upperBound, epsilon=1e-6)
        self.grammar = Grammar()
        self.segmentIndexes = []
        self.rule_set = []
        self.tsCount = 0

    def printSegments(self):
        """Print the content of every segment index."""
        print("\nCurrent Segments:")
        for segmentIndex in self.segmentIndexes:
            segmentIndex.printContent()

    def discretize(self, s):
        """
        @description  : discretize a single series using the modified PAA
                        method.
        ---------
        @param  : s -- time series in array format; its length must be a
                  multiple of ``segmentLength``
        -------
        @Returns  : a list of segments which are discretized from the input.
        -------
        @Raises  : SegmentsCanNotBeEquallyDivided when the length of ``s`` is
                   not a multiple of ``segmentLength``
        -------
        """
        n = len(s)
        if n % self.segmentLength != 0:
            raise SegmentsCanNotBeEquallyDivided()

        segments = []
        nSegment = n // self.segmentLength
        for i in range(nSegment):
            start = i * self.segmentLength
            end = start + self.segmentLength
            # The segment indexes are created once, by the first series.
            if self.tsCount == 0:
                self.segmentIndexes.append(SegmentIndex((start, end)))
            (letters, indices) = self.sax.to_letter_rep_ori(s[start:end])
            segment = Segment(s[start:end], letters, indices, self.segmentIndexes[i])
            self.segmentIndexes[i].addSegment(segment)
            segments.append(segment)
        self.tsCount += 1
        return segments

    def grammar_induction(self, segments):
        """
        @description  : train the grammar on the segments and refresh
                        ``self.rule_set``.
        ---------
        @param  : segments -- a list of segments
        -------
        @Returns  : None
        -------
        """
        self.grammar.train_string(segments)
        self.rule_set = self.grammar.get_rule_set()

    def get_frequency_matrix(self):
        """
        @description  : for each segment index, count how many of its
                        segments share the same grammar rule.
        ---------
        @param  :
        -------
        @Returns  : a two-dimensional matrix with one row per segment index
                    and one column per time series.  Each entry is the number
                    of segments in that row that have the same rule as the
                    entry's segment, or 1 when the rule is the grammar's root
                    production.
        -------
        """
        root_production = self.grammar.root_production
        frequencyMatrix = []
        for segmentIndex in self.segmentIndexes:
            rules = [segmentIndex.getSegment(j).getRule()
                     for j in range(self.tsCount)]
            occurrences = {}
            for rule in rules:
                occurrences[rule] = occurrences.get(rule, 0) + 1
            frequencyMatrix.append(
                [1 if rule == root_production else occurrences[rule]
                 for rule in rules])
        return frequencyMatrix

    def cut_window(self, frequencyMatrix):
        """
        @description  : generate windows from the frequencyMatrix. A new
                        window starts at every row that differs from the
                        first row of the current window.
        ---------
        @param  : frequencyMatrix -- a two-dimensional matrix
        -------
        @Returns  : a list of windows
        -------
        """
        start = 0
        windows = []
        for now in range(1, len(frequencyMatrix)):
            if frequencyMatrix[now] != frequencyMatrix[start]:
                windows.append(Window(start, now, self.segmentLength))
                start = now
        windows.append(Window(start, len(frequencyMatrix), self.segmentLength))
        return windows

    def generateInitialClusters(self, startIndex, windows):
        """
        @description  : generate the initial clusters in each window by
                        running the window's initialisation steps in order.
        ---------
        @param  : startIndex -- passed to ``Window.initSubsequences``
                  windows -- the cut windows used to generate clusters
        -------
        @Returns  : the same list of windows, after processing
        -------
        """
        for window in windows:
            window.initSubsequences(startIndex, self)
            window.initClusters(self)
            window.clustersCombination()
            window.clustersBreakingTie()
            window.clustersProcessMiss()
            window.computeAllDistancesAndCentroids()
        return windows
def build_cool_grammar():
    """Build the grammar and the lexer table for the language defined here.

    Declares the non-terminals and terminals, attaches one semantic rule
    (a ``lambda s: ...``) to every production, and pairs each terminal with
    a pattern string in ``table``.

    In the semantic rules ``s[k]`` refers to the k-th symbol of the
    production body, counted from 1 (e.g. ``s[2]`` is ``idx`` in the first
    ``class_def`` production).

    :return: a tuple ``(G, lexer)``.
    """
    G = Grammar()

    # ---- non-terminals ('<program>' is the start symbol) ----
    program = G.NonTerminal('<program>', True)
    class_list, class_def, empty_feature_list, feature_list, meod_def = G.NonTerminals(
        '<class_list> <class_def> <empty_feature_list> <feature_list> <meod_def>'
    )
    attr_def, param_list, param, statement_list = G.NonTerminals(
        '<attr_def> <param_list> <param> <statement_list>')
    statement, var_dec, func_call, args_list = G.NonTerminals(
        '<statement> <var_dec> <func_call> <args_list>')
    exp, typex, term, factor = G.NonTerminals('<exp> <type> <term> <factor>')
    arith, atom = G.NonTerminals('<arith> <atom>')
    args_list_empty, param_list_empty = G.NonTerminals(
        '<args_list_empty> <param_list_empty>')

    # ---- terminals ----
    class_keyword, def_keyword, in_keyword = G.Terminals('class def in')
    coma, period, dot_comma, opar, cpar, obrack, cbrack, plus, minus, star, div, dd = G.Terminals(
        ', . ; ( ) { } + - * / :')
    idx, let, intx, string, num, equal, true, false, boolean, objectx = G.Terminals(
        'id let int string num = true false bool object')
    string_const, void, auto = G.Terminals('string_const void AUTO_TYPE')
    if_, then, else_, assign, new = G.Terminals('if then else assign new')
    gt, lt, ge, le, eq, not_ = G.Terminals('> < >= <= == !')
    while_, do = G.Terminals('while do')

    # ---- program and classes ----
    program %= class_list, lambda s: ProgramNode(s[1])

    class_list %= class_def, lambda s: [s[1]]
    class_list %= class_def + class_list, lambda s: [s[1]] + s[2]

    # class without / with a parent type (the parent is the third argument)
    class_def %= class_keyword + idx + obrack + feature_list + cbrack, lambda s: ClassDef(
        s[2], s[4])
    class_def %= class_keyword + idx + dd + typex + obrack + feature_list + cbrack, lambda s: ClassDef(
        s[2], s[6], s[4])

    # ---- features: methods and attributes ----
    feature_list %= meod_def, lambda s: [s[1]]
    feature_list %= attr_def, lambda s: [s[1]]
    feature_list %= meod_def + feature_list, lambda s: [s[1]] + s[2]
    feature_list %= attr_def + feature_list, lambda s: [s[1]] + s[2]

    meod_def %= def_keyword + idx + opar + param_list_empty + cpar + dd + typex + obrack + statement_list + cbrack, lambda s: MethodDef(
        s[2], s[4], s[7], s[9])

    # attribute without / with an initial expression
    attr_def %= idx + dd + typex + dot_comma, lambda s: AttributeDef(
        s[1], s[3])
    attr_def %= idx + dd + typex + equal + exp + dot_comma, lambda s: AttributeDef(
        s[1], s[3], s[5])

    # ---- parameters ----
    param_list_empty %= param_list, lambda s: s[1]
    param_list_empty %= G.Epsilon, lambda s: []
    param_list %= param, lambda s: [s[1]]
    param_list %= param + coma + param_list, lambda s: [s[1]] + s[3]
    param %= idx + dd + typex, lambda s: Param(s[1], s[3])

    # ---- statements: expressions terminated by ';' ----
    statement_list %= exp + dot_comma, lambda s: [s[1]]
    statement_list %= exp + dot_comma + statement_list, lambda s: [s[1]] + s[3]

    # var_dec %= let + idx + dd + typex + equal + exp, lambda s: VariableDeclaration(s[2],s[4],s[6])
    var_dec %= let + idx + dd + typex + assign + exp + in_keyword + obrack + statement_list + cbrack, lambda s: VariableDeclaration(
        s[2], s[4], s[6], s[9])

    # ---- arithmetic ----
    exp %= arith, lambda s: s[1]
    arith %= arith + plus + term, lambda s: PlusNode(s[1], s[3])
    arith %= arith + minus + term, lambda s: DifNode(s[1], s[3])
    arith %= term, lambda s: s[1]
    term %= term + star + factor, lambda s: MulNode(s[1], s[3])
    term %= term + div + factor, lambda s: DivNode(s[1], s[3])
    term %= factor, lambda s: s[1]
    factor %= opar + arith + cpar, lambda s: s[2]
    factor %= num, lambda s: IntegerConstant(s[1])
    factor %= idx, lambda s: VariableCall(s[1])
    # call on an explicit receiver; the shorter form passes 'self' instead
    factor %= idx + period + idx + opar + args_list_empty + cpar, lambda s: FunCall(
        s[1], s[3], s[5])
    factor %= idx + opar + args_list_empty + cpar, lambda s: FunCall(
        'self', s[1], s[3])

    # ---- other expressions ----
    exp %= var_dec, lambda s: s[1]
    exp %= true, lambda s: TrueConstant()
    exp %= false, lambda s: FalseConstant()
    exp %= string_const, lambda s: StringConstant(s[1])
    exp %= if_ + opar + exp + cpar + then + obrack + exp + cbrack + else_ + obrack + exp + cbrack, lambda s: IfThenElseNode(
        s[3], s[7], s[11])
    exp %= idx + assign + exp, lambda s: AssignNode(s[1], s[3])
    exp %= atom, lambda s: s[1]
    exp %= new + idx + opar + args_list_empty + cpar, lambda s: InstantiateClassNode(
        s[2], s[4])

    # ---- comparisons and negation ----
    atom %= factor + gt + factor, lambda s: GreaterThanNode(s[1], s[3])
    atom %= factor + lt + factor, lambda s: LowerThanNode(s[1], s[3])
    atom %= factor + eq + factor, lambda s: EqualToNode(s[1], s[3])
    atom %= factor + ge + factor, lambda s: GreaterEqualNode(s[1], s[3])
    # NOTE(review): `LowerEqual` has no `Node` suffix, unlike its siblings --
    # confirm the class name.
    atom %= factor + le + factor, lambda s: LowerEqual(s[1], s[3])
    atom %= not_ + factor, lambda s: NotNode(s[2])

    exp %= while_ + opar + exp + cpar + do + obrack + statement_list + cbrack, lambda s: WhileBlockNode(
        s[3], s[7])

    # ---- types ----
    typex %= intx, lambda s: 'int'
    typex %= boolean, lambda s: 'bool'
    typex %= string, lambda s: 'string'
    typex %= objectx, lambda s: 'object'
    typex %= idx, lambda s: s[1]
    typex %= auto, lambda s: 'AUTO_TYPE'
    typex %= void, lambda s: 'void'

    # ---- call arguments ----
    args_list_empty %= args_list, lambda s: s[1]
    args_list_empty %= G.Epsilon, lambda s: []
    args_list %= exp, lambda s: [s[1]]
    args_list %= exp + coma + args_list, lambda s: [s[1]] + s[3]

    # (terminal, pattern) pairs handed to the Lexer.
    # NOTE(review): '@' presumably escapes the following pattern character,
    # and earlier entries presumably win over later ones (keywords are listed
    # before the identifier pattern) -- confirm against the Lexer.
    table = [
        (class_keyword, 'class'),
        (def_keyword, 'def'),
        (in_keyword, 'in'),
        (intx, 'int'),
        (boolean, 'bool'),
        (objectx, 'object'),
        (string, 'string'),
        (true, 'true'),
        (false, 'false'),
        (auto, 'AUTO_TYPE'),
        (if_, 'if'),
        (then, 'then'),
        (else_, 'else'),
        (new, 'new'),
        (while_, 'while'),
        (do, 'do'),
        (coma, ','),
        (period, '.'),
        (dd, ':'),
        (dot_comma, ';'),
        (assign, '<@-'),
        (lt, '@<'),
        (gt, '@>'),
        (ge, '>='),
        (le, '<='),
        (eq, '=='),
        (not_, '@!'),
        (equal, '='),
        (opar, '@('),
        (cpar, '@)'),
        (obrack, '@{'),
        (cbrack, '@}'),
        (plus, '@+'),
        (minus, '@-'),
        (div, '/'),
        (star, '@*'),
        (let, 'let'),
        (idx,
         '(A|a|B|b|C|c|D|d|E|e|F|f|G|g|H|h|I|i|J|j|K|k|L|l|M|m|N|n|O|o|P|p|Q|q|R|r|S|s|T|t|u|U|V|v|W|w|X|x|Y|y|Z|z)+'
         ),
        (num, '0|(1|2|3|4|5|6|7|8|9)(1|2|3|4|5|6|7|8|9|0)*'),
        (string_const,
         "@'(A|a|B|b|C|c|D|d|E|e|F|f|G|g|H|h|I|i|J|j|K|k|L|l|M|m|N|n|O|o|P|p|Q|q|R|r|S|s|T|t|u|U|V|v|W|w|X|x|Y|y|Z|z)+@'"
         )
    ]

    lexer = Lexer(table, G.EOF, ignore_white_space=True)
    return G, lexer
class Parser(object):
    """Parse a log file into a list of per-line dictionaries.

    The constructor picks a grammar from the file name and parses the whole
    file immediately; the results are available through the ``get_*``
    accessors.
    """

    def __init__(self, filename):
        """
        Initialise the grammar used to parse each line of the log file,
        then parse the file.

        :param filename: path of the log file to parse.
        """
        # Function dictionary, something to mimic function pointers
        self.parse_line = {
            'octeon': self.parse_line_octeon,
            'others': self.parse_line_other
        }
        self.log = Logger("parser", "info")  # TODO: not the best implementation yet

        # Initialise variables
        self.__filepath = filename

        # Setup the grammar to use based on the filetype
        self.__grammar_type = self.get_grammar_type(filename)
        self.__current_grammar = Grammar(self.__grammar_type).grammar
        self.parse_file()

    def parse_file(self):
        """
        Read the file, hand its lines to the parsing routine running in
        worker processes, then build the final parsed list.
        """
        self.__manager = mp.Manager()
        self.__out_list = self.__manager.list()
        self.__dict_list, self.__lines, self.__pcap_text = [], [], []
        # One independent row per octeon.  `[[...] * MAX_CORES] * MAX_OCTS`
        # would make every row the same list object, so updating one octeon
        # would overwrite the values of all the others.
        self.__last_ppm_timestamp = [[(-1, 0)] * MAX_CORES
                                     for _ in range(MAX_OCTS)]
        with open(self.__filepath, 'r') as f:
            self.__lines = f.readlines()
        self.__num_lines = len(self.__lines)

        # The whole file is a single chunk: each worker parses from its start
        # index to the END of the file, so several chunks would parse lines
        # more than once.  The floor of 1 keeps the range() step valid for an
        # empty file, which then simply starts no worker.
        self.__chunk_size = max(self.__num_lines, 1)

        # Distribute work to processes in chunks
        worker = self.parse_line[self.__grammar_type]
        processes = [mp.Process(target=worker, args=(index,))
                     for index in range(0, len(self.__lines), self.__chunk_size)]
        for process in processes:
            process.start()
        for process in processes:
            process.join()

        # Cleanup parsed object list
        self.generate_parsed_list()

    def parse_line_octeon(self, index):
        """
        Parse the lines from ``index`` to the end of the file into a list of
        dicts and append ``(index, list)`` to the shared output list.

        NOTE(review): the first line that fails to parse or validate makes
        the method return WITHOUT publishing the lines parsed so far --
        confirm that aborting (rather than skipping the line) is intended.
        """
        temp_list = []
        for offset, line in enumerate(self.__lines[index:]):
            lineno = index + offset
            self.log.logger.debug("[File:%s]Parsing:[line#%d]%s"
                                  % (self.__filepath, lineno, line.strip()))
            try:
                result = self.__current_grammar.parseString(line)
            except ParseException:
                self.log.logger.error("[File:%s]Unrecognized format:[line#%d]%s"
                                      % (self.__filepath, lineno, line.strip()))
                return
            if not result.core:
                self.log.logger.error("[File:%s]Expected [core#] field:[line#%d]%s"
                                      % (self.__filepath, lineno, line.strip()))
                return
            if not (result.linux or result.vxworks):
                self.log.logger.error("[File:%s]Not a linux or vxworks format:[line#%d]%s"
                                      % (self.__filepath, lineno, line.strip()))
                return

            dic = defaultdict(int)
            dic["systime"] = datetime.strptime(" ".join(result.systime), "%b %d %H:%M:%S.%f")
            # Only the linux format carries an octeon number.
            dic['oct'] = result.octeon if result.linux else 0
            dic['core'] = result.core
            if result.timestamp or result.ppm:
                dic['ppm'], dic['timestamp'] = result.ppm, result.timestamp
            else:
                # -1 marks "unknown"; filled in later by generate_parsed_list.
                dic['ppm'] = -1
                dic['timestamp'] = -1
            dic['p_pcap'] = "".join(result.pcap)
            dic['p_line'] = lineno
            temp_list.append(dict(dic))
        self.__out_list.append((index, temp_list))

    def parse_line_other(self, index):
        """
        Parse the lines from ``index`` to the end of the file into a list of
        dicts holding only ``systime`` and ``p_line``, and append
        ``(index, list)`` to the shared output list.

        NOTE(review): as in ``parse_line_octeon``, the first unparsable line
        aborts the method without publishing anything -- confirm.
        """
        temp_list = []
        for offset, line in enumerate(self.__lines[index:]):
            lineno = index + offset
            self.log.logger.debug("[File:%s]Parsing:[line#%d]%s"
                                  % (self.__filepath, lineno, line.strip()))
            try:
                result = self.__current_grammar.parseString(line)
            except ParseException:
                self.log.logger.error("[File:%s]Unrecognized format:[line#%d]%s"
                                      % (self.__filepath, lineno, line.strip()))
                return
            dic = defaultdict(int)
            dic['systime'] = datetime.strptime(" ".join(result.systime), "%b %d %H:%M:%S.%f")
            dic['p_line'] = lineno
            temp_list.append(dict(dic))
        self.__out_list.append((index, temp_list))

    def generate_parsed_list(self):
        """
        Sort the worker results by start index and flatten them into one
        list of dicts.  Every ``systime`` gets the current year.  For the
        octeon grammar, entries without ppm/timestamp inherit the last known
        values for their oct/core, and the pcap text is collected.
        """
        ordered = sorted(self.__out_list, key=lambda x: x[0])
        self.__dict_list = [item for tup in ordered for item in tup[1]]
        for item in self.__dict_list:
            item['systime'] = item['systime'].replace(year=datetime.now().year)
            if self.__grammar_type == "octeon":
                if item['timestamp'] > 0 or item['ppm'] > 0:
                    self.__last_ppm_timestamp[item['oct']][item['core']] = (
                        item['ppm'], item['timestamp'])
                else:
                    item['ppm'], item['timestamp'] = \
                        self.__last_ppm_timestamp[item['oct']][item['core']]
                # Only octeon entries carry a 'p_pcap' key.
                self.__pcap_text.append(item['p_pcap'])

    def get_grammar_type(self, filename):
        """Return "octeon" or "others" based on the stored file path.

        NOTE(review): this compares SETS OF CHARACTERS, so any path that
        contains every character of "log.octData" (in any order) is treated
        as octeon, and the ``filename`` argument itself is not used --
        confirm whether a substring test on ``filename`` was intended.
        """
        if set("log.octData").issubset(set(self.__filepath)):
            return "octeon"
        else:
            return "others"

    def get_pcap_text(self):
        """Return the list of pcap strings collected for octeon files."""
        return self.__pcap_text

    def get_filename(self):
        """Return the file from which the object was created"""
        return self.__filepath

    def get_lines(self):
        """Return a list of lines from the original file"""
        return self.__lines

    def get_len(self):
        """Return length of list of lines from the original file"""
        return self.__num_lines

    def get_dict_list(self):
        """Return the list of dicts created from the lines of the file"""
        return self.__dict_list

    def get_sortable_attributes(self):
        """Return the keys of the first parsed dict that do not start with
        'p_'"""
        attributes = [key for key in self.__dict_list[0].keys()
                      if not str(key).startswith('p_')]
        return attributes

    def print_sorted_list(self, attr):
        """Print the original lines ordered by the parsed attribute ``attr``."""
        for item in sorted(self.__dict_list, key=operator.itemgetter(attr)):
            # Single-argument call form prints the same on Python 2 and 3.
            print(self.__lines[item["p_line"]].strip())
"VP += clitic:Pron[Pronoun_Form=weak!]", # "VP[head=verb] -> verb:Vb", "VP += subj:NP[nr=@ pers=@ cas=Nom pos=0? has_det=T]", "VP += dobj:NP[cas=Acc pos=1?]", "VP += iobj:NP[cas==Dat!]", "VP += pcomp*:PP", "VP += adv*:Adv", # "NP[nr~pl] -> n1:NP[cas=@ pos=0!], cc:C[pos=1!] , n2:NP[cas=@ pos=2!]", # "VP -> v1:VP[pos=0!], cc:C[pos=1!] , v2:VP[pos=2!]", "SP[head=vp] -> part:Part[Type=Subj! pos=0!] , vp:VP[mod=Subj]", "VP += dobj:SP" ] grammar = Grammar([rule_from_string(s) for s in rules_str]) infilename = 'easy.txt.conllu' #'C:\\Users\\ffxvtj\\OneDrive\\Lingv\\Corpus\\ro_rrt-ud-train.conllu' print("Loading file " + infilename) conll: Conll = pyconll.load_from_file(infilename) print("Done loading file") class ProcessConll: def __init__(self, conll, grammar): self.conll = conll self.i = 0 self.parses = [] self.grammar = grammar def next(self):
def test_grammar_get_encoding_length(self):
    """A grammar over the lexicon ['abb', 'bba'] has encoding length 154."""
    two_word_lexicon = Lexicon(['abb', 'bba'], self.feature_table)
    grammar_under_test = Grammar(
        self.feature_table, self.constraint_set, two_word_lexicon
    )
    encoding_length = grammar_under_test.get_encoding_length()
    self.assertEqual(encoding_length, 154)