def _search(self, string, tokstream):
    self.begin = 0
    self.end = 0
    n = len(tokstream)
    tracer = TokenTracer(self.langlet, self.symbol)
    initial = tracer.selectables()
    i = 0
    while i < n:
        tok = tokstream[i]
        if tok[0] in initial:
            selection = []
            K = None    # offset of the last position where the tracer saw FIN
            for j, T in enumerate(tokstream[i:]):
                if not self.accept_token(T):
                    continue
                try:
                    selection = tracer.select(T[0])
                except NonSelectableError:
                    if K is not None:
                        # longest match ends at offset K; build a match object
                        stream = tokstream[i:i+K+1]
                        if self.condition(stream):
                            m = CMatchObject(string)
                            first = stream[0]
                            last = stream[-1]
                            m.begin = get_index(string, first[2]) + first[-1][0] + 1
                            m.end = get_index(string, last[2]) + last[-1][1] + 1
                            m.matched = string[m.begin:m.end]
                            m.tokstream = TokenStream(stream)
                            m.tokpos = i
                            return m
                    break
                if FIN in selection:
                    K = j
            # restart tracing at the next stream position
            tracer = TokenTracer(self.langlet, self.symbol)
        i += 1
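# A minimal sketch of the scanning idiom _search builds on: step a
# TokenTracer through a token stream, one nid at a time, until it refuses
# a token. Assumes the langscape package from this repo; `langlet`,
# `symbol` and `tokstream` are as used elsewhere in this file, and
# `scan_prefix` is an illustrative helper, not part of the original code.
from langscape.trail.tokentracer import TokenTracer

def scan_prefix(langlet, symbol, tokstream):
    """Return the length of the longest prefix of `tokstream` that the
    tracer accepts when tracing `symbol`."""
    tt = TokenTracer(langlet, symbol)
    for i, tok in enumerate(tokstream):
        if tok[0] not in tt.selectables():
            return i            # tracer refuses this token nid
        tt.select(tok[0])       # advance the tracer by one token
    return len(tokstream)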
def _check_gene(self, gene):
    tr = TokenTracer(self.langlet)
    try:
        res, idx = tr.check(gene)
    except (KeyError, TypeError):
        print gene
        raise
    return res
def _lookahead(self, tokstream, S, sym):
    tokstream = tokstream.clone()
    tracer_data = {}
    tts = []
    # run one tracer per alternative in S and let them race on the stream
    for s in S:
        if is_symbol(s):
            tt = TokenTracer(self.langlet, s)
        else:
            # jump to state with nid = s within the current NFA
            states = self._get_states(s, sym)
            if not states:
                return -1
            tt = TokenTracer(self.langlet, sym,
                             jump_to_state=states,
                             without_expansion=False)
        tts.append(tt)
        tracer_data[tt] = (s, -1)
    n = len(tokstream)
    p = tokstream.position
    m = -1      # last stream position at which some tracer reached FIN
    while p < n:
        nid = tokstream[p][0]
        if nid == INTRON_NID:
            p += 1
            continue
        removable = []
        for tt in tts:
            selection = tt.selectables()
            if FIN in selection:
                s, _ = tracer_data[tt]
                tracer_data[tt] = (s, p)
                m = p
            if nid not in selection:
                removable.append(tt)
            else:
                tt.select(nid)
        for tt in removable:
            tts.remove(tt)
        if len(tts) == 1:
            s, q = tracer_data[tts[0]]
            if q >= 0:
                return (tracer_data[tts[0]][0], p, None)
        elif len(tts) == 0:
            if p > m:
                ttcancel = removable[-1]
                s, _ = tracer_data[ttcancel]
                self._last_scan_point = (s, p, tokstream[p], ttcancel.selectables())
            if m >= 0:
                for tt, (s, i) in tracer_data.items():
                    if i == m:
                        return (s, p, None)
            selectable = set()
            for tt in removable:
                selectable.update(tt.selectables())
            return (-1, p, selectable)
        p += 1
    return (-1, p, set())
def gen_token_string(self, nid):
    tracer = TokenTracer(self.langlet, nid, "lex")
    selection = list(tracer.selectables())
    S = []
    while True:
        n = len(selection)
        if selection == [FIN]:
            return ''.join(S)
        if len(S) > 20:
            # string grew too long -- restart with a fresh tracer
            return self.gen_token_string(nid)
        while True:
            m = random.randrange(0, n)
            t = selection[m]
            if t is FIN:
                continue
            try:
                chars = list(self.lexer_terminal[t])
            except KeyError:
                return ''
            if not chars:
                # no fixed character set: draw a random printable character
                # that cannot be confused with any alternative selection
                other_chars = reduce(lambda S, T: S.union(T),
                                     [self.lexer_terminal.get(r, set())
                                      for r in selection if r != t], set())
                while True:
                    c = random_printable()
                    if c == '\\':
                        continue
                    if c not in other_chars:
                        break
                S.append(c)
            else:
                c = chars[random.randrange(0, len(chars))]
                S.append(c)
            selection = list(tracer.select(t))
            break
        if len(S) >= self.stdlen:
            if FIN in selection:
                if S[0] in ('"', "'"):
                    if len(S) >= 4:
                        return ''.join(S)
                if S[0] in string.digits:
                    if len(S) >= 4:
                        return ''.join(S)
                if random.randrange(0, 2) == 0:
                    return ''.join(S)
def delete(self, g):
    visited = set()
    while True:
        n = len(g) - 1
        if len(visited) >= n:
            return g
        gene = g[:]
        k = random.randrange(0, n)
        visited.add(k)
        T = gene[k+1][0]
        del gene[k+1]
        n -= 1
        R = self.get_right_par(T + SYMBOL_OFFSET)  # TODO: consider 'extended braces'
        loc = []
        if R:
            # the deleted token opens a bracket pair: collect candidate
            # closing tokens to the right which may have to go as well
            R -= SYMBOL_OFFSET
            for i, tok in enumerate(gene[k+1:]):
                if tok[0] == R:
                    loc.append(i + k + 1)
        else:
            L = self.get_left_par(T + SYMBOL_OFFSET)
            if L:
                L -= SYMBOL_OFFSET
                for i, tok in enumerate(gene[:k]):
                    if tok[0] == L:
                        loc.append(i)
        if loc:
            while loc:
                m = loc[random.randrange(0, len(loc))]
                backup = gene[m]
                del gene[m]
                tr = TokenTracer(self.langlet)
                res, idx = tr.check(gene)
                if res == True:
                    return gene
                else:
                    loc.remove(m)
                    gene.insert(m, backup)
            continue
        else:
            if self._check_gene(gene):
                return gene
            else:
                continue
def insert(self, g):
    trials = set()
    while True:
        gene = g[:]
        n = len(gene) - 1
        k, T, tracer = self._seek_random_item(gene, trials)
        if T is None:
            continue
        value = self.gen_token_string(T + SYMBOL_OFFSET)
        gene.insert(k+1, [T, value])
        n += 1
        R = self.get_right_par(T + SYMBOL_OFFSET)  # TODO: consider 'extended braces'
        if R:
            # the inserted token opens a bracket pair: find positions where
            # the matching closing token is selectable
            R -= SYMBOL_OFFSET
            i = 1
            loc = []
            while k + i < n:
                try:
                    selection = tracer.select(gene[k+i][0])
                except NonSelectableError:
                    break
                if R in selection:
                    loc.append(k + i)
                i += 1
            if loc:
                value = self.gen_token_string(R + SYMBOL_OFFSET)
                while loc:
                    m = loc[random.randrange(0, len(loc))]
                    gene.insert(m+1, [R, value])
                    tr = TokenTracer(self.langlet)
                    res, idx = tr.check(gene)
                    if res == True:
                        return gene
                    else:
                        loc.remove(m)
                        del gene[m+1]  # undo the failed insertion before the next trial
                continue
            else:
                continue
        else:
            if self._check_gene(gene):
                return gene
            else:
                continue
def _repair(self, tokenstream):
    '''
    Maybe the tokenstream needs some repair? Insert constant token, when needed.
    '''
    n = len(tokenstream)
    tt = TokenTracer(self.langlet, start=self.start_symbol)
    selectables = tt.selectables()
    repaired = []
    i = 0
    while i < n:
        tok = tokenstream[i]
        # print tok
        if tok[0] in selectables:
            repaired.append(tok)
            selectables = tt.select(tok[0])
        else:
            # find a const token T which can be inserted between token[i] and token[i+1]
            S = []
            for s in selectables:
                if s in self.constants:
                    _tt = tt.clone()
                    _selectables = _tt.select(s)
                    if i == n - 1:
                        if FIN in _selectables:
                            repaired.append([s, self.constants[s]])
                            break
                    else:
                        T = tokenstream[i+1]
                        if T[0] in _selectables:
                            S.append(s)
                            selectables = _selectables
            else:
                if S == []:
                    if tok[1].strip() == "":
                        # forgotten a linebreak?
                        i += 1
                        continue
                    # TODO: replace this by an expressive error message
                    self.compute_syntax_error(tokenstream, i)
                elif len(S) == 1:
                    selectables = _selectables
                    repaired.append([s, self.constants[s]])
        i += 1
    return repaired
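# The clone-and-probe idiom used by _repair, as a standalone sketch: a
# candidate constant token is tried on a *clone* so a failed probe leaves
# the main tracer's state untouched (an illustrative helper, not part of
# the original class; `tt` is a TokenTracer as above):
def probe_constant(tt, const_nid, next_nid):
    """True if selecting `const_nid` still lets `next_nid` follow."""
    probe = tt.clone()               # fork the tracer state
    return next_nid in probe.select(const_nid)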
def _seek_random_item(self, gene, trials):
    n = len(gene) - 1
    if len(trials) == n:
        trials.clear()
    # draw a position not tried before; k == -1 addresses the position
    # in front of the first token
    while True:
        k = random.randrange(-1, n)
        if k not in trials:
            break
    trials.add(k)
    tracer = TokenTracer(self.langlet)
    for i, tok in enumerate(gene):
        if i <= k:
            tracer.select(tok[0])
        else:
            break
    selection = list(tracer.selectables())
    m = random.randrange(0, len(selection))
    T = selection[m]
    return k, T, tracer
def run(self, start=None, maxlen=3, exclude=()):
    ttracer = TokenTracer(self.langlet, start=start)

    def create_trace(ttracer, selection, L, n):
        # None marks the end of a complete trace within a selection
        R = []
        if n == 0:
            if None in selection:
                return [L]
            else:
                return []
        for s in selection:
            if s is None:
                R.append(L)
            elif s not in exclude:
                subtracer = ttracer.clone()
                subselect = subtracer.select(s)
                R += create_trace(subtracer, subselect, L + [s], n - 1)
        return R

    traces = create_trace(ttracer, ttracer.selectables(), [], maxlen)
    return traces
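# Hedged usage sketch for run(): assuming the method lives on a generator
# class holding `self.langlet` (the class name below is illustrative, not
# part of the original code), this enumerates every token-nid trace of
# length <= 3 through the grammar:
#
#   gen = TraceGenerator(langlet)
#   for trace in gen.run(maxlen=3, exclude=(langlet.token.STRING,)):
#       print trace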
def subst(self, g):
    trials = set()
    n = len(g) - 1
    while True:
        gene = g[:]
        k = random.randrange(-1, n)
        # advance a fresh tracer up to position k
        tracer = TokenTracer(self.langlet)
        for i, tok in enumerate(gene):
            if i <= k:
                tracer.select(tok[0])
            else:
                break
        selection = list(tracer.selectables())
        while k + 1 < n:
            if len(selection) == 1:
                # no choice at this position -- move on
                k += 1
                selection = list(tracer.select(gene[k][0]))
                continue
            while selection:
                m = random.randrange(0, len(selection))
                T = selection[m]
                selection.remove(T)
                if T is None:
                    continue
                value = self.gen_token_string(T + SYMBOL_OFFSET)
                backup = gene[k+1]
                if backup[1] == value:
                    continue
                gene[k+1] = [T, value]
                tr = TokenTracer(self.langlet)
                try:
                    res, idx = tr.check(gene)
                except (KeyError, TypeError):
                    print gene
                    raise
                if res == True:
                    return gene
                else:
                    gene[k+1] = backup
            k += 1
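# delete(), insert() and subst() above share one pattern: mutate a copy of
# the gene, then accept it only if a fresh TokenTracer still checks it.
# A minimal sketch of that generate-and-test step (`keep_if_valid` is an
# illustrative helper, not part of the original module; TokenTracer as
# imported elsewhere in this file):
def keep_if_valid(langlet, gene, candidate):
    """Return `candidate` if it still parses, else fall back to `gene`."""
    res, idx = TokenTracer(langlet).check(candidate)
    return candidate if res == True else gene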
# toy NFAs used to exercise the TokenTracer on a tiny expression grammar
langlet.parse_nfa.start_symbols = (E,)

nfa_E = ["E: a '+' E | a '*' E | 'a'",
         (E, 0, E),
         {(E, 0, E): [(a, 1, E), (a, 2, E), (a, 3, E)],
          (a, 1, E): [(PLUS, 4, E)],
          (a, 2, E): [(MUL, 5, E)],
          (PLUS, 4, E): [(E, 6, E)],
          (MUL, 5, E): [(E, 7, E)],
          (a, 3, E): [(FIN, FEX, E)],
          (E, 6, E): [(FIN, FEX, E)],
          (E, 7, E): [(FIN, FEX, E)]}]

nfa_E2 = ["E: a E",
          (E, 0, E),
          {(E, 0, E): [(a, 1, E)],
           (a, 1, E): [(E, 2, E)],
           (E, 2, E): [(FIN, FEX, E)]}]

langlet.parse_nfa.nfas[E] = nfa_E2
tt = TokenTracer(langlet)
tt.selectables()
tt.select(1)
tt.select(1)
tt.select(1)
tt.select(1)
tt.select(1)
def random_rule(stoplen):
    # some rules to constrain 'interesting' cases
    #
    # 1. Avoid double parens (( ... )) or double square braces [[ ... ]]
    # 2. Avoid use of STRING
    # 3. Avoid sequences of NAME longer than 2, i.e. NAME NAME NAME
    trace = []
    ttracer = TokenTracer(ls_grammar, start=ls_grammar.symbol.rhs)
    STRING = ls_grammar.token.STRING
    NAME = ls_grammar.token.NAME
    LPAR = ls_grammar.token.LPAR
    RPAR = ls_grammar.token.RPAR
    LSQB = ls_grammar.token.LSQB
    RSQB = ls_grammar.token.RSQB
    selection = list(ttracer.selectables())
    while True:
        # print len(trace), selection
        if len(trace) > stoplen:
            if None in selection:
                return trace
            elif RSQB in selection:
                trace.append(RSQB)
                selection = ttracer.select(RSQB)
                continue
            elif RPAR in selection:
                trace.append(RPAR)
                selection = ttracer.select(RPAR)
                continue
        while selection:
            k = random.randrange(len(selection))
            item = selection[k]
            selection.remove(item)
            if item is None:
                continue
            if item == STRING:
                continue
            elif item in (NAME, LPAR, LSQB):
                if len(trace) >= 2:
                    if trace[-1] == trace[-2] == item:
                        continue
                    if item in (LPAR, LSQB):
                        if trace[-2] in (LPAR, LSQB) and trace[-1] in (LPAR, LSQB):
                            continue
            elif item in (RSQB, RPAR):
                if trace and trace[-1] == item:
                    if item == RSQB:
                        LEFT = LSQB
                    else:
                        LEFT = LPAR
                    RIGHT = item
                    m = len(trace) - 2
                    double = False
                    level = -2
                    while m:
                        if trace[m] == RIGHT:
                            level -= 1
                        elif trace[m] == LEFT:
                            level += 1
                            if level == 0:
                                if trace[m+1] == LEFT:
                                    double = True
                                break
                        m -= 1
                    if double:
                        continue
            trace.append(item)
            selection = list(ttracer.select(item))
            break
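# Standalone restatement of the backward scan at the end of random_rule
# (illustrative, not part of the original script): would appending `right`
# to `trace` close a doubled pair like (( ... )) or [[ ... ]]?
def would_double_wrap(trace, left, right):
    if not (trace and trace[-1] == right):
        return False
    level = -2                    # the trailing `right` plus the candidate
    m = len(trace) - 2
    while m:                      # index 0 is never examined, as in the original
        if trace[m] == right:
            level -= 1
        elif trace[m] == left:
            level += 1
            if level == 0:        # found the `left` matching the candidate
                return trace[m+1] == left
        m -= 1
    return False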
if __name__ == '__main__':
    import langscape
    from langscape.trail.tokentracer import TokenTracer

    python = langscape.load_langlet("python")
    tracer = TokenTracer(python, python.lex_symbol.Single3, "lex")
    tokgen = TokenGenerator(python)
    for i in range(100):
        s = tokgen.gen_token_string(python.lex_symbol.NAME)
        print s
        # print "NUM", "%-8s %s" % (s, eval(s))