def _endswith_tagtable_rest_of_line(text):
    """Build a tag table that tags the line ending of every line starting
    with `text` (the end-of-record marker), whatever else is on that line.

    Returns a tuple of tag table entries.  As the entries below are
    written, each one is (tag name or None, command, argument, jump taken
    when the command does not match, jump taken when it matches).

    NOTE(review): the "-> entry N" remarks assume the integer jumps are
    relative to the entry's own index (mxTextTools convention) -- confirm.
    Because of those offsets, entries must not be inserted, removed or
    reordered without recomputing every jump.
    """
    return (
        # 0: Is the current line the end of record marker?
        #    (no -> entry 8, yes -> entry 1)
        (None, TT.Word, text, +8, +1),

        # 1: Read whatever else is on that line (could be nothing)
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # 2-6: Get the end of line and tag it as "end", then go back to
        #      entry 0 to examine the next line.
        ("end", TT.Is, '\n', +1, -2),  # matches '\n'
        (None, TT.Is, '\r', +4, +1),   # no '\r' either -> entry 7 (EOF test)
        ("end", TT.Is, '\n', +1, -4),  # the '\n' of a '\r\n' pair
        # Lone '\r': step back over it, then step forward again so that
        # the '\r' itself is what gets tagged as "end".
        (None, TT.Skip, -1, +1, +1),
        ("end", TT.Skip, +1, -6, -6),

        # 7: Check if EOF (only tests when the end of record line has no \n)
        # Only time this should fail is with a bug in TT.
        ("end", TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),

        # 8: Not the end of record marker, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # 9: Check if EOF
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # 10: Not EOF, so scarf any newlines and try again (-> entry 0)
        (None, TT.AllInSet, TT.set('\r\n'), TT.MatchFail, -10),
    )
def match(self, url, datetime=None):
    """ Return 1 if this Cookie matches the given url, 0 otherwise.

        A Cookie is rejected when it has an expiration date
        (self.expires is not None) that lies before datetime, when
        the URL's path does not start with self.path, or when the
        URL's host does not end with self.domain.

        datetime defaults to the current date/time and is only
        consulted for Cookies that carry an expiration date.

        When the module level _debug flag is set, the reason for a
        rejection is printed.

    """
    target = URL.URL(url)

    # Expiration check; only Cookies with an expiry date can expire
    expiry = self.expires
    if expiry is not None:
        now = datetime
        if now is None:
            now = DateTime.now()
        if expiry < now:
            if _debug:
                print('expired')
            return 0

    # The URL path has to start with the Cookie's path
    path_prefixes = (self.path,)
    if TextTools.prefix(target.path, path_prefixes) is None:
        if _debug:
            print('path does not match')
        return 0

    # The URL host has to end with the Cookie's domain
    domain_suffixes = (self.domain,)
    if TextTools.suffix(target.host, domain_suffixes) is None:
        if _debug:
            print('domain does not match')
        return 0

    return 1
def _startswith_tagtable_rest_of_line(text):
    """Build a tag table that tags, as "begin", every line that starts
    with `text`; the remainder of such a line may be anything.

    The very first line must start with `text`, otherwise the match
    fails.  Returns a tuple of tag table entries.  As the entries below
    are written, each one is (tag name or None, command, argument, jump
    taken when the command does not match, jump taken when it matches).

    NOTE(review): the "-> entry N" remarks assume the integer jumps are
    relative to the entry's own index (mxTextTools convention) -- confirm.
    Because of those offsets, entries must not be inserted, removed or
    reordered without recomputing every jump.
    """
    return (
        # 0: Ensure the text starts with the given word
        ("begin", TT.Word, text, TT.MatchFail, +1),

        # 1: Read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # 2-4: Read the end of line (any form found -> entry 6)
        (None, TT.Is, '\n', +1, +4),  # matches '\n' or
        (None, TT.Is, '\r', +2, +1),  # '\r' followed by
        (None, TT.Is, '\n', +2, +2),  # optional '\n'

        # 5: Check if EOF (allow EOF if no EOL found)
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # 6: Not EOF, so look for the next line starting with text
        #    (found -> entry 1 to consume the rest of that line)
        ("begin", TT.Word, text, +1, -5),

        # 7: Not what I am looking for, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # 8-10: Read the end of line then test the next line (-> entry 6)
        (None, TT.Is, '\n', +1, -2),   # '\n'
        (None, TT.Is, '\r', +2, +1),   # '\r' followed by
        (None, TT.Is, '\n', -4, -4),   # optional '\n'

        # 11: Allow termination at EOF
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),
    )
def match(self, url, datetime=None):
    """ Check the Cookie against the given url and return 1 (match)
        or 0 (no match).

        The checks are done in this order and stop at the first
        failure: expiration (only if self.expires is set; datetime
        defaults to the current date/time), URL path starting with
        self.path, URL host ending with self.domain.

        With the module level _debug flag set, the reason of a
        failed check is printed.

    """
    url = URL.URL(url)
    reason = None

    # Expiration check for Cookies that carry an expiry date
    if self.expires is not None:
        if datetime is None:
            datetime = DateTime.now()
        if self.expires < datetime:
            reason = 'expired'

    # Path check; skipped if the Cookie was already rejected
    if reason is None and \
       TextTools.prefix(url.path, (self.path,)) is None:
        reason = 'path does not match'

    # Domain check; skipped if the Cookie was already rejected
    if reason is None and \
       TextTools.suffix(url.host, (self.domain,)) is None:
        reason = 'domain does not match'

    if reason is None:
        return 1
    if _debug:
        print(reason)
    return 0
def check_assert(text, x, end, tag_words):
    """Positive assertion: test whether tag_words matches text[x:end].

    Returns x + 1 when TT.tag reports a match and x when it does not.
    The extra step forward only signals success; it is meant to be
    removed again later by the caller's table.
    """
    matched, _tags, _stop = TT.tag(text, tag_words, x, end)
    if not matched:
        # failed: stay where we are
        return x
    # succeeded: move forward 1, to be removed later
    return x + 1
def check_assert_not(text, x, end, tagtable):
    """Negative assertion: succeed only if tagtable does NOT match
    text[x:end].

    Returns x + 1 when TT.tag reports no match (the assertion holds)
    and x when the table matches (the assertion fails).  The extra step
    forward only signals success; it is meant to be removed again later.
    """
    matched, _tags, _stop = TT.tag(text, tagtable, x, end)
    if not matched:
        # The table did not match, so the assertion holds
        return x + 1
    # The table matched, so the assertion failed
    return x
def check_assert(text, x, end, tag_words):
    """Test whether tag_words matches text between x and end.

    The result is reported through the returned position: one past x
    on a match (to be removed again later), x itself otherwise.
    """
    ok, _taglist, _pos = TT.tag(text, tag_words, x, end)
    # A successful match moves forward by 1, a failure by nothing
    advance = 0
    if ok:
        advance = 1
    return x + advance
def _parse_elements(s, tagtable, cont_handler, debug_level, attrlookup):
    """Tag the string s with tagtable and forward the result to cont_handler.

    Only the startElement, endElement and characters events are sent;
    startDocument and endDocument are not.

    Returns
      - None if the table matched and consumed all of s,
      - the stop position if the table matched but stopped before the
        end of s,
      - a ParserPositionException instance (returned, not raised) if the
        table did not match.  With debug_level set, its position is taken
        from Generate._position, otherwise from the tagger's stop position.
    """
    if debug_level:
        # Reset the debug position before tagging starts
        import Generate
        Generate._position = 0

    text_length = len(s)
    matched, tags, stop = TextTools.tag(s, tagtable, 0, text_length)

    # Callbacks are made before the match status is examined, so whatever
    # tags were collected are delivered even when the parse failed.
    if isinstance(cont_handler, Dispatch.Dispatcher):
        start_lookup = cont_handler._start_table.get
        save_stack = cont_handler._save_stack
        end_lookup = cont_handler._end_table.get
        _do_dispatch_callback(s, 0, stop, tags,
                              start_lookup, cont_handler, save_stack,
                              end_lookup, attrlookup)
    elif cont_handler.__class__ != handler.ContentHandler:
        # The base ContentHandler is special cased: it does nothing, so
        # skipping it allows measuring the method call overhead.
        _do_callback(s, 0, stop, tags, cont_handler, attrlookup)

    if matched:
        if stop == text_length:
            return None
        return stop

    if debug_level:
        return ParserPositionException(Generate._position)
    return ParserPositionException(stop)
def search_bench(word, text):
    """Time three ways of finding all occurrences of word in text.

    For each of TextTools.TextSearch().findall(), a compiled regular
    expression's findall() and text.count(), the search is repeated once
    per element of Tools.trange(COUNT) and the average time per run (in
    milliseconds) is printed together with the number of hits found.

    word is searched for as a literal string by all three methods; it is
    therefore escaped before being compiled as a regular expression, so
    that words containing regex metacharacters give comparable counts
    instead of a different match set or a compile error.
    """
    iterations = Tools.trange(COUNT)
    print ('Searching for all occurences of %r using ...' % word)

    # mx.TextTools search object (construction is part of the timing)
    t0 = time.time()
    so = TextTools.TextSearch(word)
    for _ in iterations:
        hits = so.findall(text)
    t1 = time.time()
    count = len(hits)
    print (' - mx.TextSearch.TextSearch().findall(): %5.3f ms (%i)' %
           ((t1 - t0) / COUNT * 1000.0, count))

    # Regular expression; escape outside the timed region so only
    # compilation and searching are measured, as before.
    literal_pattern = re.escape(word)
    t0 = time.time()
    so = re.compile(literal_pattern)
    for _ in iterations:
        hits = so.findall(text)
    t1 = time.time()
    count = len(hits)
    print (' - re.compile().findall(): %5.3f ms (%i)' %
           ((t1 - t0) / COUNT * 1000.0, count))

    # Plain string method
    t0 = time.time()
    for _ in iterations:
        count = text.count(word)
    t1 = time.time()
    print (' - text.count(): %5.3f ms (%i)' %
           ((t1 - t0) / COUNT * 1000.0, count))
class _modinit:

    # Character set of "unsafe" URL characters.  The string passed in
    # lists the ASCII letters, the digits and the marks -_.!~*'()
    # (cf. RFC2396).
    # NOTE(review): the trailing 0 is presumably TextTools.set()'s logic
    # flag, which would make the resulting set the complement of the
    # listed characters, i.e. everything that is *not* listed -- confirm.
    unsafe_charset = TextTools.set(\
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789'
        '-_.!~*\'()',0)

    # Modified version of the above set, built from an even shorter
    # character list: of the marks only -_() remain (esp. dots and
    # quotes are not included).  Same trailing 0 as above.
    rpc_unsafe_charset = TextTools.set(\
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789'
        '-_()',0)
def _startswith_tagtable_newline(text):
    """Build a tag table that tags, as "begin", every line consisting of
    `text` immediately followed by an end of line (or by EOF).

    The very first line must start with `text`, otherwise the match
    fails.  Returns a tuple of tag table entries.  As the entries below
    are written, each one is (tag name or None, command, argument, jump
    taken when the command does not match, jump taken when it matches).

    NOTE(review): the "-> entry N" remarks assume the integer jumps are
    relative to the entry's own index (mxTextTools convention) -- confirm.
    Because of those offsets, entries must not be inserted, removed or
    reordered without recomputing every jump.
    """
    return (
        # 0: Ensure the text starts with the given word ...
        ("begin", TT.Word, text, TT.MatchFail, +1),

        # 1-3: ... followed by the end of line (any form found -> entry 5)
        (None, TT.Is, '\n', +1, +4),  # matches '\n' or
        (None, TT.Is, '\r', +2, +1),  # '\r' followed by
        (None, TT.Is, '\n', +2, +2),  # optional '\n'

        # 4: Check if EOF instead of a newline (allow EOF if found)
        # Otherwise, this means the line starts with the text but
        # doesn't have a successive newline.
        # XXX BUG!  When looking for "A\n" should not fail on "AA\n"!
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),

        # 5: Look for the next line starting with text
        #    (found -> entry 1 to require the end of line)
        ("begin", TT.Word, text, +1, -4),

        # 6: Not what I am looking for, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # 7-9: Read the end of line then test the next line (-> entry 5)
        (None, TT.Is, '\n', +1, -2),   # '\n'
        (None, TT.Is, '\r', +2, +1),   # '\r' followed by
        (None, TT.Is, '\n', -4, -4),   # optional '\n'

        # 10: Allow termination at EOF
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),
    )
def _find_begin_positions(text, tagtable):
    """Tag text with tagtable and return element 1 of every resulting tag.

    NOTE(review): element 1 of a tag entry is presumably the start offset
    of the tagged span (mxTextTools taglist format), which would make the
    result the list of positions where a tagged item begins -- confirm.

    Raises ReaderError if the table does not match at all, or if it
    matches but stops before the end of text.
    """
    success, tags, pos = TT.tag(text, tagtable)
    if not success:
        raise ReaderError("invalid format starting with %s" %
                          repr(text[:50]))
    if pos != len(text):
        # Call form of raise, same as above; the old
        # "raise Class, message" statement is equivalent but only valid
        # in Python 2.
        raise ReaderError(
            "could not parse to end of text (ended at %d of %d)" %
            (pos, len(text)))
    return [tag[1] for tag in tags]
def _endswith_tagtable_newline(text):
    """Build a tag table that tags the line ending of every line
    consisting of `text` (the end-of-record marker) immediately followed
    by an end of line.

    Returns a tuple of tag table entries.  As the entries below are
    written, each one is (tag name or None, command, argument, jump taken
    when the command does not match, jump taken when it matches).

    NOTE(review): the "-> entry N" remarks assume the integer jumps are
    relative to the entry's own index (mxTextTools convention) -- confirm.
    Because of those offsets, entries must not be inserted, removed or
    reordered without recomputing every jump.
    """
    return (
        # 0: Is the current line the end of record marker?
        #    (no -> entry 6, yes -> entry 1)
        (None, TT.Word, text, +6, +1),

        # 1-5: Make sure it ends the line; the end of line is tagged as
        #      "end" and scanning restarts at entry 0.
        ("end", TT.Is, '\n', +1, -1),  # matches '\n'
        (None, TT.Is, '\r', +4, +1),   # no end of line here -> entry 6
        ("end", TT.Is, '\n', +1, -3),  # the '\n' of a '\r\n' pair
        # Lone '\r': step back over it, then step forward again so that
        # the '\r' itself is what gets tagged as "end".
        (None, TT.Skip, -1, +1, +1),
        ("end", TT.Skip, +1, -5, -5),

        # 6: Not the end of record marker, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # 7: Check if EOF
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # 8: Not EOF, so scarf any newlines (then -> entry 0)
        (None, TT.AllInSet, TT.set('\r\n'), TT.MatchFail, -8),
    )
def call(self, text, x, end):
    """Match callback, called by 'TextTools.Call' to detect a match.

    The full match is done here and its taglist is stored in
    self.taglist for later use.  The tag table that gets run is
    self.tagtable repeated as many times as self._get_ranges() says;
    the minimum and maximum counts have to be identical.

    Returns pos + 1 on success, where pos is where the match stopped,
    and x on failure (the +1/-1 trick: a repeat count of 0 is allowed,
    so success must always move forward; the extra step is skipped back
    later).  self.taglist is set to None on failure.
    """
    low, high = self._get_ranges()
    assert low == high, \
           "cannot have different sizes: %s %s" % (low, high)

    repeated_table = self.tagtable * low
    status, found, stop = TT.tag(text, repeated_table, x, end)

    if status != 1:
        self.taglist = None
        return x

    # Store the taglist for later use
    self.taglist = found
    return stop + 1  # +1 because {0} is allowed; Skip -1 later
def generate_dot(expression, genstate):
    """Return the tag table for the "dot" expression.

    The table is a one-entry list: an untagged TT.IsInSet test against
    the inverted set of '\\n'.  Neither expression nor genstate is used.
    """
    not_newline = TT.invset('\n')
    entry = (None, TT.IsInSet, not_newline)
    return [entry]
def normalize(s):
    """Return s with the pieces produced by TT.splitlines(s) joined by "\\n".

    NOTE(review): TT.splitlines presumably splits on every line ending
    convention and drops the terminators, which would make this a
    line-ending normalizer -- confirm.
    """
    # str.join replaces the deprecated string.join module function;
    # the result is the same.
    return "\n".join(TT.splitlines(s))
def generate_dot(expression, genstate):
    """Build the tag table used for the "dot" expression.

    The result is a new list holding a single untagged entry that tests
    with TT.IsInSet against TT.invset('\\n').  The arguments are not
    looked at.
    """
    table = []
    table.append((None, TT.IsInSet, TT.invset('\n')))
    return table
fake = self.text + "\n" reader = StartsWith(self.infile, self.text, self.sizehint, fake + self.lookahead) rec = reader.next() rec = rec[len(fake):] # remove the fake data self.infile, self.lookahead = reader.remainder() self.found = 1 return rec def remainder(self): return self.infile, self.lookahead # Tag the last byte of every newline _tag_lines_tagtable = ( # Skip non-newline characters (None, TT.AllInSet, TT.invset('\r\n'), +1, +1), # Check if newline ("newline", TT.Is, '\n', +1, -1), # can be '\n' (None, TT.Is, '\r', +3, +1), # or start a '\r' followed by .. ("newline", TT.Is, '\n', +1, -3), # .. an optional '\n' ("newline", TT.Skip, 0, -4, -4), # get here with just an '\r' (None, TT.EOF, TT.Here, -5, TT.MatchOk), # stop at end of text ) class CountLines(RecordReader): """Read a specified (fixed) number of lines""" def __init__(self, infile, count, sizehint = SIZEHINT, lookahead = ""): assert count > 0, "CountLines reader must read at least one line" assert lookahead > 0, "Must read at least a character at a time"