Python TextTools Examples, mx.TextTools Python Examples

Example #1

0

Show file

File: RecordReader.py Project: manucorreia/biopython

def _endswith_tagtable_rest_of_line(text):
    """Build a tag table that tags the end of every line starting with `text`.

    Each entry has the mxTextTools layout
    (tagobj, command, argument, jump-if-no-match, jump-if-match).
    The signed jumps are relative to the entry's own position in the
    tuple, so adding or removing an entry means re-counting them.

    The table works line by line.  If a line begins with `text`, the
    rest of that line is consumed and its terminator (LF, CR LF or a
    lone CR) is tagged "end".  Any other line is skipped together with
    the newline characters that follow it.  Reaching end of input ends
    the match successfully.
    """
    return (
        # Is the current line the end of record marker?
        # If not, jump 8 entries ahead to the "read to the end of line"
        # entry for ordinary lines.
        (None, TT.Word, text, +8, +1),

        # Read whatever else is on that line (could be nothing)
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),
 
        # Get the end of line; every successful branch jumps back to
        # the first entry to examine the next line.
        ("end", TT.Is, '\n', +1, -2),  # a lone '\n': tag it, restart
        (None, TT.Is, '\r', +4, +1),   # no '\r' either: go to the EOF check
        ("end", TT.Is, '\n', +1, -4),  # '\r\n': tag the '\n', restart
        (None, TT.Skip, -1, +1, +1),   # lone '\r': step back onto it ...
        ("end", TT.Skip, +1, -6, -6),  # ... and skip it again, tagged "end"
 
        # Check if EOF (only tests when the end of record line has no \n)
        # Only time this should fail is with a bug in TT.
        ("end", TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),
        
        # Not the end of record marker, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),
 
        # Check if EOF
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),
 
        # Not EOF, so scarf any newlines and try again
        # (the -10 jump returns to the first entry of the table)
        (None, TT.AllInSet, TT.set('\r\n'), TT.MatchFail, -10),
        )

Example #2

0

Show file

    def match(self, url, datetime=None):
        """ Check whether this Cookie applies to the given url.

            Returns 1 if the Cookie has not expired, the url's path
            starts with the Cookie's path and the url's host ends
            with the Cookie's domain. Returns 0 otherwise.

            datetime is the reference time for the expiration test;
            it is only consulted when the Cookie has an expiration
            date (.expires is not None) and defaults to the current
            date/time.

        """
        url = URL.URL(url)

        # Expiration only applies to Cookies carrying an expiry date
        if self.expires is not None:
            now = datetime
            if now is None:
                now = DateTime.now()
            if self.expires < now:
                if _debug:
                    print('expired')
                return 0

        # The Cookie's path must be a prefix of the requested path
        path_hit = TextTools.prefix(url.path, (self.path, ))
        if path_hit is None:
            if _debug:
                print('path does not match')
            return 0

        # The Cookie's domain must be a suffix of the requested host
        domain_hit = TextTools.suffix(url.host, (self.domain, ))
        if domain_hit is None:
            if _debug:
                print('domain does not match')
            return 0

        return 1

Example #3

0

Show file

File: RecordReader.py Project: manucorreia/biopython

def _startswith_tagtable_rest_of_line(text):
    """Build a tag table that tags every line-initial occurrence of `text`.

    Each entry has the mxTextTools layout
    (tagobj, command, argument, jump-if-no-match, jump-if-match), with
    jumps counted relative to the entry's own position in the tuple.

    The input must begin with `text`, otherwise the match fails at
    once.  After that the table walks the input line by line: a line
    starting with `text` gets a "begin" tag on that word, any other
    line is skipped.  Reaching end of input ends the match
    successfully.
    """
    return (
        # Ensure the text starts with the given word
        ("begin", TT.Word, text, TT.MatchFail, +1),

        # Read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Read the end of line; each successful branch lands on the
        # "look for the next line" entry below the EOF check.
        (None, TT.Is, '\n', +1, +4),  # matches '\n' or
        (None, TT.Is, '\r', +2, +1),  # '\r' followed by
        (None, TT.Is, '\n', +2, +2),  # optional '\n'

        # Check if EOF (allow EOF if no EOL found)
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # Not EOF, so look for the next line starting with text
        # (on a match, jump back 5 entries to read the rest of the line)
        ("begin", TT.Word, text, +1, -5),

        # Not what I am looking for, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Read the end of line then test the next line
        # (the backward jumps all target the "begin" test above)
        (None, TT.Is, '\n', +1, -2),  # '\n'
        (None, TT.Is, '\r', +2, +1),  # '\r' followed by
        (None, TT.Is, '\n', -4, -4),  # optional '\n'
        # Allow termination at EOF
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),    
        )

Example #4

0

Show file

File: Cookie.py Project: fxia22/ASM_xf

    def match(self, url, datetime=None):

        """ Tell whether the Cookie should be used for the given url.

            The result is 1 when all of the following hold, 0 as
            soon as one of them does not:

            - the Cookie has no expiration date, or that date is not
              earlier than datetime (the current date/time when
              datetime is None)
            - the url's path begins with the Cookie's path
            - the url's host ends with the Cookie's domain

        """
        url = URL.URL(url)

        if self.expires is not None:
            # Temporary Cookie: compare against the reference time
            reference = datetime
            if reference is None:
                reference = DateTime.now()
            if self.expires < reference:
                if _debug:
                    print('expired')
                return 0

        matched_path = TextTools.prefix(url.path, (self.path,))
        if matched_path is None:
            if _debug:
                print('path does not match')
            return 0

        matched_domain = TextTools.suffix(url.host, (self.domain,))
        if matched_domain is None:
            if _debug:
                print('domain does not match')
            return 0

        return 1

Example #5

0

Show file

File: Generate.py Project: andyoberlin/biopython

def check_assert(text, x, end, tag_words):
    """Positive lookahead test: does `tag_words` match `text[x:end]`?

    Returns x + 1 when the tag table matches and x when it does not.
    The match position itself is discarded; the original author notes
    that the extra step forward is removed again later.
    """
    matched, _tags, _stop = TT.tag(text, tag_words, x, end)
    if not matched:
        # No match: report the unchanged position
        return x
    # Match: signal success by moving one position forward
    return x + 1

Example #6

0

Show file

File: Generate.py Project: andyoberlin/biopython

def check_assert_not(text, x, end, tagtable):
    """Negative lookahead test: succeed when `tagtable` does NOT match.

    Returns x + 1 when the tag table fails to match `text[x:end]` and
    x when it matches.  The original author notes that the extra step
    forward is removed again later.
    """
    matched, _tags, _stop = TT.tag(text, tagtable, x, end)
    if not matched:
        # The pattern is absent, so the assertion holds: step forward
        return x + 1
    # The pattern is present, so the assertion fails: stay put
    return x

Example #7

0

Show file

File: Generate.py Project: manucorreia/biopython

def check_assert(text, x, end, tag_words):
    """Report whether `tag_words` matches `text` between x and end.

    The answer is encoded in the returned position: x + 1 means the
    tag table matched, x means it did not.  According to the original
    comments the one-step advance is taken back later.
    """
    outcome = TT.tag(text, tag_words, x, end)
    success, _taglist, _pos = outcome
    if success:
        return x + 1
    return x

Example #8

0

Show file

File: Parser.py Project: manucorreia/biopython

def _parse_elements(s, tagtable, cont_handler, debug_level, attrlookup):
    """Tag `s` with `tagtable` and forward the tags to `cont_handler`.

    Only the startElement, endElement and characters events are sent;
    startDocument and endDocument are not.

    Returns None when the whole string was tagged, the stop position
    when tagging succeeded but ended before the end of `s`, and a
    ParserPositionException instance (returned, not raised) when
    tagging failed.
    """
    if debug_level:
        import Generate
        # Reset the position tracker; it is read back below on failure
        Generate._position = 0

    length = len(s)
    result, taglist, pos = TextTools.tag(s, tagtable, 0, length)

    # Forward whatever was tagged.  This happens before `result` is
    # inspected, so tags collected ahead of a failure are still sent.
    if isinstance(cont_handler, Dispatch.Dispatcher):
        # Dispatchers get the dedicated callback with their own tables
        start_lookup = cont_handler._start_table.get
        save_stack = cont_handler._save_stack
        end_lookup = cont_handler._end_table.get
        _do_dispatch_callback(s, 0, pos, taglist, start_lookup,
                              cont_handler, save_stack, end_lookup,
                              attrlookup)
    elif cont_handler.__class__ != handler.ContentHandler:
        # An exact handler.ContentHandler is skipped on purpose: per
        # the original author it does nothing, and skipping it allows
        # the method call overhead to be measured.
        _do_callback(s, 0, pos, taglist, cont_handler, attrlookup)

    if result:
        if pos == length:
            return None
        return pos

    # Tagging failed: report where
    if debug_level:
        return ParserPositionException(Generate._position)
    return ParserPositionException(pos)

Example #9

0

Show file

File: Parser.py Project: andyoberlin/biopython

def _parse_elements(s, tagtable, cont_handler, debug_level, attrlookup):
    """Run `tagtable` over `s` and emit ContentHandler events for the tags.

    The events sent are startElement, endElement and characters;
    startDocument and endDocument are left to the caller.

    Return value:
      - None if tagging succeeded and consumed all of `s`
      - the end position if tagging succeeded but stopped early
      - a ParserPositionException instance if tagging failed (the
        exception object is returned, not raised)
    """
    if debug_level:
        import Generate
        Generate._position = 0  # read again below if tagging fails

    result, taglist, pos = TextTools.tag(s, tagtable, 0, len(s))

    # Callbacks are issued before `result` is checked, so the handler
    # also receives tags that were produced ahead of a failure.
    is_dispatcher = isinstance(cont_handler, Dispatch.Dispatcher)
    if is_dispatcher:
        _do_dispatch_callback(
            s, 0, pos, taglist,
            cont_handler._start_table.get,
            cont_handler,
            cont_handler._save_stack,
            cont_handler._end_table.get,
            attrlookup)
    elif cont_handler.__class__ != handler.ContentHandler:
        # The bare handler.ContentHandler gets no callbacks: the
        # original author special-cased it because it does nothing,
        # which makes the method call overhead measurable.
        _do_callback(s, 0, pos, taglist, cont_handler, attrlookup)

    if not result:
        failure_pos = pos
        if debug_level:
            failure_pos = Generate._position
        return ParserPositionException(failure_pos)

    if pos != len(s):
        return pos
    return None

Example #10

0

Show file

def search_bench(word, text):
    """Time three ways of finding `word` in `text` and print the results.

    Each approach is run once per item of Tools.trange(COUNT); the
    printed figure is the average wall-clock time per run in
    milliseconds, followed by the number of hits of the last run.

    NOTE(review): the `re` variant compiles `word` as a regular
    expression, not as a literal string, so its hit count can differ
    from the other two when `word` contains regex metacharacters.
    """
    iterations = Tools.trange(COUNT)
    print('Searching for all occurences of %r using ...' % word)

    # 1. mx.TextTools search object (construction is part of the timing)
    started = time.time()
    searcher = TextTools.TextSearch(word)
    for _ in iterations:
        hits = searcher.findall(text)
    stopped = time.time()
    found = len(hits)
    per_run_ms = (stopped - started) / COUNT * 1000.0
    print(' - mx.TextSearch.TextSearch().findall(): %5.3f ms (%i)' %
          (per_run_ms, found))

    # 2. Compiled regular expression (compilation is part of the timing)
    started = time.time()
    pattern = re.compile(word)
    for _ in iterations:
        hits = pattern.findall(text)
    stopped = time.time()
    found = len(hits)
    per_run_ms = (stopped - started) / COUNT * 1000.0
    print(' - re.compile().findall(): %5.3f ms (%i)' %
          (per_run_ms, found))

    # 3. Plain string method; this one only counts, it builds no list
    started = time.time()
    for _ in iterations:
        found = text.count(word)
    stopped = time.time()
    per_run_ms = (stopped - started) / COUNT * 1000.0
    print(' - text.count(): %5.3f ms (%i)' %
          (per_run_ms, found))

Example #11

0

Show file

File: URL.py Project: gannim/egenix-mx-base-python-2x3

class _modinit:

    # The characters spelled out here are the alphanumerics plus the
    # punctuation marks that RFC2396 calls "unreserved".
    # NOTE(review): the trailing 0 is the `logic` argument of
    # TextTools.set(); per the mx.TextTools documentation it selects
    # the complement, so the resulting set would hold every character
    # NOT listed here -- confirm against the installed version.
    unsafe_charset = TextTools.set(
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789'
        '-_.!~*\'()',
        0)

    # Same construction as above with a shorter punctuation list:
    # dot, exclamation mark, tilde, asterisk and single quote are
    # left out, only '-', '_', '(' and ')' remain next to the
    # alphanumerics.
    rpc_unsafe_charset = TextTools.set(
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789'
        '-_()',
        0)

Example #12

0

Show file

File: RecordReader.py Project: manucorreia/biopython

def _startswith_tagtable_newline(text):
    """Build a tag table that tags lines consisting of exactly `text`.

    Each entry has the mxTextTools layout
    (tagobj, command, argument, jump-if-no-match, jump-if-match), with
    jumps counted relative to the entry's own position in the tuple.

    The input must begin with `text`, otherwise the match fails at
    once.  Every occurrence of `text` at the start of a line is tagged
    "begin" and must be followed directly by a line terminator or by
    end of input; if it is not, the whole match fails (see the XXX
    note below).  Lines that do not start with `text` are skipped.
    """
    return (
        # Ensure the text starts with the given word ...
        ("begin", TT.Word, text, TT.MatchFail, +1),

        # ... followed by the end of line
        # (each successful branch lands on the "next line" test below
        # the EOF check)
        (None, TT.Is, '\n', +1, +4),  # matches '\n' or
        (None, TT.Is, '\r', +2, +1),  # '\r' followed by
        (None, TT.Is, '\n', +2, +2),  # optional '\n'

        # Check if EOF instead of a newline (allow EOF if found)
        # Otherwise, this means the line starts with the text but
        # doesn't have a successive newline.
        # XXX BUG! When looking for "A\n" should not fail on "AA\n"!
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),

        # Look for the next line starting with text
        # (on a match, jump back 4 entries to require the end of line)
        ("begin", TT.Word, text, +1, -4),

        # Not what I am looking for, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Read the end of line then test the next line
        # (the backward jumps all target the "begin" test above)
        (None, TT.Is, '\n', +1, -2),  # '\n'
        (None, TT.Is, '\r', +2, +1),  # '\r' followed by
        (None, TT.Is, '\n', -4, -4),  # optional '\n'
        # Allow termination at EOF
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),    
        )

Example #13

0

Show file

File: Generate.py Project: manucorreia/biopython

def check_assert_not(text, x, end, tagtable):
    """Report whether `tagtable` fails to match `text` between x and end.

    The answer is encoded in the returned position: x + 1 means the
    tag table did not match (the negative assertion holds), x means
    it matched.  According to the original comments the one-step
    advance is taken back later.
    """
    outcome = TT.tag(text, tagtable, x, end)
    success, _taglist, _pos = outcome
    if success:
        # The table matched, so the negative assertion fails
        return x
    return x + 1

Example #14

0

Show file

File: RecordReader.py Project: manucorreia/biopython

def _find_begin_positions(text, tagtable):
    """Tag `text` with `tagtable` and return where each tag starts.

    Returns a list holding item 1 of every tag produced by TT.tag; in
    an mxTextTools tag list that item is the offset at which the
    tagged slice begins.

    Raises ReaderError if the tag table does not match, or if it
    matches but stops before the end of `text`.
    """
    success, tags, pos = TT.tag(text, tagtable)
    if not success:
        raise ReaderError("invalid format starting with %s" % repr(text[:50]))
    if pos != len(text):
        # Call-form raise: same exception and message as the old
        # "raise ReaderError, msg" statement, but consistent with the
        # raise above and valid syntax on Python 3 as well.
        raise ReaderError(
            "could not parse to end of text (ended at %d of %d)" %
            (pos, len(text)))
    return [tag[1] for tag in tags]

Example #15

0

Show file

File: RecordReader.py Project: manucorreia/biopython

def _endswith_tagtable_newline(text):
    """Build a tag table that tags the end of lines consisting of `text`.

    Each entry has the mxTextTools layout
    (tagobj, command, argument, jump-if-no-match, jump-if-match), with
    jumps counted relative to the entry's own position in the tuple.

    The table works line by line.  If a line begins with `text` and a
    line terminator (LF, CR LF or a lone CR) follows immediately, that
    terminator is tagged "end".  In every other case the rest of the
    line is skipped along with the newline characters after it.
    Reaching end of input ends the match successfully.
    """
    return (
        # Is the current line the end of record marker?
        # If not, jump 6 entries ahead to the "read to the end of line"
        # entry for ordinary lines.
        (None, TT.Word, text, +6, +1),
 
        # Make sure it ends the line; every successful branch jumps
        # back to the first entry to examine the next line.
        ("end", TT.Is, '\n', +1, -1),  # a lone '\n': tag it, restart
        (None, TT.Is, '\r', +4, +1),   # no newline here: treat as ordinary line
        ("end", TT.Is, '\n', +1, -3),  # '\r\n': tag the '\n', restart
        (None, TT.Skip, -1, +1, +1),   # lone '\r': step back onto it ...
        ("end", TT.Skip, +1, -5, -5),  # ... and skip it again, tagged "end"
 
        # Not the end of record marker, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),
 
        # Check if EOF
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),
 
        # Not EOF, so scarf any newlines
        # (the -8 jump returns to the first entry of the table)
        (None, TT.AllInSet, TT.set('\r\n'), TT.MatchFail, -8),
        )

Example #16

0

Show file

File: Generate.py Project: andyoberlin/biopython

 def call(self, text, x, end):
     """Match the whole fixed-size repeat at once and keep its tags.

     Per the original comments this is invoked through
     'TextTools.Call'.  The tag table is repeated as many times as the
     (fixed) repeat count demands and applied to `text` from x to end.

     On a match the tag list is stored in self.taglist and the end
     position of the match plus one is returned; the extra step is
     there because a repeat count of zero is allowed, and the original
     comments say it is undone later with a Skip -1.  Otherwise
     self.taglist is set to None and x is returned unchanged.
     """
     low, high = self._get_ranges()
     assert low == high, \
            "cannot have different sizes: %s %s" % (low, high)

     repeated = self.tagtable * low
     result, taglist, pos = TT.tag(text, repeated, x, end)
     if result != 1:
         # No match: forget any tags from an earlier call
         self.taglist = None
         return x

     # Keep the tags so they can be used later
     self.taglist = taglist
     return pos + 1

Example #17

0

Show file

File: Generate.py Project: manucorreia/biopython

    def call(self, text, x, end):
        """Run the repeated tag table in one pass and cache the result.

        The original comments describe this as the hook used by
        'TextTools.Call'.  The minimum and maximum repeat counts must
        be equal; the tag table is multiplied by that count and
        applied to `text` between x and end.

        Returns pos + 1 (one past the end of the match) on success and
        stores the tags in self.taglist; returns x and clears
        self.taglist otherwise.  The extra step exists because a
        repeat count of zero is allowed; the original comments say a
        later Skip -1 takes it back.
        """
        min_count, max_count = self._get_ranges()
        assert min_count == max_count, \
               "cannot have different sizes: %s %s" % (min_count, max_count)

        full_table = self.tagtable * min_count
        outcome = TT.tag(text, full_table, x, end)
        result, taglist, pos = outcome

        matched = (result == 1)
        if matched:
            self.taglist = taglist
            return pos + 1

        self.taglist = None
        return x

Example #18

0

Show file

File: Generate.py Project: andyoberlin/biopython

def generate_dot(expression, genstate):
    """Return the tag table for '.': one character that is not a line feed.

    Neither `expression` nor `genstate` is consulted.
    """
    any_but_linefeed = (None, TT.IsInSet, TT.invset('\n'))
    return [any_but_linefeed]

Example #19

0

Show file

File: test_RecordReader2.py Project: andyoberlin/biopython

def normalize(s):
    """Return the lines of `s`, as split by TT.splitlines, joined by LF.

    Uses the str.join method instead of string.join(): the module
    function is deprecated and no longer exists on Python 3, while the
    method produces the same result.
    """
    return "\n".join(TT.splitlines(s))

Example #20

0

Show file

File: Generate.py Project: manucorreia/biopython

def generate_dot(expression, genstate):
    """Build the single-entry tag table used for the '.' pattern.

    The entry accepts exactly one character from the inverted set of
    the line feed, i.e. any character except a line feed.  The two
    arguments are accepted but not used.
    """
    not_linefeed = TT.invset('\n')
    return [(None, TT.IsInSet, not_linefeed)]

Example #21

0

Show file

File: RecordReader.py Project: manucorreia/biopython

        fake = self.text + "\n"
        reader = StartsWith(self.infile, self.text, self.sizehint,
                            fake + self.lookahead)
        rec = reader.next()
        rec = rec[len(fake):]  # remove the fake data
        self.infile, self.lookahead = reader.remainder()
        self.found = 1
        return rec

    def remainder(self):
        """Return the (infile, lookahead) pair currently held by the reader."""
        infile = self.infile
        lookahead = self.lookahead
        return (infile, lookahead)

# Tag the last byte of every newline
#
# Each entry has the mxTextTools layout
# (tagobj, command, argument, jump-if-no-match, jump-if-match); the
# jumps are relative to the entry's own position, and every backward
# jump below returns to the first entry.  A line terminator may be
# '\n', '\r\n' or a lone '\r'; each one yields a single "newline" tag.
_tag_lines_tagtable = (
    # Skip non-newline characters
    (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

    # Check if newline
    ("newline", TT.Is, '\n', +1, -1),  # can be '\n'
    (None, TT.Is, '\r', +3, +1),       # or start a '\r' followed by ..
    ("newline", TT.Is, '\n', +1, -3),  #  .. an optional '\n'
    # NOTE(review): Skip 0 does not move, so this tag presumably covers
    # an empty slice just past the '\r' -- confirm against TT.
    ("newline", TT.Skip, 0, -4, -4),   # get here with just an '\r'
    (None, TT.EOF, TT.Here, -5, TT.MatchOk),  # stop at end of text
    )


class CountLines(RecordReader):
    """Read a specified (fixed) number of lines"""
    def __init__(self, infile, count, sizehint = SIZEHINT, lookahead = ""):
        assert count > 0, "CountLines reader must read at least one line"
        assert lookahead > 0, "Must read at least a character at a time"

Example #22

0

Show file

File: test_RecordReader2.py Project: manucorreia/biopython

def normalize(s):
    """Rebuild `s` from its TT.splitlines pieces with LF between them.

    The join is done with the str.join method; string.join() gives the
    same result but is deprecated and was removed in Python 3.
    """
    lines = TT.splitlines(s)
    return "\n".join(lines)