Python generate_tokens Examples, minds.util.html_pull_parser.generate_tokens Python Examples

Example #1

0

Show file

File: test_html_pull_parser.py Project: BackupTheBerlios/mindretrieve-svn

    def test_declaration_incomplete(self):
        """Check that the lenient declaration handling copes with incomplete tags.

        The sample document is fed to the parser in two chunks, once for
        every split offset in ``range(1, len(doc) - 1)``; each run must
        yield the same tokens, with the comment producing no token.
        """
        # NOTE: unrelated problem -- without the leading space in the
        # document the incomplete <html> is not parsed properly. This is
        # still to be investigated.
        doc = " <html>A<!-- bad comment -->B</html>"

        for split_at in range(1, len(doc) - 1):
            head, tail = doc[:split_at], doc[split_at:]
            stream = hpp.generate_tokens(ChunkedStringIO([head, tail]))

            # Build the expectation afresh for every run so that no state
            # is shared between iterations.
            expected = [
                (TAG, u'html', []),
                (DATA, u'A'),
                (DATA, u'B'),
                (ENDTAG, u'html'),
            ]
            self._test_generator1(stream, expected)

Example #2

0

Show file

File: distillML.py Project: BackupTheBerlios/mindretrieve-svn

def _skip_past_endtag(tokens, tagname):
    """Consume ``tokens`` up to and including the ``</tagname>`` end tag.

    Stops quietly if the stream is exhausted before the end tag shows up.
    """
    for token in tokens:
        if token == (ENDTAG, tagname):
            break


def process(fp, out, meta):
    """Distill the HTML read from ``fp`` into ``out`` and collect metadata.

    Parameters:
        fp   -- input handed to ``html_pull_parser.generate_tokens``.
        out  -- output sink; its ``out``, ``outTag``, ``outAlt``,
                ``notifyHtml``, ``outHeader`` and ``close`` methods are used.
        meta -- mapping that receives the ``'title'``, ``'description'`` and
                ``'keywords'`` entries found in the document.

    Returns the tuple ``(has_html, has_frameset, has_common_tag)``:
        has_html       -- an ``<html>`` start tag was seen.
        has_frameset   -- a ``<frameset>`` start tag was seen.
        has_common_tag -- some start tag had a positive id in
                          ``starttag_dict``.
    """

    has_html = False
    has_frameset = False
    has_common_tag = False

    first_td = False     # state for iterating td inside tr

    # Tokens that terminate the collection of <title> text. They are compared
    # against the first two fields of a token only, because start-tag tokens
    # carry a third field (the attribute list).
    title_terminators = [
        (ENDTAG, 'title'),  # only </title> is valid
        (ENDTAG, 'head'),   # in case no </title>
        (TAG, 'body'),      # in case no </title>
    ]

    iterator = html_pull_parser.generate_tokens(fp)

    # General HTML format
    # <html>
    #   <head>
    #   <body>
    #
    # However all elements are optional.
    # It is better to use a flat, stateless loop to process elements

    for token in iterator:
        kind = token[0]

        if kind == DATA:
            out.out(token[1])

        elif kind == TAG:

            tag = token[1]
            tag_id = starttag_dict.get(tag, -1)

            if tag_id > 0:
                has_common_tag = True

            if tag_id == sOUTP:
                out.outTag('p')

            elif tag_id == sOUTTAG:
                out.outTag(tag)

            elif tag_id == sTR:
                first_td = True

            elif tag_id == sTDTH:
                # Separate cells of one row with spaces, but put nothing in
                # front of the first cell.
                if first_td:
                    first_td = False
                else:
                    out.out('   ')

            elif tag_id == sINPUT:

                attrs = token[2]
                itype = _getvalue(attrs, 'type')

                if itype == 'checkbox':
                    if _hasattr(attrs, 'checked'):
                        out.out('[*] ')
                    else:
                        out.out('[ ] ')

                elif itype == 'radio':
                    if _hasattr(attrs, 'checked'):
                        out.out('(*) ')
                    else:
                        out.out('( ) ')

                elif itype == 'image':
                    alt = _getvalue(attrs, 'alt') or _getvalue(attrs, 'value')
                    out.outAlt(saxutils.unescape(alt))

                elif itype == 'password':
                    # never echo the actual value of a password field
                    out.outAlt('***')

                elif itype == 'hidden':
                    pass

                else:
                    value = _getvalue(attrs, 'value')
                    out.outAlt(saxutils.unescape(value))

            elif tag_id == sIMG:
                alt = _getvalue(token[2], 'alt')
                if alt:
                    out.outAlt(saxutils.unescape(alt))

            elif tag_id == sHTML:
                has_html = True
                out.notifyHtml()

            elif tag_id == sBODY:
                out.outHeader(meta)

            elif tag_id == sFRAMESET:
                has_frameset = True

            elif tag_id == sTITLE:
                title_parts = []
                closer = None
                for token in iterator:
                    if token[0] == DATA:
                        title_parts.append(token[1])
                    elif token[:2] in title_terminators:
                        closer = token
                        break
                meta['title'] = _collapse(''.join(title_parts))

                if closer is not None and closer[0] == TAG:
                    # The title was implicitly closed by a start tag that has
                    # already been consumed from the stream. Give it the same
                    # treatment the main loop would have given it, so that
                    # the header is still emitted.
                    closer_id = starttag_dict.get(closer[1], -1)
                    if closer_id > 0:
                        has_common_tag = True
                    if closer_id == sBODY:
                        out.outHeader(meta)

            elif tag_id == sMETA:
                attrs = token[2]
                name = _getvalue(attrs, 'name').lower()
                content = _getvalue(attrs, 'content')
                if name == 'description':
                    meta['description'] = saxutils.unescape(_collapse(content))
                elif name == 'keywords':
                    meta['keywords'] = saxutils.unescape(_collapse(content))

            # The content of the following elements is dropped entirely.
            elif tag_id == sSCRIPT:
                _skip_past_endtag(iterator, 'script')

            elif tag_id == sSTYLE:
                _skip_past_endtag(iterator, 'style')

            elif tag_id == sSELECT:
                _skip_past_endtag(iterator, 'select')

        elif kind == ENDTAG:

            tag = token[1]
            tag_id = endtag_dict.get(tag, -1)

            if tag_id == eCLOSE_TAG:
                out.outTag('/' + tag)

            elif tag_id == eBREAK_LINE:
                out.outTag('br')

    out.close(meta)

    return has_html, has_frameset, has_common_tag

Example #3

0

Show file

File: snapshot.py Project: BackupTheBerlios/mindretrieve-svn

def scan_html(fp, baseuri, append):
    """Scan the HTML read from ``fp`` for resources it references.

    The input is tokenized with ``hpp.generate_tokens`` (comments included).
    The text of ``<style>`` blocks and the value of ``style`` attributes are
    passed to ``_scan_html_style``. For every start tag carrying the URI
    attribute registered for it in ``LINKABLE_TAGS``,
    ``append(baseuri, uri, ctype, tag)`` is called, except for ``<link>``
    tags that are not recognised as CSS.
    """
    tokens = hpp.generate_tokens(fp, comment=True)

    for token in tokens:
        if token[0] != hpp.TAG:
            continue
        tag = token[1]

        # ------------------------------------------------------------------
        # <style> block: gather its text until something terminates it
        if tag == 'style':
            # HACK: <style> should only be valid inside <head>
            css_parts = []
            exhausted = True    # cleared when a token terminates the block
            for token in tokens:
                kind = token[0]
                if kind == hpp.DATA or kind == hpp.COMMENT:
                    # COMMENT: CSS is often wrapped in an HTML comment
                    css_parts.append(token[1])
                elif kind == hpp.TAG:
                    # A repeated <style> is treated as part of this block;
                    # any other start tag implicitly closes it.
                    if token[1] != 'style':
                        exhausted = False
                        break
                elif kind == hpp.ENDTAG:
                    exhausted = False
                    break

            _scan_html_style(tag, ''.join(css_parts), baseuri, append)

            # Decide what to do with the token that ended the block.
            if exhausted:
                # TODO: test
                break       # nothing left in the stream
            if token[0] == hpp.ENDTAG:
                # TODO: test
                continue    # hopefully a proper </style>
            # Otherwise a non-style start tag ended the block; carry on and
            # process that tag below.
            # TODO: test
            tag = token[1]

        # ------------------------------------------------------------------
        # collect the relevant attributes of the tag
        is_link = (tag == 'link')
        uri_attr = LINKABLE_TAGS.get(tag, '')

        uri = ''
        style = ''
        rel = ''
        ctype = ''
        for name, value in token[2]:        # TODO: need to XML decode?
            if name == uri_attr:
                uri = value
            elif name == 'style':
                style = value
            elif is_link and name == 'rel':
                rel = value
            elif is_link and name == 'type':
                ctype = value

        # ------------------------------------------------------------------
        # inline style attribute
        if style:
            _scan_html_style(tag, style, baseuri, append)

        # ------------------------------------------------------------------
        # uri attributes (href, src, etc)
        if not uri:
            continue

        # The ctype is advisory only; the content-type sent over http may
        # differ. Everything that is not HTML or CSS counts as APPLICATION.
        if tag in ('frame', 'iframe'):
            ctype = TEXT_HTML
        elif not is_link:
            ctype = APPLICATION
        elif rel.lower() == 'stylesheet':   # TODO: rel="alternate stylesheet"?
            ctype = TEXT_CSS
        elif ctype != TEXT_CSS:
            # only CSS is wanted from <link>
            continue

        append(baseuri, uri, ctype, tag)

Example #4

0

Show file

File: test_html_pull_parser.py Project: BackupTheBerlios/mindretrieve-svn

 def _test_generator(self, doc, expect, **args):
     """Tokenize the string ``doc`` and compare the tokens with ``expect``.

     Extra keyword arguments are forwarded to ``hpp.generate_tokens``.
     """
     stream = hpp.generate_tokens(StringIO.StringIO(doc), **args)
     self._test_generator1(stream, expect)

Example #5

0

Show file

File: test_html_pull_parser.py Project: BackupTheBerlios/mindretrieve-svn

 def test_0(self):
     """An upper-case hex character reference becomes one DATA token."""
     source = StringIO.StringIO("&#XE5;")
     tokens = list(hpp.generate_tokens(source))
     self.assertEqual(tokens, [(DATA, u'\u00e5')])