Example #1
0
def main():
    """Convert the HTML file named by sys.argv[1] to groff in sys.argv[2].

    The input is fed to the parser one line at a time. If feeding a line
    raises HTMLParseError, or any other Exception, a ``file:line:col:``
    message is written to stderr and the process exits with status 1.
    """
    infile = sys.argv[1]

    # parse HTML
    p = HTMLParser(entities)
    # ``open`` replaces the Python 2-only ``file`` builtin, and the ``with``
    # block closes the input even when an error path calls sys.exit().
    with open(infile) as inf:
        for n, line in enumerate(inf):
            try:
                p.feed(line)
            except HTMLParseError as err:
                sys.stderr.write("%s:%d:%d: Parse error: %s\n" % (infile, err.lineno, err.offset, err.msg))
                sys.exit(1)
            except Exception as err:
                # enumerate() is zero-based; report a one-based line number.
                sys.stderr.write("%s:%d:0: Error (%s): %s\n" % (infile, n + 1, repr(err), line))
                sys.exit(1)
    p.close()

    # generate groff into an in-memory buffer
    sf = StringIO()
    f = Formatter(infile, sf)
    f.pp(fix(p.data))
    s = sf.getvalue()
    sf.close()

    # strip excess whitespace: trailing blanks plus any run of blank lines
    # collapse into a single newline, then leading whitespace is dropped
    blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
    s = blank_re.sub("\n", s)
    s = s.lstrip()

    # write groff
    with open(sys.argv[2], "w") as outf:
        outf.write(s)
Example #2
0
def main():
    """Parse the HTML file named by sys.argv[1], exiting on the first error.

    The input is fed to the parser one line at a time. If feeding a line
    raises HTMLParseError, or any other Exception, a ``file:line:col:``
    message is written to stderr and the process exits with status 1.
    Only the parsing stage is present here: the parser is not closed and
    nothing is written out.
    """
    # parse HTML
    infile = sys.argv[1]
    p = HTMLParser(entities)
    # ``open`` replaces the Python 2-only ``file`` builtin, and the ``with``
    # block closes the input even when an error path calls sys.exit().
    with open(infile) as inf:
        for n, line in enumerate(inf):
            try:
                p.feed(line)
            except HTMLParseError as err:
                sys.stderr.write('%s:%d:%d: Parse error: %s\n' % (infile, err.lineno, err.offset, err.msg))
                sys.exit(1)
            except Exception as err:
                # enumerate() is zero-based; report a one-based line number.
                sys.stderr.write('%s:%d:0: Error (%s): %s\n' % (infile, n + 1, repr(err), line))
                sys.exit(1)
Example #3
0
def insert_read_only_node(c, p, name):
    """Load the contents of *name* into the body of node *p*.

    If *name* is the empty string, an "Open" file dialog supplies it and
    the headline of *p* is set to ``@read-only <name>``.

    *name* is opened according to its URL scheme: ``ftp`` via FTPurl,
    ``http`` via urllib.urlopen, anything else as a local file. When the
    path ends in ``.htm`` or ``.html`` the content is converted to plain
    text and a numbered list of its hyperlinks is appended.

    Returns True if reading failed with IOError (the body is cleared in
    that case); otherwise returns whether the new body text differs from
    the previous body text.
    """
    if name == "":
        name = g.app.gui.runOpenFileDialog(
            c,
            title="Open",
            filetypes=[("All files", "*")],
        )
        c.setHeadString(p, "@read-only %s" % name)
        c.redraw()
    parse = urlparse.urlparse(name)
    scheme, path = parse[0], parse[2]
    try:
        # ``source`` rather than ``file``, which would shadow a builtin.
        if scheme == 'ftp':
            source = FTPurl(name)  # FTP URL
        elif scheme == 'http':
            source = urllib.urlopen(name)  # HTTP URL
        else:
            source = open(name, "r")  # local file
        try:
            g.es("..." + name)
            new = source.read()
        finally:
            # Release the handle even when read() fails.
            source.close()
    except IOError:
        # No message is logged here; the body is simply cleared.
        c.setBodyString(p, "")  # Clear the body text.
        return True  # Mark the node as changed.
    else:
        ext = os.path.splitext(path)[1]
        if ext.lower() in ['.htm', '.html']:
            #@+<< convert HTML to text >>
            #@+node:edream.110203113231.895: *3* << convert HTML to text >>
            fh = StringIO()
            fmt = AbstractFormatter(DumbWriter(fh))
            # the parser stores parsed data into fh (file-like handle)
            parser = HTMLParser(fmt)

            # send the HTML text to the parser
            parser.feed(new)
            parser.close()

            # now replace the old string with the parsed text
            new = fh.getvalue()
            fh.close()

            # finally, get the list of hyperlinks and append to the end of the text
            hyperlinks = parser.anchorlist
            if len(hyperlinks) > 0:
                hyperlist = ['\n\n--Hyperlink list follows--']
                # Links are numbered starting from 1.
                hyperlist.extend(
                    "\n[%d]: %s" % (num, link)
                    for num, link in enumerate(hyperlinks, 1))
                new = new + ''.join(hyperlist)
            #@-<< convert HTML to text >>
        previous = p.b
        c.setBodyString(p, new)
        changed = (g.toUnicode(new) != g.toUnicode(previous))
        if changed and previous != "":
            g.es("changed: %s" % name)  # A real change.
        return changed
Example #4
0
def main():
    """Convert the HTML file named by sys.argv[1] to groff in sys.argv[2].

    The input is fed to the parser one line at a time. If feeding a line
    raises HTMLParseError, or any other Exception, a ``file:line:col:``
    message is written to stderr and the process exits with status 1.
    """
    infile = sys.argv[1]

    # parse HTML
    p = HTMLParser(entities)
    # ``open`` replaces the Python 2-only ``file`` builtin, and the ``with``
    # block closes the input even when an error path calls sys.exit().
    with open(infile) as inf:
        for n, line in enumerate(inf):
            try:
                p.feed(line)
            except HTMLParseError as err:
                sys.stderr.write('%s:%d:%d: Parse error: %s\n' %
                                 (infile, err.lineno, err.offset, err.msg))
                sys.exit(1)
            except Exception as err:
                # enumerate() is zero-based; report a one-based line number.
                sys.stderr.write('%s:%d:0: Error (%s): %s\n' %
                                 (infile, n + 1, repr(err), line))
                sys.exit(1)
    p.close()

    # generate groff into an in-memory buffer
    sf = StringIO()
    f = Formatter(infile, sf)
    f.pp(fix(p.data))
    s = sf.getvalue()
    sf.close()

    # strip excess whitespace: trailing blanks plus any run of blank lines
    # collapse into a single newline, then leading whitespace is dropped
    blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
    s = blank_re.sub('\n', s)
    s = s.lstrip()

    # write groff
    with open(sys.argv[2], 'w') as outf:
        outf.write(s)
Example #5
0
def replaceHTMLCodes(txt):
    """Return *txt* with HTML character references decoded.

    Numeric references that lack their terminating semicolon (``&#39 ``)
    are repaired first, then all references are unescaped. A final pass
    decodes ``&quot;`` and ``&amp;`` once more, which handles input that
    was escaped twice (``&amp;quot;``).
    """
    try:
        # HTMLParser.unescape was removed in Python 3.9; html.unescape is
        # its documented replacement (available since Python 3.4).
        from html import unescape
    except ImportError:
        # Python 2: fall back to the legacy parser method.
        unescape = HTMLParser().unescape

    # Insert the missing ';' after "&#<digits>" when the next characters
    # are not ';', a digit, or '^' (the second '^' in the class is literal).
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", r"\1;\2", txt)
    txt = unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt