Beispiel #1
0
def email_strip_html(html_content):
    """Return a plain-text rendering of *html_content*.

    Whitespace runs and newlines are normalized with the module-level
    patterns, tags are removed, and each remaining line is written out as
    its own paragraph through a ``DumbWriter``.
    """
    text = RE_SPACES.sub(' ', html_content)
    text = RE_NEWLINES.sub('\n', text)
    text = RE_HTML_TAGS.sub('', text)

    sink = StringIO()
    flow = AbstractFormatter(DumbWriter(sink))
    for line in text.split('\n'):
        # One paragraph per input line, each followed by one blank line.
        flow.add_flowing_data(line)
        flow.end_paragraph(1)
    return sink.getvalue()
Beispiel #2
0
 def parseAndGetLinks(self):
     """Parse the HTML file at ``self.file`` and return its anchor list.

     The parser is kept on ``self.parser``.  Its rendered text goes to a
     throw-away in-memory ``StringIO`` through ``DumbWriter`` and
     ``AbstractFormatter``; only ``anchorlist`` is returned.
     """
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     # Use a context manager so the file handle is closed deterministically
     # instead of being left for the garbage collector.
     with open(self.file) as html_file:
         self.parser.feed(html_file.read())
     self.parser.close()
     return self.parser.anchorlist
Beispiel #3
0
def index(path, indexpage, output):
    """Feed the page ``path + '/' + indexpage`` through an ``IdxHlpHtmlParser``.

    *path* and *output* are handed to the parser unchanged; whatever the
    parser does with them is defined by ``IdxHlpHtmlParser`` itself.
    """
    parser = IdxHlpHtmlParser(AbstractFormatter(AlmostNullWriter()),
                              path, output)
    # The with-block closes the file even when read() or feed() raises.
    with open(path + '/' + indexpage) as f:
        parser.feed(f.read())
    parser.close()
Beispiel #4
0
def content(path, contentpage, output):
    """Feed the page ``path + '/' + contentpage`` through a ``TocHlpHtmlParser``.

    *path* and *output* are handed to the parser unchanged; whatever the
    parser does with them is defined by ``TocHlpHtmlParser`` itself.
    """
    parser = TocHlpHtmlParser(AbstractFormatter(AlmostNullWriter()),
                              path, output)
    # The with-block closes the file even when read() or feed() raises.
    with open(path + '/' + contentpage) as f:
        parser.feed(f.read())
    parser.close()
Beispiel #5
0
 def __init__(self, writer, settings, context):
     """Initialize the parser around *writer* using *settings* and *context*.

     On first construction (guarded by ``_inited``) the ``dingbats`` table
     is extended from ``fontdingbats`` and ``Greek.entitydefs``.  After
     that, per-instance state (base URL, image cache, anchor bookkeeping,
     anchor transforms, font-size stack) is set up.
     """
     # One-time table setup; the flag is set on PrintingHTMLParser below so
     # later instances skip this branch.
     if not self._inited:
         # Register every font dingbat under both the 'grey' and 'color' keys.
         for k, v in self.fontdingbats.items():
             self.dingbats[(k, 'grey')] = v
             self.dingbats[(k, 'color')] = v
         import Greek
         # Greek entities map to a (value, 'Symbol') pair, again under both keys.
         for k, v in Greek.entitydefs.items():
             tup = (v, 'Symbol')
             self.dingbats[(k, 'grey')] = tup
             self.dingbats[(k, 'color')] = tup
         PrintingHTMLParser._inited = 1
     HTMLParser.__init__(self, AbstractFormatter(writer))
     # NOTE(review): restrict(0) is applied only when strict_parsing is set;
     # its exact effect is defined by the sgml_parser object -- confirm there.
     if settings.strict_parsing:
         self.sgml_parser.restrict(0)
     self._baseurl = context.get_baseurl()
     self.context = context
     self.settings = settings
     # The image loader is attached only when images are enabled in settings.
     if settings.imageflag:
         self._image_loader = utils.image_loader
     self._image_cache = {}
     self._anchors = {None: None}
     self._anchor_sequence = []
     self._anchor_xforms = []
     # Exactly one anchor transform is installed, chosen by footnoteflag.
     if not settings.footnoteflag:
         self.add_anchor_transform(disallow_anchor_footnotes)
     else:
         self.add_anchor_transform(
             disallow_self_reference(context.get_url()))
     # Name-mangled attribute holding a one-element list with the value 3.
     self.__fontsize = [3]
 def parseAndGetLinks(self):
     """Parse the HTML file at ``self.file`` and return its anchor list."""
     # Create a basic HTML parser whose rendered text is written to a
     # throw-away in-memory buffer.
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     # Parse the HTML file and collect all links (the ones with an href).
     # The with-block makes sure the file handle is closed.
     with open(self.file) as html_file:
         self.parser.feed(html_file.read())
     self.parser.close()
     return self.parser.anchorlist
Beispiel #7
0
def rewrite_htmlinclude(match, include_dir, quietly):
    """Expand one ``@htmlinclude`` match into rewritten text.

    Args:
        match: regex match object; group 1 is the included file name
            (joined onto *include_dir*), group 2 is a trailing character
            that is appended to the returned text.
        include_dir: directory the included file name is resolved against.
        quietly: when true, suppress the warning printed for a file that
            fails ``valid_file``.

    Returns:
        The rewritten contents followed by the trailing character, or ``''``
        when the included file is not valid.
    """
    included_name = match.group(1)
    file_path = os.path.join(include_dir, included_name)
    trailing_char = match.group(2)

    if not valid_file(file_path):
        if not quietly:
            print("Warning: unable to expand @htmlinclude '" + included_name + "'")
        return ''

    # First, try to see if there's a .txt version.  If so, use that.
    # The flag has to be passed by keyword: the fourth positional argument
    # of re.sub() is `count`, so passing re.IGNORECASE positionally limited
    # the substitution to two matches and left it case-sensitive.
    txt_file = re.sub(r'html', 'txt', file_path, flags=re.IGNORECASE)
    if valid_file(txt_file):
        contents = read_file_contents(txt_file)
        return rewrite_included_contents(contents) + trailing_char

    # No txt file; proceed with .html file.
    writer = RewritePydocStringWriter()
    parser = RewritePydocHTMLParser(AbstractFormatter(writer))
    # The with-block closes the file even if read() or feed() raises.
    with open(file_path, 'r') as html_file:
        parser.feed(html_file.read())
    parser.close()

    return rewrite_included_contents(writer.get_text()) + trailing_char
Beispiel #8
0
 def parseAndGetLinks(self):  # pars HTML, save links
     self.parser = HTMLParser(AbstractFormatter( \
      DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     print self.parser
     return self.parser.anchorlist
Beispiel #9
0
def insert_read_only_node(c, p, name):
    """Load *name* (FTP URL, HTTP URL or local file) into the body of *p*.

    When *name* is empty an open-file dialog is shown first and the headline
    of *p* is set to ``@read-only <name>``.  Content whose path ends in
    ``.htm``/``.html`` is converted to text, with the list of hyperlinks
    appended.

    Returns:
        True when the source could not be read (the body is cleared in that
        case); otherwise whether the new body text differs from the old one.
    """
    if name == "":
        name = g.app.gui.runOpenFileDialog(
            c,
            title="Open",
            filetypes=[("All files", "*")],
        )
        c.setHeadString(p, "@read-only %s" % name)
        c.redraw()
    parse = urlparse.urlparse(name)
    try:
        if parse[0] == 'ftp':
            source = FTPurl(name)  # FTP URL
        elif parse[0] == 'http':
            source = urllib.urlopen(name)  # HTTP URL
        else:
            source = open(name, "r")  # local file
        try:
            g.es("..." + name)
            new = source.read()
        finally:
            # Always release the handle, even when the read fails.
            source.close()
    except IOError:  # as msg:
        # g.es("error reading %s: %s" % (name, msg))
        # g.es("...not found: " + name)
        c.setBodyString(p, "")  # Clear the body text.
        return True  # Mark the node as changed.
    else:
        ext = os.path.splitext(parse[2])[1]
        if ext.lower() in ['.htm', '.html']:
            #@+<< convert HTML to text >>
            #@+node:edream.110203113231.895: *3* << convert HTML to text >>
            fh = StringIO()
            fmt = AbstractFormatter(DumbWriter(fh))
            # the parser stores parsed data into fh (file-like handle)
            parser = HTMLParser(fmt)

            # send the HTML text to the parser
            parser.feed(new)
            parser.close()

            # now replace the old string with the parsed text
            new = fh.getvalue()
            fh.close()

            # finally, get the list of hyperlinks and append to the end of the text
            hyperlinks = parser.anchorlist
            if hyperlinks:
                hyperlist = ['\n\n--Hyperlink list follows--']
                for i, link in enumerate(hyperlinks):
                    # Links are numbered from 1 (3/26/03: was i).
                    hyperlist.append("\n[%d]: %s" % (i + 1, link))
                new = new + ''.join(hyperlist)
            #@-<< convert HTML to text >>
        previous = p.b
        c.setBodyString(p, new)
        changed = (g.toUnicode(new) != g.toUnicode(previous))
        if changed and previous != "":
            g.es("changed: %s" % name)  # A real change.
        return changed
Beispiel #10
0
 def parseAndGetLinks(self):
     """Parse the HTML file at ``self.file`` and return its anchor list.

     An ``IOError`` while opening, reading or parsing is ignored on purpose;
     the anchors collected so far (possibly none) are returned.
     """
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     try:
         # The with-block closes the file even when read() or feed() fails.
         with open(self.file) as html_file:
             self.parser.feed(html_file.read())
         self.parser.close()
     except IOError:
         # Best effort: an unreadable file simply yields no links.
         pass
     return self.parser.anchorlist
Beispiel #11
0
 def parse_links(self):
     """Parse out the links found in the downloaded HTML file.

     Reads ``self.file`` and returns the parser's ``anchorlist``.
     """
     # The with-block closes the file even if read() raises.
     with open(self.file, 'r') as f:
         data = f.read()
     parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Beispiel #12
0
def html2text(html):
    """Render *html* to plain text.

    Returns the text written by ``DumbWriter``, or ``''`` when feeding the
    parser raises ``HTMLParseError``.
    """
    sink = StringIO()
    html_parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    try:
        html_parser.feed(html)
    except HTMLParseError:
        return ''
    # Reached only when feeding succeeded.
    html_parser.close()
    return sink.getvalue()
Beispiel #13
0
def get_text_from_html( html_input ):
  "Strip tags and non-ascii characters from HTML input."
  my_stringio = StringIO.StringIO() # make an instance of this file-like string thing
  p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio)))
  try: p.feed(html_input); p.close() #calling close is not usually needed, but let's play it safe
  except HTMLParseError: print '***HTML malformed***' #the html is badly malformed (or you found a bug)
  #return my_stringio.getvalue().replace('\xa0','')
  s = re.sub( r'[^\x00-\x7f]', r' ', my_stringio.getvalue() )
  s = s.replace('\r\n',' ').replace('\n',' ')
  s = re.sub( ' +', ' ', s )
  return s
Beispiel #14
0
    def parseAndGetLinks(self):
        '''解析html页面,获取页面中的链接,并保存链接'''

        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        #使用HTMLParser的方法进行处理 , StringIO是从内存中读取数据,DumbWriter将事件流转换为存文本文档。
        self.parser.feed(open(self.file).read())
        #将self.file文件打开,并一次性读入上面定的文件中

        self.parser.close()
        print 'self.parser.anchorlist --> ', self.parser.anchorlist
        return self.parser.anchorlist  #anchorlist 记录href 地址
Beispiel #15
0
 def parseAndGetLinks(self, html_string):
     """Return the links found in *html_string*.

     A link that neither starts with ``http`` nor contains ``://`` is joined
     onto ``self.base_url``.  An ``IOError`` yields an empty list.
     """
     try:
         self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
         self.parser.feed(html_string)
         self.parser.close()
         resolved = []
         for href in self.parser.anchorlist:
             is_relative = href[:4] != "http" and find(href, "://") == -1
             resolved.append(
                 urljoin(self.base_url, href) if is_relative else href)
         return resolved
     except IOError:
         return []
Beispiel #16
0
 def parse_html(self, html):
     """Render *html* to text and normalize it with this object's patterns.

     ``self.notrans_tag`` matches are padded with spaces before parsing.
     The rendered text then has ``self.whitespaces`` replaced by a single
     space, ``self.zerowidth`` removed and ``self.colon`` reduced to its
     first group.
     """
     from StringIO import StringIO
     from formatter import (AbstractFormatter, DumbWriter)
     from htmllib import HTMLParser
     _html = re.sub(self.notrans_tag, r" \1 ", html)
     buf = StringIO()
     p = HTMLParser(AbstractFormatter(DumbWriter(buf)))
     p.feed(_html)
     # close() forces processing of any input still buffered in the parser;
     # without it trailing data could be missing from the output.
     p.close()
     _sub = re.sub(self.whitespaces, " ", buf.getvalue())
     # FIXME: how can zerowidth be removed more simply?
     _sub = re.sub(self.zerowidth, "", _sub)
     _sub = re.sub(self.colon, r"\1", _sub)
     return _sub
Beispiel #17
0
def test():
    """Parse an HTML file with a ``NullWriter``-backed ``HTMLParser``.

    The file is ``sys.argv[1]`` when given, otherwise ``test.html``.
    """
    import sys
    path = 'test.html'
    if sys.argv[1:]:
        path = sys.argv[1]
    # The with-block closes the file even if read() raises.
    with open(path, 'r') as fp:
        data = fp.read()
    from formatter import NullWriter, AbstractFormatter
    w = NullWriter()
    f = AbstractFormatter(w)
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Beispiel #18
0
        class _Out(Coloring):
            """Colored output helper that prints help sections of ``cmd``."""

            def __init__(self, gc):
                Coloring.__init__(self, gc, "help")
                self.heading = self.printer("heading", attr="bold")

                # Plain paragraphs are sent through this formatter.
                self.wrap = AbstractFormatter(DumbWriter())

            def _PrintSection(self, heading, bodyAttr):
                """Print *heading* followed by the ``cmd`` text named *bodyAttr*.

                Nothing is printed when the attribute is missing, None or
                empty.
                """
                body = getattr(cmd, bodyAttr, None)
                if body is None or body == "":
                    return

                self.nl()

                self.heading("%s", heading)
                self.nl()

                self.heading("%s", "".ljust(len(heading), "-"))
                self.nl()

                body = body.strip().replace("%prog", "repo %s" % cmd.NAME)

                asciidoc_hdr = re.compile(r"^\n?([^\n]{1,})\n([=~-]{2,})$")
                for para in body.split("\n\n"):
                    if para.startswith(" "):
                        # Paragraphs starting with a space are written as-is.
                        self.write("%s", para)
                        self.nl()
                        self.nl()
                        continue

                    hdr = asciidoc_hdr.match(para)
                    if hdr is None:
                        # Ordinary paragraph: hand it to the formatter.
                        self.wrap.add_flowing_data(para)
                        self.wrap.end_paragraph(1)
                        continue

                    title = hdr.group(1)
                    underline = hdr.group(2)
                    if underline[0] in ("=", "-"):
                        emit = self.heading
                    else:
                        # Other underline characters get a two-space indent.
                        def emit(fmt, *args):
                            self.write("  ")
                            self.heading(fmt, *args)

                    emit("%s", title)
                    self.nl()
                    emit("%s", "".ljust(len(title), underline[0]))
                    self.nl()
                self.wrap.end_paragraph(0)
Beispiel #19
0
        class _Out(Coloring):
            """Colored output helper that prints help sections of ``cmd``."""

            def __init__(self, gc):
                Coloring.__init__(self, gc, 'help')
                self.heading = self.printer('heading', attr='bold')

                # Plain paragraphs are sent through this formatter.
                self.wrap = AbstractFormatter(DumbWriter())

            def _PrintSection(self, heading, bodyAttr):
                """Print *heading* followed by the ``cmd`` text named *bodyAttr*.

                Nothing is printed when the attribute is missing, None or
                empty.
                """
                body = getattr(cmd, bodyAttr, None)
                if body is None or body == '':
                    return

                self.nl()

                self.heading('%s', heading)
                self.nl()

                self.heading('%s', ''.ljust(len(heading), '-'))
                self.nl()

                body = body.strip().replace('%prog', 'repo %s' % cmd.NAME)

                asciidoc_hdr = re.compile(r'^\n?([^\n]{1,})\n([=~-]{2,})$')
                for para in body.split("\n\n"):
                    if para.startswith(' '):
                        # Paragraphs starting with a space are written as-is.
                        self.write('%s', para)
                        self.nl()
                        self.nl()
                        continue

                    hdr = asciidoc_hdr.match(para)
                    if hdr is None:
                        # Ordinary paragraph: hand it to the formatter.
                        self.wrap.add_flowing_data(para)
                        self.wrap.end_paragraph(1)
                        continue

                    title = hdr.group(1)
                    underline = hdr.group(2)
                    if underline[0] in ('=', '-'):
                        emit = self.heading
                    else:
                        # Other underline characters get a two-space indent.
                        def emit(fmt, *args):
                            self.write('  ')
                            self.heading(fmt, *args)

                    emit('%s', title)
                    self.nl()
                    emit('%s', ''.ljust(len(title), underline[0]))
                    self.nl()
                self.wrap.end_paragraph(0)
Beispiel #20
0
        class _Out(Coloring):
            """Colored output helper that prints help sections of ``cmd``."""

            def __init__(self, gc):
                Coloring.__init__(self, gc, 'help')
                self.heading = self.printer('heading', attr='bold')

                # Plain paragraphs are sent through this formatter.
                self.wrap = AbstractFormatter(DumbWriter())

            def _PrintSection(self, heading, bodyAttr):
                """Print *heading* followed by the ``cmd`` text named *bodyAttr*.

                Nothing is printed when the attribute is missing, None or
                empty.
                """
                body = getattr(cmd, bodyAttr, None)
                if body is None or body == '':
                    return

                self.nl()

                self.heading('%s', heading)
                self.nl()

                self.heading('%s', ''.ljust(len(heading), '-'))
                self.nl()

                body = body.strip().replace('%prog', 'andromeda %s' % cmd.NAME)

                asciidoc_hdr = re.compile(r'^\n?([^\n]{1,})\n([=~-]{2,})$')
                for para in body.split("\n\n"):
                    if para.startswith(' '):
                        # Paragraphs starting with a space are written as-is.
                        self.write('%s', para)
                        self.nl()
                        self.nl()
                        continue

                    hdr = asciidoc_hdr.match(para)
                    if hdr is None:
                        # Ordinary paragraph: hand it to the formatter.
                        self.wrap.add_flowing_data(para)
                        self.wrap.end_paragraph(1)
                        continue

                    title = hdr.group(1)
                    underline = hdr.group(2)
                    if underline[0] in ('=', '-'):
                        emit = self.heading
                    else:
                        # Other underline characters get a two-space indent.
                        def emit(fmt, *args):
                            self.write('  ')
                            self.heading(fmt, *args)

                    emit('%s', title)
                    self.nl()
                    emit('%s', ''.ljust(len(title), underline[0]))
                    self.nl()
                self.wrap.end_paragraph(0)
Beispiel #21
0
def get_plain_from_html(html):
    """Render *html* through ``htmllib`` and return the plain text.

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'

    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    sink = StringIO()
    html_parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    html_parser.feed(html)
    html_parser.close()
    return sink.getvalue()
Beispiel #22
0
    def extract_from_html(self,
                          html,
                          lower_threshold=None,
                          upper_threshold=None):
        """Parse *html* and return the output collected by a ``TextWriter``.

        A fresh writer and ``ParsingTracker`` are stored on ``self.writer``
        and ``self.parser``.  A threshold is copied onto the writer only
        when the corresponding argument is truthy.
        """
        # The same writer instance backs both the tracker and its formatter.
        collector = TextWriter()
        self.writer = collector
        self.parser = ParsingTracker(collector, AbstractFormatter(collector))

        if lower_threshold:
            collector.lower_threshold = lower_threshold
        if upper_threshold:
            collector.upper_threshold = upper_threshold

        self.parser.feed(html)
        self.parser.close()
        return collector.output()
Beispiel #23
0
def email_strip_html(html_content):
    """Return a plain-text rendering of *html_content*.

    Whitespace runs and newlines are normalized with the module-level
    patterns, tags are removed, and each remaining line is written out as
    its own paragraph through a ``DumbWriter``.
    """
    text = RE_SPACES.sub(' ', html_content)
    text = RE_NEWLINES.sub('\n', text)
    text = RE_HTML_TAGS.sub('', text)

    sink = StringIO()
    flow = AbstractFormatter(DumbWriter(sink))
    for line in text.split('\n'):
        # One paragraph per input line, each followed by one blank line.
        flow.add_flowing_data(line)
        flow.end_paragraph(1)
    return sink.getvalue()
Beispiel #24
0
def collectURLSFromPage(page):
    """
    This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.

    Results are gathered from three sources, in this order: the form
    parser (only when the page contains "<form"), the raw parser, and the
    anchor list of an ``HTMLParser``.  If the HTML parser fails, the
    results collected so far are returned.
    """

    resultList = []
    if page == "":
        #nothing to parse, so nothing to return
        return resultList

    if page.count("<form") > 0:
        resultList.extend(daveFormParse(page))

    resultList.extend(rawParse(page))

    # The parser's rendered text goes to a throw-away buffer; only the
    # anchor list it collects is used.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
    except Exception:
        # Parsing is best effort, but a bare "except:" would also swallow
        # KeyboardInterrupt and SystemExit, so only Exception is caught.
        # (Not sure why this happens - you'll have to crawl this page manually)
        return resultList

    resultList.extend(parser.anchorlist)
    return resultList
Beispiel #25
0
    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'

                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []

        self.parser.close()
        return self.parser.anchorlist
Beispiel #26
0
def collectURLSFromPage(page):
    """Return a list of URLs found in the string *page*.

    Results are gathered from three sources, in this order: the form
    parser (only when the page contains "<form"), the raw parser, and the
    anchor list of an ``HTMLParser``.  If the HTML parser fails, the
    results collected so far are returned.
    """

    resultList = []

    if page.count("<form") > 0:
        resultList.extend(daveFormParse(page))

    resultList.extend(rawParse(page))

    # The parser's rendered text goes to a throw-away buffer; only the
    # anchor list it collects is used.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
    except Exception:
        # Parsing is best effort, but a bare "except:" would also swallow
        # KeyboardInterrupt and SystemExit, so only Exception is caught.
        # (Not sure why this happens - you'll have to crawl this page manually)
        return resultList

    resultList.extend(parser.anchorlist)
    return resultList
Beispiel #27
0
        class _Out(Coloring):
            """Colored output helper that prints help sections of ``cmd``."""

            def __init__(self, gc):
                Coloring.__init__(self, gc, 'help')
                self.heading = self.printer('heading', attr='bold')

                # Plain paragraphs are sent through this formatter.
                self.wrap = AbstractFormatter(DumbWriter())

            def _PrintSection(self, heading, bodyAttr):
                """Print *heading* followed by the ``cmd`` text named *bodyAttr*.

                Nothing is printed when the attribute is missing, None or
                empty.  Headings are prefixed with ``header_prefix``.
                """
                body = getattr(cmd, bodyAttr, None)
                if body is None or body == '':
                    return

                self.nl()

                self.heading('%s%s', header_prefix, heading)
                self.nl()
                self.nl()

                body = body.strip().replace('%prog', 'repo %s' % cmd.NAME)

                asciidoc_hdr = re.compile(r'^\n?#+ (.+)$')
                for para in body.split("\n\n"):
                    if para.startswith(' '):
                        # Paragraphs starting with a space are written as-is.
                        self.write('%s', para)
                        self.nl()
                        self.nl()
                        continue

                    hdr = asciidoc_hdr.match(para)
                    if hdr is None:
                        # Ordinary paragraph: hand it to the formatter.
                        self.wrap.add_flowing_data(para)
                        self.wrap.end_paragraph(1)
                        continue

                    self.heading('%s%s', header_prefix, hdr.group(1))
                    self.nl()
                    self.nl()
                self.wrap.end_paragraph(0)
Beispiel #28
0
    class _Out(Coloring):
      """Colored output helper that prints help sections of ``cmd``."""

      def __init__(self, gc):
        Coloring.__init__(self, gc, 'help')
        self.heading = self.printer('heading', attr='bold')

        # Plain paragraphs are sent through this formatter.
        self.wrap = AbstractFormatter(DumbWriter())

      def _PrintSection(self, heading, bodyAttr):
        """Print *heading* followed by the ``cmd`` text named *bodyAttr*.

        Nothing is printed when the attribute is missing, None or empty.
        """
        try:
          body = getattr(cmd, bodyAttr)
        except AttributeError:
          return
        if body == '' or body is None:
          return

        self.nl()

        self.heading('%s', heading)
        self.nl()
        self.nl()

        me = 'repo %s' % cmd.NAME
        body = body.strip()
        body = body.replace('%prog', me)

        asciidoc_hdr = re.compile(r'^\n?#+ (.+)$')
        for para in body.split("\n\n"):
          if para.startswith(' '):
            # Paragraphs starting with a space are written as-is.
            self.write('%s', para)
            self.nl()
            self.nl()
            continue

          m = asciidoc_hdr.match(para)
          if m:
            # Pass the title as an argument, never as the format string:
            # a '%' inside the heading text would otherwise be treated as
            # a conversion specifier.
            self.heading('%s', m.group(1))
            self.nl()
            self.nl()
            continue

          self.wrap.add_flowing_data(para)
          self.wrap.end_paragraph(1)
        self.wrap.end_paragraph(0)
Beispiel #29
0
 def new_formatter(self):
     """Return an ``AbstractFormatter`` on ``self._viewer`` with parskip set to 1."""
     fmt = AbstractFormatter(self._viewer)
     # Presetting parskip prevents a blank line at the top of the cell when
     # the content starts with a <P> or header element.
     fmt.parskip = 1
     return fmt
Beispiel #30
0
 def __init__(self):
     """Initialize the parser with a ``NullWriter``-backed formatter."""
     HTMLParser.__init__(self, AbstractFormatter(NullWriter()))
     # Tag names listed as not requiring a closing tag.
     self.requires_no_close = ['img', 'br']
     self.result = []
Beispiel #31
0
 def parseAndGetLinks(self):  # parse HTML, save links
     """Parse the HTML file at ``self.file`` and return its anchor list."""
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     # Close the file deterministically instead of leaking the handle.
     with open(self.file) as html_file:
         self.parser.feed(html_file.read())
     self.parser.close()
     return self.parser.anchorlist
Beispiel #32
0
            def __init__(self, gc):
                """Set up the "help" coloring, the bold heading printer and the formatter."""
                Coloring.__init__(self, gc, "help")
                # Plain paragraphs are sent through this formatter.
                self.wrap = AbstractFormatter(DumbWriter())
                self.heading = self.printer("heading", attr="bold")
	def parseAndGetLinks(self):
		"""Fetch ``self.url``, parse the HTML and return its anchor list."""
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		response = urlopen(self.url)
		try:
			self.parser.feed(response.read())
		finally:
			# Release the connection even when read() or feed() raises.
			response.close()
		self.parser.close()
		return self.parser.anchorlist
Beispiel #34
0
      def __init__(self, gc):
        """Set up the 'help' coloring, the bold heading printer and the formatter."""
        Coloring.__init__(self, gc, 'help')
        # Plain paragraphs are sent through this formatter.
        self.wrap = AbstractFormatter(DumbWriter())
        self.heading = self.printer('heading', attr='bold')
Beispiel #35
0
 def new_formatter(self):
     """Return an ``AbstractFormatter`` on ``self._viewer`` with parskip set to 1."""
     fmt = AbstractFormatter(self._viewer)
     # Presetting parskip prevents a blank line at the top of the cell when
     # the content starts with a <P> or header element.
     fmt.parskip = 1
     return fmt
Beispiel #36
0
 def __init__(self):
     """Initialize the formatter on a ``NullWriter`` and reset its layout state."""
     AbstractFormatter.__init__(self, NullWriter())
     self.cursor = 0
     self.page_width = 60
     self.m_raw = []
Beispiel #37
0
            def __init__(self, gc):
                """Set up the 'help' coloring, the bold heading printer and the formatter."""
                Coloring.__init__(self, gc, 'help')
                # Plain paragraphs are sent through this formatter.
                self.wrap = AbstractFormatter(DumbWriter())
                self.heading = self.printer('heading', attr='bold')
Beispiel #38
0
	def __init__(self):
		"""Initialize the formatter on a ``NullWriter`` and reset its layout state."""
		AbstractFormatter.__init__(self, NullWriter())
		self.cursor = 0
		self.page_width = 60
		self.m_raw = [ ]