Code example #1
    def set_and_parse_xhtml(self, xhtml=None):
        """
        Takes an xhtml text string and parses it using the tkHTMLWriter/Parser system.
        """
        # Ensure the input is ok:
        if xhtml is None:
            xhtml = self.Experiment.getWikiXhtml()
        if xhtml is None:
            xhtml = ""
        else:
            # replace non-breaking spaces with plain spaces (assumed intent)
            xhtml = xhtml.replace(u'\xa0', ' ')

        # prepare the text widget:
        self.text.config(state="normal")
        self.text.delete("1.0", "end")
        self.text.update_idletasks()
        if not xhtml:
            logger.debug("No xhtml, aborting...")
            self.text.config(state="disabled")
            return xhtml
        # Write the xhtml to the text widget:
        writer = tkHTMLWriter(self.text)
        fmt = formatter.AbstractFormatter(writer)
        parser = tkHTMLParser(fmt)
        parser.feed(xhtml)
        parser.close()
        # Finally, disable the text widget again
        self.text.config(state="disabled")
        logger.debug("(%s) text area updated with parsed/formatted html from string of length %s", self.__class__.__name__, len(xhtml) if xhtml else xhtml)
Code example #2
 def OpenURL(self, url):
     from htmllib import HTMLParser
     import formatter
     self.url = url
     m = re.match(r'http://([^/]+)(/\S*)\s*', url)
     if m:
         host = m.groups()[0]
         path = m.groups()[1]
     else:
         m = re.match(r'http://(\S+)\s*', url)
         if not m:
             # Invalid URL
             self.logprint("Invalid or unsupported URL: %s" % (url))
             return
         host = m.groups()[0]
         path = ''
     f = self.RetrieveAsFile(host, path)
     if not f:
         self.logprint("Could not open %s" % (url))
         return
     self.logprint("Receiving data...")
     data = f.read()
     tmp = open('hangman_dict.txt', 'w')
     fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
     p = HTMLParser(fmt)
     self.logprint("Parsing data...")
     p.feed(data)
     p.close()
     tmp.close()
Code example #3
 def __init__(self, viewer, reload=0):
     global _inited
     self.viewer = viewer
     self.reload = reload
     self.context = self.viewer.context
     self.app = self.context.app
     self.load_dingbat = self.app.load_dingbat
     self.loaded = []
     self.current_map = None
     self.target = None
     self.formatter_stack = []
     fmt = formatter.AbstractFormatter(self.viewer)
     HTMLParser.__init__(self, fmt)
     self.push_formatter(fmt)
     if not _inited:
         _inited = 1
         init_module(self.app.prefs)
     self._ids = {}
     # Hackery so reload status can be reset when all applets are loaded
     import AppletLoader
     self.reload1 = self.reload and AppletLoader.set_reload(self.context)
     if self.reload1:
         self.reload1.attach(self)
     if self.app.prefs.GetBoolean('parsing-html', 'strict'):
         self.sgml_parser.restrict(0)
     # Information from <META ... CONTENT="..."> is collected here.
     # Entries are KEY --> [(NAME, HTTP-EQUIV, CONTENT), ...], where
     # KEY is (NAME or HTTP-EQUIV).
     self._metadata = {}
Code example #4
    def __init__(self,
                 parent,
                 startUrlCallback=None,
                 endUrlCallback=None,
                 enterLinkCallback=None,
                 leaveLinkCallback=None,
                 *args,
                 **kw):

        self.startUrlCallback = startUrlCallback
        self.endUrlCallback = endUrlCallback
        self.enterLinkCallback = enterLinkCallback
        self.leaveLinkCallback = leaveLinkCallback

        self.initTags()

        self.protocol = ''
        self.location = ''
        self.path = ''
        self.dir = ''

        formatter.NullWriter.__init__(self)
        self.formatter = formatter.AbstractFormatter(self)
        htmllib.HTMLParser.__init__(self, self.formatter)
        ScrolledText.__init__(self, parent, *args, **kw)
        self.text_area.config(font=(font_family, default_size))

        self.bind('<Enter>', lambda event: self.focus())
        self.bind('<KeyPress-Prior>', lambda event: self.pageChange(-1))
        self.bind('<KeyPress-Next>', lambda event: self.pageChange(1))
Code example #5
def test(args = None):
    import sys, formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        file = args[0]
    else:
        file = 'test.html'
    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print file, ':', msg
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Code example #6
def test(args=None):
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        fn = args[0]
    else:
        fn = 'test.html'

    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)

    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(f)
    p.feed(data)
    p.close()
Code example #7
def extractFromHTML(html, blur=5):
    """
    Extracts text from HTML content.
    """
    html = unicode(html)
    assert isinstance(html, unicode)

    # Create memory file.
    file = StringIO()

    # Convert html to text.
    f = formatter.AbstractFormatter(formatter.DumbWriter(file))
    p = TextExtractor()
    p.pathBlur = blur
    p.feed(html)
    p.close()
    text = p.get_plaintext()

    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()

    # Compress whitespace.
    text = re.sub(r"[\n\s]+", " ", text).strip()

    # Remove consecutive dashes.
    text = re.sub(r"\-{2,}", "", text).strip()

    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()

    return text
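
A hedged usage sketch; TextExtractor is defined elsewhere in the project, and the markup below is made up for illustration:

# blur feeds TextExtractor.pathBlur, as set above.
text = extractFromHTML(u'<html><body><p>Some article text.</p></body></html>')
print text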
Code example #8
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
Code example #9
	def format(self, text):
		writer = SimpleWriter()
		format = formatter.AbstractFormatter(writer)
		parser = htmllib.HTMLParser(format)
		parser.feed(text)
		parser.close()
		return writer
Code example #10
def create_html_mail(subject,
                     html,
                     text=None,
                     from_addr=None,
                     to_addr=None,
                     headers=None,
                     encoding='UTF-8'):
    """Create a mime-message that will render HTML in popular
    MUAs, text in better ones.
    """
    # Use DumbWriter's word wrapping to ensure that no text line
    # is longer than plain_text_maxcols characters.
    plain_text_maxcols = 72

    html = html.encode(encoding)
    if text is None:
        # Produce an approximate textual rendering of the HTML string,
        # unless you have been given a better version as an argument
        textout = StringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = htmllib.HTMLParser(formtext)
        parser.feed(html)
        parser.close()

        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            anchorlist += "[%d] %s\n" % (counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
    else:
        text = text.encode(encoding)

    # if we want to include images in the future, this should
    # probably be 'related' instead of 'mixed'
    msg = MIMEMultipart('mixed')
    # maybe later :)  msg['From'] = Header("%s <%s>" %
    #   (send_from_name, send_from), encoding)
    msg['Subject'] = Header(subject, encoding)
    msg['From'] = from_addr
    msg['To'] = to_addr
    msg['Date'] = formatdate(localtime=True)
    msg["Message-ID"] = email.Utils.make_msgid()
    if headers:
        for key, value in headers.items():
            msg[key] = value
    msg.preamble = 'This is a multi-part message in MIME format.'

    alternatives = MIMEMultipart('alternative')
    msg.attach(alternatives)
    alternatives.attach(MIMEText(text, 'plain', _charset=encoding))
    alternatives.attach(MIMEText(html, 'html', _charset=encoding))

    return msg
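
A hedged usage sketch for create_html_mail; the addresses and the local SMTP server are illustrative assumptions, not part of the original example:

# Hypothetical addresses/server, for illustration only.
msg = create_html_mail(
    u'Weekly report',
    u'<h1>Report</h1><p>See <a href="http://example.com">details</a>.</p>',
    from_addr='sender@example.com',
    to_addr='recipient@example.com')
import smtplib
server = smtplib.SMTP('localhost')
server.sendmail('sender@example.com', ['recipient@example.com'],
                msg.as_string())
server.quit()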
Code example #11
File: spider.py Project: pombredanne/paella-svn
 def parse(self):
     # We're using the parser just to get the HREFs
     # We should also use it to e.g. respect <META NOFOLLOW>
     w = formatter.DumbWriter(StringIO())
     f = formatter.AbstractFormatter(w)
     self.parser = htmllib.HTMLParser(f)
     self.parser.feed(self.body)
     self.parser.close()
Code example #12
 def renderOn(self, aPiddleCanvas):
     '''draw the text with aPiddleCanvas
         jjk  02/01/00'''
     writer = _HtmlPiddleWriter(self, aPiddleCanvas)
     fmt = formatter.AbstractFormatter(writer)
     parser = _HtmlParser(fmt)
     parser.feed(self.html)
     parser.close()
Code example #13
File: autoxml.py Project: pars-linux/pisi-devel
 def print_text(self, file=sys.stdout):
     w = Writer(file)  # plain text
     f = formatter.AbstractFormatter(w)
     errs = []
     self.format(f, errs)
     if errs:
         for x in errs:
             ctx.ui.warning(x)
Code example #14
def html2txt(htmlblock):
    import htmllib, formatter, StringIO
    s = StringIO.StringIO('')
    w = formatter.DumbWriter(s)
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(htmlblock)
    p.close()  # flush the parser before reading the result
    return s.getvalue().strip()
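
A minimal usage sketch of html2txt; the sample markup is made up for illustration:

# Tags are dropped and entities decoded by the htmllib/formatter pipeline.
print html2txt('<p>Hello &amp; welcome to <b>Python</b></p>')
# -> Hello & welcome to Python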
Code example #15
File: spider.py Project: CaMeLCa5e/Pyspider
def find_links(html):
    """return list of links in HTML"""
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
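
A usage sketch, assuming the page is fetched with Python 2's urllib (the URL is a placeholder):

import urllib
html = urllib.urlopen('http://example.com/').read()
for link in find_links(html):
    print link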
Code example #16
def content(path, archivo, output):
    f = formatter.AbstractFormatter(AlmostNullWriter())
    parser = TocHlpHtmlParser(f)
    parser.path = path
    parser.ft = output
    fil = path + '/' + archivo
    parser.feed(open(fil).read())
    parser.close()
Code example #17
def getLinkByHTML2(html):
    format = formatter.AbstractFormatter(formatter.NullWriter())
    ptext = htmllib.HTMLParser(format)
    ptext.feed(html)
    for link in ptext.anchorlist:
        print(link)
        return link  # only the first link is returned
    return ""
Code example #18
 def get_data(self):
     '''
     Download data from the Weather Underground website for a given
         stationid, a startyear, and an endyear. The html file is parsed
         and written as csv to a separate txt file for each day.
         [singleprocessing code, deprecated]
     '''
     logger.info('Download data for stationid: ' + self.stationid +
                 ' [start]')
     for td in utils.progressbar(range(0, (self.enddate - self.startdate)
                                       .days + 1), "Downloading: ", 60):
         # increase the date by 1 day for the next download
         current_date = self.startdate + timedelta(days=td)
         # set download url
         url = 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=' + \
             self.stationid + '&day=' + str(current_date.day) + '&year=' + \
             str(current_date.year) + '&month=' + \
             str(current_date.month) + '&format=1'
         # define outputfile
         outputfile = self.stationid + '_' + str(current_date.year) \
             + str(current_date.month).zfill(2) + \
             str(current_date.day).zfill(2) + '.txt'
         # check if we want to keep previous downloaded files
         if self.keep:
             if os.path.exists(os.path.join(self.outputdir, outputfile)):
                 # check if filesize is not null
                 if os.path.getsize(os.path.join(self.outputdir,
                                                 outputfile)) > 0:
                     # file exists and is not null, continue next iteration
                     continue
                 else:
                     # file exists but is null, so remove and redownload
                     os.remove(os.path.join(self.outputdir, outputfile))
         elif os.path.exists(os.path.join(self.outputdir, outputfile)):
             os.remove(os.path.join(self.outputdir, outputfile))
         # open outputfile
         with open(os.path.join(self.outputdir, outputfile),
                   'wb') as outfile:
             # open and read the url
             handler = urllib2.urlopen(url)
             content = handler.read()
             # convert spaces to non-breaking spaces
             content = content.replace(' ', '&nbsp;')
             # Removing all the HTML tags from the file
             outstream = cStringIO.StringIO()
             parser = htmllib.HTMLParser(
                 formatter.AbstractFormatter(
                     formatter.DumbWriter(outstream)))
             parser.feed(content)
             # convert spaces back to regular whitespace (' ')
             content = outstream.getvalue().replace('\xa0', ' ')
             # write output
             outfile.write(content)
             # close handler and outstream
             outstream.close()
             handler.close()
     logger.info('Download data for stationid: ' + self.stationid +
                 ' [completed]')
Code example #19
File: spider.py Project: somapullela/python
def find_links(html):
    """Return a list of links in html."""
    # We're using the parser just to get the HREFs
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
Code example #20
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Code example #21
File: getLinks.py Project: zeus911/mypython
def getLinks():
    website = urllib2.urlopen("http://www.profmcmmillan.com")
    data = website.read()
    website.close()
    Format = formatter.AbstractFormatter(formatter.NullWriter())
    ptext = htmllib.HTMLParser(Format)
    ptext.feed(data)
    for link in ptext.anchorlist:
        print link
Code example #22
File: h2parser.py Project: eox03y/OldWorks
    def __init__(self, prn=0):
        if prn:
            format = formatter.AbstractFormatter(formatter.DumbWriter())
        else:
            format = formatter.NullFormatter()

        htmllib.HTMLParser.__init__(self, format)
        self.depth = 0
        self.stack = []
Code example #23
 def textFromHtml(self, html):
     textout = cStringIO.StringIO()
     formtext = formatter.AbstractFormatter(formatter.DumbWriter(textout))
     parser = htmllib.HTMLParser(formtext)
     parser.feed(html)
     parser.close()
     text = textout.getvalue()
     del textout, formtext, parser
     return text
Code example #24
def get_urls(url):
    data = urllib.urlopen(url).read()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
Code example #25
 def parse_links(self):
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Code example #26
 def decodeHTMLCharacterEntities(self, code):
     #print code
     outputstring = StringIO.StringIO()
     w = formatter.DumbWriter(outputstring, maxcol=9999999)  # plain text
     f = formatter.AbstractFormatter(w)
     p = htmllib.HTMLParser(f)
     p.feed(code)
     p.close()
     #print outputstring.getvalue()
     return (outputstring.getvalue())
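
A quick sketch of what the decoder returns; the input string and the decoder instance are assumptions for illustration:

# decoder is an instance of the surrounding class.
text = decoder.decodeHTMLCharacterEntities('Fish &amp; Chips &copy; 2010')
# -> 'Fish & Chips \xa9 2010'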
Code example #27
def urlparser2():
    import urllib.request
    from html.parser import HTMLParser

    # html.parser.HTMLParser accepts no formatter and keeps no anchorlist,
    # so collect the hrefs with a small subclass instead.
    class AnchorParser(HTMLParser):
        def __init__(self):
            super().__init__()
            self.anchorlist = []
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.anchorlist.append(value)

    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8', 'ignore')
    ptext = AnchorParser()
    ptext.feed(data)
    for link in ptext.anchorlist:
        print(link)
Code example #28
File: crawl.py Project: yfang1644/web_training
 def parse_link(self):
     'Parse out the links found in the downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Code example #29
 def parse_links(self):  # parse the links out of the downloaded page
     f = open(self.file, 'r')  # read the downloaded page
     data = f.read()
     f.close()
     parser = htmllib.HTMLParser(
         formatter.AbstractFormatter(  # AbstractFormatter drives parsing of the data
             formatter.DumbWriter(cStringIO.StringIO()))
     )  # DumbWriter emits the content; cStringIO keeps it off stdout (writing to a file would be better)
     parser.feed(data)
     parser.close()
     return parser.anchorlist  # return the list of parsed anchors
Code example #30
def urlparser():
    import sys
    import urllib.request
    from html.parser import HTMLParser

    # html.parser.HTMLParser accepts no formatter object, so dump the
    # text content from handle_data() instead of using DumbWriter.
    class TextDumper(HTMLParser):
        def handle_data(self, data):
            sys.stdout.write(data)

    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8', 'ignore')
    ptext = TextDumper()
    ptext.feed(data)
    ptext.close()