コード例 #1
0
ファイル: conv.py プロジェクト: bumatic/WikiTalkParser
def unescapeHTML(s):
	"""Decode HTML entities in *s*; strings without '&' are returned untouched."""
	if '&' not in s:
		return s
	parser = htmllib.HTMLParser(None)
	parser.save_bgn()
	parser.feed(s)
	return parser.save_end()
コード例 #2
0
	def format(self, text):
		"""Feed *text* through an HTML parser and return the SimpleWriter that received the output."""
		sink = SimpleWriter()
		fmt = formatter.AbstractFormatter(sink)
		html_parser = htmllib.HTMLParser(fmt)
		html_parser.feed(text)
		html_parser.close()
		return sink
コード例 #3
0
ファイル: buzz.py プロジェクト: runbrahms/buzz2weibo
    def unescape(self, s):
        """Decode HTML escape sequences (entity/character references) in *s*."""

        html_parser = htmllib.HTMLParser(None)
        html_parser.save_bgn()
        html_parser.feed(s)
        return html_parser.save_end()
コード例 #4
0
def unescape(s):
    """Decode HTML entities in *s*; falsy inputs (None, "") come back as-is."""
    if not s:
        return s
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
コード例 #5
0
ファイル: snippet.py プロジェクト: someburner/GistsHub
def prep_text(intext):
    """Replace literal backslash-r sequences with spaces, then decode HTML entities."""
    # Get rid of the (escaped) line returns before parsing.
    cleaned = intext.replace("\\r", ' ')
    # Convert the entities back into characters.
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(cleaned)
    return parser.save_end()
コード例 #6
0
ファイル: TorrentStraw.py プロジェクト: zone0000/TorrentStraw
 def __get_unescape_u(text_u):
     """Decode HTML entities in *text_u* (presumably unicode, per the name — TODO confirm)."""
     parser = htmllib.HTMLParser(None)
     parser.save_bgn()
     parser.feed(text_u)
     return parser.save_end()
コード例 #7
0
def unescape_entities(html_string):
    """Remove HTML or XML character references and entities from a html
    string (see http://wiki.python.org/moin/EscapingHtml)."""
    p = htmllib.HTMLParser(None)
    p.save_bgn()
    # The parser is fed a byte string; this code targets latin-1 input.
    p.feed(html_string.encode("iso8859-1"))
    return p.save_end()
コード例 #8
0
def create_html_mail(subject,
                     html,
                     text=None,
                     from_addr=None,
                     to_addr=None,
                     headers=None,
                     encoding='UTF-8'):
    """Create a mime-message that will render HTML in popular
    MUAs, text in better ones.

    If *text* is None, a plain-text rendering is derived from *html*
    via htmllib/formatter, with a numbered list of the link targets
    appended for readability.  *headers* is an optional mapping of
    extra header name -> value.  Returns a MIMEMultipart('mixed')
    message containing an 'alternative' part with the plain and HTML
    bodies.
    """
    # Use DumbWriters word wrapping to ensure that no text line
    # is longer than plain_text_maxcols characters.
    plain_text_maxcols = 72

    html = html.encode(encoding)
    if text is None:
        # Produce an approximate textual rendering of the HTML string,
        # unless you have been given a better version as an argument
        textout = StringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = htmllib.HTMLParser(formtext)
        parser.feed(html)
        parser.close()

        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            anchorlist += "[%d] %s\n" % (counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
    else:
        text = text.encode(encoding)

    # if we would like to include images in future, there should
    # probably be 'related' instead of 'mixed'
    msg = MIMEMultipart('mixed')
    # maybe later :)  msg['From'] = Header("%s <%s>" %
    #   (send_from_name, send_from), encoding)
    msg['Subject'] = Header(subject, encoding)
    msg['From'] = from_addr
    msg['To'] = to_addr
    msg['Date'] = formatdate(localtime=True)
    msg["Message-ID"] = email.Utils.make_msgid()
    if headers:
        for key, value in headers.items():
            msg[key] = value
    msg.preamble = 'This is a multi-part message in MIME format.'

    # Attach plain first, HTML second: MUAs display the last alternative
    # they understand.
    alternatives = MIMEMultipart('alternative')
    msg.attach(alternatives)
    alternatives.attach(MIMEText(text, 'plain', _charset=encoding))
    alternatives.attach(MIMEText(html, 'html', _charset=encoding))

    return msg
コード例 #9
0
ファイル: spider.py プロジェクト: pombredanne/paella-svn
 def parse(self):
     """Parse self.body, mainly to collect the HREFs on the parser.

     We're using the parser just to get the HREFs; it could also be used
     to e.g. respect <META NOFOLLOW>.  The parser is kept on self.parser
     so callers can read its anchorlist afterwards.
     """
     # Rendered text is discarded into a throwaway StringIO.
     sink = formatter.DumbWriter(StringIO())
     fmt = formatter.AbstractFormatter(sink)
     self.parser = htmllib.HTMLParser(fmt)
     self.parser.feed(self.body)
     self.parser.close()
コード例 #10
0
ファイル: combineWikiFiles.py プロジェクト: gnusi/lucene_util
def unescapeHTML(s):
    """Decode HTML entities in *s*, skipping the parser entirely when the
    precompiled reHTMLEscape pattern finds nothing to decode."""
    if reHTMLEscape.search(s) is None:
        return s
    p = htmllib.HTMLParser(None)
    p.save_bgn()
    p.feed(s)
    return p.save_end()
コード例 #11
0
def unescape(s):
    """Replace HTML escape sequences in *s* with their equivalent characters."""
    html_parser = htmllib.HTMLParser(None)
    html_parser.save_bgn()
    html_parser.feed(s)
    return html_parser.save_end()
コード例 #12
0
ファイル: spider.py プロジェクト: CaMeLCa5e/Pyspider
def find_links(html):
    """return list of links in HTML"""
    # Rendered output goes nowhere; the parser is used only to
    # accumulate anchor targets on its anchorlist.
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(StringIO())))
    parser.feed(html)
    parser.close()
    return parser.anchorlist
コード例 #13
0
 def extract(self, htmldata, docno, url):
     """Return the list of anchor targets parsed from *htmldata*.

     docno and url are accepted for interface compatibility but are not
     used here.  Any parsing failure yields an empty list rather than
     propagating the exception (best-effort by design).
     """
     try:
         parser = htmllib.HTMLParser(formatter.NullFormatter())
         parser.feed(htmldata)
         return parser.anchorlist
     except Exception:
         # Malformed HTML is common; treat it as "no links".
         # (Original had a dead `pass` before the return and bound the
         # exception to an unused name.)
         return []
コード例 #14
0
def getLinkByHTML2(html):
    """Print and return the first link found in *html*, or "" when there is none."""
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    ptext = htmllib.HTMLParser(fmt)
    ptext.feed(html)
    for link in ptext.anchorlist:
        print(link)
        # Only the first anchor is wanted; return immediately.
        return link
    return ""
コード例 #15
0
ファイル: normalize.py プロジェクト: vijayendra-g/productner
def unescape(s):
    """Decode HTML entities in *s*; on a parser error return *s* unchanged.

    Best-effort by design: malformed input falls back to the raw string.
    """
    p = htmllib.HTMLParser(None)
    p.save_bgn()
    try:
        p.feed(s)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; parse failures still fall back to s.
        return s
    return p.save_end()
コード例 #16
0
def html2txt(htmlblock):
    """Render *htmlblock* to plain text and return it with surrounding whitespace stripped."""
    import htmllib, formatter, StringIO
    buf = StringIO.StringIO('')
    writer = formatter.DumbWriter(buf)
    fmt = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(htmlblock)
    return buf.getvalue().strip()
コード例 #17
0
ファイル: html.py プロジェクト: thi517/unisubs
def unescape(s):
    """Decode HTML entities in *s*, preserving line breaks; the result is stripped."""
    p = htmllib.HTMLParser(formatter.NullFormatter())
    # We need to preserve line breaks: nofill stops the parser from
    # collapsing whitespace, so we don't lose them.
    p.nofill = True
    p.save_bgn()
    p.feed(s)
    return p.save_end().strip()
コード例 #18
0
ファイル: CleanTitle.py プロジェクト: mrrqzhang/FromYahoo
def unescape(s):
    """Decode HTML entities in *s*.

    Raises ValueError (carrying the offending input) if parsing fails.
    """
    p = htmllib.HTMLParser(None)
    p.save_bgn()
    try:
        p.feed(s)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate instead of being converted into ValueError.
        raise ValueError(s)
    return p.save_end()
コード例 #19
0
 def get_data(self):
     '''
     Download data from the Weather Underground website for a given
         stationid, self.startdate, and self.enddate. Each day's html
         is stripped of tags and written as csv to a separate txt file.
         [singleprocessing code, deprecated]
     '''
     logger.info('Download data for stationid: ' + self.stationid +
                 ' [start]')
     # One iteration per calendar day in [startdate, enddate].
     for td in utils.progressbar(range(0, (self.enddate - self.startdate)
                                       .days + 1), "Downloading: ", 60):
         # increase the date by 1 day for the next download
         current_date = self.startdate + timedelta(days=td)
         # set download url (format=1 requests the CSV-like output)
         url = 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=' + \
             self.stationid + '&day=' + str(current_date.day) + '&year=' + \
             str(current_date.year) + '&month=' + \
             str(current_date.month) + '&format=1'
         # define outputfile: <stationid>_YYYYMMDD.txt
         outputfile = self.stationid + '_' + str(current_date.year) \
             + str(current_date.month).zfill(2) + \
             str(current_date.day).zfill(2) + '.txt'
         # check if we want to keep previous downloaded files
         if self.keep:
             if os.path.exists(os.path.join(self.outputdir, outputfile)):
                 # check if filesize is not null
                 if os.path.getsize(os.path.join(self.outputdir,
                                                 outputfile)) > 0:
                     # file exists and is not null, continue next iteration
                     continue
                 else:
                     # file exists but is null, so remove and redownload
                     os.remove(os.path.join(self.outputdir, outputfile))
         elif os.path.exists(os.path.join(self.outputdir, outputfile)):
             os.remove(os.path.join(self.outputdir, outputfile))
         # open outputfile
         with open(os.path.join(self.outputdir, outputfile),
                   'wb') as outfile:
             # open and read the url
             handler = urllib2.urlopen(url)
             content = handler.read()
             # convert spaces to non-breaking spaces so the HTML parser
             # below does not collapse the column spacing
             content = content.replace(' ', '&nbsp;')
             # Removing all the HTML tags from the file
             outstream = cStringIO.StringIO()
             parser = htmllib.HTMLParser(
                 formatter.AbstractFormatter(
                     formatter.DumbWriter(outstream)))
             parser.feed(content)
             # convert spaces back to regular whitespace (' ')
             content = outstream.getvalue().replace('\xa0', ' ')
             # write output
             outfile.write(content)
             # close handler and outstream
             outstream.close()
             handler.close()
         logger.info('Download data for stationid: ' + self.stationid +
                     ' [completed]')
コード例 #20
0
ファイル: getLinks.py プロジェクト: zeus911/mypython
def getLinks():
    website = urllib2.urlopen("http://www.profmcmmillan.com")
    data = website.read()
    website.close()
    Format = formatter.AbstractFormatter(formatter.NullWriter())
    ptext = htmllib.HTMLParser(Format)
    ptext.feed(data)
    for link in ptext.anchorlist:
        print link
コード例 #21
0
 def textFromHtml(self, html):
     """Render *html* to plain text using DumbWriter's default wrapping."""
     buf = cStringIO.StringIO()
     fmt = formatter.AbstractFormatter(formatter.DumbWriter(buf))
     parser = htmllib.HTMLParser(fmt)
     parser.feed(html)
     parser.close()
     return buf.getvalue()
コード例 #22
0
ファイル: spider.py プロジェクト: somapullela/python
def find_links(html):
    """Return a list of links in html."""
    # We're using the parser just to get the HREFs; its rendered output
    # is discarded into a throwaway StringIO.
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(StringIO())))
    parser.feed(html)
    parser.close()
    return parser.anchorlist
コード例 #23
0
ファイル: WebPACScraper.py プロジェクト: alivesay/squire
    def unescape(self, s):
        """Quick and dirty unescaping: drop &nbsp; entities, then decode the rest."""

        p = htmllib.HTMLParser(None)
        p.save_bgn()
        p.feed(s.replace("&nbsp;",""))
        return p.save_end()
コード例 #24
0
 def decodeHTMLCharacterEntities(self, code):
     """Return *code* rendered to plain text with HTML character entities decoded.

     maxcol is set huge so DumbWriter never inserts its own line wraps.
     """
     out = StringIO.StringIO()
     writer = formatter.DumbWriter(out, maxcol=9999999)  # plain text
     parser = htmllib.HTMLParser(formatter.AbstractFormatter(writer))
     parser.feed(code)
     parser.close()
     return out.getvalue()
コード例 #25
0
def unescape(s):
    """Decode HTML entities in *s* (http://wiki.python.org/moin/EscapingHtml).

    Example:
        unescape("Norwegian&lt;&gt;Polish Dictionar TR&#39;App")
        -> "Norwegian<>Polish Dictionar TR'App"
    """
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
コード例 #26
0
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     # Read back the page saved by the download step.
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     # Rendered text is discarded; only the parser's anchorlist is used.
     # NOTE(review): io.StringIO accepts only unicode — under Python 2,
     # DumbWriter emits byte strings, which would raise TypeError here.
     # Confirm which Python version this actually runs on.
     parser = htmllib.HTMLParser(
         formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
コード例 #27
0
 def parse_links(self):
     """Parse the downloaded page and return the list of links it contains."""
     # Read the page saved by the download step.
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     # DumbWriter renders into a throwaway cStringIO buffer so nothing
     # reaches stdout; AbstractFormatter drives it while the parser
     # collects anchors.
     writer = formatter.DumbWriter(cStringIO.StringIO())
     parser = htmllib.HTMLParser(formatter.AbstractFormatter(writer))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
コード例 #28
0
 def get_extra_information(self):
     """Render self.diff to plain text and return it with '&' re-escaped.

     Side effect: self.diff is replaced by its escaped-then-rendered
     form.  Returns "" when there is no diff.
     """
     if not self.diff:
         return ""
     self.diff = self.escape(self.diff)
     buf = cStringIO.StringIO()
     parser = htmllib.HTMLParser(
         formatter.AbstractFormatter(formatter.DumbWriter(buf)))
     parser.feed(self.diff)
     self.diff = buf.getvalue()
     buf.close()
     return self.diff.replace("&", "&amp;")
コード例 #29
0
 def unescape(self, s):
     """Decode HTML entities in *s*, then scrub link artifacts.

     After decoding: drops rel="nofollow" attributes, percent-encodes
     '&' as '%26', and removes "[1]" footnote markers — in that order.
     """
     parser = htmllib.HTMLParser(None)
     parser.save_bgn()
     parser.feed(str(s))
     decoded = parser.save_end()
     decoded = decoded.replace(' rel="nofollow"', '')
     decoded = decoded.replace('&', '%26')
     decoded = decoded.replace('[1]', '')
     return decoded
コード例 #30
0
    def pretty_print(self):
        """Print a pretty (formatted) version of the HTML content.

        If the content is not text/html then it is just printed.
        """
        if not self.headers['content-type'].lower().startswith('text/html'):
            # Non-HTML payloads are emitted verbatim.
            print self.contents
        else:
            # DumbWriter with no stream argument writes the rendered
            # text straight to stdout.
            parser = htmllib.HTMLParser(
                formatter.AbstractFormatter(formatter.DumbWriter()))
            parser.feed(self.contents)
            parser.close()