Ejemplo n.º 1
0
def create_plaintext_message(message):
    """Build a plain-text rendering of an HTML email body.

    The message is decoded as UTF-8 (undecodable bytes are ignored), passed
    through an lxml ``Cleaner`` configured to drop javascript and style
    content, and rendered to text wrapped at 72 columns.  Every link the
    parser collected is appended as a numbered endnote below a dashed rule;
    links starting with ``https://`` are listed with ``http://`` instead.

    Returns the rendered text followed by the endnote list.
    """
    wrap_at = 72

    # Sanitize the markup before rendering it.
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    scrubber.kill_tags = ['style']
    tree = lxml.html.fromstring(message.decode('utf-8', 'ignore'))
    sanitized = lxml.html.tostring(scrubber.clean_html(tree))

    # Render the sanitized markup into an in-memory buffer.
    buf = cStringIO.StringIO()
    renderer = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(buf, wrap_at)))
    renderer.feed(sanitized)
    renderer.close()

    # The links go at the bottom so the running text stays readable.
    footer = "\n\n" + ("-" * wrap_at) + "\n\n"
    for number, link in enumerate(renderer.anchorlist, 1):
        if link.startswith('https://'):
            link = link.replace('https://', 'http://')
        footer += "[%d] %s\n" % (number, link)
    return buf.getvalue() + footer
Ejemplo n.º 2
0
def test(args=None):
    """Parse an HTML file and render it as text (command-line style driver).

    ``args`` falls back to ``sys.argv[1:]`` when empty or ``None``.  A
    leading ``-s`` is removed from the list in place and selects a
    ``NullFormatter``; otherwise output goes through an
    ``AbstractFormatter`` over a default ``DumbWriter``.  The next argument
    names the file to read: ``test.html`` when omitted, ``-`` for standard
    input.  If the file cannot be opened the error is printed and the
    process exits with status 1.
    """
    import sys
    import formatter

    argv = args if args else sys.argv[1:]

    quiet = argv and argv[0] == '-s'
    if quiet:
        del argv[0]

    source = argv[0] if argv else 'test.html'

    if source == '-':
        markup = sys.stdin.read()
    else:
        try:
            with open(source, 'rt') as stream:
                markup = stream.read()
        except IOError as err:
            print(source, ":", err)
            sys.exit(1)

    if quiet:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    html_parser = HTMLParser(fmt)
    html_parser.feed(markup)
    html_parser.close()
Ejemplo n.º 3
0
 def OpenURL(self, url):
     """Fetch an ``http://`` URL and write its text rendering to a file.

     The URL is split into host and path (the path is empty when the URL
     has none), the document is fetched with ``self.RetrieveAsFile`` and
     its HTML is rendered as plain text into ``hangman_dict.txt`` in the
     current directory.  Progress and failures are reported through
     ``self.logprint``; the method returns ``None`` in every case.

     url: the address to fetch; only ``http://`` URLs are recognised.
     """
     from htmllib import HTMLParser
     import formatter
     self.url = url
     m = re.match(r'http://([^/]+)(/\S*)\s*', url)
     if m:
         host, path = m.groups()
     else:
         # No path component: accept a bare "http://host" form.
         m = re.match(r'http://(\S+)\s*', url)
         if not m:
             # Invalid URL
             self.logprint("Invalid or unsupported URL: %s" % (url))
             return
         host = m.groups()[0]
         path = ''
     f = self.RetrieveAsFile(host, path)
     if not f:
         self.logprint("Could not open %s" % (url))
         return
     self.logprint("Receiving data...")
     data = f.read()
     # The with-block guarantees the output file is closed even if the
     # parser raises part-way through.
     with open('hangman_dict.txt', 'w') as tmp:
         fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
         p = HTMLParser(fmt)
         self.logprint("Parsing data...")
         p.feed(data)
         p.close()
Ejemplo n.º 4
0
def test(args = None):
    import sys, formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        file = args[0]
    else:
        file = 'test.html'
    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print file, ':', msg
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Ejemplo n.º 5
0
def extractFromHTML(html, blur=5):
    """
    Extract cleaned-up text from HTML content.

    The markup is coerced to ``unicode`` and fed to a ``TextExtractor``
    whose ``pathBlur`` attribute is set to ``blur``.  The extracted text is
    then normalised: stand-alone punctuation is replaced by a space,
    whitespace runs collapse to a single space, and runs of two or more
    dashes or periods are removed.

    Returns the normalised text.
    """
    html = unicode(html)

    # Convert html to text.
    p = TextExtractor()
    p.pathBlur = blur
    p.feed(html)
    p.close()
    text = p.get_plaintext()

    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()

    # Compress whitespace.
    text = re.sub(r"[\n\s]+", " ", text).strip()

    # Remove consecutive dashes.
    text = re.sub(r"\-{2,}", "", text).strip()

    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()

    return text
Ejemplo n.º 6
0
def create_html_mail(subject,
                     html,
                     text=None,
                     from_addr=None,
                     to_addr=None,
                     headers=None,
                     encoding='UTF-8'):
    """Build a MIME message carrying an HTML body and a plain-text twin.

    ``html`` (and ``text`` when supplied) are encoded with ``encoding``.
    When ``text`` is ``None`` an approximate rendering of the HTML is
    produced, wrapped at 72 columns, with every link the parser collected
    listed as a numbered endnote below a dashed rule.

    ``headers`` is an optional mapping of extra header names to values.
    Returns a ``multipart/mixed`` message containing a
    ``multipart/alternative`` part with the plain and HTML bodies.
    """
    # DumbWriter's word wrapping keeps every text line within this width.
    wrap_at = 72

    html = html.encode(encoding)
    if text is not None:
        text = text.encode(encoding)
    else:
        # No hand-written text version was given: approximate one from
        # the HTML.
        buf = StringIO.StringIO()
        renderer = htmllib.HTMLParser(
            formatter.AbstractFormatter(
                formatter.DumbWriter(buf, wrap_at)))
        renderer.feed(html)
        renderer.close()

        # Links are listed at the bottom to keep the message readable.
        footer = "\n\n" + ("-" * wrap_at) + "\n\n"
        for number, link in enumerate(renderer.anchorlist, 1):
            footer += "[%d] %s\n" % (number, link)

        text = buf.getvalue() + footer

    # If images are ever included, 'related' is probably a better
    # container type than 'mixed'.
    msg = MIMEMultipart('mixed')
    msg['Subject'] = Header(subject, encoding)
    msg['From'] = from_addr
    msg['To'] = to_addr
    msg['Date'] = formatdate(localtime=True)
    msg["Message-ID"] = email.Utils.make_msgid()
    for key, value in (headers.items() if headers else ()):
        msg[key] = value
    msg.preamble = 'This is a multi-part message in MIME format.'

    body_parts = MIMEMultipart('alternative')
    msg.attach(body_parts)
    for payload, subtype in ((text, 'plain'), (html, 'html')):
        body_parts.attach(MIMEText(payload, subtype, _charset=encoding))

    return msg
Ejemplo n.º 7
0
 def parse(self):
     """Run ``self.body`` through a new ``htmllib.HTMLParser``.

     The parser is stored on ``self.parser``; its rendered text goes to a
     throw-away ``StringIO``.
     """
     # The parser is used only to collect the HREFs.  It could also be
     # used to e.g. respect <META NOFOLLOW>.
     sink = formatter.DumbWriter(StringIO())
     self.parser = htmllib.HTMLParser(formatter.AbstractFormatter(sink))
     self.parser.feed(self.body)
     self.parser.close()
Ejemplo n.º 8
0
def html2txt(htmlblock):
    """Render an HTML fragment as plain text.

    The fragment is fed to an ``htmllib.HTMLParser`` writing into an
    in-memory buffer.  Returns the buffer contents with leading and
    trailing whitespace stripped.
    """
    import htmllib
    import formatter
    import StringIO
    s = StringIO.StringIO('')
    w = formatter.DumbWriter(s)
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(htmlblock)
    # close() makes the parser process any input it is still buffering
    # (e.g. an unterminated trailing tag) before the result is read.
    p.close()
    return s.getvalue().strip()
Ejemplo n.º 9
0
def find_links(html):
    """Return the parser's ``anchorlist`` after parsing ``html``.

    The rendered text is written to a throw-away ``StringIO``.
    """
    sink = StringIO()
    link_parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(sink)))
    link_parser.feed(html)
    link_parser.close()
    return link_parser.anchorlist
Ejemplo n.º 10
0
    def end_h4(self):
        """Redirect output to a fresh buffer for the current h4 title.

        When ``self.h4_title`` is set, a new ``StringIO`` is stored under
        that title in ``self.help_strings`` and installed as the target of
        the formatter's writer (a ``DumbWriter`` with ``maxcol`` 800).
        ``self.h4`` is cleared in every case.
        Presumably called by the parser on a closing ``h4`` tag.
        """
        title = self.h4_title
        if title:
            sink = StringIO.StringIO("")
            self.help_strings[title] = sink
            self.formatter.writer = formatter.DumbWriter(sink, 800)

        self.h4 = False
Ejemplo n.º 11
0
 def get_data(self):
     '''
     Download Weather Underground daily history for ``self.stationid``.

     One request is made for every day from ``self.startdate`` to
     ``self.enddate`` (inclusive).  The HTML markup is stripped from each
     response and the remaining text is written to
     ``<stationid>_<YYYYMMDD>.txt`` inside ``self.outputdir``.  When
     ``self.keep`` is true an existing non-empty file is left alone;
     any other existing file is removed and downloaded again.
     [single-process code, deprecated]
     '''
     logger.info('Download data for stationid: ' + self.stationid +
                 ' [start]')
     ndays = (self.enddate - self.startdate).days + 1
     for td in utils.progressbar(range(0, ndays), "Downloading: ", 60):
         # increase the date by 1 day for the next download
         current_date = self.startdate + timedelta(days=td)
         # set download url
         url = 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=' + \
             self.stationid + '&day=' + str(current_date.day) + '&year=' + \
             str(current_date.year) + '&month=' + \
             str(current_date.month) + '&format=1'
         # define outputfile
         outputfile = self.stationid + '_' + str(current_date.year) \
             + str(current_date.month).zfill(2) + \
             str(current_date.day).zfill(2) + '.txt'
         outputpath = os.path.join(self.outputdir, outputfile)
         if os.path.exists(outputpath):
             # keep a previously downloaded file only if it is non-empty
             if self.keep and os.path.getsize(outputpath) > 0:
                 continue
             # otherwise remove it and download again
             os.remove(outputpath)
         with open(outputpath, 'wb') as outfile:
             # read the url; the handle is closed even if the read fails
             handler = urllib2.urlopen(url)
             try:
                 content = handler.read()
             finally:
                 handler.close()
             # convert spaces to non-breaking spaces
             content = content.replace(' ', '&nbsp;')
             # Removing all the HTML tags from the file
             outstream = cStringIO.StringIO()
             parser = htmllib.HTMLParser(
                 formatter.AbstractFormatter(
                     formatter.DumbWriter(outstream)))
             parser.feed(content)
             # make the parser process any input it is still buffering
             parser.close()
             # convert spaces back to regular whitespace (' ')
             content = outstream.getvalue().replace('\xa0', ' ')
             outfile.write(content)
             outstream.close()
     # Logged once, after every day has been processed.
     logger.info('Download data for stationid: ' + self.stationid +
                 ' [completed]')
Ejemplo n.º 12
0
def find_links(html):
    """Parse ``html`` and return the ``anchorlist`` the parser collected."""
    # The rendered text is not needed, so it goes to a scratch buffer;
    # the parser is run only to gather the HREFs.
    scratch = formatter.DumbWriter(StringIO())
    href_parser = htmllib.HTMLParser(formatter.AbstractFormatter(scratch))
    href_parser.feed(html)
    href_parser.close()
    hrefs = href_parser.anchorlist
    return hrefs
Ejemplo n.º 13
0
def get_urls(url):
    """Fetch ``url`` and return the ``anchorlist`` collected from the page.

    The rendered text is written to a throw-away in-memory buffer.
    """
    response = urllib.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
Ejemplo n.º 14
0
 def textFromHtml(self, html):
     """Render ``html`` as plain text and return the result.

     The markup is parsed by ``htmllib.HTMLParser`` writing through a
     ``DumbWriter`` into an in-memory buffer.
     """
     buf = cStringIO.StringIO()
     renderer = htmllib.HTMLParser(
         formatter.AbstractFormatter(formatter.DumbWriter(buf)))
     renderer.feed(html)
     renderer.close()
     return buf.getvalue()
Ejemplo n.º 15
0
    def __init__(self, prn=0):
        """Initialise the parser and its ``depth``/``stack`` bookkeeping.

        prn: when truthy the parser formats through an
            ``AbstractFormatter`` over a default ``DumbWriter``;
            otherwise a ``NullFormatter`` is used.
        """
        fmt = (formatter.AbstractFormatter(formatter.DumbWriter())
               if prn else formatter.NullFormatter())

        htmllib.HTMLParser.__init__(self, fmt)
        self.stack = []
        self.depth = 0
Ejemplo n.º 16
0
 def parse_links(self):
     """Parse the file at ``self.file`` and return the parser's ``anchorlist``.

     The rendered text is written to a throw-away in-memory buffer.
     """
     # The with-block closes the file even if the read fails.
     with open(self.file, 'r') as f:
         data = f.read()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Ejemplo n.º 17
0
    def start_a(self, attrs):
        """Start capturing output for an anchor that has a ``name``.

        ``attrs`` is converted to a dict.  If it has a ``name`` entry, a
        new ``StringIO`` is stored under that name in
        ``self.help_strings`` and installed as the target of the
        formatter's writer (a ``DumbWriter`` with ``maxcol`` 800).
        Anchors without a ``name`` are ignored.
        """
        attr_map = dict(attrs)
        if "name" not in attr_map:
            return

        sink = StringIO.StringIO("")
        self.help_strings[attr_map["name"]] = sink
        self.formatter.writer = formatter.DumbWriter(sink, 800)
Ejemplo n.º 18
0
 def parse_links(self):
     """Parse the file at ``self.file`` and return the parser's ``anchorlist``.

     The rendered text is written to a throw-away in-memory buffer.
     """
     # The with-block closes the file even if the read fails.
     with open(self.file, 'r') as f:
         data = f.read()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Ejemplo n.º 19
0
 def decodeHTMLCharacterEntities(self, code):
     """Render ``code`` through ``htmllib.HTMLParser`` and return the text.

     The writer is created with ``maxcol=9999999``.
     """
     sink = StringIO.StringIO()
     # plain text writer
     renderer = htmllib.HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(sink, maxcol=9999999)))
     renderer.feed(code)
     renderer.close()
     return sink.getvalue()
Ejemplo n.º 20
0
 def parse_link(seld):
     """Parse the file at ``seld.file`` and return the parser's ``anchorlist``.

     ``seld`` is the instance (the conventional ``self``); the parameter
     name is kept as written.  The rendered text is written to a
     throw-away in-memory buffer.
     """
     # Read the path held in the instance attribute, not a file that is
     # literally named 'seld.file'.  The with-block closes the file even
     # if the read fails.
     with open(seld.file, 'r') as f:
         data = f.read()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Ejemplo n.º 21
0
 def parse_links(self):
     """Parse the downloaded page at ``self.file`` and return its links.

     Returns the parser's ``anchorlist``.
     """
     # Read the downloaded page; the with-block closes the file even if
     # the read fails.
     with open(self.file, 'r') as f:
         data = f.read()
     # AbstractFormatter handles the parsed data and DumbWriter emits it;
     # writing into a cStringIO buffer keeps the text off standard output.
     parser = htmllib.HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
def urlparser():
    """Fetch the page at the global ``url`` and feed it to an HTML parser.

    ``url`` is not a parameter; it is read from the enclosing scope.  The
    response body is decoded with the charset announced in the response
    headers, falling back to UTF-8 with replacement of undecodable bytes.
    """
    import urllib.request
    import formatter
    import sys
    from html.parser import HTMLParser
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'
    # Decode the bytes; str() on bytes would produce the "b'...'" repr
    # with escaped newlines instead of the page text.
    data = raw.decode(charset, 'replace')
    fmt = formatter.AbstractFormatter(formatter.DumbWriter(sys.stdout))
    # NOTE(review): recent versions of html.parser.HTMLParser do not take
    # a formatter argument, and the formatter module has been removed from
    # recent Python releases -- confirm the interpreter this targets.
    ptext = HTMLParser(fmt)
    ptext.feed(data)
    ptext.close()
Ejemplo n.º 23
0
 def parse_links(self):
     """Parse the downloaded page at ``self.file`` and return its links.

     The file is read through ``codecs.open`` as UTF-8.  Returns the
     parser's ``anchorlist``.
     """
     # Read the downloaded page; the with-block closes the file even if
     # the read fails.
     with codecs.open(self.file, 'rb', 'utf-8') as f:
         data = f.read()
     # AbstractFormatter handles the parsed data and DumbWriter emits it;
     # writing into an in-memory buffer keeps the text off standard output.
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Ejemplo n.º 24
0
 def parse_links(self):
     """Fetch all links from the page saved at ``self.save_file``.

     Returns the parser's ``anchorlist``; the rendered text is written to
     a throw-away in-memory buffer.
     """
     # The with-block closes the file even if the read fails.
     with open(self.save_file, 'r') as f:
         data = f.read()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Ejemplo n.º 25
0
 def get_content(self, document):
     """
     Get the content or the text from the document.
     Document can be Html document or text file

     The document is fed to ``htmllib.HTMLParser`` writing into an
     in-memory buffer; the buffer contents are returned.
     """
     outstream = cStringIO.StringIO()
     parser = htmllib.HTMLParser(
         formatter.AbstractFormatter(formatter.DumbWriter(outstream)))
     parser.feed(document)
     # close() makes the parser process any input it is still buffering
     # before the result is read.
     parser.close()
     content = outstream.getvalue()
     outstream.close()
     return content
Ejemplo n.º 26
0
def parseDOM(gb):
    w = formatter.DumbWriter() # plain text
    f = formatter.AbstractFormatter(w)

    for ff in os.listdir(os.path.join(hPATH)):
        
        command='python parser.py {} > {}'\
        .format(shellquote(os.path.join(hPATH,ff)),shellquote(os.path.join(HOME,'html',gb+'_parsed','parsed_'+os.path.splitext(os.path.basename(os.path.join(hPATH,ff)))[0]+".txt")))
        
        print command
        
        result=os.system(command)
Ejemplo n.º 27
0
    def pretty_print(self):
        """Print a pretty (formatted) version of the HTML content.

        If the content is not text/html then it is just printed.
        """
        if not self.headers['content-type'].lower().startswith('text/html'):
            print self.contents
        else:
            parser = htmllib.HTMLParser(
                formatter.AbstractFormatter(formatter.DumbWriter()))
            parser.feed(self.contents)
            parser.close()
Ejemplo n.º 28
0
 def get_extra_information(self):
     """Return ``self.diff`` rendered as text with ``&`` escaped.

     When ``self.diff`` is set it is passed through ``self.escape``,
     rendered by ``htmllib.HTMLParser`` and stored back on ``self.diff``.
     The return value is that text with every ``&`` replaced by
     ``&amp;``.  An empty string is returned when ``self.diff`` is falsy.
     """
     text = ""
     if self.diff:
         self.diff = self.escape(self.diff)
         outstream = cStringIO.StringIO()
         p = htmllib.HTMLParser(
             formatter.AbstractFormatter(formatter.DumbWriter(outstream)))
         p.feed(self.diff)
         # close() makes the parser process any input it is still
         # buffering before the result is read.
         p.close()
         self.diff = outstream.getvalue()
         outstream.close()
         text = self.diff.replace("&", "&amp;")
     return text
Ejemplo n.º 29
0
def apply_htmlparser(html, maxcol=MAXCOL, codec='utf8'):
    """Render an HTML string as text using a Unicode-aware htmllib parser.

        The markup is fed to an htmllib.HTMLParser subclass whose output is
        written through a codec stream wrapped around an in-memory buffer;
        the buffer is then rewound and read back through the same stream.

        Adapted from http://www.bazza.com/~eaganj/weblog/2006/04/04/printing-html-as-text-in-python-with-unicode/

        @type  html: unicode
        @param html: The HTML to extract text from (eg. u"<html><body><h1>Hello</h1>...")
        @type  maxcol: int
        @param maxcol: The maxcol value passed to formatter.DumbWriter()
        @type  codec: str (passed to codecs.lookup())
        @param codec: The codec used for the output stream.

        @return: The text read back from the codec stream.
                 NOTE(review): the original documentation gave the return
                 type as str -- confirm what the stream reader yields."""

    class UnicodeHTMLParser(htmllib.HTMLParser):
        """HTMLParser variant whose entities and charrefs can yield unicode."""

        entitydefs = dict(
            (entity, unichr(codepoint))
            for entity, codepoint in htmlentitydefs.name2codepoint.items())

        def handle_charref(self, name):
            """Emit numeric references 0-255 as data; report all others as unknown."""
            try:
                code = int(name)
            except ValueError:
                # Non-numeric reference: treated like an out-of-range one.
                code = -1
            if code < 0 or code > 255:
                self.unknown_charref(name)
                return
            # 0-127 are emitted via chr(), 128-255 via unichr().
            self.handle_data(chr(code) if code <= 127 else unichr(code))

    raw = StringIO()
    _, _, reader_cls, writer_cls = codecs.lookup(codec)
    stream = codecs.StreamReaderWriter(raw, reader_cls, writer_cls, 'replace')
    renderer = UnicodeHTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(stream, maxcol)))
    renderer.feed(html)
    renderer.close()

    # Rewind and read everything that was written.
    stream.seek(0)
    result = stream.read()
    raw.close()
    stream.close()

    return result
Ejemplo n.º 30
0
def chapter1_25():
    """Render HTML read from standard input through an htmllib parser.

    A ``TtyFormatter`` is used when standard output is a terminal,
    otherwise a plain ``AbstractFormatter``; both wrap a default
    ``DumbWriter``.
    """
    # Build the writer, formatter and parser objects and wire them
    # together as needed.
    out = formatter.DumbWriter()
    formatter_cls = (TtyFormatter if sys.stdout.isatty()
                     else formatter.AbstractFormatter)
    html_parser = htmllib.HTMLParser(formatter_cls(out))
    # Hand everything from standard input to the parser.
    html_parser.feed(sys.stdin.read())
    html_parser.close()