def _GetAspellIndex(root=None): RAD = RemoteAspellDictionary response, content = urlcacheopen(httpjoin(root or RAD.ROOT,'0index.html'), decode=False) if not response.ok: print 'Unhandled HTTP response code: %r' % response return () soup = soupify(content) results = {} for row in soup.find('a', attrs=dict(name='0.50')).findAllNext('tr'): contents = row.findAll('td') if len(contents) == 4: id, name_english, name_native, dictionary_path = contents id = id.find(href=True) if id is None: continue id = id['href'] if id not in results: dictionary_path = dictionary_path.find(href=True) if dictionary_path is None: continue dictionary_path = dictionary_path['href'] name_english = name_english.renderContents(None).decode('xml') name_native = name_native.renderContents(None).decode('xml').strip() or None results[id] = RAD(id, name_english, name_native, id, dictionary_path) return results.values()
def _GetAspellIndex(root=None):
    # Scrapes the remote Aspell dictionary index (0index.html) into
    # RemoteAspellDictionary records keyed by the first href in each row.
    # Returns an empty tuple on HTTP failure, otherwise a list of records.
    RAD = RemoteAspellDictionary
    response, content = urlcacheopen(httpjoin(root or RAD.ROOT, '0index.html'), decode=False)
    if not response.ok:
        # Best effort: report the failure and return an empty sequence.
        print 'Unhandled HTTP response code: %r' % response
        return ()
    soup = soupify(content)
    results = {}
    # Rows of interest follow the anchor named "0.50" on the index page.
    for row in soup.find('a', attrs=dict(name='0.50')).findAllNext('tr'):
        contents = row.findAll('td')
        if len(contents) == 4:
            id, name_english, name_native, dictionary_path = contents
            # The id cell's first link href serves as the unique key.
            id = id.find(href=True)
            if id is None:
                continue
            id = id['href']
            if id not in results:
                dictionary_path = dictionary_path.find(href=True)
                if dictionary_path is None:
                    continue
                dictionary_path = dictionary_path['href']
                # presumably 'xml' is a project-registered codec that
                # unescapes XML entities -- TODO confirm
                name_english = name_english.renderContents(None).decode('xml')
                name_native = name_native.renderContents(None).decode(
                    'xml').strip() or None
                # NOTE(review): id is passed for two slots here -- verify
                # against RemoteAspellDictionary's constructor signature.
                results[id] = RAD(id, name_english, name_native, id,
                                  dictionary_path)
    return results.values()
def parse_html_slow(html):
    '''
    Uses Beautiful Soup to parse messages out of a log file.

    Returns a list of Message objects; divs that fail to parse are
    reported via print_exc() and skipped.
    '''
    # Tolerant decode: malformed byte sequences are dropped, not raised.
    html = html.decode('utf-8', 'ignore')
    soup = soupify(html, markupMassage=((br_re, lambda m: '<br />'),))
    messages = []
    # (fix: removed dead local "strptime = datetime.strptime" -- it was
    # assigned but never used anywhere in the function)
    for div in soup.findAll(message_divs):
        try:
            buddyname = div.findAll('span', class_buddy)[0].renderContents(None)
            timestamp = parse_timestamp(div['timestamp'])
            message = div.findAll('span', class_msgcontent)[0].renderContents(None)
            # e.g. class="message incoming" -> type "incoming"
            type = div['class'].replace('message', '').strip()
            auto = boolify(div.get('auto', 'false'))
        except Exception:
            # Skip malformed rows but keep going; traceback is printed.
            print_exc()
        else:
            messages.append(Message(buddy=S(name=buddyname),
                                    timestamp=timestamp,
                                    message=message,
                                    type=type,
                                    auto=auto))
    log_info('parse_html_slow with %d bytes returning %d messages', len(html), len(messages))
    return messages
def parse_html_slow(html):
    'Uses Beautiful Soup to parse messages out of a log file.'
    # Tolerant decode: malformed byte sequences are dropped, not raised.
    html = html.decode('utf-8', 'ignore')
    soup = soupify(html, markupMassage=((br_re, lambda m: '<br />'), ))
    messages = []
    strptime = datetime.strptime  # NOTE(review): unused -- leftover binding?
    for div in soup.findAll(message_divs):
        try:
            buddyname = div.findAll('span', class_buddy)[0].renderContents(None)
            timestamp = parse_timestamp(div['timestamp'])
            message = div.findAll('span', class_msgcontent)[0].renderContents(None)
            # e.g. class="message incoming" -> type "incoming"
            type = div['class'].replace('message', '').strip()
            auto = boolify(div.get('auto', 'false'))
        except Exception:
            # Skip malformed rows but keep going; traceback is printed.
            print_exc()
        else:
            messages.append(
                Message(buddy=S(name=buddyname),
                        timestamp=timestamp,
                        message=message,
                        type=type,
                        auto=auto))
    log_info('parse_html_slow with %d bytes returning %d messages',
             len(html), len(messages))
    return messages
def anchor2tuple(s):
    '''
    Our profile box takes tuples for links.

    Returns (u"http://www.google.com", u"http://www.google.com") for
    <a href="http://www.google.com">http://www.google.com</a>.
    '''
    if not s:
        return None

    anchor = soupify(s).a
    if anchor:
        return (anchor['href'], anchor.renderContents())
    return None
def strip(html, formatting=True, colors=True, plaintext_transforms=None): ''' Strips formatting and/or colors from a string. Returns (stripped_string, {stripped_values}]) ''' #LOG('before strip: %r', html) if plaintext_transforms is None: plaintext_transforms = {} # A dictionary of lists of things this function has stripped out. removed = defaultdict(list) try: soup = soupify(html, convertEntities='html') #LOG('strip: %r', html) if formatting: strip_formatting(soup, removed) # LOG('after stripping formatting: %r', soup.renderContents(None)) remove_attrs(soup, ['color', 'bgcolor'], removed, doremove=colors) if colors: remove_styles(soup, ['background', 'color'], removed) remove_attrs(soup, ['back'], removed) else: convert_back(soup, removed) #LOG('after colors: %r', soup.renderContents(None)) remove_tags(soup, 'html') remove_tags(soup, 'body') #LOG('after removing color: %r', soup.renderContents(None)) apply_plaintext_transforms(soup, plaintext_transforms) final = soup.renderContents(None) #LOG('after transformations: %r', final) return final, removed except Exception: # If any exceptions occur, just return the original string. print_exc() return html, removed
def strip(html, formatting=True, colors=True, plaintext_transforms=None):
    '''
    Strips formatting and/or colors from a string.

    Returns (stripped_string, {stripped_values}])
    '''
    plaintext_transforms = plaintext_transforms if plaintext_transforms is not None else {}

    # Everything stripped out is collected here, keyed by what it was.
    stripped = defaultdict(list)

    try:
        soup = soupify(html, convertEntities='html')

        if formatting:
            strip_formatting(soup, stripped)

        # color/bgcolor are always recorded; only removed when colors=True.
        remove_attrs(soup, ['color', 'bgcolor'], stripped, doremove=colors)

        if not colors:
            # Leave colors intact, but rewrite AIM's <font back=...> as CSS.
            convert_back(soup, stripped)
        else:
            remove_styles(soup, ['background', 'color'], stripped)
            remove_attrs(soup, ['back'], stripped)

        # Unwrap html/body wrappers, keeping their children.
        for tagname in ('html', 'body'):
            remove_tags(soup, tagname)

        apply_plaintext_transforms(soup, plaintext_transforms)

        return soup.renderContents(None), stripped
    except Exception:
        # On any failure, fall back to returning the untouched input.
        print_exc()
        return html, stripped
def convert_back(soup, removed):
    'AIM <font back="#ff0000">'
    # Rewrite every <font back=...> (and its optional color attribute)
    # as an equivalent <span style="..."> wrapper, recording the values
    # into the "removed" accumulator.
    for font in soup.findAll(name='font', back=True):
        removed['back'].append(font['back'])
        styles = ['background-color: %s' % font['back']]

        if 'color' in dict(font.attrs):
            removed['color'].append(font['color'])
            styles.append('color: %s' % font['color'])

        markup = ('<span style="%s">' % '; '.join(styles)) + font.renderContents(None) + '</span>'
        font.replaceWith(soupify(markup))
def convert_back(soup, removed): 'AIM <font back="#ff0000">' # Convert all <font back="#ff0000"> for tag in soup.findAll(name='font', back=True): removed['back'].append(tag['back']) styles = ['background-color: %s' % tag['back']] if 'color' in dict(tag.attrs): removed['color'].append(tag['color']) styles.append('color: %s' % tag['color']) tag.replaceWith( soupify(('<span style="%s">' % '; '.join(styles)) + tag.renderContents(None) + '</span>'))
def remove_tags(soup, *tags):
    '''Unwrap every tag named in *tags: the tag itself is replaced by its
    own rendered contents, so its children survive.'''
    # TODO: don't use soupify to reparse HTML a lot
    for match in soup.findAll(name=tags):
        inner = match.renderContents(None)
        match.replaceWith(soupify(inner))
def remove_tags(soup, *tags):
    # Unwrap every tag named in *tags: the tag is replaced by its own
    # rendered contents, so its children survive.
    for tag in soup.findAll(name=tags):
        # TODO: don't use soupify to reparse HTML a lot
        tag.replaceWith(soupify(tag.renderContents(None)))
def EditSource(self):
    'Brings up a simple editor with the HTML source of this window, available for editing.'
    import wx
    from util import soupify
    from wx.stc import StyledTextCtrl, STC_LEX_HTML
    # NOTE(review): 'font' is built but never applied -- SetFont below is
    # commented out.
    font = wx.Font(10, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL,
                   wx.FONTWEIGHT_NORMAL, False, "Consolas")
    f = wx.Frame(wx.GetTopLevelParent(self), -1, 'View Source',
                 name='View Source', size=(640, 480))
    s = wx.BoxSizer(wx.VERTICAL)
    t = StyledTextCtrl(f, -1, wx.DefaultPosition, wx.DefaultSize, wx.NO_BORDER)
    #t.SetLexer(STC_LEX_HTML)
    orightml = self.HTML
    # TODO: BeautifulSoup is more destructive than is useful here.
    html = soupify(orightml).prettify()
    t.SetText(html)
    #t.SetFont(font)
    # Clear the initial select-all that SetText leaves behind.
    wx.CallAfter(t.SetSelection, 0, 0)
    buttons = wx.Panel(f)
    save = wx.Button(buttons, -1, '&Save')
    # "Save" pushes the edited text back into this window.
    save.Bind(wx.EVT_BUTTON, lambda e: self.SetHTML(t.GetText()))
    save_as_file = wx.Button(buttons, -1, 'Save &As File...')

    def onsaveasfile(e):
        # Writes the ORIGINAL (unprettified) HTML, not the editor contents.
        # NOTE(review): diag is never Destroy()ed -- verify against the
        # file's other wx.Dialog usage.
        diag = wx.FileDialog(self, "Save HTML", "contents.html", style=wx.SAVE)
        if diag.ShowModal() == wx.ID_OK:
            with open(diag.GetPath(), 'wb') as f:
                f.write(orightml.encode('utf-8'))
    save_as_file.Bind(wx.EVT_BUTTON, onsaveasfile)
    copybutton = wx.Button(buttons, -1, _('&Copy'))

    def openinbrowser(e):
        # Dump the current editor text to a temp file and open it.
        from subprocess import Popen
        import os.path, tempfile
        fdesc, fname = tempfile.mkstemp()
        with os.fdopen(fdesc, 'w') as f:
            f.write(t.GetText().encode('utf-8'))
        if "wxMSW" in wx.PlatformInfo:
            # On Windows, launch the browser named by a debug pref.
            from common import pref
            from path import path
            browser_exe = pref('debug.message_area.debug_browser',
                               r'c:\Program Files\Safari\Safari.exe',
                               type=basestring)
            browser_exe = path(browser_exe).expand()
            if browser_exe.isfile():
                Popen([browser_exe, fname])
            else:
                wx.MessageBox('Error launching browser:\n\n'
                              '"%s"\n\n'
                              'Please set the "debug.message_area.debug_browser" pref to\n'
                              'the path to your web browser.'
                              % browser_exe, 'Open in Browser')
        else:
            # Elsewhere, defer to the system default browser.
            import webbrowser
            webbrowser.open_new("file://" + fname)
    openbutton = wx.Button(buttons, -1, _('&Open in Browser'))
    openbutton.Bind(wx.EVT_BUTTON, openinbrowser)
    openbutton.SetToolTipString(_('Launches browser in pref "debug.message_area.debug_browser"'))

    def docopy(e):
        # Copy the editor contents to the system clipboard.
        clip = wx.TheClipboard
        if clip.Open():
            clip.SetData(wx.TextDataObject(t.Value))
            clip.Close()
    copybutton.Bind(wx.EVT_BUTTON, docopy)
    buttons.Sizer = wx.BoxSizer(wx.HORIZONTAL)
    buttons.Sizer.AddMany([save, copybutton, openbutton, save_as_file])
    s.Add(t, 1, wx.EXPAND)
    s.Add(buttons, 0, wx.EXPAND)
    f.SetSizer(s)
    # remember position and cascade when necessary
    from gui.toolbox import persist_window_pos
    persist_window_pos(f)
    f.EnsureNotStacked()
    f.Show()