def _GetAspellIndex(root=None): RAD = RemoteAspellDictionary response, content = urlcacheopen(httpjoin(root or RAD.ROOT,'0index.html'), decode=False) if not response.ok: print 'Unhandled HTTP response code: %r' % response return () soup = soupify(content) results = {} for row in soup.find('a', attrs=dict(name='0.50')).findAllNext('tr'): contents = row.findAll('td') if len(contents) == 4: id, name_english, name_native, dictionary_path = contents id = id.find(href=True) if id is None: continue id = id['href'] if id not in results: dictionary_path = dictionary_path.find(href=True) if dictionary_path is None: continue dictionary_path = dictionary_path['href'] name_english = name_english.renderContents(None).decode('xml') name_native = name_native.renderContents(None).decode('xml').strip() or None results[id] = RAD(id, name_english, name_native, id, dictionary_path) return results.values()
def _GetAspellIndex(root=None):
    # Scrapes the remote Aspell dictionary index (0index.html) into
    # RemoteAspellDictionary records keyed by the first href in each row.
    # Returns an empty tuple on HTTP failure, otherwise a list of records.
    RAD = RemoteAspellDictionary
    response, content = urlcacheopen(httpjoin(root or RAD.ROOT, '0index.html'), decode=False)
    if not response.ok:
        # Best effort: report the failure and return an empty sequence.
        print 'Unhandled HTTP response code: %r' % response
        return ()
    soup = soupify(content)
    results = {}
    # Rows of interest follow the anchor named "0.50" on the index page.
    for row in soup.find('a', attrs=dict(name='0.50')).findAllNext('tr'):
        contents = row.findAll('td')
        if len(contents) == 4:
            id, name_english, name_native, dictionary_path = contents
            # The id cell's first link href serves as the unique key.
            id = id.find(href=True)
            if id is None:
                continue
            id = id['href']
            if id not in results:
                dictionary_path = dictionary_path.find(href=True)
                if dictionary_path is None:
                    continue
                dictionary_path = dictionary_path['href']
                # presumably 'xml' is a project-registered codec that
                # unescapes XML entities -- TODO confirm
                name_english = name_english.renderContents(None).decode('xml')
                name_native = name_native.renderContents(None).decode(
                    'xml').strip() or None
                # NOTE(review): id is passed for two slots here -- verify
                # against RemoteAspellDictionary's constructor signature.
                results[id] = RAD(id, name_english, name_native, id,
                                  dictionary_path)
    return results.values()
def parse_html_slow(html):
    '''
    Uses Beautiful Soup to parse messages out of a log file.

    Returns a list of Message objects; divs that fail to parse are
    reported via print_exc() and skipped.
    '''
    # Tolerant decode: malformed byte sequences are dropped, not raised.
    html = html.decode('utf-8', 'ignore')
    soup = soupify(html, markupMassage=((br_re, lambda m: '<br />'),))
    messages = []
    # (fix: removed dead local "strptime = datetime.strptime" -- it was
    # assigned but never used anywhere in the function)
    for div in soup.findAll(message_divs):
        try:
            buddyname = div.findAll('span', class_buddy)[0].renderContents(None)
            timestamp = parse_timestamp(div['timestamp'])
            message = div.findAll('span', class_msgcontent)[0].renderContents(None)
            # e.g. class="message incoming" -> type "incoming"
            type = div['class'].replace('message', '').strip()
            auto = boolify(div.get('auto', 'false'))
        except Exception:
            # Skip malformed rows but keep going; traceback is printed.
            print_exc()
        else:
            messages.append(Message(buddy=S(name=buddyname),
                                    timestamp=timestamp,
                                    message=message,
                                    type=type,
                                    auto=auto))
    log_info('parse_html_slow with %d bytes returning %d messages', len(html), len(messages))
    return messages
def parse_html_slow(html):
    'Uses Beautiful Soup to parse messages out of a log file.'
    # Tolerant decode: malformed byte sequences are dropped, not raised.
    html = html.decode('utf-8', 'ignore')
    soup = soupify(html, markupMassage=((br_re, lambda m: '<br />'), ))
    messages = []
    strptime = datetime.strptime  # NOTE(review): unused -- leftover binding?
    for div in soup.findAll(message_divs):
        try:
            buddyname = div.findAll('span', class_buddy)[0].renderContents(None)
            timestamp = parse_timestamp(div['timestamp'])
            message = div.findAll('span', class_msgcontent)[0].renderContents(None)
            # e.g. class="message incoming" -> type "incoming"
            type = div['class'].replace('message', '').strip()
            auto = boolify(div.get('auto', 'false'))
        except Exception:
            # Skip malformed rows but keep going; traceback is printed.
            print_exc()
        else:
            messages.append(
                Message(buddy=S(name=buddyname),
                        timestamp=timestamp,
                        message=message,
                        type=type,
                        auto=auto))
    log_info('parse_html_slow with %d bytes returning %d messages',
             len(html), len(messages))
    return messages
def anchor2tuple(s):
    '''
    Our profile box takes tuples for links.

    Returns (u"http://www.google.com", u"http://www.google.com") for
    <a href="http://www.google.com">http://www.google.com</a>.
    '''
    if not s:
        return None

    anchor = soupify(s).a
    if anchor:
        return (anchor['href'], anchor.renderContents())
    return None
def strip(html, formatting=True, colors=True, plaintext_transforms=None): ''' Strips formatting and/or colors from a string. Returns (stripped_string, {stripped_values}]) ''' #LOG('before strip: %r', html) if plaintext_transforms is None: plaintext_transforms = {} # A dictionary of lists of things this function has stripped out. removed = defaultdict(list) try: soup = soupify(html, convertEntities='html') #LOG('strip: %r', html) if formatting: strip_formatting(soup, removed) # LOG('after stripping formatting: %r', soup.renderContents(None)) remove_attrs(soup, ['color', 'bgcolor'], removed, doremove=colors) if colors: remove_styles(soup, ['background', 'color'], removed) remove_attrs(soup, ['back'], removed) else: convert_back(soup, removed) #LOG('after colors: %r', soup.renderContents(None)) remove_tags(soup, 'html') remove_tags(soup, 'body') #LOG('after removing color: %r', soup.renderContents(None)) apply_plaintext_transforms(soup, plaintext_transforms) final = soup.renderContents(None) #LOG('after transformations: %r', final) return final, removed except Exception: # If any exceptions occur, just return the original string. print_exc() return html, removed
def strip(html, formatting=True, colors=True, plaintext_transforms=None):
    '''
    Strips formatting and/or colors from a string.

    Returns (stripped_string, {stripped_values}])
    '''
    plaintext_transforms = plaintext_transforms if plaintext_transforms is not None else {}

    # Everything stripped out is collected here, keyed by what it was.
    stripped = defaultdict(list)

    try:
        soup = soupify(html, convertEntities='html')

        if formatting:
            strip_formatting(soup, stripped)

        # color/bgcolor are always recorded; only removed when colors=True.
        remove_attrs(soup, ['color', 'bgcolor'], stripped, doremove=colors)

        if not colors:
            # Leave colors intact, but rewrite AIM's <font back=...> as CSS.
            convert_back(soup, stripped)
        else:
            remove_styles(soup, ['background', 'color'], stripped)
            remove_attrs(soup, ['back'], stripped)

        # Unwrap html/body wrappers, keeping their children.
        for tagname in ('html', 'body'):
            remove_tags(soup, tagname)

        apply_plaintext_transforms(soup, plaintext_transforms)

        return soup.renderContents(None), stripped
    except Exception:
        # On any failure, fall back to returning the untouched input.
        print_exc()
        return html, stripped
def convert_back(soup, removed):
    'AIM <font back="#ff0000">'
    # Rewrite every <font back=...> (and its optional color attribute)
    # as an equivalent <span style="..."> wrapper, recording the values
    # into the "removed" accumulator.
    for font in soup.findAll(name='font', back=True):
        removed['back'].append(font['back'])
        styles = ['background-color: %s' % font['back']]

        if 'color' in dict(font.attrs):
            removed['color'].append(font['color'])
            styles.append('color: %s' % font['color'])

        markup = ('<span style="%s">' % '; '.join(styles)) + font.renderContents(None) + '</span>'
        font.replaceWith(soupify(markup))
def convert_back(soup, removed): 'AIM <font back="#ff0000">' # Convert all <font back="#ff0000"> for tag in soup.findAll(name='font', back=True): removed['back'].append(tag['back']) styles = ['background-color: %s' % tag['back']] if 'color' in dict(tag.attrs): removed['color'].append(tag['color']) styles.append('color: %s' % tag['color']) tag.replaceWith( soupify(('<span style="%s">' % '; '.join(styles)) + tag.renderContents(None) + '</span>'))
def remove_tags(soup, *tags):
    '''Unwrap every tag named in *tags: the tag itself is replaced by its
    own rendered contents, so its children survive.'''
    # TODO: don't use soupify to reparse HTML a lot
    for match in soup.findAll(name=tags):
        inner = match.renderContents(None)
        match.replaceWith(soupify(inner))
def remove_tags(soup, *tags):
    # Unwrap every tag named in *tags: the tag is replaced by its own
    # rendered contents, so its children survive.
    for tag in soup.findAll(name=tags):
        # TODO: don't use soupify to reparse HTML a lot
        tag.replaceWith(soupify(tag.renderContents(None)))
def EditSource(self):
    'Brings up a simple editor with the HTML source of this window, available for editing.'
    import wx
    from util import soupify
    from wx.stc import StyledTextCtrl, STC_LEX_HTML
    # NOTE(review): 'font' is built but never applied -- SetFont below is
    # commented out.
    font = wx.Font(10, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL,
                   wx.FONTWEIGHT_NORMAL, False, "Consolas")
    f = wx.Frame(wx.GetTopLevelParent(self), -1, 'View Source',
                 name='View Source', size=(640, 480))
    s = wx.BoxSizer(wx.VERTICAL)
    t = StyledTextCtrl(f, -1, wx.DefaultPosition, wx.DefaultSize, wx.NO_BORDER)
    #t.SetLexer(STC_LEX_HTML)
    orightml = self.HTML
    # TODO: BeautifulSoup is more destructive than is useful here.
    html = soupify(orightml).prettify()
    t.SetText(html)
    #t.SetFont(font)
    # Clear the initial select-all that SetText leaves behind.
    wx.CallAfter(t.SetSelection, 0, 0)
    buttons = wx.Panel(f)
    save = wx.Button(buttons, -1, '&Save')
    # "Save" pushes the edited text back into this window.
    save.Bind(wx.EVT_BUTTON, lambda e: self.SetHTML(t.GetText()))
    save_as_file = wx.Button(buttons, -1, 'Save &As File...')

    def onsaveasfile(e):
        # Writes the ORIGINAL (unprettified) HTML, not the editor contents.
        # NOTE(review): diag is never Destroy()ed -- verify against the
        # file's other wx.Dialog usage.
        diag = wx.FileDialog(self, "Save HTML", "contents.html", style=wx.SAVE)
        if diag.ShowModal() == wx.ID_OK:
            with open(diag.GetPath(), 'wb') as f:
                f.write(orightml.encode('utf-8'))
    save_as_file.Bind(wx.EVT_BUTTON, onsaveasfile)
    copybutton = wx.Button(buttons, -1, _('&Copy'))

    def openinbrowser(e):
        # Dump the current editor text to a temp file and open it.
        from subprocess import Popen
        import os.path, tempfile
        fdesc, fname = tempfile.mkstemp()
        with os.fdopen(fdesc, 'w') as f:
            f.write(t.GetText().encode('utf-8'))
        if "wxMSW" in wx.PlatformInfo:
            # On Windows, launch the browser named by a debug pref.
            from common import pref
            from path import path
            browser_exe = pref('debug.message_area.debug_browser',
                               r'c:\Program Files\Safari\Safari.exe',
                               type=basestring)
            browser_exe = path(browser_exe).expand()
            if browser_exe.isfile():
                Popen([browser_exe, fname])
            else:
                wx.MessageBox('Error launching browser:\n\n'
                              '"%s"\n\n'
                              'Please set the "debug.message_area.debug_browser" pref to\n'
                              'the path to your web browser.'
                              % browser_exe, 'Open in Browser')
        else:
            # Elsewhere, defer to the system default browser.
            import webbrowser
            webbrowser.open_new("file://" + fname)
    openbutton = wx.Button(buttons, -1, _('&Open in Browser'))
    openbutton.Bind(wx.EVT_BUTTON, openinbrowser)
    openbutton.SetToolTipString(_('Launches browser in pref "debug.message_area.debug_browser"'))

    def docopy(e):
        # Copy the editor contents to the system clipboard.
        clip = wx.TheClipboard
        if clip.Open():
            clip.SetData(wx.TextDataObject(t.Value))
            clip.Close()
    copybutton.Bind(wx.EVT_BUTTON, docopy)
    buttons.Sizer = wx.BoxSizer(wx.HORIZONTAL)
    buttons.Sizer.AddMany([save, copybutton, openbutton, save_as_file])
    s.Add(t, 1, wx.EXPAND)
    s.Add(buttons, 0, wx.EXPAND)
    f.SetSizer(s)
    # remember position and cascade when necessary
    from gui.toolbox import persist_window_pos
    persist_window_pos(f)
    f.EnsureNotStacked()
    f.Show()