def sanitizeHtml(value, base_url=None):
    value = value.replace('<div>', '').replace('</div>', '') \
                 .replace('<p>', '').replace('</p>', '') \
                 .replace('<span>', '').replace('</span>', '')
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'br i em strong ul ol li u b a h1 h2 h3 blockquote'.split()
    validAttrs = 'href'.split()
    urlAttrs = 'href src'.split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    # Get rid of comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.extract()
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')

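# --- Usage sketch (assumed, not part of the original module) -------------------
# sanitizeHtml() relies on re, urljoin, BeautifulSoup and Comment being available
# in its module; with BeautifulSoup 3 those imports would look like the ones
# below.  The sample markup and base_url are illustrative only.
import re
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup, Comment

if __name__ == '__main__':
    dirty = '<p>Hi <a href="javascript:alert(1)">link</a><script>evil()</script></p>'
    # The <p> wrapper is stripped, the javascript: scheme is removed from the
    # href, and the disallowed <script> tag is extracted entirely.
    print sanitizeHtml(dirty, base_url='http://example.com/')
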
def preview_html(self):
    html = self.text_to_html(self.body.get()).encode('utf-8')
    name = "html_" + time.strftime("%Y%m%d_%H%M%S", time.localtime()) + ".html"
    name = os.path.join(DEFDIR, "cache", name)
    soup = BeautifulSoup(html)
    imgs = soup.findAll('img')
    for img in imgs:
        if os.path.isfile(img["src"]):
            img["src"] = "file://localhost/" + img["src"]
    html = soup.prettify().replace("\n", "")
    try:
        fp = open(name, "wt")
        fp.write("<html>\n")
        fp.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n')
        fp.write("<body>\n")
        fp.write(html)
        fp.write("</body></html>")
        fp.close()
    except:
        note(LABELS.loc.pt_err_cant_gen_prvw, "error")
        return
    viewer = Content_handler(self.refresh)
    try:
        viewer.open(name)
    except:
        note(LABELS.loc.pt_err_cant_prvw, "error")

def remove_image(self, del_img):
    soup = BeautifulSoup(self.contents.encode('utf-8'))
    imgs = soup.findAll('img')
    for img in imgs:
        if img["src"] == del_img:
            img.extract()
    self.contents = utf8_to_unicode(soup.prettify().replace("\n", ""))

def _parse_tabela(self, html):
    soup = BeautifulSoup(html)
    linhas = soup.findAll(
        'tr',
        attrs={
            'onclick': re.compile(r"javascript:detalharCep\('\d+','\d+'\);")
        })
    return [self._parse_linha_tabela(linha) for linha in linhas]

def find_images(self):
    soup = BeautifulSoup(self.contents.encode('utf-8'))
    imgs = soup.findAll('img')
    self.images = []
    for img in imgs:
        try:
            self.images.append(img['src'])
        except:
            pass

def get_polls_all(fd, limit=None):
    """
    Scrape a file-like object and return a list in which each element
    represents information about a poll
    """
    soup = BeautifulSoup(fd)
    tables = soup.findAll("table", {"class": "wikitable sortable"})
    if len(tables) > 1:
        # TODO This can actually be handled checking for info inside each table
        raise Exception("Too many tables found")
    all_trs = tables[0].findAll("tr")
    tr_lines = all_trs[1:]
    # Find out the parties' names
    # All names are on the first line of the table
    # Search for span tags
    span_tags = all_trs[0].findAllNext("span")
    parties_names = [f.string for f in span_tags]
    all_polls = []
    # TODO Further asserts/verifies are needed to make sure we can use this table
    for poll in tr_lines:
        if limit and len(all_polls) >= limit:
            _log.debug("Stopped parsing. Already parsed until limit = %s" % limit)
            break
        cells = poll.findAll("td")
        if len(cells) != 9:
            _log.info("Stopping parsing. Line does not have 9 columns. We need 9 columns to parse stats.")
            break
        cells_t = [_clean_data(c.string) if c.string is not None else None for c in cells]
        a_tag = cells[1].find('a')
        href = dict(a_tag.attrs)["href"]
        institute = a_tag.string
        current_poll_data = {
            # We actually handle this OK, but clients will probably have problems
            "date": _clean_data(cells_t[0]),
            "source": {
                "href": href,
                "name": institute,
            },
            "parties": {}
        }
        current_poll_data["parties"].update(
            (party, cells_t[n]) for party, n in izip(parties_names, range(2, 8)))
        all_polls.append(current_poll_data)
        _log.info("Parsed polls for %s" % cells_t[0])
    return all_polls

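# --- Usage sketch (assumed, not part of the original module) -------------------
# get_polls_all() accepts any file-like object holding the wiki page HTML, so a
# previously saved copy of the page works as well as a live download.  The
# filename below is illustrative only.
if __name__ == '__main__':
    fd = open("polls_page.html")
    for poll in get_polls_all(fd, limit=5):
        print poll["date"], poll["source"]["name"], poll["parties"]
    fd.close()
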
def new_post(self, title, contents, categories, tags, publish, offline_idx=-1):
    """ Upload a new post, where:
        - title: post title, in unicode
        - contents: post contents, in unicode
        - categories: array of category names in unicode
        - tags: array of tag names in unicode
        - publish: draft post (False, not published) or final post (True, published)
        - offline_idx: post index, if it is offline

        Return the new post ID (success) or -1 (error)
    """
    app.title = LABELS.loc.wp_info_upld_post_cont
    soup = BeautifulSoup(unicode_to_utf8(contents))
    for img in soup.findAll('img'):
        if os.path.isfile(img['src']):  # just upload local files
            url = self.upload_images(img['src'])
            if url is not None:
                img['src'] = url
    contents = soup.prettify().replace("\n", " ")
    app.title = LABELS.loc.wp_info_upld_post_cont
    post = wp.WordPressPost()
    post.description = contents + unicode_to_utf8(LABELS.loc.promo_phrase)
    post.title = unicode_to_utf8(title)
    post.categories = [self.categoryName2Id(c)[0] for c in categories]
    post.keywords = ",".join([unicode_to_utf8(t) for t in tags])
    post.allowComments = True
    try:
        npost = self.blog.newPost(post, publish)
    except:
        note(LABELS.loc.wp_err_cant_pub_post, "error")
        npost = -1
    if npost >= 0:
        app.title = LABELS.loc.wp_info_updt_post_list
        try:
            p = self.blog.getLastPostTitle()
            # indicate that the corresponding offline post now has a remote copy
            if offline_idx >= 0:
                self.posts[offline_idx] = p
            else:
                self.posts.insert(0, p)
        except:
            note(LABELS.loc.wp_err_cant_updt_post_list, "error")
        self.save()
    return npost

def _parse_detalhe(self, html):
    soup = BeautifulSoup(html.decode('ISO-8859-1'))
    value_cells = soup.findAll('td', attrs={'class': 'value'})
    values = [cell.firstText(text=True) for cell in value_cells]
    localidade, uf = values[2].split('/')
    values_dict = {
        'Logradouro': values[0],
        'Bairro': values[1],
        'Localidade': localidade,
        'UF': uf,
        'CEP': values[3]
    }
    return values_dict

def edit_post(self, title, contents, categories, tags, post_idx, publish):
    """ Update a post. Return True or False, indicating whether the updating
        operation was successfully completed or not
    """
    # when a local post is edited it does not have a postid; in such a case we
    # need to create a new post instead of updating an existing one
    if self.post_is_only_local(post_idx):
        np_id = self.new_post(title, contents, categories, tags, publish, post_idx)
        return (np_id >= 0)
    app.title = LABELS.loc.wp_info_upld_post_cont
    soup = BeautifulSoup(unicode_to_utf8(contents))
    for img in soup.findAll('img'):
        if os.path.isfile(img['src']):  # just upload local files
            url = self.upload_images(img['src'])
            if url is not None:
                img['src'] = url
    contents = soup.prettify().replace("\n", " ")
    app.title = LABELS.loc.wp_info_upld_post_cont
    post = wp.WordPressPost()
    post.id = self.posts[post_idx]['postid']
    post.title = unicode_to_utf8(title)
    post.description = contents
    post.categories = [self.categoryName2Id(c)[0] for c in categories]
    post.keywords = ",".join([unicode_to_utf8(t) for t in tags])
    post.allowComments = True
    post.permaLink = self.posts[post_idx]['permaLink']
    post.textMore = self.posts[post_idx]['mt_text_more']
    post.excerpt = self.posts[post_idx]['mt_excerpt']
    try:
        npost = self.blog.editPost(post.id, post, publish)
    except:
        note(LABELS.loc.wp_err_cant_updt_the_post, "error")
        return False
    else:
        app.title = LABELS.loc.wp_info_updt_post_list
        try:
            upd_post = self.blog.getPost(post.id)
        except:
            note(LABELS.loc.wp_err_cant_updt_post_list, "error")
        else:
            self.posts[post_idx] = upd_post
        self.save()
    return True

def upgrade(self):
    self.lock_ui(LABELS.loc.wm_info_check_updt)
    url = "http://code.google.com/p/wordmobi/wiki/LatestVersion"
    local_file = "web_" + time.strftime("%Y%m%d_%H%M%S", time.localtime()) + ".html"
    local_file = os.path.join(DEFDIR, "cache", local_file)
    try:
        urlprx = UrllibProxy(BLOG.get_proxy())
        urlprx.urlretrieve(url, local_file)
    except:
        note(LABELS.loc.wm_err_upd_page % url, "error")
        ok = False
    else:
        ok = True
    if ok:
        html = open(local_file).read()
        soup = BeautifulSoup(html)
        addrs = soup.findAll('a')
        version = ""
        file_url = ""
        file_url_py19 = ""
        for addr in addrs:
            if addr.contents[0] == "latest_wordmobi_version":
                version = addr["href"]
            elif addr.contents[0] == "wordmobi_sis_url":
                file_url = addr["href"]
            elif addr.contents[0] == "wordmobi_sis_url_py19":
                file_url_py19 = addr["href"]
        if version and file_url and file_url_py19:
            version = version[version.rfind("/") + 1:]
            num_rem_ver = self.ver2num(version)
            num_loc_ver = self.ver2num(VERSION)
            if (num_loc_ver >= num_rem_ver) and (VERSION.find('RC') == -1):
                # RC versions leave the upgrading decision to the user
                note(LABELS.loc.wm_info_ver_is_updt, "info")
            else:
                yn = popup_menu([LABELS.loc.gm_yes, LABELS.loc.gm_no],
                                LABELS.loc.wm_pmenu_download % (version))
                if yn is not None:
                    if yn == 0:
                        if float(e32.pys60_version[:3]) >= 1.9:
                            furl = file_url_py19
                        else:
                            furl = file_url
                        sis_name = furl[furl.rfind("/") + 1:]
                        local_file = os.path.join(DEFDIR, "updates", sis_name)
                        self.set_title(LABELS.loc.wm_info_downloading)
                        try:
                            urlprx = UrllibProxy(BLOG.get_proxy())
                            urlprx.urlretrieve(furl, local_file)
                        except:
                            note(LABELS.loc.wm_err_downld_fail % sis_name, "error")
                        else:
                            msg = LABELS.loc.wm_info_downld_ok % (sis_name, DEFDIR)
                            note(msg, "info")
        else:
            note(LABELS.loc.wm_err_upd_info, "error")
    self.set_title(u"Wordmobi")
    self.unlock_ui()
    self.refresh()

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

url = "https://scrapebook22.appspot.com/"
response = urlopen(url).read()
soup = BeautifulSoup(response)

print soup.html.head.title.string

# Collect each linked profile's name, email and city into list.csv
csv_file = open("list.csv", "w")
for link in soup.findAll("a"):
    if link.string == "See full profile":
        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)
        email = person_soup.find("span", attrs={"class": "email"}).string
        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        city = person_soup.find("span", attrs={"data-city": True}).string
        print name + "," + email + "," + city
        csv_file.write(name + "," + email + "," + city + "\n")
csv_file.close()

autologin = '******'
login = '******'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'user_email': username,
                               'user_password': password,
                               'autologin': autologin,
                               'submit': login})
log = opener.open('http://www.autosport.com/subs/login.php', login_data)

gallery = opener.open(url)
gal_id = url.split('/')[-1]

html_gal = open('debug.html', 'wb')
html_gal.write(gallery.read())
html_gal.close()

soup = BeautifulSoup(open('debug.html', 'r').read())
links = soup.findAll('div', 'boxes')
descriptions = soup.findAll('img', 'black')
images = []
titles = []

dir_name = soup.find('option', value=str(gal_id))
gal_dir = string.replace(dir_name.contents[0], ' ', '_') + '_' + gal_id

for link in links:
    hrefs = link.findAll('a')
    images.append(hrefs[-1]['href'].split('/dir')[1])

for description in descriptions:
    title = description['onmouseover']
    titles.append(title.split("return overlib('")[1].split("');")[0])

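# --- Possible continuation (a sketch, assumed and not part of the original) ----
# One way the collected relative paths could be used: fetch each image through
# the authenticated opener and save it into gal_dir.  The base URL prefix below
# is an assumption for illustration; os is assumed to be importable alongside
# the other modules this snippet already relies on.
import os

if not os.path.isdir(gal_dir):
    os.makedirs(gal_dir)
for image in images:
    img_data = opener.open('http://www.autosport.com/dir' + image).read()
    out = open(os.path.join(gal_dir, image.split('/')[-1]), 'wb')
    out.write(img_data)
    out.close()
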
def get_polls_all(fd, limit=None):
    """
    Scrape a file-like object and return a list in which each element
    represents information about a poll
    """
    soup = BeautifulSoup(fd)
    tables = soup.findAll("table", {"class": "wikitable sortable"})
    if len(tables) > 1:
        # TODO This can actually be handled checking for info inside each table
        raise Exception("Too many tables found")
    all_trs = tables[0].findAll("tr")
    tr_lines = all_trs[1:]
    # Find out the parties' names
    # All names are on the first line of the table
    # Search for span tags
    span_tags = all_trs[0].findAllNext("span")
    parties_names = [f.string for f in span_tags]
    all_polls = []
    # TODO Further asserts/verifies are needed to make sure we can use this table
    for poll in tr_lines:
        if limit and len(all_polls) >= limit:
            _log.debug("Stopped parsing. Already parsed until limit = %s" % limit)
            break
        cells = poll.findAll("td")
        if len(cells) != 9:
            _log.info(
                "Stopping parsing. Line does not have 9 columns. We need 9 columns to parse stats."
            )
            break
        cells_t = [
            _clean_data(c.string) if c.string is not None else None
            for c in cells
        ]
        a_tag = cells[1].find('a')
        href = dict(a_tag.attrs)["href"]
        institute = a_tag.string
        current_poll_data = {
            # We actually handle this OK, but clients will probably have problems
            "date": _clean_data(cells_t[0]),
            "source": {
                "href": href,
                "name": institute,
            },
            "parties": {}
        }
        current_poll_data["parties"].update(
            (party, cells_t[n]) for party, n in izip(parties_names, range(2, 8)))
        all_polls.append(current_poll_data)
        _log.info("Parsed polls for %s" % cells_t[0])
    return all_polls

autologin = '******'
login = '******'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'user_email': username,
                               'user_password': password,
                               'autologin': autologin,
                               'submit': login})
log = opener.open('http://www.autosport.com/subs/login.php', login_data)

gallery = opener.open(url)
gal_id = url.split('/')[-1]

html_gal = open('debug.html', 'wb')
html_gal.write(gallery.read())
html_gal.close()

soup = BeautifulSoup(open('debug.html', 'r').read())
items = soup.findAll('ul', id='mycarousel')
descriptions = soup.findAll('img', {'class': re.compile(r'\bthumbnail\b')})
images = []
titles = []

dir_name = soup.find('h1')
gal_dir = string.replace(dir_name.contents[0], ' ', '_') + '_' + gal_id

for item in items:
    links = item.findAll('a')
    for link in links:
        images.append(link['href'])

def _parse_tabela(self, html):
    soup = BeautifulSoup(html)
    linhas = soup.findAll('tr', attrs={
        'onclick': re.compile(r"javascript:detalharCep\('\d+','\d+'\);")
    })
    return [self._parse_linha_tabela(linha) for linha in linhas]