def outputData(self, outfile):
    """Write the glyph paths held in ``self._soup`` to an SVG 1.1 file.

    Every ``<char>`` element of ``self._soup`` becomes a ``<path>`` whose
    ``d`` attribute is copied from the source element and whose ``style``
    is ``self.style``.  The finished tree is passed to
    ``self.scaleAndGridAlphabet`` before it is serialised.

    outfile -- path of the file to write (created or overwritten).
    """
    outSoup = BeautifulStoneSoup("", selfClosingTags=["path"])
    outRoot = Tag(outSoup, "svg")
    outRoot["xmlns"] = "http://www.w3.org/2000/svg"
    outRoot["width"] = self.width
    outRoot["height"] = self.height
    outRoot["version"] = 1.1
    outSoup.insert(0, outRoot)

    # Inserting at index 0 while walking the chars backwards keeps the
    # paths in the same order as the source <char> elements.
    for char in reversed(self._soup.findAll("char")):
        path = Tag(outSoup, "path")
        path["d"] = char["d"]
        path["style"] = self.style
        outRoot.insert(0, path)

    svg_header = (
        "<?xml version=\"1.0\" standalone=\"no\"?>\n"
        "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\""
        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n"
    )
    self.scaleAndGridAlphabet(outSoup)

    # Render first so a serialisation error cannot leave a truncated file,
    # and let ``with`` close the handle even when the write fails.
    document = svg_header + outSoup.prettify()
    with open(outfile, "w") as handle:
        handle.write(document)
def _prepare_soup_put_assign(self):
    """Build the PUT document describing changes to the assigned staff.

    The document starts with an ``<ID>`` element holding ``self.id``,
    followed by one self-closing ``<add id="..."/>`` per id present in
    ``self.assigned`` but not in ``self.xml_object.assigned`` and one
    self-closing ``<remove id="..."/>`` per id for the opposite case.

    Returns the soup obtained from ``self._prepare_soup_root_tag()``.
    Raises ValueError when building the ``<ID>`` element hits an
    AttributeError.
    """
    soup, root_tag = self._prepare_soup_root_tag()
    try:
        id_tag = Tag(soup, 'ID')
        id_tag.insert(0, NavigableString(self.id))
        root_tag.insert(0, id_tag)
    except AttributeError:
        raise ValueError("You must have ID for PUT.")

    previous_ids = [member.id for member in self.xml_object.assigned]
    current_ids = [member.id for member in self.assigned]
    changes = (
        ('add', [sid for sid in current_ids if sid not in previous_ids]),
        ('remove', [sid for sid in previous_ids if sid not in current_ids]),
    )

    # Index 0 is the <ID> element; the change tags follow it in order.
    position = 1
    for tag_name, staff_ids in changes:
        for staff_id in staff_ids:
            change_tag = Tag(soup, tag_name, [('id', '%s' % staff_id)])
            change_tag.isSelfClosing = True
            root_tag.insert(position, change_tag)
            position += 1
    return soup
def sanitize_html(value):
    """Reduce editor HTML to the small tag set accepted by Vodafone Live.

    Comments are removed, every attribute is stripped, tags outside the
    allowed list are hidden (their contents are kept), ``em``/``strong``
    are translated to ``i``/``b``, the result is run through
    ``tidy_fragment`` and a handful of typographic symbols are replaced by
    plain ASCII equivalents.

    value -- HTML text, or None.
    Returns the cleaned fragment; ``""`` when *value* is None.
    """
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor does new-ish
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),      # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),       # when loading them as I's the editor leaves them
        ("b", "b"),       # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        # Compare by value: identity of equal strings is an interpreter detail.
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    # Entity spellings are plain ASCII, so they are safe on byte and
    # unicode strings alike.
    entity_replacements = (
        ("&nbsp;", " "),
        ("&rsquo;", "'"),
        ("&lsquo;", "'"),
        ("&quot;", '"'),
        ("&ldquo;", '"'),
        ("&rdquo;", '"'),
        ("&bull;", "- "),
        ("&eacute;", "e"),
        ("&Eacute;", "E"),
        ("&ndash;", "-"),
    )
    for entity, plain in entity_replacements:
        fragment = fragment.replace(entity, plain)

    # The same symbols as literal characters.  Only applied to unicode
    # text, because mixing a unicode needle with non-ASCII bytes would raise.
    if isinstance(fragment, unicode):
        character_replacements = (
            (u"\u00a0", u" "),   # no-break space
            (u"\u2019", u"'"),   # right single quote
            (u"\u2018", u"'"),   # left single quote
            (u"\u201c", u'"'),   # left double quote
            (u"\u201d", u'"'),   # right double quote
            (u"\u2022", u"- "),  # bullet
            (u"\u00e9", u"e"),   # e acute
            (u"\u00c9", u"E"),   # E acute
            (u"\u2013", u"-"),   # en dash
        )
        for character, plain in character_replacements:
            fragment = fragment.replace(character, plain)
    return fragment
def addEpisode(xbmcid, scraperid, snr, enr, title, airdate):
    """Add an episode to the XML database if it is not stored yet.

    The series is looked up by its ``scraperid`` attribute; a missing
    ``<season>`` element is created on the fly.  The database is rewritten
    only when a new ``<episode>`` was added.

    xbmcid    -- unused by this function.
    scraperid -- value of the ``scraperid`` attribute of the series.
    snr, enr  -- season and episode numbers used as attribute values.
    title, airdate -- text stored in the new ``<title>`` / ``<airdate>``.

    Returns False when the series is unknown or the season could not be
    created, True otherwise.
    """
    f = getDatabase("r")
    try:
        soup = BeautifulSoup(f.read())
    finally:
        # Close the read handle even if reading or parsing fails.
        f.close()

    serie = soup.find(scraperid=scraperid)
    # TODO check inconsistency
    if serie is None:
        return False

    season = serie.find(seasonnr=snr)
    if season is None:
        tag = Tag(soup, "season")
        tag.attrs.append(('seasonnr', snr))
        serie.append(tag)
        # Look the season up again to confirm it is now reachable.
        season = serie.find(seasonnr=snr)
        if season is None:
            util.msg(localize(50000), localize(50004))
            return False

    episode = season.find(episodenr=enr)
    if episode is None:
        episodetag = Tag(soup, "episode")
        episodetag.attrs.append(('episodenr', enr))
        titletag = Tag(soup, "title")
        titletag.insert(0, title)
        episodetag.append(titletag)
        airdatetag = Tag(soup, "airdate")
        airdatetag.insert(0, airdate)
        episodetag.append(airdatetag)
        season.append(episodetag)

        # Render before opening for writing so that a rendering error
        # cannot leave an emptied database behind.
        document = soup.prettify()
        f = getDatabase("w")
        try:
            f.write(document)
        finally:
            f.close()
    # else: check consistency
    return True
def do_iperimage(value):
    """Detect iPernity static image URLs and turn them into clickable thumbnails.

    Each ``<img>`` whose ``src`` matches the iPernity pattern is replaced by
    a lightbox link to the 560px rendition wrapping a 500px thumbnail.
    Images without a ``title`` attribute are left untouched.

    value -- HTML text.
    Returns the processed document as unicode.
    """
    soup = BeautifulSoup(value)
    iprl = re.compile(r"^(http://\w+\.ipernity\.com/\d+/\d+/\d+/\d+\.\w+\.)(75|100|240|500|560)(\.jpg)$")
    iprl_thumb = "500"
    iprl_zoom = "560"
    for img in soup.findAll("img", src=iprl):
        match = iprl.match(img["src"])
        title = img.get("title")
        # Skip only the cases the code can actually hit (no match, no
        # title) instead of swallowing every exception.
        if match is None or title is None:
            continue
        prefix, suffix = match.group(1), match.group(3)

        thumb = Tag(soup, "img")
        thumb["alt"] = title
        thumb["src"] = prefix + iprl_thumb + suffix

        link = Tag(soup, "a")
        link["href"] = prefix + iprl_zoom + suffix
        link["rel"] = "lightbox"
        link["title"] = title
        link.insert(0, thumb)
        img.replaceWith(link)
    return unicode(soup)
def htmlizeTree(tree, base):
    """Convert a bookmark-style tree into nested ``DD``/``DL``/``DT`` tags.

    tree -- iterable of mappings.  A branch with an ``"href"`` key becomes
            an ``<A>`` carrying ``href``, ``add_date`` and ``icon``; any
            other branch becomes an ``<H3>``.  ``"name"`` supplies the
            text and an optional ``"tree"`` key holds child branches.
    base -- soup object the new tags are created for.

    Returns a ``<DD>`` tag wrapping a ``<DL>`` with one ``<DT>`` per branch,
    each sub-tree following its branch.
    """
    from BeautifulSoup import Tag, NavigableString

    elements = []
    for branch in tree:
        if "href" in branch:
            el = Tag(base, "A")
            for attrib in ("href", "add_date", "icon"):
                el[attrib] = branch[attrib]
        else:
            el = Tag(base, "H3")

        try:
            el.insert(0, NavigableString(branch["name"]))
        except Exception:
            # Best effort: keep the entry with a placeholder label.
            el.insert(0, NavigableString("[can not convert]"))
            print("can not convert  %s" % (branch["name"],))

        dt = Tag(base, "DT")
        dt.insert(0, el)
        elements.append(dt)
        if "tree" in branch:
            elements.append(htmlizeTree(branch["tree"], base))

    dl = Tag(base, "DL")
    for i, element in enumerate(elements):
        dl.insert(i, element)
    dd = Tag(base, "DD")
    dd.insert(0, dl)
    return dd
def geo_term_extract(self, desc):
    """Geocode *desc* through the GeoNames search API.

    One populated place in Estonia is requested (``maxRows=1``,
    ``country=EE``, ``featureClass=P``).  When a result comes back, a small
    random offset is added to both coordinates.

    desc -- unicode search text.
    Returns ``[point_xml, lat, lng]`` where *point_xml* is a
    ``<Point><coordinates>lng,lat</coordinates></Point>`` string and
    *lat*/*lng* are strings (``'0'`` when nothing was found).
    """
    values = {
        'maxRows': '1',
        'fuzzy': '1',
        'country': 'EE',
        'featureClass': 'P',
        'operator': 'OR',
        'username': self.geonames_user,
        'q': desc.encode('utf-8'),
    }
    data = urllib.urlencode(values)
    link = u"http://api.geonames.org/search"
    xmldata = urllib.urlopen(link, data)
    try:
        soup = BeautifulSoup(xmldata)
    finally:
        # Release the connection once the response has been parsed.
        xmldata.close()

    lng = '0'
    lat = '0'
    lat_nodes = soup.findAll("lat")
    # NOTE(review): the jitter is assumed to apply only when a result was
    # found; the nesting was reconstructed from a collapsed source line.
    if lat_nodes:
        lng = soup.findAll("lng")[0].text
        lat = lat_nodes[0].text
        lat_f = float(lat)
        lng_f = float(lng)
        lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
        lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)

    soup2 = BeautifulSoup()
    tag1 = Tag(soup2, "Point")
    tag2 = Tag(soup2, "coordinates")
    soup2.insert(0, tag1)
    tag1.insert(0, tag2)
    tag2.insert(0, NavigableString(lng + "," + lat))
    result = str(soup2).encode("utf-8")
    return [result, lat, lng]
def _tag_generator(soup, name, attrs=None, contents=None):
    """Create a ``Tag`` called *name* for *soup*.

    attrs    -- optional attributes passed to ``Tag``; None or ``[]`` means
                the tag is created without an attrs argument.
    contents -- optional first child inserted at position 0.

    Returns the new tag.
    """
    # A None default avoids sharing one list object between calls.
    if attrs is None:
        attrs = []
    if attrs != []:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents is not None:
        new_tag.insert(0, contents)
    return new_tag
def code_colorizer(entry):
    """Colorize the code blocks of a blog entry with pygments.

    BeautifulSoup is used to find every ``<div class="code">`` in *entry*;
    the ``id`` attribute of that div names the language (e.g. ``python``).
    The code is taken from a nested div when there is one, otherwise from
    the wrapper itself, and is replaced by the highlighted markup.

    Doing this in a filter means the stored post is never modified; the
    price is that every post has to be searched for code blocks.

    Returns *entry* unchanged when colorizing is disabled, when the
    required libraries are missing, when parsing fails or when there is no
    code block; otherwise the re-rendered markup.  Blocks without an ``id``
    or with a language pygments does not know are left untouched.
    """
    if not settings.COLORIZE_CODE:
        return entry

    try:
        from BeautifulSoup import BeautifulSoup, Tag
        from pygments import highlight
        from pygments.lexers import get_lexer_by_name
        from pygments.formatters import HtmlFormatter
        from pygments.util import ClassNotFound
    except ImportError:
        return entry

    try:
        parser = BeautifulSoup(entry, convertEntities=BeautifulSoup.ALL_ENTITIES)
    except HTMLParser.HTMLParseError:
        return entry

    # searching for code blocks in the blog entry
    code_blocks = parser.findAll("div", attrs={"class": "code"})
    if not code_blocks:
        return entry

    # One formatter serves every block.
    formatter = HtmlFormatter(linenos="table", style="tango", cssclass="code")
    for block in code_blocks:
        # if the code block's wrapper div doesn't have an id
        # attribute don't colorize the code
        language = block.get("id")
        if language is None:
            continue
        # Resolve the lexer before touching the tree so an unknown
        # language leaves the block as it was.
        try:
            lexer = get_lexer_by_name(language)
        except ClassNotFound:
            continue

        # finding the exact place of the code
        inner = block.div
        layer = inner if inner else block
        # removing any html tags inside the code block
        for tag in layer.findAll():
            tag.extract()
        # getting the original code in the block
        code = "".join(layer.contents)

        # When the wrapper itself is replaced, its id/class are recreated.
        if inner:
            colorized_code = Tag(parser, "div")
        else:
            colorized_code = Tag(parser, "div",
                                 attrs=(("id", language), ("class", "code")))
        colorized_code.insert(0, highlight(code, lexer, formatter))
        layer.replaceWith(colorized_code)
    return parser.renderContents()
def main(bot, args):
    '''Reply to a listener.
    Parameters: <user_id> <message>
    If an exclamation mark is given as user_id, the message will look like an announcement.'''
    # Behaviour visible in the code:
    #  * "?" followed by a user id adds that id to bot.blacklist.
    #  * a user id is 12 characters; its last four must equal the first
    #    four hex digits of md5(first eight characters + salt).
    #  * the reply is added to the HTML file named by
    #    bot.settings["ans_file"]; the last <p> is dropped once more than
    #    nine were present.
    salt = bot.settings["ans_salt"]
    message_limit = 250
    userpost = ""
    if len(args) < 2:
        return

    blacklisting = False
    if args[0] != "!":
        if args[0] == "?":
            blacklisting = True
            del args[0]
        if len(args[0]) != 12:
            return _("incorrect name entered, should be 12 symbols.")
        check = md5()
        check.update(args[0][:8].encode('utf-8') + salt)
        if check.hexdigest()[:4] != args[0][8:12]:
            return _("incorrect name entered (checksum invalid).")
        if blacklisting:
            bot.blacklist.append(args[0])
            return _("%s was added to blacklist.") % args[0]
        to = ">>" + args[0]
        if args[0] in bot.usersposts:
            userpost = "<span class=\"userpost\">> " + escape(bot.usersposts[args[0]]) + "</span><br/>"
    else:
        to = "!"

    message = " ".join(args[1:])
    if len(message) > message_limit:
        return _("too long answer, should be less than %d symbols, you entered %d symbols.") % (message_limit, len(message))

    # ``with`` closes the answer file even when parsing fails.
    with open(bot.settings["ans_file"], "r") as source:
        soup = BeautifulSoup(source)
    posts = soup.findAll('p')

    new_post = Tag(soup, 'p')
    user_id = Tag(soup, 'span', [('id', 'user_id')])
    if to != "!":
        user_id.insert(0, escape(to))
    else:
        user_id.insert(0, "<b>>>ОБЪЯВЛЕНИЕ<<</b>")
    new_post.insert(0, '[' + datetime.datetime.now().strftime("%H:%M:%S") + ']')
    new_post.insert(1, user_id)
    new_post.insert(2, userpost + escape(message))

    if len(posts) > 0:
        posts[0].parent.insert(2, new_post)
    else:
        soup.find('h1').parent.insert(1, new_post)
    if len(posts) > 9:
        posts[-1].extract()

    # Render before truncating the file so an error keeps the old content.
    document = soup.prettify()
    with open(bot.settings["ans_file"], "w") as target:
        target.write(document)
    return _("sent.")
def neighborhood_kml(request, neighborhood):
    """Render the KML template for the neighborhood called *neighborhood*.

    The KML of the neighborhood geometry gets an ``<extrude>1</extrude>``
    element as the first child of its polygon element; the resulting
    markup is passed to the template as ``xml``.
    """
    neighborhood = Neighborhood.objects.get(name=neighborhood)
    soup = BeautifulSoup(neighborhood.geom.kml)

    extrude = Tag(soup, "extrude")
    soup.polygon.insert(0, extrude)
    extrude.insert(0, "1")

    context = {'neighborhood': neighborhood, "xml": str(soup)}
    return render_to_response("restaurants/kml_template.html", context,
                              context_instance=RequestContext(request))
def replaceJavascript(base_url, soup):
    """Inline every external script of *soup*.

    Each ``<script src="...">`` is replaced by a ``<script>`` element
    holding the content fetched from the source resolved against
    *base_url*.  Scripts that cannot be loaded are reported on stdout and
    left in place.
    """
    for js in soup.findAll('script', {'src': re.compile('.+')}):
        try:
            real_js = get_content(resolve_path(base_url, js['src']))
            # A literal "</" inside the inlined code could end the
            # surrounding <script> element early; write the "<" as the
            # JavaScript escape \u003c.  The backslash is required: without
            # it the script text itself gets corrupted.
            real_js = real_js.replace('</', '\\u003c/')
            js_tag = Tag(soup, 'script')
            js_tag.insert(0, NavigableString(real_js))
            js.replaceWith(js_tag)
        except Exception as e:
            print('failed to load javascript from %s' % js['src'])
            print(e)
def build_flat_xml_object(name, fields):
    """Serialise *fields* as child elements of a single ``<name>`` element.

    name   -- tag name of the root element.
    fields -- iterable of ``(tag_name, value)`` pairs; each value is
              HTML-escaped before it is stored.

    Every child is inserted at position 0, so the children appear in the
    reverse of the order given.  Returns the document as unicode.
    """
    from BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag
    from django.utils.html import escape

    document = BeautifulStoneSoup()
    root = Tag(document, name)
    document.insert(0, root)
    for field_name, field_value in fields:
        child = Tag(document, field_name)
        child.insert(0, NavigableString(escape(field_value)))
        root.insert(0, child)
    return unicode(document)
def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Putting the "name" attribute into the "id" attribute of the <tr> tag.
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    For example, this html:
      <table>
        <tr><td colspan="2"><h2><a name="pub-attribs"></a>
        Data Fields List</h2></td></tr>
        ...
      </table>

    would be converted to this:
      <h2>Data Fields List</h2>
      <table>
        ...
      </table>
    '''
    headings = []
    for row in self.soup.findAll('tr'):
        cell = row.td
        if cell and cell.h2 and cell.h2.a and cell.h2.a['name']:
            #row['id'] = cell.h2.a['name']
            # Turn the whole row into the heading element.
            row.string = cell.h2.a.next
            row.name = 'h2'
            headings.append(row)

    # Walk the headings last to first so that earlier tags don't delete
    # later tags.
    for heading in reversed(headings):
        print("Header tag: %s is %s" % (heading.name, heading.string.strip()))

        # Is this a heading in the middle of a table?
        if heading.findPreviousSibling('tr') and heading.parent.name == 'table':
            print("Splitting Table named %s" % heading.string.strip())
            old_table = heading.parent
            container = old_table.parent
            old_position = container.contents.index(old_table)
            new_table = Tag(self.soup, name='table', attrs=old_table.attrs)
            container.insert(old_position + 1, new_table)

            # Move the heading and everything after it into the new table.
            first_moved = old_table.contents.index(heading)
            for offset, node in enumerate(old_table.contents[first_moved:]):
                new_table.insert(offset, node)

        # Now move the <h2> tag to be in front of the <table> tag
        assert heading.parent.name == 'table'
        owner = heading.parent
        container = owner.parent
        container.insert(container.contents.index(owner), heading)
def parse_content(self, content, attachments, tags):
    """Rewrite attachment references in *content* to imported media files.

    content     -- markup of the note.
    attachments -- iterable of mappings with ``'hash'`` and ``'filename'``.
    tags        -- unused by this method.

    ``<img>`` elements whose ``src`` is ``?hash=<hash>`` are replaced by an
    ``<img>`` pointing at the name returned by ``self.import_file``.  PDF
    ``<embed>`` elements are replaced by a ``<span>`` holding one ``<img>``
    per image returned by ``pdf2image``.  Returns the markup as unicode.
    """
    soup = BeautifulSoup(content)
    pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

    def attachment_filename(node):
        # Filename of the attachment whose hash *node* references, else None.
        found = pattern.search(str(node))
        if not found:
            return None
        filehash = found.group(1)
        return next((l['filename'] for l in attachments
                     if l['hash'] == filehash), None)

    # images
    for match in soup.findAll('img'):
        filename = attachment_filename(match)
        if filename is not None:
            importedname = self.import_file(filename)
            match.replaceWith(Tag(soup, 'img', [('src', importedname)]))

    # pdfs
    for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):
        filename = attachment_filename(match)
        if filename is not None:
            # convert pdf -> image, then import each image
            imageTags = Tag(soup, "span")
            # enumerate() keeps the page order even when two entries
            # compare equal, and avoids a list search per image.
            for position, image in enumerate(pdf2image(filename)):
                importedname = self.import_file(image)
                imageTags.insert(position,
                                 Tag(soup, 'img', [('src', importedname)]))
            # replace embed with <img src...> for each image
            match.replaceWith(imageTags)

    # TODO: audio
    # TODO: video
    # plugins
    # TODO: qa-format as in Supermemo
    #for match in soup.find(string=re.compile("A:")):
    #    match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']
    return str(soup).decode('utf-8')
def do_inlines(value, render_content=False):
    """
    Processes inlines for a string of text (such as a blog post).

    If render_content is True, returns the full HTML with every
    ``<inline>`` element replaced by a ``<div>`` holding the rendered
    template.  If ``process_inline`` returns something that has no
    ``"inline_object_details"`` entry, that value is returned as-is.

    If render_content is false, returns a list of inline objects.
    """
    from BeautifulSoup import BeautifulStoneSoup, Tag

    # Parse the entry content, passing BeautifulStoneSoup our inline tag
    # (plus the regular HTML ones) as self-closing.
    content = BeautifulStoneSoup(value, selfClosingTags=['br' , 'hr', 'input', 'img', 'meta','spacer', 'link', 'frame', 'base','inline'])

    if render_content == True:
        for inline in content.findAll('inline'):
            # Get the html from the template for each inline element
            html_dict = process_inline(inline, html=True)
            try:
                inline_object_details = html_dict["inline_object_details"]
            except Exception:
                # process_inline did not hand back the expected mapping;
                # pass its result on to the caller unchanged.
                return html_dict
            inline_html = html_dict["inline_html"]

            # Replace the inline tag with a div carrying the rendered html.
            inline_tag = Tag(content, "div", [("id", inline_object_details['html_id']),("class", inline_object_details['html_class'])])
            inline_tag.insert(0, inline_html)
            inline.replaceWith(inline_tag)
        # Render out the final HTML for the blog post.
        return content.renderContents()

    # Otherwise collect the inline objects themselves.
    inline_objects = []
    for inline in content.findAll('inline'):
        processed_inline = process_inline(inline, html=False)
        inline_objects.append(processed_inline['object'])
    return inline_objects
def markdown_pygment(txt, linenos="table", stripimg=False):
    """
    Convert Markdown text to Pygmentized HTML

    Images are either replaced by a notice (``stripimg``) or wrapped in a
    ``<div class="image-wrapper">``.  ``<pre><code>`` blocks whose first
    line is ``pygments:<lexer>`` are highlighted when the lexer name is in
    ``_lexer_names``.  The markdown output is returned untouched unless
    one of those rewrites happened.
    """
    html = markdown(txt, safe_mode='replace')
    soup = BeautifulSoup(html)
    formatter = HtmlFormatter(cssclass='source', linenos=linenos)
    changed = False

    for image in soup.findAll('img'):
        changed = True
        if stripimg:
            image.replaceWith("[IMAGES NOT ALLOWED IN COMMENTS]")
            continue

        # learn BeautifulSoup and clean this up
        image['class'] = 'postimg'
        wrapped, holder = image, image.parent
        if holder.name == 'a':
            # This is a link, wrap the div around the parent of the link
            wrapped, holder = holder, holder.parent
        wrapper = Tag(soup, "div", [("class", "image-wrapper")])
        wrapper.insert(0, wrapped)
        holder.insert(0, wrapper)

        # Remove surrounding <p></p> to make HTML valid
        # This is all rather horrible but works with my blog
        # posts - need to fix so that the code is correct in all cases
        # easiest way may be to modify markdown itself
        paragraph = wrapper.parent
        if paragraph.name == 'p':
            paragraph.replaceWith(wrapper)

    for pre in soup.findAll('pre'):
        if not pre.code:
            continue
        source = pre.code.renderContents()
        if not source.startswith('pygments:'):
            continue
        header, source = source.split('\n', 1)
        lexer_name = header.split(':')[1]
        source = _replace_html_entities(source)
        if lexer_name in _lexer_names:
            lexer = get_lexer_by_name(lexer_name, stripnl=True, encoding='UTF-8')
            pre.replaceWith(highlight(source, lexer, formatter))
            changed = True

    return unicode(soup) if changed else html
def userlist(request):
    """Return the members of the 'Курсанты' group as a small XML document.

    The response body is a ``<root>`` element with one ``<user>`` child per
    member carrying ``uid``, ``username``, ``first_name`` and ``last_name``
    attributes.  Children are inserted at the front, each followed by a
    newline.
    """
    document = BeautifulSoup()
    root = Tag(document, 'root')
    document.insert(0, root)

    members = models.Group.objects.get(name='Курсанты').user_set.all()
    for member in members:
        attributes = [
            ('uid', str(member.id)),
            ('username', member.username),
            ('first_name', member.first_name),
            ('last_name', member.last_name),
        ]
        root.insert(0, '\n')
        root.insert(0, Tag(document, 'user', attributes))
    return HttpResponse(document)
def anchorArticles(txt):
    """Give every text node that starts with "Article <number>" an anchor.

    The matched prefix is moved into a named ``<a>``; a ``#`` link to that
    anchor is placed in front of it and both are wrapped in a
    ``<span class="hover">`` inserted before the remaining text.
    Returns the document as a string.
    """
    article_re = re.compile(r'^\s*Article\s+[0-9][0-9.,]*', re.I)
    nsoup = BeautifulSoup(txt)

    current = nsoup.find(text=article_re)
    while current:
        position = current.parent.contents.index(current)
        heading = str(article_re.match(current).group())
        anchor_name = heading.replace(' ', '_')

        # the named anchor holding the matched text
        anchor = Tag(nsoup, 'a', [('name', anchor_name)])
        anchor.insert(0, heading)

        # the link that is displayed when the anchor is hovered
        jump = Tag(nsoup, 'a', [('class', "anchorLink"),
                                ('href', '#' + anchor_name)])
        jump.insert(0, "#")

        # container: the link ends up first, the anchor second
        hover = Tag(nsoup, 'span', [('class', 'hover')])
        hover.insert(0, anchor)
        hover.insert(0, jump)
        current.parent.insert(position, hover)

        # keep only the text that followed the matched prefix
        remainder = NavigableString(current[len(heading):])
        current.replaceWith(remainder)
        current = remainder.findNext(text=article_re)
    return str(nsoup)
def cache_text_to_html(self):
    """Render ``self.text`` from Markdown to purged HTML.

    A ``<span class="iremember">`` holding ``self.i_remember()`` plus a
    trailing space is inserted at the start of the first element.
    Returns the decoded document.
    """
    from markdown import markdown
    from BeautifulSoup import BeautifulSoup, Tag, NavigableString
    from mnemosyne.utils import purge_html

    soup = purge_html(BeautifulSoup(markdown(self.text)))

    prefix = Tag(soup, "span")
    prefix.insert(0, NavigableString(self.i_remember() + " "))
    prefix["class"] = "iremember"

    first_element = soup.first()
    first_element.insert(0, prefix)
    return soup.decode()
def collage2table(self, data, u=False):
    """
    Takes an html page generated from collage in the oshmail format and
    converts some divs to a table layout. The collage builds a system
    of nested divs for rows and columns. What we need is a table with
    one row, with two columns: 1.1 and 1.2.

    data -- HTML of the page; its first two ``collage-column`` elements
            become the two table cells.
    u    -- return unicode when true, a byte string otherwise.
    """
    soup = BeautifulSoup(data)

    # find the real content cells
    cell_11, cell_12 = soup.findAll(attrs={"class": "collage-column"}, limit=2)

    # create a table with a single row
    table = Tag(soup, "table", [("id", "collage-table")])
    row1 = Tag(soup, "tr")
    col1 = Tag(soup, "td", [("valign", "top"), ("id", "collage-table-cell1"), ("width", "590")])
    col2 = Tag(soup, "td", [("valign", "top"), ("id", "collage-table-cell2"), ("width", "200")])

    col1.insert(0, cell_11)
    col2.insert(0, cell_12)
    row1.insert(0, col1)
    row1.insert(1, col2)
    table.insert(0, row1)

    if u:
        return unicode(table)
    return str(table)
def highlightedNode(self, target_node, yes_phrase, parent_soup):
    """Wrap *target_node* in a highlighted ``<div>``, bolding *yes_phrase*.

    target_node -- node whose string form is highlighted.
    yes_phrase  -- phrase to embolden; matched case-insensitively.
    parent_soup -- soup the new tags are created for.

    Returns the new ``<div>``.  When the phrase is empty or does not occur
    the content is added without a ``<b>`` element.
    """
    content = str(target_node)
    tag = Tag(parent_soup, "div", [("style", "background-color:#FF8A0D")])

    # Search only when there is a phrase; lower-case both sides so the
    # comparison really is case-insensitive.
    start = content.lower().find(yes_phrase.lower()) if yes_phrase else -1
    if start < 0:
        # Slicing with -1 would cut the content in the wrong place.
        tag.append(content)
        return tag

    end = start + len(yes_phrase)
    tag.append(content[:start])
    bold = Tag(parent_soup, "b")
    bold.insert(0, content[start:end])
    tag.append(bold)
    tag.append(content[end:])
    return tag
def ConvertToTestHtml(quest):
    """Render a test question and its answers as prettified HTML.

    The output starts with a ``<p>`` holding the question text, then an
    ``<image>`` element when the question has an image, then a ``<p>``
    with one ``<input>`` plus ``<br>`` per answer: radio buttons unless
    ``quest.type`` is 1, text inputs otherwise.
    """
    types = quest.type
    titles = quest.text
    quests_ids = [quest.id]
    answers = RETestAnswer.objects.filter(question__id__in=quests_ids)

    newbs = BeautifulSoup()
    pNode = Tag(newbs, 'p')
    newbs.insert(0, pNode)
    if quest.img:
        print('Image!!!')
        print(quest.img.url)
        imageNode = Tag(newbs, 'image', [('src', quest.img.url)])
        newbs.insert(0, imageNode)
    TitleNode = Tag(newbs, 'p')
    TitleNode.string = titles
    newbs.insert(0, TitleNode)

    if types != 1:
        for index, answer in enumerate(answers):
            # One id per answer.  It is emitted as "id" because a second
            # "type" attribute produces invalid markup.
            radioname = 'ans' + str(index)
            nt = Tag(newbs, 'input', [('type', 'radio'),
                                      ('id', radioname),
                                      ('name', 'answerradio'),
                                      ('value', str(answer.is_correct))])
            nt.string = answer.name
            pNode.insert(len(pNode.contents), nt)
            pNode.insert(len(pNode.contents), Tag(newbs, 'br'))
    else:
        for answer in answers:
            nt = Tag(newbs, 'input', [('type', 'text'),
                                      ('name', 'answertext'),
                                      ('ans', answer.name)])
            pNode.insert(len(pNode.contents), nt)
            pNode.insert(len(pNode.contents), Tag(newbs, 'br'))
    return newbs.prettify()
def delete(self):
    """POST a delete request for this client to ``self.delete_url``.

    The request body is ``<Client><ID>{id}</ID></Client>``.

    Raises ValueError when the object has no usable integer ``id`` and
    ResponseStatusError when the response status is ``error``.
    """
    try:
        id_text = '%d' % self.id
    except (AttributeError, TypeError):
        # A missing id raises AttributeError, an id of None raises
        # TypeError; both mean there is nothing to delete.
        raise ValueError("You must have id for delete operation.")

    soup = BeautifulSoup()
    client_tag = Tag(soup, 'Client')
    soup.insert(0, client_tag)
    id_tag = Tag(soup, 'ID')
    id_tag.insert(0, NavigableString(id_text))
    client_tag.insert(0, id_tag)

    response = rest_client.Client("").POST(self.delete_url, str(soup))
    soup = BeautifulStoneSoup(response.content)
    if soup.status and soup.status.contents[0].lower() == 'error':
        raise ResponseStatusError(soup.errordescription.contents[0])
def sanitize_story(self, story_content):
    """Replace the iframes of a story with static content.

    An iframe for which ``self.extract_youtube_id`` returns an id becomes
    a link to the original URL wrapping a thumbnail image; every other
    iframe is removed.  Returns the story as unicode.
    """
    soup = BeautifulSoup(story_content.strip())
    fqdn = Site.objects.get_current().domain

    for frame in soup("iframe"):
        source_url = dict(frame.attrs).get('src', "")
        video_id = self.extract_youtube_id(source_url)
        if not video_id:
            frame.extract()
            continue

        thumbnail_style = "display: block; 'background-image': \"url(https://%s/img/reader/youtube_play.png), url(http://img.youtube.com/vi/%s/0.jpg)\"" % (fqdn, video_id)
        thumbnail = Tag(soup, 'img', [
            ('style', thumbnail_style),
            ('src', 'http://img.youtube.com/vi/%s/0.jpg' % video_id),
        ])
        anchor = Tag(soup, 'a', [('href', source_url)])
        anchor.insert(0, thumbnail)
        frame.replaceWith(anchor)
    return unicode(soup)
def AllCategories(request):
    """Render the top-level categories as prettified ``<li>`` markup.

    Each item contains four ``<div>`` cells (name, category type, 3D flag,
    rating) and is completed by ``recurseCategories`` for its children.
    The rating comes from the user's most recent log of type 5, if any.
    """
    print('allcat')
    x = BeautifulSoup()
    #root = Tag(x,'ul', [('class', "tree"), ( 'id', "tree")])
    #x.insert(0,root)
    top_level = RECategory.objects.filter(parent__isnull=True).order_by('-number')

    # Only the most recent RELog per category and log type is kept: the
    # query is ordered newest first and later duplicates are ignored.
    AllAnswered = {}
    for entry in RELog.objects.filter(user=request.user).order_by('-date'):
        per_type = AllAnswered.setdefault(entry.category_id, {})
        per_type.setdefault(entry.type_log, entry)

    for category in top_level:
        print(category.id)
        item = Tag(x, 'li', [("id", str(category.id))])

        per_type = AllAnswered.get(category.id)
        latest = per_type.get(5) if per_type else None
        rating = ('Оценка: ' + str(latest.rating)) if latest else ''

        # Cells in their final order inside the <li>.
        cells = (
            (None, category.name),
            ("demo", str(category.type_category)),
            ("is3d", "Есть" if category.is_3d else "Нет"),
            ("rating", rating),
        )
        for position, (css_class, label) in enumerate(cells):
            cell = Tag(x, 'div')
            cell.string = label
            if css_class is not None:
                cell["class"] = css_class
            item.insert(position, cell)

        x.insert(0, item)
        recurseCategories(category, item, x, AllAnswered)

    res = x.prettify()
    #print res
    print('endallcat')
    return res
def add_noindex_to_a(text):
    """Wrap links to other hosts in ``<noindex>`` and mark them nofollow.

    A link is rewritten when the host part of its ``href`` differs from
    the host of ``settings.SITE_URL`` and it is not already inside a
    ``<noindex>`` element.  Anchors without a usable ``href`` are left
    untouched.  Returns the document as unicode.
    """
    doc = BeautifulSoup(text)
    host_orig = urlparse(settings.SITE_URL)[1]
    for a in doc.findAll('a'):
        try:
            host = urlparse(a['href'])[1]
        except Exception:
            # No href, or one that cannot be parsed: nothing to decide.
            # Skipping also keeps a host from an earlier link from being
            # reused for this one.
            continue
        if host == host_orig or a.findParent('noindex') is not None:
            continue
        noindex = Tag(doc, "noindex")
        a.replaceWith(noindex)
        a['rel'] = 'nofollow'
        noindex.insert(0, a)
    return unicode(doc)
def replaceTag(target, targettag, tag=None):
    """Replace the elements of *target* selected by *targettag*.

    target    -- soup or tag that is searched.
    targettag -- sequence ``(name,)`` or ``(name, attrs)`` passed to
                 ``findAll``.
    tag       -- None: replace each element by its contents;
                 ``'newline'``: by its contents preceded by a newline;
                 anything else: by a new tag of that name holding them.

    The contents are passed through ``charReplacements`` first.
    """
    if len(targettag) > 1:
        elems = target.findAll(targettag[0], targettag[1])
    else:
        elems = target.findAll(targettag[0])

    # One soup is enough to create all replacement tags.
    soup = BeautifulStoneSoup()
    for element in elems:
        contents = charReplacements(element.renderContents())
        if tag is None:
            element.replaceWith(contents)
        elif tag == 'newline':
            # Compared by value; ``is`` only worked through string interning.
            element.replaceWith('\n%s' % contents)
        else:
            t = Tag(soup, tag)
            t.insert(0, contents)
            element.replaceWith(t)
def __replaceCss(self, baseUrl, soup):
    """Replace every stylesheet ``<link>`` of *soup* by a ``<style>`` tag.

    Nothing happens when ``self.__css`` is ``DO_NTH``.  With ``INLINE`` the
    stylesheet is fetched and passed through
    ``self.__inlineExternalResourcesInCss``; otherwise the new tag only
    holds a comment naming the stylesheet URL.  A ``media`` attribute is
    carried over.  Failures are logged and the link is left in place.
    """
    if self.__css == DO_NTH:
        return

    for css in soup.findAll('link', {'rel': 'stylesheet', 'href': re.compile('.+')}):
        try:
            cssUrl = baseUrl.resolve(css['href'])
            if self.__css == INLINE:
                data = self.__contentResolver.getContent(cssUrl, False)[0]
                cssContent = self.__inlineExternalResourcesInCss(cssUrl, data)
            else:
                cssContent = u"<!--" + str(cssUrl) + u"-->"

            newStyleTag = Tag(soup, "style")
            newStyleTag.insert(0, MyNavigableString(cssContent))
            if css.get('media'):
                newStyleTag['media'] = css['media']
            css.replaceWith(newStyleTag)
        except Exception:
            # Exception, not BaseException: KeyboardInterrupt and
            # SystemExit must still be able to stop the run.
            self.__logger.exception(u'failed to load css from %s' % css['href'])
def __replaceJavascript(self, baseUrl, soup):
    """Inline or remove the ``<script>`` elements of *soup*.

    Nothing happens when ``self.__js`` is ``DO_NTH``.  With ``INLINE`` a
    script that has a ``src`` is replaced by a ``<script>`` holding the
    fetched content; with ``REMOVE`` every script is replaced by one that
    only holds a comment naming its source.  Any other case leaves the
    script untouched.  Failures are logged.
    """
    if self.__js == DO_NTH:
        return

    for js in soup.findAll('script'):
        src = js.get("src") or None
        try:
            if src and self.__js == INLINE:
                # NOTE(review): __replaceCss indexes the result of
                # getContent(...) with [0]; confirm whether the same is
                # needed here.
                jsContent = self.__contentResolver.getContent(baseUrl.resolve(src), False)
            elif self.__js == REMOVE:
                u = str(baseUrl.resolve(src)) if src else "inlined"
                jsContent = "<!--" + u + "-->"
            else:
                # nothing to change
                continue

            newScriptTag = Tag(soup, "script")
            newScriptTag.insert(0, MyNavigableString(jsContent))
            js.replaceWith(newScriptTag)
        except Exception:
            # Exception, not BaseException: KeyboardInterrupt and
            # SystemExit must still be able to stop the run.
            self.__logger.error(u'failed to load javascript from %s' % unicode(src))
def sanitize_story(self, story_content):
    """Swap embedded iframes for static markup.

    When ``self.extract_youtube_id`` recognises the iframe URL, the iframe
    is replaced by an ``<a>`` to that URL containing a thumbnail ``<img>``;
    all other iframes are dropped.  The result is returned as unicode.
    """
    soup = BeautifulSoup(story_content.strip())
    fqdn = Site.objects.get_current().domain

    def youtube_link(url, youtube_id):
        # Build <a href=url><img .../></a> for one recognised video.
        style = "display: block; 'background-image': \"url(https://%s/img/reader/youtube_play.png), url(http://img.youtube.com/vi/%s/0.jpg)\"" % (fqdn, youtube_id)
        src = 'http://img.youtube.com/vi/%s/0.jpg' % youtube_id
        link = Tag(soup, 'a', [('href', url)])
        link.insert(0, Tag(soup, 'img', [('style', style), ('src', src)]))
        return link

    for iframe in soup("iframe"):
        url = dict(iframe.attrs).get('src', "")
        youtube_id = self.extract_youtube_id(url)
        if youtube_id:
            iframe.replaceWith(youtube_link(url, youtube_id))
        else:
            iframe.extract()
    return unicode(soup)
def geo_term_extract(self, desc):
    """Geocode *desc* through the GeoNames search API.

    One populated place in Estonia is requested (``maxRows=1``,
    ``country=EE``, ``featureClass=P``).  When a result comes back, a small
    random offset is added to both coordinates.

    desc -- unicode search text.
    Returns ``[point_xml, lat, lng]`` where *point_xml* is a
    ``<Point><coordinates>lng,lat</coordinates></Point>`` string and
    *lat*/*lng* are strings (``'0'`` when nothing was found).
    """
    values = {
        'maxRows': '1',
        'fuzzy': '1',
        'country': 'EE',
        'featureClass': 'P',
        'operator': 'OR',
        'username': self.geonames_user,
        'q': desc.encode('utf-8'),
    }
    data = urllib.urlencode(values)
    link = u"http://api.geonames.org/search"
    xmldata = urllib.urlopen(link, data)
    try:
        soup = BeautifulSoup(xmldata)
    finally:
        # Release the connection once the response has been parsed.
        xmldata.close()

    lng = '0'
    lat = '0'
    lat_nodes = soup.findAll("lat")
    # NOTE(review): the jitter is assumed to apply only when a result was
    # found; the nesting was reconstructed from a collapsed source line.
    if lat_nodes:
        lng = soup.findAll("lng")[0].text
        lat = lat_nodes[0].text
        lat_f = float(lat)
        lng_f = float(lng)
        lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
        lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)

    soup2 = BeautifulSoup()
    tag1 = Tag(soup2, "Point")
    tag2 = Tag(soup2, "coordinates")
    soup2.insert(0, tag1)
    tag1.insert(0, tag2)
    tag2.insert(0, NavigableString(lng + "," + lat))
    result = str(soup2).encode("utf-8")
    return [result, lat, lng]
def recurseCategories(parentCat, root, x, AllAnswered):
    """Append the child categories of *parentCat* below *root*, recursively.

    parentCat   -- category whose ``children()`` are rendered.
    root        -- tag receiving a hidden ``<ul>`` with one ``<li>`` per child.
    x           -- soup the new tags are created for.
    AllAnswered -- mapping category id -> {log type -> log}; the log of
                   type 5 supplies the rating text.
    """
    children = parentCat.children()
    if children:
        child_list = Tag(x, 'ul', [('style', 'display:none')])
        root.insert(len(root.contents), child_list)
        root = child_list

    for child in children:
        root.insert(len(root.contents), '\n')
        item = Tag(x, "li", [("id", str(child.id))])

        rating = ''
        answered = AllAnswered.get(child.id)
        if answered:
            graded = answered.get(5)
            if graded:
                rating = 'Оценка: ' + str(graded.rating)

        # Every cell goes to position 0, so the last one added (the name)
        # ends up first inside the <li>.
        for css_class, label in (
                ("rating", rating),
                ("is3d", "Есть" if child.is_3d else "Нет"),
                ("demo", str(child.type_category)),
                (None, child.name)):
            cell = Tag(x, 'div')
            cell.string = label
            if css_class is not None:
                cell["class"] = css_class
            item.insert(0, cell)

        root.insert(len(root.contents), item)
        recurseCategories(child, item, x, AllAnswered)
def main(bot, args):
    '''Reply to a listener.
    Parameters: <user_id> <message>
    If an exclamation mark is given as user_id, the message will look like an announcement.
    If the @ character (or " in the Russian keyboard layout) is given as user_id, the identifier of the last post is used. Use WITH CAUTION!
    ? user_id - blacklist the user user_id; their messages stop reaching the DJ console.
    ?? - show the blacklist.
    ?! - clear the blacklist.'''
    # Behaviour visible in the code:
    #  * a two-digit number between 10 and 99 is looked up in bot.num2uid.
    #  * a user id is 12 characters; its last four must equal the first
    #    four hex digits of md5(first eight characters + salt).
    #  * in the message, [url] becomes a link and {url} a linked image.
    #  * the reply is added to the HTML file named by
    #    bot.settings["ans_file"]; the last <p> is dropped once more than
    #    nine were present.
    salt = bot.settings["ans_salt"]
    message_limit = 250
    userpost = ""
    if not len(args) or (len(args) == 1 and args[0] != "??" and args[0] != "?!"):
        return

    blacklisting = False
    if args[0] != "!":
        if args[0] == "??":
            return _("blacklist:\n%s") % "\n".join(bot.blacklist)
        if args[0] == "?!":
            bot.blacklist = []
            return _("blacklist cleared.")
        if args[0] == "?":
            blacklisting = True
            del args[0]

        if args[0] == "@" or args[0] == '"':
            sender = bot.last_user_id
        elif args[0].isdigit() and int(args[0]) >= 10 and int(args[0]) < 100:
            sender = bot.num2uid[int(args[0])]
        else:
            sender = args[0]

        if len(sender) != 12:
            return _("incorrect name entered, should be 12 symbols.")
        check = md5()
        check.update(sender[:8].encode('utf-8') + salt)
        if check.hexdigest()[:4] != sender[8:12]:
            return _("incorrect name entered (checksum invalid).")
        if blacklisting:
            bot.blacklist.append(sender)
            return _("%s was added to blacklist.") % sender
        to = ">>" + sender
        if sender in bot.usersposts:
            userpost = "<span class=\"userpost\">> " + escape(bot.usersposts[sender]) + "</span><br/>"
    else:
        to = "!"

    message = " ".join(args[1:])
    if len(message) > message_limit:
        return _("too long answer, should be less than %d symbols, you entered %d symbols.") % (message_limit, len(message))

    # ``with`` closes the answer file even when parsing fails.
    with open(bot.settings["ans_file"], "r") as source:
        soup = BeautifulSoup(source)
    posts = soup.findAll('p')

    new_post = Tag(soup, 'p')
    user_id = Tag(soup, 'span', [('id', 'user_id')])
    if to != "!":
        user_id.insert(0, escape(to))
    else:
        user_id.insert(0, "<b>>>ОБЪЯВЛЕНИЕ<<</b>")
    new_post.insert(0, '<span class="timestamp">[' + datetime.datetime.now().strftime("%H:%M:%S") + ']</span>')
    new_post.insert(1, user_id)

    # NOTE(review): replace("&", "&") is a no-op as written; it may have
    # been meant to undo the "&amp;" produced by escape() - confirm.
    message = re.sub(r'\[([^]]*)\]', lambda x: '<a href="' + x.group(1).replace("&", "&") + '" target="_blank">' + x.group(1) + '</a>', escape(message))
    message = re.sub(r'\{([^}]*)\}', lambda x: '<a href="' + x.group(1).replace("&", "&") + '" target="_blank"><img style="max-width: 200px; max-height: 200px;display: inline;" src="' + x.group(1).replace("&", "&") + '"/></a>', message)
    new_post.insert(2, userpost + message)

    if len(posts) > 0:
        posts[0].parent.insert(2, new_post)
    else:
        soup.find('h1').parent.insert(1, new_post)
    if len(posts) > 9:
        posts[-1].extract()

    # Render before truncating the file so an error keeps the old content.
    document = soup.prettify()
    with open(bot.settings["ans_file"], "w") as target:
        target.write(document)
    return _("sent.")
def main():
    """Create an XML database containing a word from the GNT, its PROIEL ID
    and other PROIEL info.

    Reads "aligned-gospels.wds" (tab-separated: form, PROIEL id, PROIEL
    form) and "proiel-GNT.xml", and writes one <word> element per alignment
    line to "gospels-database.xml", wrapped in a <div>.
    """
    with codecs.open("aligned-gospels.wds", "rU", "utf-8") as aligned:
        with codecs.open("proiel-GNT.xml", "rU", "utf-8") as xml:
            print("Parsing the PROIEL XML with BeautifulStoneSoup...")
            print("")
            proiel = BeautifulStoneSoup(xml)

        # Dictionary keyed by PROIEL IDs to speed up searching.
        tok_dict = dict((token['id'], token)
                        for token in proiel.findAll('token'))

        # IDs that mark words without a usable PROIEL token.
        unaligned_ids = ("000000", "999999", "111111")

        with open("gospels-database.xml", "w") as output:
            output.write("<div>\n")
            output.write("<title>Gospels</title>\n")

            count = 100001
            soup = BeautifulStoneSoup()

            print("Iterating through the alignment file...")
            print("")
            for line in aligned:
                stuff = line.split("\t")
                word = Tag(soup, "word")
                word.insert(0, NavigableString(stuff[0]))
                # Make the IDs count up from 000000, not 100000: the
                # leading "1" of the counter is swapped for a "0".
                word['id'] = str(count).replace("1", "0", 1)
                word['proiel-id'] = stuff[1]

                # Add attributes from the PROIEL XML.
                if stuff[1] not in unaligned_ids:
                    token = tok_dict[stuff[1]]
                    morph = token['morph-features'].split(",")
                    word['lemma'] = morph[0]
                    word['proiel-pos'] = morph[1]
                    word['lang'] = morph[2]
                    word['morph'] = morph[3]
                    word['deprel'] = token['relation']
                    try:
                        word['proiel-head-id'] = token['head-id']
                    except KeyError:
                        # A token without a head is the root of its tree.
                        word['proiel-head-id'] = "root"
                    word['proiel-form'] = stuff[2].rstrip()

                count += 1
                output.write(str(word) + "\n")

            output.write("</div>\n")

    print("Done!")
    print("")
def parse_day(self, soup):
    """Walk the paragraphs of one day's page and dispatch each by its class.

    Empty <strong> tags are removed first. The page must start with the
    expected heading paragraph (layout depends on self.date). Every
    remaining <p> is normalised to one of a few canonical class names and
    handed to the matching handler (new_major_heading, new_minor_heading,
    new_person_speak, new_italic_speech, time_period) or appended to
    self.text.

    Raises ContextException when the page layout or a paragraph class is
    not recognised.
    """

    def _speaker_paragraph(name, remainder, prefix=None):
        # Build <p class="B1SpeakersName">[prefix]<strong>name</strong>remainder</p>.
        newp = Tag(soup, 'p', [('class', 'B1SpeakersName')])
        newspeaker = Tag(soup, 'strong')
        newspeaker.insert(0, name)
        newp.insert(0, remainder)
        newp.insert(0, newspeaker)
        if prefix is not None:
            newp.insert(0, prefix)
        return newp

    for s in soup.findAll(
            lambda tag: tag.name == 'strong' and tag.contents == []):
        s.extract()

    self.url = ''

    if self.date >= '2011-12-12':
        body_div = soup.find('div', 'grid_10') or soup.find('div', 'grid_7')
        if not body_div:
            raise ContextException('Could not find div containing main content.')

        body = body_div.findAll('p')

        nia_heading_re = re.compile(r'Session: 2011/2012')
        if not nia_heading_re.match(''.join(body[0](text=True))):
            raise ContextException('Missing NIA heading!')
        date_head = body[1].find(text=True)
        body = body[3:]  # body[2] is a PDF download link or ISBN
    else:
        body = soup('p')
        nia_heading_re = re.compile(
            r'''
            (the)?(\s| |<br>)*
            (transitional)?(\s| |<br>)*
            (
                northern(\s| |<br>)*
                ireland(\s| |<br>)*
            )?
            assembly
            ''', re.IGNORECASE | re.VERBOSE)
        if not nia_heading_re.match(''.join(body[0](text=True))):
            raise ContextException('Missing NIA heading!')
        date_head = body[1].find(text=True)
        body = body[2:]

    timestamp = ''
    self.speaker = (None, timestamp)
    self.text = ''
    for p in body:
        ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
        phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
        # Skip paragraphs whose first link has an href that starts with
        # something other than "h" or "/".
        if p.a and re.match(r'[^h/]', p.a.get('href', '')):
            continue
        if re.match(r'( )+$', ptext) or ptext == '':
            continue
        try:
            cl = p['class']
        except KeyError:
            raise ContextException('Missing class on paragraph: %s' % p)
        cl = re.sub(r' style\d', '', cl)

        # --- normalise the many source classes to a few canonical ones ---
        if cl == 'OralAnswers':
            # Main heading, or departmental heading (in bold)
            if ptext == 'Oral Answers to Questions' or (
                    p.find('strong', recursive=False)
                    and len(p.contents) == 1):
                cl = 'H3SectionHeading'
            elif re.match(r'\d+\.( | )+<strong>', phtml):
                cl = 'B1SpeakersName'
            elif p.strong:
                raise ContextException('Unhandled <strong> found in %s' % p)
            else:
                cl = 'H4StageHeading'
        if cl == 'OralWrittenQuestion' or cl == 'OralAnswers-Question':
            cl = 'B1SpeakersName'
        if cl in ('H1DocumentHeading', 'OralWrittenAnswersHeading',
                  'OralAnswers-H1Heading', 'WrittenStatement-Heading',
                  'H3SubHeading', 'OralAnswers-H2DepartmentHeading'):
            cl = 'H3SectionHeading'
        if cl in ('H4StageHeadingCxSpFirst', 'H4StageHeadingCxSpLast',
                  'OralAnswers-H3SubjectHeading'):
            cl = 'H4StageHeading'
        if cl == 'WrittenStatement-Content' or cl == 'B1BodyText-NumberedList' or cl == 'B2BodyTextBullet1':
            cl = 'B3BodyText'
        if cl == 'B3BodyText' and (
                phtml[0:8] == '<strong>'
                or re.match(r'\d+\.( | )+<strong>', phtml)):
            cl = 'B1SpeakersName'
        if cl == 'TimePeriod' and re.search(r'in the chair', phtml, re.IGNORECASE):
            cl = 'B3SpeakerinChair'
        if cl == 'B1BodyTextQuote':
            cl = 'B3BodyTextItalic'
        if p.em and len(p.contents) == 1:
            cl = 'B3BodyTextItalic'

        # --- dispatch on the canonical class ---
        if cl == 'H3SectionHeading':
            self.new_major_heading(ptext, timestamp)
        elif cl == 'H4StageHeading' or cl == 'H5StageHeading' or cl == 'B3BodyTextClause':
            self.new_minor_heading(ptext, timestamp)
        elif cl == 'B1SpeakersName':
            self.display_speech()
            # Speaker not marked up in <strong>: "Name: text".
            m = re.match(r'.*?:', phtml)
            if not p.strong and m:
                p = _speaker_paragraph(m.group(),
                                       phtml.replace(m.group(), ''))
            # Speaker not marked up in <strong>: "1. Name asked ...".
            m = re.match(r'([0-9]+\. )(.*?) asked', phtml)
            if not p.strong and m:
                p = _speaker_paragraph(m.group(2),
                                       phtml.replace(m.group(), ' asked'),
                                       prefix=m.group(1))
            if re.search(r"<strong>O(’|')Neill\)?</strong>", phtml):
                # `m` is the "asked" match from above and is None for an
                # ordinary speech; calling m.group() on it used to raise
                # AttributeError. Fall back to the "Name:" prefix, whose
                # <strong> fragments are merged into a single tag.
                if m is None:
                    m = re.match(r'.*?:', phtml)
                if m is not None:
                    p = _speaker_paragraph(
                        re.sub(r'</?strong>', '', m.group()),
                        phtml.replace(m.group(), ''))
            if not p.strong:
                raise ContextException('No strong in p! %s' % p)
            self.new_person_speak(p, timestamp)
        elif cl in ('B3BodyTextItalic', 'Q3Motion', 'BillAmend-AmendedText',
                    'BillAmend-Moved', 'BillAmend-withMinister',
                    'BillAmend-AmendMade', 'BillAmend-ClauseHeading',
                    'AyesNoes', 'AyesNoesParties', 'AyesNoesVotes',
                    'D3PartyMembers', 'B3SpeakerinChair',
                    'B3BodyTextSpeakerintheChair', 'H2DocumentStartTime',
                    'AyesNoesDivisionTellers', 'CommunityVoteTable'):
            match = re.match(
                r'The Assembly met at ((\d\d?)\.(\d\d) (am|pm)|noon)', phtml)
            if match:
                if match.group(1) == 'noon':
                    timestamp = '12:00'
                else:
                    hour = int(match.group(2))
                    # Convert 12-hour clock to 24-hour clock.
                    if hour < 12 and match.group(4) == 'pm':
                        hour += 12
                    timestamp = "%s:%s" % (hour, match.group(3))
                self.speaker = (self.speaker[0], timestamp)
            self.new_italic_speech(ptext, phtml)
        elif cl in ('Q3MotionBullet', 'BillAmend-AmendedTextIndent',
                    'BillAmend-AmendedTextIndent2',
                    'BillAmend-AmendedTextIndent3',
                    'BillAmend-QuotewithMinister'):
            self.text += '<p class="indentitalic">%s</p>\n' % phtml
        elif cl in ('B3BodyText', 'B3BodyTextnoindent', 'RollofMembersList',
                    'TableText'):
            self.text += '<p>%s</p>\n' % phtml
        elif cl == 'Q1QuoteIndented' or cl == 'Q1Quote':
            self.text += '<p class="indent">%s</p>\n' % phtml
        elif cl == 'TimePeriod':
            timestamp = self.time_period(ptext)
        elif cl == 'MsoNormal':
            continue
        else:
            raise ContextException('Uncaught paragraph! %s %s' % (cl, p))
    self.display_speech()
def update_testCase_result(src, soup):
    """Apply one result line `src` to the HTML report held in `soup`.

    The report heading is always re-stamped with the current local time.

    * A line without "fp_version" is a test-case result ("name=... verdict=...
      assc=... tw=... mgw=... script=... boa=... nelmon=... link=..."): the
      row of that test case gets its link, colours and PASS/FAIL texts
      updated. Failures while updating the row are reported on stdout.
    * Any other line is an execution summary ("execution=... mgw_version=...
      url=..."): the release label is filled in when it is still "NA" or
      empty, and a link to `url` is added after each heading in table_map
      that exists in the report.

    `soup` is modified in place; nothing is returned.
    Raises AttributeError when an expected "key=value" field is missing.
    """
    localtime = time.localtime()
    updateTime = "%s_%s_%s_%s_%s" % tuple(localtime[:5])
    # update head
    soup.h1.contents[0].replaceWith("BU test report %s" % updateTime)

    table_map = {
        "BU sanity test result. URL:": [
            "U6_BU_CI",
        ],
    }

    def _field(key):
        # Value of "key=value " in src (value ends at the next space).
        return re.search("%s=(.*?) " % key, src).group(1).strip()

    def _paint(cell, colour, label):
        cell['bgcolor'] = colour
        cell.contents[0].replaceWith(label)

    if not re.search("fp_version", src):
        # NOTE(review): strip("HZ-") removes any leading/trailing H, Z or -
        # characters, not just an "HZ-" prefix.
        tc_name = re.search("name=(.*?) ", src).group(1).strip("HZ-").strip()
        verdict = _field("verdict")
        assc = _field("assc")
        tw = _field("tw")
        mgw = _field("mgw")
        script = _field("script")
        boa = _field("boa")
        nelmon = _field("nelmon")
        link = re.search("link=(.*)", src).group(1).strip()

        pass_fail = {"PASS": "green", "FAIL": "red"}
        try:
            tc = soup.find(text=tc_name)  # text node holding the test case name
            tc.previous['href'] = link  # update link
            row = tc.parent.parent
            # verdict, tw, nelmon, assc, script, mgw, boa
            verdict_list = row.findNextSiblings('td', limit=7)

            # update verdict (also colours the name cell)
            colour = {"PASS": "green", "FAIL": "red",
                      "WARNING": "yellow"}.get(verdict)
            if colour is not None:
                row['bgcolor'] = colour
                _paint(verdict_list[0], colour, verdict)

            # update TW, Nelmon, assc, script
            for column, value in ((1, tw), (2, nelmon), (3, assc),
                                  (4, script)):
                colour = pass_fail.get(value)
                if colour is not None:
                    _paint(verdict_list[column], colour, value)

            # update mgw (substring match, unlike the other columns)
            if re.search("PASS", mgw):
                _paint(verdict_list[5], "green", "PASS")
            elif re.search("FAIL", mgw):
                _paint(verdict_list[5], "red", "FAIL")
            elif re.search("ALERT|CRITICAL", mgw):
                _paint(verdict_list[5], "#800000", "CRITICAL")
                row['bgcolor'] = "#800000"

            # update boa
            colour = pass_fail.get(boa)
            if colour is not None:
                _paint(verdict_list[6], colour, boa)
        except Exception:
            # Typically the test case name is not present in the report.
            print("%s haven't been included in BU test cases, please contact with BU team" % tc_name)
    else:
        # Not used below, but still required to be present in src.
        execution_name = _field("execution")
        mgw_version = re.search("mgw_version=(.*?)il", src).group(1).strip()
        url = re.search("url=(.*)", src).group(1).strip()

        # Since there is "\n" at the end of every line, nextSibling is needed
        # twice to reach the value cell.
        # update mgw_version, only when it is still NA or empty
        MGW = soup.find(text="release lable:")
        value_cell = MGW.parent.nextSibling.nextSibling
        if value_cell.contents[0] == "NA" or value_cell.contents[0] == "":
            value_cell.contents[0].replaceWith(mgw_version)

        # update urls for executions
        for k in table_map.keys():
            heading_re = re.compile("%s.*" % k)
            n = 1
            for _unused in table_map[k]:
                heading = soup.find(text=heading_re)
                if heading is None:
                    # heading not in this report, go to next execution
                    break
                node = heading.parent
                temp_soup = BeautifulSoup()
                tag = Tag(temp_soup, 'a')
                tag.insert(0, NavigableString("%s" % url))
                tag['href'] = "%s" % url
                node.insert(n, tag)
                n = n + 1
# No external pages realurl = urldata('realurl')[0].text if not realurl.startswith(SEARCHED_ROOT): continue parturl = realurl[len(SEARCHED_ROOT):] if any(parturl.startswith(prefix) for prefix in IGNORE_PREFIXES): continue q_idx = parturl.find('?') if q_idx >= 0: parturl = parturl[:q_idx] parturls.add(parturl) print >> sys.stderr, 'Building sitemap' for parturl in sorted(parturls, reverse=True): url = Tag(sitemap, 'url') urlset.insert(1, url) loc = Tag(sitemap, 'loc') url.insert(0, loc) text = NavigableString(SITE_ROOT + parturl) loc.insert(0, text) print >> sys.stderr, 'Outputting sitemap' print sitemap.prettify()
def parse_content(self, content, attachments, tags):
    """Replace attachment references in note HTML with imported images.

    <img> tags and <embed type="evernote/x-pdf"> tags whose markup contains
    src="?hash=<hash>" are looked up in `attachments` (dicts with 'hash'
    and 'filename'). Images are imported via self.import_file; PDFs are
    converted with pdf2image and replaced by a <span> holding one <img>
    per resulting image. `tags` is currently unused.

    Returns the resulting markup decoded from UTF-8.
    """
    soup = BeautifulSoup(content)
    pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

    def _attachment_filename(node):
        # Filename of the attachment referenced by node, or None.
        filehashmatch = pattern.search(str(node))
        if not filehashmatch:
            return None
        filehash = filehashmatch.group(1)
        return next(
            (att['filename'] for att in attachments
             if att['hash'] == filehash), None)

    # images
    for match in soup.findAll('img'):
        filename = _attachment_filename(match)
        if filename is not None:
            importedname = self.import_file(filename)
            match.replaceWith(Tag(soup, 'img', [('src', importedname)]))

    # pdfs
    for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):
        filename = _attachment_filename(match)
        if filename is not None:
            # convert pdf -> image
            images = pdf2image(filename)

            # import each jpg; enumerate keeps the order even when the
            # same image appears more than once
            imageTags = Tag(soup, "span")
            for position, image in enumerate(images):
                importedname = self.import_file(image)
                # add new image tag
                imageTags.insert(
                    position, Tag(soup, 'img', [('src', importedname)]))

            # replace embed with <img src...> for each image
            match.replaceWith(imageTags)

    # TODO: audio, video, plugins
    # TODO: highlights -- add class "highlight" to
    #   <span style="background-color: rgb(...); ">some text...</span>
    #   when the SETTING_TAG_HIGHLIGHTS tag is present in `tags`.
    # TODO: qa -- mark "A:" matches with class 'Evernote2Anki-Highlight'.

    return str(soup).decode('utf-8')
def rd_parse_post(entry):
    """Turn one feed entry of the blog into a dict of post fields.

    The entry's summary HTML is cleaned up: the first image is pointed at
    /media/posts/<file>, stripped of border/style/id, given the title as
    alt text and wrapped in a centred <p> that replaces the first link.
    The blogger footer div is removed.

    Returns a dict with keys src, created, updated, url, numcomments,
    blogger_id, title, blurb and content.
    """
    blogger_id = entry.id
    # "2010-01-02T03:04:05.678+02:00" -> "2010-01-02 03:04:05"
    created = entry.published.split('.')[0].replace('T', ' ')
    updated = entry.updated.split('.')[0].replace('T', ' ')
    link = entry.link
    url = link.replace('http://rugbydump.blogspot.com', '')
    title = entry.title.encode('ASCII', 'ignore')
    content = entry.summary
    content = renode.sub(node, content).encode('ASCII', 'ignore')

    # Fix up content a bit
    xcontent = bsoup(content)
    img = xcontent.img
    src = img['src'].split('/')[-1]
    img['src'] = '/media/posts/' + src
    img['alt'] = title
    del img['border']
    del img['style']
    del img['id']

    # Put a centered paragraph around the image
    np = Tag(xcontent, 'p', [('style', 'text-align: center;')])
    np.insert(0, img)

    try:
        # Takes away the link around the first image
        xcontent.a.replaceWith(np)
    except Exception:
        # No link to replace: just insert the paragraph at the top (the
        # blank link, if any, remains).
        xcontent.insert(0, np)

    # Remove the last div
    xcontent.findAll('div', attrs={'class': 'blogger-post-footer'})[0].extract()

    try:
        blurb = xcontent.span.contents[0]
    except Exception:
        # No <span>, or an empty one.
        blurb = ''

    content = xcontent.prettify()

    try:
        numcomments = entry.thr_total
    except AttributeError:
        numcomments = 0

    return {
        'src': src,
        'created': created,
        'updated': updated,
        'url': url,
        'numcomments': numcomments,
        'blogger_id': blogger_id,
        'title': title,
        'blurb': blurb,
        'content': content,
    }
a = span.find("a") example = a["href"] if os.path.isfile(example): example = get_beautiful_file(example) ul = example.body.find("ul") if ul is not None: span.append(ul) count += 1 replace_spans(a, span, example, count) # find all a which are examples and replace them with the standart <span><a></a></span> pattern for a in soup.findAll("a", {"class": "example_icon"}): span = Tag(soup, "span", [("class", "example_icon")]) soup.insert(0, span) example = a["href"] a["class"] = "" a.replaceWith(span) if os.path.isfile(example): example = get_beautiful_file(example) ul = example.body.find("ul") span.insert(0, a) span.insert(1, ul) count += 1 replace_spans(a, span, example, count) new_filename = filename[:-4] new_filename = new_filename + "_new.html" f = open(new_filename, "w") f.write(soup.prettify()) f.close()
def LIST_EPISODES_DB(seriestitle, season, poster, HDonly=False, path=False, NFO=True):
    """Create a stream file (and optionally an .nfo) for every episode.

    Episodes are loaded from the TV database for `seriestitle`/`season`
    (restricted by `HDonly`). For each one a stream file named
    "S<season>E<episode> - <title>" is created under `path`; when `NFO` is
    true an <episodedetails> XML file with the same base name is written
    next to it, using `poster` as thumb.

    NOTE(review): the default path=False cannot be joined into a file name,
    so callers presumably always pass a real directory when NFO is true.
    """
    import tv as tvDB
    episodes = tvDB.loadTVEpisodesdb(seriestitle, season, HDonly)
    # asin,seriestitle,season,episode,episodetitle,url,plot,airdate,runtime,isHD,isprime,watched
    for asin, seriestitle, season, episode, episodetitle, url, plot, airdate, runtime, isHD, isprime, watched in episodes:
        # Clean the title so that it can be used in a file name.
        episodetitle = episodetitle.replace(':', '').replace('/', ' ').replace('[HD]', '').strip()
        if seriestitle in episodetitle:
            episodetitle = episodetitle.replace(seriestitle, '').strip().strip(',')
        if 'Season ' in episodetitle:
            episodetitle = episodetitle.replace('Season ', 'S')
        filename = 'S%sE%s - %s' % (season, episode, cleanfilename(episodetitle))
        CreateStreamFile(filename, url, path)
        if not NFO:
            continue

        soup = BeautifulStoneSoup()
        episodedetails = Tag(soup, "episodedetails")
        soup.insert(0, episodedetails)
        episodedetails.insert(0, createElement('title', episodetitle + ' (Amazon)'))
        if season:
            episodedetails.insert(1, createElement('season', str(season)))
        if episode:
            episodedetails.insert(2, createElement('episode', str(episode)))
        if plot:
            episodedetails.insert(3, createElement('plot', plot))
        if airdate:
            episodedetails.insert(4, createElement('aired', airdate))
            episodedetails.insert(5, createElement('premiered', airdate))
        episodedetails.insert(6, createElement('thumb', poster))

        # <fileinfo><streamdetails><audio/><video/></streamdetails></fileinfo>
        audio = createElement('audio', '')
        audio.insert(0, createElement('channels', '2'))
        audio.insert(1, createElement('codec', 'aac'))

        video = createElement('video', '')
        video.insert(0, createElement('codec', 'h264'))
        if isHD:
            video.insert(1, createElement('height', '720'))
            video.insert(2, createElement('width', '1280'))
            video.insert(3, createElement('aspect', '1.778'))
        else:
            video.insert(1, createElement('height', '480'))
            video.insert(2, createElement('width', '640'))
        video.insert(3, createElement('scantype', 'Progressive'))

        streamdetails = createElement('streamdetails', '')
        streamdetails.insert(0, audio)
        streamdetails.insert(1, video)
        fileinfo = createElement('fileinfo', '')
        fileinfo.insert(0, streamdetails)
        episodedetails.insert(7, fileinfo)

        episodeNFO = os.path.join(path, filename + '.nfo')
        with open(episodeNFO, 'w') as nfo_file:
            nfo_file.write(str(soup))
def createElement(tagname, contents):
    """Return a new <tagname> Tag whose only child is the text `contents`.

    The tag is bound to a fresh, empty BeautifulSoup document.
    """
    element = Tag(BeautifulSoup(), tagname)
    element.insert(0, NavigableString(contents))
    return element
from ReadFile import readFile
from BeautifulSoup import BeautifulSoup, Tag

# Parse the sample book document.
doc = readFile("xml/book.xml")
soup = BeautifulSoup(doc)

# Replace every <author> with a numbered <newTag id="N">text #N</newTag>.
tags = soup.findAll("author")
i = 0
for i, oldTag in enumerate(tags, 1):
    newTag = Tag(soup, "newTag", [("id", str(i))])
    newTag.insert(0, "text #" + str(i))
    oldTag.replaceWith(newTag)

print(soup.prettify())
readable_article = Document(html).summary().encode('utf-8') readable_title = Document(html).short_title() manifest += '<item id="article_%s" href="article_%s.html" media-type="application/xhtml+xml"/>\n' % ( i + 1, i + 1) spine += '<itemref idref="article_%s" />\n' % (i + 1) toc += '<navPoint id="navpoint-%s" playOrder="%s"> <navLabel> <text>%s</text> </navLabel> <content src="article_%s.html"/> </navPoint>' % ( i + 2, i + 2, cgi.escape(readable_title), i + 1) soup = BeautifulSoup(readable_article) #Add xml namespace soup.html["xmlns"] = "http://www.w3.org/1999/xhtml" #Insert header body = soup.html.body h1 = Tag(soup, "h1", [("class", "title")]) h1.insert(0, cgi.escape(readable_title)) body.insert(0, h1) #Add stylesheet path head = soup.find('head') if head is None: head = Tag(soup, "head") soup.html.insert(0, head) link = Tag(soup, "link", [("type", "text/css"), ("rel", "stylesheet"), ("href", "stylesheet.css")]) head.insert(0, link) article_title = Tag(soup, "title") article_title.insert(0, cgi.escape(readable_title)) head.insert(1, article_title) #Download images
def makeImagesLocal(soup, params):
    """Deal with internal and external image references.

    Every <img> in `soup` (except those whose class contains
    'internal-resource') is made local: its data is written to a temporary
    file below params['destdir'] and `src` is pointed at that file's
    basename.

    * Absolute 'http...' sources are downloaded; on URLError the <img> is
      removed.
    * Other sources are resolved to a content object through
      params['context'] (direct traversal, portal path, resolveuid, the
      parent container path, and finally params['images']). A resolved
      image is wrapped in <div class="image-container"> together with an
      optional caption; an unresolved one is left untouched.

    `params` is read for the keys 'request', 'destdir', 'context' and
    'images'. `soup` is modified in place.
    """

    def _save_image_data(data):
        # Write data to a new temporary file in destdir; return its basename.
        tmpname = tempfile.mktemp(dir=params['destdir'])
        with open(tmpname, 'wb') as handle:
            handle.write(data)
        return os.path.basename(tmpname)

    for img in soup.findAll('img'):
        # 'internal' images are marked with class="internal resource"
        # in order to prevent image fetching later on
        if 'internal-resource' in (img.get('class') or ''):
            continue

        src = img['src']
        # Turn absolute URLs of this site into site-relative ones.
        if params['request'] and src.startswith(params['request'].BASE0) \
                and '++resource++' not in src:
            src = src.replace(params['request'].BASE0 + '/', '')

        if src.startswith('http'):
            try:
                img_data = urllib2.urlopen(str(src)).read()
            except urllib2.URLError:
                LOG.warn('No image found: %s - removed from output' % src)
                img.extract()
                continue

            img['src'] = _save_image_data(img_data)
            continue

        # image with relative URL

        # first lookup image by direct traversal
        img_path = urllib.unquote(str(src))
        img_obj = params['context'].restrictedTraverse(img_path, None)
        if img_obj is None:
            img_path2 = getToolByName(
                params['context'], 'portal_url').getPortalPath() + img_path
            img_obj = params['context'].restrictedTraverse(img_path2, None)

        if img_obj is None and 'resolveuid' in src:
            mo = uid_reg.search(src)
            if mo:
                uid = mo.group(0)
                img_obj = params['context'].reference_catalog.lookupObject(uid)

        # For scaled images ('_preview', '_large' etc.) use the original
        # image always (which is stored as acquisition parent)
        if img_obj:
            has_portal_type = hasattr(aq_base(img_obj.aq_inner), 'portal_type')
            if has_portal_type and img_obj.portal_type == img_obj.aq_parent.portal_type:
                img_obj = img_obj.aq_parent

        if img_obj is None:
            # nothing found, check the next parent node with a 'path'
            # parameter referring to the origin document
            parent_container_path = pathFromParent(soup, img)
            if parent_container_path is not None:
                img_obj = params['context'].restrictedTraverse(
                    '%s/%s' % (parent_container_path, img_path), None)

        # still nothing found
        if img_obj is None:
            img_split = img_path.split('/')
            # Drop a trailing scale component such as "image_preview".
            if img_split[-1].startswith('image_') or img_split[-1].startswith('image-'):
                img_path = '/'.join(img_split[:-1])
            for image_path in params['images']:
                if image_path.endswith(img_path):
                    img_obj = params['context'].restrictedTraverse(
                        image_path, None)
                    break

        # get hold of the image in original size
        if img_obj:
            # thumbnails have an Image as aq_parent
            if img_obj.aq_parent.portal_type == 'Image':
                img_obj = img_obj.aq_parent

        if not img_obj:
            LOG.warn('No image found: %s - not removed, keeping it' % img_path)
            continue

        img_data = None
        # NOTE(review): both attributes are tried, so '_data' wins when an
        # object has both; possibly the first hit was meant to win.
        for attr in ['data', '_data']:
            try:
                img_data = str(getattr(img_obj, attr))
            except AttributeError:
                pass
        if img_data is None:
            LOG.warn('No image found: %s - removed from output' % img_path)
            img.extract()
            continue

        img['src'] = _save_image_data(img_data)

        # image scaling
        try:
            scale = img_obj.getField('pdfScale').get(img_obj)
        except AttributeError:
            scale = 100

        # add content-info debug information
        # don't add scale as style since the outer image-container
        # has the style set
        img['scale'] = str(scale)

        # now move <img> tag into a dedicated <div>
        div = Tag(soup, 'div')
        div['class'] = 'image-container'
        div['scale'] = str(scale)
        div.insert(0, copy.copy(img))

        # image caption
        description = img_obj.Description()
        img_caption = Tag(soup, 'div')
        img_caption['class'] = 'image-caption'

        # exclude from image enumeration
        exclude_field = img_obj.getField('excludeFromImageEnumeration')
        if exclude_field and not exclude_field.get(img_obj):
            span = Tag(soup, 'span')
            classes = ['image-caption-text']
            if description:
                classes.append('image-caption-text-with-text')
            else:
                classes.append('image-caption-text-without-text')
            span['class'] = ' '.join(classes)
            if description:
                span.insert(0, NavigableString(description))
            img_caption.insert(0, span)
            div.append(img_caption)

        img.replaceWith(div)
def main(argv=None):
    """Command-line entry point: compile one .js/.html/.pkg input.

    `argv` defaults to sys.argv[1:]. Exactly one positional argument (the
    input file) is required. The module-level `minify` function is selected
    from the options, the input is compiled, optionally minified, and -
    for .html input - injected as a <script> at the top of the page's
    <head>. The result is written to options.output when that is set.

    Returns the generated output string; exits with status 1 on bad
    arguments.
    """
    global minify

    if argv is None:
        argv = sys.argv[1:]
    parser = make_option_parser()
    (options, args) = parser.parse_args(argv)
    log.debug(options)
    log.setLevel(options.verbose or logging.WARN)

    if not options.minify:
        # Identity function: leave the source untouched.
        minify = lambda x, path=None: x
    elif options.google:
        minify = google_jar_minify
    elif options.google_rest:
        minify = google_minify

    if len(args) != 1:
        print("Invalid position arguments")
        parser.print_help()
        sys.exit(1)

    INPUT = args[0]
    OUTPUT = options.output
    options.initialImport = ""

    if INPUT.split('.')[-1] not in ('html', 'js', 'pkg'):
        print("Invalid input file; jsio_compile only operats on .js and .html files")
        sys.exit(1)

    compile_kwargs = {}
    if INPUT.endswith('pkg'):
        INPUT, options, compile_kwargs = \
            load_package_configuration(INPUT, options)
    output = compile_source(INPUT, options, **compile_kwargs)

    # The root script needs to be able to recognize itself so that it can
    # figure out where it is. Rewriting the generated script to store the
    # expected script name (and exposing jsio globally) is currently
    # disabled; only the initial import is appended.
    output += options.initialImport

    if options.minify:
        log.info("Minifying")
        output = minify(output, path='output')
    else:
        log.info("Skipping minify")

    print("Writing output %s" % OUTPUT)

    # TODO: clean up this hack to write .html files back out
    if INPUT.endswith('.html'):
        from BeautifulSoup import Tag
        soup = Soup(get_source(INPUT))
        tag1 = Tag(soup, "script")
        tag1.insert(0, output)
        soup.head.insert(0, tag1)
        output = soup.prettify()

    if OUTPUT:
        f = fileopen(OUTPUT, 'w')
        try:
            f.write(output)
        finally:
            f.close()
    print("RETURNING %s" % len(output))
    return output
def addServerInfo(soup, serverinfo, uid, snamevalue, urlvalue, unamevalue, passwordvalue):
    """
    @description: Adds a <server> entry, with servername, serverurl,
                  username and password children, at the top of serverinfo
    @todo: None
    @param soup: soup the new tags are created for
    @param serverinfo: tag that receives the new <server> tag
    @param uid: Unique Id of the server, stored as the "id" attribute
    @param snamevalue: String, server name
    @param urlvalue: String, url of the server
    @param unamevalue: String, UserName for the server
    @param passwordvalue: String, password for the server
    @return: Boolean, True if added successfully, False if a server with
             that name already exists
    """
    snamevalue = unicode(snamevalue)
    if ifServerNameExists(soup, snamevalue):
        return False

    server = Tag(soup, "server")
    serverinfo.insert(0, server)

    # Field name -> text value, in document order.
    field_values = (
        ("servername", snamevalue),
        ("serverurl", urlvalue),
        ("username", unamevalue),
        ("password", passwordvalue),
    )

    # Create the (still empty) field tags and hang them under <server>.
    field_tags = [Tag(soup, field_name) for field_name, _value in field_values]
    for position, field_tag in enumerate(field_tags):
        server.insert(position, field_tag)

    # Attribute on the server tag
    server["id"] = uid

    # Text values of the fields
    for field_tag, (_name, value) in zip(field_tags, field_values):
        field_tag.insert(0, value)
    return True
def LIST_TVSHOWS(NFO=False):
    """Export the favourite TV shows into the library folder.

    For every show a directory is created under TV_SHOWS_PATH, one
    sub-directory per season, and the episodes of each season are exported
    through LIST_EPISODES_DB. When `NFO` is true a tvshow.nfo file is
    written into the show directory: the TheTVDB series URL when a series
    id is known, otherwise the generated <tvshow> XML.
    """
    import tv as tvDB
    shows = tvDB.loadTVShowdb(favorfilter=True)
    if common.addon.getSetting('enablelibraryfolder') == 'true':
        SetupAmazonLibrary()
    elif common.addon.getSetting('customlibraryfolder') != '':
        CreateDirectory(MOVIE_PATH)
        CreateDirectory(TV_SHOWS_PATH)

    for seriestitle, plot, creator, network, genres, actors, year, stars, votes, episodetotal, watched, unwatched, isHD, isprime, favor, TVDBbanner, TVDBposter, TVDBfanart, TVDBseriesid in shows:
        directorname = os.path.join(TV_SHOWS_PATH, seriestitle.replace(':', ''))
        CreateDirectory(directorname)

        if NFO:
            soup = BeautifulStoneSoup()
            tvshow = Tag(soup, "tvshow")
            soup.insert(0, tvshow)
            tvshow.insert(0, createElement('title', seriestitle))
            if year:
                tvshow.insert(1, createElement('year', str(year)))
            if plot:
                tvshow.insert(2, createElement('plot', plot))
            if votes:
                tvshow.insert(3, createElement('votes', str(votes)))
            if stars:
                tvshow.insert(4, createElement('rating', str(stars)))
            if creator:
                tvshow.insert(5, createElement('credits', creator))
            if network:
                tvshow.insert(6, createElement('studio', network))
            if TVDBseriesid:
                tvshow.insert(7, createElement('id', TVDBseriesid))
                episodeguide = createElement('episodeguide', '')
                url = createElement(
                    'url',
                    'http://www.thetvdb.com/api/03B8C17597ECBD64/series/' + TVDBseriesid + '/all/en.zip')
                url['cache'] = TVDBseriesid + '.xml'
                episodeguide.insert(0, url)
                tvshow.insert(8, episodeguide)
            if TVDBfanart:
                fanart_tag = createElement('fanart', '')
                fanart_tag['url'] = 'http://thetvdb.com/banners/'
                fanart_tag.insert(
                    0,
                    createElement(
                        'thumb',
                        TVDBfanart.replace('http://thetvdb.com/banners/', '')))
                tvshow.insert(9, fanart_tag)
            if TVDBposter:
                tvshow.insert(10, createElement('thumb', TVDBposter))
            elif TVDBbanner:
                tvshow.insert(11, createElement('thumb', TVDBbanner))

            # Everything from here on is placed after the fixed slots above.
            index = 11
            if genres:
                for genre in genres.split(','):
                    index += 1
                    tvshow.insert(index, createElement('genre', genre))
            if actors:
                for actor in actors.split(','):
                    if actor is not None:
                        index += 1
                        actortag = createElement('actor', '')
                        actortag.insert(0, createElement('name', actor))
                        tvshow.insert(index, actortag)

        seasonTotal, episodeTotal, seasons = LIST_TV_SEASONS(seriestitle, isHD)
        for season, poster, hasHD in seasons:
            name = 'Season ' + str(season)
            if hasHD:
                name += ' HD'
            seasonpath = os.path.join(directorname, name)
            CreateDirectory(seasonpath)
            if NFO:
                postertag = createElement('thumb', poster)
                postertag['type'] = 'season'
                postertag['season'] = str(season)
                index += 1
                tvshow.insert(index, postertag)
            LIST_EPISODES_DB(seriestitle, int(season), poster, HDonly=hasHD, path=seasonpath)

        if NFO:
            index += 1
            tvshow.insert(index, createElement('season', seasonTotal))
            index += 1
            tvshow.insert(index, createElement('episode', episodeTotal))

            tvshowNFO = os.path.join(directorname, 'tvshow.nfo')
            data = str(soup)
            if TVDBseriesid:
                # With a known series id only the TheTVDB URL is written
                # and the generated XML is discarded.
                data = 'http://thetvdb.com/index.php?tab=series&id=' + TVDBseriesid
            with open(tvshowNFO, 'w') as nfo_file:
                nfo_file.write(data)
def mexhelpextract(mexnames):
    """Run MATLAB to dump the help of each MEX file and post it to the wiki.

    For every name in *mexnames* the helper script ``_mexscript`` is run in
    MATLAB, which is expected to leave a ConfigParser-style file named after
    the MEX file in ``_tmpdir`` with one section per subfunction and the
    options ``usage``, ``help`` and ``seealso``.  For each section the wiki
    text is assembled, merged into the ``div.subfct`` containers already
    present in the page's edit form, and sent with ``post``.

    Exits the process via ``sys.exit`` when the edit page cannot be fetched
    or has no ``body`` form field.
    """
    #print 'processing mex files: ' + mexnames.__repr__()
    from ConfigParser import RawConfigParser as ConfigParser, Error as error

    for mexname in mexnames:
        # ConfigParser for the three elements per subfunctions written to tmpdir
        # [SubFunction]
        # usage: 'xyz'
        # help: 'xyz'
        # seealso: 'xyz'
        config = ConfigParser({'usage': [], 'help': [], 'seealso': []})

        # assemble command line for matlab
        matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % (
            _tmpdir,
            os.path.splitext(os.path.basename(_mexscript))[0],
            mexname,
            _tmpdir)
        cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd

        # and execute matlab w/ the temporary script we wrote earlier
        try:
            print('running MATLAB for %s in %s' % (mexname, _tmpdir))
            stdin, stderr = os.popen4(cmd)
            print(stderr.read())
            stdin.close()
            stderr.close()
        except Exception:
            # Best effort: report and carry on, but let Ctrl-C / SystemExit
            # propagate instead of being swallowed.
            print('could not dump help for %s into %s' % (mexname, _tmpdir))

        # config.read returns the list of files it managed to parse.
        cfgfile = config.read(os.path.join(_tmpdir, mexname))
        if not cfgfile:
            print("skipping " + mexname + " (no output)")
            continue

        subfunctions = config.sections()
        print('processing subfunctions: ' + repr(subfunctions))
        for subfunction in subfunctions:
            # read in the strings for this subfunction
            usage = config.get(subfunction, 'usage')
            help_text = config.get(subfunction, 'help')
            seealso = config.get(subfunction, 'seealso')

            headline = ('===[[' + subfunction + ' ' + mexname + '(\''
                        + subfunction + '\')]]===\n')
            breadcrumb = ("==[[Psychtoolbox]] › [["
                          + mexname + "]].{mex*,dll} subfunction==\n\n")

            # scrub the text for main text only
            body = beackern(help_text)
            # The doubled percent signs are literal wiki markup, not a
            # format escape: no % formatting is applied to this string.
            docstring = ('%%(matlab;Usage)'
                         + usage
                         + '%%\n'
                         + body
                         + '\n\n')
            if seealso:
                docstring = (docstring + '<<=====See also:=====\n'
                             + seealso + '<<')
            text = '""' + headline + breadcrumb + docstring + '""'

            # retrieve old body text, to update or concatenate with
            # synonymous subfunctions
            #
            # browse the page
            title = re.sub(r"[^\w]|_", "", subfunction)
            try:
                mech.open(baseurl + title + "/edit")
            except HTTPError:
                # sys.exc_info keeps this valid on every Python 2.x/3.x.
                e = sys.exc_info()[1]
                sys.exit(
                    "retrieving old text during posting of this mex function failed: %d: %s"
                    % (e.code, e.msg))

            # get text from the edit form
            mech.select_form(nr=1)
            try:
                oldbody = mech["body"]
            except Exception:
                print('No id="body" form. Figure this out first. cf. page text above.')
                for form in mech.forms():
                    print(form)
                sys.exit(
                    "retrieving old body text failed while processing page: "
                    + baseurl + title + '/edit')

            # parse embedded structuring HTML tags in the wiki text
            soup = BeautifulSoup(oldbody)
            # check if the subfunction is already present, by CSS 'class' and 'id'
            subfct = soup.find('div', {'class': "subfct", 'id': mexname})
            if subfct:
                # replace the text of the container DIV
                if subfct.contents:
                    subfct.contents[0].replaceWith(text)
                else:
                    # An existing but empty container has nothing to replace.
                    subfct.insert(0, NavigableString(text))
            else:
                # construct new DIV to hold the text
                subfctDIV = Tag(soup, "div")
                subfctDIV['class'] = 'subfct'
                subfctDIV['id'] = mexname
                subfctDIV.insert(0, NavigableString(text))
                # insert the new div
                soup.insert(len(soup), subfctDIV)

            # Now scoop the good well-formed divs out of the soup
            divs = soup('div', {'class': "subfct"})
            # and drop them into fresh yummy cheese soup
            cheesesoup = BeautifulSoup()
            # drop good divs into the soup, one by one
            for div in divs:
                # remove the unneeded style attribute, we finally
                # have this stuff defined in the ptbdocs.css now.
                del div['style']
                # escape the HTML tags for wiki parser
                cheesesoup.append(NavigableString('\n""'))
                cheesesoup.append(div)
                cheesesoup.append(NavigableString('""\n'))
            post(subfunction, cheesesoup.renderContents())
def web2epub(urls, outfile=None, cover=None, title=None, author=None,
             images=None, footer=None, links=None, language=""):
    """Fetch *urls*, extract their readable content and write an EPUB file.

    urls     -- sequence of article URLs; each becomes one chapter.
    outfile  -- output path; defaults to a timestamped ``.epub`` name.
    cover    -- path of a local cover image.  Without one, the first PNG
                favicon found in an article page is used as the cover.
    title    -- book title; defaults to "<author> <date>" for several URLs
                or to the article title for a single URL.
    author   -- book author; defaults to the host name of the first article.
    images   -- when not None, images referenced by the articles are
                downloaded into the archive.
    footer   -- when not None, the source URL is appended to each article.
    links    -- when None, hyperlinks are replaced by plain
                ``span.link-removed`` elements.
    language -- value written to ``dc:language``.

    Returns *outfile*.
    """
    # Local import: only needed for the optional mogrify call below.
    import subprocess

    def fetch(address):
        # Read a resource completely and always release the connection.
        handle = urllib.urlopen(address)
        try:
            return handle.read()
        finally:
            handle.close()

    if outfile is None:
        outfile = time.strftime('%Y-%m-%d-%S.epub')
    nos = len(urls)
    cpath = ''
    ctype = 'image/gif'
    if cover is not None:
        cover_abs = os.path.abspath(cover)
        cpath = 'images/cover' + os.path.splitext(cover_abs)[1]
        ctype = mimetypes.guess_type(os.path.basename(cover_abs))[0]

    container_xml = (
        '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"> '
        '<rootfiles> '
        '<rootfile full-path="OEBPS/Content.opf" media-type="application/oebps-package+xml"/> '
        '</rootfiles> '
        '</container>')

    # The index file is another XML file, living per convention
    # in OEBPS/content.opf
    index_tpl = (
        '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="bookid"> '
        '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"> '
        '<dc:title>%(title)s</dc:title> '
        '<dc:creator>%(author)s</dc:creator> '
        '<dc:date>%(date)s</dc:date> '
        '<dc:language>%(lang)s</dc:language> '
        '<meta name="cover" content="cover-image" /> '
        '</metadata> '
        '<manifest> '
        '<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/> '
        '<item id="cover" href="cover.html" media-type="application/xhtml+xml"/> '
        '<item id="cover-image" href="%(front_cover)s" media-type="%(front_cover_type)s"/> '
        '<item id="css" href="stylesheet.css" media-type="text/css"/> '
        '%(manifest)s '
        '</manifest> '
        '<spine toc="ncx"> '
        '<itemref idref="cover" linear="yes"/> '
        '%(spine)s '
        '</spine> '
        '<guide> '
        '<reference href="cover.html" type="cover" title="Cover"/> '
        '</guide> '
        '</package>')

    toc_tpl = (
        "<?xml version='1.0' encoding='utf-8'?> "
        '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" '
        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd"> '
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> '
        '<head> '
        '<meta name="dtb:depth" content="1"/> '
        '<meta name="dtb:totalPageCount" content="0"/> '
        '<meta name="dtb:maxPageNumber" content="0"/> '
        '</head> '
        '<docTitle> <text>%(title)s</text> </docTitle> '
        '<navMap> '
        '<navPoint id="navpoint-1" playOrder="1"> '
        '<navLabel> <text>Cover</text> </navLabel> '
        '<content src="cover.html"/> '
        '</navPoint> '
        '%(toc)s '
        '</navMap> '
        '</ncx>')

    cover_tpl = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
        '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> '
        '<html xmlns="http://www.w3.org/1999/xhtml"> '
        '<head> '
        '<title>Cover</title> '
        '<style type="text/css"> '
        'img { max-width: 100%%; } '
        '.centerpage {text-align:center; vertical-align:middle; '
        'margin-right: 100px; margin-left: 100px;} '
        '</style> '
        '</head> '
        '<body> '
        '<div class="centerpage"> '
        '<img src="%(front_cover)s" alt="Cover image"/> '
        '<h2>%(author)s</h2> '
        '<h1>%(title)s</h1> '
        '<div id="cover-image"> </div> '
        '</div> '
        '</body> '
        '</html>')

    stylesheet_tpl = ' p, body { orphans: 2; widows: 2; } '

    manifest = ""
    spine = ""
    toc = ""
    icon = None

    epub = MyZipFile(outfile, 'w', zipfile.ZIP_DEFLATED)
    try:
        # The first file must be named "mimetype"
        epub.writestr("mimetype", "application/epub+zip", zipfile.ZIP_STORED)
        # We need an index file, that lists all other HTML files
        # This index file itself is referenced in the META_INF/container.xml file
        epub.writestr("META-INF/container.xml", container_xml)

        for i, url in enumerate(urls):
            print("Reading URL %s of %s --> %s " % (i + 1, nos, url))
            html = fetch(url)
            document = readability.Document(html)
            document.TEXT_LENGTH_THRESHOLD = 200  # Gives better results than default
            readable_article = document.summary()
            readable_title = document.short_title()
            if readable_article is None:
                continue

            try:
                soup = BeautifulSoup(readable_article)
                # Add xml namespace
                soup.html["xmlns"] = "http://www.w3.org/1999/xhtml"
            except Exception:
                # Unparseable article: skip it without registering it.
                continue

            # Register the article only once it is known to be usable, so the
            # manifest never references a file that is not in the archive.
            manifest += '<item id="article_%s" href="article_%s.html" media-type="application/xhtml+xml"/>\n' % (
                i + 1, i + 1)
            spine += '<itemref idref="article_%s" />\n' % (i + 1)
            toc += '<navPoint id="navpoint-%s" playOrder="%s"> <navLabel> <text>%s</text> </navLabel> <content src="article_%s.html"/> </navPoint>' % (
                i + 2, i + 2,
                readable_title.encode('ascii', 'xmlcharrefreplace'), i + 1)

            # Insert header if it is not already there
            body = soup.html.body
            # TODO: FIXME, this does not work yet, e.g., for ZEIT
            if ascii_chars(readable_title) not in ascii_chars(readable_article):
                h1 = Tag(soup, "h1", [("class", "title")])
                h1.insert(0, escape(readable_title))
                body.insert(0, h1)

            if links is None:
                for x in body.findAll('a'):
                    try:
                        tag = Tag(soup, 'span', [("class", "link-removed")])
                        tag.insert(0, x.text)
                        body.a.replaceWith(tag)
                    except Exception:
                        # Best effort: leave a link in place if it cannot
                        # be replaced.
                        pass

            # Add stylesheet path
            head = soup.find('head')
            if head is None:
                head = Tag(soup, "head")
                soup.html.insert(0, head)
            link = Tag(soup, "link", [("type", "text/css"),
                                      ("rel", "stylesheet"),
                                      ("href", "stylesheet.css")])
            head.insert(0, link)
            article_title = Tag(soup, "title")
            article_title.insert(0, escape(readable_title))
            head.insert(1, article_title)

            # If we do not have an author for the book, then use the URL
            # hostname of the first article
            if author is None:
                author = str(urlparse.urlparse(url).hostname.replace("www.", "")) or ''

            # If we do not have a title for the book, then use the date
            if title is None:
                if len(urls) > 1:
                    title = author + " " + str(time.strftime('%d.%m.%Y'))
                else:
                    title = readable_title

            if images is not None:
                # Download images (only <img> tags that carry a src).
                for j, image in enumerate(soup.findAll("img", src=True)):
                    # Convert relative urls to absolute urls
                    imgfullpath = urlparse.urljoin(url, image["src"])
                    # Remove query strings from url
                    imgpath = urlparse.urlunsplit(
                        urlparse.urlsplit(imgfullpath)[:3] + ('', '',))
                    print(" Downloading image: %s %s" % (j + 1, imgpath))
                    imgfile = os.path.basename(imgpath)
                    # imgpath comes from the downloaded page, so it is passed
                    # as a single argv entry rather than through a shell.
                    try:
                        subprocess.call(["mogrify", "-resize", "1200x1200",
                                         "-quality", "50", imgpath])
                    except OSError:
                        # mogrify is optional; carry on when it is missing.
                        pass
                    filename = 'article_%s_image_%s%s' % (
                        i + 1, j + 1, os.path.splitext(imgfile)[1])
                    if imgpath.lower().startswith("http"):
                        epub.writestr('OEBPS/images/' + filename,
                                      fetch(imgpath))
                        image['src'] = 'images/' + filename
                        manifest += '<item id="article_%s_image_%s" href="images/%s" media-type="%s"/>\n' % (
                            i + 1, j + 1, filename,
                            mimetypes.guess_type(filename)[0])

            if footer is not None:
                p = Tag(soup, "p", [("class", "source-url")])
                p.insert(0, url)
                body.append(p)

            epub.writestr('OEBPS/article_%s.html' % (i + 1), str(soup))

            # Look for a favicon only when no explicit cover was given;
            # otherwise it would overwrite the cover's path and type.
            if cover is None and icon is None:
                icons_xpath = '//link[contains(@rel, "icon")][contains(@href, ".png")]/@href'
                # FIXME: This fails on some encodings!
                # FIXME: Unicode strings with encoding declaration are not
                # supported. Please use bytes input or XML fragments without
                # declaration.
                tree = lxml.html.fromstring(html)
                results = tree.xpath(icons_xpath)
                if results:
                    icon = urlparse.urljoin(url, sorted(results)[0])
                    cpath = "images/cover"
                    ctype = "image/png"
                    print(icon)

        # This should never happen, but if it does, .encode crashes; hence
        # we catch it
        if title is None:
            title = "Unknown"
        if author is None:
            author = "Unknown"

        # Metadata about the book
        info = dict(title=title.encode('ascii', 'xmlcharrefreplace'),
                    author=author.encode('ascii', 'xmlcharrefreplace'),
                    date=time.strftime('%Y-%m-%d'),
                    lang=language,
                    front_cover=cpath,
                    front_cover_type=ctype)

        epub.writestr('OEBPS/cover.html', cover_tpl % info)
        if cover is not None:
            epub.write(os.path.abspath(cover),
                       'OEBPS/images/cover' + os.path.splitext(cover)[1],
                       zipfile.ZIP_DEFLATED)
        elif icon is not None:
            epub.writestr('OEBPS/images/cover', fetch(icon),
                          zipfile.ZIP_DEFLATED)

        info['manifest'] = manifest
        info['spine'] = spine
        info['toc'] = toc

        # Finally, write the index and toc
        epub.writestr('OEBPS/stylesheet.css', stylesheet_tpl)
        epub.writestr('OEBPS/Content.opf', index_tpl % info)
        epub.writestr('OEBPS/toc.ncx', toc_tpl % info)
    finally:
        # Closing writes the zip central directory; do it explicitly rather
        # than relying on garbage collection.
        epub.close()

    return outfile
def hack_rst_examples(bs):
    """Turn every literal block of the parsed document into a live example.

    Each ``<pre class="literal-block">`` in *bs* is moved into the first cell
    (``example_in``) of a new one-row ``table.example``; the second cell
    (``example_out``) receives a ``<span id="gen_N">`` holding the decoded
    first child of the block.  Afterwards one ``<script>`` tag per entry of
    ``script_includes`` and an inline ``run_demos()`` script are inserted
    into the head, the body gets an ``onload`` attribute calling it, and the
    number of examples is printed.
    """
    literal_blocks = bs.findAll('pre', {'class': 'literal-block'})
    for offset, pre in enumerate(literal_blocks):
        number = offset + 1

        # table > tbody > tr > td.example_in, assembled from the outside in
        table = Tag(bs, "table", [('class', 'example')])
        rows = Tag(bs, "tbody")
        table.insert(0, rows)
        row = Tag(bs, "tr")
        rows.insert(0, row)
        source_cell = Tag(bs, "td", [('class', 'example_in')])
        row.insert(0, source_cell)

        # put the table where the <pre> was, then re-home the <pre> inside it
        pre.replaceWith(table)
        source_cell.insert(0, pre)

        output_cell = Tag(bs, "td", [('class', 'example_out')])
        target = Tag(bs, "span", [('id', 'gen_%d' % number)])
        target.insert(0, htmldecode(pre.contents[0]))
        output_cell.insert(0, target)
        row.insert(1, output_cell)

    example_count = len(literal_blocks)

    head = bs.html.head
    for src in script_includes:
        include = Tag(bs, 'script', [('type', 'text/javascript'), ('src', src)])
        head.insert(-1, include)

    # now insert some script to execute all the examples
    run_calls = ''.join(
        "mjt.run('gen_%d');\n" % n for n in range(1, example_count + 1))
    initscript = Tag(bs, 'script', [('type', 'text/javascript')])
    initscript.insert(0, ' function run_demos() { %s } ' % run_calls)
    head.insert(-1, initscript)

    bs.html.body.attrs.append(('onload', 'run_demos();'))
    print('%d examples found' % example_count)
def SetupHuluLibrary(self):
    """Register the Hulu movie and TV folders as video sources.

    Creates ``MOVIE_PATH`` and ``TV_SHOWS_PATH``, then edits ``sources.xml``
    in the profile directory so that its ``<video>`` section contains a
    "Hulu Movies" and a "Hulu Subscriptions" source.  Sources whose name is
    already present are left alone.

    When ``sources.xml`` cannot be read, an error dialog is shown, a default
    skeleton file is written and the sources are added to that skeleton.
    When the file has no ``<video>`` section, an error dialog is shown and
    nothing is changed.
    """
    print("Trying to add Hulu source paths...")
    source_path = os.path.join(xbmc.translatePath('special://profile/'),
                               'sources.xml')
    dialog = xbmcgui.Dialog()

    self.CreateDirectory(MOVIE_PATH)
    self.CreateDirectory(TV_SHOWS_PATH)

    try:
        source_file = open(source_path, 'r')
        try:
            contents = source_file.read()
        finally:
            source_file.close()
    except IOError:
        dialog.ok("Error",
                  "Could not read from sources.xml, does it really exist?")
        # Build the default skeleton, write it out, and keep it as the
        # document to edit below.
        pieces = ["<sources>\n"]
        for section in ("programs", "video", "music", "pictures", "files"):
            pieces.append(" <%s>" % section)
            pieces.append(" <default pathversion=\"1\"></default>")
            pieces.append(" </%s>" % section)
        pieces.append("</sources>")
        contents = "".join(pieces)
        source_file = open(source_path, 'w')
        try:
            source_file.write(contents)
        finally:
            source_file.close()

    soup = BeautifulSoup(contents)
    video = soup.find("video")
    if video is None:
        dialog.ok("Error",
                  "sources.xml has no <video> section to add Hulu paths to.")
        return

    def add_source(label, path):
        # Append <source><name>label</name><path pathversion="1">path</path>
        # </source> to the video section unless the label already exists.
        if len(soup.findAll(text=label)) >= 1:
            return
        source_tag = Tag(soup, "source")
        name_tag = Tag(soup, "name")
        name_tag.insert(0, label)
        path_tag = Tag(soup, "path")
        path_tag['pathversion'] = 1
        path_tag.insert(0, path)
        source_tag.insert(0, name_tag)
        source_tag.insert(1, path_tag)
        video.insert(2, source_tag)

    add_source("Hulu Movies", MOVIE_PATH)
    add_source("Hulu Subscriptions", TV_SHOWS_PATH)

    source_file = open(source_path, 'w')
    try:
        source_file.write(str(soup))
    finally:
        source_file.close()
    print("Source paths added!")
from BeautifulSoup import BeautifulSoup, Tag, NavigableString

# Build <person><name first=".." last=".."/><location country=".."/></person>
# from scratch and print it before and after giving <name> a text child.
soup = BeautifulSoup()
person = Tag(soup, "person")
name = Tag(soup, "name", [("first", "John"), ("last", "Smith")])
location = Tag(soup, "location", [("country", "uk")])

soup.insert(0, person)
person.insert(0, name)
person.insert(1, location)
print(soup)

full_name = NavigableString("John Gary Smith")
name.insert(0, full_name)
print(soup.prettify())
# Append a "post to CiteULike" icon link after the first link of every
# result paragraph (p.g > span.w > a), then print the document.
items = soup.findAll("p", {"class": "g"})
for item in items:
    wspan = item.find("span", {"class": "w"})
    # Hmm, this should never happen, but it does!
    if wspan is None:
        continue

    a = wspan.find('a')
    if a is None:
        continue

    # Tag.get returns None for a missing attribute, whereas a['href'] would
    # raise KeyError on an anchor that has no href.
    href = a.get('href')
    if not href:
        continue

    cul = Tag(soup, "a")
    cul['href'] = "/posturl?url=" + urllib.quote(href)
    img = Tag(soup, "img")
    img['src'] = "http://static.citeulike.org/favicon.gif"
    img['style'] = "border:0"
    cul.insert(0, img)
    # A position past the end puts the new link last in the span.
    wspan.insert(99, cul)

if testing == 0:
    print(soup)
else:
    print(soup.prettify())
except: print "Can't read the dictionary file", dictionary_file_name exit() html_index_data = open('../static/index_en.html').read() #shutil.copyfile(, './index_%s.html' % translate_to) index_soup = BeautifulSoup(html_index_data) for link in index_soup.findAll('a'): article_link = unicode(normalize_title(link['href'][6:])) if article_link in trans_dict: #try: link['href'] = '/wiki/%s' % trans_dict[article_link] link['title'] = trans_dict[article_link] link.find(text=link.text).replaceWith(trans_dict[article_link]) #except: # print "translation not found" # pass else: link['class'] = 'TRANS_PENDING' style = Tag(index_soup, "style") index_soup.html.head.insert(1, style) style_text = NavigableString('.TRANS_PENDING {background-color:red;}') style.insert(0, style_text) translated_index = open('./index_%s.html' % translate_to, 'w') translated_index.write(str(index_soup)) translated_index.close()
def sunset_embed(body, request=False):
    """Replace ``<sunset>`` embed tags in *body* with linked images.

    Each ``<sunset id="PK" type="TYPE"/>`` tag (``type`` defaults to
    ``icon``) is replaced by ``<a><img/></a>`` markup pointing at the image
    and its asset of the requested type.  The image is looked up by primary
    key and restricted by ``access_query(request)``.  A tag that cannot be
    resolved (missing or non-numeric id, image not found, asset type not
    found) is replaced by an HTML comment describing the problem.

    Returns the rewritten markup as unicode, or *body* unchanged when it is
    empty or contains no ``<sunset`` tag.
    """
    # Moved the import down here to avoid a circular import
    from sunset.models import image

    if not body or "<sunset" not in body:
        # Nothing to do.
        return body

    self_closing = ['sunset']
    body_raw = BeautifulSoup(body, selfClosingTags=self_closing)

    for imgtag in body_raw.findAll('sunset'):
        err = 'Unknown error parsing Sunset embed tag'
        new_tag = ''
        cur_type = imgtag.get('type', 'icon')

        # A non-numeric id is reported like a missing one instead of raising
        # ValueError out of the whole render.
        raw_pk = imgtag.get('id', False)
        img_pk = None
        if raw_pk:
            try:
                img_pk = int(raw_pk)
            except (TypeError, ValueError):
                img_pk = None

        if img_pk is None:
            err = 'Invalid or missing image ID in Sunset embed tag'
        else:
            img_check = image.objects.filter(pk=img_pk).filter(
                access_query(request)).select_related('cat')
            if not img_check:
                err = 'Sunset image specified in embed tag was not found'
            else:
                cur_img = img_check.first()
                asset_check = cur_img.assets.filter(type=cur_type)
                if not asset_check:
                    err = 'Sunset image asset type specified in embed tag was not found'
                else:
                    cur_asset = asset_check.first()
                    new_tag = BeautifulSoup(selfClosingTags=self_closing)
                    new_a = Tag(new_tag, 'a')
                    new_img = Tag(new_tag, 'img')
                    new_a['class'] = 'sunset_embed sunset_%s' % cur_type
                    new_a['href'] = cur_img.get_absolute_url()
                    new_a['title'] = cur_img
                    new_img['alt'] = cur_img
                    new_img['title'] = cur_img
                    new_img['src'] = cur_asset.get_url()
                    new_tag.insert(0, new_a)
                    new_a.insert(0, new_img)
                    err = False

        if err:
            imgtag.replaceWith(
                Comment('%s. Original was: %s' % (err, imgtag)))
        else:
            imgtag.replaceWith(new_tag)

    return unicode(body_raw)
def LIST_MOVIES():
    """Export the movies from the movie database into the library folder.

    For every row returned by ``moviesDB.loadMoviedb(favorfilter=True)`` a
    stream file is created through ``CreateStreamFile`` and a
    ``<title>.nfo`` file holding a ``<movie>`` XML document (metadata,
    trailer URL, fixed stream details, genres and actors) is written to
    ``MOVIE_PATH``.
    """
    if common.addon.getSetting('enablelibraryfolder') == 'true':
        SetupAmazonLibrary()
    elif common.addon.getSetting('customlibraryfolder') != '':
        CreateDirectory(MOVIE_PATH)
        CreateDirectory(TV_SHOWS_PATH)

    import movies as moviesDB
    movies = moviesDB.loadMoviedb(favorfilter=True)
    for (asin, movietitle, url, poster, plot, director, writer, runtime,
         year, premiered, studio, mpaa, actors, genres, stars, votes,
         TMDBbanner, TMDBposter, TMDBfanart, isprime, watched, favor,
         TMDB_ID) in movies:
        CreateStreamFile(movietitle, url, MOVIE_PATH)

        soup = BeautifulSoup()
        movie = Tag(soup, "movie")
        soup.insert(0, movie)
        movie.insert(0, createElement('title', movietitle + ' (Amazon)'))
        # Insert positions are kept exactly as found (some repeat) so the
        # element order of the generated file does not change.
        if year:
            movie.insert(1, createElement('year', str(year)))
        if premiered:
            movie.insert(1, createElement('premiered', premiered))
        if plot:
            movie.insert(2, createElement('plot', plot))
        if runtime:
            movie.insert(2, createElement('runtime', runtime))
        if votes:
            movie.insert(3, createElement('votes', str(votes)))
        if stars:
            movie.insert(4, createElement('rating', str(stars)))
        if director:
            movie.insert(5, createElement('director', director))
        if studio:
            movie.insert(6, createElement('studio', studio))
        if poster:
            movie.insert(7, createElement('thumb', poster))
        if mpaa:
            movie.insert(8, createElement('mpaa', mpaa))

        # Plugin URL that plays the trailer for this title.
        u = sys.argv[0]
        u += '?url="' + urllib.quote_plus(url) + '"'
        u += '&mode="play"'
        u += '&name="' + urllib.quote_plus(movietitle) + '"'
        utrailer = u + '&sitemode="PLAYTRAILER"'
        movie.insert(9, createElement('trailer', utrailer))

        # Fixed stream description written for every movie.
        fileinfo = createElement('fileinfo', '')
        streamdetails = createElement('streamdetails', '')
        audio = createElement('audio', '')
        audio.insert(0, createElement('channels', '2'))
        audio.insert(1, createElement('codec', 'aac'))
        streamdetails.insert(0, audio)
        video = createElement('video', '')
        video.insert(0, createElement('codec', 'h264'))
        video.insert(1, createElement('height', '400'))
        video.insert(2, createElement('width', '720'))
        video.insert(4, createElement('scantype', 'Progressive'))
        streamdetails.insert(1, video)
        fileinfo.insert(0, streamdetails)
        movie.insert(10, fileinfo)

        # Running insert position for the variable-length tail.
        index = 10
        if genres:
            for genre in genres.split(','):
                index += 1
                movie.insert(index, createElement('genre', genre))
        if actors:
            # str.split never yields None, so every piece is emitted.
            for actor in actors.split(','):
                index += 1
                actortag = createElement('actor', '')
                actorname = createElement('name', actor)
                actortag.insert(0, actorname)
                movie.insert(index, actortag)

        movieNFO = os.path.join(MOVIE_PATH, movietitle + '.nfo')
        nfo_file = open(movieNFO, 'w')
        try:
            nfo_file.write(str(soup))
        finally:
            # Release the handle even when the write fails.
            nfo_file.close()
chart_url += str(value_killed) chart_url += ',' chart_url += str(value_killer) chart_url += '&chtt=Twitter+Analysis+Chart' #http://chart.apis.google.com/chart?chxl=0:|Policeman+Killed|Killed+by+police&chxs=0,676767,11.5,0,lt,676767&chxt=x&chbh=a,100&chs=300x200&cht=bvg&chco=FF0000&chd=t:30,70&chtt=Twitter+Analysis+Chart # Now, create a HTML page with the information # The paga is simple: head with title, body with a big div holding an image (the chart) and 5 additional divs with text htmldata = BeautifulSoup() htmltag = Tag(htmldata, "html") headtag = Tag(htmldata, "head") titletag = Tag(htmldata, "title") titletag.insert(0, NavigableString('Twitter Stream Analysis Example')) bodytag = Tag(htmldata, "body") imgtag = Tag(htmldata, "img") imgtag['src'] = chart_url divtag_wrap = Tag(htmldata, "div") divtag_t1 = Tag(htmldata, "div") divtag_t1.insert( 0, NavigableString('Total sentences analyzed: ' + str(total_sentences) + ' taken from 400 public tweets')) divtag_t2 = Tag(htmldata, "div") divtag_t2.insert(