def make_links_readable(html):
    """
    Goes through links making them readable
    If they are too long, they are turned into goo.gl links
    timing stats:
    before multiprocess = 0m18.063s
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('a'):
        #links:
        oldlink = link
        # Shorten overly long hrefs (> 90 chars) when the option is on.
        if link and len(link.get('href', '')) > 90 and options.use_short_links:
            #make into goo.gl link
            short_link = shorten_link(soup, link)
            if short_link != None:
                link = short_link
        if validate_link(link) and link.get('href', None):
            if not link.text:
                # Anchor has no visible text: show the raw URL instead.
                oldlink.replaceWith(
                    link.get('href', "No href link to replace with"))
            else:
                # Anchor has text: keep the text, then "(url)" on a new line.
                div = Tag(soup, 'div')
                div.setString(link.text)
                br = Tag(soup, 'br')
                new_link = Tag(soup, 'a')
                new_link.setString("(%s)" % (link.get('href')))
                div.append(br)
                div.append(new_link)
                oldlink.replaceWith(div)
    # Bare Python 2 print: emits a blank line (progress separator).
    print
    return soup
def anchorArticles(txt):
    """Wrap every text node starting with "Article <number>" in a named
    anchor plus a hoverable "#" permalink, returning the modified HTML."""
    # find all textnodes starting with Article, wrapping this in a named <a>
    # and prepending a hoverable link to this anchor
    aregex=re.compile('^\s*Article\s+[0-9][0-9.,]*', re.I)
    nsoup = BeautifulSoup(txt)
    node=nsoup.find(text=aregex)
    while node:
        nodeidx=node.parent.contents.index(node)
        match=str(re.match(aregex,node).group())
        # create named <a>
        name=match.replace(' ','_')
        a=Tag(nsoup,'a',[('name',name)])
        a.insert(0,match)
        # create a link that is displayed if the <a> is hovered
        link=Tag(nsoup,'a', [('class',"anchorLink"), ('href','#'+name)])
        link.insert(0,"#")
        # create a container for the a and the link
        hover=Tag(nsoup,'span',[('class','hover')])
        hover.insert(0,a)
        hover.insert(0,link)
        node.parent.insert(nodeidx,hover)
        # cut the newly wrapped from the original node.
        newNode=NavigableString(node[len(match):])
        node.replaceWith(newNode)
        # continue scanning after the node we just rewrote
        node=newNode.findNext(text=aregex)
    return str(nsoup)
def unTag(self, tag):
    """
    recursively removes unwanted tags according to defined lists

    @param tag: tag hierarchy to work on
    """
    # Depth-first: clean the children before deciding this tag's fate.
    for child in tag.findChildren(True, recursive=False):
        self.unTag(child)
    if (self.remove_classes_regexp != "") and (
            tag.has_key("class") and (re.match(self.remove_classes_regexp, tag["class"]) != None)):
        # Class matches the removal regexp: drop the tag and its content.
        tag.extract()
    elif tag.name in self.keep_tags:
        # Keep the tag but rebuild it bare (this strips all attributes).
        new_tag = Tag(self.input, tag.name)
        new_tag.contents = tag.contents
        tag.replaceWith(new_tag)
    elif tag.name in self.remove_tags_keep_content:
        # Drop the tag itself but keep its children.
        children = tag.findChildren(True, recursive=False)
        if len(children) == 1:
            tag.replaceWith(children[0])
        elif len(children) > 1:
            # Several children: wrap them in a <p> so they stay together.
            new_tag = Tag(self.input, "p")
            for child in tag.findChildren(True, recursive=False):
                new_tag.append(child)
            tag.replaceWith(new_tag)
        else:
            # No element children: keep only the rendered markup/text.
            tag.replaceWith(tag.renderContents())
    else:
        # Not in any keep list: remove entirely.
        tag.extract()
def _set_element(self, root, tagname, text=None, attr=None): """Creates if not available an element at the soup root element :return: tag object or None :rtype: Tag """ # Add Topic if not available if attr is None: if root.find(re.compile(tagname + "$", re.I)) is None: new_tag = Tag(self._soup, tagname) root.insert(0, new_tag) else: if root.find(re.compile(tagname + "$", re.I), attr) is None: new_tag = Tag(self._soup, tagname, attr.items()) root.insert(0, new_tag) settings = self._soup.find(self.root) tag = settings.find(re.compile(tagname + "$", re.I)) # Something to insert if tag is not None and text is not None: if tag.text.strip() == "": tag.insert(0, NavigableString(text)) else: tag.contents[0].replaceWith(text) return tag
def setup_source(self):
    """Ensure sources.xml contains a <video> source named
    "PVR Recordings", creating the file/skeleton when missing."""
    source_path = vfs.join('special://profile/', 'sources.xml')
    try:
        soup = vfs.read_file(source_path, soup=True)
    except:
        # No readable sources.xml yet: start a fresh <sources> document.
        soup = BeautifulSoup()
        sources_tag = Tag(soup, "sources")
        soup.insert(0, sources_tag)
    if soup.find("video") == None:
        sources = soup.find("sources")
        if not sources:
            # Malformed file without a <sources> root: give up silently.
            return
        video_tag = Tag(soup, "video")
        sources.insert(0, video_tag)
    video = soup.find("video")
    # Only add the PVR source once.
    if len(soup.findAll(text="PVR Recordings")) < 1:
        pvr_source_tag = Tag(soup, "source")
        pvr_name_tag = Tag(soup, "name")
        pvr_name_tag.insert(0, "PVR Recordings")
        PVR_PATH_tag = Tag(soup, "path")
        PVR_PATH_tag['pathversion'] = 1
        PVR_PATH_tag.insert(0, "pvr://recordings/active/Default/")
        pvr_source_tag.insert(0, pvr_name_tag)
        pvr_source_tag.insert(1, PVR_PATH_tag)
        video.insert(2, pvr_source_tag)
    # Serialize the whole tree back to disk.
    string = ""
    for i in soup:
        string = string + str(i)
    vfs.write_file(source_path, string)
def do_iperimage(value):
    '''detects iPernity static urls and creates clickable thumbnail for it'''
    doc = BeautifulSoup(value)
    static_url = re.compile(
        '^(http://\w+\.ipernity\.com/\d+/\d+/\d+/\d+\.\w+\.)(75|100|240|500|560)(\.jpg)$'
    )
    thumb_size = '500'
    zoom_size = '560'
    for image in doc.findAll('img', src=static_url):
        parts = static_url.match(image['src'])
        try:
            # Thumbnail shown inline at the 500px size.
            preview = Tag(doc, 'img')
            preview['alt'] = image['title']
            preview['src'] = parts.group(1) + thumb_size + parts.group(3)
            # Lightbox link to the 560px zoom, wrapping the thumbnail.
            anchor = Tag(doc, 'a')
            anchor['href'] = parts.group(1) + zoom_size + parts.group(3)
            anchor['rel'] = 'lightbox'
            anchor['title'] = image['title']
            anchor.insert(0, preview)
            image.replaceWith(anchor)
        except:
            # Best effort: missing attributes leave the image untouched.
            pass
    return unicode(doc)
def generate_table(summary): soup = BeautifulSoup() new_tag_table = Tag(soup, "table") new_tag_table["border"] = 1 new_tag_table["cellspacing"] = 0 new_tag_table["cellpadding"] = 0 new_tag_table["bordercolordark"] = "#000000" new_tag_table["cellspacing"] = "#ffffff" soup.append(new_tag_table) new_Tag_tr = Tag(soup, "tr") new_Tag_tr["bgcolor"] = "#0072E3" new_tag_table.append(new_Tag_tr) for i in ["TestSuite", "Passed", "Failed", "Total"]: new_Tag_td = Tag(soup, "td") new_Tag_td.string = str(i) new_Tag_tr.append(new_Tag_td) for i in summary: new_Tag_tr = Tag(soup, "tr") new_tag_table.append(new_Tag_tr) for j in i: new_Tag_td = Tag(soup, "td") new_Tag_td.string = str(j) new_Tag_tr.append(new_Tag_td) print str(soup.prettify()) return str(soup.prettify())
def generateContentDivTag(baseDir, h3text):
    """Build the content <div> for *baseDir*: an <h3> heading followed by
    one band <ul> (with nested album <ul>s) per .shtml file found."""
    import __main__
    contentDivTag = Tag(formatSoup, 'div', attrs={'class' : 'content band-content'})
    # Embed the heading text.
    h3tag = Tag(formatSoup, 'h3')
    h3tag.append(NavigableString(h3text))
    contentDivTag.append(h3tag)
    # Generate the HTML for each page in the directory.
    for file in os.listdir(PARENT_DIR + baseDir):
        if file.endswith(SHTML_EXT):
            # Build the band-name <ul>.
            progreUlTag = generateUlTag('/' + baseDir, file, 'column')
            albumLiTag = Tag(formatSoup, 'li')
            progreUlTag.append(albumLiTag)
            # Build the album-name <ul>s from the page's album links.
            fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, '/' + baseDir, file])))
            albumList = []
            for albumClassTag in fileSoup.findAll('a', {'class' : 'album-name'}):
                albumList.append(albumClassTag['href'].split('/')[-1])
                # Global running count of albums across the whole site.
                __main__.contentCount += 1
            albumDir = '/'.join([baseDir, file.split('.')[0]])
            for album in albumList:
                albumUlTag = generateUlTag('/' + albumDir, album, 'child-column')
                albumLiTag.append(albumUlTag)
            contentDivTag.append(progreUlTag)
    return contentDivTag
def get_first_three(soup, table):
    """Take the first three cells of each row of *table*, turning them
    into <li> items; the very first non-empty cell becomes a title and is
    returned wrapped together with the <ul> in a <div>."""
    loop = 0
    first = 1
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for td in tr.findAll("td"):
            # Only the first three cells per row are kept.
            if loop != 3:
                try:
                    text = ''.join(td.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != ' ':
                        td.name = "span"
                        if first == 1:
                            # Very first non-empty cell becomes the title.
                            first = 0
                            enclose.append(td)
                        else:
                            if loop != 2:
                                # Separator between the first two cells.
                                td.append(' - ')
                            li.append(td)
                except:
                    pass
            else:
                break
            loop += 1
        loop = 0
        # Skip rows that produced no text at all.
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # Pull the title span out of the list and wrap everything in a div.
    title = enclose.find("span")
    enclose.find("span").replaceWith("")
    enclose.name = "ul"
    div = Tag(soup, "div")
    div.append(title)
    div.append(enclose)
    return div
def get_last_3(soup, table):
    """Collect cells 4+ (up to three of them) of each row of *table* into
    <li> elements, returned wrapped in a <div>.

    Dead locals from a copy of get_first_three (an unused `first` flag and
    an unused <ul>) have been removed; behavior is unchanged.
    """
    loop = 0
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        td = tr.findAll("td")
        li = Tag(soup, "li")
        for el in td[3:]:
            # Only three trailing cells per row are kept.
            if loop != 3:
                try:
                    text = ''.join(el.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != ' ':
                        el.name = "span"
                        if loop != 2:
                            # Separator between cells.
                            el.append(' - ')
                        li.append(el)
                except:
                    pass
            else:
                break
            loop += 1
        loop = 0
        # Skip rows that produced no text at all.
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def _tag_generator(soup, name, attrs=None, contents=None):
    """Create a new Tag named *name* belonging to *soup*.

    :param attrs: optional list of (name, value) attribute pairs
    :param contents: optional initial content inserted at position 0
    :return: the newly created Tag
    """
    # FIX: the default used to be the mutable literal [] (shared across
    # calls). None is the safe equivalent: Tag only receives attrs when a
    # non-empty list is passed, exactly as before.
    if attrs:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents != None:
        new_tag.insert(0, contents)
    return new_tag
def generate_table_of_contents(soup, prefix):
    """Build a nested <ul> table of contents from the headers in *soup*,
    giving each header a unique, *prefix*-namespaced id.

    Returns a <div class="toc"> or None when there are no headers."""
    header_ids = Counter()
    headers = soup.findAll(header_re)
    if not headers:
        return
    tocdiv = Tag(soup, "div", [("class", "toc")])
    parent = Tag(soup, "ul")
    parent.level = 0
    tocdiv.append(parent)
    level = 0
    previous = 0
    for header in headers:
        contents = u''.join(header.findAll(text=True))
        # In the event of an empty header, skip
        if not contents:
            continue
        # Convert html entities to avoid ugly header ids
        aid = unicode(
            BeautifulSoup(contents, convertEntities=BeautifulSoup.XML_ENTITIES))
        # Prefix with PREFIX_ to avoid ID conflict with the rest of the page
        aid = u'%s_%s' % (prefix, aid.replace(" ", "_").lower())
        # Convert down to ascii replacing special characters with hex
        aid = str(title_re.sub(lambda c: '.%X' % ord(c.group()), aid))
        # Check to see if a tag with the same ID exists
        id_num = header_ids[aid] + 1
        header_ids[aid] += 1
        # Only start numbering ids with the second instance of an id
        if id_num > 1:
            aid = '%s%d' % (aid, id_num)
        header['id'] = aid
        li = Tag(soup, "li", [("class", aid)])
        a = Tag(soup, "a", [("href", "#%s" % aid)])
        a.string = contents
        li.append(a)
        # Header depth comes from the tag name's digit (h1 -> 1, ...).
        thislevel = int(header.name[-1])
        if previous and thislevel > previous:
            # Deeper header: open a nested <ul>.
            newul = Tag(soup, "ul")
            newul.level = thislevel
            parent.append(newul)
            parent = newul
            level += 1
        elif level and thislevel < previous:
            # Shallower header: climb back up the <ul> stack.
            while level and parent.level > thislevel:
                parent = parent.findParent("ul")
                level -= 1
        previous = thislevel
        parent.append(li)
    return tocdiv
def AllCategories(request):
    """Render the top-level category tree (annotated with the current
    user's latest log info) as prettified HTML <li> markup."""
    print 'allcat'
    x = BeautifulSoup()
    #root = Tag(x,'ul', [('class', "tree"), ( 'id', "tree")])
    #x.insert(0,root)
    AllCategories = RECategory.objects.filter(parent__isnull=True).order_by('-number')
    AllAnswered = {}
    # Keep only the most recent RELog (by date) per category and log type:
    # the queryset is ordered newest-first and only the first hit is stored.
    for log in RELog.objects.filter(user=request.user).order_by('-date'):
        if not log.category_id in AllAnswered:
            AllAnswered[log.category_id] = {}
        if not log.type_log in AllAnswered[log.category_id]:
            AllAnswered[log.category_id][log.type_log] = log
    for category in AllCategories:
        print category.id
        nt = Tag(x,'li', [("id", str(category.id))])
        log = AllAnswered.get(category.id)
        rating = ''
        if log:
            # Log type 5 carries the rating.
            log = log.get(5)
            if log :
                rating = 'Оценка: ' + str(log.rating)
        div = Tag(x,'div')
        div.string = rating
        div["class"] = "rating"
        #div["style"] = "width: 150px; float: right;"
        nt.insert(0, div)
        if category.is_3d:
            isDDD = "Есть";
        else:
            isDDD = "Нет";
        div = Tag(x,'div')
        div.string = isDDD
        div["class"] = "is3d"
        #div["style"] = "margin-right: 0px;width: 110px; float: right;"
        nt.insert(0, div)
        div = Tag(x,'div')
        div["class"] = "demo"
        #div["style"] = "margin-right: 0px;width: 110px; float: right;"
        div.string = str(category.type_category)
        nt.insert(0, div)
        div = Tag(x,'div')
        div.string = category.name
        nt.insert(0, div)
        x.insert(0,nt)
        # Recurse into child categories, appending under this <li>.
        recurseCategories(category, nt, x, AllAnswered)
    res = x.prettify()
    #print res
    print 'endallcat'
    return res
def SetupAmazonLibrary(self):
    """Add/refresh the Amazon Movies and TV source paths in Kodi's
    sources.xml; offers an application restart when anything changed."""
    source_path = xbmc.translatePath(
        'special://profile/sources.xml').decode('utf-8')
    source_added = False
    source = {
        self._s.ms_mov: self._s.MOVIE_PATH,
        self._s.ms_tv: self._s.TV_SHOWS_PATH
    }
    if xbmcvfs.exists(source_path):
        srcfile = xbmcvfs.File(source_path)
        soup = BeautifulSoup(srcfile)
        srcfile.close()
    else:
        # No sources.xml yet: create the default skeleton with one
        # <default> entry per media category.
        subtags = ['programs', 'video', 'music', 'pictures', 'files']
        soup = BeautifulSoup('<sources></sources>')
        root = soup.sources
        for cat in subtags:
            cat_tag = Tag(soup, cat)
            def_tag = Tag(soup, 'default')
            def_tag['pathversion'] = 1
            cat_tag.append(def_tag)
            root.append(cat_tag)
    video = soup.find("video")
    for name, path in source.items():
        path_tag = Tag(soup, "path")
        path_tag['pathversion'] = 1
        path_tag.append(path)
        source_text = soup.find(text=name)
        if not source_text:
            # Source not present yet: append a new <source> entry.
            source_tag = Tag(soup, "source")
            name_tag = Tag(soup, "name")
            name_tag.append(name)
            source_tag.append(name_tag)
            source_tag.append(path_tag)
            video.append(source_tag)
            Log(name + ' source path added!')
            source_added = True
        else:
            # Source exists: replace its path if it changed.
            source_tag = source_text.findParent('source')
            old_path = source_tag.find('path').contents[0]
            if path not in old_path:
                source_tag.find('path').replaceWith(path_tag)
                Log(name + ' source path changed!')
                source_added = True
    if source_added:
        self.SaveFile(source_path, str(soup))
        self._g.dialog.ok(getString(30187), getString(30188),
                          getString(30189), getString(30190))
        if self._g.dialog.yesno(getString(30191), getString(30192)):
            xbmc.executebuiltin('RestartApp')
def parse_content(self, content, attachments, tags):
    """Rewrite Evernote note HTML: resolve ?hash= references in <img>
    tags and evernote/x-pdf <embed> tags to locally imported files.

    :param attachments: list of dicts with 'hash' and 'filename' keys.
    """
    soup = BeautifulSoup(content)
    # Pulls the attachment hash out of a rendered src="?hash=..." attr.
    pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')
    # images
    for match in soup.findAll('img'):
        filehashmatch = pattern.search(str(match))
        if filehashmatch:
            filehash = filehashmatch.group(1)
            filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)
            if filename is not None:
                importedname = self.import_file(filename)
                match.replaceWith(Tag(soup, 'img', [('src', importedname)]))
    # pdfs
    for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):
        filehashmatch = pattern.search(str(match))
        if filehashmatch:
            filehash = filehashmatch.group(1)
            filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)
            if filename is not None:
                # convert pdf -> image
                images = pdf2image(filename)
                # import each jpg
                imageTags = Tag(soup, "span")
                for image in images:
                    importedname = self.import_file(image)
                    # add new image tag
                    imageTags.insert(images.index(image),
                                     Tag(soup, 'img', [('src', importedname)]))
                # replace embed with <img src...> for each image
                match.replaceWith(imageTags)
    # TODO: audio
    # TODO: video
    #plugins
    # TODO: qa-format as in Supermemo
    #for match in soup.find(string=re.compile("A:")):
    #    match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']
    return str(soup).decode('utf-8')
def createParentUlTag(targetSoup):
    """Return the breadcrumb root: a <ul id="breadcrumbs"> holding a
    single "TOP" link that points at the site domain."""
    anchor = Tag(targetSoup, 'a', attrs={'href': SITE_DOMAIN})
    anchor.append(NavigableString('TOP'))
    item = Tag(targetSoup, 'li')
    item.append(anchor)
    breadcrumbs = Tag(targetSoup, 'ul', attrs={
        'class': 'xbreadcrumbs',
        'id': 'breadcrumbs'
    })
    breadcrumbs.append(item)
    return breadcrumbs
def generateUlTag(path, file, ulClass):
    """Build <ul class=ulClass><li><a href="path/file">...</a></li></ul>,
    using the linked page's first <h1> as the link text."""
    # Read the linked page and grab its <h1> contents for the label.
    page = BeautifulSoup(open('/'.join([PARENT_DIR, path, file])))
    label = page.find('h1').renderContents()
    href = '/'.join([path, file])
    anchor = Tag(formatSoup, 'a', attrs={'href' : href})
    anchor.append(NavigableString(label))
    item = Tag(formatSoup, 'li')
    item.append(anchor)
    listing = Tag(formatSoup, 'ul', attrs={'class' : ulClass})
    listing.append(item)
    return listing
def SetupAmazonLibrary():
    """Add/refresh the Amazon Movies and TV source paths in Kodi's
    sources.xml; offers an application restart when anything changed."""
    common.Log('Trying to add Amazon source paths...')
    source_path = os.path.join(common.profilpath, 'sources.xml')
    source_added = False
    source = {'Amazon Movies': MOVIE_PATH, 'Amazon TV': TV_SHOWS_PATH}
    try:
        file = open(source_path)
        soup = BeautifulSoup(file)
        file.close()
    except:
        # Missing/unreadable sources.xml: build the default skeleton with
        # one <default> entry per media category.
        subtags = ['programs', 'video', 'music', 'pictures', 'files']
        soup = BeautifulSoup('<sources></sources>')
        root = soup.sources
        for cat in subtags:
            cat_tag = Tag(soup, cat)
            def_tag = Tag(soup, 'default')
            def_tag['pathversion'] = 1
            cat_tag.append(def_tag)
            root.append(cat_tag)
    video = soup.find("video")
    for name, path in source.items():
        path_tag = Tag(soup, "path")
        path_tag['pathversion'] = 1
        path_tag.append(path)
        source_text = soup.find(text=name)
        if not source_text:
            # Source not present yet: append a new <source> entry.
            source_tag = Tag(soup, "source")
            name_tag = Tag(soup, "name")
            name_tag.append(name)
            source_tag.append(name_tag)
            source_tag.append(path_tag)
            video.append(source_tag)
            common.Log(name + ' source path added')
            source_added = True
        else:
            # Source exists: replace its path if it changed.
            source_tag = source_text.findParent('source')
            old_path = source_tag.find('path').contents[0]
            if path not in old_path:
                source_tag.find('path').replaceWith(path_tag)
                common.Log(name + ' source path changed')
                source_added = True
    if source_added:
        SaveFile(source_path, str(soup))
        Dialog.ok(common.getString(30187), common.getString(30188),
                  common.getString(30189), common.getString(30190))
        if Dialog.yesno(common.getString(30191), common.getString(30192)):
            xbmc.executebuiltin('RestartApp')
def userlist(request):
    """Return the members of the 'Курсанты' group as a flat XML
    <root><user uid=... username=... .../></root> document."""
    doc = BeautifulSoup()
    root = Tag(doc, 'root')
    doc.insert(0, root)
    members = models.Group.objects.get(name='Курсанты').user_set.all()
    for member in members:
        attrs = [
            ('uid', str(member.id)),
            ('username', member.username),
            ('first_name', member.first_name),
            ('last_name', member.last_name),
        ]
        # Newline first, then the tag — both prepended at index 0, so
        # each <user> element ends up followed by a newline.
        root.insert(0, '\n')
        root.insert(0, Tag(doc, 'user', attrs))
    return HttpResponse(doc)
def CreateSelect(self):
    '''Create a <select> element populated with randomized <option>s.'''
    select = Tag(self.soup, 'select')
    select['name'] = self.GenerateName()
    # 3-12 options sharing the same short filler text.
    for _ in range(random.randint(3, 12)):
        option = Tag(self.soup, 'option')
        option['value'] = self.textShort
        option.string = self.textShortCap
        select.append(option)
    # 80% chance of marking the first option as selected.
    if self._Probability(80):
        select.option['selected'] = 'selected'
    self.AppendIds(select, 10, 30)
    self.ShuffleAttributes(select)
    return select
def generate_heatmap(intensities):
    """Color the counties.svg map by log-scaled intensity counts and
    print the resulting SVG, adding a hover tooltip per county.

    :param intensities: dict mapping county path ids to counts.
    """
    # Load the SVG map
    svg = open('counties.svg', 'r').read()
    # Load into Beautiful Soup
    soup = BeautifulSoup(svg, selfClosingTags=['defs', 'sodipodi:namedview'])
    # Find counties
    paths = soup.findAll('path')
    colors = [
        "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5",
        "#08519C", "#08306B"
    ]
    min_value = min(intensities.values())
    max_value = max(intensities.values())
    # Log scale maps counts onto color indices.
    # NOTE(review): divides by zero when every intensity is equal —
    # confirm the inputs always vary.
    scalefactor = (len(colors) - 1) / (log(max_value + 1) - log(min_value + 1))
    # County style
    path_style = 'font-size:12px;fill-rule:nonzero;stroke:#FFFFFF;stroke-opacity:1;stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none;stroke-linecap:butt;marker-start:none;stroke-linejoin:bevel;fill:'
    # we will append this hover tooltip after each county path
    hover_text = '''<text id="popup-%s" x="%s" y="%s" font-size="10" fill="black" visibility="hidden">%s (%s)<set attributeName="visibility" from="hidden" to="visible" begin="%s.mouseover" end="%s.mouseout"/></text>'''
    for p in paths:
        if p['id'] not in ["State_Lines", "separator"]:
            # Counties without data default to zero.
            try:
                count = intensities[p['id']]
            except:
                count = 0
            x, y = (p['d'].split()[1]).split(',')
            # insert a new text tag for the county hover tooltip...
            p.parent.insert(0, Tag(soup, 'text', [("id", 'popup-' + p['id'])]))
            hover = soup.find("text", {"id": 'popup-' + p['id']})
            hover.insert(1, "%s (%s)" % (p['inkscape:label'], str(count)))
            # add attributes to that text tag...
            hover['x'] = 250
            hover['y'] = 20
            hover['font-size'] = "20"
            hover['fill'] = "black"
            hover['visibility'] = "hidden"
            # The <set> child toggles tooltip visibility on hover.
            hover.insert(0, Tag(soup, 'set', [("begin", p['id'] + '.mouseover')]))
            set_tag = soup.find("set", {"begin": p['id'] + '.mouseover'})
            set_tag['attributeName'] = "visibility"
            set_tag['from'] = "hidden"
            set_tag['to'] = "visible"
            set_tag['end'] = p['id'] + '.mouseout'
            color_class = min(int(scalefactor * log(count + 1)), len(colors) - 1)
            # color_class = int((float(len(colors)-1) * float(count - min_value)) / float(max_value - min_value))
            # if count > 0:
            #     print color_class
            color = colors[color_class]
            p['style'] = path_style + color
    print soup.prettify()
def makeHTMLQuestion(fn, htmldata): soup = BeautifulSoup(htmldata) #add JS soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')" soup.find('head').insert(0, SUBMIT_JS) #replace forms forms = soup.findAll('form') if forms: for form in forms: if not form.has_key('method'): form['method'] = 'POST' if not form.has_key('action'): if testmode: form[ 'action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit' else: form[ 'action'] = 'http://www.mturk.com/mturk/externalSubmit' if not form.has_key('onSubmit'): form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');" inputtag = Tag(soup, 'input') inputtag['type'] = 'hidden' inputtag['name'] = 'assignmentId' inputtag['id'] = 'myAssignmentId' inputtag['value'] = '' form.insert(0, inputtag) mainurl = uploadfile(fn, str(soup)) for sub in soup.findAll('img'): # TODO fn = dirname(fn) + '/' + sub['src'] uploadfile(fn) return ExternalQuestion(escape(mainurl), frame_height)
def clean_html_style(data, element, remove_comments=True, remove_empty=True):
    """removes the style information associated with html element

    >>> t = '<!-- /* Style Definitions */ table.MsoNormalTable {mso-style-name:"Table Normal"; mso-tstyle-rowband-size:0; mso-tstyle-colband-size:0; mso-style-noshow:yes; mso-style-priority:99; mso-style-qformat:yes; mso-style-parent:""; mso-padding-alt:0in 5.4pt 0in 5.4pt; mso-para-margin-top:0in; mso-para-margin-right:0in; mso-para-margin-bottom:10.0pt; mso-para-margin-left:0in; line-height:115%; mso-pagination:widow-orphan; font-size:11.0pt; font-family:"Calibri","sans-serif"; mso-ascii-font-family:Calibri; mso-ascii-theme-font:minor-latin; mso-hansi-font-family:Calibri; mso-hansi-theme-font:minor-latin;} --><p> </p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">?</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p')
    '<p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_empty=False)
    '<p> </p><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_comments=False)
    '<!-- /* Style Definitions */ table.MsoNormalTable\t{mso-style-name:"Table Normal";\tmso-tstyle-rowband-size:0;\tmso-tstyle-colband-size:0;\tmso-style-noshow:yes;\tmso-style-priority:99;\tmso-style-qformat:yes;\tmso-style-parent:"";\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\tmso-para-margin-top:0in;\tmso-para-margin-right:0in;\tmso-para-margin-bottom:10.0pt;\tmso-para-margin-left:0in;\tline-height:115%;\tmso-pagination:widow-orphan;\tfont-size:11.0pt;\tfont-family:"Calibri","sans-serif";\tmso-ascii-font-family:Calibri;\tmso-ascii-theme-font:minor-latin;\tmso-hansi-font-family:Calibri;\tmso-hansi-theme-font:minor-latin;} --><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    """
    # FIX: the original wrapped this call in try/except with an identical
    # BeautifulSoup(data) call in both arms — the bare except was
    # redundant (the fallback would raise the same error), so it is gone.
    soup = BeautifulSoup(data)
    # remove all comments in this html block
    if remove_comments:
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
    # remove all occurences of tags like sup, script
    [i.extract() for i in soup.findAll(re.compile('sup|script'))]
    # find all occurences of the "element" tag
    for i in soup.findAll(element):
        text = i.renderContents().strip()
        if text:
            # Rebuild the tag bare, dropping style/class attributes.
            new_tag = Tag(soup, element)
            new_tag.insert(0, text)
            i.replaceWith(new_tag)
        elif remove_empty:
            i.extract()
    return smart_unicode(soup.renderContents())
def _linkify_headings(self, soup):
    """Wrap each top-level h1/h2/h3 inside the markdown div in an anchor
    that links to the heading's own id (a self permalink)."""
    markdown_div = soup.find('div', 'md')
    headings = markdown_div.findAll(['h1', 'h2', 'h3'], recursive=False)
    for heading in headings:
        anchor = Tag(soup, "a", [('href', '#%s' % heading['id'])])
        # Move the heading's children into the anchor, then make the
        # anchor the heading's sole child.
        anchor.contents = heading.contents
        heading.contents = []
        heading.append(anchor)
def initServerInfoBase(fileName):
    """Initialize a soup for the Beautiful Soup parser, reading existing
    data from *fileName* when the file exists.

    :param fileName: name of the XML file to load into the soup.
    :return: (soup, success) — soup is None when the file could not be read.
    """
    if os.path.exists(fileName):
        try:
            # FIX: `with` guarantees the handle is closed even when read()
            # fails (the original leaked it on read errors), and the bare
            # except is narrowed to the I/O errors open/read actually raise.
            with open(fileName, "r") as f:
                xml = f.read()
        except IOError:
            return None, False
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        serverinfolist = []
        soup = BeautifulSoup()
        xml = "null"
    # Ensure exactly one <serverinfo> root exists.
    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)
    return soup, True
def content_absolute_links(content, image=None):
    """Make every <a href> and <img src> in *content* absolute against
    the current Django site; optionally prepend an <img> for *image*."""
    from django.contrib.sites.models import Site
    current_site = Site.objects.get(pk=settings.SITE_ID)

    def abs_url(url):
        # A relative URL has neither scheme nor host.
        parsed = urlparse.urlparse(url)
        is_relative = parsed.netloc == parsed.scheme == ''
        if is_relative:
            base = 'http://{0}'.format(current_site.domain)
            url = urlparse.urljoin(base, url)
        return url

    soup = BeautifulSoup(content)
    if image:
        soup.insert(0, Tag(soup, 'img', [('src', image)]))
    for anchor in soup.findAll('a'):
        anchor['href'] = abs_url(anchor['href'])
    for picture in soup.findAll('img'):
        picture['src'] = abs_url(picture['src'])
    return unicode(soup)
def get_mobile_content(self, obj):
    """Return *obj*'s content prepared for mobile clients: strips
    paragraph <br /> noise, tags YouTube iframes with a child <a>, and
    appends the source attribution when present.

    Cleanups vs the original: the two branches with identical bodies
    (no host / android) are merged, the no-op `else: content = content`
    and commented-out debug lines are removed — behavior is unchanged.
    """
    content = obj.mobile_content if obj.mobile_content else obj.content
    # Apps (no detected host) and Android render their own line breaks.
    if not self.host_det or self.host_det == "android":
        content = content.replace("\n<br />\n", "\n")
    soup = BeautifulSoup(content)
    for p in soup.findAll("iframe"):
        if "youtube" in p['src']:
            # Mark YouTube embeds with a child <a src=...> for the app.
            newTag = Tag(soup, "a")
            newTag.attrs.append(("src", p.get('src')))
            p.append(newTag)
    content = unicode(soup)
    # Append the source line, stripping any <p> wrappers from it.
    if obj.source is not None and obj.source != '':
        content = content + "<p>Sources: " + obj.source.replace(
            "<p>", "").replace("</p>", "") + "</p>"
    content = obj.get_modified_content(content, content_type='mobile')
    return content
def select_calendar(month=None, year=None):
    """Render an HTML month calendar where every day cell becomes a
    submit button (name="meetingday"), returning the HTML as a string."""
    now = datetime.now()
    day = now.day
    cal = calendar.HTMLCalendar()
    cal.setfirstweekday(6)
    month_table = cal.formatmonth(year, month)
    soup = BeautifulSoup(month_table)
    outfile = open("myHTML.html", 'w')
    for data in soup.findAll('td'):
        if data['class'] != "noday":
            days = data.findAll(text=True)
            for oneday in days:
                # Replace the plain day number with a submit button
                # carrying that day as its value.
                day = NavigableString(oneday)
                oneday.extract()
                addatag = Tag(soup, 'input')
                addatag['type'] = "submit"
                addatag['name'] = "meetingday"
                addatag['value'] = day
                data.insert(0, addatag)
    outfile.write(soup.prettify())
    outfile.close()
    # NOTE(review): the round-trip through myHTML.html looks unnecessary —
    # soup.prettify() is already the returned markup; confirm the temp
    # file side effect is wanted.
    infile = open("myHTML.html", 'r')
    calfile = ""
    for line in infile:
        calfile = calfile + line
    infile.close()
    return calfile
def save():
    """Flask endpoint: merge a review/decision JSON payload into the
    referenced paper's HTML file and persist it in place."""
    json_data = request.json
    status = False
    data={}
    with open(ret_url(json_data["doc"],"/papers"), "r+") as inf:
        txt = inf.read()
        soup = BeautifulSoup(txt)
        # Only reviews need these edits; a decision is inserted directly
        # into the head.
        if json_data["type"] == "review":
            # If this author's review script already exists, remove it.
            for script in soup.findAll("script",{"type":"application/ld+json"}):
                data = json.loads(script.text.strip())
                if data[0]["@type"] == "review":
                    if data[0]["article"]["eval"]["author"] == "mailto:"+json_data["author"]:
                        script.extract()
                        break
            # Drop the body content and rewrite it from the payload.
            for section in soup.findAll("section"):
                section.extract()
            for section in json_data["sections"]:
                beauty = BeautifulSoup(section)
                soup.body.insert(len(soup.body.contents), beauty)
        # Create the JSON-LD script element and append it to the head.
        new = Tag(soup, "script")
        new.attrs.append(("type", "application/ld+json"))
        new.string = json.dumps(json_data["script"])
        soup.head.insert(len(soup.head.contents), new)
        # Save the file in place (truncate in case it shrank).
        html = soup.prettify("utf_8")
        inf.seek(0)
        inf.write(html)
        inf.truncate()
        inf.close()
        status=True
    return jsonify({"result": status})
def _set(self, topic, key, value, topic_attr=None): """Set key and value at topic :return: success status :rtype: bool""" # In case it is an empty document if not unicode(self._soup).strip().startswith("<?xml"): self._soup.insert(0, NavigableString(self.HEADER)) # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: self._soup.insert(1, Tag(self._soup, self.root)) settings = self._soup.find(self.root) # Add Topic topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr) if topic_tag is None: return False # Add key and value key_tag = self._set_element(topic_tag, key.lower(), escape(value)) # Add "" since XML may introduce whitespaces. #key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value)) return key_tag is not None