def make_links_readable(html):
    """
    Goes through links making them readable
    If they are too long, they are turned into goo.gl links

    timing stats: before multiprocess = 0m18.063s
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('a'):
        oldlink = link
        # Overly long hrefs get shortened first (when enabled).
        if link and len(link.get('href', '')) > 90 and options.use_short_links:
            # make into goo.gl link
            short_link = shorten_link(soup, link)
            if short_link is not None:  # fix: identity comparison, not !=
                link = short_link
        if validate_link(link) and link.get('href', None):
            if not link.text:
                # Bare link: show the raw URL instead.
                oldlink.replaceWith(link.get('href', "No href link to replace with"))
            else:
                # Text link: keep the text and append "(url)" on a new line.
                div = Tag(soup, 'div')
                div.setString(link.text)
                br = Tag(soup, 'br')
                new_link = Tag(soup, 'a')
                new_link.setString("(%s)" % (link.get('href')))
                div.append(br)
                div.append(new_link)
                oldlink.replaceWith(div)
        # fix: removed leftover bare debug `print` (wrote a blank line per link)
    return soup
def addEpisode(xbmcid, scraperid, snr, enr, title, airdate):
    """Add episode *enr* of season *snr* to the series DB; returns False when
    the series is missing or the season cannot be created, True otherwise.
    Only writes the database back when a new episode was actually added."""
    f = getDatabase("r")
    soup = BeautifulSoup(f.read())
    f.close()
    serie = soup.find(scraperid=scraperid)
    # TODO check inconsistency
    if serie is None:  # fix: identity comparison, not ==
        return False
    season = serie.find(seasonnr=snr)
    if season is None:
        # Season missing: create it, then re-find to verify insertion worked.
        tag = Tag(soup, "season")
        tag.attrs.append(('seasonnr', snr))
        serie.append(tag)
        season = serie.find(seasonnr=snr)
        if season is None:
            util.msg(localize(50000), localize(50004))
            return False
    episode = season.find(episodenr=enr)
    if episode is None:
        # Episode missing: build <episode><title/><airdate/></episode>.
        episodetag = Tag(soup, "episode")
        episodetag.attrs.append(('episodenr', enr))
        titletag = Tag(soup, "title")
        titletag.insert(0, title)
        episodetag.append(titletag)
        airdatetag = Tag(soup, "airdate")
        airdatetag.insert(0, airdate)
        episodetag.append(airdatetag)
        season.append(episodetag)
        f = getDatabase("w")
        f.write(soup.prettify())
        f.close()
    # else: check consistency
    return True
def unTag(self, tag):
    """
    recursively removes unwanted tags according to defined lists

    @param tag: tag hierarchy to work on
    """
    # Depth-first: clean the children before deciding this tag's fate.
    for child in tag.findChildren(True, recursive=False):
        self.unTag(child)
    if (self.remove_classes_regexp != "") and (tag.has_key("class") and (re.match(self.remove_classes_regexp, tag["class"]) is not None)):  # fix: `is not None`
        # Class matches the removal regexp: drop tag and subtree.
        tag.extract()
    elif tag.name in self.keep_tags:
        # Keep the tag, rebuilt without its original attributes.
        new_tag = Tag(self.input, tag.name)
        new_tag.contents = tag.contents
        tag.replaceWith(new_tag)
    elif tag.name in self.remove_tags_keep_content:
        # Drop the tag but preserve whatever it contains.
        children = tag.findChildren(True, recursive=False)
        if len(children) == 1:
            tag.replaceWith(children[0])
        elif len(children) > 1:
            new_tag = Tag(self.input, "p")
            for child in tag.findChildren(True, recursive=False):
                new_tag.append(child)
            tag.replaceWith(new_tag)
        else:
            tag.replaceWith(tag.renderContents())
    else:
        # Neither kept nor content-preserving: remove entirely.
        tag.extract()
def generate_table_of_contents(soup, prefix):
    """Build a nested <div class="toc"> of anchor links for every header
    matching header_re in *soup*; each header gets a generated id prefixed
    with *prefix*. Returns None when the page has no headers."""
    header_ids = Counter()
    headers = soup.findAll(header_re)
    if not headers:
        return
    tocdiv = Tag(soup, "div", [("class", "toc")])
    parent = Tag(soup, "ul")
    parent.level = 0  # nesting depth is tracked as an attribute on the tag object
    tocdiv.append(parent)
    level = 0
    previous = 0
    for header in headers:
        contents = u''.join(header.findAll(text=True))
        # In the event of an empty header, skip
        if not contents:
            continue
        # Convert html entities to avoid ugly header ids
        aid = unicode(
            BeautifulSoup(contents, convertEntities=BeautifulSoup.XML_ENTITIES))
        # Prefix with PREFIX_ to avoid ID conflict with the rest of the page
        aid = u'%s_%s' % (prefix, aid.replace(" ", "_").lower())
        # Convert down to ascii replacing special characters with hex
        aid = str(title_re.sub(lambda c: '.%X' % ord(c.group()), aid))
        # Check to see if a tag with the same ID exists
        id_num = header_ids[aid] + 1
        header_ids[aid] += 1
        # Only start numbering ids with the second instance of an id
        if id_num > 1:
            aid = '%s%d' % (aid, id_num)
        header['id'] = aid
        li = Tag(soup, "li", [("class", aid)])
        a = Tag(soup, "a", [("href", "#%s" % aid)])
        a.string = contents
        li.append(a)
        thislevel = int(header.name[-1])  # e.g. "h3" -> 3
        if previous and thislevel > previous:
            # Deeper header: open a nested <ul> under the current parent.
            newul = Tag(soup, "ul")
            newul.level = thislevel
            newli = Tag(soup, "li", [("class", "toc_child")])
            newli.append(newul)
            parent.append(newli)
            parent = newul
            level += 1
        elif level and thislevel < previous:
            # Shallower header: climb back up to a <ul> at or above this level.
            while level and parent.level > thislevel:
                parent = parent.findParent("ul")
                level -= 1
        previous = thislevel
        parent.append(li)
    return tocdiv
def make_links_readable(html):
    """
    Goes through links making them readable
    If they are too long, they are turned into goo.gl links

    timing stats: before multiprocess = 0m18.063s
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('a'):
        oldlink = link
        if link and len(link.get('href', '')) > 90 and options.use_short_links:
            # make into goo.gl link
            short_link = shorten_link(soup, link)
            if short_link is not None:  # fix: identity comparison, not !=
                link = short_link
        if validate_link(link) and link.get('href', None):
            if not link.text:
                # Bare link: show the raw URL instead.
                oldlink.replaceWith(link.get('href', "No href link to replace with"))
            else:
                # Text link: keep the text and append "(url)" on a new line.
                div = Tag(soup, 'div')
                div.setString(link.text)
                br = Tag(soup, 'br')
                new_link = Tag(soup, 'a')
                new_link.setString("(%s)" % (link.get('href')))
                div.append(br)
                div.append(new_link)
                oldlink.replaceWith(div)
        # fix: removed leftover bare debug `print` (wrote a blank line per link)
    return soup
def unTag(self, tag):
    """
    recursively removes unwanted tags according to defined lists

    @param tag: tag hierarchy to work on
    """
    # Depth-first: clean the children before deciding this tag's fate.
    for child in tag.findChildren(True, recursive=False):
        self.unTag(child)
    if (self.remove_classes_regexp != "") and (
            tag.has_key("class") and (re.match(self.remove_classes_regexp, tag["class"]) is not None)):  # fix: `is not None`
        # Class matches the removal regexp: drop tag and subtree.
        tag.extract()
    elif tag.name in self.keep_tags:
        # Keep the tag, rebuilt without its original attributes.
        new_tag = Tag(self.input, tag.name)
        new_tag.contents = tag.contents
        tag.replaceWith(new_tag)
    elif tag.name in self.remove_tags_keep_content:
        # Drop the tag but preserve whatever it contains.
        children = tag.findChildren(True, recursive=False)
        if len(children) == 1:
            tag.replaceWith(children[0])
        elif len(children) > 1:
            new_tag = Tag(self.input, "p")
            for child in tag.findChildren(True, recursive=False):
                new_tag.append(child)
            tag.replaceWith(new_tag)
        else:
            tag.replaceWith(tag.renderContents())
    else:
        # Neither kept nor content-preserving: remove entirely.
        tag.extract()
def savePDF(self, pdf_filename, parent_soup, target_node, yes_phrase, url, key, school_name): if target_node: grandparent_node = target_node.parent.parent tag = self.highlightedNode(target_node, yes_phrase, parent_soup) self.replaceNode(target_node, tag) body = Tag(parent_soup,"body") body.append(grandparent_node) else: body = parent_soup try: weasyprint = HTML(string=body.prettify()) tmp_filename = 'pdfs/tmp.pdf' weasyprint.write_pdf(tmp_filename,stylesheets=[CSS(string='body { font-size: 10px; font-family: serif !important }')]) except: print "weasyprint failed on url: "+url if target_node: self.replaceNode(tag, target_node) #return to old state return if target_node: self.replaceNode(tag, target_node) #return to old state sep_filename = "pdfs/sep.pdf" self.makeSepPage(sep_filename, url, key, school_name) merger = PdfFileMerger() if (os.path.exists(pdf_filename)): merger.append(PdfFileReader(file(pdf_filename, 'rb'))) merger.append(PdfFileReader(file(sep_filename, 'rb'))) merger.append(PdfFileReader(file(tmp_filename, 'rb'))) merger.write(pdf_filename)
def get_last_3(soup, table):
    """Collect columns 4..6 of each row of *table* into <li> items (cells
    become <span>s joined by ' - ') and return them wrapped in a <div>."""
    loop = 0
    # fix: removed unused locals `first` and `ul` (never appended anywhere)
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        td = tr.findAll("td")
        li = Tag(soup, "li")
        for el in td[3:]:
            if loop != 3:
                try:
                    text = ''.join(el.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != ' ':
                        el.name = "span"
                        if loop != 2:
                            el.append(' - ')  # separator between cells
                        li.append(el)
                except Exception:  # fix: was bare except; still best-effort
                    pass
            else:
                break
            loop += 1
        loop = 0
        # Only keep rows that produced visible text.
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def get_first_three(soup, table):
    """Collect the first three columns of each row of *table* into <li>
    items; the very first cell becomes a title <span> placed above the
    resulting <ul>. Returns a wrapping <div>."""
    loop = 0
    first = 1
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for td in tr.findAll("td"):
            if loop != 3:
                try:
                    text = ''.join(td.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != ' ':
                        td.name = "span"
                        if first == 1:
                            # First non-empty cell overall is the title.
                            first = 0
                            enclose.append(td)
                        else:
                            if loop != 2:
                                td.append(' - ')  # separator between cells
                            li.append(td)
                except Exception:  # fix: was bare except; still best-effort
                    pass
            else:
                break
            loop += 1
        loop = 0
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # Pull the title span out of the list and stack it above the <ul>.
    title = enclose.find("span")
    enclose.find("span").replaceWith("")
    enclose.name = "ul"
    div = Tag(soup, "div")
    div.append(title)
    div.append(enclose)
    return div
def rewriteLinksSection(dom, soup, links_table):
    """Replace the verdana-styled links <table> with a plain 'Links' heading
    followed by a simple <ul> of anchors appended to *dom*."""
    links = []
    for fnt in links_table.findAll('font', {'size': '2', 'face': 'verdana'}):
        # Only rows that literally start with the expected font/anchor markup.
        if str(fnt).startswith('<font size="2" face="verdana"><a href="'):
            link = fnt.find('a')
            caption = link.getText('').strip()
            # Optionally skip translation links.
            if caption.endswith(' Translation') and OMIT_TRANSLATIONS:
                continue
            links.append((link['href'], caption))
    links_table.decompose()
    if not INCLUDE_LINKS or len(links) == 0:
        return
    b = Tag(soup, 'b')
    b.string = 'Links'
    dom.append(b)
    ul = Tag(soup, 'ul')
    for url, caption in links:
        li = Tag(soup, 'li')
        a = Tag(soup, 'a', {'href': url})
        a.string = caption
        li.append(a)
        ul.append(li)
    dom.append(ul)
def fix_heading(heading, tags):
    '''
    Remove paragraphs with no strings.
    Remove non-special headings that don't start with a paragraph.
    Remove lists from non-special headings.
    '''
    SPECIAL = ['Books', 'Works', 'Bibliography', 'External links', 'Further reading']
    # Drop empty <p> tags; everything else (and non-empty <p>s) survives.
    tags = [tag for tag in tags if tag is not None and tag.name != 'p' or tag.renderContents(None).strip()]
    special = False
    heading_text = tagtext(heading)
    for word in SPECIAL:
        if word.lower() in heading_text.lower():
            special = True
    if heading_text == 'External links and references':
        set_heading_text(heading, 'External links')
    # Shorten lists (even special ones).
    # The motivation is that some pages like to list reams of crap,
    # usually in bibliographies, but in other things too.
    found_lis = 0
    MAX_ITEMS = 10  # per headed section
    for tag in list(tags):
        if tag.name in ('ul', 'ol'):
            for li in tag.findAll('li', recursive=False):
                found_lis += 1
                if found_lis > MAX_ITEMS:
                    li.extract()
    # Remove any now-empty uls and ols.
    # Harder than it sounds, due to nested lists.
    temp = Tag(soup, 'p')
    for tag in tags:
        temp.append(tag)
    for tag in temp.findAll(('ul', 'ol')):
        if not tag.findAll(('ul', 'ol', 'li')):
            tag.extract()
    tags = temp.contents
    if found_lis > MAX_ITEMS:
        # Add " (some omitted)" to heading
        if heading_text:
            heading_text = heading_text.replace(' (incomplete)', '')
            if context['srcurl'].startswith('http:'):
                heading_text += ' (some <a href="%s">omitted</a>)' % context['srcurl']
            else:
                heading_text += ' (some omitted)'  # no "relative" links
            set_heading_text(heading, heading_text)
    if not special:
        if heading is not None:
            # Remove non-special headings which don't start with a paragraph.
            if not tags or tags[0].name != 'p':
                return drop_heading(heading)
            # Remove non-special headings containing lists.
            for tag in tags:
                if tag.name in ('ul', 'ol'):
                    return drop_heading(heading)
        else:
            # Remove lists from None (before first heading, if any).
            tags = [tag for tag in tags if tag.name not in ('ul', 'ol')]
    return (heading, tags)
def _number_sections(self, soup):
    """Prefix each top-level <p> inside the .md div with a numbered
    self-referencing anchor (id/href p_<n>)."""
    n = 1
    for paragraph in soup.find("div", "md").findAll(["p"], recursive=False):
        anchor = Tag(soup, "a", [
            ("class", "p-anchor"),
            ("id", "p_%d" % n),
            ("href", "#p_%d" % n),
        ])
        anchor.append(str(n))
        # Anchor first, then a separating space, then the original text.
        paragraph.insert(0, anchor)
        paragraph.insert(1, " ")
        n += 1
def generate_table_of_contents(soup, prefix):
    """Build a nested <div class="toc"> of anchor links for every header
    matching header_re in *soup*; each header gets a generated id prefixed
    with *prefix*. Returns None when the page has no headers."""
    header_ids = Counter()
    headers = soup.findAll(header_re)
    if not headers:
        return
    tocdiv = Tag(soup, "div", [("class", "toc")])
    parent = Tag(soup, "ul")
    parent.level = 0  # nesting depth is tracked as an attribute on the tag object
    tocdiv.append(parent)
    level = 0
    previous = 0
    for header in headers:
        contents = u''.join(header.findAll(text=True))
        # In the event of an empty header, skip
        if not contents:
            continue
        # Convert html entities to avoid ugly header ids
        aid = unicode(BeautifulSoup(contents, convertEntities=BeautifulSoup.XML_ENTITIES))
        # Prefix with PREFIX_ to avoid ID conflict with the rest of the page
        aid = u'%s_%s' % (prefix, aid.replace(" ", "_").lower())
        # Convert down to ascii replacing special characters with hex
        aid = str(title_re.sub(lambda c: '.%X' % ord(c.group()), aid))
        # Check to see if a tag with the same ID exists
        id_num = header_ids[aid] + 1
        header_ids[aid] += 1
        # Only start numbering ids with the second instance of an id
        if id_num > 1:
            aid = '%s%d' % (aid, id_num)
        header['id'] = aid
        li = Tag(soup, "li", [("class", aid)])
        a = Tag(soup, "a", [("href", "#%s" % aid)])
        a.string = contents
        li.append(a)
        thislevel = int(header.name[-1])  # e.g. "h3" -> 3
        if previous and thislevel > previous:
            # Deeper header: open a nested <ul> under the current parent.
            newul = Tag(soup, "ul")
            newul.level = thislevel
            newli = Tag(soup, "li", [("class", "toc_child")])
            newli.append(newul)
            parent.append(newli)
            parent = newul
            level += 1
        elif level and thislevel < previous:
            # Shallower header: climb back up to a <ul> at or above this level.
            while level and parent.level > thislevel:
                parent = parent.findParent("ul")
                level -= 1
        previous = thislevel
        parent.append(li)
    return tocdiv
def linearize_cols_3(soup, table):
    """Replace a table marked id="linearize-cols-3" with a <ul> containing
    the linearized first and last column groups."""
    if table.get('id') == "linearize-cols-3":
        div = Tag(soup, "ul")
        div["class"] = "div-container"
        # Order matters: both helpers mutate the table's cells in place.
        ul_last = get_last_3(soup, table)
        # NOTE(review): `get_first_3` is not defined in this file (only
        # `get_first_three` is) — confirm it exists elsewhere or rename.
        ul_first = get_first_3(soup, table)
        div.append(ul_first)
        div.append(ul_last)
        table.replaceWith(div)
def CreateBody(self):
    '''Создаем body'''
    # Build the <body>: grow a random nested <div> skeleton, attach ids,
    # then fill header/main/links/footer placeholders, menus, sidebars,
    # forms and misc filler. (Original inline strings kept verbatim below,
    # English translations added as comments.)
    body = Tag(self.soup, 'body')
    totalTagsCount = random.randint(150, 400)
    '''Создаем структуру шаблона из тегов div'''
    # (Create the template structure out of div tags.)
    for _ in range(random.randint(1, 3)):
        body.append(self.CreateDiv())
    divsTotalCount = totalTagsCount * random.randint(15, 25) / 100
    while divsTotalCount > 0:
        # Extend a random leaf div (one with no children) with more divs.
        divsLowLevelList = [item for item in body.findAll('div') if len(item.findAll(True)) == 0]
        divToExtend = random.choice(divsLowLevelList)
        for _ in range(random.randint(2, 4)):
            divToExtend.append(self.CreateDiv())
        divsTotalCount -= 1
    '''Получаем список тегов div разных уровней'''
    # (Collect div lists at the different nesting levels.)
    divsList = body.findAll('div')
    divsTopLevelList = [item for item in body.findAll('div', recursive=False)]
    divsLowLevelList = [item for item in divsList if len(item.findAll(True)) == 0]
    divsMidLevelList = [item for item in divsList if item not in divsTopLevelList and item not in divsLowLevelList]
    '''Проставляем им атрибуты'''
    # (Assign id/class attributes to them.)
    for item in divsTopLevelList:
        self.AppendIds(item, 95, 1)
    for item in divsMidLevelList:
        self.AppendIds(item, 20, 75)
    for item in divsLowLevelList:
        self.AppendIds(item, 30, 65)
    '''Создаем наполнение главных блоков'''
    # (Create the main block placeholders.)
    divHeader = divsLowLevelList.pop(random.randint(0, 2))
    divHeader.string = '[header]'
    divMain = divsLowLevelList.pop(random.randint(1, 3))
    divMain.string = '[main]'
    divLinks = divsLowLevelList.pop(random.randint(-3, -1))
    divLinks.string = '[links]'
    divFooter = divsLowLevelList.pop(random.randint(-3, -1))
    divFooter.string = '[footer]'
    '''Создаем меню, сайдбары и формы'''
    # (Create menus, sidebars and forms.)
    for _ in range(random.randint(1, 2)):
        menu = divsLowLevelList.pop()
        menu.append(self.CreateList(0))
    for _ in range(random.randint(1, 2)):
        sidebar = divsLowLevelList.pop()
        self.CreateSidebar(sidebar)
    for _ in range(random.randint(0, 2)):
        form = divsLowLevelList.pop()
        form.append(self.CreateForm())
    '''Создаем прочее наполнение'''
    # (Create the remaining filler content.)
    random.shuffle(divsLowLevelList)
    for _ in range(random.randint(2, 5)):
        div = divsLowLevelList.pop()
        self.CreateOthers(div)
    self.soup.html.append(body)
def linearize_cols_2(soup, table):
    """Swap a two-column layout table (id="linearize-cols-2") for a <ul>
    built from its linearized halves."""
    if table.get('id') != "linearize-cols-2":
        return
    container = Tag(soup, "ul")
    container["class"] = "ul-container"
    # Call order matters: both helpers mutate the table cells in place.
    tail = get_last_two(soup, table)
    head = get_first_two(soup, table)
    container.append(head)
    container.append(tail)
    table.replaceWith(container)
def _number_sections(self, soup):
    """Prefix each top-level <p> inside the .md div with a numbered
    self-referencing anchor (id/href p_<n>)."""
    count = 1
    md_div = soup.find('div', 'md')
    for para in md_div.findAll(['p'], recursive=False):
        attrs = [('class', 'p-anchor'), ('id', 'p_%d' % count), ('href', '#p_%d' % count)]
        a = Tag(soup, 'a', attrs)
        a.append(str(count))
        # Anchor first, then a separating space, then the original text.
        para.insert(0, a)
        para.insert(1, ' ')
        count = count + 1
def replace_courier(soup):
    """Lacking a better option, I use courier font to mark <code>
    within tinyMCE. And I want to turn that into real code tags.

    Most users won't be needing this(?), so this code is not called anywhere
    but kept for reference
    """
    # Any tag whose inline style mentions courier gets rebuilt as <code>.
    courier_styled = lambda node: node.has_key('style') and 'courier' in node['style']
    for old in soup.findAll(courier_styled):
        code = Tag(soup, 'code')
        # Move children one by one; the contents list shrinks as we go.
        while old.contents:
            code.append(old.contents[0])
        old.replaceWith(code)
def SetupAmazonLibrary():
    """Ensure 'Amazon Movies' and 'Amazon TV' source paths exist in XBMC's
    sources.xml, creating the file skeleton if needed; offers a restart
    when anything was added."""
    common.Log('Trying to add Amazon source paths...')
    source_path = os.path.join(common.profilpath, 'sources.xml')
    source_added = False
    try:
        file = open(source_path)
        soup = BeautifulSoup(file)
        file.close()
    except:
        # sources.xml missing or unreadable: build a fresh skeleton with
        # the standard top-level categories.
        subtags = ['programs', 'video', 'music', 'pictures', 'files']
        soup = BeautifulSoup('<sources></sources>')
        root = soup.sources
        for cat in subtags:
            cat_tag = Tag(soup, cat)
            def_tag = Tag(soup, 'default')
            def_tag['pathversion'] = 1
            cat_tag.append(def_tag)
            root.append(cat_tag)
    video = soup.find("video")
    if len(soup.findAll(text="Amazon Movies")) < 1:
        # Movies source not present yet: <source><name/><path/></source>.
        movie_source_tag = Tag(soup, "source")
        movie_name_tag = Tag(soup, "name")
        movie_name_tag.insert(0, "Amazon Movies")
        movie_path_tag = Tag(soup, "path")
        movie_path_tag['pathversion'] = 1
        movie_path_tag.insert(0, MOVIE_PATH)
        movie_source_tag.insert(0, movie_name_tag)
        movie_source_tag.insert(1, movie_path_tag)
        video.insert(2, movie_source_tag)
        source_added = True
    if len(soup.findAll(text="Amazon TV")) < 1:
        # TV source not present yet.
        tvshow_source_tag = Tag(soup, "source")
        tvshow_name_tag = Tag(soup, "name")
        tvshow_name_tag.insert(0, "Amazon TV")
        tvshow_path_tag = Tag(soup, "path")
        tvshow_path_tag['pathversion'] = 1
        tvshow_path_tag.insert(0, TV_SHOWS_PATH)
        tvshow_source_tag.insert(0, tvshow_name_tag)
        tvshow_source_tag.insert(1, tvshow_path_tag)
        video.insert(2, tvshow_source_tag)
        source_added = True
    if source_added:
        common.Log('Source paths added!')
        SaveFile(source_path, str(soup))
        dialog.ok(common.getString(30187), common.getString(30188), common.getString(30189), common.getString(30190))
        if dialog.yesno(common.getString(30191), common.getString(30192)):
            xbmc.executebuiltin('RestartApp')
def replace_courier(soup):
    """Lacking a better option, I use courier font to mark <code>
    within tinyMCE. And I want to turn that into real code tags.

    Most users won't be needing this(?), so this code is not called anywhere
    but kept for reference
    """
    # fix: `'style' in s` tested tag *contents* membership, not the style
    # attribute; use Tag.get so tags without a style attribute are skipped.
    for t in soup.findAll(lambda s: 'courier' in s.get('style', '')):
        tag = Tag(soup, 'code')
        # Move children one by one; the contents list shrinks as we go.
        while t.contents:
            tag.append(t.contents[0])
        t.replaceWith(tag)
def generate_table(summary):
    """Render *summary* (iterable of (suite, passed, failed, total) rows)
    as an HTML results table and return its prettified markup."""
    soup = BeautifulSoup()
    new_tag_table = Tag(soup, "table")
    new_tag_table["border"] = 1
    new_tag_table["cellspacing"] = 0
    new_tag_table["cellpadding"] = 0
    new_tag_table["bordercolordark"] = "#000000"
    # NOTE(review): this second assignment overwrites cellspacing=0 with a
    # colour value — "bordercolorlight" was probably intended; confirm.
    new_tag_table["cellspacing"] = "#ffffff"
    soup.append(new_tag_table)
    # Header row.
    new_Tag_tr = Tag(soup, "tr")
    new_Tag_tr["bgcolor"] = "#0072E3"
    new_tag_table.append(new_Tag_tr)
    for i in ["TestSuite", "Passed", "Failed", "Total"]:
        new_Tag_td = Tag(soup, "td")
        new_Tag_td.string = str(i)
        new_Tag_tr.append(new_Tag_td)
    # One row per summary entry.
    for i in summary:
        new_Tag_tr = Tag(soup, "tr")
        new_tag_table.append(new_Tag_tr)
        for j in i:
            new_Tag_td = Tag(soup, "td")
            new_Tag_td.string = str(j)
            new_Tag_tr.append(new_Tag_td)
    print str(soup.prettify())
    return str(soup.prettify())
def footer_op(soup):
    """Append the contents of footer.html as a fixed-position footer to the
    <body> of *soup*; does nothing when <head> or <body> is missing."""
    head_el = soup.find("head")
    body_el = soup.find("body")
    footer_attachable = not (head_el is None or body_el is None)
    if footer_attachable:
        footer_wrap_el = Tag(soup, "div")
        footer_wrap_el['style'] = "position: fixed; width: 100%; height: auto; z-index: 10000; bottom: 0pt; display: block;"
        footer_el = Tag(soup, "div")
        footer_el['style'] = "background-color: rgb(15, 25, 35); color: white; height: auto"
        # fix: close the file handle (was left open by open(...).read())
        with open("footer.html") as footer_file:
            footer_text = NavigableString(footer_file.read())
        footer_el.append(footer_text)
        footer_wrap_el.append(footer_el)
        body_el.append(footer_wrap_el)
def CreateSelect(self):
    """Build a random <select> with 3-12 <option> children; the first
    option may be pre-selected."""
    select = Tag(self.soup, 'select')
    select['name'] = self.GenerateName()
    option_count = random.randint(3, 12)
    for _ in range(option_count):
        option = Tag(self.soup, 'option')
        option['value'] = self.textShort
        option.string = self.textShortCap
        select.append(option)
    # 80% chance: mark the first option as selected.
    if self._Probability(80):
        select.option['selected'] = 'selected'
    self.AppendIds(select, 10, 30)
    self.ShuffleAttributes(select)
    return select
def generateContentDivTag(baseDir, h3text):
    """Build a <div class="content band-content"> that lists every band page
    under *baseDir* with its albums nested beneath, headed by *h3text*."""
    import __main__
    contentDivTag = Tag(formatSoup, 'div', attrs={'class': 'content band-content'})
    # Embed the section heading.
    h3tag = Tag(formatSoup, 'h3')
    h3tag.append(NavigableString(h3text))
    contentDivTag.append(h3tag)
    # Generate the HTML for each band page.
    for file in os.listdir(PARENT_DIR + baseDir):
        if file.endswith(SHTML_EXT):
            # Build the band-name <ul>.
            progreUlTag = generateUlTag('/' + baseDir, file, 'column')
            albumLiTag = Tag(formatSoup, 'li')
            progreUlTag.append(albumLiTag)
            # Build the album-name <ul>s from the band page's album links.
            fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, '/' + baseDir, file])))
            albumList = []
            for albumClassTag in fileSoup.findAll('a', {'class': 'album-name'}):
                albumList.append(albumClassTag['href'].split('/')[-1])
                __main__.contentCount += 1
            albumDir = '/'.join([baseDir, file.split('.')[0]])
            for album in albumList:
                albumUlTag = generateUlTag('/' + albumDir, album, 'child-column')
                albumLiTag.append(albumUlTag)
            contentDivTag.append(progreUlTag)
    return contentDivTag
def linearize_cols_1(soup, table):
    """Replace a single-column layout table (id="linearize-cols-1") with a
    <ul>, turning each non-empty <td> into an <li> (inner <p>s -> <span>s)."""
    if table.get('id') == "linearize-cols-1":
        ul = Tag(soup, "ul")
        ul["class"] = "linearized"
        for td in table.findAll("td"):
            for p in td.findAll("p"):
                p.name = "span"
            try:
                text = ''.join(td.findAll(text=True))
                text = text.strip()
                if text != '' and text != ' ':
                    td.name = "li"
                    ul.append(td)
            except Exception:  # fix: was bare except; still best-effort
                pass
        table.replaceWith(ul)
def linearize_rows_1(soup, table):
    """Flatten a row-oriented layout table (id="linearize-rows-1") into a
    centered <div> of <span>s separated by ' | '."""
    if table.get('id') != "linearize-rows-1":
        return
    container = Tag(soup, "div")
    container["class"] = "center"
    for row in table.findAll("tr"):
        cells = row.findAll("td")
        for cell in cells:
            for para in cell.findAll("p"):
                para.name = "span"
            cell.name = "span"
            # Every cell but the last gets a ' | ' separator appended.
            if cell == cells[-1]:
                fragment = BeautifulSoup(cell.prettify())
            else:
                fragment = BeautifulSoup(cell.prettify() + '<span> | </span>')
            container.append(fragment)
    table.replaceWith(container)
def soup_filter_zz_fold_etymology(self, content):
    """Fold each 'Etymology' section's paragraphs into a hidden div with a
    javascript [show] toggle link appended to the section heading."""
    heads = content.findAll('h2', {'class': 'head'}) + content.findAll('h3', {'class': 'head'}) + content.findAll('h4', {'class': 'head'})
    etymologys = []
    for h in heads:
        # print "Head, ", h
        # A heading starts an etymology section when its first text node
        # begins with 'etymology' (case-insensitive).
        if h.next and h.next.lower().startswith('etymology'):
            # print "found", h.content[0]
            etymologys.append(h)
            # print 'Etymology found: ', h
    etymology_index = 1
    for e in etymologys:
        # Hidden container that will receive the section's paragraphs.
        div = Tag(content, 'div')
        div['id'] = u'etymology_' + str(etymology_index)
        div['style'] = u'display:none'
        linkSoup = BeautifulSoup(u''' <a href="javascript:f('%s',this)">[show]</a>''' % (div['id']))
        e.append(linkSoup)
        # Collect following siblings (text nodes and <p> tags) until
        # something else is reached.
        paragraphs = []
        n = e.nextSibling
        first = 1
        while n and (n.__class__.__name__ == 'NavigableString' or (n.__dict__.has_key('name') and n.name == 'p')):
            paragraphs.append(n)
            n = n.nextSibling
        [div.append(p) for p in paragraphs]
        # Insert the hidden div right after the heading.
        eIndex = e.parent.contents.index(e)
        e.parent.insert(eIndex + 1, div)
        etymology_index = etymology_index + 1
def linearize_cols_1(soup, table):
    """Replace a single-column layout table (id="linearize-cols-1") with a
    <ul>, turning each non-empty <td> into an <li> (inner <p>s -> <span>s)."""
    if table.get('id') == "linearize-cols-1":
        ul = Tag(soup, "ul")
        ul["class"] = "linearized"
        for td in table.findAll("td"):
            for p in td.findAll("p"):
                p.name = "span"
            try:
                text = ''.join(td.findAll(text=True))
                text = text.strip()
                if text != '' and text != ' ':
                    td.name = "li"
                    ul.append(td)
            except Exception:  # fix: was bare except; still best-effort
                pass
        table.replaceWith(ul)
def linearize_states(soup, table):
    """Rework the states table (id="linearize-states") into nested lists:
    40%-wide cells become <li> anchors, the rest are nested beneath them."""
    if table.get('id') == "linearize-states":
        ul = Tag(soup, "ul")
        ul["class"] = "text-level3"
        tag = None
        for tr in table.findAll("tr"):
            tr.name = "span"
            tr["class"] = "spaced"
            for td in tr.findAll("td"):
                if td["width"] == "40%":
                    # The 40% cell opens a new item; later cells attach to it.
                    td.name = "li"
                    tag = td
                else:
                    # NOTE(review): assumes a 40% cell always precedes the
                    # others (otherwise `tag` is still None here) and that
                    # every td carries a width attribute — confirm.
                    tag.append(td)
                    td.name = "ul"
            ul.append(tr)
        table.replaceWith(ul)
def CreateList(self, probNested):
    """Build a random <ul> of 3-7 items; with probability *probNested* a few
    items receive a nested (non-nesting) sub-list. Leaf items get link text."""
    ul = Tag(self.soup, 'ul')
    self.AppendIds(ul, 50, 30)
    item_class = self.GenerateClass(0)
    for _ in range(random.randint(3, 7)):
        ul.append(self.CreateListItem(item_class))
    if self._Probability(probNested):
        # Pick 1-4 random items and hang a flat sub-list off each.
        candidates = ul.findAll('li')
        random.shuffle(candidates)
        for nested_li in candidates[:random.randint(1, 4)]:
            nested_li.append(self.CreateList(0))
    # Items that stayed empty get link text as content.
    for li in ul.findAll('li'):
        if len(li.findAll(True)) == 0:
            li.append(self.CreateLinkText())
    return ul
def linearize_states(soup, table):
    """Rework the states table (id="linearize-states") into nested lists:
    40%-wide cells become <li> anchors, the rest are nested beneath them."""
    if table.get('id') == "linearize-states":
        ul = Tag(soup, "ul")
        ul["class"] = "text-level3"
        tag = None
        for tr in table.findAll("tr"):
            tr.name = "span"
            tr["class"] = "spaced"
            for td in tr.findAll("td"):
                if td["width"] == "40%":
                    # The 40% cell opens a new item; later cells attach to it.
                    td.name = "li"
                    tag = td
                else:
                    # NOTE(review): assumes a 40% cell always precedes the
                    # others (otherwise `tag` is still None here) and that
                    # every td carries a width attribute — confirm.
                    tag.append(td)
                    td.name = "ul"
            ul.append(tr)
        table.replaceWith(ul)
def linearize_cols_2_bold(soup, table):
    """Rework the bold two-column table (id="linearize-cols-2-bold"):
    22%-wide cells become <li> anchors, other cells nest beneath them and
    their Example paragraphs become list items."""
    if table.get('id') == "linearize-cols-2-bold":
        ul = Tag(soup, "ul")
        ul["class"] = "linearized"
        for tr in table.findAll("tr"):
            tr.name = "span"
            tr["class"] = "spaced"
            for td in tr.findAll("td"):
                if td["width"] == "22%":
                    # The 22% cell opens a new item; later cells attach to it.
                    td.name = "li"
                    tag = td
                else:
                    # NOTE(review): assumes a 22% cell precedes the others in
                    # each row — otherwise `tag` is unbound here; confirm.
                    tag.append(td)
                    td.name = "ul"
                    for p in td.findAll("p", {"class": "Example"}):
                        p.name = "li"
            ul.append(tr)
        table.replaceWith(ul)
def sub_table2(soup, subtable):
    """Flatten an ALWD sub-table into a single <li> (inside a <ul>) whose
    text is the comma-joined cell contents, prefixed with the guide name."""
    ul = Tag(soup, "ul")
    li = Tag(soup, "li")
    string = '<i>ALWD Guide to Legal Citation: </i>'
    for tr in subtable.findAll("tr"):
        for td in tr.findAll("td"):
            try:
                text = ''.join(td.findAll(text=True))
                text = text.strip()
                if text != '' and text != ' ':
                    string += ''.join(td.findAll(text=True))
                    string += ', '
            except Exception:  # fix: was bare except; still best-effort
                pass
    # Drop the trailing ", " left by the last appended cell.
    string = string.strip().rstrip(',')
    li.append(BeautifulSoup(string))
    ul.append(li)
    return ul
def linearize_cols_2_bold(soup, table):
    """Rework the bold two-column table (id="linearize-cols-2-bold"):
    22%-wide cells become <li> anchors, other cells nest beneath them and
    their Example paragraphs become list items."""
    if table.get('id') == "linearize-cols-2-bold":
        ul = Tag(soup, "ul")
        ul["class"] = "linearized"
        for tr in table.findAll("tr"):
            tr.name = "span"
            tr["class"] = "spaced"
            for td in tr.findAll("td"):
                if td["width"] == "22%":
                    # The 22% cell opens a new item; later cells attach to it.
                    td.name = "li"
                    tag = td
                else:
                    # NOTE(review): assumes a 22% cell precedes the others in
                    # each row — otherwise `tag` is unbound here; confirm.
                    tag.append(td)
                    td.name = "ul"
                    for p in td.findAll("p", {"class": "Example"}):
                        p.name = "li"
            ul.append(tr)
        table.replaceWith(ul)
def SetupAmazonLibrary(self):
    """Add or refresh the Amazon Movies/TV source paths in Kodi's
    sources.xml; creates the file skeleton when missing and offers a
    restart after changes."""
    source_path = xbmc.translatePath('special://profile/sources.xml').decode('utf-8')
    source_added = False
    source = {self._s.ms_mov: self._s.MOVIE_PATH, self._s.ms_tv: self._s.TV_SHOWS_PATH}
    if xbmcvfs.exists(source_path):
        srcfile = xbmcvfs.File(source_path)
        soup = BeautifulSoup(srcfile)
        srcfile.close()
    else:
        # No sources.xml yet: create the standard category skeleton.
        subtags = ['programs', 'video', 'music', 'pictures', 'files']
        soup = BeautifulSoup('<sources></sources>')
        root = soup.sources
        for cat in subtags:
            cat_tag = Tag(soup, cat)
            def_tag = Tag(soup, 'default')
            def_tag['pathversion'] = 1
            cat_tag.append(def_tag)
            root.append(cat_tag)
    video = soup.find("video")
    for name, path in source.items():
        path_tag = Tag(soup, "path")
        path_tag['pathversion'] = 1
        path_tag.append(path)
        source_text = soup.find(text=name)
        if not source_text:
            # Source missing entirely: add <source><name/><path/></source>.
            source_tag = Tag(soup, "source")
            name_tag = Tag(soup, "name")
            name_tag.append(name)
            source_tag.append(name_tag)
            source_tag.append(path_tag)
            video.append(source_tag)
            Log(name + ' source path added!')
            source_added = True
        else:
            # Source exists: replace its path only if it changed.
            source_tag = source_text.findParent('source')
            old_path = source_tag.find('path').contents[0]
            if path not in old_path:
                source_tag.find('path').replaceWith(path_tag)
                Log(name + ' source path changed!')
                source_added = True
    if source_added:
        self.SaveFile(source_path, str(soup))
        self._g.dialog.ok(getString(30187), getString(30188), getString(30189), getString(30190))
        if self._g.dialog.yesno(getString(30191), getString(30192)):
            xbmc.executebuiltin('RestartApp')
def linearize_rows_1_cols(soup, table):
    """Flatten a row table (id="linearize-rows-1-cols") into a <ul>: the
    first cell of each row is bolded, the rest are bracketed."""
    if table.get('id') != "linearize-rows-1-cols":
        return
    wrapper = Tag(soup, "div")
    wrapper["class"] = "center"
    for row in table.findAll("tr"):
        cells = row.findAll("td")
        item = Tag(soup, "li")
        for cell in cells:
            for para in cell.findAll("p"):
                para.name = "span"
            cell.name = "span"
            if cell == cells[0]:
                fragment = BeautifulSoup('<b>' + cell.prettify() + '</b>')
            else:
                fragment = BeautifulSoup('<span>[</span>' + cell.prettify() + '<span>]</span>')
            item.append(fragment)
        wrapper.append(item)
    wrapper.name = "ul"
    table.replaceWith(wrapper)
def SetupAmazonLibrary():
    """Add or refresh the Amazon Movies/TV source paths in XBMC's
    sources.xml; creates the file skeleton when unreadable and offers a
    restart after changes."""
    common.Log('Trying to add Amazon source paths...')
    source_path = os.path.join(common.profilpath, 'sources.xml')
    source_added = False
    source = {'Amazon Movies': MOVIE_PATH, 'Amazon TV': TV_SHOWS_PATH}
    try:
        file = open(source_path)
        soup = BeautifulSoup(file)
        file.close()
    except:
        # sources.xml missing or unreadable: build the standard skeleton.
        subtags = ['programs', 'video', 'music', 'pictures', 'files']
        soup = BeautifulSoup('<sources></sources>')
        root = soup.sources
        for cat in subtags:
            cat_tag = Tag(soup, cat)
            def_tag = Tag(soup, 'default')
            def_tag['pathversion'] = 1
            cat_tag.append(def_tag)
            root.append(cat_tag)
    video = soup.find("video")
    for name, path in source.items():
        path_tag = Tag(soup, "path")
        path_tag['pathversion'] = 1
        path_tag.append(path)
        source_text = soup.find(text=name)
        if not source_text:
            # Source missing entirely: add <source><name/><path/></source>.
            source_tag = Tag(soup, "source")
            name_tag = Tag(soup, "name")
            name_tag.append(name)
            source_tag.append(name_tag)
            source_tag.append(path_tag)
            video.append(source_tag)
            common.Log(name + ' source path added')
            source_added = True
        else:
            # Source exists: replace its path only if it changed.
            source_tag = source_text.findParent('source')
            old_path = source_tag.find('path').contents[0]
            if path not in old_path:
                source_tag.find('path').replaceWith(path_tag)
                common.Log(name + ' source path changed')
                source_added = True
    if source_added:
        SaveFile(source_path, str(soup))
        Dialog.ok(common.getString(30187), common.getString(30188), common.getString(30189), common.getString(30190))
        if Dialog.yesno(common.getString(30191), common.getString(30192)):
            xbmc.executebuiltin('RestartApp')
def apply(self, xml):
    """Serialize self.data into *xml*: one tag named self.root_tag whose
    children come from self.childs ((child_name, opts) pairs)."""
    if isinstance(self.root_tag, tuple):
        # (tag, opts) form: the root itself carries a single value.
        self.childs = [(None, self.root_tag[1])]
        self.root_tag = self.root_tag[0]
    root_tag = Tag(xml, self.root_tag)
    if self.childs:
        for child, opts in self.childs:
            if not opts["param"] in self.data:
                # Missing parameter: fall back to the default, or skip.
                if "default" in opts:
                    v = opts.get("default")
                else:
                    continue
            else:
                v = self.data.get(opts["param"], "")
            if hasattr(self, "clean_%s" % opts["param"]):
                # NOTE(review): the clean_* hook is called with no arguments
                # and replaces v wholesale — confirm that is intended.
                v = getattr(self, "clean_%s" % opts["param"])()
            if not child:
                # Nameless child: text goes directly on the root tag, and
                # no further childs are processed.
                root_tag.append(NavigableString(str(v)))
                break
            tag = Tag(xml, child)
            tag.append(NavigableString(str(v)))
            root_tag.append(tag)
    xml.contents[0].append(root_tag)
def gen_blog_post(outdir, input_base, blog_base, url_base):
    """Generate the blog post body.

    Reads <outdir>/<input_base>, extracts the intro paragraph with its
    hyperlinks stripped, and writes a teaser plus a canonical
    "Read more" link to <outdir>/<blog_base>.
    """
    outdir = path(outdir)
    input_file = outdir / input_base
    blog_file = outdir / blog_base
    canonical_url = "http://www.doughellmann.com/" + url_base
    if not canonical_url.endswith('/'):
        canonical_url += '/'
    if input_base != "index.html":
        canonical_url += input_base
    # fix: removed unused locals (module_name/title and the unused
    # home_page_reference/canonical_reference template strings)
    # Get the intro paragraph
    from BeautifulSoup import BeautifulSoup, Tag
    raw_body = input_file.text().strip()
    soup = BeautifulSoup(raw_body)
    intro = soup.find('p')
    # Strip hyperlinks by replacing those nodes with their contents.
    for link in intro.findAll('a'):
        new_span = Tag(soup, 'span')
        for c in link.contents:
            new_span.append(c)
        link.replaceWith(new_span)
    output_body = '''%(intro)s

<p><a href="%(canonical_url)s">Read more...</a></p>
''' % locals()
    # fix: the file was written twice with identical content; write once.
    blog_file.write_text(output_body)
    return
def highlightedNode(self, target_node, yes_phrase, parent_soup):
    """Return a highlighted <div> copy of *target_node*'s markup with the
    first occurrence of *yes_phrase* wrapped in <b>; when the phrase is
    empty or absent the markup is embedded unchanged."""
    content = str(target_node)
    text = content.lower()
    # NOTE: yes_phrase is matched against lowercased markup — it is
    # presumably already lowercase; confirm against callers.
    j = text.find(yes_phrase)
    tag = Tag(parent_soup, "div", [("style", "background-color:#FF8A0D")])
    # fix: also require j != -1 — previously a missing phrase left j == -1,
    # silently mangling the slices below (dropped the last character).
    if yes_phrase and j != -1:
        tag.append(content[:j])
        bold = Tag(parent_soup, "b")
        bold.insert(0, content[j:(j + len(yes_phrase))])
        tag.append(bold)
        tag.append(content[(j + len(yes_phrase)):])
    else:
        tag.append(content)
    return tag
def get_first_two(soup, table):
    """Collect the first two columns of each row of *table* into <li> items
    (cells become <span>s joined by ' - ') and return a wrapping <div>."""
    loop = 0
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for td in tr.findAll("td"):
            if loop != 2:
                try:
                    text = ''.join(td.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != ' ':
                        td.name = "span"
                        if loop != 1:
                            td.append(' - ')  # separator between the two cells
                        li.append(td)
                except Exception:  # fix: was bare except; still best-effort
                    pass
            else:
                break
            loop += 1
        loop = 0
        # Only keep rows that produced visible text.
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def CreateForm(self):
    """Build a random <form> containing text/hidden inputs and, sometimes,
    a textarea or a select, finished with a submit button."""
    form = Tag(self.soup, 'form')
    form['action'] = random.choice(['', '.'])
    if self._Probability(70):
        form['method'] = random.choice(['post', 'get'])
    if self._Probability(70):
        form['name'] = self.GenerateName()
    self.AppendIds(form, 50, 30)
    inner = self.CreateDiv()
    form.append(inner)
    # range(1, 3) -> exactly two text inputs.
    for _ in range(1, 3):
        inner.append(self.CreateInput('text'))
    if self._Probability(10):
        inner.append(self.CreateTextarea())
    if self._Probability(10):
        inner.append(self.CreateSelect())
    # range(0, 2) -> exactly two hidden inputs.
    for _ in range(0, 2):
        inner.append(self.CreateInput('hidden'))
    self.ShuffleTags(inner)
    inner.append(self.CreateInput('submit'))
    self.ShuffleAttributes(form)
    return form
def linearize_alwd(soup, table):
    """Flatten the ALWD citation table (id="linearize-alwd"): keep the header
    row, then turn each data row into a <ul><li> holding its first non-empty
    cell plus any flattened sub-tables."""
    if table.get('id') == "linearize-alwd":
        div = Tag(soup, "div")
        first = 1
        lista = table.findAll("tr")
        tr1 = lista[0]
        div.append(tr1)  # keep the header row as-is
        for tr in lista[1:]:
            ul = Tag(soup, "ul")
            li = Tag(soup, "li")
            for td in tr.findAll("td"):
                for p in td.findAll("p"):
                    p.name = "span"
                try:
                    text = ''.join(td.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != ' ':
                        if first == 1:
                            # First non-empty cell of the row becomes the item.
                            li.append(td)
                            first = 0
                        else:
                            td.replaceWith("")
                except:
                    pass
                # Nested citation sub-tables are flattened and appended.
                for subtable in td.findAll("table"):
                    sub = sub_table2(soup, subtable)
                    try:
                        text = ''.join(td.findAll(text=True))
                        text = text.strip()
                        if text != '' and text != ' ':
                            li.append(sub)
                    except:
                        pass
            first = 1  # reset for the next row
            if li.contents:
                ul.append(li)
                div.append(ul)
        table.replaceWith(div)
def createParentUlTag(targetSoup):
    """Build the breadcrumb <ul> seeded with a single 'TOP' link that points
    at SITE_DOMAIN."""
    breadcrumbs = Tag(targetSoup, 'ul', attrs={
        'class': 'xbreadcrumbs',
        'id': 'breadcrumbs'
    })
    top_item = Tag(targetSoup, 'li')
    top_anchor = Tag(targetSoup, 'a', attrs={'href': SITE_DOMAIN})
    top_anchor.append(NavigableString('TOP'))
    top_item.append(top_anchor)
    breadcrumbs.append(top_item)
    return breadcrumbs
def generateUlTag(path, file, ulClass):
    """Build a one-item <ul class=*ulClass*> linking to path/file, captioned
    with that page's <h1> contents."""
    page_soup = BeautifulSoup(open('/'.join([PARENT_DIR, path, file])))
    caption = page_soup.find('h1').renderContents()
    ul_tag = Tag(formatSoup, 'ul', attrs={'class': ulClass})
    li_tag = Tag(formatSoup, 'li')
    href = '/'.join([path, file])
    a_tag = Tag(formatSoup, 'a', attrs={'href': href})
    a_tag.append(NavigableString(caption))
    li_tag.append(a_tag)
    ul_tag.append(li_tag)
    return ul_tag
def get_list_for_key(name, children):
    """
    Takes a key and a dictionary containing its children and recursively
    generates HTML lists items. Each item will contain the name and, if it
    has children, an unordered list containing those child items.
    """
    item = Tag(SOUP, "li")
    item.append(NavigableString(name))
    if not children:
        return item
    sublist = Tag(SOUP, "ul")
    for child_name, grandchildren in children.items():
        sublist.append(get_list_for_key(child_name, grandchildren))
    item.append(sublist)
    return item
def linearize_cols_1_4(soup, table):
    """Linearize a four-column link table (id="linearize-cols-1-4") into a
    <ul> with four pipe-separated anchors per <li>."""
    if table.get('id') == "linearize-cols-1-4":
        div = Tag(soup, "ul")
        # Drain the table column-by-column: each pass removes the first
        # remaining <td> of every row, so four passes empty four columns.
        for i in range(4):
            for tr in table.findAll("tr"):
                td = tr.find("td")
                tr.find("td").replaceWith("")
                div.append(td)
        # Regroup the collected anchors into chunks of four.
        list_a = div.findAll("a")
        composite_list = [list_a[x:x + 4] for x in range(0, len(list_a), 4)]
        ul = Tag(soup, "ul")
        for lista in composite_list:
            li = Tag(soup, "li")
            for a in lista:
                # Every anchor but the last gets a ' | ' separator.
                if a == lista[-1]:
                    a = BeautifulSoup(a.prettify())
                else:
                    a = BeautifulSoup(a.prettify() + '<span> | </span>')
                li.append(a)
            ul.append(li)
        table.replaceWith(ul)
def get_slides(args):
    """Convert a markdown file into an html5slides deck; each <hr> produced
    by the markdown (a '---' rule) starts a new <article> slide. Returns
    the prettified HTML."""
    contents = get_file_contents(args.file)
    soup = BeautifulSoup(markdown(contents))
    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)
    # <head>: title, theme stylesheet and the slides script — local copies
    # when --offline, CDN URLs otherwise.
    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(args.file)
    head.append(title)
    link = Tag(hsoup, 'link')
    link['rel'] = 'stylesheet'
    link['type'] = 'text/css'
    if args.offline:
        link['href'] = 'default.css'
    else:
        link['href'] = 'http://gdg-xian.github.io/html5slides-markdown/themes/default.css'
    head.append(link)
    script = Tag(hsoup, 'script')
    if args.offline:
        script['src'] = 'html5slides.js'
    else:
        script['src'] = 'http://gdg-xian.github.io/html5slides-markdown/javascripts/html5slides.js'
    head.append(script)
    html.append(head)
    # <body>: hidden until the slides script reveals it.
    body = Tag(hsoup, 'body')
    body['style'] = 'display:none'
    section = Tag(hsoup, 'section')
    section['class'] = 'slides layout-regular template-default'
    body.append(section)
    # All top-level rendered elements, in document order.
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    article = Tag(hsoup, 'article')
    section.append(article)
    for element in elements:
        if element.name == 'hr':
            # An <hr> (markdown '---') marks a slide boundary.
            article = Tag(hsoup, 'article')
            section.append(article)
        else:
            article.append(element)
    html.append(body)
    return prettify(html)