def unTag(self, tag):
    """
    Recursively remove unwanted tags according to the configured lists.

    The direct child tags are processed first, then ``tag`` itself is
    handled by the first rule that applies:

    1. ``self.remove_classes_regexp`` is not the empty string and matches
       the tag's ``class`` attribute: the tag is extracted;
    2. the tag name is in ``self.keep_tags``: the tag is replaced by a new
       tag built from the name only, which takes over the contents;
    3. the tag name is in ``self.remove_tags_keep_content``: the tag is
       replaced by its single child tag, by a new ``<p>`` holding its child
       tags, or (no child tags) by its rendered contents;
    4. otherwise the tag is extracted.

    @param tag: tag hierarchy to work on
    """
    for child in tag.findChildren(True, recursive=False):
        self.unTag(child)

    pattern = self.remove_classes_regexp
    if (pattern != "" and tag.has_key("class")
            and re.match(pattern, tag["class"]) is not None):
        tag.extract()
    elif tag.name in self.keep_tags:
        new_tag = Tag(self.input, tag.name)
        new_tag.contents = tag.contents
        tag.replaceWith(new_tag)
    elif tag.name in self.remove_tags_keep_content:
        # Looked up again on purpose: the recursion above may have removed
        # or replaced some of the children.
        children = tag.findChildren(True, recursive=False)
        if len(children) == 1:
            tag.replaceWith(children[0])
        elif len(children) > 1:
            new_tag = Tag(self.input, "p")
            for child in children:
                new_tag.append(child)
            tag.replaceWith(new_tag)
        else:
            tag.replaceWith(tag.renderContents())
    else:
        tag.extract()
def _linkify_headings(self, soup): md_el = soup.find('div', 'md') for heading in md_el.findAll(['h1', 'h2', 'h3'], recursive=False): heading_a = Tag(soup, "a", [('href', '#%s' % heading['id'])]) heading_a.contents = heading.contents heading.contents = [] heading.append(heading_a)
def unTag(self, tag):
    """
    Recursively remove unwanted tags according to the configured lists.

    The direct child tags are processed first, then ``tag`` itself is
    handled by the first rule that applies:

    1. ``self.remove_classes_regexp`` is not the empty string and matches
       the tag's ``class`` attribute: the tag is extracted;
    2. the tag name is in ``self.keep_tags``: the tag is replaced by a new
       tag built from the name only, which takes over the contents;
    3. the tag name is in ``self.remove_tags_keep_content``: the tag is
       replaced by its single child tag, by a new ``<p>`` holding its child
       tags, or (no child tags) by its rendered contents;
    4. otherwise the tag is extracted.

    @param tag: tag hierarchy to work on
    """
    for child in tag.findChildren(True, recursive=False):
        self.unTag(child)

    pattern = self.remove_classes_regexp
    if (pattern != "" and tag.has_key("class")
            and re.match(pattern, tag["class"]) is not None):
        tag.extract()
    elif tag.name in self.keep_tags:
        new_tag = Tag(self.input, tag.name)
        new_tag.contents = tag.contents
        tag.replaceWith(new_tag)
    elif tag.name in self.remove_tags_keep_content:
        # Looked up again on purpose: the recursion above may have removed
        # or replaced some of the children.
        children = tag.findChildren(True, recursive=False)
        if len(children) == 1:
            tag.replaceWith(children[0])
        elif len(children) > 1:
            new_tag = Tag(self.input, "p")
            for child in children:
                new_tag.append(child)
            tag.replaceWith(new_tag)
        else:
            tag.replaceWith(tag.renderContents())
    else:
        tag.extract()
def _linkify_headings(self, soup): md_el = soup.find("div", "md") for heading in md_el.findAll(["h1", "h2", "h3"], recursive=False): heading_a = Tag(soup, "a", [("href", "#%s" % heading["id"])]) heading_a.contents = heading.contents heading.contents = [] heading.append(heading_a)
def convertStoreFormat(self):
    """
    convert legacy to canonical store format

    N.B.:
    While the new canonical store format was introduced in
    TiddlyWiki v2.2 final, various v2.2 beta releases are still
    using the legacy store format.

    For every ``div`` in ``self.store`` that carries a ``tiddler``
    attribute, the attribute is renamed to ``title``, the first content
    node (if any) is replaced by its decoded text, and all contents are
    wrapped in a new ``<pre>`` element. Nothing is changed when the
    detected version is 2.3 or later.

    @return: None
    """
    try:
        version = self.getVersion()
    except (ValueError, AttributeError):
        version = (0, 0, 0)  # assume pre-v2.2 format

    # N.B.: addition works because all pre-v2.3 releases are known
    # XXX: actual threshold is v2.2 final
    if version and (version[0] + (version[1] / 10.0) < 2.3):
        for tiddler in self.store.findChildren("div", tiddler=True):
            # convert tiddler attribute to title attribute
            tiddler["title"] = tiddler["tiddler"]
            del tiddler["tiddler"]
            # decode tiddler contents; an empty tiddler has no first node,
            # so indexing contents[0] unconditionally would raise IndexError
            if tiddler.contents:
                first = tiddler.contents[0]  # XXX: use of contents[0] hacky?
                first.replaceWith(decodeTiddlerText(first))
            # add PRE wrapper
            pre = Tag(self.dom, "pre")
            pre.contents = tiddler.contents
            tiddler.contents = [pre]
def get_malott_menu(today):
    """Scrape the Malott Commons menu and return it as prettified table markup.

    The dining-services page is downloaded and its "right_column_content"
    div is detached. Every <ul> and then every <p> found in that div is
    pulled out of it, and a new <table class="mealtable"> is assembled:
    for the k-th <ul> (counting from 0) one row whose "mealtime" cell takes
    over the contents of paragraph k + 1, followed by one row per <li>.

    ``today`` is accepted but not read by this function.
    """
    url = "http://www.scrippscollege.edu/students/dining-services/index.php"
    response = requests.get(url)
    page = BeautifulSoup(response.content)
    table_head = BeautifulSoup("<thead><tr><td colspan=3>Malott Commons</td></tr></thead>")

    column = page.find("div", {"id": "right_column_content"})
    column.extract()

    # The lists are detached before the paragraphs are searched for.
    menus = []
    for menu in column.findAll("ul"):
        menu.extract()
        menus.append(menu)

    headings = []
    for paragraph in column.findAll("p"):
        paragraph.extract()
        headings.append(paragraph)

    document = BeautifulSoup()
    table = Tag(document, "table")
    document.insert(0, table)
    table.insert(0, table_head)
    table["class"] = "mealtable"

    for position, menu in enumerate(menus):
        # Meal title row; the very first paragraph is never used as a title.
        title_row = Tag(document, "tr")
        title_cell = Tag(document, "td")
        title_row.insert(0, title_cell)
        title_cell["class"] = "mealtime"
        title_cell.contents = headings[position + 1].contents
        # Every row is inserted just in front of the table's last child.
        table.insert(len(table.contents) - 1, title_row)

        for item in menu.findAll("li"):
            item_row = Tag(document, "tr")
            item_cell = Tag(document, "td")
            item_row.insert(0, item_cell)
            item_cell.contents = item.contents
            table.insert(len(table.contents) - 1, item_row)

    return document.prettify()
def heading_to_bold(txt, add_break=False):
    """Replaces heading tags (<h1>, <h2>, etc.) with <b> tags.

    Optionally appends two <br>s to the end of each replacement tag.

    @param txt: markup to parse
    @param add_break: when true, two <br> tags are appended to every <b>
        tag that replaces a heading
    @return: the modified document converted with ``unicode()``
    """
    soup = BeautifulSoup(txt)
    for header in soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        bold_tag = Tag(soup, "b", [])
        bold_tag.contents = header.contents
        header.replaceWith(bold_tag)
        if add_break:
            # Append to the tag that now lives in the document. Appending to
            # `header` (already replaced above) only reached the output
            # because both tags share one contents list.
            bold_tag.append(Tag(soup, "br"))
            bold_tag.append(Tag(soup, "br"))
    return unicode(soup)
def FixTableHeadings(self):
    '''Fixes the doxygen table headings to EZT's liking.

    This includes using <th> instead of <h2> for the heading, and putting
    the "name" attribute into the "id" attribute of the <tr> tag.

    For example, this html:
     <tr><td colspan="2"><h2><a name="pub-attribs"></a>
     Data Fields List</h2></td></tr>

    would be converted to this:
     <tr id="pub-attribs"><th colspan="2">Data Fields List</th></tr>

    Also, this function splits up tables into multiple separate tables if
    a table heading appears in the middle of a table.

    Rows whose heading anchor has no (or an empty) "name" attribute are
    left untouched.
    '''
    table_headers = []
    for tag in self.soup.findAll('tr'):
        cell = tag.td
        anchor = cell.h2.a if cell and cell.h2 else None
        # .get() rather than ['name']: an anchor without a "name" attribute
        # means "not a table heading" and must not raise KeyError.
        name = anchor.get('name') if anchor else None
        if not name:
            continue
        tag['id'] = name
        cell.string = anchor.next
        cell.name = 'th'
        table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()

    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
        # Is this a heading in the middle of a table?
        if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
            table = tag.parent
            table_parent = table.parent
            table_index = table_parent.contents.index(table)
            new_table = Tag(self.soup, name='table', attrs=table.attrs)
            table_parent.insert(table_index + 1, new_table)
            # Move the heading row and everything after it to the new table
            # by slicing the contents list directly.
            tag_index = table.contents.index(tag)
            new_table.contents = table.contents[tag_index:]
            del table.contents[tag_index:]
def FixTableHeadings(self):
    '''Fixes the doxygen table headings to EZT's liking.

    This includes using <th> instead of <h2> for the heading, and putting
    the "name" attribute into the "id" attribute of the <tr> tag.

    For example, this html:
     <tr><td colspan="2"><h2><a name="pub-attribs"></a>
     Data Fields List</h2></td></tr>

    would be converted to this:
     <tr id="pub-attribs"><th colspan="2">Data Fields List</th></tr>

    Also, this function splits up tables into multiple separate tables if
    a table heading appears in the middle of a table.

    Rows whose heading anchor has no (or an empty) "name" attribute are
    left untouched.
    '''
    table_headers = []
    for tag in self.soup.findAll('tr'):
        cell = tag.td
        anchor = cell.h2.a if cell and cell.h2 else None
        # .get() rather than ['name']: an anchor without a "name" attribute
        # means "not a table heading" and must not raise KeyError.
        name = anchor.get('name') if anchor else None
        if not name:
            continue
        tag['id'] = name
        cell.string = anchor.next
        cell.name = 'th'
        table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()

    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
        # Is this a heading in the middle of a table?
        if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
            table = tag.parent
            table_parent = table.parent
            table_index = table_parent.contents.index(table)
            new_table = Tag(self.soup, name='table', attrs=table.attrs)
            table_parent.insert(table_index + 1, new_table)
            # Move the heading row and everything after it to the new table
            # by slicing the contents list directly.
            tag_index = table.contents.index(tag)
            new_table.contents = table.contents[tag_index:]
            del table.contents[tag_index:]
def get_frary_menu(today):
    """Scrape the Frary dining hall menu and return it as prettified table markup.

    The menu page is downloaded, the table for the current weekday is
    picked (index ``today.isoweekday() - 1`` among the tables whose class
    matches "menu") and its cells are rearranged into a new
    <table class="mealtable">: breakfast rows, lunch rows, then dinner rows,
    each food cell preceded by a station cell.

    @param today: object providing ``strftime`` and ``isoweekday``
    @return: the new table document as returned by ``prettify()``
    @raise Exception: "No menu available for today" when the page has no
        div whose text is the weekday name of ``today``
    """
    fr_url = "http://www.pomona.edu/administration/dining/menus/frary.aspx"
    fr_resp = requests.get(fr_url)
    fr_soup = BeautifulSoup(fr_resp.content)
    fr_head = BeautifulSoup("<thead><tr><td colspan=3>Frary</td></tr></thead>")

    target_day = today.strftime("%A")
    if fr_soup.find("div", text=target_day) is None:
        raise Exception("No menu available for today")

    # find the right menu
    table = fr_soup.findAll("table", {"class": re.compile("menu")})[today.isoweekday() - 1]
    table.extract()
    table["class"] = "mealtable"

    stations = []
    for td in table.findAll(True, {"class": re.compile("station")}):
        td["class"] = "mealstation"
        stations.append(td)

    final_table = BeautifulSoup()
    tabler = Tag(final_table, "table")
    tabler.insert(0, fr_head)
    final_table.insert(0, tabler)

    # if we get an error here, it's probably because
    # it's the weekend - no breakfast or lunch, just bruuunch
    try:
        # The first cell of each meal is turned into the meal's heading cell.
        breakfast = list(table.findAll(True, {"class": re.compile("breakfast")}))
        breakfast[0]["class"] = "mealtime"
        breakfast[0]["colspan"] = 3

        lunch = list(table.findAll(True, {"class": re.compile("lunch")}))
        lunch[0]["class"] = "mealtime"
        lunch[0]["colspan"] = 3

        # Breakfast rows take the station cells themselves; the later meals
        # get new cells that reuse the station contents.
        _append_meal_rows(final_table, tabler, breakfast, stations, copy_station=False)
        _append_meal_rows(final_table, tabler, lunch, stations)
    except Exception:
        # Best effort: breakfast/lunch are skipped, dinner is still built.
        # Unlike a bare "except:", this lets KeyboardInterrupt and
        # SystemExit through.
        pass

    dinner = list(table.findAll(True, {"class": re.compile("dinner")}))
    dinner[0]["class"] = "mealtime"
    dinner[0]["colspan"] = 3
    _append_meal_rows(final_table, tabler, dinner, stations)

    tabler["class"] = "mealtable"
    return final_table.prettify()


def _append_meal_rows(final_table, tabler, foods, stations, copy_station=True):
    """Add one <tr> per cell of ``foods`` to ``tabler``.

    A cell whose class is "mealtime" gets a row of its own. Every other
    cell is preceded by the next entry of ``stations`` (counting from index
    1): either a new "mealstation" <td> reusing that station's contents
    (``copy_station`` true) or the station cell itself.
    """
    count = 1
    for food in foods:
        is_heading = food["class"] == "mealtime"
        tr = Tag(final_table, "tr")
        tr.insert(0, food)
        if not is_heading:
            if copy_station:
                station = Tag(final_table, "td")
                station.contents = stations[count].contents
                station["class"] = "mealstation"
            else:
                station = stations[count]
            tr.insert(0, station)
            count += 1
        # Rows are inserted just in front of the table's last child.
        tabler.insert(len(tabler.contents) - 1, tr)