def replace_cell_with_select(cell: Tag, names: list, values: list, attrs=None):
    """
    Replaces the contents of a table cell with a select element whose options
    are built from the given lists of names and values. If the string already
    in the cell matches one of the given values, that option is marked as
    selected. attrs is a dictionary of additional attributes applied to the
    select element. If the cell is marked 'contenteditable' or has the class
    'editable', those attributes are removed.
    """
    attrs = attrs or {}  # avoid a mutable default argument
    soup = page_builder.soup_from_text("<select></select>")
    # soup.select is already a method, so fetch the <select> element with find()
    select = soup.find('select')
    for key in attrs:
        select[key] = attrs[key]
    string = cell.string
    options = build_option_list(names,
                                value_accessor=(lambda n, i: values[i]),
                                selector=(lambda n, i, v: v == string))
    cell.string = ''
    if 'contenteditable' in cell.attrs:
        del cell.attrs['contenteditable']
    # bs4 stores class as a list of values, so test membership rather than equality
    if 'class' in cell.attrs and 'editable' in cell.attrs['class']:
        del cell.attrs['class']
    select.append(options)
    cell.append(select)

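# Hedged usage sketch for replace_cell_with_select: the table markup and the
# attrs below are invented for illustration, and the page_builder /
# build_option_list helpers are assumed to be available from this module.
def _demo_replace_cell_with_select():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<table><tr><td contenteditable>red</td></tr></table>",
                         "html.parser")
    replace_cell_with_select(soup.td, ["Red", "Blue"], ["red", "blue"],
                             attrs={"name": "color"})
    print(soup.td)  # the cell now holds a <select name="color"> with two options
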
def form_service_gen_multiple_values(_id, params, service) -> Union[Tag, str]:
    script = Tag(name="script", attrs={"defer": True})
    values = []
    for env in sorted(
        service,
        reverse=True,
        key=lambda x: int(re.search(r"\d+$", x).group()) if x[-1].isdigit() else 0,
    ):
        for param, param_value in params.items():
            suffix = env.replace(param, "")
            if env.startswith(param):
                values.append({
                    "default": service.get(f"{param}{suffix}", param_value["default"]),
                    "env": param,
                    "help": param_value["help"],
                    "id": param_value["id"],
                    "label": param_value["label"],
                    "selects": param_value.get("selects", []),
                    "type": param_value["type"],
                })
        if len(values) >= len(params):
            script.append(f"addMultiple('{_id}', '{json.dumps(values)}');")
            values = []
    # .children is a generator and therefore always truthy; test .contents instead
    return script if script.contents else ""

def megakills_to_html(self, megakills):
    if megakills is None:
        return None
    columns = megakills.columns
    soup = BeautifulSoup("", "html.parser")
    table = Tag(soup, name="table")
    table["class"] = "blueTable"
    table["id"] = "divmegakills"
    soup.append(table)
    tr = Tag(soup, name="tr")
    table.append(tr)
    for col in columns:
        th = Tag(soup, name="th")
        tr.append(th)
        th.append(col)
    for index, row in megakills.iterrows():
        tr = Tag(soup, name="tr")
        for col in columns:
            td = Tag(soup, name='td')
            td.insert(1, str(row[col]))
            tr.append(td)
        table.append(tr)
    return soup

def copy(element):
    """
    Clones a beautifulsoup4 element.

    Only the element's own information is copied; information about its
    children is shared with the original.

    .. bugs::
       Works around the bug where there is no builder set:
       https://bugs.launchpad.net/beautifulsoup/+bug/1307471

    .. This function was written with reference to:
       http://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup

    :param element: the element to clone.
    :type element: bs4.Tag, bs4.NavigableString, bs4.Comment
    :return: the cloned element.
    """
    if isinstance(element, (NavigableString, Comment)):
        return type(element)(element)
    clone_element = Tag(None, element.builder, element.name,
                        element.namespace, element.nsprefix)
    clone_element.attrs = dict(element.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone_element, attr, getattr(element, attr))
    for child in element.contents:
        clone_element.append(child)
    return clone_element

def wrap_rawtext(cls, element):
    if isinstance(element, NavigableString):
        return
    groups = []
    group = []
    for c in element.children:
        if isinstance(c, NavigableString):
            group.append(c)
        if isinstance(c, Tag):
            groups.append(group)
            group = []
    if len(group) > 0:
        groups.append(group)
    for g in groups:
        if len(g) == 0:
            continue
        par = Tag(name="p")
        g[0].wrap(par)
        for i in range(1, len(g)):
            par.append(g[i])

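# Hedged usage sketch for wrap_rawtext: the snippet above does not show its
# owning class, so `Wrapper` below is a hypothetical stand-in for whatever
# class defines the classmethod.
def _demo_wrap_rawtext():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<div>loose text<span>kept</span>tail text</div>",
                         "html.parser")
    Wrapper.wrap_rawtext(soup.div)
    print(soup.div)
    # <div><p>loose text</p><span>kept</span><p>tail text</p></div>
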
def get_markdown_page_index_objects(content: Tag, url: str, page_path: str,
                                    title: str, page_type: str,
                                    page_views: int) -> List[Dict]:
    headers = ['h1', 'h2', 'h3']
    index_objects = []
    children = [element for element in content.children if isinstance(element, Tag)]
    if children[0].name not in headers:
        return get_page_index_objects(content, url, page_path, title,
                                      page_type, page_views)
    block_title = ""
    block_content = []  # renamed from `content` to avoid shadowing the parameter
    url_with_href = ""
    for child in children:
        if child.name in headers:
            if block_title != '':
                for ind, page_part in enumerate(get_valuable_content(block_content)):
                    page_info = {
                        'url': url_with_href,
                        'objectID': url_with_href + str(ind),
                        'content': page_part,
                        'headings': block_title,
                        'pageTitle': title,
                        'type': page_type,
                        'pageViews': page_views,
                    }
                    index_objects.append(page_info)
            url_with_href = url + '#' + child.get('id')
            block_title = child.text
            block_content = []
        else:
            block_content.append(child)
    return index_objects

def add_mathjax_call(soup):
    head = soup.find('head')
    if not head:
        msg = 'Could not find <head>'
        raise_desc(ValueError, msg, s=str(soup))
    src = ('https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/'
           'MathJax.js?config=TeX-MML-AM_CHTML')
    config = r"""
    MathJax.Hub.Config({
        extensions: ["tex2jax.js"],
        jax: ["input/TeX", "output/HTML-CSS"],
        tex2jax: {
            inlineMath: [ ['$','$'] ],
            displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
            processEscapes: true
        },
        "HTML-CSS": { availableFonts: ["TeX"] }
    });
    """
    script = Tag(name='script')
    script['type'] = 'text/x-mathjax-config'
    script.append(config)
    head.append(script)
    script = Tag(name='script')
    script.attrs['src'] = src
    head.append(script)

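# Hedged usage sketch for add_mathjax_call: the document below is made up;
# after the call, <head> carries both the MathJax config script and the
# CDN loader script.
def _demo_add_mathjax_call():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<html><head></head><body>$x^2$</body></html>",
                         "html.parser")
    add_mathjax_call(soup)
    print(soup.head.prettify())
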
def insert_html(content):
    soup = BeautifulSoup("", "lxml")
    wrap = Tag(soup, name="p")
    wrap["class"] = "text"
    wrap.append(BeautifulSoup(content, 'html.parser'))
    soup.append(wrap)
    return soup

def clone_beautiful_soup_tag(elements):
    """
    :type elements: Tag | NavigableString | BeautifulSoup | iterable of these
    :rtype: Tag
    """
    if elements is None:
        raise ElementTypeError('elements is None!')
    if isinstance(elements, (Tag, NavigableString, BeautifulSoup)):
        element = elements
        if isinstance(element, NavigableString):
            return type(element)(element)
        # work around bug where there is no builder set
        # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
        copy = Tag(None, element.builder, element.name,
                   element.namespace, element.nsprefix)
        copy.attrs = dict(element.attrs)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(copy, attr, getattr(element, attr))
        for child in element.contents:
            copy.append(clone_beautiful_soup_tag(child))
        return copy
    else:
        return [clone_beautiful_soup_tag(x) for x in elements]

def construct_xml(self):
    soup = BeautifulSoup(etree.tostring(etree.Element('OTA_AirLowFareSearchRQ')), 'xml')
    query = soup.contents[0]
    query.attrs = {
        'xmlns': 'http://www.opentravel.org/OTA/2003/05',
        'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        'PrimaryLangId': 'en',
        'Version': '2.001',
        'TimeStamp': str(datetime.datetime.now().isoformat()),
        'EchoToken': str(time.mktime(time.gmtime())),
        'xsi:schemaLocation': 'http://www.opentravel.org/2006A/OTA_AirLowFareSearchRQ.xsd',
    }
    t_pos = Tag(name='POS')
    t_source = Tag(name='Source')
    t_req = Tag(name='RequestorID')
    t_req.attrs = {
        'ID': 'weathersick',
        'URL': 'http://www.weathersick.com',
        'Type': '18',
    }
    t_source.append(t_req)
    t_pos.append(t_source)
    query.append(t_pos)
    t_odinf = Tag(name='OriginDestinationInformation')
    t_odinf.attrs = {'RPH': '1'}
    t_deptime = Tag(name='DepartureDateTime')
    # TODO: populate DepartureDateTime and finish building the
    # OriginDestinationInformation element.
    t_odinf.append(t_deptime)
    query.append(t_odinf)

def insert_header(text, size):
    soup = BeautifulSoup("", "lxml")
    header1 = Tag(soup, name="h" + str(size))
    header1["class"] = "header"
    header1.append(text)
    soup.append(header1)
    return soup

def deepcopy(element):
    """
    Recursively clones a beautifulsoup4 element.

    The element's own information and everything about its children are
    copied recursively. Using this function avoids the DOM corruption that
    beautifulsoup4 4.0.2 exhibits when append is used on an existing tree
    (previously reachable tags become inaccessible, or methods return only
    part of the result they would normally give).

    .. This function was written with reference to:
       http://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup

    :param element: the element to clone.
    :type element: bs4.Tag, bs4.NavigableString, bs4.Comment
    :return: the cloned element.
    """
    if isinstance(element, (NavigableString, Comment)):
        return type(element)(element)
    clone_element = Tag(None, element.builder, element.name,
                        element.namespace, element.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    clone_element.attrs = dict(element.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone_element, attr, getattr(element, attr))
    for child in element.contents:
        clone_element.append(deepcopy(child))
    return clone_element

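# Hedged usage sketch for deepcopy: because the clone is fully independent,
# appending it elsewhere leaves the original node in place (a plain .append
# of the original tag would move it instead).
def _demo_deepcopy():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<div><p>hello</p></div>", "html.parser")
    soup.div.append(deepcopy(soup.p))
    print(soup.div)  # <div><p>hello</p><p>hello</p></div>
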
def renames_to_html(self, table_renames):
    renamesdf = table_renames[0]
    columns = renamesdf.columns
    soup = BeautifulSoup("", "html.parser")
    table = Tag(soup, name="table")
    table["class"] = "blueTable"
    table["id"] = "divrenames"
    tr = Tag(soup, name="tr")
    table.append(tr)
    for col in columns:
        th = Tag(soup, name="th")
        tr.append(th)
        th.append(col)
    for index, row in renamesdf.iterrows():
        tr = Tag(soup, name="tr")
        for col in columns:
            td = Tag(soup, name='td')
            td.insert(1, row[col])
            tr.append(td)
        table.append(tr)
    soup.append(table)
    return soup

def medal_html_string(self, color, count):
    soup = BeautifulSoup("", "html.parser")
    # Map each color code to its emoji html; a dict lookup replaces the long
    # if/elif chain and fails loudly instead of leaving `medal` undefined
    # when an unknown code is passed.
    medals = {
        "g": self.gold_medal_emoji_html,
        "s": self.silver_medal_emoji_html,
        "b": self.bronze_medal_emoji_html,
        "p": self.poop_emoji_html,
        "q": self.cup_emoji_html,
        "a": self.amphora_emoji_html,
        "l": self.leaf_emoji_html,
        "d": self.diamond_emoji_html,
        "f": self.silverware_emoji_html,
        "r": self.springfling_emoji_html,
        "t": self.trident_emoji_html,
    }
    try:
        medal = medals[color]
    except KeyError:
        raise ValueError("unknown medal color: %r" % color)
    for i in range(0, count):
        medal_span = Tag(soup, name='span')
        medal_span["style"] = "font-size:10px;"
        medal_html = BeautifulSoup(medal, 'html.parser')
        medal_span.append(medal_html)
        soup.append(medal_span)
    return soup

def feuds_to_html(self, top_feuds):
    feuds = top_feuds[0]
    columns = top_feuds[1]
    soup = BeautifulSoup("", "html.parser")
    table = Tag(soup, name="table")
    table["class"] = "blueTable"
    table["id"] = "divfeuds"
    soup.append(table)
    tr = Tag(soup, name="tr")
    table.append(tr)
    for col in columns:
        th = Tag(soup, name="th")
        tr.append(th)
        th.append(col)
    for index, row in feuds.iterrows():
        tr = Tag(soup, name="tr")
        for col in feuds.columns:
            td = Tag(soup, name="td")
            td.insert(1, str(row[col]))
            tr.append(td)
        table.append(tr)
    return soup

def merge_row_elements(element: Tag) -> None:
    """
    If an element is an 'mrow' produced by KaTeX, its children are probably
    needlessly fragmented. For instance, the word 'true' will contain four
    '<mi>' elements, one each for 't', 'r', 'u', and 'e'. Merge such elements
    into single elements.
    """
    if element.name != "mrow":
        return
    elements = [e for e in element.children if isinstance(e, Tag)]
    merger = MathMlElementMerger()
    merged = merger.merge(elements)
    # If the 'mrow' only contains one element after its children are merged,
    # simplify the MathML tree by replacing this node with its merged child.
    # Preserve the start and end position of the row element if specified,
    # because this often means a styling macro was applied to the children,
    # and the row's start and end positions include the control sequence and
    # braces for the styling macro.
    if len(merged) == 1:
        start = element.attrs.get("s2:start")
        end = element.attrs.get("s2:end")
        if start and end:
            merged[0].attrs["s2:start"] = start
            merged[0].attrs["s2:end"] = end
        if "s2:style-start" in element.attrs and "s2:style-end" in element.attrs:
            merged[0].attrs["s2:style-start"] = element.attrs["s2:style-start"]
            merged[0].attrs["s2:style-end"] = element.attrs["s2:style-end"]
        element.replace_with(merged[0])
    else:
        for e in elements:
            e.extract()
        for m in merged:
            element.append(m)

def insert_link(self, url, link_text):
    # <a href="url">link text</a>
    soup = BeautifulSoup("", "html.parser")
    link = Tag(soup, name="a")
    link["href"] = url
    link.append(link_text)
    soup.append(link)
    return soup

def insert_text(self, content):
    soup = BeautifulSoup("", "html.parser")
    text = Tag(soup, name="p")
    text["class"] = "text"
    text.append(content)
    soup.append(text)
    return soup

def as_tag(self) -> Tag:
    tag = Tag(name='event', attrs={
        'start': 'T' + str(self.start),
        'end': 'T' + str(self.end),
    })
    tag.append(self.value)
    return tag

def heading2table(soup, table, row):
    """add heading row to table"""
    tr = Tag(soup, name="tr")
    table.append(tr)
    for attr in row:
        th = Tag(soup, name="th")
        tr.append(th)
        th.append(attr)

def tag(self):
    tt = Tag(name='table')
    for r in self.cells:
        rt = Tag(name='tr')
        for c in r:
            rt.append(c.tag())
        tt.append(rt)
    return tt

def row2table(soup, table, row):
    """add a row to the table"""
    tr = Tag(soup, name="tr")
    table.append(tr)
    for attr in row:
        td = Tag(soup, name="td")
        tr.append(td)
        td.append(attr)

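# Hedged usage sketch combining heading2table and row2table: the column names
# and row values are made up for illustration.
def _demo_build_table():
    from bs4 import BeautifulSoup, Tag
    soup = BeautifulSoup("", "html.parser")
    table = Tag(soup, name="table")
    soup.append(table)
    heading2table(soup, table, ["name", "count"])
    row2table(soup, table, ["apples", "3"])
    print(soup.prettify())
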
def nest_tags(names):
    current = Tag(name=names[0])
    root = current
    for i in range(1, len(names)):
        new_tag = Tag(name=names[i])
        current.append(new_tag)
        current = new_tag
    return root

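# Hedged usage sketch for nest_tags: each name nests inside the previous one.
def _demo_nest_tags():
    root = nest_tags(["a", "b", "c"])
    print(root)  # <a><b><c></c></b></a>
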
def insert_text(content):
    soup = BeautifulSoup("", "lxml")
    text = Tag(soup, name="p")
    text["class"] = "text"
    text.append(content)
    soup.append(text)
    return soup

def initialize_framework(self, head: Tag, tags: List[Tag]):
    """
    Applies the given header tags to the document head.

    :param head: the <head> tag to extend.
    :param tags: the tags to append.
    :return:
    """
    for tag in tags:
        head.append(tag)

def insert_toggle(self, toggle_div):
    soup = BeautifulSoup("", "html.parser")
    link = Tag(soup, name="a")
    link["href"] = "#"
    link["id"] = toggle_div
    link["class"] = "text"
    link.append("+/-")
    soup.append(link)
    return soup

def _add_element(soup, element, soup_listing):
    tag = Tag(parser=soup,
              name=element.tag_name,
              namespace=_gumbo.TAG_NAMESPACES[element.tag_namespace],
              attrs=_convert_attrs(element.attributes))
    soup_listing.append(tag)
    for child in element.children:
        tag.append(_add_node(soup, child, soup_listing))
    tag.offset = element.offset
    return tag

def replace_dot_code(dot_code: Tag) -> None:
    svg = BeautifulSoup(dot(dot_code.text), 'xml').svg
    assert 'style' not in svg.attrs
    svg.attrs['style'] = (
        'max-width: 100%; '
        f'width: {svg.attrs.pop("width")}; '
        f'height: {svg.attrs.pop("height")};'
    )
    dot_code.clear()
    dot_code.append(svg)

def tag(self):
    t = Tag(name='td')
    if self.borders:
        t['class'] = self.borders
    if self.back is not None:
        t['style'] = 'background-color: #%06x;' % self.back
    for x in self.texts:
        t.append(x.text_tag())
    for x in self.texts:
        t.append(x.div_tag())
    return t

def merge(roots):
    if is_atom(roots[0]):
        atom = Tag(name=roots[0].name)
        for child in roots[0].children:
            atom.append(copy.copy(child))
        atom['data-ver'] = merge_versions(roots)
        # if atom.name == 'tr': print('TR', roots)
        atom['class'] = 'atom-wrapper'
        return atom
    # print('name', roots[0].name)
    tree_children = [flatten_children(root) for root in roots]
    tree_children.sort(key=len, reverse=True)
    # flat_children = sum(tree_children, [])
    groups = groupby(tree_children)
    for index, group in enumerate(groups):
        for item in group:
            item['data-group'] = index
            item.group = index
    # print('groups:', groups)
    # graph = [[] for _ in groups]
    # for children in tree_children:
    #     prev = None
    #     for child in children:
    #         if prev is not None:
    #             graph[prev.group].append(child.group)
    #         prev = child
    print('C', list(map(lambda children: list(map(lambda child: child.group,
                                                  children)),
                        tree_children)))
    # print('T', topsort(graph))
    print('G', groups)
    # sorted_groups = [groups[i] for i in topsort(graph)]
    # if roots[0].get('class') == 'text-wrapper':
    #     print('wrapper:', graph, topsort(graph))
    # sorted_groups = [merge(group) for group in sorted_groups]
    sorted_groups = [merge(group) for group in groups]
    soup = Soup(features='html5lib')
    root = soup.new_tag(roots[0].name)
    for x in roots:
        root.attrs.update(x.attrs)
    root['data-ver'] = merge_versions(roots)
    for item in sorted_groups:
        root.append(copy.copy(item))
    return root

def pc_to_xml_helper(pc):
    # returns a list of soup Tags given a pc list
    if not pc:
        return []
    new_tag = Tag(name=pc[0])
    interior, second_part = first_split(pc)
    if interior and interior[0] not in tag_names:
        new_tag.string = ' '.join(interior)
    elif interior and interior[0] in tag_names:
        for child in pc_to_xml_helper(interior):
            new_tag.append(child)
    return [new_tag] + pc_to_xml_helper(second_part)

def clone(el):
    if isinstance(el, NavigableString):
        return type(el)(el)
    copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    copy.attrs = dict(el.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(copy, attr, getattr(el, attr))
    for child in el.contents:
        copy.append(clone(child))
    return copy

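# Hedged usage sketch for clone: like deepcopy above, the copy can be inserted
# without detaching the original element from its tree.
def _demo_clone():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<ul><li>item</li></ul>", "html.parser")
    soup.ul.append(clone(soup.li))
    print(soup.ul)  # <ul><li>item</li><li>item</li></ul>
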
def soup(self):
    '''
    Returns HTML as a BeautifulSoup element.
    '''
    components_soup = Tag(name=self.tagname, builder=BUILDER)
    components_soup.attrs = self.attributes
    for c in flatten(self.components):
        if hasattr(c, 'soup'):
            components_soup.append(c.soup())
        elif type(c) in (str,):
            # components_soup.append(BeautifulSoup(str(c)))
            components_soup.append(str(c))
        # else:
        #     # Component should not be integrated
        #     pass
    return components_soup

def clone_bs4_elem(el):
    """Clone a bs4 tag before modifying it.

    Code from `http://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup`
    """
    if isinstance(el, NavigableString):
        return type(el)(el)
    copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    copy.attrs = dict(el.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(copy, attr, getattr(el, attr))
    for child in el.contents:
        copy.append(clone_bs4_elem(child))
    return copy

def printhtml(csvdiffs):
    """print the html"""
    soup = BeautifulSoup()
    html = Tag(soup, name="html")
    para1 = Tag(soup, name="p")
    para1.append(csvdiffs[0][0])
    para2 = Tag(soup, name="p")
    para2.append(csvdiffs[1][0])
    table = Tag(soup, name="table")
    table.attrs.update(dict(border="1"))
    soup.append(html)
    html.append(para1)
    html.append(para2)
    html.append(table)
    heading2table(soup, table, csvdiffs[3])
    for row in csvdiffs[4:]:
        row = [str(cell) for cell in row]
        row2table(soup, table, row)
    # print soup.prettify()
    print(soup)

if len(divFigures) != 0:
    for fig in divFigures:
        figCaption = fig.p
        # Turn the caption into a span for CSS formatting
        # note: the games chapter needs some caption work
        if figCaption is not None:
            figCaption.name = "span"
        # [zach] -- this is to make images that are not full width
        # have captions below the image
        div = Tag(soup, None, "div")
        div['style'] = "clear:both"
        div.append(clone(fig.img))
        fig.img.replace_with(div)
        # Images have been stored in ./CHAPTER_NAME/images/ relative
        # to the chapter html, but image references in the html are
        # to ./images/. Modify the image tags:
        div.img["src"] = internalImagesPath + "/" + div.img["src"]

# Make all hyperlinks in the chapter target a new window/tab
hyperlinkTags = soup.find_all("a")
for hyperlinkTag in hyperlinkTags:
    hyperlinkTag["target"] = "_blank"

html = str(soup)
with open(destChapterPath, "wb") as file:
    file.write(html.encode("utf-8"))  # encode: the file is opened in binary mode

def _extract_article_body(page):
    article = page.find(id='artikel').find(class_='content')
    body = Tag(name='temporary_tag')
    # +1 internetz for the person who can tell me why I can't write:
    #   for element in article.children:
    # or
    #   for element in article.contents:
    for element in list(article.children):
        # Ignore the comment form
        if element.name == 'form':
            continue
        # Ignore whitespace
        if element.name is None and re.search(r'\S', str(element)) is None:
            continue
        # Not a div, form, or whitespace: probably article content
        if element.name != 'div':
            body.append(element.extract())
            continue
        # TODO uncomment me when the app is ready to support subtitles
        # Oh, and change the next if into an elif
        # if 'field-field-ondertitel' in element['class']:
        #     paragraph = _extract_paragraph(element, 'subtitle')
        #     body.append(paragraph)
        if 'field-field-inleiding' in element['class']:
            paragraph = _extract_paragraph(element, 'introduction')
            body.append(paragraph)
        elif 'field-field-img-regulier' in element['class']:
            images_div = Tag(name='div', attrs={'class': 'image'})
            for image_and_caption in element(id='image-and-caption'):
                image = image_and_caption.img
                caption = image_and_caption.find(class_='caption-text')
                paragraph = Tag(name='p')
                paragraph.append(image)
                if caption is not None:
                    paragraph.append(caption.text)
                images_div.append(paragraph)
            body.append(images_div)
        elif 'field-field-website' in element['class']:
            label = element.find(class_='field-label').text
            label_p = Tag(name='p')
            label_s = Tag(name='strong')
            label_s.append(label)
            label_p.append(label_s)
            body.append(label_p)
            websites = element.find(class_='field-item').contents
            for website in list(websites):
                body.append(website)
        else:
            # Ignore other divs
            pass
    return body

for fig in divFigures:
    figCaption = fig.p
    # Turn the caption into a div for CSS formatting
    # note: the games chapter needs some caption work
    if figCaption is not None:
        figCaption.name = "div"
    # [zach] -- this is to make images that are not full width
    # have captions below the image
    div = Tag(soup, None, "div")
    div['style'] = "image"  # "clear:both"
    div.append(clone(fig.img))
    fig.img.replace_with(div)
    # Images have been stored in ./CHAPTER_NAME/images/ relative
    # to the chapter html, but image references in the html are
    # to ./images/. Modify the image tags:
    div.img["src"] = internalImagesPath + "/" + div.img["src"]
    # Turn the figure image into a hyperlink that points to the
    # full resolution version of the image
    imgHyperlink = soup.new_tag("a", href=fig.img["src"])
    fig.img.wrap(imgHyperlink)
    fig['class'] = "inner"
    divWhole = Tag(soup, None, "div")
    divWhole['class'] = "figure"

def format_links(html):
    '''
    This monster of a function takes in the html from a post and returns a
    dict containing html, text, and summary. Uses opengraph to try to get
    titles for all untitled links, and tries to hyperlink everything.
    '''
    edit_html = html
    html = html.replace('&quot;', '"')
    soup = BeautifulSoup(re.sub(r'&(?!amp;)', r'&amp;', html))
    reformat_str = ''.join(random.sample(string.ascii_uppercase, 10)) + '__'
    reformat_dict = {}
    videos = []
    image = None

    # Set aside all <img> tags, because we need to treat them special and
    # will add them in later.
    for tag_index, img_tag in enumerate(soup.find_all('img')):
        key = reformat_str + 'img' + str(tag_index)
        img_tag.replace_with(key)
        # handle the shitty case where a user inputs a non-http link
        if img_tag.has_attr('src') and not img_tag['src'].startswith('http'):
            new_src = 'http://' + img_tag['src']
            img_tag['src'] = new_src
        if not image:
            image = img_tag['src']
        reformat_dict[key] = img_tag

    # Set aside all <a> tags, because we need to treat them special and will
    # add them in later.
    for tag_index, a_tag in enumerate(soup.find_all('a')):
        key = reformat_str + 'a' + str(tag_index)
        a_tag.replace_with(key)
        # handle the shitty case where a user inputs a non-http link
        if a_tag.has_attr('href'):
            new_href = a_tag['href'].strip()
            if not new_href.startswith('http'):
                new_href = 'http://' + a_tag['href']
            a_tag['href'] = new_href
            embed_link = get_embed_link(new_href)
            if embed_link:
                videos.append(embed_link)
        a_tag['target'] = '_blank'
        try:
            if a_tag.string and a_tag['href'] and a_tag.string in a_tag['href']:
                og_title = get_opengraph(a_tag['href'], params=['title']).get('title')
                a_tag.string = og_title.strip()
        except:
            pass
        reformat_dict[key] = a_tag

    mentions = []
    # Find all mentions and format them
    mention_regex = re.compile(r'(@\S+(?:\s\S+)?)')
    for mention_index, mention_str in enumerate(soup(text=mention_regex)):
        key = reformat_str + 'm' + str(mention_index)
        mention_split_list = mention_regex.split(mention_str)
        parent_tag = Tag(name='span')
        for piece in mention_split_list:
            if type(piece) in [unicode, str]:
                s = mention_regex.search(piece)
                if s:
                    first_letter = re.search(r"@\S+", piece).group()[1]
                    names = [u.name for u in User.objects(name__istartswith=first_letter)]
                    for i in range(len(piece) - 1):
                        query_name = re.compile(piece[1:len(piece) - i],
                                                flags=re.IGNORECASE)
                        matches = len([name for name in names if query_name.match(name)])
                        if matches == 1:
                            a_tag = Tag(name='a')
                            target_user = User.objects(name=query_name).get()
                            a_tag['href'] = '/profile/%s' % str(target_user.id)
                            a_tag['target'] = '_blank'
                            a_tag['mention'] = 'Yes'
                            a_tag.string = '@' + query_name.pattern
                            parent_tag.append(a_tag)
                            parent_tag.append(NavigableString(piece[len(piece) - i:]))
                            mentions.append(str(target_user.id))
                            break
                    else:  # for/else structure:
                        # catch an @ that didn't match any users
                        parent_tag.append(NavigableString(piece))
                else:
                    parent_tag.append(NavigableString(piece))
        reformat_dict[key] = parent_tag
        mention_str.replace_with(key)

    opengraph_index = 0
    opengraph_objects = []
    # Find all plaintext links and format them.
    for p in soup.find_all('p'):
        p_text = unicode(p.text)
        if link_regex.search(p_text):
            new_p = Tag(name='p')
            opengraph_only = False
            p_opengraph_objects = []
            link_split_list = link_regex.split(p_text)
            for piece in link_split_list:
                if type(piece) in [unicode, str]:
                    s = link_regex.search(piece)
                    if s:
                        link_text = s.group().strip()
                        if not link_text.startswith('http'):
                            link_text = 'http://' + link_text
                        opengraph = get_opengraph(link_text)
                        a_tag = Tag(name='a')
                        a_tag.string = opengraph.get('title', link_text) or link_text
                        a_tag['href'] = link_text
                        a_tag['target'] = '_blank'
                        if not image and opengraph['image']:
                            image = opengraph['image']
                        embed_link = get_embed_link(link_text)
                        if embed_link:
                            videos.append(embed_link)
                        else:
                            num_items = 0
                            for item in link_split_list:
                                if item and not re.match(r'^<.+>$', item):
                                    num_items += 1
                            if num_items == 1:
                                opengraph_objects.append(opengraph)
                                p_opengraph_objects.append(opengraph)
                                opengraph_only = True
                        new_p.append(a_tag)
                    else:
                        new_p.append(NavigableString(piece))
            if opengraph_only:
                new_p = Tag(name='p')
                for obj in p_opengraph_objects:
                    div = Tag(name='div db-opengraph')
                    div['site'] = 'comment.opengraph[%d]' % opengraph_index
                    opengraph_index += 1
                    new_p.append(div)
            p.replace_with(new_p)

    # Bring back all set-aside <a> and <img> tags
    for key in reformat_dict:
        soup(text=key)[0].replace_with(reformat_dict[key])

    # Extract html from soup
    html = unicode(soup)
    html = clean_html(html)

    # Anonymized html
    for mention in soup.find_all('a', attrs={'mention': 'Yes'}):
        mention.replace_with(NavigableString('@User'))
    anonymized_html = unicode(soup)
    anonymized_html = clean_html(anonymized_html)

    # Generate text
    text = MLStripper.strip_html(html)
    anonymized_text = MLStripper.strip_html(anonymized_html)

    # Generate summary (initialize first so the fallbacks below never hit an
    # undefined name when no <p> matches)
    summary = ""
    first_paragraph = re.compile('<p>.+?(<br/>|</p>)').search(html)
    if first_paragraph:
        summary = MLStripper.strip_html(first_paragraph.group())
    if not summary and opengraph_objects:
        summary = opengraph_objects[0]['title']
    if not summary and text:
        summary = text

    # Generate anonymized summary
    anonymized_summary = ""
    first_paragraph = re.compile('<p>.+?(<br/>|</p>)').search(anonymized_html)
    if first_paragraph:
        anonymized_summary = MLStripper.strip_html(first_paragraph.group())
    if not anonymized_summary and opengraph_objects:
        anonymized_summary = opengraph_objects[0]['title']
    if not anonymized_summary and text:
        anonymized_summary = text

    # In the summaries, replace all the pesky &nbsp;s and truncate if necessary
    summary = summary.replace('&nbsp;', ' ')
    if len(summary) > 100:
        summary = summary[:97] + '...'
    anonymized_summary = anonymized_summary.replace('&nbsp;', ' ')
    if len(anonymized_summary) > 100:
        anonymized_summary = anonymized_summary[:97] + '...'

    return {'html': html,
            'edit_html': edit_html,
            'summary': summary,
            'text': text,
            'anonymized_html': anonymized_html,
            'anonymized_summary': anonymized_summary,
            'mentions': mentions,
            'videos': videos,
            'opengraph': opengraph_objects,
            'image': image}

# for attr in mem_attr:
#     th = Tag(soup, None, "th")
#     tr.append(th)
#     th.append(attr)
# print soup.prettify()

for c in chapterTags:
    ul = Tag(soup, None, "ul")
    li = Tag(soup, None, "li")
    a = Tag(soup, None, "a")
    a['href'] = "chapters/" + c['path'] + ".html"
    a.string = c['title']
    li.append(a)
    ul.append(li)
    # print c['title']
    # print c['path']
    if len(c['innerTags']):  # was len(['innerTags']), which is always truthy
        ulInner = Tag(soup, None, "ul")
        li.append(ulInner)
        for tag in c['innerTags']:
            liInner = Tag(soup, None, "li")
            ulInner.append(liInner)
            a = Tag(soup, None, "a")
            tagNoSpaces = tag.replace(" ", "")
            a['href'] = "chapters/" + c['path'] + ".html#" + tagNoSpaces
            a['target'] = "_top"
            a.string = tag

if len(divFigures) != 0:
    for fig in divFigures:
        figCaption = fig.p
        # Turn the caption into a span for CSS formatting
        # note: the games chapter needs some caption work
        if figCaption is not None:
            figCaption.name = "span"
        # [zach] -- this is to make images that are not full width
        # have captions below the image
        div = Tag(soup, None, "div")
        div['style'] = "clear:both"
        div.append(clone(fig.img))
        fig.img.replace_with(div)
        # Images have been stored in ./CHAPTER_NAME/images/ relative
        # to the chapter html, but image references in the html are
        # to ./images/. Modify the image tags:
        div.img["src"] = internalImagesPath + "/" + div.img["src"]
        # Turn the figure image into a hyperlink that points to the
        # full resolution version of the image
        imgHyperlink = soup.new_tag("a", href=fig.img["src"])
        fig.img.wrap(imgHyperlink)

# Make all hyperlinks in the chapter target a new window/tab
hyperlinkTags = soup.find_all("a")

def build_rss(url, list_selector, item_selector, ignored_qp, output, pretty=False):
    try:
        soup = BeautifulSoup('<rss version="2.0" />', "xml")
        rss = soup.rss
        has_lxml = True
    except FeatureNotFound:
        rss = BeautifulSoup('<rss version="2.0" />').rss
        has_lxml = False
    r = requests.get(url)
    list_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html
    channel = Tag(name="channel")
    rss.append(channel)
    channel.append(new_tag("title", list_html.head.title.string))
    channel.append(new_tag("link", url))
    channel.append(new_tag("description", "--"))
    channel.append(new_tag("lastBuildDate",
                           time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
    channel.append(new_tag("generator", "RSS Builder"))
    item_urls = list_html.select(list_selector)
    for item_url in map(lambda i: i["href"], item_urls):
        item_url = urlparse.urljoin(url, item_url)
        parsed = urlparse.urlparse(item_url)
        query_params = urlparse.parse_qsl(parsed.query)
        item_url = urlparse.urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            "&".join([k + "=" + v for k, v in query_params if k not in ignored_qp]),
            parsed.fragment,
        ))
        r = requests.get(item_url)
        item_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html
        item = Tag(name="item")
        item.append(new_tag("title", item_html.head.title.string))
        item.append(new_tag("link", item_url))
        item.append(new_tag("description", str(item_html.select(item_selector)[0])))
        channel.append(item)
    out_func = lambda x: (x.prettify() if pretty else unicode(x)).encode("utf-8")
    if output == "-":
        out_file = sys.stdout
        close_file = lambda: None
    else:
        out_file = open(output, "w")
        close_file = out_file.close
    if has_lxml:
        out_file.write(out_func(soup))
    else:
        out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        out_file.write(out_func(rss))
    out_file.write("\n")
    close_file()

def rebuild_rss(url, output, selectors, replace=None, pretty=False, raw=False):
    source = feedparser.parse(url)
    try:
        soup = BeautifulSoup('<rss version="2.0" />', 'xml')
        rss = soup.rss
        has_lxml = True
    except FeatureNotFound:
        rss = BeautifulSoup('<rss version="2.0" />').rss
        has_lxml = False
    channel = Tag(name='channel')
    rss.append(channel)
    putback_elems(source.feed, channel_required, channel)
    putback_elems(source.feed, channel_optional, channel)
    build_date = Tag(name='lastBuildDate')
    build_date.string = time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())
    channel.append(build_date)
    generator = Tag(name='generator')
    generator.string = (source.feed.generator + ' & RSS Rebuilder'
                        if hasattr(source.feed, 'generator') else 'RSS Rebuilder')
    channel.append(generator)
    if replace:
        regexp = re.compile(replace[0])
    for entry in source.entries:
        item = Tag(name='item')
        channel.append(item)
        putback_elems(entry, item_required, item)
        putback_elems(entry, item_optional, item)
        r = requests.get(entry.link)
        html = r.content if raw else r.text
        linked_html = BeautifulSoup(html, 'lxml') if has_lxml else BeautifulSoup(html)
        content = ''
        for selector in selectors:
            tags = linked_html.select(selector)
            if replace:
                tags = replace_urls(tags, regexp, replace[1])
            content = reduce(lambda s, tag: s + unicode(tag), tags, content)
        desc = Tag(name='description')
        desc.string = content
        item.append(desc)
    out_func = lambda x: (x.prettify() if pretty else unicode(x)).encode('utf-8')
    if output == '-':
        out_file = sys.stdout
        close_file = lambda: None
    else:
        out_file = open(output, 'w')
        close_file = out_file.close
    if has_lxml:
        out_file.write(out_func(soup))
    else:
        out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        out_file.write(out_func(rss))
    out_file.write('\n')
    close_file()