def main():
    args = argument_parser().parse_args()
    with pathlib.Path(args.file).open('r') as fp:
        soup = bs4.BeautifulSoup(fp, 'html.parser')
    # Sort the <ul> of mods and re-wrap the list in newlines.
    soup.ul.contents = ([bs4.NavigableString('\n')] + sort_mods(soup)
                        + [bs4.NavigableString('\n')])
    with pathlib.Path(args.output or args.file).open('w') as file:
        file.write(str(soup))
def new_tag(self, name, parent=None, string=None, class_=None, index=None,
            before=None, after=None, **kwargs):
    tag = self.__doc.new_tag(name, **kwargs)
    if string is not None:
        if tag.string is not None:
            tag.string.replace_with(string)
        else:
            tag.string = bs4.NavigableString(string)
    if class_ is not None:
        tag['class'] = class_
    if before is not None:
        before.insert_before(tag)
    elif after is not None:
        after.insert_after(tag)
    elif parent is not None:
        if index is None or index < 0:
            parent.append(tag)
        else:
            parent.insert(index, tag)
    return tag
def setter(self, value):
    tag = self.doc
    for part in parts:
        if part == '':
            continue
        elif part == 'text()':
            if tag.string:
                tag.contents[0] = bs4.NavigableString(value)
            else:
                tag.append(value)
            tag.string = tag.contents[0]
            return
        else:
            child = tag.find(part)
            if not child:
                child = bs4.Tag(self.doc, part)
                tag.append(child)
            tag = child
    tag.append(value)
def setter(self, text):
    if self.tag.string:
        self.tag.contents[0] = bs4.NavigableString(text)
    else:
        self.tag.append(text)
    self.tag.string = self.tag.contents[0]
def run_script(name, file_name="projects.html"):
    with open(file_name) as inf:
        txt = inf.read()
    soup = bs4.BeautifulSoup(txt, "html.parser")
    # Keep a prettified backup copy of the original file.
    with open(f"{file_name.split('.')[0]}_copy.html", "w+") as fc:
        fc.write(soup.prettify())
    root = soup.new_tag("div", **{'class': "project-tile"})
    obj = soup.new_tag("div", **{'class': "project-object"})
    cont = soup.new_tag("div", **{'class': "project-container",
                                  'onclick': "toggle_project_info(this)"})
    img = soup.new_tag("img",
                       **{'alt': '_'.join(name.split()),
                          'class': "project-object-img"},
                       src=f"resources/img/projects/{'_'.join(name.split())}.png")
    ovly = soup.new_tag("div", **{'class': 'project-object-img-overlay'})
    p = soup.new_tag("p", **{'class': 'project-object-name'})
    p.insert(0, bs4.NavigableString(name.capitalize()))
    cont.append(img)
    cont.append(ovly)
    cont.append(p)
    obj.append(cont)
    info = soup.new_tag("div", **{'class': "project-info"})
    text_div = soup.new_tag("div", **{'class': "text"})
    em = soup.new_tag("embed", **{'class': "readme"},
                      src=f"resources/readmes/{'_'.join(name.split())}.html")
    text_div.append(em)
    info.append(text_div)
    root.append(obj)
    root.append(info)
    soup.html.body.section.div.append(root)
    with open(file_name, "w") as outf:
        outf.write(soup.prettify())
def normalize_rotated_range(self):
    """
    Normalize 'rotated' ranges, indicated by the class 'vertical-title'
    """
    for container in self.soup.select('.vertical-title'):
        # Extract the labels from the header.
        for header_labels in container.select('.rotate'):
            labels = []
            header_labels.wrap(self.soup.new_tag('table'))
            for div in header_labels.select('div'):
                labels.append(div.text)
        # Fill in the checked value as text, remove all ranges.
        for sibling in container.find_next_siblings('div'):
            squares = sibling.select('.range_square')
            if squares:
                # Get the position of the selected element
                for i, square in enumerate(squares[0].parent.select('div')):
                    if 'range_true' in square.get('class', []):
                        # Print the text-label
                        squares[0].parent.parent.insert(
                            0, bs4.NavigableString(labels[i]))
                # Remove the squares.
                squares[0].parent.decompose()
        # Remove the header row.
        container.decompose()
    # Remove the additional lines with 'hr' tags.
    for inline_comment in self.soup.select('.inline-comment'):
        for hr in inline_comment.select('hr'):
            hr.parent.decompose()
def _add_word_with_annotation(line_num, word_num, word, stemmer, soup,
                              freq_dictionary):
    try:
        soup.find_all("p")[line_num].append(soup.new_tag("w"))
        soup.find_all("w")[word_num].append(bs4.NavigableString(word))
        soup.find_all("w")[word_num]['lex'] = stemmer.analyze(
            word)[0]['analysis'][0]['lex']
        soup.find_all("w")[word_num]['gr'] = stemmer.analyze(
            word)[0]['analysis'][0]['gr']
    except IndexError:
        return None
    return None
def get_content_from_soup(soup):
    for b in soup.findAll('br'):
        b.replace_with("❡")
    for a in soup.findAll('a'):
        # Protect literal dots in hrefs (assumed intent: as U+2024, one-dot
        # leader) so the sentence split on '.' below doesn't break links.
        a.string = _LINK_FORMAT.format(
            text=a.text,
            href=a.attrs.get('href', '#').replace(".", "․"))
    for paragraph_holder in ['h1', 'h2', 'h3', 'h4', 'h5']:
        for e in soup.findAll(paragraph_holder):
            e.insert(0, bs4.NavigableString("❡ § "))
    for paragraph_holder in ['p', 'div']:
        for e in soup.findAll(paragraph_holder):
            e.insert(0, bs4.NavigableString("❡"))
    for deletable_tag in ['script', 'style', 'header', 'link', 'footer']:
        for e in soup.findAll(deletable_tag):
            e.decompose()
    for deletable in ['header', 'footer', 'wm-ipp-base']:
        for e in soup.findAll(class_=deletable):
            e.decompose()
        for e in soup.findAll(id=deletable):
            e.decompose()
    for e in soup.findAll("li"):
        e.insert(0, bs4.NavigableString("❡ • "))
    # Split into paragraphs on '❡', then into sentences on '.', restoring
    # the protected dots afterwards.
    paragraphs = [[s.strip().replace("․", ".") for s in p.split('.')
                   if s.strip()]
                  for p in soup.text.replace("\n", "").split('❡')]
    to_ret = []
    bullet_carry = False
    for p in paragraphs:
        if p == ['•']:
            bullet_carry = True
            continue
        if p:
            if bullet_carry:
                p[0] = ' • ' + p[0]
            to_ret.append(p)
            bullet_carry = False
    return to_ret
def compile_latex(article: Article) -> Article:
    """Looks through the article content for embedded LaTeX and compiles it
    into PDFs, and adds the proper tags so they show up on import.
    """
    text_tag: bs4.NavigableString
    # Matches LaTeX inside one or two dollar signs.
    inline_regex = r'\$?\$([^\$]+)\$\$?'
    for text_tag in article.content.find_all(text=True):
        p = re.compile(inline_regex)
        for match in p.finditer(text_tag):
            latex = match.group(1)
            if not is_latex(latex):
                continue
            # Just use the hash of the latex for a unique filename; this
            # should probably never collide.
            filename = article.get_pdf_location(str(hash(latex)))
            compile_latex_str(latex, filename)
            # If we can't find the parent, assume it's just the document.
            parent: Tag
            if text_tag.parent is None or text_tag.parent.name == '[document]':
                parent = article.content
            else:
                parent = text_tag.parent
            tag_idx = parent.contents.index(text_tag)
            # Replace the matched latex with a link tag.
            begin, end = text_tag.split(match.group(0))
            # Convert these strings to tags.
            begin = bs4.NavigableString(begin)
            end = bs4.NavigableString(end)
            text_tag.replace_with(begin)
            # The latex compiler will automatically add a .pdf, so we have
            # to add one too.
            link_tag = Tag(name='link',
                           attrs={'href': 'file://' + filename + '.pdf'})
            parent.insert(tag_idx + 1, link_tag)
            parent.insert(tag_idx + 2, end)
            # Set the current tag to the new end tag.
            text_tag = end
    return article
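A quick, self-contained check of the `inline_regex` pattern above; the sample strings are illustrative:

import re

rx = re.compile(r'\$?\$([^\$]+)\$\$?')
assert rx.search('text $$x^2$$ more').group(1) == 'x^2'  # display math
assert rx.search('text $x^2$ more').group(1) == 'x^2'    # inline math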
def replace_text_with_tag(sub_text: str, repl_tag: Tag,
                          text_tag: bs4.NavigableString,
                          article: Article) -> bs4.NavigableString:
    # If we can't find the parent, assume it's just the document.
    parent: Tag
    if text_tag.parent is None or text_tag.parent.name == '[document]':
        parent = article.content
    else:
        parent = text_tag.parent
    tag_idx = parent.contents.index(text_tag)
    # Replace the matched text with a tag.
    begin, *rest = text_tag.split(sub_text, maxsplit=1)
    end: str
    if len(rest):
        end = rest[0]
    else:
        end = ""
    # Convert these strings to tags.
    begin = bs4.NavigableString(begin)
    end = bs4.NavigableString(end)
    text_tag.replace_with(begin)
    parent.insert(tag_idx + 1, repl_tag)
    parent.insert(tag_idx + 2, end)
    return end
def parse_content(
        parent: Union[bs4.NavigableString, bs4.Tag, bs4.Comment]
) -> bs4.NavigableString:
    """parse_content converts a tag to a string, interpreting `<br>` and
    ignoring other tags.

    .. seealso::
        https://github.com/kmyk/online-judge-tools/issues/553
    """
    res = ''
    if isinstance(parent, bs4.Comment):
        pass
    elif isinstance(parent, bs4.NavigableString):
        return parent
    else:
        children = parent.contents
        if len(children) == 0:
            html_tag = str(parent)
            return bs4.NavigableString('\n') if 'br' in html_tag \
                else bs4.NavigableString('')
        else:
            for child in children:
                res += parse_content(child)
    return bs4.NavigableString(res)
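A minimal usage sketch for `parse_content`, assuming only `bs4` is installed; the sample HTML is illustrative:

import bs4

doc = bs4.BeautifulSoup('<pre>1 2<br/>3 4</pre>', 'html.parser')
text = parse_content(doc.pre)
assert str(text) == '1 2\n3 4'  # <br> becomes '\n'; other tags are ignored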
def range_to_table(self):
    """
    Cast the 'ranges' to a more basic format: wrap the parent container
    with a table, and cast the divs to tds.
    """
    for range_min in self.soup.select('.range_min'):
        range_container = range_min.parent.parent
        range_table = self.soup.new_tag('table')
        range_container.insert(0, range_table)
        for i, div in enumerate(range_container.select('div')):
            div.name = 'td'
            extracted = div.extract()
            range_table.insert(i, extracted)
    for selected in self.soup.select('.range_true'):
        selected.insert(0, bs4.NavigableString('x'))
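A minimal illustration of the `div` to `td` renaming and the 'x' marker insertion performed above (standalone; assumes only `bs4`):

import bs4

s = bs4.BeautifulSoup('<div class="range_square range_true"></div>',
                      'html.parser')
square = s.div
square.name = 'td'                          # cast the div to a td
square.insert(0, bs4.NavigableString('x'))  # mark the selected value
assert str(square) == '<td class="range_square range_true">x</td>'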
def _parse_sample_tag(self, tag: bs4.Tag) -> Optional[Tuple[str, str]]:
    assert isinstance(tag, bs4.Tag)
    assert tag.name == 'pre'
    prv = utils.previous_sibling_tag(tag)
    pprv = tag.parent and utils.previous_sibling_tag(tag.parent)
    # Guard against missing siblings before inspecting their names.
    if prv is not None and pprv is not None and prv.name == 'h6' \
            and tag.parent.name == 'div' \
            and tag.parent['class'] == ['paragraph'] and pprv.name == 'h5':
        log.debug('h6: %s', str(prv))
        log.debug('name.encode(): %s', prv.string.encode())
        # tag.string for the tags below returns None:
        # - "<pre></pre>"
        # - "<pre>6<br />1 1<br />7 4<br />0 5<br />1 3<br />-8 9<br />5 1</pre>"
        # For more details, see
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string
        if tag.string is not None:
            s = tag.string
        else:
            s = bs4.NavigableString(''.join(string + '\n'
                                            for string in tag.strings))
        return utils.textfile(s.lstrip()), pprv.string + ' ' + prv.string
    return None
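The `tag.string` behavior called out in the comment can be verified directly (assumes only `bs4`):

import bs4

pre = bs4.BeautifulSoup('<pre>6<br/>1 1</pre>', 'html.parser').pre
assert pre.string is None                 # several children -> .string is None
assert list(pre.strings) == ['6', '1 1']  # .strings still yields the text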
def __init__(self):
    self.domain_re = re.compile(rf"https?://{self.domain}(?P<path>/.+)")
    self.qid_slug_answer_re = re.compile(
        r"^(q|questions)/(?P<post_id>[0-9]+)/[^/]+/(?P<answer_id>[0-9]+)")
    self.qid_re = re.compile(r"^(q|questions)/(?P<post_id>[0-9]+)/?")
    self.aid_re = re.compile(r"^a/(?P<answer_id>[0-9]+)/?")
    self.uid_re = re.compile(r"^users/(?P<user_id>[0-9]+)/?")
    self.tid_re = re.compile(r"^questions/tagged/(?P<tag_id>[0-9]+)/?$")
    # supported internal paths (what we provide)
    # used to rule-out in-SE internal links we don't support
    self.supported_res = (
        re.compile(r"questions/tagged/.+"),
        re.compile(r"users/[0-9]+/.+"),
        re.compile(r"questions/[0-9]+/.+"),
        re.compile(r"a/[0-9]+/?$"),
        re.compile(r"users/profiles/[0-9]+.webp$"),
        re.compile(r"questions/?$"),
        re.compile(r"questions_page=[0-9]+$"),
        re.compile(r"users/?$"),
        re.compile(r"users_page=[0-9]+$"),
        re.compile(r"tags$"),
        re.compile(r"tags_page=[0-9]+$"),
        re.compile(r"api/tags.json$"),
        re.compile(r"about$"),
        re.compile(r"images/[0-9]+.webp$"),
    )
    self.redacted_string = bs4.NavigableString(self.redacted_text)
    # self.markdown = mistune.create_markdown(
    #     escape=False,
    #     plugins=[plugin_strikethrough, plugin_table, plugin_footnotes],
    # )
    if self.conf.censor_words_list:
        with open(self.conf.build_dir.joinpath("words.list"), "r") as fh:
            # This will actually replace occurrences of strings matching
            # words in the list, but those can be part of actual words or
            # whole words.
            self.words_re = re.compile(r"\b\b|\b\b".join(
                map(re.escape, [line.strip() for line in fh.readlines()])))
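For reference, this is what the censor pattern expands to for a two-word list (the words are hypothetical, not from the source):

import re

words = ["foo", "bar"]
pattern = re.compile(r"\b\b|\b\b".join(map(re.escape, words)))
# A doubled \b collapses to a single word boundary, so this alternation is
# effectively foo\b or \bbar: boundary-anchored on one side only.
assert pattern.pattern == r"foo\b\b|\b\bbar"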
def generate_report(results, output_path=None, force=False):
    '''
    .. versionadded:: 1.28

    .. versionchanged:: 1.29.1
        Only try to format results for tests that have data in the
        :data:`results` dictionary.

        Prior to version ``1.29.1``, this function would fail unless the
        :data:`results` dictionary contained data for **all tests** in
        :data:`ALL_TESTS`.

    .. versionchanged:: 1.54
        If output extension is ``.html``, output self-contained HTML report
        with ``<script id="results">...</script>`` tag containing JSON
        report results.

    Generate summary report of :func:`self_test` results either as Markdown
    or a Word document.

    Parameters
    ----------
    results : dict
        Results from :func:`self_test`.
    output_path : str, optional
        Report output path.

        If not specified, a text-only Markdown report is generated.

        If extension of output path is ``docx``, write output as Word
        document.

        If extension of output path is ``html``, write output as
        self-contained HTML report with ``<script id="results">...</script>``
        tag containing JSON report results.

        Otherwise, output path is interpreted as a directory path and a
        Markdown file is written to the output directory, along with ``.png``
        images for test-related plots (where applicable).  Output directory
        will be created if it does not exist.
    force : bool, optional
        Overwrite output path if it exists.

    Returns
    -------
    str or None
        If :data:`output_path` is not specified, a text-only Markdown report
        is returned.

    Raises
    ------
    IOError
        If :data:`output_path` exists and :data:`force` is not ``True``.
    '''
    if output_path is not None:
        output_path = ph.path(output_path).realpath()

        if output_path.exists() and not force:
            if output_path.isdir() and output_path.listdir():
                # Output path is a directory with existing contents.
                raise IOError('Output directory already exists and is '
                              'non-empty. Use `force` to overwrite.')
            elif output_path.ext.lower() in ('.docx', '.html'):
                raise IOError('Output path exists. Use `force` to overwrite.')
            elif output_path.isfile():
                raise IOError('Output path exists and is a file. Output path '
                              'must either be a directory or a filepath with '
                              'the `.docx` extension.')

    tests_with_figure = set(['test_channels', 'test_voltage',
                             'test_on_board_feedback_calibration'])

    # Find starting time of earliest test (or current date and time if no
    # timestamp is available).
    min_timestamp = min([result_i['utc_timestamp']
                         for result_i in six.itervalues(results)
                         if 'utc_timestamp' in result_i] +
                        [dt.datetime.utcnow().isoformat()])
    header = ['# DropBot self test (*{}*)'.format(min_timestamp.split('.')[0])]

    if output_path is None:
        # Execute `format_<test name>_results` for each test to generate
        # each respective Markdown report.
        md_results_cmds = ['format_{test_name}_results(results["{test_name}"])'
                           .format(test_name=name_i)
                           for name_i in ALL_TESTS if name_i in results]
        md_results = list(map(eval, md_results_cmds))

        # Join Markdown reports, separated by horizontal bars.
        md_report = (2 * '\n' + (72 * '-') + 2 * '\n').join(header +
                                                            md_results)

        # No output path was specified.  Return text-only Markdown report.
        return md_report

    if output_path.ext.lower() in ('.docx', '.html'):
        output_path.parent.makedirs_p()
        parent_dir = ph.path(tempfile.mkdtemp(prefix='dropbot-self-test'))
    else:
        parent_dir = output_path
        output_path.makedirs_p()

    markdown_path = parent_dir.joinpath('results-summary.markdown')

    try:
        # Execute `format_<test name>_results` for each test to generate
        # each respective Markdown report.
        md_results = [eval('format_{test_name}_results'
                           .format(test_name=name_i))
                      (results[name_i],
                       **({'figure_path': parent_dir.joinpath(name_i + '.png')}
                          if name_i in tests_with_figure else {}))
                      for name_i in ALL_TESTS if name_i in results]

        # Join Markdown reports, separated by horizontal bars.
        md_report = (2 * '\n' + (72 * '-') + 2 * '\n').join(header +
                                                            md_results)

        with markdown_path.open('w') as output:
            output.write(md_report)

        if output_path.ext.lower() == '.docx':
            sp.check_call(['pandoc', markdown_path, '-o', output_path],
                          shell=True)
        elif output_path.ext.lower() == '.html':
            # Write template to file for use with `pandoc`.
            template = pkgutil.get_data('dropbot', 'static/templates/'
                                        'SelfTestTemplate.html5')
            template_path = parent_dir.joinpath('SelfTestTemplate.html5')
            template_path.write_text(template)

            # Use `pandoc` to create self-contained `.html` report.
            sp.check_call(['pandoc', markdown_path, '-o', output_path,
                           '--standalone', '--self-contained', '--template',
                           template_path],
                          shell=True, stderr=sp.PIPE)

            with output_path.open('r') as input_:
                data = input_.read()

            # Inject JSON result data into HTML report.
            soup = bs4.BeautifulSoup(data, 'lxml')
            results_script = soup.select_one('script#results')
            # Format JSON with indents.  Works around [`json_tricks`
            # issue][i51].
            #
            # [i51]: https://github.com/mverleg/pyjson_tricks/issues/51
            json_data = json_tricks.dumps(results, indent=4)
            results_script.string = bs4.NavigableString(json_data)

            with output_path.open('w') as output:
                output.write(unicode(soup).encode('utf8'))
    finally:
        if output_path.ext.lower() in ('.docx', '.html'):
            parent_dir.rmtree()
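The horizontal-rule separator used when joining the Markdown sections can be seen in isolation; the sample section strings are illustrative:

sections = ['# DropBot self test (*2020-01-01T00:00:00*)', 'section body']
report = (2 * '\n' + (72 * '-') + 2 * '\n').join(sections)
assert report.split('\n')[2] == 72 * '-'  # sections split by a 72-dash rule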
def createTextNode(self, data):
    return Text(self, BeautifulSoup.NavigableString(data))
def createTextNode(self, data):
    from .Text import Text
    return Text(self, bs4.NavigableString(data))
def createTextNode(self, data):
    from .Text import Text
    return Text(self, BeautifulSoup.NavigableString(data))
def createTextNode(self, data):
    return Text(self, bs4.NavigableString(data))
def sort_mods(mods: bs4.Tag):
    # Sort the <li> entries alphabetically and re-insert newline separators.
    return intersperse(
        sorted(mods.ul.find_all('li'), key=lambda x: x.text.lower()),
        bs4.NavigableString('\n'))
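`intersperse` is not defined in this section; a plausible sketch of what it does (place a separator between consecutive items), which the real project may implement differently:

def intersperse(items, separator):
    """Return items with separator inserted between consecutive elements."""
    result = []
    for i, item in enumerate(items):
        if i:
            result.append(separator)
        result.append(item)
    return result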
def postprocess_soup(self, soup):
    """
    For each instance of a glossary word, replace it with a glossary tag
    """
    # Only run this if there is indeed a glossary
    if len(self.terms.keys()) == 0:
        return
    term_list = "|".join(self.terms)

    # TODO: there is at least one rendering issue with two glossary
    # elements inside an <em> block that is rendering wrong.

    # Only look in the main body; skip scripts, TOC, etc.
    main = soup.find("div", {"id": "MAIN"})
    term_search = re.compile(
        r"^(|.*?[\s(\"'])(%s)(|[\s.?!\"',)].*)$" % term_list, re.MULTILINE)
    if main is None:
        return
    for string in list(main.strings):
        if string.parent.name == "code":
            continue  # skip code fragments
        if string.parent.name == "a":
            continue  # skip hyperlinks
        # TODO: can I do this smarter by explicitly tagging td elements for
        # the dictionary?  The code seems to work, though I want to make
        # sure that this isn't hurting build perf.
        skipit = False
        for parent in string.parents:
            if parent is None:
                continue
            if parent.attrs is None:
                continue
            if "id" in parent.attrs and parent.attrs["id"] == "table-glossary":
                skipit = True
                break
            if "class" in parent.attrs and (
                    "definition_popup" in parent.attrs["class"]):
                # Skip hyperlinks for the popup windows
                skipit = True
                break
        if skipit:
            continue

        found_match = False
        text = unicode(string)
        while True:
            m = term_search.search(text)
            if not m:
                break
            found_match = True
            string.insert_before(bs4.NavigableString(m.group(1)))
            # Replace the term search with the rest of the string so we can
            # iterate on terms.
            text = m.group(3)
            if self.visible:
                ahref = soup.new_tag("a", href='#glossary_%s' % m.group(2))
                ahref.string = m.group(2)
                ahref.attrs["class"] = "glossary_link"
                ahref.attrs["data-show"] = "define_%s" % m.group(2)
                string.insert_before(ahref)
            else:
                span = soup.new_tag("span")
                span.string = m.group(2)
                span.attrs["class"] = "glossary_link"
                span.attrs["data-show"] = "define_%s" % m.group(2)
                string.insert_before(span)
        if found_match:
            # Put whatever remains at the end and extract the original
            # string.
            string.insert_before(bs4.NavigableString(text))
            string.extract()
def _createHtml(self, soup):
    child = self.getChild().createHtml(soup)
    assert assertType(child, list)
    return ([bs4.NavigableString(f"{{{{#{self.field}}}}}")]
            + child
            + [bs4.NavigableString(f"{{{{/{self.field}}}}}")])
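The quadruple braces in the f-strings above escape down to literal double braces, producing Mustache-style section markers; for a hypothetical field name:

field = "Tags"  # hypothetical field name
assert f"{{{{#{field}}}}}" == "{{#Tags}}"
assert f"{{{{/{field}}}}}" == "{{/Tags}}"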
def parse(args, target, soup, workdir, out_dir):
    source_info = {"type": "script", "file": __file__, "children": []}
    tile_data = []
    for filename_raw in args[1:]:
        # A filename may carry a '#'-suffixed list of IDs (or ID ranges)
        # selecting which items to use; see the sketch after this function.
        parts = filename_raw.split("#")
        filename_actual = parts[0]
        if len(parts) > 1:
            use_ids_raw = parts[1].split(",")
        else:
            use_ids_raw = None
        data_items = {}
        filename = os.path.join(workdir, filename_actual)
        with open(filename, "r") as f:
            data = json.load(f)
            for item in data:
                if use_ids_raw is not None:
                    data_items[item["id"]] = item
                else:
                    tile_data.append(item)
        if use_ids_raw is not None:
            for n in use_ids_raw:
                if n.count("-") == 1:
                    # Expand "min-max" (or "max-min" for reverse order).
                    bits = n.split("-")
                    min_ = int(bits[0])
                    max_ = int(bits[1])
                    if max_ >= min_:
                        for i in range(min_, max_ + 1):
                            tile_data.append(data_items[i])
                    else:
                        for i in range(min_, max_ - 1, -1):
                            tile_data.append(data_items[i])
                else:
                    tile_data.append(data_items[int(n)])
        source_info["children"].append({
            "type": "source",
            "file": filename,
            "children": []
        })

    nav = soup.new_tag("div")
    nav["class"] = "pagelist-nav"
    target.append(nav)

    for item in tile_data:
        entry = soup.new_tag("div")
        entry["class"] = "pagelist-entry"

        # Clickable title
        title = soup.new_tag("div")
        title["class"] = "pagelist-entry-title"
        title_a = soup.new_tag("a", href=item["click"])
        title_a.string = item["title"]
        title.append(title_a)
        entry.append(title)

        # Tag and link bubbles
        if "tags" in item:
            tag_container = soup.new_tag("div")
            tag_container["class"] = "pagelist-tag-container"
            for name, text in item["tags"]:
                tag_el = soup.new_tag("span")
                tag_el["class"] = "tag"
                tag_el["data-name"] = name
                if name in tag_colors:
                    tag_el["style"] = ("background-color: "
                                       + tag_colors[name][0] + "; color: "
                                       + tag_colors[name][1] + ";")
                tag_el.string = text
                tag_container.append(tag_el)
                tag_container.append(bs4.NavigableString(" "))
            entry.append(tag_container)

        # Body
        if "body" in item:
            body = soup.new_tag("div")
            body["class"] = "pagelist-entry-desc"
            body_content = bs4.BeautifulSoup(item["body"], "lxml")
            body.extend(body_content.body.contents)
            entry.append(body)

        target.append(entry)
    return source_info
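The `#`-suffix ID selector expanded above can be sketched standalone; `expand_ids` is a hypothetical helper that mirrors the loop in `parse`:

def expand_ids(spec):
    out = []
    for n in spec.split(","):
        if n.count("-") == 1:
            lo, hi = (int(b) for b in n.split("-"))
            step = 1 if hi >= lo else -1
            out.extend(range(lo, hi + step, step))
        else:
            out.append(int(n))
    return out

assert expand_ids("2,5-3") == [2, 5, 4, 3]  # ranges may run in reverse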
def fetch(url, proxies=None, timeout=20):
    fetch_start_time = time.time()
    if proxies is None:
        proxies = {}
    resp = requests.get(url, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(resp.content, 'html5lib')
    h1 = soup.h1
    if h1 is None:
        return False, 'failed to find title'
    h1_hidden_part = h1.find(class_='u-hiddenVisually')
    if h1_hidden_part:
        h1_hidden_part.decompose()
    name = h1.text.strip()
    at_raw = soup.h2.text.strip()
    at = at_raw[1:]
    # if not at_raw.startswith('@'):
    #     return False, 'failed to extract at: {}'.format(at_raw)
    item_container = soup.find(id='stream-items-id')
    # if item_container is None:
    #     return False, 'failed to get item container'
    items = item_container.find_all('li', recursive=False)
    index = None
    items_parsed = []
    for index, item in enumerate(items):
        fullname = item.find(class_='fullname').text.strip()
        username = item.find(class_='username').text.strip().replace('@', '')
        is_retweet = item.find(class_='js-retweet-text') is not None
        # link
        a_link = item.find('a', class_=['twitter-timeline-link', 'u-hidden'])
        link = a_link.get('href')
        a_link.decompose()
        # time
        time_node = item.find(lambda tag: 'data-time-ms' in tag.attrs)
        timestamp_ms = int(time_node.get('data-time-ms'))
        # hashtag
        for hashtag in item.find_all('a', class_='twitter-hashtag'):
            hash_text = hashtag.text.strip()
            hash_relative_href = hashtag.get('href')
            hash_link = urllib.parse.urljoin('http://twitter.com/',
                                             hash_relative_href)
            new_tag = soup.new_tag('a', href=hash_link, target='_blank',
                                   rel='noopener')
            new_tag.string = hash_text
            hashtag.replace_with(new_tag)
        # emoji
        for emoji_img in item.find_all('img',
                                       class_=('Emoji', 'Emoji--forText')):
            alt = emoji_img.get('alt')
            if alt:
                text_tag = bs4.NavigableString(alt)
                emoji_img.replace_with(text_tag)
        # atreply
        for atreply in item.find_all('a', class_='twitter-atreply'):
            href = atreply.get('href')
            reply_link = urllib.parse.urljoin('http://twitter.com/', href)
            reply_text = atreply.text.strip()
            new_tag = soup.new_tag('a', href=reply_link, target='_blank',
                                   rel='noopener')
            new_tag.string = reply_text
            atreply.replace_with(new_tag)
        text_container = item.find(class_='js-tweet-text-container')
        content = str(text_container)
        content_md = html2text.html2text(content, bodywidth=0).rstrip()
        items_parsed.append({
            'id': link.split('/')[-1],
            'name': fullname,
            'at': username,
            'timestamp_ms': timestamp_ms,
            'content_md': content_md,
            'content': content,
            'link': link,
            'retweet': is_retweet,
        })
    fetch_end_time = time.time()
    fetch_duration = fetch_end_time - fetch_start_time
    return {
        'url': url,
        'name': name,
        'at': at,
        'items': items_parsed,
        '_fetch_start_time': fetch_start_time,
        '_fetch_end_time': fetch_end_time,
        '_fetch_duration': fetch_duration,
    }
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove the doctype declaration.
    for x in soup.contents:
        if isinstance(x, bs4.Doctype):
            x.extract()

    # Remove any XML declarations.
    for x in soup.contents:
        if isinstance(x, bs4.Declaration):
            x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
        imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
        script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
        script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
        html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
        if html:
            ns = soup.new_tag('style')
            ns.append(bs4.NavigableString(html))
            style.replaceWith(ns)
        else:
            style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if html:
            tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
            assert len(tmp) == 1
            stylesheet_link.replaceWith(tmp[0])
        else:
            stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
        comments = soup.findAll(
            text=lambda text: isinstance(text, bs4.Comment))
        for comment in comments:
            comment.extract()

    if prettify:
        return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()
def redact_link(self, link):
    # Strip identifying attributes and replace the link text.
    for attr in ("href", "title"):
        if attr in link.attrs:
            del link.attrs[attr]
    link.contents = [bs4.NavigableString("[redacted]")]
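A usage sketch for `redact_link` (assumes only `bs4`); the dummy holder class stands in for the real one, which is not shown here:

import bs4

class _Redactor:
    redact_link = redact_link  # reuse the function above as a method

soup = bs4.BeautifulSoup('<a href="https://example.com" title="t">me</a>',
                         'html.parser')
_Redactor().redact_link(soup.a)
assert str(soup) == '<a>[redacted]</a>'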
def parse(args, target, soup, workdir, out_dir):
    source_info = {"type": "script", "file": __file__, "children": []}
    tile_data = []
    for filename_raw in args[1:]:
        # A filename may carry a '#'-suffixed list of IDs (or ID ranges)
        # selecting which items to use, as in the pagelist parser above.
        parts = filename_raw.split("#")
        filename_actual = parts[0]
        if len(parts) > 1:
            use_ids_raw = parts[1].split(",")
        else:
            use_ids_raw = None
        data_items = {}
        filename = os.path.join(workdir, filename_actual)
        with open(filename, "r") as f:
            data = json.load(f)
            for item in data:
                if use_ids_raw is not None:
                    data_items[item["id"]] = item
                else:
                    tile_data.append(item)
        if use_ids_raw is not None:
            for n in use_ids_raw:
                if n.count("-") == 1:
                    bits = n.split("-")
                    min_ = int(bits[0])
                    max_ = int(bits[1])
                    if max_ >= min_:
                        for i in range(min_, max_ + 1):
                            tile_data.append(data_items[i])
                    else:
                        for i in range(min_, max_ - 1, -1):
                            tile_data.append(data_items[i])
                else:
                    tile_data.append(data_items[int(n)])
        source_info["children"].append({
            "type": "source",
            "file": filename,
            "children": []
        })

    for item in tile_data:
        entry = soup.new_tag("div", id=("post." + str(item["id"])))
        entry["class"] = "tile"

        # Clickable image
        img = soup.new_tag("img")
        img["class"] = "tile-image"
        if "image" in item:
            real_image_loc = os.path.join(workdir, item["image"])
            img["alt"] = "Depiction of '" + item["title"] + "'"
        else:
            real_image_loc = "images/projects/placeholder.png"
            img["alt"] = "Placeholder image"
        img["src"] = os.path.relpath(real_image_loc, start=out_dir)
        entry_image = Image.open(os.path.normpath(real_image_loc))
        img["width"], img["height"] = entry_image.size
        if "click" in item:
            img_a = soup.new_tag("a", href=item["click"])
            img_a.append(img)
            entry.append(img_a)
        else:
            entry.append(img)

        # Date
        if "time" in item:
            when = datetime.utcfromtimestamp(item["time"])
            when_str = when.strftime("%b %e, %Y")
            when_div = soup.new_tag("div")
            when_div["class"] = "tile-date"
            when_div.string = when_str
            entry.append(when_div)

        # Title
        title = soup.new_tag("div")
        title["class"] = "tile-title"
        title.string = item["title"]
        entry.append(title)

        # Tag and link bubbles
        if ("tags" in item) or ("links" in item):
            tag_container = soup.new_tag("div")
            tag_container["class"] = "tile-tag-container"
            if "tags" in item:
                for name, text in item["tags"]:
                    tag_el = soup.new_tag("span")
                    tag_el["class"] = "tag"
                    if name in tag_colors:
                        tag_el["style"] = ("background-color: "
                                           + tag_colors[name][0] + "; color: "
                                           + tag_colors[name][1] + ";")
                    tag_el.string = text
                    tag_container.append(tag_el)
                    tag_container.append(bs4.NavigableString(" "))
            if "links" in item:
                for props in item["links"]:
                    text = props["text"]
                    dest = props["dest"]
                    link_el = soup.new_tag("a")
                    link_el["class"] = "tag"
                    link_el["href"] = dest
                    icon = soup.new_tag("img")
                    # Material icon (https://material.io/tools/icons/), under
                    # https://www.apache.org/licenses/LICENSE-2.0.html
                    icon["src"] = "../icons/link.png"
                    icon["alt"] = "link: "
                    icon["class"] = "tag-link-icon"
                    link_el.append(icon)
                    text_el = soup.new_tag("span")
                    text_el.string = text
                    link_el.append(text_el)
                    tag_container.append(link_el)
                    tag_container.append(bs4.NavigableString(" "))
            entry.append(tag_container)

        # Body
        body = soup.new_tag("div")
        body["class"] = "tile-body"
        body_content = bs4.BeautifulSoup(item["body"], "lxml")
        body.extend(body_content.body.contents)
        entry.append(body)

        target.append(entry)

    # Add some dummy spacers to make everything display as desired
    for i in range(10):
        spacer = soup.new_tag("div")
        spacer["class"] = "tile-spacer"
        target.append(spacer)

    return source_info