def proc_hr(fig, img):
    """Replace a figure-wrapped divider with either a plain <hr> or an image.

    When MATERIAL_HR is set a bare <hr> is used; otherwise the divider image
    is downloaded (once, cached under dividers/) and an <img> tag keeps the
    remote URL as its src.
    """
    if MATERIAL_HR:
        # Material style: a plain horizontal rule is enough.
        fig.replace_with(bs4.Tag(name="hr"))
        return
    url = "https:" + img["data-src"]
    name = "dividers/{}.png".format(url.split("/")[-1])
    if not os.path.exists(path(name)):
        print("下载分割线", url)
        download(url, name)
    divider = bs4.Tag(name="img", attrs={"src": url, "hr": None})
    fig.replace_with(divider)
def initEmptyMetalink(self):
    """Build an empty metalink XML document containing only a <generator>.

    Side effects: rebinds ``self.xml`` and ``self.metalink``.
    Returns the new BeautifulSoup document.
    """
    root = bs4.Tag(name="metalink")
    root["xmlns"] = "urn:ietf:params:xml:ns:metalink"
    gen = bs4.Tag(name="generator")
    gen.append("downloaders python library")
    # Newlines keep the serialized XML readable.
    for piece in ("\n", gen, "\n"):
        root.append(piece)
    doc = bs4.BeautifulSoup('<?xml version="1.0" encoding="utf-8"?>', "xml")
    doc.append(root)
    self.metalink = root
    self.xml = doc
    return doc
def parse_content(content):
    """Collect glossary definitions from every <dl> in the rendered content.

    Mutates ``content._content`` (re-serialized soup with anchors inserted)
    and appends the found records to ``Definitions.definitions``.
    """
    soup = bs4.BeautifulSoup(content._content, "html.parser")
    for def_list in soup.find_all("dl"):
        defns = []
        for def_title in def_list.find_all("dt"):
            if def_title.text not in Definitions.exclude:
                anchor_name = make_anchor(def_title)
                anchor_tag = bs4.Tag(name="a", attrs={"name": anchor_name})
                # NOTE(review): inserts the anchor one position *before* the
                # <dl> (index - 1); if the <dl> is its parent's first child
                # this is -1, which bs4 treats as "before the last child" —
                # confirm this placement is intended.
                index = def_list.parent.index(def_list) - 1
                def_list.parent.insert(index, anchor_tag)
                defns.append(
                    {
                        "title": make_title(def_title),
                        "definition": make_def(def_title),
                        "anchor": anchor_name,
                        "source": content,
                    }
                )
        # Cross-link every definition in this list to its siblings.
        for defn in defns:
            defn["see_also"] = [d for d in defns if d is not defn]
        Definitions.definitions += defns
    content._content = str(soup)
def insertRow(self, index=None):
    # Insert a new empty <tr>.  `index' is the 0-based insertion position;
    # -1 (or the current row count) appends.  The parameter is mandatory in
    # Firefox and Opera but optional elsewhere: when omitted, IE appends (-1)
    # while Chrome and Safari prepend (0).
    if index is None:
        if log.ThugOpts.Personality.isIE():
            index = -1
        if log.ThugOpts.Personality.isChrome() or log.ThugOpts.Personality.isSafari():
            index = 0

    row = HTMLTableRowElement(self.doc, BeautifulSoup.Tag(self.doc, name='tr'))

    should_append = index in (-1, len(self._rows))
    if should_append:
        self.rows.nodes.append(row)
    else:
        self.rows.nodes.insert(index, row)
    return row
def __collectTextElements(self):
    """Return all elements containing parts of chapter text (which may be
    <p>aragraphs, <div>isions or plain text nodes) under a single root."""
    # Chapter text lives under <div itemprop="articleBody">.
    starter = self._document.find('div', {'itemprop': 'articleBody'})
    if starter is None:
        # FIXME: This will occur if the method is called more than once.
        # The reason is elements appended to `root' are removed from the document.
        # BS 4.4 implements cloning via `copy.copy()', but supporting it for BS 4.3
        # would be error-prone (due to relying on BS internals) and is not needed.
        if self._textElement:
            _logger.debug(
                u"You may not call this function more than once!")
        raise ParsingError(u'Failed to locate text.')
    collection = [starter]
    for element in starter.childGenerator():
        if element is None:
            break
        collection.append(element)
    # Re-parent everything under a detached <td>; appending *moves* the nodes
    # out of the original document (see FIXME above).
    root = bs4.Tag(name='td')
    for element in collection:
        root.append(element)
    if self._configuration['excludeEditorSignature']:
        root = self._excludeEditorSignature(root)
    return root
def createCaption(self):
    """Return the table's <caption> element, creating it on first access."""
    if not self._caption:
        tag = BeautifulSoup.Tag(self.doc, name='caption')
        self._caption = HTMLTableCaptionElement(self.doc, tag)
    return self._caption
def createTHead(self):
    """Return the table's <thead> section, creating and registering it lazily."""
    if not self._tHead:
        tag = BeautifulSoup.Tag(self.doc, name='thead')
        self._tHead = HTMLTableSectionElement(self.doc, tag)
        # The header always becomes the first entry in the rows collection.
        self.rows.nodes.insert(0, self._tHead)
    return self._tHead
def parse_content(content):
    """Harvest definition-list entries from the content and register them.

    Rewrites ``content._content`` with anchor tags inserted and extends
    ``Definitions.definitions`` with one record per accepted <dt>.
    """
    soup = bs4.BeautifulSoup(content._content, 'html.parser')
    for def_list in soup.find_all('dl'):
        entries = []
        for def_title in def_list.find_all('dt'):
            if def_title.text in Definitions.exclude:
                continue
            anchor_name = make_anchor(def_title)
            anchor = bs4.Tag(name="a", attrs={'name': anchor_name})
            position = def_list.parent.index(def_list) - 1
            def_list.parent.insert(position, anchor)
            entries.append({
                'title': make_title(def_title),
                'definition': make_def(def_title),
                'anchor': anchor_name,
                'source': content
            })
        # Every entry cross-references its siblings from the same <dl>.
        for entry in entries:
            entry['see_also'] = [other for other in entries if other is not entry]
        Definitions.definitions += entries
    content._content = str(soup)
def createTFoot(self):
    """Return the table's <tfoot> section, creating and registering it lazily."""
    if not self._tFoot:
        tag = BeautifulSoup.Tag(self.doc, name='tfoot')
        self._tFoot = HTMLTableSectionElement(self.doc, tag)
        # The footer always goes at the end of the rows collection.
        self.rows.nodes.append(self._tFoot)
    return self._tFoot
def insertRow(self, index = None):
    # Insert a new empty row in the table. The new row is inserted immediately before
    # and in the same section as the current indexth row in the table. If index is -1
    # or equal to the number of rows, the new row is appended. In addition, when the
    # table is empty the row is inserted into a TBODY which is created and inserted
    # into the table.
    # `index' specifies the position of the row to insert (starts at 0). The value of
    # -1 can also be used; which result in that the new row will be inserted at the
    # last position. This parameter is required in Firefox and Opera, but optional in
    # Internet Explorer, Chrome and Safari. If this parameter is omitted, insertRow()
    # inserts a new row at the last position in IE and at the first position in Chrome
    # and Safari.
    if index is None:
        if log.ThugOpts.Personality.isIE():
            index = -1
        if log.ThugOpts.Personality.isChrome() or log.ThugOpts.Personality.isSafari():
            index = 0

    # PLEASE REVIEW ME!
    if not len(self.tBodies):
        # Empty table: create the implicit <tbody> the row will live in.
        tBody = HTMLTableSectionElement(self.doc, BeautifulSoup.Tag(self.doc, name = 'tbody'))
        self.tBodies.nodes.append(tBody)

        if self.tFoot is None:
            self.rows.nodes.append(tBody)
        else:
            # NOTE(review): insert(-2) places the tbody before the
            # *second-to-last* node; confirm this is the intended position
            # relative to the existing <tfoot>.
            self.rows.nodes.insert(-2, tBody)
    else:
        # Rows go into the last existing <tbody>.
        tBody = self.tBodies[-1]

    row = tBody.insertRow(index)
    return row
def setter(self, value):
    """Walk the path given by `parts' (a closure variable) under self.doc,
    creating missing child tags along the way, and set the final text value."""
    tag = self.doc
    for part in parts:
        if part == '':
            continue
        elif part == 'text()':
            # Leaf is a text node: replace existing text or append new text.
            if tag.string:
                tag.contents[0] = bs4.NavigableString(value)
            else:
                tag.append(value)
            tag.string = tag.contents[0]
            return
        else:
            child = tag.find(part)
            if not child:
                # NOTE(review): positional bs4.Tag(self.doc, part) passes
                # `part' as the *builder* argument in modern bs4 — confirm
                # this matches the bs4 version this code targets.
                child = bs4.Tag(self.doc, part)
                tag.append(child)
            tag = child
    # No 'text()' leaf encountered: append the value to the last tag reached.
    tag.append(value)
async def get_information(cls, href):
    """
    Gets information for the given search result.

    Fetches ``base_cppr + href`` and scrapes the page, returning a tuple of
    (url, page title, code taster snippets, declaration header text,
    joined plain-text description).
    """
    url = base_cppr + href
    conn = await cls.acquire_http()
    response = await conn.get(url)

    # Make soup.  Pass an explicit parser: omitting it makes bs4 guess
    # (GuessedAtParserWarning) and the chosen parser — hence the parse tree —
    # can vary between environments.
    bs = bs4.BeautifulSoup(await response.text(), features="html.parser")

    header = bs.find(name="tr", attrs={"class": "t-dsc-header"})
    header = header.text if header else ""

    taster_tbl: bs4.Tag = bs.find(name="table", attrs={"class": "t-dcl-begin"})
    if taster_tbl:
        tasters = taster_tbl.find_all(
            name="span",
            attrs={"class": lambda c: c is not None and "mw-geshi" in c},
        )
        if tasters:
            # Fixes some formatting: strip trailing whitespace per line and
            # collapse blank lines.
            for i, taster in enumerate(tasters):
                lines = taster.text.split("\n")
                joined = "\n".join(t.rstrip() for t in lines)
                tasters[i] = joined.replace("\n\n", "\n")
        # Remove tasters from DOM
        taster_tbl.replace_with(bs4.Tag(name="empty"))
    else:
        tasters = []

    h1 = bs.find(name="h1").text

    # Get the description: join paragraph text, skipping lead-in lines that
    # end with ':' and parenthesised asides.
    desc = bs.find(name="div", attrs={"id": "mw-content-text"})
    if desc:
        description = "\n".join(
            p.text
            for p in desc.find_all(name="p")
            if not p.text.strip().endswith(":")
            and not p.text.strip().startswith("(")
            and not p.text.strip().endswith(")")
        )
    else:
        description = ""

    return url, h1, tasters, header, description
def createElement(self, tagname):
    """Create a DOM element wrapping a new bs4 Tag named `tagname'."""
    # bs4.Tag(builder = None)
    tag = bs4.Tag(self.doc, None, tagname)
    element = DOMImplementation.createHTMLElement(self, tag)
    if self.onCreateElement:
        self.onCreateElement(element)
    return element
def createElement(self, tagname):
    """Wrap a freshly created, parentless bs4 Tag in an HTML element node."""
    new_tag = bs4.Tag(parent=None, name=tagname)
    element = DOMImplementation.createHTMLElement(self, new_tag)
    if self.onCreateElement:
        # Fire the optional creation hook before returning.
        self.onCreateElement(element)
    return element
def __init__(self, doc, parent, attr):
    """Attribute node: `doc' is the owner document, `parent' the owning
    element and `attr' the attribute name."""
    self.doc = doc
    self.parent = parent
    self.attr = attr
    # Backing bs4 tag used purely as storage for the attribute node.
    self.tag = BeautifulSoup.Tag(parser=self.doc, name='attr')
    Node.__init__(self, doc)
    # Cache the current value via the accessor.
    self._value = self.getValue()
def generateMetalinkFileNodeFromTarget(self, target):
    """Return the <file> node for `target', creating and appending it (plus
    its <url> children) to the metalink document if not already present.
    """
    fileName = target.fsPath.name
    # Look up an existing <file name="..."> entry.  The previous
    # select_one("file", name=fileName) call passed `name' as an unexpected
    # keyword to the CSS-selector backend instead of filtering on the
    # attribute; find() with an attrs filter does the intended lookup.
    file = self.metalink.find("file", attrs={"name": fileName})
    if file is None:
        file = bs4.Tag(name="file")
        file["name"] = fileName
        self.metalink.append("\n")
        self.metalink.append(file)
        self.metalink.append("\n")
    # Append one <url> child per mirror URI.
    for uri in target.uris:
        url = bs4.Tag(name="url")
        url.append(uri)
        file.append("\n")
        file.append(url)
        file.append("\n")
    return file
def __init__(self, doc, parent, attr):
    """Attribute node: `doc' is the owner document, `parent' the owning
    element and `attr' the attribute name."""
    self.doc = doc
    self.parent = parent
    self.attr = attr
    # Backing bs4 tag acting as the attribute's storage container.
    self.tag = bs4.Tag(parser = self.doc, name = 'attr')
    Node.__init__(self, doc)
    # DOM `specified' flag: not explicitly set by the parser yet.
    self._specified = False
    self._value = self.getValue()
def fixMetalink(meta4Text: str) -> "bs4.BeautifulSoup":
    """This function is licensed under Unlicense license"""
    # Upgrade mirror URIs to HTTPS, then guarantee both a torrent metaurl and
    # a top-priority magnet link at the head of the URL list.
    doc = bs4.BeautifulSoup(meta4Text, "xml")
    fileEl = doc.select_one("file")
    urlEls = list(fileEl.select("url"))

    for el in urlEls:
        el.string = fixHTTPS(el.string)

    if not fileEl.select("metaurl[mediatype=torrent]"):
        metaurl = bs4.Tag(name="metaurl")
        metaurl.attrs["mediatype"] = "torrent"
        metaurl.string = uris["torrent"]
        urlEls[0].insert_before(metaurl)

    magnet = bs4.Tag(name="url")
    magnet.attrs["priority"] = "0"
    magnet.string = ourGet(uris["magnet"]).text.strip()
    urlEls[0].insert_before(magnet)
    return doc
def createElement(self, tagname, tagvalue = None):
    """Create an HTML element for `tagname', honouring the IE <= 8 quirk of
    accepting markup-style names like '<P>'."""
    from .DOMImplementation import DOMImplementation

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_createelement_count()

    # Internet Explorer 8 and below also support the syntax
    # document.createElement('<P>')
    ie_quirk = (log.ThugOpts.Personality.isIE()
                and log.ThugOpts.Personality.browserMajorVersion < 9)
    if ie_quirk and tagname.startswith('<') and '>' in tagname:
        tagname = tagname[1:].split('>')[0]

    tag = BeautifulSoup.Tag(parser = self.doc, name = tagname)
    return DOMImplementation.createHTMLElement(self, tag)
def createElement(self, tagname, tagvalue = None):
    """Create an HTML element node for `tagname'.

    Handles the IE <= 8 quirk of accepting '<P>'-style names, then fires the
    onCreateElement hook (if any) before returning the element.
    """
    from DOMImplementation import DOMImplementation

    # Internet Explorer 8 and below also support the syntax
    # document.createElement('<P>')
    if log.ThugOpts.Personality.isIE():
        # Compare the *numeric* major version: the previous lexicographic
        # string comparison classified e.g. '10.0' < '9.0' as True, wrongly
        # applying the quirk to IE 10+.
        major = int(log.ThugOpts.Personality.browserVersion.split('.')[0])
        if major < 9 and tagname.startswith('<') and '>' in tagname:
            tagname = tagname[1:].split('>')[0]

    element = DOMImplementation.createHTMLElement(
        self, BeautifulSoup.Tag(parser = self.doc, name = tagname))
    if self.onCreateElement:
        self.onCreateElement(element)

    return element
def embedImages():
    """Inline cid: image attachments in output.html as base64 data URIs.

    Reads output.html, rewrites every <img src="cid:..."> to an embedded
    data: URI, ensures a UTF-8 Content-Type <meta> exists, and writes the
    result back.  In debug mode the original file and the extracted
    attachments are kept on disk.
    """
    if debug:
        shutil.copy('output.html', 'outputOLD.html')
    with open('output.html', 'rb') as fh:
        con = fh.read()
    try:
        bs = BeautifulSoup.BeautifulSoup(con.decode('utf8'), features="lxml")
    except UnicodeDecodeError:
        # Fall back to a legacy Windows encoding for malformed exports.
        bs = BeautifulSoup.BeautifulSoup(con, fromEncoding='windows-1252', features="lxml")

    if bs.find('meta', {'http-equiv': 'Content-Type'}) is None:
        bs.find('head').insert(
            1,
            BeautifulSoup.Tag(parser=bs, name='meta', attrs={
                'http-equiv': 'Content-Type',
                'content': 'text/html; charset=utf-8'
            }))

    # Collect only images that reference an attachment by Content-ID.
    # (The original compared src against the *string* 'None', so a missing
    # src attribute — a real None — slipped through and crashed at len(h).)
    tags = []
    for img in bs.findAll('img'):
        src = img.get('src')
        if src is not None and len(src) > 3 and src[0:4] == 'cid:':
            tags.append(img)

    for img in tags:
        src = img['src']
        with open(src[4:], 'rb') as emb:
            stream = emb.read()
        img['src'] = 'data:image;base64,' + base64.b64encode(stream).decode('utf8')
        if not debug:
            os.remove(src[4:])

    with open('output.html', 'wb') as fh:
        fh.write(bs.prettify().encode('utf8'))
def append_to(parent, tag, **kwargs):
    """
    Append an element to the supplied parent.

    :param parent: Parent to append to.
    :param tag: Tag to create.
    :param kwargs: Tag kwargs.
    :return: New element.
    """
    soup = parent.soup if hasattr(parent, "soup") else parent.find_parent("html")
    # Create Tag explicitly instead of using new_tag, otherwise attribute "name"
    # leads to clash with tag-name in bs4
    element = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)
    element.soup = soup
    parent.append(element)
    return element
def createElement(self, tagname, tagvalue=None):
    """Create an HTML element, logging potential iframe-injection attempts
    and honouring the IE <= 8 '<P>'-style tag-name quirk."""
    # zomo: flag suspicious dynamic iframe creation (plain substring test is
    # equivalent to the previous re.search on a literal pattern).
    if 'iframe' in str(tagname).lower():
        log.ThugLogging.add_behavior_warn(
            "[iframe injection: createElement] %s" % str(tagname))

    from DOMImplementation import DOMImplementation

    # Internet Explorer 8 and below also support the syntax
    # document.createElement('<P>')
    if log.ThugOpts.Personality.isIE():
        # Numeric major-version compare: the old lexicographic string test
        # ('10.0' < '9.0' is True) misclassified IE 10+ as pre-9.
        major = int(log.ThugOpts.Personality.browserVersion.split('.')[0])
        if major < 9 and tagname.startswith('<') and '>' in tagname:
            tagname = tagname[1:].split('>')[0]

    element = DOMImplementation.createHTMLElement(
        self, BeautifulSoup.Tag(parser=self.doc, name=tagname))
    if self.onCreateElement:
        self.onCreateElement(element)

    return element
def insertRow(self, index=None):
    # Insert a new empty row in the table.  `index' is the 0-based position;
    # -1 (or the current row count) appends.  The parameter is required in
    # Firefox and Opera but optional in IE, Chrome and Safari: when omitted,
    # insertRow() appends in IE and prepends in Chrome and Safari.
    if index is None:
        if log.ThugOpts.Personality.isIE():
            index = -1
        if log.ThugOpts.Personality.isChrome(
        ) or log.ThugOpts.Personality.isSafari():
            index = 0

    row = HTMLTableRowElement(self.doc, bs4.Tag(self.doc, name='tr'))

    # Append when index is -1 or equal to the current row count: the previous
    # unconditional list.insert(index, row) placed a -1 row *before* the last
    # one instead of appending, contrary to the documented semantics.
    if index in (-1, len(self.rows.nodes)):
        self.rows.nodes.append(row)
    else:
        self.rows.nodes.insert(index, row)
    return row
def __init__(self, doc):
    """Document fragment node backed by a placeholder bs4 tag."""
    self.tag = BeautifulSoup.Tag(parser = doc, name = 'documentfragment')
    Node.__init__(self, doc)
    # Per-personality initialisation (see __init_personality).
    self.__init_personality()
def format_clean_content(title, body, BeautifulSoup):
    # heavily integrated with output of dsl reader plugin!
    # and with xdxf also.
    """
    :param title: str | None
    """
    # Transformation table (applied below, in order):
    # class="sec" => d:priority="2"
    # style="color:steelblue" => class="ex"
    # class="p" style="color:green" => class="p"
    # style="color:green" => class="c"
    # style="margin-left:{}em" => class="m{}"
    # <s> => <del>
    # xhtml is strict
    if BeautifulSoup:
        soup = BeautifulSoup.BeautifulSoup(body, "lxml", from_encoding='utf-8')
        # difference between 'lxml' and 'html.parser'
        if soup.body:
            soup = soup.body
        for tag in soup(class_='sec'):
            tag['class'].remove('sec')
            if not tag['class']:
                del tag['class']
            tag['d:priority'] = "2"
        for tag in soup(lambda x: 'color:steelblue' in x.get('style', '')):
            remove_style(tag, 'color:steelblue')
            if 'ex' not in tag.get('class', []):
                tag['class'] = tag.get('class', []) + ['ex']
        for tag in soup(is_green):
            remove_style(tag, 'color:green')
            # NOTE(review): membership test uses 'p' but appends 'c' — looks
            # asymmetric; confirm against the mapping table above.
            if 'p' not in tag.get('class', ''):
                tag['class'] = tag.get('class', []) + ['c']
        for tag in soup(True):
            if 'style' in tag.attrs:
                m = margin_re.search(tag['style'])
                if m:
                    remove_style(tag, m.group(0))
                    tag['class'] = tag.get('class', []) + ['m' + m.group(1)]
        for tag in soup.select('[href]'):
            href = tag['href']
            if not (href.startswith('http:') or href.startswith('https:')):
                # Internal cross-reference: re-target to the Apple Dictionary
                # URL scheme.
                tag['href'] = 'x-dictionary:d:%s' % href
        for tag in soup('u'):
            tag.name = 'span'
            tag['class'] = tag.get('class', []) + ['u']
        for tag in soup('s'):
            tag.name = 'del'
        if title:
            h1 = BeautifulSoup.Tag(name='h1')
            h1.string = title
            soup.insert(0, h1)
        # hence the name BeautifulSoup
        content = toStr(soup.encode_contents())
    else:
        # somewhat analogue to what BeautifulSoup suppose to do
        body = em0_9_re.sub(em0_9_sub, body)
        body = em0_9_ex_re.sub(em0_9_ex_sub, body)
        body = href_re.sub(href_sub, body)
        body = body \
            .replace('<i style="color:green">', '<i class="c">') \
            .replace('<i class="p" style="color:green">', '<i class="p">') \
            .replace('<span class="ex" style="color:steelblue">', '<span class="ex">') \
            .replace('<span class="sec ex" style="color:steelblue">', '<span class="sec ex">') \
            .replace('<u>', '<span class="u">').replace('</u>', '</span>') \
            .replace('<s>', '<del>').replace('</s>', '</del>')
        # nice header to display
        content = '<h1>%s</h1>%s' % (title, body) if title else body
        # NOTE(review): '<\g<1> />' is a non-raw string; '\g' currently
        # survives unescaped, but raw strings would be the safe spelling.
        content = close_tag.sub('<\g<1> />', content)
        content = img_tag.sub('<img \g<1>/>', content)
        # NOTE(review): the first argument here is presumably U+00A0 (nbsp)
        # being normalised to a plain space — confirm the character encoding.
        content = content.replace(' ', ' ')
        content = nonprintable.sub('', content)
    return content
def createElement(self, tagname, tagvalue=None):
    """Build a DOM HTML element around a new bs4 Tag named `tagname'."""
    tag = bs4.Tag(parser=self.doc, name=tagname)
    return DOMImplementation.createHTMLElement(self, tag)
def to_html(self, *, assets):
    """Render the parsed CML tree rooted at ``self._root`` to an HTML string.

    `assets` is a list of dicts with at least 'id' and 'name' keys; 'asset'
    and 'img' nodes are resolved to their asset names.  The result is cached
    on ``self._html``.
    """
    if self._html is not None:
        return self._html
    asset_by_id = {_['id']: _ for _ in assets}

    def _assetName(id_):
        return asset_by_id[id_]['name']

    html = bs4.BeautifulSoup('', 'lxml')
    # Maps id(source element) -> generated HTML element.
    d = {}

    def _add(e0, e1):
        # Attach e1 under the generated counterpart of e0's nearest mapped
        # ancestor (or the document root when none is mapped).
        parent1 = html
        _e = e0
        while _e is not None:
            if id(_e) in d:
                parent1 = d[id(_e)]
                break
            _e = _e.parent
        if (parent1 is html) and (not _is_tag(e0)):
            # Drop stray top-level text nodes.
            return
        if _is_tag(e0):
            d[id(e0)] = e1
        parent1.append(e1)

    tr = Traversal(self._root)
    for e0 in tr:
        if isinstance(e0, bs4.NavigableString):
            # Text alternates with inline math delimited by '$$'; math spans
            # are marked with hasMath="true".
            _li = str(e0).split('$$')
            hasMath = False
            for _ in _li:
                if not hasMath:
                    _add(e0, _)
                else:
                    _span = bs4.Tag(name='span')
                    _span['hasMath'] = 'true'
                    _span.append(_)
                    _add(e0, _span)
                hasMath = not hasMath
            continue
        if not _is_tag(e0):
            continue
        if e0.name == 'asset':
            assert _has_no_child(e0)
            e1 = bs4.Tag(name='p')
            e1['class'] = 'asset'
            e1.append(_assetName(e0['id']))
        elif e0.name == 'img':
            assert _has_no_child(e0)
            e1 = bs4.Tag(name='img')
            e1['src'] = e1['alt'] = _assetName(e0['assetId'])
            e1['src'] = quote(e1['src'])
        elif e0.name == 'heading':
            e1 = bs4.Tag(name='h%d' % int(e0['level']))
        elif e0.name == 'text':
            e1 = bs4.Tag(name='p')
        elif e0.name == 'list':
            bulletType = e0['bulletType']
            if bulletType == 'numbers':
                e1 = bs4.Tag(name='ol')
                e1['type'] = '1'
            elif bulletType == 'bullets':
                e1 = bs4.Tag(name='ul')
            else:
                # Unknown list kind: fall back to an unordered list.
                e1 = bs4.Tag(name='ul')
                logging.warning('[CML] unknown bulletType=%s' % bulletType)
        elif e0.name == 'a':
            e1 = bs4.Tag(name='a')
            e1['href'] = e0['href']
            if e0.get('refid'):
                e1['refid'] = e0['refid']
        elif e0.name == 'code':
            # Code blocks are copied verbatim; children are not traversed.
            e1 = bs4.Tag(name='pre')
            e1.append(copy.copy(e0))
            tr.skip_children()
        elif e0.name in [
            'li', 'strong', 'em', 'u', 'table', 'tr', 'td', 'th', 'sup', 'sub'
        ]:
            e1 = bs4.Tag(name=e0.name)
        elif e0.name in ['co-content']:
            # Container node: descend without emitting anything.
            continue
        else:
            logging.warning('[CML] unknown e0.name=%s\n%s' % (e0.name, e0))
            continue
        _add(e0, e1)
    self._html = str(html)
    return self._html
async def get_information(self, ctx, href):
    """
    Gets information for the given search result.

    Downloads ``base_cppr + href`` and returns a tuple of
    (url, page title, code taster snippets, declaration header text,
    joined plain-text description).  Sends a transient timing message to the
    invoking context.
    """
    url = base_cppr + href
    async with self.acquire_http_session() as conn:
        with algorithms.TimeIt() as timer:
            async with conn.get(url) as resp:
                self.logger.info("GET %s", url)
                resp.raise_for_status()
                # Make soup.
                bs = bs4.BeautifulSoup(await resp.text(), features="html.parser")
        await ctx.send(f"Response from server took {timer.time_taken * 1_000:,.2f}ms", delete_after=3)

    header = bs.find(name="tr", attrs={"class": "t-dsc-header"})
    if header:
        header = header.text
    else:
        header = ""

    taster_tbl: bs4.Tag = bs.find(name="table", attrs={"class": "t-dcl-begin"})
    if taster_tbl:
        tasters = taster_tbl.find_all(name="span",
                                      attrs={"class": lambda c: c is not None and "mw-geshi" in c})
        if tasters:
            # Fixes some formatting
            for i, taster in enumerate(tasters):
                taster = taster.text.split("\n")
                taster = "\n".join(t.rstrip() for t in taster)
                taster = taster.replace("\n\n", "\n")
                tasters[i] = taster
        # Remove tasters from DOM
        taster_tbl.replace_with(bs4.Tag(name="empty"))
    else:
        tasters = []

    h1 = bs.find(name="h1").text

    # Get the description
    desc = bs.find(name="div", attrs={"id": "mw-content-text"})
    if desc:
        # first_par_node = desc.find(name='p')
        # description = first_par_node.text + '\n'
        # sibs = first_par_node.find_next_siblings()
        # for sib in sibs:
        #     description += sib.text + '\n'
        description = "\n".join(
            p.text
            for p in desc.find_all(name="p")
            if not p.text.strip().endswith(":")
            and not p.text.strip().startswith("(")
            and not p.text.strip().endswith(")")
        )
    else:
        description = ""

    return url, h1, tasters, header, description
def format_clean_content(title, body, BeautifulSoup):
    # heavily integrated with output of dsl reader plugin!
    # and with xdxf also.
    """
    :param title: str | None
    """
    # Transformation table (applied below, in order):
    # class="sec" => d:priority="2"
    # style="color:steelblue" => class="ex"
    # class="p" style="color:green" => class="p"
    # style="color:green" => class="c"
    # style="margin-left:{}em" => class="m{}"
    # <s> => <del>
    # xhtml is strict
    if BeautifulSoup:
        soup = BeautifulSoup.BeautifulSoup(body, "lxml", from_encoding="utf-8")
        # difference between "lxml" and "html.parser"
        if soup.body:
            soup = soup.body
        for tag in soup(class_="sec"):
            tag["class"].remove("sec")
            if not tag["class"]:
                del tag["class"]
            tag["d:priority"] = "2"
        for tag in soup(lambda x: "color:steelblue" in x.get("style", "")):
            remove_style(tag, "color:steelblue")
            if "ex" not in tag.get("class", []):
                tag["class"] = tag.get("class", []) + ["ex"]
        for tag in soup(is_green):
            remove_style(tag, "color:green")
            # NOTE(review): membership test uses "p" but appends "c" — looks
            # asymmetric; confirm against the mapping table above.
            if "p" not in tag.get("class", ""):
                tag["class"] = tag.get("class", []) + ["c"]
        for tag in soup(True):
            if "style" in tag.attrs:
                m = margin_re.search(tag["style"])
                if m:
                    remove_style(tag, m.group(0))
                    tag["class"] = tag.get("class", []) + ["m" + m.group(1)]
        for tag in soup.select("[href]"):
            href = tag["href"]
            if href.startswith("bword://"):
                # Strip the GoldenDict-style scheme before re-targeting.
                href = href[len("bword://"):]
            if not (href.startswith("http:") or href.startswith("https:")):
                # Internal cross-reference: re-target to the Apple Dictionary
                # URL scheme.
                tag["href"] = "x-dictionary:d:%s" % href
        for tag in soup("u"):
            tag.name = "span"
            tag["class"] = tag.get("class", []) + ["u"]
        for tag in soup("s"):
            tag.name = "del"
        if title:
            h1 = BeautifulSoup.Tag(name="h1")
            h1.string = title
            soup.insert(0, h1)
        # hence the name BeautifulSoup
        content = toStr(soup.encode_contents())
    else:
        # somewhat analogue to what BeautifulSoup suppose to do
        body = em0_9_re.sub(em0_9_sub, body)
        body = em0_9_ex_re.sub(em0_9_ex_sub, body)
        body = href_re.sub(href_sub, body)
        body = body \
            .replace('<i style="color:green">', '<i class="c">') \
            .replace('<i class="p" style="color:green">', '<i class="p">') \
            .replace('<span class="ex" style="color:steelblue">', '<span class="ex">') \
            .replace('<span class="sec ex" style="color:steelblue">', '<span class="sec ex">') \
            .replace('<u>', '<span class="u">').replace('</u>', '</span>') \
            .replace('<s>', '<del>').replace('</s>', '</del>')
        # nice header to display
        content = "<h1>%s</h1>%s" % (title, body) if title else body
        # NOTE(review): "<\g<1> />" is a non-raw string; "\g" currently
        # survives unescaped, but raw strings would be the safe spelling.
        content = close_tag.sub("<\g<1> />", content)
        content = img_tag.sub("<img \g<1>/>", content)
        # NOTE(review): the first argument here is presumably U+00A0 (nbsp)
        # being normalised to a plain space — confirm the character encoding.
        content = content.replace(" ", " ")
        content = nonprintable.sub("", content)
    return content