def remove_empty_lines(html):
    key = '%s:remove_empty_lines' % hash(html)
    out = cache.get(key, namespace="filters")
    if out:
        return out
    if '</' in html:
        html = html.strip().replace('\n', '')
        # Explicit parser avoids bs4's "no parser specified" warning.
        soup = BeautifulSoup(html, 'html.parser')
        lines = []
        for element in soup.contents:
            if isinstance(element, Tag):
                if element.text:
                    lines.append(str(element).strip())
                elif 'br' in str(element):
                    lines.append('\n')
            elif isinstance(element, NavigableString):
                lines.append(str(element).strip())
        out = ''.join(lines).strip()
        while '\n\n' in out:
            out = out.replace('\n\n', '\n')
    else:
        out = '\n'.join([line for line in html.split('\n') if line.strip()])
    cache.set(key, out, namespace="filters")
    return out
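# A minimal sketch of the cache interface remove_empty_lines() assumes: an
# object with get()/set() taking a `namespace` keyword. The real backend is
# not part of this snippet, so this stand-in is purely hypothetical.
class _DictCache:
    def __init__(self):
        self._data = {}

    def get(self, key, namespace=""):
        return self._data.get((namespace, key))

    def set(self, key, value, namespace=""):
        self._data[(namespace, key)] = value

cache = _DictCache()
# With that in place, remove_empty_lines('<p>hi</p><br/><p></p>') keeps the
# non-empty top-level nodes and collapses blank lines; repeat calls for the
# same input are served from the cache.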
def get_text(msg):
    text = ""
    if msg.is_multipart():
        html = None
        for part in msg.get_payload():
            if part.get_content_charset() is None:
                charset = chardet.detect(str(part))['encoding']
            else:
                charset = part.get_content_charset()
            if part.get_content_type() == 'text/plain':
                text = decode_str(' '.join(part.get_payload().split('\n')) + "\n")
            if part.get_content_type() == 'text/html':
                html = str(part.get_payload(decode=True), str(charset), "ignore")
        if html is None:
            return text.strip()
        else:
            msg_data = lxml.html.document_fromstring(html.strip())
            return str("\n".join(etree.XPath("//text()")(msg_data)))
    elif msg.get_content_type() == 'text/plain':
        # Non-multipart message: inspect the message itself (there is no
        # `part` outside the loop above).
        text = decode_str(' '.join(msg.get_payload().split('\n')) + "\n")
    ret = "\n".join([ll.rstrip() for ll in text.splitlines() if ll.strip()])
    return ret.strip()
def strip_wrapping(html):
    """
    Removes the wrapping that might have resulted when using get_html_tree().
    """
    if html.startswith('<div>') and html.endswith('</div>'):
        html = html[5:-6]
    return html.strip()
def markdown(url):
    """
    http://mikelev.in/2014/01/stripping-html-text-markdown-readability/
    """
    html = url
    if checkurl(url):
        html = gethtml(url)
    if not html:
        return None
    html = barebones(html)
    # Map block-level tags onto their rough Markdown equivalents.
    replacements = [
        ('<p>', '\n'), ('</p>', ''),
        ('<hr>', '\n---\n'),
        ('<blockquote>', '\n> '), ('</blockquote>', ''),
        ('<h1>', '\n# '), ('<h2>', '\n## '), ('<h3>', '\n### '),
        ('<h4>', '\n#### '), ('<h5>', '\n##### '), ('<h6>', '\n###### '),
        ('</h1>', ''), ('</h2>', ''), ('</h3>', ''),
        ('</h4>', ''), ('</h5>', ''), ('</h6>', ''),
        ('<li>', ''), ('</li>', ''),
        ('<ul>', ''), ('<ol>', ''), ('</ul>', ''), ('</ol>', ''),
    ]
    for old, new in replacements:
        html = html.replace(old, new)
    html = lesslines(html)
    html = html.strip()
    return html
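# Hedged usage sketch: markdown() accepts either a URL or raw HTML (checkurl()
# decides which). On a barebones()-cleaned fragment the tag substitutions
# behave roughly like this:
#
#   markdown('<h1>Title</h1><p>First paragraph.</p><hr><p>Second.</p>')
#
# yields something close to:
#
#   # Title
#   First paragraph.
#   ---
#   Second.
#
# Exact whitespace depends on barebones() and lesslines(), which are defined
# elsewhere in this codebase.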
def barebones(url):
    html = url
    if checkurl(url):
        html = gethtml(url)
    if not html:
        return None
    # This chops out the following tags AND all the presumably extraneous content in-between.
    for nuketagblock in ['title', 'head']:
        html = deletenode(html, nuketagblock)
    html = bodycopy(html)
    html = stripcomments(html)
    # Same as above, but a second-pass on the usual code-bloating suspects in between body tags.
    for nuketagblock in ['header', 'footer', 'nav', 'script', 'style',
                         'noscript', 'form', 'object', 'embed', 'select']:
        html = deletenode(html, nuketagblock)
    html = stripparams(html)
    html = lowercasetags(html)
    # html = striplists(html)
    html = stripemptyhtml(html)
    html = stripbr(html)
    # This strips out the following tags, but leaves the in-between content in place.
    for nuketag in ['label', 'section', 'article', 'div', 'span', 'img', 'a',
                    'b', 'i', 'param', 'table', 'td', 'tr', 'font', 'title',
                    'head', 'meta', 'strong', 'em', 'iframe']:
        html = deletetag(html, nuketag)
    html = stripwhitespace(html)
    html = stripcrlf(html)
    html = onetagoneline(html)
    html = convert_html_entities(html)
    html = lesslines(html)
    html = html.replace('\n', ' ')
    html = html.replace('  ', ' ')
    html = html.strip()
    return html
def extractHtml(html, selector, type='css', dump=False):
    items = []
    if html.strip() != '':
        try:
            try:
                soup = lxml.html.fromstring(html)
            except ValueError:
                soup = lxml.html.fromstring(html.encode('utf-8'))
            if type == 'css':
                for item in soup.cssselect(selector):
                    item = lxml.etree.tostring(item).decode('utf-8').strip()
                    items.append(item)
            elif type == 'xpath':
                result = soup.xpath(selector)
                result = result if isinstance(result, list) else [result]
                for item in result:
                    if isinstance(item, lxml.etree._Element):
                        item = lxml.etree.tostring(item).decode('utf-8')
                    items.append(str(item).strip())
        except Exception as e:
            items.append('ERROR: ' + str(e))
    return items
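# Example calls for extractHtml(); requires lxml (and the cssselect package
# for CSS mode). Both modes return a list of serialized fragments or text
# values, and parser errors come back as 'ERROR: ...' strings, not raises.
fragment = '<div><p class="lead">Intro</p><p>Body</p></div>'
print(extractHtml(fragment, 'p.lead'))                    # ['<p class="lead">Intro</p>']
print(extractHtml(fragment, '//p/text()', type='xpath'))  # ['Intro', 'Body']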
def get_decoded_email_body(message_body):
    """
    Decode email body. Detect character set if the header is not set.

    We try to get text/plain, but if there is none, fall back to text/html.

    :param message_body: Raw 7-bit message body input e.g. from imaplib.
        Double encoded in quoted-printable and latin-1
    :return: Message body as unicode string
    """
    msg = message_from_string(message_body)
    text = ""
    if msg.is_multipart():
        html = None
        for part in msg.get_payload():
            # logger.debug("%s, %s" % (part.get_content_type(), part.get_content_charset()))
            if part.get_content_charset() is None:
                # We cannot know the character set, so keep the decoded bytes as-is.
                text = part.get_payload(decode=True)
                continue
            charset = part.get_content_charset()
            if part.get_content_type() == 'text/plain':
                text = str(part.get_payload(decode=True), str(charset), "ignore")
            if part.get_content_type() == 'text/html':
                html = str(part.get_payload(decode=True), str(charset), "ignore")
        # Prefer text/plain; fall back to text/html when no plain part was found.
        if text:
            return text.strip()
        if html is not None:
            return html.strip()
        return ""
    text = str(msg.get_payload(decode=True), msg.get_content_charset(), 'ignore')
    return text.strip()
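# Usage sketch: feed a raw RFC 2822 string, e.g. fetched via imaplib.
# message_from_string comes from the stdlib email package.
raw = (
    "Content-Type: text/plain; charset=utf-8\r\n"
    "Content-Transfer-Encoding: quoted-printable\r\n"
    "\r\n"
    "Caf=C3=A9\r\n"
)
print(get_decoded_email_body(raw))  # -> Café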
def FindRSSFeeds():
    # blacklist = ('Pictures', 'Coffee Break', 'Live mag', 'You mag')
    blacklist = ()
    feeds = []
    # page to read the list of rss feeds from
    rss_feed_page = "http://www.dailymail.co.uk/home/rssMenu.html"
    html = ukmedia.FetchURL(rss_feed_page)
    assert html.strip() != ''
    soup = BeautifulSoup(html)
    # look for rss icons, step back to find the links.
    for btn in soup.findAll('span', {'class': "rss-btn rss"}):
        a = btn.find('a')
        if a:
            feed_url = urlparse.urljoin(rss_feed_page, a['href'])
            # could get a more human-readable name, but relative url is good enough
            feed_name = a['href']
            feeds.append((feed_name, feed_url))
    assert len(feeds) > 120  # 168 feeds at time of writing
    return feeds
def format_linebreaks(html=''):
    paragraphs = [
        '<p>%s</p>' % p if not tags_re.match(p) else p
        for p in linebreaks_re.split(html.strip())
        if not whitespace_re.match(p)
    ]
    return ''.join(paragraphs)
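# format_linebreaks() relies on three module-level regexes that are not part
# of this snippet. Plausible definitions (assumptions, not the originals):
import re
linebreaks_re = re.compile(r'\n{2,}')   # split chunks on blank lines
whitespace_re = re.compile(r'^\s*$')    # skip whitespace-only chunks
tags_re = re.compile(r'^\s*<')          # leave chunks that already start with a tag
# With those in place:
#   format_linebreaks('one\n\ntwo')  -> '<p>one</p><p>two</p>'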
def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
    if not html.strip():
        return []

    return [
        (a_tag.text, href)
        for a_tag, _, href, _ in lxml.html.iterlinks(html)
        if a_tag.text and self.link_is_matrix_to_regex.match(unquote(href.strip()))
    ]
def extract(url):
    doc = readability.Document(urllib2.urlopen(url).read())
    title = doc.title()
    html = doc.summary()
    html = lstrip(html, '<html>')
    html = lstrip(html, '<body/>')
    html = rstrip(html, '</html>')
    return '<h1>%s</h1>%s' % (title, html.strip())
def shrink_style(cls, style_str, filtered_css_properties, changed_css_properties):
    if not style_str:
        return None
    properties = {}
    for p in style_str.split(";"):
        if p.strip():
            token = p.split(":")
            if len(token) > 1:
                properties[token[0].strip()] = token[1].strip()
    return Utils._shrink_properties(properties, filtered_css_properties,
                                    changed_css_properties)
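# shrink_style() first parses an inline style string, e.g. 'color: red; margin:0'
# becomes {'color': 'red', 'margin': '0'}, then delegates to
# Utils._shrink_properties(), which is defined elsewhere. Note that values
# containing a further ':' (e.g. url(data:...)) are truncated at the first
# colon by the split above.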
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = html_decode(html)
    return html.strip()
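# clean_html() calls an html_decode() helper that is not shown in this
# snippet. A minimal stand-in (an assumption) using the standard library:
import html as _html

def html_decode(s):
    # Unescape entities such as &amp; and &#39;
    return _html.unescape(s)

# Example: clean_html('line one<br/>line two &amp; three')
#   -> 'line one\nline two & three'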
def shift_headings(html, header_shift=1):
    """Shift every <h1>-<h6> heading in the fragment down by `header_shift` levels."""
    if not html.strip():
        return ""
    dom = lxml.html.fromstring(html)
    for header in dom.cssselect("h1, h2, h3, h4, h5, h6"):
        header.tag = "h{}".format(int(header.tag[1]) + header_shift)
    output = lxml.html.tostring(dom).decode("utf-8")
    # lxml wraps multi-element fragments in a <div>; unwrap it again.
    if output.startswith("<div>") and output.endswith("</div>"):
        output = output[5:-6]
    return output
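# Example: shift_headings('<h1>Top</h1><h2>Sub</h2>') returns
# '<h2>Top</h2><h3>Sub</h3>'. There is no clamp at h6, so shifting an h6
# produces a non-standard h7 tag.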
def strip_tags_parser(self, html):
    """
    Strip HTML tags from the text, using HTMLParser.
    Example:
        str_text = strip_tags("<font color=red>hello</font>")
    :return: String
    """
    from HTMLParser import HTMLParser
    # A single strip() covers the newline/tab/space strips chained here before.
    html = html.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return '$'.join(result)
def __call__(self, html, input_encoding=None, output_encoding=unicode, return_body=False):
    if not isinstance(html, unicode):
        if not input_encoding:
            raise TypeError('Input data must be unicode')
        html = unicode(html, input_encoding)
    html = html.strip()
    if not html:
        return u''

    root = lxml.html.fromstring(html)
    for name in self.transformation_names:
        method = TRANSFORMATIONS.get(name)
        params = dict(
            context=self.context,
            request=getattr(self.context, 'REQUEST', None),
            destdir=self.destdir,
        )
        if method is None:
            raise ValueError('No transformation "%s" registered' % name)

        ts = time.time()
        argspec = inspect.getargspec(method)
        if isinstance(argspec, tuple):
            args = argspec[0]  # Python 2.4
        else:
            args = argspec.args
        if 'params' in args:
            method(root, params)
        else:
            method(root)
        LOG.info('Transformation %-30s: %3.6f seconds' % (name, time.time() - ts))

    if return_body:
        body = root.xpath('//body')[0]
        html_new = body.text + u''.join([
            lxml.html.tostring(b, encoding=output_encoding) for b in body
        ])
    else:
        html_new = lxml.html.tostring(root, encoding=output_encoding)
    # lxml wraps fragments in a <div>; strip that wrapper again.
    if html_new.startswith('<div>') and html_new.endswith('</div>'):
        html_new = html_new[5:-6].strip()
    return html_new.strip()
def termSearch(userinput):
    ## URL Modification for user term search
    ## Set the URL
    url = "http://tlwi.w3-969.ibm.com/standards/terminology/cgi-bin/lookup?ug=corporate&term=" + userinput + "&submit=Search&source=en&target_switch=none&template=simple&db_name=LOGOS&11=main+form&11=acronym~abbreviation&11=prohibited"
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "lxml")
    ## Get Results
    soup = soup.find_all('ol')
    ## Delete Reference Links and Convert
    clean = bleach.clean(soup, tags=['ol', 'br', 'li', 'p'], strip=True,
                         strip_comments=True)
    test = str(clean)
    html = html2text.html2text(test)
    html = html.replace("_", "\n")
    html = html.strip("[")
    html = html.strip("]\n")
    ## Return String
    if html == "":
        html = "No results have been found. Please try a different query."
    return html
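# Note: userinput is spliced into the query string verbatim, so terms with
# spaces or '&' will break the request. Escaping first is safer, e.g.:
#   from urllib.parse import quote_plus
#   termSearch(quote_plus("raw user input"))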
def write_links_page(self):
    output_file = self._cache_dir / 'links.html'
    with output_file.open('w') as fp:
        fp.write('<meta charset="utf-8"><ul>\n')
        for url, filename in self._files.items():
            path = str(self._cache_dir / filename)
            tree = html5lib.parse(open(path, 'rb'), namespaceHTMLElements=False)
            title = tree.find('.//title').text
            html = """<li>
            <a href="%s">%s</a> - %s - %s
            </li>""" % (filename, url, title, filename)
            fp.write(html.strip() + '\n')
        fp.write('</ul>')
def get_innerhtml(self, xpath):
    htmls = []
    for element in self.html.xpath(xpath):
        html = ''
        if element.text:
            html += element.text
        for child in element.getchildren():
            if hasattr(self, 'encoding'):
                html += lxml.html.tostring(child, encoding=self.encoding).decode()
            else:
                html += lxml.html.tostring(child).decode()
        htmls.append(html.strip())
    return htmls
async def read(self) -> str:
    """Read and convert to HTML the document located at :attr:`path`.

    :raise OSError: if the reader's :attr:`~program` cannot convert the document.
    :raise UnicodeDecodeError: when the conversion's result is invalid.
    """
    assert self.path is not None, "Open a file before trying to read it"
    cmdline = self.arguments.format(path=self.path)
    # Can raise OSError or UnicodeDecodeError.
    html = await self.run(cmdline)
    return html.strip()
def clean_html(html):
    """
    Remove HTML markup from the given string. Borrowed from nltk.
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()
def content_strip_tags(self, html):
    """
    Strips HTML markup from the publication content and returns plain text.
    """
    # Collapse runs of spaces into a single space
    html = re.sub(r' {2,}', ' ', html)
    # Remove newlines and tabs
    html = re.sub(r'[\r\n\t]+', '', html)
    # Remove whitespace between tags
    html = re.sub(r'(</\w+>)\s+(<\w+[^>]*>)', r'\1\2', html)
    # Rewrite links as: text [url]
    html = re.sub(r'''<a[^>]*href=['"]([^'"]+)['"][^>]*>([^<]+)</a>''', r'\2 [\1]', html)
    html = re.sub(r'''<a[^>]*>([^<]*)</a>''', r'\1', html)
    # Append a line break after headings and paragraphs
    html = re.sub(r'''<(h1|h2|h3|h4|h5|h6|p)[^>]*>([^<]*)</\1>''',
                  r'\2' + os.linesep * 2, html)
    html = HTMLParser().unescape(html)
    return html.strip()
def get_static_urls(url):
    html = urlopen(url).read()
    parser = etree.HTMLParser()
    tree = etree.fromstring(html.strip(), parser).getroottree()
    page = tree.getroot()
    for link in CSSSelector('link')(page):
        yield wrap(url, link.attrib['href'])
    for link in CSSSelector('script')(page):
        try:
            src = wrap(url, link.attrib['src'])
            if not src.count('googleapis'):
                yield wrap(url, link.attrib['src'])
        except KeyError:
            # inline <script> tag without a src attribute
            pass
def filter(
    self,
    html: str,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
    """Filter and return HTML."""
    mentions = display_name_mentions
    sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
    html = sanit.sanitize(html).rstrip("\n")

    if not html.strip():
        return html

    tree = etree.fromstring(
        html, parser=etree.HTMLParser(encoding="utf-8"),
    )

    for a_tag in tree.iterdescendants("a"):
        self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)
        if not outgoing:
            self._matrix_to_links_add_classes(a_tag)

    html = etree.tostring(tree, encoding="utf-8", method="html").decode()
    html = sanit.sanitize(html).rstrip("\n")

    if outgoing:
        return html

    # Client-side modifications
    html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

    if not inline:
        return html

    return self.inline_quote_regex.sub(
        r'\1<span class="quote">\2</span>', html,
    )
def __del_html_tag(self, html):
    '''
    @summary: Strip script/style blocks, inline formatting tags, hidden or
        comment <div> blocks, HTML comments, and entities from the HTML.
    ---------
    @param html: raw HTML string
    ---------
    @result: cleaned HTML string
    '''
    html = self.__replace_str(html, u'(?i)<script(.|\n)*?</script>')
    html = self.__replace_str(html, u'(?i)<style(.|\n)*?</style>')
    html = self.__replace_str(html, u'(?i)<\/?(span|section|font|em)[^<>]*?>')
    html = self.__replace_str(html, u'(?i)<div[^<>]+?(display:.?none|comment|measure).*?>([\s\S]*?)<\/div>')
    html = self.__replace_str(html, u'<!--(.|\n)*?-->')
    html = self.__replace_str(html, u'(?!&[a-z]+=)&[a-z]+;?', ' ')
    # Remove whitespace characters other than plain spaces and newlines
    html = self.__replace_str(html, '[\f\r\t\v]')
    html = html.strip()
    return html
def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    tree = lxml.html.fromstring(html)
    res = tree.xpath("//input[@name='q' and @type='text']")
    if not res:
        linked_data = None
    else:
        linked_data = {"title": res[0].value}
    CACHE[path] = linked_data
    return linked_data
def clean_html(html):
    """
    Copied from NLTK package.
    Remove HTML markup from the given string.

    :param html: the HTML string to be cleaned
    :type html: str
    :rtype: str
    """
    # see http://stackoverflow.com/questions/26002076/python-nltk-clean-html-not-implemented
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()
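# Example: clean_html('<script>x</script><p>Hello&nbsp;world</p>')
#   -> 'Hello world'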
def get_text(self, msg):
    cleaner = Cleaner()
    cleaner.javascript = True  # Remove script tags
    cleaner.style = True       # Remove css
    body = ""
    for part in msg.walk():
        # Check mime type for text
        if part.get_content_maintype() == "text":
            body += part.get_payload()
    # Get the transfer encoding used
    transfer_encoding = msg.get("Content-transfer-encoding")
    if transfer_encoding:
        # Decode transfer encoding header
        transfer_encoding = self.decode_header(transfer_encoding)
        if transfer_encoding.lower() == "base64":
            # Base64 transfer decoding
            body = base64.b64decode(body)
    # Quoted-printable transfer decoding
    html = quopri.decodestring(body).decode("utf-8", "ignore")
    # Check if body is empty
    if html.strip() != "":
        # Create document from html
        document = lxml.html.document_fromstring(html)
        # Clean html and get text
        text = "\n".join(
            etree.XPath("//text()")(cleaner.clean_html(document)))
        # Remove blank lines from text
        text = "\n".join(
            filter(lambda x: not re.match(r'^\s*$', x), text.splitlines()))
        return text
    return ""
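# Hedged sketch: get_text() above is a method, so assume `handler` is an
# instance of its (not shown) class. quopri.decodestring() passes plain
# 7-bit ASCII through unchanged, so a message without a declared transfer
# encoding works directly:
import email
msg = email.message_from_string(
    "Content-Type: text/html\n\n<p>Hello <b>world</b></p>"
)
# handler.get_text(msg) -> 'Hello \nworld' (one text node per output line)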
html = html.replace("_", "\n") html = html.strip("[") html = html.strip("]\n") ## Return String if html == "": html = "No results have been found. Please try a different query." return html if __name__ == "__main__": print("Please enter the user input") userinput = input() url = "http://tlwi.w3-969.ibm.com/standards/terminology/cgi-bin/lookup?ug=corporate&term=" + userinput + "&submit=Search&source=en&target_switch=none&template=simple&db_name=LOGOS&11=main+form&11=acronym~abbreviation&11=prohibited" ##Set the URL content = urllib.request.urlopen(url).read() soup = BeautifulSoup(content, "lxml") soup = soup.find_all('ol') #Delete Reference Links clean = bleach.clean(soup, tags=['ol', 'br', 'li', 'p'], strip=True, strip_comments=True) test = str(clean) html = html2text.html2text(test) html = html.replace("_", "\n") html = html.strip("[") html = html.strip("]\n") print(html) if html == "": print("EMPTY")
def clean_broken_html(self, html):
    # Trim surrounding whitespace and drop non-breaking-space entities.
    return html.strip().replace("&nbsp;", "")