def handle_redirect(red, lead):
    """Follow a wiki-style ``#REDIRECT Target`` directive.

    Extracts the target title from everything after the literal word
    ``REDIRECT`` in *red*, fetches that page, and returns its HTML.

    :param red: redirect source text containing the word ``REDIRECT``.
    :param lead: passed through to ``fetch.get_html`` — presumably a
        lead/summary flag; TODO confirm against ``fetch``.
    :returns: the fetched document, decoded to unicode when possible.
    """
    title = red.split("REDIRECT")[-1].strip()
    doc = html(fetch.get_html(title, lead))
    try:
        return doc.decode('utf-8')
    except (AttributeError, UnicodeError):
        # Was a bare `except:` — narrowed so real bugs (NameError, etc.)
        # are not swallowed. AttributeError: doc has no .decode;
        # UnicodeError: doc is not valid UTF-8. Return it untouched.
        return doc
def test2():
    """Smoke test: render two divs (classes ``a`` and ``b``) and print the markup."""
    page = html(Bootstrap)
    for css_class in ("a", "b"):
        page.div_(Class=css_class)
    print(page.string())
def problem(code):
    """Scrape a Codeforces problem page and return its statement as a dict.

    :param code: problem code such as ``"100-A"``; the ``-`` becomes a path
        separator in the problemset URL.
    :returns: dict with meta fields (``time``, ``memory``, ``input``,
        ``output``, ``tests``) and statement fields (``content``, ``note``),
        the latter converted to text via ``html2text``.

    NOTE(review): the section titles are replaced with Mongolian headings
    ("Оролт" = Input, "Гаралт" = Output); these are runtime output strings,
    not comments. Raises IndexError if the page layout changes (bare ``[0]``
    / ``[1]`` indexing on every xpath result).
    """
    r = url_open("http://codeforces.com/problemset/problem/" + code.strip().replace("-", "/"))
    tree = lxml.html.fromstring(r.read())

    def html(e):
        """Return the inner HTML of element *e* (strip its own open/close tags)."""
        # Serialize, then cut everything up to the first '>' and after the last '</'.
        return etree.tostring(e).split(">", 1)[1].rsplit("</", 1)[0]

    # Sample test I/O blocks.
    inputs = tree.xpath("//div[@class='input']/pre")
    outputs = tree.xpath("//div[@class='output']/pre")
    # Second child div of the statement is the problem text itself.
    content = html(tree.xpath("//div[@class='problem-statement']/div")[1])
    input_text = html(tree.xpath("//div[@class='input-specification']")[0])
    # Swap the English section title for a Mongolian <h2> heading.
    input_text = input_text.replace('<div class="section-title">Input</div>', "<h2>Оролт</h2>")
    content += input_text
    output_t = html(tree.xpath("//div[@class='output-specification']")[0])
    output_t = output_t.replace('<div class="section-title">Output</div>', "<h2>Гаралт</h2>")
    content += output_t
    # The note section is optional on Codeforces pages.
    note = ""
    if tree.xpath("//div[@class='note']"):
        note = html(tree.xpath("//div[@class='note']")[0])
        note = note.replace('<div class="section-title">Note</div>', "")
    return {
        # meta fields
        "time": tree.xpath("//div[@class='time-limit']/text()")[0],
        "memory": tree.xpath("//div[@class='memory-limit']/text()")[0],
        "input": tree.xpath("//div[@class='input-file']/text()")[0],
        "output": tree.xpath("//div[@class='output-file']/text()")[0],
        # Pair each sample input <pre> with the matching output <pre>.
        "tests": zip(map(lambda e: "\n".join(e.xpath("./text()")), inputs),
                     map(lambda e: "\n".join(e.xpath("./text()")), outputs)),
        # statement fields
        "content": html2text(content),
        "note": html2text(note),
    }
def html_text(data, lead=False, compact=False):
    """Convert *data* to cleaned plain text via the html/utils pipeline.

    :param data: input passed to ``html()`` — presumably raw page markup;
        TODO confirm against callers.
    :param lead: forwarded to ``html()``.
    :param compact: when True, additionally collapse whitespace via
        ``utils.collapse``.
    :returns: the cleaned text, UTF-8 encoded when possible.
    """
    doc = html(data, lead)
    doc = html_keep_tags(doc)
    doc = utils.strip_refs(doc)
    doc = utils.single_space(doc)
    doc = plain_text_cleanup(doc)
    if compact:
        doc = utils.collapse(doc)
    try:
        return doc.encode('utf-8')
    except (AttributeError, UnicodeError):
        # Was a bare `except:` — narrowed so unrelated errors surface.
        # AttributeError: doc has no .encode; UnicodeError: Python 2
        # str.encode implicitly decodes as ASCII first and may fail.
        return doc
def wget_post(url):
    '''Download a post and save it as an HTML file under docs/.

    Steps: download the page with wget (saved as index.html), parse the
    tree with lxml, extract metadata via post2markdown(), and write the
    result through html().

    :param url: address of the post to download.
    '''
    # Argument list instead of a formatted shell string with shell=True:
    # a crafted URL can no longer inject shell syntax.
    subprocess.call(['wget', '--quiet', '--no-clobber', url])  # download as index.html
    # Context manager so the handle is closed even if parsing raises
    # (the original leaked the open file object).
    with open('index.html', 'r') as input_file:
        parsed = lxml.html.parse(input_file)
    # Bare [0] doubles as validation: raises IndexError when no <article>.
    article = parsed.xpath('//article')[0]
    date, author, title = post2markdown(parsed)
    author = author.encode('utf-8')
    title = title.encode('utf-8')
    # NOTE: a Markdown output path via pandoc() existed here but was
    # disabled; only the HTML output is produced.
    html_filename = "docs/{date}-{file}.html".format(date=date, file=title.replace(" ", "_"))
    html(html_filename)
def embed_html(path="mapa.html", width=750, height=500):
    """Embed an HTML file in the notebook via an <iframe> link.

    Avoid in-lining the source HTML into the notebook by adding just a
    link. CAVEAT: All links must be relative!

    Examples
    --------
    >>> html = embed_html(path="./mapa.html")
    >>> isinstance(html, HTML)
    """
    from IPython.display import HTML
    template = ('<iframe src="files/{path}" '
                'style="width: {width}px; height: {height}px;'
                'border: none"></iframe>')
    markup = template.format(path=path, width=width, height=height)
    return HTML(markup)
def _make_element(self, tag):
    """Creates brand new HTML element, which inherits behavior of the
    current one (including ``base_url``, etc.).
    """
    # Self-closing markup for the requested tag, parsed with the same
    # base_url so relative links resolve identically.
    markup = '<{0} />'.format(tag)
    return html(markup, base_url=self.base_url)
def flavored_markdown(text):
    """Render user text as "flavored" markdown HTML, with caching.

    Pipeline: special spans (reference links, URLs, emoticons/symbols,
    @mentions, #hashtags, foo_bar_baz words) are first swapped out for their
    md5 digests so the markdown converter cannot mangle them, then swapped
    back in as rendered HTML after conversion. Result is sanitized and
    cached under a hash of the input.

    NOTE(review): Python 2 code (``has_key``, ``md5`` on str). The
    digest-placeholder trick assumes no digest collides with real text.
    """
    # Cache hit short-circuits the whole pipeline.
    key = '%s:flavored_markdown' % hash(text)
    html = cache.get(key, namespace="filters")
    if html:
        return html
    # Pad so boundary patterns like ' %s' can match at the very ends.
    text = ' ' + text + ' '
    text = unescape(text)

    # extract Reference-style links
    reference_urls = REFERENCE_URL_REGEX.findall(text)
    reference_urls = [i[0] for i in reference_urls]
    for i in reference_urls:
        text = text.replace(i, md5(i).hexdigest())

    # extract urls (longest first so substring URLs don't clobber longer ones)
    urls = URL_REGEX.findall(text)
    urls = [i[0] for i in urls if i]
    urls.sort(key=len, reverse=True)
    for url in urls:
        for pattern in ['%s)', ' %s', '\n%s', '\r\n%s', '%s\n', '%s\r\n']:
            if pattern % url in text:
                text = text.replace(pattern % url,
                                    pattern % md5(url).hexdigest())
                break

    # extract emoticons and symbols (longest first, same reason as URLs)
    symbols = EMOTICONS.keys()
    symbols.extend(SYMBOLS.keys())
    symbols.sort(key=len, reverse=True)
    for symbol in symbols:
        for pattern in [
                ' %s', ' %s. ', ' %s.\n', ' %s.\r\n', '\n%s', '\r\n%s',
                '%s\n', '%s\r\n'
        ]:
            if pattern % symbol in text:
                text = text.replace(pattern % symbol,
                                    pattern % md5(symbol).hexdigest())
                break

    # extract mentions of the form @[name](id)
    mentions = re.findall('(@\[.*?\))', text)
    if mentions:
        for mention in mentions:
            text = text.replace(mention, md5(mention).hexdigest())

    # extract hashtags of the form #[name](id)
    hashtags = re.findall('(#\[.*?\))', text)
    if hashtags:
        for hashtag in hashtags:
            text = text.replace(hashtag, md5(hashtag).hexdigest())

    # extract underscores words - prevent foo_bar_baz from ending up with an italic word in the middle
    words_with_underscores = [w for w in \
        re.findall('((?!\n {4}|\t)\w+_\w+_\w[\w_]*)', text) \
        if not w.startswith('_')]
    for word in words_with_underscores:
        text = text.replace(word, md5(word).hexdigest())

    # treats newlines in paragraph-like content as real line breaks
    # (existing <br> tags are hidden behind a digest so they survive the swap)
    text = text.strip().replace('<br>', '8b0f0ea73162b7552dda3c149b6c045d')
    text = text.strip().replace('\r\n', '<br>').replace(
        '\n', '<br>')  # normalize \r\n and \n to <br>
    text = text.strip().replace('<br>', ' \n')  # treats newlines
    text = text.strip().replace('|| \n', '||\n')  # undo if wiki-tables
    text = text.strip().replace('8b0f0ea73162b7552dda3c149b6c045d', '<br>')

    # restore reference_urls
    for i in reference_urls:
        text = text.replace(md5(i).hexdigest(), i)

    # convert text to html
    html = markdown(text, extras=[
        "wiki-tables", "cuddled-lists", "fenced-code-blocks", "header-ids",
        "code-friendly", "pyshell", "footnotes"
    ])
    # print html

    # extract code-blocks
    html = html.replace(
        '\n', '<br/>')  # convert multi-lines to single-lines for regex matching
    code_blocks = re.findall('(<code>.*?</code>)', html)
    for block in code_blocks:
        html = html.replace(block, md5(block).hexdigest())

    # Show emoticons and symbols
    for symbol in symbols:
        if SYMBOLS.has_key(symbol):
            html = html.replace(md5(symbol).hexdigest(), SYMBOLS[symbol])
        else:
            html = html.replace(
                md5(symbol).hexdigest(),
                EMOTICONS[symbol].replace("<img src",
                                          "<img class='emoticon' src"))

    # Autolinks urls, mentions, hashtags, turn youtube links to embed code
    for url in urls:
        title = api.get_url_info(url).title
        hash_string = md5(url).hexdigest()
        if len(url) > 40:
            # Long URLs get a truncated display text.
            html = html.replace(
                hash_string,
                '<a href="%s" target="_blank" title="%s">%s</a>' %
                (url, title, url[:40] + '...'))
        else:
            html = html.replace(
                hash_string,
                '<a href="%s" target="_blank" title="%s">%s</a>' %
                (url, title, url))
    for mention in mentions:
        hash_string = md5(mention).hexdigest()
        user = re.compile('@\[(?P<name>.+)\]\((?P<id>.*)\)').match(
            mention).groupdict()
        # ids may be namespaced like "prefix:123"; keep the last segment.
        user['id'] = user['id'].split(':', 1)[-1]
        html = html.replace(
            hash_string,
            '<a href="#!/user/%s" class="overlay"><span class="tag">%s</span></a>'
            % (user.get('id'), user.get('name')))
    for hashtag in hashtags:
        hash_string = md5(hashtag).hexdigest()
        tag = re.compile('#\[(?P<name>.+)\]\((?P<id>.*)\)').match(
            hashtag).groupdict()
        tag['id'] = tag['id'].split(':', 1)[-1]
        html = html.replace(
            hash_string,
            '<a href="?hashtag=%s" class="overlay"><span class="tag">%s</span></a>'
            % (tag.get('id'), tag.get('name')))

    # Restore code blocks
    for block in code_blocks:
        html = html.replace(md5(block).hexdigest(), block)

    # restore urls, mentions, emoticons and hashtag in code blocks
    for url in urls:
        html = html.replace(md5(url).hexdigest(), url)
    for mention in mentions:
        html = html.replace(md5(mention).hexdigest(), mention)
    for hashtag in hashtags:
        html = html.replace(md5(hashtag).hexdigest(), hashtag)
    for symbol in symbols:
        html = html.replace(md5(symbol).hexdigest(), symbol)

    # restore words with underscores
    for word in words_with_underscores:
        html = html.replace(md5(word).hexdigest(), word)

    # restore \n
    html = html.replace('<br/>', '\n')

    # xss protection
    html = sanitize_html(html)
    if not html or html.isspace():
        return ''

    # add target="_blank" to all a tags (overlay links excluded)
    html = PyQuery(html)
    html('a:not(.overlay)').attr('target', '_blank')
    html = str(html)
    html = html.replace('<br/>', '<br>')
    cache.set(key, html, namespace="filters")
    return html
#!/usr/bin/env python
'''Test insultgenerator.org'''
from urllib2 import urlopen
# `import lxml` alone does NOT make `lxml.html` available — the submodule
# must be imported explicitly; `lxml.html(res)` was also calling a module
# as a function, which raises at runtime. Fixed to parse via fromstring.
import lxml.html

res = urlopen('http://www.insultgenerator.org')
# Read the body once, first: the original parsed the response and then
# called res.read(), which would have returned nothing useful.
page = res.read().strip()
tree = lxml.html.fromstring(page)
def flavored_markdown(text):
    """Render user text as "flavored" markdown HTML, with caching.

    NOTE(review): this is a byte-for-byte duplicate (modulo line wrapping)
    of the earlier ``flavored_markdown`` definition in this file; being
    defined later, this copy is the one that wins — consider removing one.

    Pipeline: special spans (reference links, URLs, emoticons/symbols,
    @mentions, #hashtags, foo_bar_baz words) are first swapped out for their
    md5 digests so the markdown converter cannot mangle them, then swapped
    back in as rendered HTML after conversion. Result is sanitized and
    cached under a hash of the input.
    """
    # Cache hit short-circuits the whole pipeline.
    key = '%s:flavored_markdown' % hash(text)
    html = cache.get(key, namespace="filters")
    if html:
        return html
    # Pad so boundary patterns like ' %s' can match at the very ends.
    text = ' ' + text + ' '
    text = unescape(text)

    # extract Reference-style links
    reference_urls = REFERENCE_URL_REGEX.findall(text)
    reference_urls = [i[0] for i in reference_urls]
    for i in reference_urls:
        text = text.replace(i, md5(i).hexdigest())

    # extract urls (longest first so substring URLs don't clobber longer ones)
    urls = URL_REGEX.findall(text)
    urls = [i[0] for i in urls if i]
    urls.sort(key=len, reverse=True)
    for url in urls:
        for pattern in ['%s)', ' %s', '\n%s', '\r\n%s', '%s\n', '%s\r\n']:
            if pattern % url in text:
                text = text.replace(pattern % url, pattern % md5(url).hexdigest())
                break

    # extract emoticons and symbols (longest first, same reason as URLs)
    symbols = EMOTICONS.keys()
    symbols.extend(SYMBOLS.keys())
    symbols.sort(key=len, reverse=True)
    for symbol in symbols:
        for pattern in [' %s', ' %s. ', ' %s.\n', ' %s.\r\n', '\n%s', '\r\n%s', '%s\n', '%s\r\n']:
            if pattern % symbol in text:
                text = text.replace(pattern % symbol, pattern % md5(symbol).hexdigest())
                break

    # extract mentions of the form @[name](id)
    mentions = re.findall('(@\[.*?\))', text)
    if mentions:
        for mention in mentions:
            text = text.replace(mention, md5(mention).hexdigest())

    # extract hashtags of the form #[name](id)
    hashtags = re.findall('(#\[.*?\))', text)
    if hashtags:
        for hashtag in hashtags:
            text = text.replace(hashtag, md5(hashtag).hexdigest())

    # extract underscores words - prevent foo_bar_baz from ending up with an italic word in the middle
    words_with_underscores = [w for w in \
        re.findall('((?!\n {4}|\t)\w+_\w+_\w[\w_]*)', text) \
        if not w.startswith('_')]
    for word in words_with_underscores:
        text = text.replace(word, md5(word).hexdigest())

    # treats newlines in paragraph-like content as real line breaks
    # (existing <br> tags are hidden behind a digest so they survive the swap)
    text = text.strip().replace('<br>', '8b0f0ea73162b7552dda3c149b6c045d')
    text = text.strip().replace('\r\n', '<br>').replace('\n', '<br>')  # normalize \r\n and \n to <br>
    text = text.strip().replace('<br>', ' \n')  # treats newlines
    text = text.strip().replace('|| \n', '||\n')  # undo if wiki-tables
    text = text.strip().replace('8b0f0ea73162b7552dda3c149b6c045d', '<br>')

    # restore reference_urls
    for i in reference_urls:
        text = text.replace(md5(i).hexdigest(), i)

    # convert text to html
    html = markdown(text, extras=["wiki-tables", "cuddled-lists", "fenced-code-blocks", "header-ids", "code-friendly", "pyshell", "footnotes"])
    # print html

    # extract code-blocks
    html = html.replace('\n', '<br/>')  # convert multi-lines to single-lines for regex matching
    code_blocks = re.findall('(<code>.*?</code>)', html)
    for block in code_blocks:
        html = html.replace(block, md5(block).hexdigest())

    # Show emoticons and symbols
    for symbol in symbols:
        if SYMBOLS.has_key(symbol):
            html = html.replace(md5(symbol).hexdigest(), SYMBOLS[symbol])
        else:
            html = html.replace(md5(symbol).hexdigest(), EMOTICONS[symbol].replace("<img src", "<img class='emoticon' src"))

    # Autolinks urls, mentions, hashtags, turn youtube links to embed code
    for url in urls:
        title = api.get_url_info(url).title
        hash_string = md5(url).hexdigest()
        if len(url) > 40:
            # Long URLs get a truncated display text.
            html = html.replace(hash_string, '<a href="%s" target="_blank" title="%s">%s</a>' % (url, title, url[:40] + '...'))
        else:
            html = html.replace(hash_string, '<a href="%s" target="_blank" title="%s">%s</a>' % (url, title, url))
    for mention in mentions:
        hash_string = md5(mention).hexdigest()
        user = re.compile('@\[(?P<name>.+)\]\((?P<id>.*)\)').match(mention).groupdict()
        # ids may be namespaced like "prefix:123"; keep the last segment.
        user['id'] = user['id'].split(':', 1)[-1]
        html = html.replace(hash_string, '<a href="#!/user/%s" class="overlay"><span class="tag">%s</span></a>' % (user.get('id'), user.get('name')))
    for hashtag in hashtags:
        hash_string = md5(hashtag).hexdigest()
        tag = re.compile('#\[(?P<name>.+)\]\((?P<id>.*)\)').match(hashtag).groupdict()
        tag['id'] = tag['id'].split(':', 1)[-1]
        html = html.replace(hash_string, '<a href="?hashtag=%s" class="overlay"><span class="tag">%s</span></a>' % (tag.get('id'), tag.get('name')))

    # Restore code blocks
    for block in code_blocks:
        html = html.replace(md5(block).hexdigest(), block)

    # restore urls, mentions, emoticons and hashtag in code blocks
    for url in urls:
        html = html.replace(md5(url).hexdigest(), url)
    for mention in mentions:
        html = html.replace(md5(mention).hexdigest(), mention)
    for hashtag in hashtags:
        html = html.replace(md5(hashtag).hexdigest(), hashtag)
    for symbol in symbols:
        html = html.replace(md5(symbol).hexdigest(), symbol)

    # restore words with underscores
    for word in words_with_underscores:
        html = html.replace(md5(word).hexdigest(), word)

    # restore \n
    html = html.replace('<br/>', '\n')

    # xss protection
    html = sanitize_html(html)
    if not html or html.isspace():
        return ''

    # add target="_blank" to all a tags (overlay links excluded)
    html = PyQuery(html)
    html('a:not(.overlay)').attr('target', '_blank')
    html = str(html)
    html = html.replace('<br/>', '<br>')
    cache.set(key, html, namespace="filters")
    return html
def cleanSoup(self, html):
    """Remove every <script> and <style> element from *html* in place.

    :param html: parsed document (presumably a BeautifulSoup tree — the
        callable-tag-lookup and ``.extract()`` API; verify against callers).
    """
    unwanted = html(["script", "style"])
    for node in unwanted:
        node.extract()