def handle_redirect(red, lead):
    """Follow a wiki-style ``#REDIRECT Target`` directive.

    Extracts the target title from everything after the literal word
    ``REDIRECT`` in *red*, fetches that page, and returns its HTML.

    :param red: redirect source text containing the word ``REDIRECT``.
    :param lead: passed through to ``fetch.get_html`` — presumably a
        lead/summary flag; TODO confirm against ``fetch``.
    :returns: the fetched document, decoded to unicode when possible.
    """
    title = red.split("REDIRECT")[-1].strip()
    doc = html(fetch.get_html(title, lead))
    try:
        return doc.decode('utf-8')
    except (AttributeError, UnicodeError):
        # Was a bare `except:` — narrowed so real bugs (NameError, etc.)
        # are not swallowed. AttributeError: doc has no .decode;
        # UnicodeError: doc is not valid UTF-8. Return it untouched.
        return doc
def test2():
    """Smoke test: render two divs (classes ``a`` and ``b``) and print the markup."""
    page = html(Bootstrap)
    for css_class in ("a", "b"):
        page.div_(Class=css_class)
    print(page.string())
def problem(code):
    """Scrape a Codeforces problem page and return its statement as a dict.

    :param code: problem code such as ``"100-A"``; the ``-`` becomes a path
        separator in the problemset URL.
    :returns: dict with meta fields (``time``, ``memory``, ``input``,
        ``output``, ``tests``) and statement fields (``content``, ``note``),
        the latter converted to text via ``html2text``.

    NOTE(review): the section titles are replaced with Mongolian headings
    ("Оролт" = Input, "Гаралт" = Output); these are runtime output strings,
    not comments. Raises IndexError if the page layout changes (bare ``[0]``
    / ``[1]`` indexing on every xpath result).
    """
    r = url_open("http://codeforces.com/problemset/problem/" + code.strip().replace("-", "/"))
    tree = lxml.html.fromstring(r.read())

    def html(e):
        """Return the inner HTML of element *e* (strip its own open/close tags)."""
        # Serialize, then cut everything up to the first '>' and after the last '</'.
        return etree.tostring(e).split(">", 1)[1].rsplit("</", 1)[0]

    # Sample test I/O blocks.
    inputs = tree.xpath("//div[@class='input']/pre")
    outputs = tree.xpath("//div[@class='output']/pre")
    # Second child div of the statement is the problem text itself.
    content = html(tree.xpath("//div[@class='problem-statement']/div")[1])
    input_text = html(tree.xpath("//div[@class='input-specification']")[0])
    # Swap the English section title for a Mongolian <h2> heading.
    input_text = input_text.replace('<div class="section-title">Input</div>', "<h2>Оролт</h2>")
    content += input_text
    output_t = html(tree.xpath("//div[@class='output-specification']")[0])
    output_t = output_t.replace('<div class="section-title">Output</div>', "<h2>Гаралт</h2>")
    content += output_t
    # The note section is optional on Codeforces pages.
    note = ""
    if tree.xpath("//div[@class='note']"):
        note = html(tree.xpath("//div[@class='note']")[0])
        note = note.replace('<div class="section-title">Note</div>', "")
    return {
        # meta fields
        "time": tree.xpath("//div[@class='time-limit']/text()")[0],
        "memory": tree.xpath("//div[@class='memory-limit']/text()")[0],
        "input": tree.xpath("//div[@class='input-file']/text()")[0],
        "output": tree.xpath("//div[@class='output-file']/text()")[0],
        # Pair each sample input <pre> with the matching output <pre>.
        "tests": zip(map(lambda e: "\n".join(e.xpath("./text()")), inputs),
                     map(lambda e: "\n".join(e.xpath("./text()")), outputs)),
        # statement fields
        "content": html2text(content),
        "note": html2text(note),
    }
def html_text(data, lead=False, compact=False):
    """Convert *data* to cleaned plain text via the html/utils pipeline.

    :param data: input passed to ``html()`` — presumably raw page markup;
        TODO confirm against callers.
    :param lead: forwarded to ``html()``.
    :param compact: when True, additionally collapse whitespace via
        ``utils.collapse``.
    :returns: the cleaned text, UTF-8 encoded when possible.
    """
    doc = html(data, lead)
    doc = html_keep_tags(doc)
    doc = utils.strip_refs(doc)
    doc = utils.single_space(doc)
    doc = plain_text_cleanup(doc)
    if compact:
        doc = utils.collapse(doc)
    try:
        return doc.encode('utf-8')
    except (AttributeError, UnicodeError):
        # Was a bare `except:` — narrowed so unrelated errors surface.
        # AttributeError: doc has no .encode; UnicodeError: Python 2
        # str.encode implicitly decodes as ASCII first and may fail.
        return doc
def wget_post(url):
    '''Download a post and save it as an HTML file under docs/.

    Steps: download the page with wget (saved as index.html), parse the
    tree with lxml, extract metadata via post2markdown(), and write the
    result through html().

    :param url: address of the post to download.
    '''
    # Argument list instead of a formatted shell string with shell=True:
    # a crafted URL can no longer inject shell syntax.
    subprocess.call(['wget', '--quiet', '--no-clobber', url])  # download as index.html
    # Context manager so the handle is closed even if parsing raises
    # (the original leaked the open file object).
    with open('index.html', 'r') as input_file:
        parsed = lxml.html.parse(input_file)
    # Bare [0] doubles as validation: raises IndexError when no <article>.
    article = parsed.xpath('//article')[0]
    date, author, title = post2markdown(parsed)
    author = author.encode('utf-8')
    title = title.encode('utf-8')
    # NOTE: a Markdown output path via pandoc() existed here but was
    # disabled; only the HTML output is produced.
    html_filename = "docs/{date}-{file}.html".format(date=date, file=title.replace(" ", "_"))
    html(html_filename)
def embed_html(path="mapa.html", width=750, height=500):
    """Embed an HTML file in the notebook via an <iframe> link.

    Avoid in-lining the source HTML into the notebook by adding just a
    link. CAVEAT: All links must be relative!

    Examples
    --------
    >>> html = embed_html(path="./mapa.html")
    >>> isinstance(html, HTML)
    """
    from IPython.display import HTML
    template = ('<iframe src="files/{path}" '
                'style="width: {width}px; height: {height}px;'
                'border: none"></iframe>')
    markup = template.format(path=path, width=width, height=height)
    return HTML(markup)
def _make_element(self, tag):
    """Creates brand new HTML element, which inherits behavior of the
    current one (including ``base_url``, etc.).
    """
    # Self-closing markup for the requested tag, parsed with the same
    # base_url so relative links resolve identically.
    markup = '<{0} />'.format(tag)
    return html(markup, base_url=self.base_url)
def flavored_markdown(text):
    """Render user text as "flavored" markdown HTML, with caching.

    Pipeline: special spans (reference links, URLs, emoticons/symbols,
    @mentions, #hashtags, foo_bar_baz words) are first swapped out for their
    md5 digests so the markdown converter cannot mangle them, then swapped
    back in as rendered HTML after conversion. Result is sanitized and
    cached under a hash of the input.

    NOTE(review): Python 2 code (``has_key``, ``md5`` on str). The
    digest-placeholder trick assumes no digest collides with real text.
    """
    # Cache hit short-circuits the whole pipeline.
    key = '%s:flavored_markdown' % hash(text)
    html = cache.get(key, namespace="filters")
    if html:
        return html
    # Pad so boundary patterns like ' %s' can match at the very ends.
    text = ' ' + text + ' '
    text = unescape(text)

    # extract Reference-style links
    reference_urls = REFERENCE_URL_REGEX.findall(text)
    reference_urls = [i[0] for i in reference_urls]
    for i in reference_urls:
        text = text.replace(i, md5(i).hexdigest())

    # extract urls (longest first so substring URLs don't clobber longer ones)
    urls = URL_REGEX.findall(text)
    urls = [i[0] for i in urls if i]
    urls.sort(key=len, reverse=True)
    for url in urls:
        for pattern in ['%s)', ' %s', '\n%s', '\r\n%s', '%s\n', '%s\r\n']:
            if pattern % url in text:
                text = text.replace(pattern % url,
                                    pattern % md5(url).hexdigest())
                break

    # extract emoticons and symbols (longest first, same reason as URLs)
    symbols = EMOTICONS.keys()
    symbols.extend(SYMBOLS.keys())
    symbols.sort(key=len, reverse=True)
    for symbol in symbols:
        for pattern in [
                ' %s', ' %s. ', ' %s.\n', ' %s.\r\n', '\n%s', '\r\n%s',
                '%s\n', '%s\r\n'
        ]:
            if pattern % symbol in text:
                text = text.replace(pattern % symbol,
                                    pattern % md5(symbol).hexdigest())
                break

    # extract mentions of the form @[name](id)
    mentions = re.findall('(@\[.*?\))', text)
    if mentions:
        for mention in mentions:
            text = text.replace(mention, md5(mention).hexdigest())

    # extract hashtags of the form #[name](id)
    hashtags = re.findall('(#\[.*?\))', text)
    if hashtags:
        for hashtag in hashtags:
            text = text.replace(hashtag, md5(hashtag).hexdigest())

    # extract underscores words - prevent foo_bar_baz from ending up with an italic word in the middle
    words_with_underscores = [w for w in \
        re.findall('((?!\n {4}|\t)\w+_\w+_\w[\w_]*)', text) \
        if not w.startswith('_')]
    for word in words_with_underscores:
        text = text.replace(word, md5(word).hexdigest())

    # treats newlines in paragraph-like content as real line breaks
    # (existing <br> tags are hidden behind a digest so they survive the swap)
    text = text.strip().replace('<br>', '8b0f0ea73162b7552dda3c149b6c045d')
    text = text.strip().replace('\r\n', '<br>').replace(
        '\n', '<br>')  # normalize \r\n and \n to <br>
    text = text.strip().replace('<br>', ' \n')  # treats newlines
    text = text.strip().replace('|| \n', '||\n')  # undo if wiki-tables
    text = text.strip().replace('8b0f0ea73162b7552dda3c149b6c045d', '<br>')

    # restore reference_urls
    for i in reference_urls:
        text = text.replace(md5(i).hexdigest(), i)

    # convert text to html
    html = markdown(text, extras=[
        "wiki-tables", "cuddled-lists", "fenced-code-blocks", "header-ids",
        "code-friendly", "pyshell", "footnotes"
    ])
    # print html

    # extract code-blocks
    html = html.replace(
        '\n', '<br/>')  # convert multi-lines to single-lines for regex matching
    code_blocks = re.findall('(<code>.*?</code>)', html)
    for block in code_blocks:
        html = html.replace(block, md5(block).hexdigest())

    # Show emoticons and symbols
    for symbol in symbols:
        if SYMBOLS.has_key(symbol):
            html = html.replace(md5(symbol).hexdigest(), SYMBOLS[symbol])
        else:
            html = html.replace(
                md5(symbol).hexdigest(),
                EMOTICONS[symbol].replace("<img src",
                                          "<img class='emoticon' src"))

    # Autolinks urls, mentions, hashtags, turn youtube links to embed code
    for url in urls:
        title = api.get_url_info(url).title
        hash_string = md5(url).hexdigest()
        if len(url) > 40:
            # Long URLs get a truncated display text.
            html = html.replace(
                hash_string,
                '<a href="%s" target="_blank" title="%s">%s</a>' %
                (url, title, url[:40] + '...'))
        else:
            html = html.replace(
                hash_string,
                '<a href="%s" target="_blank" title="%s">%s</a>' %
                (url, title, url))
    for mention in mentions:
        hash_string = md5(mention).hexdigest()
        user = re.compile('@\[(?P<name>.+)\]\((?P<id>.*)\)').match(
            mention).groupdict()
        # ids may be namespaced like "prefix:123"; keep the last segment.
        user['id'] = user['id'].split(':', 1)[-1]
        html = html.replace(
            hash_string,
            '<a href="#!/user/%s" class="overlay"><span class="tag">%s</span></a>'
            % (user.get('id'), user.get('name')))
    for hashtag in hashtags:
        hash_string = md5(hashtag).hexdigest()
        tag = re.compile('#\[(?P<name>.+)\]\((?P<id>.*)\)').match(
            hashtag).groupdict()
        tag['id'] = tag['id'].split(':', 1)[-1]
        html = html.replace(
            hash_string,
            '<a href="?hashtag=%s" class="overlay"><span class="tag">%s</span></a>'
            % (tag.get('id'), tag.get('name')))

    # Restore code blocks
    for block in code_blocks:
        html = html.replace(md5(block).hexdigest(), block)

    # restore urls, mentions, emoticons and hashtag in code blocks
    for url in urls:
        html = html.replace(md5(url).hexdigest(), url)
    for mention in mentions:
        html = html.replace(md5(mention).hexdigest(), mention)
    for hashtag in hashtags:
        html = html.replace(md5(hashtag).hexdigest(), hashtag)
    for symbol in symbols:
        html = html.replace(md5(symbol).hexdigest(), symbol)

    # restore words with underscores
    for word in words_with_underscores:
        html = html.replace(md5(word).hexdigest(), word)

    # restore \n
    html = html.replace('<br/>', '\n')

    # xss protection
    html = sanitize_html(html)
    if not html or html.isspace():
        return ''

    # add target="_blank" to all a tags (overlay links excluded)
    html = PyQuery(html)
    html('a:not(.overlay)').attr('target', '_blank')
    html = str(html)
    html = html.replace('<br/>', '<br>')
    cache.set(key, html, namespace="filters")
    return html
#!/usr/bin/env python
'''Test insultgenerator.org'''
from urllib2 import urlopen
# `import lxml` alone does NOT make `lxml.html` available — the submodule
# must be imported explicitly; `lxml.html(res)` was also calling a module
# as a function, which raises at runtime. Fixed to parse via fromstring.
import lxml.html

res = urlopen('http://www.insultgenerator.org')
# Read the body once, first: the original parsed the response and then
# called res.read(), which would have returned nothing useful.
page = res.read().strip()
tree = lxml.html.fromstring(page)
def flavored_markdown(text):
    """Render user text as "flavored" markdown HTML, with caching.

    NOTE(review): this is a byte-for-byte duplicate (modulo line wrapping)
    of the earlier ``flavored_markdown`` definition in this file; being
    defined later, this copy is the one that wins — consider removing one.

    Pipeline: special spans (reference links, URLs, emoticons/symbols,
    @mentions, #hashtags, foo_bar_baz words) are first swapped out for their
    md5 digests so the markdown converter cannot mangle them, then swapped
    back in as rendered HTML after conversion. Result is sanitized and
    cached under a hash of the input.
    """
    # Cache hit short-circuits the whole pipeline.
    key = '%s:flavored_markdown' % hash(text)
    html = cache.get(key, namespace="filters")
    if html:
        return html
    # Pad so boundary patterns like ' %s' can match at the very ends.
    text = ' ' + text + ' '
    text = unescape(text)

    # extract Reference-style links
    reference_urls = REFERENCE_URL_REGEX.findall(text)
    reference_urls = [i[0] for i in reference_urls]
    for i in reference_urls:
        text = text.replace(i, md5(i).hexdigest())

    # extract urls (longest first so substring URLs don't clobber longer ones)
    urls = URL_REGEX.findall(text)
    urls = [i[0] for i in urls if i]
    urls.sort(key=len, reverse=True)
    for url in urls:
        for pattern in ['%s)', ' %s', '\n%s', '\r\n%s', '%s\n', '%s\r\n']:
            if pattern % url in text:
                text = text.replace(pattern % url, pattern % md5(url).hexdigest())
                break

    # extract emoticons and symbols (longest first, same reason as URLs)
    symbols = EMOTICONS.keys()
    symbols.extend(SYMBOLS.keys())
    symbols.sort(key=len, reverse=True)
    for symbol in symbols:
        for pattern in [' %s', ' %s. ', ' %s.\n', ' %s.\r\n', '\n%s', '\r\n%s', '%s\n', '%s\r\n']:
            if pattern % symbol in text:
                text = text.replace(pattern % symbol, pattern % md5(symbol).hexdigest())
                break

    # extract mentions of the form @[name](id)
    mentions = re.findall('(@\[.*?\))', text)
    if mentions:
        for mention in mentions:
            text = text.replace(mention, md5(mention).hexdigest())

    # extract hashtags of the form #[name](id)
    hashtags = re.findall('(#\[.*?\))', text)
    if hashtags:
        for hashtag in hashtags:
            text = text.replace(hashtag, md5(hashtag).hexdigest())

    # extract underscores words - prevent foo_bar_baz from ending up with an italic word in the middle
    words_with_underscores = [w for w in \
        re.findall('((?!\n {4}|\t)\w+_\w+_\w[\w_]*)', text) \
        if not w.startswith('_')]
    for word in words_with_underscores:
        text = text.replace(word, md5(word).hexdigest())

    # treats newlines in paragraph-like content as real line breaks
    # (existing <br> tags are hidden behind a digest so they survive the swap)
    text = text.strip().replace('<br>', '8b0f0ea73162b7552dda3c149b6c045d')
    text = text.strip().replace('\r\n', '<br>').replace('\n', '<br>')  # normalize \r\n and \n to <br>
    text = text.strip().replace('<br>', ' \n')  # treats newlines
    text = text.strip().replace('|| \n', '||\n')  # undo if wiki-tables
    text = text.strip().replace('8b0f0ea73162b7552dda3c149b6c045d', '<br>')

    # restore reference_urls
    for i in reference_urls:
        text = text.replace(md5(i).hexdigest(), i)

    # convert text to html
    html = markdown(text, extras=["wiki-tables", "cuddled-lists", "fenced-code-blocks", "header-ids", "code-friendly", "pyshell", "footnotes"])
    # print html

    # extract code-blocks
    html = html.replace('\n', '<br/>')  # convert multi-lines to single-lines for regex matching
    code_blocks = re.findall('(<code>.*?</code>)', html)
    for block in code_blocks:
        html = html.replace(block, md5(block).hexdigest())

    # Show emoticons and symbols
    for symbol in symbols:
        if SYMBOLS.has_key(symbol):
            html = html.replace(md5(symbol).hexdigest(), SYMBOLS[symbol])
        else:
            html = html.replace(md5(symbol).hexdigest(), EMOTICONS[symbol].replace("<img src", "<img class='emoticon' src"))

    # Autolinks urls, mentions, hashtags, turn youtube links to embed code
    for url in urls:
        title = api.get_url_info(url).title
        hash_string = md5(url).hexdigest()
        if len(url) > 40:
            # Long URLs get a truncated display text.
            html = html.replace(hash_string, '<a href="%s" target="_blank" title="%s">%s</a>' % (url, title, url[:40] + '...'))
        else:
            html = html.replace(hash_string, '<a href="%s" target="_blank" title="%s">%s</a>' % (url, title, url))
    for mention in mentions:
        hash_string = md5(mention).hexdigest()
        user = re.compile('@\[(?P<name>.+)\]\((?P<id>.*)\)').match(mention).groupdict()
        # ids may be namespaced like "prefix:123"; keep the last segment.
        user['id'] = user['id'].split(':', 1)[-1]
        html = html.replace(hash_string, '<a href="#!/user/%s" class="overlay"><span class="tag">%s</span></a>' % (user.get('id'), user.get('name')))
    for hashtag in hashtags:
        hash_string = md5(hashtag).hexdigest()
        tag = re.compile('#\[(?P<name>.+)\]\((?P<id>.*)\)').match(hashtag).groupdict()
        tag['id'] = tag['id'].split(':', 1)[-1]
        html = html.replace(hash_string, '<a href="?hashtag=%s" class="overlay"><span class="tag">%s</span></a>' % (tag.get('id'), tag.get('name')))

    # Restore code blocks
    for block in code_blocks:
        html = html.replace(md5(block).hexdigest(), block)

    # restore urls, mentions, emoticons and hashtag in code blocks
    for url in urls:
        html = html.replace(md5(url).hexdigest(), url)
    for mention in mentions:
        html = html.replace(md5(mention).hexdigest(), mention)
    for hashtag in hashtags:
        html = html.replace(md5(hashtag).hexdigest(), hashtag)
    for symbol in symbols:
        html = html.replace(md5(symbol).hexdigest(), symbol)

    # restore words with underscores
    for word in words_with_underscores:
        html = html.replace(md5(word).hexdigest(), word)

    # restore \n
    html = html.replace('<br/>', '\n')

    # xss protection
    html = sanitize_html(html)
    if not html or html.isspace():
        return ''

    # add target="_blank" to all a tags (overlay links excluded)
    html = PyQuery(html)
    html('a:not(.overlay)').attr('target', '_blank')
    html = str(html)
    html = html.replace('<br/>', '<br>')
    cache.set(key, html, namespace="filters")
    return html
def cleanSoup(self, html):
    """Remove every <script> and <style> element from *html* in place.

    :param html: parsed document (presumably a BeautifulSoup tree — the
        callable-tag-lookup and ``.extract()`` API; verify against callers).
    """
    unwanted = html(["script", "style"])
    for node in unwanted:
        node.extract()