Python to_doc Beispiele, jabbapylib.web.scraper.lx.to_doc Python Beispiele

Beispiel #1

0

Datei anzeigen

def test_to_doc():
    """Check the result type of lx.to_doc for every parser choice.

    Each of the three named parsers must yield an lxml.html.HtmlElement,
    both for the whole page ex.HTML_1 and for ex.FRAGMENT parsed with
    whole_doc=False. Passing parser=None must yield None in both cases.
    """
    named_parsers = (
        scraper.LXML_HTML,     # this parser is the default, so
                               # lx.to_doc(ex.HTML_1) could be used too
        scraper.HTML5PARSER,
        scraper.BEAUTIFULSOUP,
    )

    # whole documents
    for which in named_parsers:
        assert isinstance(lx.to_doc(ex.HTML_1, which), lxml.html.HtmlElement)
    assert lx.to_doc(ex.HTML_1, parser=None) is None

    # now let's see with HTML fragments
    for which in named_parsers:
        tree = lx.to_doc(ex.FRAGMENT, which, whole_doc=False)
        assert isinstance(tree, lxml.html.HtmlElement)
    assert lx.to_doc(ex.FRAGMENT, parser=None, whole_doc=False) is None

Beispiel #2

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_to_doc():
    """Verify lx.to_doc for whole pages and fragments with every parser.

    For ex.HTML_1 (default whole-document mode) and ex.FRAGMENT
    (whole_doc=False), each named parser must produce an
    lxml.html.HtmlElement and parser=None must produce None.
    """
    cases = (
        (ex.HTML_1, {}),                       # full document
        (ex.FRAGMENT, {'whole_doc': False}),   # HTML fragment
    )
    for markup, extra in cases:
        # scraper.LXML_HTML is the default parser, so it could be omitted.
        for parser in (scraper.LXML_HTML,
                       scraper.HTML5PARSER,
                       scraper.BEAUTIFULSOUP):
            doc = lx.to_doc(markup, parser, **extra)
            assert isinstance(doc, lxml.html.HtmlElement)
        # an unknown parser gives no document at all
        assert lx.to_doc(markup, parser=None, **extra) is None

Beispiel #3

0

Datei anzeigen

Datei: lx_wallbase.py Projekt: ThePenguin1140/jabbapylib

def get_image_url_list(url):
    """Controller function: return the URLs of the JPG images for `url`.

    The page at `url` is fetched with get_page() and parsed with lx.to_doc();
    get_subpages() is applied to the parsed document and the result is handed
    to extract_images_from_pages(), whose return value is returned unchanged.
    """
    start_doc = lx.to_doc(get_page(url))
    return extract_images_from_pages(get_subpages(start_doc))

Beispiel #4

0

Datei anzeigen

Datei: hyphen.py Projekt: the7day/jabbapylib

def process(word):
    """Look up the given word.

    The URL is built by formatting the module-level `_template` with the
    word; the page is fetched with user_agent=True and parsed with lx.to_doc.

    Returns a tuple: (word, hyphenation, pronunciation mp3), where the last
    two items come from get_hyphen() and get_mp3() on the parsed page.
    """
    page = web.get_page(_template.format(word=word), user_agent=True)
    parsed = lx.to_doc(page)

    hyphenation = get_hyphen(parsed)
    mp3 = get_mp3(parsed)
    return (word, hyphenation, mp3)

Beispiel #5

0

Datei anzeigen

def extract_images_from_pages(pages):
    """Extract images from subpages.

    Every entry of `pages` is fetched with get_page(), parsed with
    lx.to_doc() and passed to get_jpg_image(). The results are returned as a
    list in the same order, leaving out None (and any other falsy result).
    """
    found = []
    for page in pages:
        image = get_jpg_image(lx.to_doc(get_page(page)))
        if image:
            found.append(image)
    return found

Beispiel #6

0

Datei anzeigen

Datei: lx_wallbase.py Projekt: ThePenguin1140/jabbapylib

def extract_images_from_pages(pages):
    """Extract images from subpages.

    Each page in `pages` is downloaded, parsed and searched with
    get_jpg_image(); falsy results (such as None) are dropped from the
    returned list.
    """
    candidates = [get_jpg_image(lx.to_doc(get_page(page))) for page in pages]
    return [image for image in candidates if image]

Beispiel #7

0

Datei anzeigen

Datei: hyphen.py Projekt: ThePenguin1140/jabbapylib

def process(word):
    """Process the given word.

    Fetches the page whose URL is `_template` formatted with the word
    (user_agent=True), parses it, and returns the tuple
    (word, hyphenation, pronunciation mp3).
    """
    lookup_url = _template.format(word=word)
    doc = lx.to_doc(web.get_page(lookup_url, user_agent=True))

    # get_hyphen() runs before get_mp3(), as in tuple evaluation order.
    hyphen = get_hyphen(doc)
    return (word, hyphen, get_mp3(doc))

Beispiel #8

0

Datei anzeigen

def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images.

    Downloads and parses the page at `url`, collects its subpages via
    get_subpages() and returns whatever extract_images_from_pages() yields
    for them.
    """
    parsed = lx.to_doc(get_page(url))
    pages = get_subpages(parsed)
    return extract_images_from_pages(pages)

Beispiel #9

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo6():
    """Print the stripped text of each <li> in a small sample list.

    The sample markup deliberately contains an <li> without a closing tag
    ("def"); the parsed document is queried with the CSS selector "ul li".
    """
    text = """<ul>
<li>abc</li>
<li>def
<li>ghi</li>
</ul>"""
    doc = lx.to_doc(text)
    for li in doc.cssselect("ul li"):
        # Parenthesised single-argument print: identical output on Python 2,
        # and no longer a SyntaxError on Python 3.
        print(li.text.strip())

Beispiel #10

0

Datei anzeigen

Datei: weather.py Projekt: ThePenguin1140/jabbapylib

def process(url):
    """Fetch the page at `url` and print two values scraped from it.

    Printed, in order:
      * the text of the first 'h1#locationName.brTopLeft5' element
        (held in `city`);
      * the text content of the first
        'div#tempActual span.pwsrt span.nobr' element (held in `celsius`).

    Raises IndexError if either selector matches nothing.
    """
    text = get_page(url, user_agent=True)
    doc = lx.to_doc(text)
    # Debugging aid: lx.show_paths(doc, find='Montreal, Quebec')
    tag = doc.cssselect('h1#locationName.brTopLeft5')[0]
    city = tag.text
    # print(x) with one argument works on both Python 2 and Python 3.
    print(city)
    tag = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0]
    celsius = tag.text_content()
    print(celsius)

Beispiel #11

0

Datei anzeigen

Datei: weather.py Projekt: the7day/jabbapylib

def process(url):
    """Download `url`, then print the location heading and the temperature.

    The location is the text of the first 'h1#locationName.brTopLeft5'
    element; the temperature is the text content of the first
    'div#tempActual span.pwsrt span.nobr' element (stored in `celsius`).
    An IndexError is raised when a selector has no match.
    """
    text = get_page(url, user_agent=True)
    doc = lx.to_doc(text)
    # Debugging aid: lx.show_paths(doc, find='Montreal, Quebec')
    tag = doc.cssselect('h1#locationName.brTopLeft5')[0]
    city = tag.text
    # Single-argument print(...) is valid on Python 2 and Python 3 alike.
    print(city)
    tag = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0]
    celsius = tag.text_content()
    print(celsius)

Beispiel #12

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo8():
    """Fetch http://python.org/ and print the text of its <title> element.

    The page is parsed with the scraper.BEAUTIFULSOUP parser. The other
    options would be parser=scraper.HTML5PARSER, or no parser argument at
    all.
    """
    url = "http://python.org/"
    text = get_page(url)
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # Debugging aids: print(type(doc)) / print(etree.tostring(doc))
    title = doc.cssselect("html head title")[0]
    # Parenthesised so the line is valid on Python 3 as well as Python 2.
    print(title.text)

Beispiel #13

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo5():
    """Run lx.show_paths on a small sample table holding two URLs."""
    sample = """
<html>
    <table>
        <tr><td>http://google.ca</td></tr>
        <tr><td>http://reddit.com</td></tr>
    </table>
</html>
"""
    lx.show_paths(lx.to_doc(sample))

Beispiel #14

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo4():
    """Apply lx.autolink to a sample table of plain-text URLs and print it.

    The sample is parsed, passed through lx.autolink, and the result of
    lx.prettify on the returned document is printed.
    """
    text = """
<html>
    <table>
        <tr><td>http://google.ca</td></tr>
        <tr><td>http://reddit.com</td></tr>
    </table>
</html>
"""
    doc = lx.to_doc(text)
    doc = lx.autolink(doc)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(lx.prettify(doc))

Beispiel #15

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo1():
    """Demonstrate CSS selection on a small sample page.

    Prints the text of the first "tr td" cell inside the first table, then
    the href attribute of the first link that has one.
    """
    text = """
<html>
    <table>
        <tr><td>Header</td></tr>
        <tr><td>Want This</td></tr>
    </table>
    <a href="http://google.ca">Google.ca</a>
</html>
"""
    doc = lx.to_doc(text)
    row1 = doc.cssselect("table")[0]
    # Parenthesised single-argument prints run on Python 2 and Python 3.
    print(row1.cssselect("tr td")[0].text)
    print(doc.cssselect("a[href]")[0].get("href"))

Beispiel #16

0

Datei anzeigen

def test_prettify():
    """Check lx.prettify with each supported method on ex.UGLY.

    The LXML_HTML, BEAUTIFULSOUP and TIDY methods must each return text that
    contains both '</h1>' and '</html>'; method=None must return None.
    """
    doc = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)

    # scraper.HTML5PARSER is not exercised here (noted as missing).
    for how in (scraper.LXML_HTML, scraper.BEAUTIFULSOUP, scraper.TIDY):
        pretty = lx.prettify(doc, method=how)
        assert '</h1>' in pretty and '</html>' in pretty

    assert lx.prettify(doc, method=None) is None

Beispiel #17

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_prettify():
    """Verify the output of lx.prettify for every method used by the tests.

    ex.UGLY is parsed once with the LXML_HTML parser. Each of the methods
    LXML_HTML, BEAUTIFULSOUP and TIDY must give output containing '</h1>'
    and '</html>'. An unknown method (None) must give None.
    """
    parsed = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)
    methods = [scraper.LXML_HTML, scraper.BEAUTIFULSOUP, scraper.TIDY]
    # Note: method=scraper.HTML5PARSER is missing, so it is not tested.

    for method in methods:
        result = lx.prettify(parsed, method=method)
        assert '</h1>' in result and '</html>' in result

    result = lx.prettify(parsed, method=None)
    assert result is None

Beispiel #18

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo3():
    """Pretty-print a sample page containing script, style and onload content.

    The sample is parsed with the default parser and the result of
    lx.prettify(..., method=scraper.BEAUTIFULSOUP) is printed.
    """
    html = """<html>
  <head>
    <script type="text/javascript" src="stuff.js"></script>
    <link rel="alternate" type="text/rss" src="some-rss">
    <style>
        body {background-image: url(javascript:do_something)};
        div {color: expression(something)};
    </style>
  </head>
  <body onload="some_function()">
     Hello World!
  </body>
 </html>"""
    doc = lx.to_doc(html)
    # Parenthesised so it is valid on Python 3; unchanged output on Python 2.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #19

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_show_paths():
    """Check what lx.show_paths prints for ex.HTML_1.

    With find=None the output must contain the path of the 'Want This' cell;
    with find='Google.ca' it must contain the path of the link.
    """
    doc = lx.to_doc(ex.HTML_1)

    def printed_by_show_paths(find):
        """Return everything lx.show_paths(doc, find=find) writes to stdout."""
        buf = StringIO()
        saved_stdout = sys.stdout
        sys.stdout = buf
        try:
            lx.show_paths(doc, find=find)
            return buf.getvalue()
        finally:
            # Always restore the real stdout, even if show_paths raises, so
            # a failure here cannot swallow the output of later tests.
            sys.stdout = saved_stdout
            buf.close()

    # Assertions run with the real stdout back in place.
    assert ("'Want This' => /html/body/table/tr[2]/td"
            in printed_by_show_paths(None))
    assert "'Google.ca' => /html/body/a" in printed_by_show_paths('Google.ca')

Beispiel #20

0

Datei anzeigen

def test_show_paths():
    """Check the text lx.show_paths writes to stdout for ex.HTML_1.

    stdout is temporarily replaced by an in-memory buffer; it is restored in
    a ``finally`` clause so a failing assertion cannot leave it redirected.
    """
    doc = lx.to_doc(ex.HTML_1)

    old_stdout = sys.stdout
    try:
        # all paths
        buf = StringIO()
        sys.stdout = buf
        lx.show_paths(doc, find=None)
        everything = buf.getvalue()
        buf.close()

        # only the path of one given text
        buf = StringIO()
        sys.stdout = buf
        lx.show_paths(doc, find='Google.ca')
        only_link = buf.getvalue()
        buf.close()
    finally:
        sys.stdout = old_stdout

    assert "'Want This' => /html/body/table/tr[2]/td" in everything
    assert "'Google.ca' => /html/body/a" in only_link

Beispiel #21

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo7():
    """Show paths, CSS selection and the CSS-to-XPath translation on a sample.

    Steps: print all paths of the parsed sample, print the text of every
    element matching "div#content ul li", print the XPath that
    lx.css_to_xpath gives for the same selector, then call
    lx.open_in_browser on the document.
    """
    # NOTE(review): the opening "<body" tag below has no closing ">" --
    # possibly intentional malformed input; confirm before "fixing" it.
    text = """<html>
 <body
  <div></div>
  <div id="content">
   <ul>
    <li>First item</li>
    <li>Second item</li>
   </ul>
  </div>
 </body>
</html>"""
    doc = lx.to_doc(text)
    lx.show_paths(doc)
    for tag in doc.cssselect("div#content ul li"):
        # Single-argument print(...) is valid on Python 2 and Python 3.
        print(tag.text)
    print(lx.css_to_xpath("div#content ul li"))
    lx.open_in_browser(doc)

Beispiel #22

0

Datei anzeigen

Datei: lx_simple.py Projekt: jeffreywinn/jabbapylib

def demo2():
    """Fetch http://projecteuler.net/, absolutize its links and print it.

    lx.make_links_absolute is called with the page URL as base_url, and the
    document is then serialized with lx.tostring and printed.
    """
    url = "http://projecteuler.net/"
    text = get_page(url)
    doc = lx.to_doc(text)
    lx.make_links_absolute(doc, base_url=url)
    # Parenthesised so the statement also parses on Python 3.
    print(lx.tostring(doc))

Beispiel #23

0

Datei anzeigen

def test_flatten():
    """lx.flatten on ex.HTML_1 must give the concatenated text of the page."""
    flat = lx.flatten(lx.to_doc(ex.HTML_1))
    assert flat == 'HeaderWant ThisGoogle.ca\n'

Beispiel #24

0

Datei anzeigen

def demo3():
    """Parse the module-level `text` with BEAUTIFULSOUP and pretty-print it.

    The output is produced by lx.prettify with method=scraper.BEAUTIFULSOUP.
    """
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #25

0

Datei anzeigen

def demo2():
    """Parse the module-level `text` with HTML5PARSER and pretty-print it.

    The output is produced by lx.prettify with method=scraper.BEAUTIFULSOUP.
    """
    doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #26

0

Datei anzeigen

def demo1():
    """Parse the module-level `text` with the default parser and pretty-print it.

    The output is produced by lx.prettify with method=scraper.BEAUTIFULSOUP.
    """
    doc = lx.to_doc(text)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #27

0

Datei anzeigen

"""
Demo for lx.py.
Download population of countries.
"""

import re

from jabbapylib.web.scraper import lx
from jabbapylib.web.web import get_page


def process(doc):
    """Build a {country: population} dict from table rows and print it.

    For every 'tr' in `doc` that has 'td' cells and whose first cell's text
    consists only of digits (the rank), the country is the text of the first
    'a[title]' link in the second cell and the population is the third
    cell's text with commas removed, converted to int.

    `doc` must offer cssselect(); presumably an lxml element as returned by
    lx.to_doc -- confirm against the caller.
    """
    data = {}

    for row in doc.cssselect('tr'):
        cols = row.cssselect('td')
        if not cols:
            continue    # e.g. header rows that have no <td> cells

        rank = cols[0].text
        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # (a warning on current Python 3). The pattern itself is unchanged.
        if not (rank and re.search(r'^\d+$', rank)):
            continue

        country = cols[1].cssselect('a[title]')[0].text
        population = int(cols[2].text.replace(',', ''))
        data[country] = population

    # Single-argument print(...) is valid on Python 2 and Python 3.
    print(data)

#############################################################################

if __name__ == "__main__":
    # Script entry point: download the population list, parse it and hand
    # the parsed document over to process().
    url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population'
    process(lx.to_doc(get_page(url)))

Beispiel #28

0

Datei anzeigen

def test_autolink():
    """lx.autolink must wrap the plain-text URL of ex.TEXT in an <a> tag."""
    linked = lx.autolink(lx.to_doc(ex.TEXT))
    expected = '<a href="http://retrogames.com/games/commando">http://retrogames.com/games/commando</a>'
    assert expected in lx.tostring(linked)

Beispiel #29

0

Datei anzeigen

Datei: lx_parsers.py Projekt: ThePenguin1140/jabbapylib

def demo2():
    """Parse the module-level `text` with HTML5PARSER and pretty-print it.

    The output is produced by lx.prettify with method=scraper.BEAUTIFULSOUP.
    """
    doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # Parenthesised so it parses on Python 3; same output on Python 2.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #30

0

Datei anzeigen

Datei: lx_parsers.py Projekt: ThePenguin1140/jabbapylib

def demo1():
    """Parse the module-level `text` with the default parser and pretty-print it.

    The output is produced by lx.prettify with method=scraper.BEAUTIFULSOUP.
    """
    doc = lx.to_doc(text)
    # Parenthesised so it parses on Python 3; same output on Python 2.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #31

0

Datei anzeigen

def test_tostring():
    """lx.tostring on ex.HTML_1 must return a non-empty value of type str."""
    serialized = lx.tostring(lx.to_doc(ex.HTML_1))
    # Exact type check (not isinstance), as the test has always required.
    assert type(serialized) is str and len(serialized) > 0

Beispiel #32

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_autolink():
    """After lx.autolink, the serialized ex.TEXT must contain the linked URL."""
    tree = lx.to_doc(ex.TEXT)
    tree = lx.autolink(tree)
    serialized = lx.tostring(tree)
    anchor = '<a href="http://retrogames.com/games/commando">http://retrogames.com/games/commando</a>'
    assert anchor in serialized

Beispiel #33

0

Datei anzeigen

def test_make_links_absolute():
    """Links of ex.LINKS must become absolute under http://retrogames.com."""
    absolutized = lx.make_links_absolute(lx.to_doc(ex.LINKS),
                                         base_url='http://retrogames.com')
    html = lx.tostring(absolutized)
    for expected in ("http://retrogames.com/games/elite",
                     "http://retrogames.com/games/commando"):
        assert expected in html

Beispiel #34

0

Datei anzeigen

Datei: test_bs.py Projekt: the7day/jabbapylib

def test_doc_to_soup():
    """bs.doc_to_soup must turn a parsed ex.HTML_1 into a BeautifulSoup."""
    converted = bs.doc_to_soup(lx.to_doc(ex.HTML_1))
    assert isinstance(converted, BeautifulSoup)

Beispiel #35

0

Datei anzeigen

Datei: lx_parsers.py Projekt: ThePenguin1140/jabbapylib

def demo3():
    """Parse the module-level `text` with BEAUTIFULSOUP and pretty-print it.

    The output is produced by lx.prettify with method=scraper.BEAUTIFULSOUP.
    """
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # Parenthesised so it parses on Python 3; same output on Python 2.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))

Beispiel #36

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_flatten():
    """Flattening ex.HTML_1 must yield exactly the expected text."""
    expected = 'HeaderWant ThisGoogle.ca\n'
    parsed = lx.to_doc(ex.HTML_1)
    assert lx.flatten(parsed) == expected

Beispiel #37

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_make_links_absolute():
    """Both links of ex.LINKS must appear in absolute form after conversion."""
    tree = lx.to_doc(ex.LINKS)
    tree = lx.make_links_absolute(tree, base_url='http://retrogames.com')
    serialized = lx.tostring(tree)

    wanted = ["http://retrogames.com/games/elite",
              "http://retrogames.com/games/commando"]
    for link in wanted:
        assert link in serialized

Beispiel #38

0

Datei anzeigen

Datei: test_bs.py Projekt: jeffreywinn/jabbapylib

def test_doc_to_soup():
    """The result of bs.doc_to_soup on ex.HTML_1 is a BeautifulSoup object."""
    parsed = lx.to_doc(ex.HTML_1)
    result = bs.doc_to_soup(parsed)
    assert isinstance(result, BeautifulSoup)

Beispiel #39

0

Datei anzeigen

Datei: test_lx.py Projekt: ThePenguin1140/jabbapylib

def test_tostring():
    """Serializing ex.HTML_1 with lx.tostring gives a non-empty str."""
    parsed = lx.to_doc(ex.HTML_1)
    text_out = lx.tostring(parsed)
    # The type must be exactly str, and the text must not be empty.
    assert type(text_out) is str and len(text_out) > 0