Beispiel #1
0
def test_to_doc():
    """Check lx.to_doc() with every parser, on a whole page and on a fragment.

    Each real parser must produce an lxml.html.HtmlElement, while
    parser=None must produce None.
    """
    # scraper.LXML_HTML is the default parser, so it could also be omitted
    parsers = (scraper.LXML_HTML, scraper.HTML5PARSER, scraper.BEAUTIFULSOUP)

    # whole documents
    for parser in parsers:
        assert isinstance(lx.to_doc(ex.HTML_1, parser), lxml.html.HtmlElement)
    assert lx.to_doc(ex.HTML_1, parser=None) is None

    # now the same with HTML fragments
    for parser in parsers:
        fragment = lx.to_doc(ex.FRAGMENT, parser, whole_doc=False)
        assert isinstance(fragment, lxml.html.HtmlElement)
    assert lx.to_doc(ex.FRAGMENT, parser=None, whole_doc=False) is None
Beispiel #2
0
def test_to_doc():
    """Verify the return type of lx.to_doc() for all parsers.

    The check runs first on the full document ex.HTML_1 and then on
    ex.FRAGMENT with whole_doc=False. A real parser has to give back an
    lxml.html.HtmlElement; passing parser=None has to give back None.
    """
    # (input, extra keyword arguments) in the order the checks are performed
    cases = (
        (ex.HTML_1, {}),
        (ex.FRAGMENT, {'whole_doc': False}),    # HTML fragments
    )
    for source, extra in cases:
        # LXML_HTML is the default parser; it is spelled out for symmetry
        for parser in (scraper.LXML_HTML,
                       scraper.HTML5PARSER,
                       scraper.BEAUTIFULSOUP):
            doc = lx.to_doc(source, parser, **extra)
            assert isinstance(doc, lxml.html.HtmlElement)
        #
        doc = lx.to_doc(source, parser=None, **extra)
        assert doc is None
def get_image_url_list(url):
    """Controller function: return the JPG image URLs reachable from `url`.

    The page is downloaded and parsed, its subpages are collected with
    get_subpages(), and the images are gathered from those subpages.
    """
    parsed = lx.to_doc(get_page(url))
    return extract_images_from_pages(get_subpages(parsed))
Beispiel #4
0
def process(word):
    """Look up `word` and return what was found about it.

    The page address is built from the module-level `_template`.
    Returns a tuple: (word, hyphenation, pronunciation mp3).
    """
    page = web.get_page(_template.format(word=word), user_agent=True)
    tree = lx.to_doc(page)
    hyphenation = get_hyphen(tree)
    mp3 = get_mp3(tree)
    return (word, hyphenation, mp3)
Beispiel #5
0
def extract_images_from_pages(pages):
    """Extract images from subpages.

    Every page is fetched and parsed, then get_jpg_image() is applied to it.
    Falsy results (e.g. None) are left out of the returned list.
    """
    found = (get_jpg_image(lx.to_doc(get_page(address))) for address in pages)
    return [image for image in found if image]
def extract_images_from_pages(pages):
    """Collect the JPG image of each subpage in `pages`.

    Pages for which get_jpg_image() gives a falsy value (such as None)
    contribute nothing to the result.
    """
    images = []
    for subpage in pages:
        candidate = get_jpg_image(lx.to_doc(get_page(subpage)))
        if candidate:    # drop None elems right away
            images.append(candidate)
    return images
Beispiel #7
0
def process(word):
    """Process the given word.

    Downloads the page whose URL is produced by `_template` for this word
    and returns the tuple (word, hyphenation, pronunciation mp3).
    """
    address = _template.format(word=word)
    parsed = lx.to_doc(web.get_page(address, user_agent=True))
    return (word, get_hyphen(parsed), get_mp3(parsed))
Beispiel #8
0
def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images.

    Steps: fetch `url`, parse it, find its subpages, pull the images
    out of those subpages.
    """
    html = get_page(url)
    tree = lx.to_doc(html)
    pages = get_subpages(tree)
    return extract_images_from_pages(pages)
Beispiel #9
0
def demo6():
    """Print the stripped text of every <li> in a list with an unclosed item.

    The second <li> of the sample markup has no closing tag.
    """
    text = """<ul>
<li>abc</li>
<li>def
<li>ghi</li>
</ul>"""
    doc = lx.to_doc(text)
    for li in doc.cssselect("ul li"):
        # parenthesized single-argument print works on Python 2 and Python 3
        print(li.text.strip())
Beispiel #10
0
def process(url):
    """Download `url` and print two values picked out with CSS selectors.

    The first is the text of the 'h1#locationName.brTopLeft5' element, the
    second is the text content of the
    'div#tempActual span.pwsrt span.nobr' element.
    An IndexError is raised if either selector matches nothing.
    """
    text = get_page(url, user_agent=True)
    doc = lx.to_doc(text)
    # debugging aid: lx.show_paths(doc, find='Montreal, Quebec')
    tag = doc.cssselect('h1#locationName.brTopLeft5')[0]
    city = tag.text
    # parenthesized single-argument print works on Python 2 and Python 3
    print(city)
    tag = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0]
    celsius = tag.text_content()
    print(celsius)
Beispiel #11
0
def process(url):
    """Fetch the page at `url` and print the location name and temperature text.

    Both values are located with fixed CSS selectors; the first match of
    each selector is used, so an IndexError is raised when a selector
    matches nothing.
    """
    doc = lx.to_doc(get_page(url, user_agent=True))
    # debugging aid: lx.show_paths(doc, find='Montreal, Quebec')
    city = doc.cssselect('h1#locationName.brTopLeft5')[0].text
    # print(x) with one argument behaves the same on Python 2 and Python 3
    print(city)
    celsius = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0].text_content()
    print(celsius)
Beispiel #12
0
def demo8():
    """Download python.org, parse it with BeautifulSoup and print its <title>."""
    url = "http://python.org/"
    text = get_page(url)
    # other parsers that could be tried here:
    #   lx.to_doc(text, parser=scraper.HTML5PARSER)
    #   lx.to_doc(text)
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    title = doc.cssselect("html head title")[0]
    # parenthesized single-argument print works on Python 2 and Python 3
    print(title.text)
Beispiel #13
0
def demo5():
    """Parse a small table of URLs and hand the document to lx.show_paths()."""
    markup = """
<html>
    <table>
        <tr><td>http://google.ca</td></tr>
        <tr><td>http://reddit.com</td></tr>
    </table>
</html>
"""
    lx.show_paths(lx.to_doc(markup))
Beispiel #14
0
def demo4():
    """Run lx.autolink() on a table of bare URLs and print the prettified result."""
    text = """
<html>
    <table>
        <tr><td>http://google.ca</td></tr>
        <tr><td>http://reddit.com</td></tr>
    </table>
</html>
"""
    doc = lx.to_doc(text)
    doc = lx.autolink(doc)
    # parenthesized single-argument print works on Python 2 and Python 3
    print(lx.prettify(doc))
Beispiel #15
0
def demo1():
    """Show basic CSS selection: print a table cell and a link target.

    Prints the text of the first <td> of the first table, then the href
    attribute of the first <a> element that has one.
    """
    text = """
<html>
    <table>
        <tr><td>Header</td></tr>
        <tr><td>Want This</td></tr>
    </table>
    <a href="http://google.ca">Google.ca</a>
</html>
"""
    doc = lx.to_doc(text)
    row1 = doc.cssselect("table")[0]
    # parenthesized single-argument print works on Python 2 and Python 3
    print(row1.cssselect("tr td")[0].text)
    print(doc.cssselect("a[href]")[0].get("href"))
Beispiel #16
0
def test_prettify():
    """Check lx.prettify() for each method on the document built from ex.UGLY.

    Every real method must keep the closing </h1> and </html> tags in its
    output; method=None must give None.
    """
    doc = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)
    # scraper.HTML5PARSER is not exercised: the original test marks it "missing"
    for method in (scraper.LXML_HTML, scraper.BEAUTIFULSOUP, scraper.TIDY):
        nice = lx.prettify(doc, method=method)
        assert '</h1>' in nice and '</html>' in nice
    #
    assert lx.prettify(doc, method=None) is None
Beispiel #17
0
def test_prettify():
    """Exercise lx.prettify() with the LXML_HTML, BEAUTIFULSOUP and TIDY methods.

    The prettified text has to contain '</h1>' and '</html>'. With
    method=None the function has to return None.
    """
    def has_closing_tags(pretty):
        return '</h1>' in pretty and '</html>' in pretty

    tree = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)
    #
    assert has_closing_tags(lx.prettify(tree, method=scraper.LXML_HTML))
    #
    # method=scraper.HTML5PARSER is flagged as "missing" and therefore skipped
    #
    assert has_closing_tags(lx.prettify(tree, method=scraper.BEAUTIFULSOUP))
    #
    assert has_closing_tags(lx.prettify(tree, method=scraper.TIDY))
    #
    result = lx.prettify(tree, method=None)
    assert result is None
Beispiel #18
0
def demo3():
    """Parse a page with script, style and onload content, then print it.

    The document is prettified with the BeautifulSoup method.
    """
    html = """<html>
  <head>
    <script type="text/javascript" src="stuff.js"></script>
    <link rel="alternate" type="text/rss" src="some-rss">
    <style>
        body {background-image: url(javascript:do_something)};
        div {color: expression(something)};
    </style>
  </head>
  <body onload="some_function()">
     Hello World!
  </body>
 </html>"""
    doc = lx.to_doc(html)
    # parenthesized single-argument print works on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #19
0
def test_show_paths():
    """Check the text lx.show_paths() writes to stdout, with and without `find`.

    sys.stdout is swapped for an in-memory buffer while show_paths() runs
    and is always put back, even if show_paths() raises. The assertions
    run only after the real stdout has been restored.
    """
    doc = lx.to_doc(ex.HTML_1)

    def captured(find):
        """Return what lx.show_paths(doc, find=find) prints."""
        old_stdout = sys.stdout
        buf = StringIO()
        sys.stdout = buf
        try:
            lx.show_paths(doc, find=find)
            return buf.getvalue()
        finally:
            # restore first, so a failure can never leave stdout redirected
            sys.stdout = old_stdout
            buf.close()

    #
    assert "'Want This' => /html/body/table/tr[2]/td" in captured(None)
    #
    assert "'Google.ca' => /html/body/a" in captured('Google.ca')
Beispiel #20
0
def test_show_paths():
    """Verify the path listing printed by lx.show_paths() for ex.HTML_1.

    Output is captured by temporarily replacing sys.stdout. The
    replacement is undone in a `finally` clause, so a failing call or
    assertion cannot leave stdout pointing at a buffer.
    """
    doc = lx.to_doc(ex.HTML_1)

    old_stdout = sys.stdout
    full_buf = StringIO()
    find_buf = StringIO()
    try:
        sys.stdout = full_buf
        lx.show_paths(doc, find=None)
        #
        sys.stdout = find_buf
        lx.show_paths(doc, find='Google.ca')
        #
        # read the buffers before they are closed below
        full_listing = full_buf.getvalue()
        find_listing = find_buf.getvalue()
    finally:
        sys.stdout = old_stdout
        full_buf.close()
        find_buf.close()

    assert "'Want This' => /html/body/table/tr[2]/td" in full_listing
    assert "'Google.ca' => /html/body/a" in find_listing
Beispiel #21
0
def demo7():
    """Demonstrate show_paths, cssselect, css_to_xpath and open_in_browser.

    Prints the paths of the sample document, the text of each list item
    under div#content, and the XPath equivalent of that CSS selector.
    Finally the document is passed to lx.open_in_browser().
    """
    # NOTE(review): "<body" below has no closing ">" - kept exactly as
    # written, since it may be deliberate input for the parser; confirm
    text = """<html>
 <body
  <div></div>
  <div id="content">
   <ul>
    <li>First item</li>
    <li>Second item</li>
   </ul>
  </div>
 </body>
</html>"""
    doc = lx.to_doc(text)
    lx.show_paths(doc)
    for tag in doc.cssselect("div#content ul li"):
        # parenthesized single-argument print works on Python 2 and Python 3
        print(tag.text)
    print(lx.css_to_xpath("div#content ul li"))
    lx.open_in_browser(doc)
Beispiel #22
0
def demo2():
    """Download projecteuler.net, make its links absolute and print the HTML."""
    url = "http://projecteuler.net/"
    text = get_page(url)
    doc = lx.to_doc(text)
    # NOTE(review): the return value is ignored here, so this relies on
    # make_links_absolute changing `doc` in place - confirm against lx.py
    lx.make_links_absolute(doc, base_url=url)
    # parenthesized single-argument print works on Python 2 and Python 3
    print(lx.tostring(doc))
Beispiel #23
0
def test_flatten():
    """lx.flatten() of ex.HTML_1 must give the concatenated text of the page."""
    flat = lx.flatten(lx.to_doc(ex.HTML_1))
    assert flat == 'HeaderWant ThisGoogle.ca\n'
Beispiel #24
0
def demo3():
    """Parse `text` with the BeautifulSoup parser and print it prettified.

    `text` is not defined here; it must exist in the enclosing scope.
    """
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # parenthesized single-argument print works on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #25
0
def demo2():
    """Parse `text` with the html5 parser and print it prettified.

    `text` is not defined here; it must exist in the enclosing scope.
    The output is prettified with the BeautifulSoup method.
    """
    doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # parenthesized single-argument print works on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #26
0
def demo1():
    """Parse `text` with the default parser and print it prettified.

    `text` is not defined here; it must exist in the enclosing scope.
    The output is prettified with the BeautifulSoup method.
    """
    doc = lx.to_doc(text)
    # parenthesized single-argument print works on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #27
0
Demo for lx.py.
Download population of countries.
"""

import re

from jabbapylib.web.scraper import lx
from jabbapylib.web.web import get_page


def process(doc):
    """Print a dict mapping country name to population, read from table rows.

    Every <tr> of `doc` is inspected. A row is used only when it has <td>
    cells and its first cell's text consists solely of digits (the rank).
    The country name is the text of the first a[title] link in the second
    cell; the population is the third cell's text with commas removed.

    Raises IndexError, AttributeError or ValueError if a ranked row does
    not have that layout.
    """
    data = {}

    for row in doc.cssselect('tr'):
        cols = row.cssselect('td')
        if not cols:
            continue    # rows without <td> cells, e.g. header rows
        rank = cols[0].text
        # raw string: '\d' in a plain literal is an invalid escape sequence
        if rank and re.search(r'^\d+$', rank):
            country = cols[1].cssselect('a[title]')[0].text
            population = int(cols[2].text.replace(',', ''))
            data[country] = population

    # parenthesized single-argument print works on Python 2 and Python 3
    print(data)

#############################################################################

if __name__ == "__main__":
    # Script entry point: download the Wikipedia page, parse it and let
    # process() print the population figures.
    url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population'
    process(lx.to_doc(get_page(url)))
Beispiel #28
0
def test_autolink():
    """lx.autolink() must wrap the bare URL found in ex.TEXT in an <a> element."""
    linked = lx.autolink(lx.to_doc(ex.TEXT))
    expected = '<a href="http://retrogames.com/games/commando">http://retrogames.com/games/commando</a>'
    assert expected in lx.tostring(linked)
Beispiel #29
0
def demo2():
    """Print `text` after parsing it with the html5 parser.

    `text` comes from the enclosing scope. The parsed document is
    prettified with the BeautifulSoup method before printing.
    """
    doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # print(x) with one argument behaves the same on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #30
0
def demo1():
    """Print `text` after parsing it with the default parser.

    `text` comes from the enclosing scope. The parsed document is
    prettified with the BeautifulSoup method before printing.
    """
    doc = lx.to_doc(text)
    # print(x) with one argument behaves the same on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #31
0
def test_tostring():
    """lx.tostring() must serialize ex.HTML_1 to a non-empty plain str."""
    serialized = lx.tostring(lx.to_doc(ex.HTML_1))
    # exact type check kept on purpose: subclasses of str are not accepted
    assert type(serialized) is str
    assert len(serialized) > 0
Beispiel #32
0
def test_autolink():
    """Check that lx.autolink() turns the plain URL of ex.TEXT into a link."""
    tree = lx.to_doc(ex.TEXT)
    tree = lx.autolink(tree)
    markup = lx.tostring(tree)
    anchor = '<a href="http://retrogames.com/games/commando">http://retrogames.com/games/commando</a>'
    assert anchor in markup
Beispiel #33
0
def test_make_links_absolute():
    """Links of ex.LINKS must be rewritten against the given base URL."""
    absolute = lx.make_links_absolute(lx.to_doc(ex.LINKS),
                                      base_url='http://retrogames.com')
    html = lx.tostring(absolute)
    for expected in ("http://retrogames.com/games/elite",
                     "http://retrogames.com/games/commando"):
        assert expected in html
Beispiel #34
0
def test_doc_to_soup():
    """bs.doc_to_soup() must turn a parsed document into a BeautifulSoup object."""
    converted = bs.doc_to_soup(lx.to_doc(ex.HTML_1))
    assert isinstance(converted, BeautifulSoup)
Beispiel #35
0
def demo3():
    """Print `text` after parsing it with the BeautifulSoup parser.

    `text` comes from the enclosing scope. The parsed document is
    prettified with the BeautifulSoup method before printing.
    """
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # print(x) with one argument behaves the same on Python 2 and Python 3
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
Beispiel #36
0
def test_flatten():
    """Check the flattened text that lx.flatten() produces for ex.HTML_1."""
    expected = 'HeaderWant ThisGoogle.ca\n'
    tree = lx.to_doc(ex.HTML_1)
    assert lx.flatten(tree) == expected
Beispiel #37
0
def test_make_links_absolute():
    """Check that lx.make_links_absolute() prefixes the links of ex.LINKS.

    Both game links must show up in the serialized document with the
    'http://retrogames.com' base in front of them.
    """
    tree = lx.to_doc(ex.LINKS)
    tree = lx.make_links_absolute(tree, base_url='http://retrogames.com')
    serialized = lx.tostring(tree)
    elite = "http://retrogames.com/games/elite"
    commando = "http://retrogames.com/games/commando"
    assert elite in serialized
    assert commando in serialized
Beispiel #38
0
def test_doc_to_soup():
    """Check that the result of bs.doc_to_soup() is a BeautifulSoup instance."""
    tree = lx.to_doc(ex.HTML_1)
    result = bs.doc_to_soup(tree)
    assert isinstance(result, BeautifulSoup)
Beispiel #39
0
def test_tostring():
    """Check that lx.tostring() gives a non-empty value whose type is exactly str."""
    tree = lx.to_doc(ex.HTML_1)
    text = lx.tostring(tree)
    is_plain_str = type(text) is str
    assert is_plain_str and len(text) > 0