Example #1
def say_with_google(word, autoremove=True, background=False, debug=False):
    """
    Say a word with Google.

    https://ubuntuincident.wordpress.com/2012/03/27/audio-pronunciation-of-words-from-google/
    The return value is a tuple: (found, mp3_file), where
    found is True if the word was retrieved successfully (False otherwise), and
    mp3_file is the path of the locally saved mp3 (or None if it was not saved).
    Set autoremove to False if you want to work with the mp3 after this
    function returns.
    The function stores the mp3 files in /tmp.
    """
    found = False  # Was the mp3 successfully found?
    mp3_file = None  # Is the locally saved mp3 file kept?
    url = template.format(word=word)
    content = web.get_page(url, user_agent=True)
    if content:
        found = True
        fname = '/tmp/{word}.mp3'.format(word=word)
        fs.store_content_in_file(content, fname, overwrite=True)
        mp3_file = fname
        if not debug:
            play(fname, background=background)
        if autoremove:
            os.unlink(fname)
            mp3_file = None
    else:
        found = False
        mp3_file = None

    return (found, mp3_file)
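A minimal usage sketch (assuming this module's imports are in place; the word is hypothetical, and autoremove=False keeps the mp3 in /tmp as the docstring describes):

found, mp3 = say_with_google('hello', autoremove=False)
if found:
    print mp3    # e.g. /tmp/hello.mp3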
Example #2
def get_my_external_ip():
    """
    Get my external IP.
    
    Local IP: http://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
    """
    return get_page('http://ifconfig.me/ip')
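A usage sketch (prints whatever ifconfig.me returns, typically the external IP as plain text):

print get_my_external_ip()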
Example #3
def extract_image_urls(url):
    origin = None
    li = []

    text = get_page(url, user_agent=True, referer=True)
    soup = bs.to_soup(text)

    #    this version worked for a day:
    #    for pic in soup.findCssSelect('div.pic'):
    #        a = pic.find('a', href=True)
    #        if a:
    #            li.append(a['href'])

    #   here is a new version, updated to the changes
    for div in soup.findCssSelect("div.pic"):
        img = div.find("img")
        if img and img.has_key("src"):
            li.append(img["src"].replace("/small/", "/large/"))

    for div in soup.findCssSelect("html body form#aspnetForm div#main div"):
        result = re.search(r"URL: (http://.*)View full images", div.text)
        if result:
            origin = result.group(1)

    return origin, li
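A hedged usage sketch (the gallery URL is hypothetical; origin stays None if the regex finds no match):

origin, urls = extract_image_urls('http://example.com/gallery')
for u in urls:
    print u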
Example #4
def visit(blog, dic):
    url = 'http://{name}.wordpress.com'.format(name=blog)
    text = get_page(url)
    soup = bs.to_soup(text)
    hits = soup.findCssSelect('div#blog-stats ul li')[0].text
    hits = int(hits.replace('hits','').replace(',','').strip())
    #
    dic[url] = hits
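A usage sketch (the blog name is hypothetical; visit stores the hit count in the dict it is given):

dic = {}
visit('exampleblog', dic)
print dic    # {'http://exampleblog.wordpress.com': <hits>}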
Example #5
def setup_module(module):
    """runs just once per module"""
    global GOOGLE_HTML
    GOOGLE_HTML = web.get_page(GOOGLE)
    try:
        os.unlink(cfg.TEST_TMP_FILE)
    except OSError:
        pass    # the file may not have existed
Example #6
def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images."""
    text = get_page(url)
    doc = lx.to_doc(text)
    
    subpages = get_subpages(doc)
    images = extract_images_from_pages(subpages)
    
    return images
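A usage sketch (the start URL is hypothetical; get_subpages and extract_images_from_pages are defined elsewhere in the same module):

for img in get_image_url_list('http://example.com/albums'):
    print img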
Example #7
def main(url):
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html, method=web.HTML2TEXT)

    #txt = ascii.unicode_to_ascii(txt)
    #txt = txt.replace(u'\xb7', '-')
    #txt = ascii.remove_non_ascii(txt).encode('ascii')

    print_result(txt)
Example #8
def extract_images_from_pages(pages):
    """Extract images from subpages."""
    li = []
    for page in pages:
        doc = lx.to_doc(get_page(page))
        image = get_jpg_image(doc)
        li.append(image)

    return [x for x in li if x]  # remove None elems
Example #9
def process(word):
    """Process the given word.
    
    The return value is a tuple: (word, hyphenation, pronunciation mp3)."""
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    doc = lx.to_doc(html)
    
    return (word, get_hyphen(doc), get_mp3(doc))
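A usage sketch based on the docstring's return tuple (the word is hypothetical):

word, hyphen, mp3 = process('hello')
print hyphen
print mp3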
Example #10
def extract():
    text = web.get_page(URL)
    text = text.split('g_img={url:')[1]
    text = text.split(',')[0].replace("'", "")
    img_url = urljoin(URL, text)
    fname = img_url.split('/')[-1]
    img_ext = os.path.splitext(fname)[1]
    save_name = get_date_from_year_to_day() + img_ext
    return (img_url, save_name)
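A usage sketch (extract only returns the image URL and a date-stamped filename; actually downloading it is left to the caller):

img_url, save_name = extract()
print img_url, '->', save_name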
Example #11
def process(url):
    text = get_page(url, user_agent=True)
    doc = lx.to_doc(text)
    #lx.show_paths(doc, find='Montreal, Quebec')
    tag = doc.cssselect('h1#locationName.brTopLeft5')[0]
    city = tag.text
    print city
    tag = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0]
    celsius = tag.text_content()
    print celsius
Example #12
def demo8():
    url = "http://python.org/"
    text = get_page(url)
    # doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # doc = lx.to_doc(text)
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # print type(doc)
    # print etree.tostring(doc)
    title = doc.cssselect("html head title")[0]
    print title.text
Example #13
def extract(test=False):
    text = web.get_page(URL)
    text = text.split('g_img={url:')[1]
    text = text.split(',')[0].replace("'", "")
    img_url = urljoin(URL, text)
    fname = img_url.split('/')[-1]
    fname = unquote(fname).split('/')[-1]
    if not test:
        print '# fname:', fname
    save_name = '{date}-{fname}'.format(date=get_date_from_year_to_day(), fname=fname) 
    return (img_url, save_name)
Example #14
def process(word):
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode('utf-8')

    #txt = ascii.unicode_to_ascii(txt)
    txt = txt.replace(u'\xb7', '-')
    txt = ascii.remove_non_ascii(txt).encode('ascii')
    txt = re.sub(r'\[.*?\.gif\]', '', txt)  # remove [xxx.gif] image placeholders

    print_result(txt)
Example #15
def get_slogan(word, times=1):
    assert 1 <= times <= 10  # be nice to the server
    #
    li = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        if slogan.count('.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        if len(slogan) >= 2 and slogan[-1] == '.' and slogan[-2] == '!':
            slogan = slogan[:-1]
        li.append(slogan)

    return li
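A usage sketch (the word is hypothetical; the assert caps times at 10):

for slogan in get_slogan('python', times=3):
    print slogan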
Example #16
def definitions(word):
    """
    Fetch the definition of the word.
    """
    template = 'http://api.wordnik.com//v4/word.json/{word}/definitions?includeRelated=false&includeTags=false&useCanonical=false'
    url = prepare_url(template, word)
    try:
        decoded = json.loads(get_page(url))
        #print json.dumps(decoded)
        #
        partOfSpeech = decoded[0]['partOfSpeech']
        text = decoded[0]['text']
        d = {}
        d['partOfSpeech'] = partOfSpeech
        d['text'] = text
        return d
    except Exception:
        return None
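A usage sketch (the word is hypothetical; the function returns None on any failure, so test the result first):

d = definitions('python')
if d:
    print d['partOfSpeech'], '-', d['text']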
Example #17
    def download(self, warning=True):
        """Download yourself."""
        if os.path.exists(self.get_skip_path()):
            return False

        # else
        if not self.exists():
            if self.make_dirs():
                obj = web.get_page(self.file_url, user_agent=True, referer=True)
                fs.store_content_in_file(obj, self.get_local_path())

        ok = self.exists()
        if not ok and warning:
            print >>sys.stderr, "# warning: couldn't download {url}.".format(url=self.file_url)

        if self.readme:
            self.save_readme()

        return ok
Example #18
def check():
    """Just an example how to use the BS4 library."""
    text = str(get_page(URL))
    soup = bs.to_soup(text, PARSER)
    book = soup.find('div', {'class' : 'module bookSmall'})
    link = book.find('a', href=True)
    print link['href']
    #
    book = soup.find('div', {'class' : 'module fullBook'})
    try:
        title = book.find('span', {'property': 'dc:title'}).text.lower()
    except AttributeError:  # find() may have returned None
        title = ""
    print title
    tabs = soup.find_all('div', {'class' : 'tabModules'})[-1]
    try:
        desc = tabs.find('p', {'class' : 'paragraph'}).text.lower()
    except AttributeError:  # find() may have returned None
        desc = ""
    print desc
Example #19
def examples(word, limit=None):
    """
    Fetch examples.
    """
    template = 'http://api.wordnik.com//v4/word.json/{word}/examples'
    url = prepare_url(template, word)
    #print url
    try:
        decoded = json.loads(get_page(url))
        #print json.dumps(decoded)
        li = []
        array = decoded['examples']  # no limit, everything
        if limit:  # if limit specified
            array = array[:limit]
        for e in array:
            li.append(e['text'])
        #
        return li
    except Exception:
        return None
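A usage sketch (the word is hypothetical; guard against the None returned on failure):

for e in examples('python', limit=3) or []:
    print e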
Example #20
def extract_list():
    """
    Extract proxy list from base url.
    """
    sys.stdout.write('# extracting list')
    proxies = []
    text = get_page(BASE, user_agent=True)
    soup = bs.to_soup(text)
    proxylist = soup.findCssSelect('table.proxylist')[0]
    for tr in proxylist.findAll('tr', {'class': True}):
        if tr['class'] in ('odd', 'even'):
            cols = tr.findAll('td')
            ip = cols[0].text
            proxy_type = cols[1].text  # renamed from 'type' to avoid shadowing the built-in
            country = cols[2].text
            proxies.append(Proxy(ip, proxy_type, country))
            sys.stdout.write('.')
    #
    print 'done.'
    return proxies
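A usage sketch (Proxy is defined elsewhere in the module, so the printed form depends on its __repr__):

for proxy in extract_list():
    print proxy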
Example #21
def is_internet_on(method=1):
    """Check if the Internet connection is on."""

    if method == 1:
        # At my current place we have a wifi that redirects to a login page,
        # so we always have a connection. That's why I check the content of
        # the fetched webpage.
        text = web.get_page(URL, timeout=3)
        if text:
            if '<title>Google</title>' in text:
                return True
        # else:
        return False
    elif method == 2:
        # http://stackoverflow.com/questions/3764291/checking-network-connection
        try:
            urllib2.urlopen('http://www.google.com', timeout=1)
            return True
        except urllib2.URLError:
            return False
    else:
        print '# warning: unknown method in is_internet_on()'
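A usage sketch:

if is_internet_on(method=2):
    print 'online'
else:
    print 'offline'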
Example #22
def main():
    text = web.get_page(URL)
    soup = BeautifulSoup(text)
    
    countries = Countries()
    
    for row in soup.findAll('tr'):
        cols = row.findAll('td')
        if cols:
            rank = cols[0].text
            if rank and re.search(r'^\d+$', rank):
                country = cols[1].find('a', title=True).text
                population = int(cols[2].text.replace(',', ''))  
                #print country,':',population
                countries.add(country, population)
                
    #countries.sort()
    
    d = {}
    for country in countries.countries:
        d[country.name] = country.population
        
    print json.dumps(d)
Example #23
    def get_info(self):
        text = get_page(self.url)
        return json.loads(text)
Example #24
    assert base_url is not None
    #
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(base_url, tag['href'])

    return soup


# The patch is applied automatically when this module is imported.
css_patch()

#############################################################################

if __name__ == "__main__":
    url = "http://index.hu"
    text = web.get_page(url)
    soup = to_soup(text)
    print prettify(soup)
    #

    LINKS = """
<html>
<head>
<title>retrogames.com</title>
</head>
<a href="http://retrogames.com">Retro Games HQ</a>
<a href="/games/elite">Elite</a>
<a href="/games/commando">Commando</a>
</html>
"""
Example #25
"""
Demo for lx.py.
Download population of countries.
"""

import re

from jabbapylib.web.scraper import lx
from jabbapylib.web.web import get_page


def process(doc):
    data = {}
    
    for row in doc.cssselect('tr'):
        cols = row.cssselect('td')
        if cols:
            rank = cols[0].text
            if rank and re.search(r'^\d+$', rank):
                country = cols[1].cssselect('a[title]')[0].text
                population = int(cols[2].text.replace(',', ''))  
                data[country] = population
                
    print data

#############################################################################

if __name__ == "__main__":
    url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population'
    text = get_page(url)
    doc = lx.to_doc(text)
    process(doc)
Example #26
    def get_json(self, ip):
        text = get_page(gp_template.format(ip=ip))
        # strip the geoPlugin(...) JSONP wrapper to leave plain JSON
        text = re.sub(r"^geoPlugin\(", "", text)
        text = re.sub(r"\)$", "", text)
        return json.loads(text)
Example #27
def demo2():
    url = "http://projecteuler.net/"
    text = get_page(url)
    doc = lx.to_doc(text)
    lx.make_links_absolute(doc, base_url=url)
    print lx.tostring(doc)
Example #28
def demo9():
    url = "http://python.org/"
    text = get_page(url)
    soup = bs.to_soup(text)
    title = soup.findCssSelect("html head title")[0]
    print title.text
Example #29
    def test_store_content_in_file(self):
        content = web.get_page(GOOGLE)
        assert not os.path.exists(cfg.TEST_TMP_FILE)
        fs.store_content_in_file(content, cfg.TEST_TMP_FILE)
        assert os.path.getsize(cfg.TEST_TMP_FILE) > 0
        os.unlink(cfg.TEST_TMP_FILE)