Example #1
def say_with_google(word, autoremove=True, background=False, debug=False):
    """
    Say a word with Google.

    https://ubuntuincident.wordpress.com/2012/03/27/audio-pronunciation-of-words-from-google/
    The return value is a tuple: (found, mp3_file), where
    found is True if the word was retrieved successfully (False otherwise), and
    mp3_file is the path of the locally saved mp3 (or None if it was not saved).
    Set autoremove to False if you want to work with the mp3 after this
    function returns.
    The function stores the mp3 files in /tmp.
    """
    found = False  # Was the mp3 successfully found?
    mp3_file = None  # Is the locally saved mp3 file kept?
    url = template.format(word=word)
    content = web.get_page(url, user_agent=True)
    if content:
        found = True
        fname = '/tmp/{word}.mp3'.format(word=word)
        fs.store_content_in_file(content, fname, overwrite=True)
        mp3_file = fname
        if not debug:
            play(fname, background=background)
        if autoremove:
            os.unlink(fname)
            mp3_file = None
    else:
        found = False
        mp3_file = None

    return (found, mp3_file)
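A minimal usage sketch (assuming this module's imports are in place; the word is hypothetical, and autoremove=False keeps the mp3 in /tmp as the docstring describes):

found, mp3 = say_with_google('hello', autoremove=False)
if found:
    print mp3    # e.g. /tmp/hello.mp3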
Example #2
def get_my_external_ip():
    """
    Get my external IP.
    
    Local IP: http://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
    """
    return get_page('http://ifconfig.me/ip')
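A usage sketch (prints whatever ifconfig.me returns, typically the external IP as plain text):

print get_my_external_ip()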
Example #3
def extract_image_urls(url):
    origin = None
    li = []

    text = get_page(url, user_agent=True, referer=True)
    soup = bs.to_soup(text)

    #    this version worked for a day:
    #    for pic in soup.findCssSelect('div.pic'):
    #        a = pic.find('a', href=True)
    #        if a:
    #            li.append(a['href'])

    #   here is a new version, updated to the changes
    for div in soup.findCssSelect("div.pic"):
        img = div.find("img")
        if img and img.has_key("src"):
            li.append(img["src"].replace("/small/", "/large/"))

    for div in soup.findCssSelect("html body form#aspnetForm div#main div"):
        result = re.search(r"URL: (http://.*)View full images", div.text)
        if result:
            origin = result.group(1)

    return origin, li
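A hedged usage sketch (the gallery URL is hypothetical; origin stays None if the regex finds no match):

origin, urls = extract_image_urls('http://example.com/gallery')
for u in urls:
    print u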
Example #4
def visit(blog, dic):
    url = 'http://{name}.wordpress.com'.format(name=blog)
    text = get_page(url)
    soup = bs.to_soup(text)
    hits = soup.findCssSelect('div#blog-stats ul li')[0].text
    hits = int(hits.replace('hits','').replace(',','').strip())
    #
    dic[url] = hits
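A usage sketch (the blog name is hypothetical; visit stores the hit count in the dict it is given):

dic = {}
visit('exampleblog', dic)
print dic    # {'http://exampleblog.wordpress.com': <hits>}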
Example #5
def setup_module(module):
    """runs just once per module"""
    global GOOGLE_HTML
    GOOGLE_HTML = web.get_page(GOOGLE)
    try:
        os.unlink(cfg.TEST_TMP_FILE)
    except OSError:
        pass    # the file may not have existed
Example #6
def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images."""
    text = get_page(url)
    doc = lx.to_doc(text)
    
    subpages = get_subpages(doc)
    images = extract_images_from_pages(subpages)
    
    return images
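A usage sketch (the start URL is hypothetical; get_subpages and extract_images_from_pages are defined elsewhere in the same module):

for img in get_image_url_list('http://example.com/albums'):
    print img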
Example #7
def main(url):
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html, method=web.HTML2TEXT)

    #txt = ascii.unicode_to_ascii(txt)
    #txt = txt.replace(u'\xb7', '-')
    #txt = ascii.remove_non_ascii(txt).encode('ascii')

    print_result(txt)
Example #8
def extract_images_from_pages(pages):
    """Extract images from subpages."""
    li = []
    for page in pages:
        doc = lx.to_doc(get_page(page))
        image = get_jpg_image(doc)
        li.append(image)

    return [x for x in li if x]  # remove None elems
Example #9
def process(word):
    """Process the given word.
    
    The return value is a tuple: (word, hyphenation, pronunciation mp3)."""
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    doc = lx.to_doc(html)
    
    return (word, get_hyphen(doc), get_mp3(doc))
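A usage sketch based on the docstring's return tuple (the word is hypothetical):

word, hyphen, mp3 = process('hello')
print hyphen
print mp3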
Example #10
def extract():
    text = web.get_page(URL)
    text = text.split('g_img={url:')[1]
    text = text.split(',')[0].replace("'", "")
    img_url = urljoin(URL, text)
    fname = img_url.split('/')[-1]
    img_ext = os.path.splitext(fname)[1]
    save_name = get_date_from_year_to_day() + img_ext
    return (img_url, save_name)
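A usage sketch (extract only returns the image URL and a date-stamped filename; actually downloading it is left to the caller):

img_url, save_name = extract()
print img_url, '->', save_name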
Example #11
def process(url):
    text = get_page(url, user_agent=True)
    doc = lx.to_doc(text)
    #lx.show_paths(doc, find='Montreal, Quebec')
    tag = doc.cssselect('h1#locationName.brTopLeft5')[0]
    city = tag.text
    print city
    tag = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0]
    celsius = tag.text_content()
    print celsius
Example #12
def demo8():
    url = "http://python.org/"
    text = get_page(url)
    # doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # doc = lx.to_doc(text)
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # print type(doc)
    # print etree.tostring(doc)
    title = doc.cssselect("html head title")[0]
    print title.text
Example #13
def extract(test=False):
    text = web.get_page(URL)
    text = text.split('g_img={url:')[1]
    text = text.split(',')[0].replace("'", "")
    img_url = urljoin(URL, text)
    fname = img_url.split('/')[-1]
    fname = unquote(fname).split('/')[-1]
    if not test:
        print '# fname:', fname
    save_name = '{date}-{fname}'.format(date=get_date_from_year_to_day(), fname=fname) 
    return (img_url, save_name)
Example #14
def process(word):
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode('utf-8')

    #txt = ascii.unicode_to_ascii(txt)
    txt = txt.replace(u'\xb7', '-')
    txt = ascii.remove_non_ascii(txt).encode('ascii')
    txt = re.sub(r'\[.*?\.gif\]', '', txt)  # remove [xxx.gif] image placeholders

    print_result(txt)
Example #15
def get_slogan(word, times=1):
    assert 1 <= times <= 10  # be nice to the server
    #
    li = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        if slogan.count('.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        if len(slogan) >= 2 and slogan[-1] == '.' and slogan[-2] == '!':
            slogan = slogan[:-1]
        li.append(slogan)

    return li
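A usage sketch (the word is hypothetical; the assert caps times at 10):

for slogan in get_slogan('python', times=3):
    print slogan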
Example #16
def definitions(word):
    """
    Fetch the definition of the word.
    """
    template = 'http://api.wordnik.com//v4/word.json/{word}/definitions?includeRelated=false&includeTags=false&useCanonical=false'
    url = prepare_url(template, word)
    try:
        decoded = json.loads(get_page(url))
        #print json.dumps(decoded)
        #
        partOfSpeech = decoded[0]['partOfSpeech']
        text = decoded[0]['text']
        d = {}
        d['partOfSpeech'] = partOfSpeech
        d['text'] = text
        return d
    except Exception:
        return None
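A usage sketch (the word is hypothetical; the function returns None on any failure, so test the result first):

d = definitions('python')
if d:
    print d['partOfSpeech'], '-', d['text']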
Example #17
    def download(self, warning=True):
        """Download yourself."""
        if os.path.exists(self.get_skip_path()):
            return False

        # else
        if not self.exists():
            if self.make_dirs():
                obj = web.get_page(self.file_url, user_agent=True, referer=True)
                fs.store_content_in_file(obj, self.get_local_path())

        ok = self.exists()
        if not ok and warning:
            print >>sys.stderr, "# warning: couldn't download {url}.".format(url=self.file_url)

        if self.readme:
            self.save_readme()

        return ok
Example #18
def check():
    """Just an example how to use the BS4 library."""
    text = str(get_page(URL))
    soup = bs.to_soup(text, PARSER)
    book = soup.find('div', {'class' : 'module bookSmall'})
    link = book.find('a', href=True)
    print link['href']
    #
    book = soup.find('div', {'class' : 'module fullBook'})
    try:
        title = book.find('span', {'property': 'dc:title'}).text.lower()
    except AttributeError:  # find() may have returned None
        title = ""
    print title
    tabs = soup.find_all('div', {'class' : 'tabModules'})[-1]
    try:
        desc = tabs.find('p', {'class' : 'paragraph'}).text.lower()
    except AttributeError:  # find() may have returned None
        desc = ""
    print desc
Example #19
def examples(word, limit=None):
    """
    Fetch examples.
    """
    template = 'http://api.wordnik.com//v4/word.json/{word}/examples'
    url = prepare_url(template, word)
    #print url
    try:
        decoded = json.loads(get_page(url))
        #print json.dumps(decoded)
        li = []
        array = decoded['examples']  # no limit, everything
        if limit:  # if limit specified
            array = array[:limit]
        for e in array:
            li.append(e['text'])
        #
        return li
    except Exception:
        return None
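A usage sketch (the word is hypothetical; guard against the None returned on failure):

for e in examples('python', limit=3) or []:
    print e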
Example #20
def extract_list():
    """
    Extract proxy list from base url.
    """
    sys.stdout.write('# extracting list')
    proxies = []
    text = get_page(BASE, user_agent=True)
    soup = bs.to_soup(text)
    proxylist = soup.findCssSelect('table.proxylist')[0]
    for tr in proxylist.findAll('tr', {'class': True}):
        if tr['class'] in ('odd', 'even'):
            cols = tr.findAll('td')
            ip = cols[0].text
            proxy_type = cols[1].text  # renamed from 'type' to avoid shadowing the built-in
            country = cols[2].text
            proxies.append(Proxy(ip, proxy_type, country))
            sys.stdout.write('.')
    #
    print 'done.'
    return proxies
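A usage sketch (Proxy is defined elsewhere in the module, so the printed form depends on its __repr__):

for proxy in extract_list():
    print proxy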
Example #21
def is_internet_on(method=1):
    """Check if the Internet connection is on."""

    if method == 1:
        # At my current place we have a wifi that redirects to a login page,
        # so we always have a connection. That's why I check the content of
        # the fetched webpage.
        text = web.get_page(URL, timeout=3)
        if text:
            if '<title>Google</title>' in text:
                return True
        # else:
        return False
    elif method == 2:
        # http://stackoverflow.com/questions/3764291/checking-network-connection
        try:
            urllib2.urlopen('http://www.google.com', timeout=1)
            return True
        except urllib2.URLError:
            return False
    else:
        print '# warning: unknown method in is_internet_on()'
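A usage sketch:

if is_internet_on(method=2):
    print 'online'
else:
    print 'offline'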
Example #22
def main():
    text = web.get_page(URL)
    soup = BeautifulSoup(text)
    
    countries = Countries()
    
    for row in soup.findAll('tr'):
        cols = row.findAll('td')
        if cols:
            rank = cols[0].text
            if rank and re.search(r'^\d+$', rank):
                country = cols[1].find('a', title=True).text
                population = int(cols[2].text.replace(',', ''))  
                #print country,':',population
                countries.add(country, population)
                
    #countries.sort()
    
    d = {}
    for country in countries.countries:
        d[country.name] = country.population
        
    print json.dumps(d)
Example #23
    def get_info(self):
        text = get_page(self.url)
        return json.loads(text)
Example #24
    assert base_url is not None
    #
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(base_url, tag['href'])

    return soup


# The patch is applied automatically when this module is imported.
css_patch()

#############################################################################

if __name__ == "__main__":
    url = "http://index.hu"
    text = web.get_page(url)
    soup = to_soup(text)
    print prettify(soup)
    #

    LINKS = """
<html>
<head>
<title>retrogames.com</title>
</head>
<a href="http://retrogames.com">Retro Games HQ</a>
<a href="/games/elite">Elite</a>
<a href="/games/commando">Commando</a>
</html>
"""
Example #25
"""
Demo for lx.py.
Download population of countries.
"""

import re

from jabbapylib.web.scraper import lx
from jabbapylib.web.web import get_page


def process(doc):
    data = {}
    
    for row in doc.cssselect('tr'):
        cols = row.cssselect('td')
        if cols:
            rank = cols[0].text
            if rank and re.search(r'^\d+$', rank):
                country = cols[1].cssselect('a[title]')[0].text
                population = int(cols[2].text.replace(',', ''))  
                data[country] = population
                
    print data

#############################################################################

if __name__ == "__main__":
    url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population'
    text = get_page(url)
    doc = lx.to_doc(text)
    process(doc)
Example #26
    def get_json(self, ip):
        text = get_page(gp_template.format(ip=ip))
        # strip the geoPlugin(...) JSONP wrapper to leave plain JSON
        text = re.sub(r"^geoPlugin\(", "", text)
        text = re.sub(r"\)$", "", text)
        return json.loads(text)
Example #27
def demo2():
    url = "http://projecteuler.net/"
    text = get_page(url)
    doc = lx.to_doc(text)
    lx.make_links_absolute(doc, base_url=url)
    print lx.tostring(doc)
Example #28
def demo9():
    url = "http://python.org/"
    text = get_page(url)
    soup = bs.to_soup(text)
    title = soup.findCssSelect("html head title")[0]
    print title.text
Example #29
    def test_store_content_in_file(self):
        content = web.get_page(GOOGLE)
        assert not os.path.exists(cfg.TEST_TMP_FILE)
        fs.store_content_in_file(content, cfg.TEST_TMP_FILE)
        assert os.path.getsize(cfg.TEST_TMP_FILE) > 0
        os.unlink(cfg.TEST_TMP_FILE)