def espn_schedule():
    for year in [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]:
        time.sleep(30)
        for month in [3, 4, 5, 6, 7, 8, 9, 10]:
            for day in [1, 8, 15, 22, 29]:
                if month == 3 and day != 29:
                    # The season starts in late March, so only the 29th
                    # is checked for March.
                    continue
                time.sleep(2)
                # Zero-pad month and day into a YYYYMMDD integer without
                # reassigning the loop variables to strings.
                date = int('%04d%02d%02d' % (year, month, day))
                browser = create_browser()
                page = browser.open('http://espn.go.com/mlb/schedule?date=%s' % date)
                html = parse_html(page.read())
                for tr in html.cssselect('.mod-content tr'):
                    for a in tr.cssselect('td a'):
                        link = a.get('href')
                        print link
                        try:
                            espn_id = int(link.split('?id=')[1])
                            print espn_id
                            get_or_create_espnid(espn_id=espn_id, date=date)
                        except Exception:
                            # Skip anchors without a "?id=" game id and rows
                            # that fail to save.
                            continue
                        # Only the first game link in each row is needed.
                        break
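The example above relies on helpers that are not shown (create_browser, parse_html, get_or_create_espnid). A minimal sketch of the first two, assuming mechanize for the browser and lxml for parsing; get_or_create_espnid is the project's own persistence step and is left out:

import mechanize
import lxml.html

def create_browser():
    # Browser that ignores robots.txt and presents a desktop User-Agent.
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla/5.0')]
    return browser

def parse_html(raw):
    # Returns an lxml element tree, so .cssselect() works as used above.
    return lxml.html.fromstring(raw)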
Example #2
def main():
    global news_scraper

    # Install a urllib2 opener with a browser-like User-Agent and a Referer
    # header so the site serves the page normally.
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0'),
        ('Referer', news_scraper.base_url)
    ]
    urllib2.install_opener(opener)

    error_count = 0
    while error_count < 3:
        try:
            html = scraperwiki.scrape(news_scraper.base_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'): print e.reason
            elif hasattr(e, 'code'): print e.code
            error_count += 1
            # Retry; falling through here would use an undefined html below.
            continue

        try:
            html = html.decode(news_scraper.page_encoding)
        except UnicodeDecodeError:
            # Fall back to decoding word by word, dropping only the words
            # with bad byte sequences instead of losing the whole page.
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(news_scraper.page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()

        num_of_article = scrape(parse_html(html))

        print 'Headline,', num_of_article, 'article(s)'
        break
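Example #2 assumes a module-level news_scraper object carrying the site configuration, plus scrape() and parse_html() helpers defined elsewhere (parse_html can be the same lxml-based helper sketched under Example #1). A rough sketch of the shape they would need; the class, URL, and CSS selector below are placeholders, not the original code:

import scraperwiki

class NewsScraperConfig:
    base_url      = 'http://example.com/news'   # placeholder URL
    page_encoding = 'utf-8'

news_scraper = NewsScraperConfig()

def scrape(doc):
    # Save each headline link and return how many articles were found.
    count = 0
    for a in doc.cssselect('a.headline'):        # placeholder selector
        scraperwiki.sqlite.save(['url'], {'url': a.get('href'),
                                          'title': a.text_content()})
        count += 1
    return count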
Example #3
def main():
    global base_url, page_num_of_article, page_start, page_step, page_encoding, page_sleep, count

    count = 0

    # Resume from the page saved in the ScraperWiki datastore; if nothing is
    # saved yet, start from the first page and note the newest article seen.
    last_page      = scraperwiki.sqlite.get_var('last_page', -1)
    latest_article = None
    start_over     = False
    if last_page == -1:
        last_page      = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', None)
        start_over     = True

    # Install a urllib2 opener with a browser-like User-Agent and Referer.
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0'),
        ('Referer', base_url)
    ]
    urllib2.install_opener(opener)

    # Keep paging for as long as each page yields a full page of articles.
    error_count    = 0
    num_of_article = page_num_of_article
    while num_of_article == page_num_of_article:
        page_url = build_url(last_page)

        try:
            html = scraperwiki.scrape(page_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'): print e.reason
            elif hasattr(e, 'code'): print e.code
            error_count += 1
            if error_count < 3: continue
            else: break

        try:
            html = html.decode(page_encoding)
        except UnicodeDecodeError:
            # Decode word by word, dropping the words that fail to decode.
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()

        num_of_article = scrape(parse_html(html), latest_article, start_over)

        # Report progress as a 1-based page number.
        page = last_page / page_step
        if page_start == 0: page += 1

        scraperwiki.sqlite.save_var('last_page', last_page)
        print 'Page', page, ',', num_of_article, 'article(s)'

        last_page += page_step
        if not page_exists(html, last_page): break
        time.sleep(page_sleep)
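Example #3 also depends on two paging helpers that are not shown, build_url and page_exists. Plausible stand-ins under the assumption of a simple offset query parameter (the parameter name and the pager check are guesses based on how the loop uses them):

def build_url(offset):
    # Append the paging offset to the listing URL.
    return '%s?start=%d' % (base_url, offset)

def page_exists(html, offset):
    # Treat the next page as existing while the current page still links to
    # that offset; a real scraper would inspect the site's pager markup.
    return ('start=%d' % offset) in html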