# Download the Afghanistan country page and print the absolute URL of
# its national-flag image (the src attribute inside the flag table row).
from lxml.html import fromstring

from advanced_link_crawler import download

page_html = download(
    'http://example.webscraping.com/places/default/view/Afghanistan-1')
doc = fromstring(page_html)
# First @src under the flag row's value cell is the flag image path.
flag_src = doc.xpath(
    '//tr[@id="places_national_flag__row"]/td[@class="w2p_fw"]//@src')[0]
print('http://example.webscraping.com' + flag_src)
# Benchmark four scraper implementations (regex, BeautifulSoup, lxml,
# lxml+XPath) against the cached Singapore page and report wall-clock time.
import time
import re
from all_scrapers import re_scraper, bs_scraper, \
    lxml_scraper, lxml_xpath_scraper
from advanced_link_crawler import download

NUM_ITERATIONS = 1000  # number of times to test each scraper

html = download('http://example.webscraping.com/places/default/view/Singapore-203')

SCRAPERS = (
    ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
    ('Xpath', lxml_xpath_scraper),
)

for name, scrape in SCRAPERS:
    # record start time of scrape
    start = time.time()
    for _ in range(NUM_ITERATIONS):
        if scrape is re_scraper:
            # flush re's internal pattern cache so the regex scraper
            # pays its compilation cost on every iteration
            re.purge()
        result = scrape(html)
        # check scraped result is as expected
        assert result['area'] == '692 square kilometres'
    # record end time of scrape and output the total
    elapsed = time.time() - start
    print('%s: %.2f seconds' % (name, elapsed))
# Time each scraper implementation over many repetitions of the
# Afghanistan page and print a per-scraper total in seconds.
import time
import re
from all_scrapers import re_scraper, bs_scraper, \
    lxml_scraper, lxml_xpath_scraper
from advanced_link_crawler import download

NUM_ITERATIONS = 1000  # number of times to test each scraper

html = download(
    'http://example.webscraping.com/places/default/view/Afghanistan-1')

scraper_table = [
    ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
    ('Xpath', lxml_xpath_scraper),
]

for label, scrape_fn in scraper_table:
    # record start time of scrape
    started_at = time.time()
    for _ in range(NUM_ITERATIONS):
        if scrape_fn is re_scraper:
            # drop re's cached compiled patterns so the regex scraper
            # is timed without cross-iteration caching benefits
            re.purge()
        record = scrape_fn(html)
        # check scraped result is as expected
        assert record['area'] == '647,500 square kilometres'
    # record end time of scrape and output the total
    finished_at = time.time()
    print('%s: %.2f seconds' % (label, finished_at - started_at))
def fetch_youtube_url(watch_id):
    """Download the YouTube watch page for the given video id.

    Falls back to a default sample video when *watch_id* is empty.
    The original check was ``watch_id == ""``, which let ``None`` (or
    any other falsy non-string) through and produced a broken URL;
    ``not watch_id`` handles both cases while keeping the same
    behavior for the empty-string input.

    Args:
        watch_id: YouTube video id, e.g. ``'0uUoqD8a0V4'``; empty/None
            selects the default video.

    Returns:
        Whatever ``download`` returns for the watch-page URL
        (presumably the page HTML — confirm against the crawler module).
    """
    if not watch_id:
        watch_id = '0uUoqD8a0V4'  # default demo video
    url = "https://www.youtube.com/watch?v=" + watch_id
    return download(url)