def test_detectencoding2(self):
    url = 'g1.globo.com'
    response = crawler.download(url)
    data = response.read()
    # The page should declare UTF-8 and decode cleanly with it.
    charset = crawler.detectcharset(response.info(), data)
    self.assertEqual(charset, 'utf-8')
    data.decode(charset)
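# The encoding tests in this section rely on a crawler.detectcharset(headers,
# data) helper whose source is not shown here. What follows is a hypothetical
# sketch, not the real implementation: it assumes headers is the
# email.message.Message-like object returned by response.info(), tries the
# Content-Type charset first, then a charset declared in the HTML itself.
import re


def detectcharset(headers, data, default='utf-8'):
    # Prefer the charset parameter of the HTTP Content-Type header.
    charset = headers.get_content_charset()
    if charset:
        return charset.lower()
    # Fall back to a charset declared in the first bytes of the document.
    match = re.search(rb'charset=["\']?([\w-]+)', data[:2048], re.IGNORECASE)
    if match:
        return match.group(1).decode('ascii').lower()
    return default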
def main():
    """Main program. Parse arguments, run crawler, print report."""
    args = ARGS.parse_args()
    if not args.b and not args.f:
        print("Use --help for command line help")
        return
    if args.b:
        bugs = {fix_url(bug) for bug in args.b}
    else:
        bugs = [fix_url(bug) for bug in read(args.f[0])]
    try:
        output = []
        start_time = time.time()
        for bug in bugs:
            result = crawler.download(bug)
            output.append(result)
        total_time = round(time.time() - start_time, 2)
        print("It took %s seconds to download %s bug reports!" % (total_time, len(bugs)))
        report(output)
    except KeyboardInterrupt:
        print("Interrupted!")
    except crawler.BugNotFound as e:
        print("An error occurred while crawling bug: " + bug)
        print(e)
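# main() calls fix_url and read, which are defined elsewhere in the module.
# The sketches below are assumptions about their behavior, not the original
# code: fix_url is guessed to normalize a bug reference into a full URL, and
# read is guessed to yield one entry per non-empty line of a file.
def fix_url(bug):
    # Assumed behavior: prepend a scheme when the reference lacks one.
    if bug.startswith(('http://', 'https://')):
        return bug
    return 'http://' + bug


def read(path):
    # Assumed behavior: yield stripped, non-empty lines from the bug file.
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield line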
from bs4 import BeautifulSoup
import csv

import crawler

url = 'http://www.imdb.com/chart/boxoffice'
html = crawler.download(url)
soup = BeautifulSoup(html, 'html.parser')

# Locate the box-office chart and the cells of each column.
tabela = soup.find(attrs={'class': 'chart full-width'})
titulos = tabela.find_all(attrs={'class': 'titleColumn'})
valores_acumulado = tabela.find_all(attrs={'class': 'secondaryInfo'})
semanas = tabela.find_all(attrs={'class': 'weeksColumn'})

# newline='' keeps the csv module from inserting blank rows on Windows.
with open('tabela.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(('nome', 'valor', 'semanas'))
    for titulo, valor, semana in zip(titulos, valores_acumulado, semanas):
        writer.writerow((titulo.text.strip(), valor.text.strip(), semana.text.strip()))
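# A quick way to sanity-check the generated file: read tabela.csv back with
# the same delimiter and print each row. This is a usage sketch, not part of
# the original script.
import csv

with open('tabela.csv', newline='') as f:
    for row in csv.reader(f, delimiter=';'):
        print(row)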
from bs4 import BeautifulSoup

from crawler import download

url = 'https://www.rottentomatoes.com/browse/tv-list-1'
html = download(url)
soup = BeautifulSoup(html, 'html5lib')

# Each top-list entry is a <tr> carrying both classes below.
tr = soup.find_all('tr', class_='tv_show_tr tvTopListTitle')
for i in tr:
    print(i.get_text().strip())
    print('*----------------*')
import re

from crawler import download

url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
page = download(url)
# Index 1 picks the second <td class="w2p_fw"> cell, which holds the area.
area = re.findall(r'<td class="w2p_fw">(.*?)</td>', page)[1]
print(area)
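# Regex extraction like the above breaks as soon as the markup changes. An
# equivalent, sturdier lookup with BeautifulSoup (already used by the other
# scripts in this section); the indexing assumption is the same: the second
# <td class="w2p_fw"> cell holds the area on this page.
from bs4 import BeautifulSoup

from crawler import download

url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
soup = BeautifulSoup(download(url), 'html.parser')
area = soup.find_all('td', class_='w2p_fw')[1].get_text()
print(area)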
def test_download(self):
    response = crawler.download(self.url)
    self.assertTrue(self.url in response.geturl())
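# The tests in this section exercise a crawler.download helper whose source
# is not shown here. Below is a minimal compatible sketch, assuming download
# returns the raw urllib response (so geturl(), info() and read() all work
# as the tests expect); the real crawler.download may differ.
from urllib.request import Request, urlopen


def download(url):
    # Normalize bare hostnames such as 'g1.globo.com' to full URLs.
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    request = Request(url, headers={'User-Agent': 'crawler-tests'})
    return urlopen(request)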
def test_detectdomain(self):
    url = 'folha.com.br'
    response = crawler.download(url)
    self.assertEqual('www.folha.uol.com.br', crawler.detectdomain(response))
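# crawler.detectdomain is not shown either; judging by the assertion above,
# it extracts the final hostname after redirects. A sketch under that
# assumption (the real helper may differ):
from urllib.parse import urlparse


def detectdomain(response):
    # geturl() reflects the final URL after redirects, so a request to
    # 'folha.com.br' resolves to 'www.folha.uol.com.br'.
    return urlparse(response.geturl()).netloc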
def test_detectencoding(self):
    response = crawler.download(self.url)
    data = response.read()
    charset = crawler.detectcharset(response.info(), data)
    self.assertEqual(charset, 'iso-8859-1')
    data.decode(charset)