Example #1
	def test_detectencoding2(self):
		url = 'g1.globo.com'
		response = crawler.download(url)
		data = response.read()
		charset = crawler.detectcharset(response.info(), data)
		self.assertEqual(charset, 'utf-8')
		# Decoding must succeed without raising if the charset was detected correctly.
		data.decode(charset)
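These unit tests call into a crawler module that is not shown on this page. A minimal sketch of what the call sites imply, purely as an assumption (download() returning an open urllib response, detectcharset() checking the Content-Type header before the page body):

import re
import urllib.request

def download(url):
    # Assumed: add a scheme when missing and return the raw response,
    # since the tests call .read(), .info() and .geturl() on the result.
    if not url.startswith('http'):
        url = 'http://' + url
    return urllib.request.urlopen(url)

def detectcharset(headers, data):
    # Prefer the charset declared in the HTTP Content-Type header.
    charset = headers.get_content_charset()
    if charset:
        return charset
    # Fall back to a charset declaration inside the page itself.
    match = re.search(rb'charset=["\']?([\w-]+)', data)
    return match.group(1).decode('ascii').lower() if match else 'utf-8'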
Example #2
def main():
  """Main program.

  Parse arguments, run crawler, print report.
  """

  args = ARGS.parse_args()
  if not args.b and not args.f:
    print("Use --help for command line help")
    return

  if args.b:
    # A set comprehension drops duplicate bugs given on the command line.
    bugs = {fix_url(bug) for bug in args.b}
  else:
    bugs = [fix_url(bug) for bug in read(args.f[0])]

  try:
    output = []
    start_time = time.time()

    for bug in bugs:
      result = crawler.download(bug)
      output.append(result)

    total_time = round(time.time() - start_time, 2)
    print("It took %s seconds to download %s bug reports!" % (total_time, len(bugs)))

    report(output)
  except KeyboardInterrupt:
    print("Interrupted!")
  except crawler.BugNotFound as e:
    print("An error occurred while crawling bug: " + bug)
    print(e)
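The snippet leans on two helpers, fix_url() and read(), that are not shown anywhere on this page. A plausible sketch inferred from the call sites; the tracker URL and both bodies are assumptions, not the project's actual code:

def fix_url(bug):
  # Hypothetical: accept a full tracker URL or a bare bug id.
  if bug.startswith('http'):
    return bug
  return 'https://bugs.example.org/show_bug.cgi?id=' + bug

def read(path):
  # Hypothetical: one bug reference per line, blank lines skipped.
  with open(path) as f:
    return [line.strip() for line in f if line.strip()]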
Example #3
from bs4 import BeautifulSoup
import csv
import crawler

url = 'http://www.imdb.com/chart/boxoffice'
html = crawler.download(url)
soup = BeautifulSoup(html, 'html.parser')

tabela = soup.find(attrs={'class': 'chart full-width'})
titulos = tabela.find_all(attrs={'class': 'titleColumn'})
valores_acumulado = tabela.find_all(attrs={'class': 'secondaryInfo'})
semanas = tabela.find_all(attrs={'class': 'weeksColumn'})

# A with block closes the file even if writing fails, and newline=''
# is the csv module's recommended setting for output files.
with open('tabela.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(('nome', 'valor', 'semanas'))

    for x in range(len(valores_acumulado)):
        titulo = titulos[x].text.strip()
        valor_acumulado = valores_acumulado[x].text.strip()
        semana = semanas[x].text.strip()
        writer.writerow((titulo, valor_acumulado, semana))
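Indexing three parallel result lists by position works, but it silently assumes they are all the same length. A zip-based loop (a stylistic alternative, not part of the original; it would replace the for loop inside the with block) avoids the index bookkeeping and stops at the shortest list:

for titulo, valor, semana in zip(titulos, valores_acumulado, semanas):
    writer.writerow((titulo.text.strip(), valor.text.strip(), semana.text.strip()))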
Example #4
from bs4 import BeautifulSoup
from crawler import download

url = 'https://www.rottentomatoes.com/browse/tv-list-1'
html = download(url)
soup = BeautifulSoup(html, 'html5lib')
# Matches rows whose class attribute is exactly 'tv_show_tr tvTopListTitle'.
tr = soup.find_all('tr', class_='tv_show_tr tvTopListTitle')

for i in tr:
    print(i.get_text().strip())
    print('*----------------*')
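The same rows can also be selected with a CSS selector, which matches the two classes independently instead of requiring the exact attribute string (a one-line alternative, not part of the original snippet):

tr = soup.select('tr.tv_show_tr.tvTopListTitle')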
Example #5
import re
from crawler import download

url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
page = download(url)
# Index 1 picks the second <td class="w2p_fw"> cell, which holds the
# country's area on this page.
area = re.findall(r'<td class="w2p_fw">(.*?)</td>', page)[1]
print(area)
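A regex tied to the exact tag markup breaks as soon as the page's HTML changes slightly. An equivalent BeautifulSoup lookup (shown as a sturdier alternative, assuming the same page structure) reads:

from bs4 import BeautifulSoup

soup = BeautifulSoup(page, 'html.parser')
area = soup.find_all('td', class_='w2p_fw')[1].text
print(area)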
Example #6
	def test_download(self):
		response = crawler.download(self.url)
		self.assertIn(self.url, response.geturl())
Example #7
	def test_detectdomain(self):
		# folha.com.br resolves to the canonical www.folha.uol.com.br host.
		url = 'folha.com.br'
		response = crawler.download(url)
		self.assertEqual('www.folha.uol.com.br', crawler.detectdomain(response))
Example #8
	def test_detectencoding(self):
		response = crawler.download(self.url)
		data = response.read()
		charset = crawler.detectcharset(response.info(), data)
		self.assertEqual(charset, 'iso-8859-1')
		# Decoding must succeed without raising if the charset was detected correctly.
		data.decode(charset)