# -*- coding: latin1 -*-
"""
Walk the list of links published at
'http://www.cms.ba.gov.br/vereadores.aspx', open each linked page, and
capture a few fields from it.

Every visited page becomes one object inside the "Vereadores" list; once
all links have been processed the script calls ``save('json')`` and
``save('xml')`` on the capture session.
"""

from webcapture import WebCapture

# Link targets found on the index page are appended to this root.
SITE_ROOT = "http://www.cms.ba.gov.br/"

# (field name, selector) pairs captured from every linked page, in order.
FIELD_SELECTORS = (
    ("nome", ".nome_vereador"),
    ("cargo", ".cargo_ver"),
    ("partido", ".partido_vereador"),
    ("telefone", ".telefones_vereador"),
    ("email", ".email_vereador"),
)

wb = WebCapture()
wb.accessUrl("http://www.cms.ba.gov.br/vereadores.aspx")
wb.creatList("Vereadores")

# Anchor tags located under the photo containers of the index page.
link_tags = wb.getTagList("#meio_conteudo .foto_vereador_int", "a")
for link_tag in link_tags:
    wb.accessUrl(SITE_ROOT + link_tag.attrib["href"])

    # One object per linked page, opened and closed around its captures.
    wb.creatObject()
    for field_name, selector in FIELD_SELECTORS:
        wb.capture(field_name, selector)
    wb.closeObject()

wb.save('json')
wb.save('xml')
# -*- coding: latin1 -*-
"""
Submit a form once per available option and capture data from each
response page.

The script loads 'http://www2.camara.leg.br/deputados/pesquisa', takes
the form matched by "#formDepAtual", and for every value offered by its
"deputado" field sets that value, submits the form, and captures a few
fields from the resulting page into the "Deputados" list.
"""

from webcapture import WebCapture

# (field name, selector, keyword arguments) for each capture, in order.
# The ``format`` regex and ``filter`` values are passed through untouched.
CAPTURE_SPECS = (
    ("nome", "#content ul li:nth-child(1)",
     {"format": r".*: (.*)", "filter": 1}),
    ("aniversario", "#content ul li:nth-child(2)",
     {"format": r".*: (\d*) / (\d*) .*", "filter": "\\1/\\2"}),
    ("profissao", "#content ul li:nth-child(2)",
     {"format": r".*: .*: (.*)", "filter": "\\1"}),
)

wb = WebCapture()
wb.accessUrl("http://www2.camara.leg.br/deputados/pesquisa")
wb.creatList("Deputados")

form = wb.getform("#formDepAtual")
deputado_options = form.getValues("deputado")

# NOTE(review): no closeObject()/save() call is visible for this script in
# the reviewed text - it may continue beyond this point; confirm.
for option in deputado_options:
    # Select this option on the form, then load the page it produces.
    form.setParam("deputado", option.value)
    wb.accessForm(form)

    wb.creatObject()
    for field_name, selector, capture_kwargs in CAPTURE_SPECS:
        wb.capture(field_name, selector, **capture_kwargs)