def parse(self, response):
    html = response.body
    encoding = "iso-8859-15"  # TODO: use encoding from header
    parties = rows.import_from_xpath(
        io.BytesIO(html),
        encoding=encoding,
        rows_xpath='//select[@id="partido"]/option',
        fields_xpath=OrderedDict([("code", "./@value"), ("name", "./text()")]),
    )
    states = rows.import_from_xpath(
        io.BytesIO(html),
        encoding=encoding,
        rows_xpath='//select[@id="uf"]/option',
        fields_xpath=OrderedDict([("code", "./@value"), ("name", "./text()")]),
    )
    link = "http://agencia.tse.jus.br/estatistica/sead/eleitorado/filiados/uf/filiados_{party_code}_{state_code}.zip"
    for party in parties:
        for state in states:
            party_code = party.code
            if party_code == "SOLIDARIEDADE":  # Fix TSE link
                party_code = "sd"
            url = link.format(party_code=party_code, state_code=state.code)
            download_filename = make_filepath(party_code, state.code)
            yield scrapy.Request(
                url=url,
                meta={
                    "filename": download_filename,
                    "party": party.name,
                    "state": state.name,
                    "url": url,
                },
                callback=self.save_zip,
            )
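# Hedged sketch for the TODO above ("use encoding from header"): scrapy's
# TextResponse already resolves the charset from the Content-Type header,
# so `encoding = response.encoding` is usually enough. Parsing the header
# by hand would look roughly like this (the helper name is hypothetical,
# not part of the original spider):
def encoding_from_headers(response, default="iso-8859-15"):
    # scrapy header values are bytes; fall back to the hardcoded default
    content_type = response.headers.get("Content-Type", b"")
    for part in content_type.decode("ascii", "ignore").split(";"):
        part = part.strip()
        if part.lower().startswith("charset="):
            return part.split("=", 1)[1].strip('"').lower()
    return default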
def test_xpath_must_be_text_type(self):
    with self.assertRaises(TypeError):
        rows.import_from_xpath(
            self.filename,
            encoding=self.encoding,
            rows_xpath=b"//div",
            fields_xpath={"f1": ".//span"},
        )
    with self.assertRaises(TypeError):
        rows.import_from_xpath(
            self.filename,
            encoding=self.encoding,
            rows_xpath="//div",
            fields_xpath={"f1": b".//span"},
        )
def get_links(date):
    # Download HTML and get all the links (text + url)
    url = 'http://www.cnj.jus.br/transparencia/remuneracao-dos-magistrados'
    response = requests.get(url)
    rows_xpath = '//a'
    fields_xpath = OrderedDict([('name', './/text()'), ('url', './@href')])
    table = rows.import_from_xpath(
        io.BytesIO(response.content),
        encoding=response.encoding,
        rows_xpath=rows_xpath,
        fields_xpath=fields_xpath,
    )

    # Filter out links which don't point to spreadsheets
    result = []
    for row in table:
        if row.name is None or row.name == 'documento padrão' or \
                '.xls' not in row.url:
            continue
        data = {
            'name': row.name.replace('\xa0', ' '),
            'url': urljoin(url, row.url),
            'date_scraped': date,
        }
        result.append(data)

    return rows.import_from_dicts(result)
def get_links(year, month, date_scraped):
    # Download HTML and get all the links (text + url)
    month = MONTHS[month]
    url = f'http://www.cnj.jus.br/transparencia/remuneracao-dos-magistrados/remuneracao-{month}-{year}'
    response = requests.get(url)
    if not response.ok:
        raise RuntimeError('Data not found')
    rows_xpath = '//a'
    fields_xpath = OrderedDict([('name', './/text()'), ('url', './@href')])
    table = rows.import_from_xpath(
        io.BytesIO(response.content),
        encoding=response.encoding,
        rows_xpath=rows_xpath,
        fields_xpath=fields_xpath,
    )

    # Filter out links which don't point to spreadsheets
    result = []
    for row in table:
        if not (row.name or '').strip() or row.name == 'documento padrão' or \
                '.xls' not in row.url:
            continue
        data = {
            'date_scraped': date_scraped,
            'month': MONTHS2[month],
            'name': row.name.replace('\xa0', ' '),
            'url': unquote(urljoin(url, row.url)).strip(),
            'year': year,
        }
        result.append(data)

    return rows.import_from_dicts(result)
def parse_month(self, response):
    meta = response.request.meta
    month_meta = {"ano": meta["year"], "mes": meta["month"]}
    rows_xpath = (
        "//a[contains(@href, 'xls') and not(contains(text(), 'documento'))]"
    )
    fields_xpath = OrderedDict([("tribunal", ".//text()"), ("url", ".//@href")])
    table = rows.import_from_xpath(
        io.BytesIO(response.body),
        rows_xpath=rows_xpath,
        encoding=response.encoding,
        fields_xpath=fields_xpath,
    )
    for row in table:
        url = urljoin(self.start_urls[0], unquote(row.url)).strip()
        # Fix URLs
        url = url.replace("http:/w", "http://w")
        if " " in url:
            # Case: "http://www.cnj.jus.br/TREMS%20http:/www.cnj.jus.br/files/conteudo/arquivo/2019/04/b91f9672dfe8abfb9cd3fbc6e8a5510e.xls"
            for part in url.split():
                if ".xls" in part:
                    url = part
                    break
        # Some links have errors (more than one URL inside), so a list of
        # URLs for the same court is generated so we can check later the
        # correct one.
        if url.count("http://") > 1:
            urls = []
            for part in url.split("http:"):
                if not part:
                    continue
                urls.append("http:" + part)
        else:
            urls = [url]
        for url in urls:
            filename = settings.DOWNLOAD_PATH / Path(urlparse(url).path).name
            court_meta = month_meta.copy()
            court_meta.update(
                {
                    "baixado_em": datetime.datetime.now(),
                    "arquivo": filename.relative_to(settings.BASE_PATH),
                    "tribunal": fix_tribunal(
                        (row.tribunal or "").replace("\xa0", " ")
                    ),
                    "url": url,
                }
            )
            yield scrapy.Request(
                url=court_meta["url"],
                meta={"row": court_meta},
                callback=self.save_file,
            )
            # Yield the row so we can check later when links are incorrect
            # (repeated, 404 etc.)
            yield court_meta
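# Self-contained illustration (not from the original spider) of the
# multi-URL fix above: some hrefs concatenate two URLs, and splitting on
# "http:" then re-prefixing each non-empty part recovers them. The sample
# URL below is made up.
url = "http://www.cnj.jus.br/a.xlshttp://www.cnj.jus.br/b.xls"
urls = ["http:" + part for part in url.split("http:") if part]
assert urls == ["http://www.cnj.jus.br/a.xls", "http://www.cnj.jus.br/b.xls"]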
def parse_list(self, response):
    url = "http://servicos.tce.pr.gov.br/TCEPR/Tribunal/Relacon/DadosConsulta/Pesquisa"
    table = rows.import_from_xpath(
        io.BytesIO(response.body),
        encoding=response.encoding,
        rows_xpath='//select[@id="cdMunicipio"]/option',
        fields_xpath=OrderedDict(
            [("municipio", "./text()"), ("codigo_ibge", "./@value")]
        ),
        force_types={"codigo_ibge": rows.fields.TextField},
    )
    cities = {
        row.municipio.strip(): row.codigo_ibge
        for row in table
        if row.codigo_ibge != 0
    }
    table = rows.import_from_xpath(
        io.BytesIO(response.body),
        encoding=response.encoding,
        rows_xpath='//select[@id="nrAno"]/option',
        fields_xpath=OrderedDict([("year", "./@value")]),
    )
    years = [row.year for row in table if row.year != 0]
    for year in years:
        for city, ibge_code in cities.items():
            post_data = {
                "cdMunicipio": ibge_code,
                "municipio": city,
                "nrAno": str(year),
            }
            yield scrapy.FormRequest(
                url,
                method="POST",
                formdata=post_data,
                meta={"city": city, "ibge_code": ibge_code, "year": year},
                callback=self.parse_result,
            )
def test_import_from_xpath_uses_create_table(self, mocked_create_table):
    mocked_create_table.return_value = 42
    kwargs = {'encoding': 'iso-8859-15', 'some_key': 123, 'other': 456, }
    self.kwargs.update(kwargs)

    result = rows.import_from_xpath(self.filename, **self.kwargs)
    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(result, 42)

    call = mocked_create_table.call_args
    kwargs['meta'] = {'imported_from': 'xpath', 'filename': self.filename, }
    self.assertEqual(call[1], kwargs)
def test_import_from_xpath_filename(self):
    table = rows.import_from_xpath(self.filename, encoding=self.encoding,
                                   **self.kwargs)

    expected_meta = {'imported_from': 'xpath', 'filename': self.filename, }
    self.assertEqual(table.meta, expected_meta)

    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    fobj = temp.file
    rows.export_to_csv(table, fobj)
    fobj.seek(0)
    table = rows.import_from_csv(fobj)
    self.assert_table_equal(table, self.expected_table)
def parse_result(self, response):
    meta = response.request.meta
    links = rows.import_from_xpath(
        io.BytesIO(response.body),
        encoding=response.encoding,
        rows_xpath="//a",
        fields_xpath=OrderedDict(
            [("document_type", ".//text()"), ("url", ".//@href")]
        ),
    )
    # TODO: what if 'links' is empty? Probably wrong ibge_code passed
    for link in links:
        link = link._asdict()
        link.update(
            {
                "city": meta["city"],
                "ibge_code": meta["ibge_code"],
                "year": meta["year"],
            }
        )
        yield link
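# Hedged sketch for the TODO above: an empty `links` table usually means a
# wrong ibge_code was posted, so it is worth logging instead of silently
# yielding nothing. The guard below is illustrative, not part of the
# original spider (rows tables support len(); inside the spider `logger`
# would be scrapy's standard `self.logger`):
def warn_if_empty(links, meta, logger):
    if not len(links):
        logger.warning(
            "No links found for %s (ibge_code=%s, year=%s) - "
            "possibly a wrong ibge_code",
            meta["city"], meta["ibge_code"], meta["year"],
        )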
def test_import_from_xpath_unescape_and_extract_text(self):
    html = """
    <ul>
      <li><a href="/wiki/Abadia_de_Goi%C3%A1s" title="Abadia de Goiás">Abadia de Goiás</a> (GO)</li>
      <li><a href="/wiki/Abadi%C3%A2nia" title="Abadiânia">Abadiânia</a> (GO)</li>
    </ul>
    """.encode("utf-8")
    rows_xpath = "//ul/li"
    fields_xpath = OrderedDict([("name", ".//text()"), ("link", ".//a/@href")])
    table = rows.import_from_xpath(
        BytesIO(html),
        rows_xpath=rows_xpath,
        fields_xpath=fields_xpath,
        encoding="utf-8",
    )
    self.assertEqual(table[0].name, "Abadia de Goiás (GO)")
    self.assertEqual(table[1].name, "Abadiânia (GO)")
def test_import_from_xpath_uses_create_table(self, mocked_create_table):
    mocked_create_table.return_value = 42
    encoding = "iso-8859-15"
    kwargs = {"some_key": 123, "other": 456}
    self.kwargs.update(kwargs)

    result = rows.import_from_xpath(self.filename, encoding=encoding, **self.kwargs)
    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(result, 42)

    call = mocked_create_table.call_args
    kwargs["meta"] = {
        "imported_from": "xpath",
        "filename": self.filename,
        "encoding": encoding,
    }
    self.assertEqual(call[1], kwargs)
def update_legislators(self):
    url = self.base_url + '/deputado/'
    html = self.retrieve_uri(url, post_process=False, force_encoding='utf-8')

    rows_xpath = u'//tbody/tr'
    fields_xpath = {
        u'nome': u'./td[position()=1]/a/text()',
        u'url': u'./td[position()=1]/a/@href',
        u'party': u'./td[position()=2]/text()',
        u'telefone': u'./td[position()=3]/text()',
        u'fax': u'./td[position()=4]/text()',
        u'email': u'./td[position()=5]/a[position()=1]/img/@title',
    }
    table = rows.import_from_xpath(BytesIO(html.encode('utf-8')),
                                   rows_xpath, fields_xpath)

    url_regex = re.compile(r'.*id/(\d+)')
    email_regex = re.compile(r'Email: (.*)')
    for row in table:
        _id = url_regex.match(row.url).group(1)

        email = None
        if row.email:
            email = email_regex.match(row.email).group(1).strip()

        party_siglum = self._normalize_party_siglum(row.party)
        party, party_created = PoliticalParty.objects.get_or_create(
            siglum=party_siglum
        )
        if party_created:
            self.debug(u'New party: {0}'.format(party))

        legislator, created = Legislator.objects.get_or_create(name=row.nome)
        legislator.site = self.base_url + row.url
        legislator.email = email
        legislator.save()

        if created:
            self.debug(u'New legislator: {0}'.format(legislator))
        else:
            self.debug(u'Found existing legislator: {0}'.format(legislator))

        self.mandate_for_legislator(legislator, party, original_id=_id)
def test_import_from_xpath_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    with open(self.filename, mode='rb') as fobj:
        table = rows.import_from_xpath(fobj, encoding=self.encoding,
                                       **self.kwargs)

    expected_meta = {'imported_from': 'xpath',
                     'filename': self.filename,
                     'encoding': self.encoding, }
    self.assertEqual(table.meta, expected_meta)

    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    fobj = temp.file
    rows.export_to_csv(table, fobj)
    fobj.seek(0)
    table = rows.import_from_csv(fobj)
    self.assert_table_equal(table, self.expected_table)
def parse_detale_verba(self, elem, budget_title, budget_subtitle):
    rows_xpath = u'//tbody/tr'
    fields_xpath = {
        u'nome': u'./td[position()=1]/text()',
        u'cpf_cnpj': u'./td[position()=2]/text()',
        u'date': u'./td[position()=3]/text()',
        u'number': u'./td[position()=4]/text()',
        u'value_presented': u'./td[position()=5]/text()',
        u'value_expensed': u'./td[position()=6]/text()',
    }
    table = rows.import_from_xpath(
        BytesIO(str(elem)), rows_xpath, fields_xpath)

    for row in table:
        data = dict(row.__dict__)
        data.update({
            'budget_title': budget_title,
            'budget_subtitle': budget_subtitle,
            'cpf_cnpj': self.normalize_cnpj_or_cpf(row.cpf_cnpj),
            'value_presented': self.parse_money(row.value_presented),
            'value_expensed': self.parse_money(row.value_expensed),
        })
        self.debug(u'Generated JSON: {0}'.format(data))
        yield data
def parse(self, response):
    meta = response.request.meta
    table = rows.import_from_xpath(
        io.BytesIO(response.body),
        rows_xpath='//article[@class="featured-attractions__item"]',
        fields_xpath=OrderedDict(
            [
                ("name", ".//h3//a/text()"),
                ("category", ".//h4//a/text()"),
                ("url", ".//h4//a/@href"),
                ("thumbnail_url", ".//img/@src"),
            ]
        ),
    )
    for row in table:
        data = row._asdict()
        del data["category"]  # Not needed here, will get on next request
        yield scrapy.Request(
            url=data["url"],
            meta={"data": data},
            callback=self.parse_event,
        )

    if response.xpath("//a[text() = 'Próxima >>']"):
        yield self.make_list_request(
            date=meta["date"], page_number=meta["page_number"] + 1
        )
import re
from collections import OrderedDict
from io import BytesIO

import requests
import rows

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

# Get data from Portuguese Wikipedia
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict([("name", ".//text()"), ("link", ".//a/@href")]),
)
regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'
    data = row._asdict()
    data["link"] = urljoin("https://pt.wikipedia.org", data["link"])
    data["name"], data["state"] = regexp_city_state.findall(data["name"])[0]
    return data
import re
from collections import OrderedDict
from io import BytesIO
from urllib2 import urlparse  # Python 2 only; use urllib.parse on Python 3

import requests
import rows

# Get data from Portuguese Wikipedia
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath='//table/tr/td/ul/li',
    fields_xpath=OrderedDict([('name', './/text()'), ('link', './/a/@href')]))
regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'
    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data


new_fields = OrderedDict()
new_fields['name'] = cities.fields['name']
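# Hedged continuation of the snippet above, which stops while assembling
# `new_fields`: a sketch of where this usually leads, assuming the rows
# library's transform helper, rows.transform(fields, function, *tables),
# which applies function(row, table) to each row and returns a new table
# with the given fields. The 'state' field type is an assumption based on
# what transform() adds to each row; the final call is illustrative, not
# taken from the original script.
new_fields['link'] = cities.fields['link']
new_fields['state'] = rows.fields.TextField
new_cities = rows.transform(new_fields, transform, cities)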
# coding: utf-8

from __future__ import unicode_literals

import os
from collections import OrderedDict

import rows

# taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(
    os.path.dirname(__file__), "../../tests/data/ecuador-medios-radiodifusoras.html"
)

rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict(
    [
        ("url", ".//h2/a/@href"),
        ("name", ".//h2/a/text()"),
        ("address", './/div[@class="spField field_direccion"]/text()'),
        ("phone", './/div[@class="spField field_telefono"]/text()'),
        ("website", './/div[@class="spField field_sitio_web"]/text()'),
        ("email", './/div[@class="spField field_email"]/text()'),
    ]
)
table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, "ecuador-radiodifusoras.csv")