Example #1
 def parse(self, response):
     html = response.body
     encoding = "iso-8859-15"  # TODO: use encoding from header
     parties = rows.import_from_xpath(
         io.BytesIO(html),
         encoding="iso-8859-15",  # TODO: use encoding from header
         rows_xpath='//select[@id="partido"]/option',
         fields_xpath=OrderedDict([("code", "./@value"), ("name", "./text()")]),
     )
     states = rows.import_from_xpath(
         io.BytesIO(html),
         encoding=encoding,
         rows_xpath='//select[@id="uf"]/option',
         fields_xpath=OrderedDict([("code", "./@value"), ("name", "./text()")]),
     )
     link = "http://agencia.tse.jus.br/estatistica/sead/eleitorado/filiados/uf/filiados_{party_code}_{state_code}.zip"
     for party in parties:
         for state in states:
             party_code = party.code
             if party_code == "SOLIDARIEDADE":  # Fix TSE link
                 party_code = "sd"
             url = link.format(party_code=party_code, state_code=state.code)
             download_filename = make_filepath(party_code, state.code)
             yield scrapy.Request(
                 url=url,
                 meta={
                     "filename": download_filename,
                     "party": party.name,
                     "state": state.name,
                     "url": url,
                 },
                 callback=self.save_zip,
             )
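
Note: the TODO in Example #1 points out that the ISO-8859-15 encoding is hard-coded rather than taken from the HTTP response. A minimal sketch of one way to resolve it, assuming the spider receives a Scrapy HtmlResponse (whose encoding attribute already reflects the Content-Type header, meta tags or BOM):

    def parse(self, response):
        # Use the encoding Scrapy already detected for this response
        # instead of hard-coding it
        encoding = response.encoding
        parties = rows.import_from_xpath(
            io.BytesIO(response.body),
            encoding=encoding,
            rows_xpath='//select[@id="partido"]/option',
            fields_xpath=OrderedDict([("code", "./@value"), ("name", "./text()")]),
        )
        # ... the states table and the requests are built exactly as above ...
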
Example #2
    def test_xpath_must_be_text_type(self):
        with self.assertRaises(TypeError):
            rows.import_from_xpath(self.filename,
                                   encoding=self.encoding,
                                   rows_xpath=b'//div',
                                   fields_xpath={'f1': './/span'})

        with self.assertRaises(TypeError):
            rows.import_from_xpath(self.filename,
                                   encoding=self.encoding,
                                   rows_xpath='//div',
                                   fields_xpath={'f1': b'.//span'})
Example #3
    def test_xpath_must_be_text_type(self):
        with self.assertRaises(TypeError):
            rows.import_from_xpath(
                self.filename,
                encoding=self.encoding,
                rows_xpath=b"//div",
                fields_xpath={"f1": ".//span"},
            )

        with self.assertRaises(TypeError):
            rows.import_from_xpath(
                self.filename,
                encoding=self.encoding,
                rows_xpath="//div",
                fields_xpath={"f1": b".//span"},
            )
Example #4
def get_links(date):

    # Download HTML and get all the links (text + url)
    url = 'http://www.cnj.jus.br/transparencia/remuneracao-dos-magistrados'
    response = requests.get(url)
    rows_xpath = '//a'
    fields_xpath = OrderedDict([('name', './/text()'), ('url', './@href')])
    table = rows.import_from_xpath(io.BytesIO(response.content),
                                   encoding=response.encoding,
                                   rows_xpath=rows_xpath,
                                   fields_xpath=fields_xpath)

    # Filter out links which don't point to spreadsheets
    result = []
    for row in table:
        if row.name is None or row.name == 'documento padrão' or \
                '.xls' not in row.url:
            continue

        data = {
            'name': row.name.replace('\xa0', ' '),
            'url': urljoin(url, row.url),
            'date_scraped': date,
        }
        result.append(data)
    return rows.import_from_dicts(result)
Example #5
def get_links(year, month, date_scraped):

    # Download HTML and get all the links (text + url)
    month = MONTHS[month]
    url = f'http://www.cnj.jus.br/transparencia/remuneracao-dos-magistrados/remuneracao-{month}-{year}'
    response = requests.get(url)
    if not response.ok:
        raise RuntimeError('Data not found')

    rows_xpath = '//a'
    fields_xpath = OrderedDict([('name', './/text()'), ('url', './@href')])
    table = rows.import_from_xpath(io.BytesIO(response.content),
                                   encoding=response.encoding,
                                   rows_xpath=rows_xpath,
                                   fields_xpath=fields_xpath)

    # Filter out links which don't point to spreadsheets
    result = []
    for row in table:
        if not (row.name or '').strip() or row.name == 'documento padrão' or \
                '.xls' not in row.url:
            continue

        data = {
            'date_scraped': date_scraped,
            'month': MONTHS2[month],
            'name': row.name.replace('\xa0', ' '),
            'url': unquote(urljoin(url, row.url)).strip(),
            'year': year,
        }
        result.append(data)
    return rows.import_from_dicts(result)
Example #6
    def test_xpath_must_be_text_type(self):
        with self.assertRaises(TypeError):
            rows.import_from_xpath(
                self.filename,
                encoding=self.encoding,
                rows_xpath=b"//div",
                fields_xpath={"f1": ".//span"},
            )

        with self.assertRaises(TypeError):
            rows.import_from_xpath(
                self.filename,
                encoding=self.encoding,
                rows_xpath="//div",
                fields_xpath={"f1": b".//span"},
            )
Example #7
    def parse_month(self, response):
        meta = response.request.meta
        month_meta = {"ano": meta["year"], "mes": meta["month"]}

        rows_xpath = (
            "//a[contains(@href, 'xls') and not(contains(text(), 'documento'))]"
        )
        fields_xpath = OrderedDict([("tribunal", ".//text()"), ("url", ".//@href")])
        table = rows.import_from_xpath(
            io.BytesIO(response.body),
            rows_xpath=rows_xpath,
            encoding=response.encoding,
            fields_xpath=fields_xpath,
        )
        for row in table:
            url = urljoin(self.start_urls[0], unquote(row.url)).strip()

            # Fix URLs
            url = url.replace("http:/w", "http://w")
            if " " in url:
                # Case: "http://www.cnj.jus.br/TREMS%20http:/www.cnj.jus.br/files/conteudo/arquivo/2019/04/b91f9672dfe8abfb9cd3fbc6e8a5510e.xls"
                for part in url.split():
                    if ".xls" in part:
                        url = part
                        break

            # Some links have errors (more than one URL inside), so a list of
            # URLs for the same court is generated so we can check later the
            # correct one.
            if url.count("http://") > 1:
                urls = []
                for part in url.split("http:"):
                    if not part:
                        continue
                    urls.append("http:" + part)
            else:
                urls = [url]

            for url in urls:
                filename = settings.DOWNLOAD_PATH / Path(urlparse(url).path).name
                court_meta = month_meta.copy()
                court_meta.update(
                    {
                        "baixado_em": datetime.datetime.now(),
                        "arquivo": filename.relative_to(settings.BASE_PATH),
                        "tribunal": fix_tribunal(
                            (row.tribunal or "").replace("\xa0", " ")
                        ),
                        "url": url,
                    }
                )
                yield scrapy.Request(
                    url=court_meta["url"],
                    meta={"row": court_meta},
                    callback=self.save_file,
                )
                # Yield the row so we can check later when links are incorrect
                # (repeated, 404 etc.)
                yield court_meta
Example #8
    def parse_list(self, response):
        url = "http://servicos.tce.pr.gov.br/TCEPR/Tribunal/Relacon/DadosConsulta/Pesquisa"
        table = rows.import_from_xpath(
            io.BytesIO(response.body),
            encoding=response.encoding,
            rows_xpath='//select[@id="cdMunicipio"]/option',
            fields_xpath=OrderedDict([("municipio", "./text()"),
                                      ("codigo_ibge", "./@value")]),
            force_types={"codigo_ibge": rows.fields.TextField},
        )
        cities = {
            row.municipio.strip(): row.codigo_ibge
            for row in table if row.codigo_ibge != 0
        }

        table = rows.import_from_xpath(
            io.BytesIO(response.body),
            encoding=response.encoding,
            rows_xpath='//select[@id="nrAno"]/option',
            fields_xpath=OrderedDict([("year", "./@value")]),
        )
        years = [row.year for row in table if row.year != 0]

        for year in years:
            for city, ibge_code in cities.items():
                post_data = {
                    "cdMunicipio": ibge_code,
                    "municipio": city,
                    "nrAno": str(year),
                }
                yield scrapy.FormRequest(
                    url,
                    method="POST",
                    formdata=post_data,
                    meta={
                        "city": city,
                        "ibge_code": ibge_code,
                        "year": year
                    },
                    callback=self.parse_result,
                )
Example #9
    def test_import_from_xpath_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {'encoding': 'iso-8859-15', 'some_key': 123, 'other': 456, }
        self.kwargs.update(kwargs)

        result = rows.import_from_xpath(self.filename, **self.kwargs)
        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs['meta'] = {'imported_from': 'xpath', 'filename': self.filename,}
        self.assertEqual(call[1], kwargs)
Example #10
    def test_import_from_xpath_filename(self):
        table = rows.import_from_xpath(self.filename,
                                       encoding=self.encoding,
                                       **self.kwargs)

        expected_meta = {'imported_from': 'xpath', 'filename': self.filename,}
        self.assertEqual(table.meta, expected_meta)

        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        fobj = temp.file
        rows.export_to_csv(table, fobj)
        fobj.seek(0)
        table = rows.import_from_csv(fobj)

        self.assert_table_equal(table, self.expected_table)
Example #11
 def test_import_from_xpath_unescape_and_extract_text(self):
     html = '''
       <ul>
         <li><a href="/wiki/Abadia_de_Goi%C3%A1s" title="Abadia de Goiás">Abadia de Goi&aacute;s</a> (GO)</li>
         <li><a href="/wiki/Abadi%C3%A2nia" title="Abadiânia">Abadi&acirc;nia</a> (GO)</li>
       </ul>
     '''.encode('utf-8')
     rows_xpath = '//ul/li'
     fields_xpath = OrderedDict([('name', './/text()'),
                                 ('link', './/a/@href')])
     table = rows.import_from_xpath(BytesIO(html),
                                    rows_xpath=rows_xpath,
                                    fields_xpath=fields_xpath,
                                    encoding='utf-8')
     self.assertEqual(table[0].name, 'Abadia de Goiás (GO)')
     self.assertEqual(table[1].name, 'Abadiânia (GO)')
Example #12
 def parse_result(self, response):
     meta = response.request.meta
     links = rows.import_from_xpath(
         io.BytesIO(response.body),
         encoding=response.encoding,
         rows_xpath="//a",
         fields_xpath=OrderedDict([("document_type", ".//text()"),
                                   ("url", ".//@href")]),
     )
     # TODO: what if 'links' is empty? Probably wrong ibge_code passed
     for link in links:
         link = link._asdict()
         link.update({
             "city": meta["city"],
             "ibge_code": meta["ibge_code"],
             "year": meta["year"],
         })
         yield link
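
Note: the TODO above flags that an empty links table probably means a wrong ibge_code was posted. A minimal sketch of such a guard, assuming a standard Scrapy spider (self.logger is available on any Spider); the log message itself is illustrative:

    def parse_result(self, response):
        meta = response.request.meta
        links = rows.import_from_xpath(
            io.BytesIO(response.body),
            encoding=response.encoding,
            rows_xpath="//a",
            fields_xpath=OrderedDict([("document_type", ".//text()"),
                                      ("url", ".//@href")]),
        )
        if not len(links):
            # Probably a wrong ibge_code was sent in the form request
            self.logger.warning(
                "No links found for %(city)s/%(year)s (ibge_code=%(ibge_code)s)",
                meta,
            )
            return
        for link in links:
            link = link._asdict()
            link.update({
                "city": meta["city"],
                "ibge_code": meta["ibge_code"],
                "year": meta["year"],
            })
            yield link
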
Example #13
 def test_import_from_xpath_unescape_and_extract_text(self):
     html = """
       <ul>
         <li><a href="/wiki/Abadia_de_Goi%C3%A1s" title="Abadia de Goiás">Abadia de Goi&aacute;s</a> (GO)</li>
         <li><a href="/wiki/Abadi%C3%A2nia" title="Abadiânia">Abadi&acirc;nia</a> (GO)</li>
       </ul>
     """.encode("utf-8")
     rows_xpath = "//ul/li"
     fields_xpath = OrderedDict([("name", ".//text()"),
                                 ("link", ".//a/@href")])
     table = rows.import_from_xpath(
         BytesIO(html),
         rows_xpath=rows_xpath,
         fields_xpath=fields_xpath,
         encoding="utf-8",
     )
     self.assertEqual(table[0].name, "Abadia de Goiás (GO)")
     self.assertEqual(table[1].name, "Abadiânia (GO)")
Example #14
    def test_import_from_xpath_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        encoding = "iso-8859-15"
        kwargs = {"some_key": 123, "other": 456}
        self.kwargs.update(kwargs)

        result = rows.import_from_xpath(self.filename, encoding=encoding, **self.kwargs)
        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs["meta"] = {
            "imported_from": "xpath",
            "filename": self.filename,
            "encoding": encoding,
        }
        self.assertEqual(call[1], kwargs)
Example #15
    def update_legislators(self):
        url = self.base_url + '/deputado/'
        html = self.retrieve_uri(url, post_process=False, force_encoding='utf-8')

        rows_xpath = u'//tbody/tr'
        fields_xpath = {
            u'nome': u'./td[position()=1]/a/text()',
            u'url': u'./td[position()=1]/a/@href',
            u'party': u'./td[position()=2]/text()',
            u'telefone': u'./td[position()=3]/text()',
            u'fax': u'./td[position()=4]/text()',
            u'email': u'./td[position()=5]/a[position()=1]/img/@title',
        }
        table = rows.import_from_xpath(BytesIO(html.encode('utf-8')), rows_xpath, fields_xpath)

        url_regex = re.compile(r'.*id/(\d+)')
        email_regex = re.compile(r'Email: (.*)')

        for row in table:
            _id = url_regex.match(row.url).group(1)
            email = None

            if row.email:
                email = email_regex.match(row.email).group(1).strip()

            party_siglum = self._normalize_party_siglum(row.party)
            party, party_created = PoliticalParty.objects.get_or_create(
                siglum=party_siglum
            )

            self.debug(u'New party: {0}'.format(party))

            legislator, created = Legislator.objects.get_or_create(name=row.nome)

            legislator.site = self.base_url + row.url
            legislator.email = email
            legislator.save()

            if created:
                self.debug(u'New legislator: {0}'.format(legislator))
            else:
                self.debug(u'Found existing legislator: {0}'.format(legislator))

            self.mandate_for_legislator(legislator, party, original_id=_id)
Example #16
 def test_import_from_xpath_unescape_and_extract_text(self):
     html = """
       <ul>
         <li><a href="/wiki/Abadia_de_Goi%C3%A1s" title="Abadia de Goiás">Abadia de Goi&aacute;s</a> (GO)</li>
         <li><a href="/wiki/Abadi%C3%A2nia" title="Abadiânia">Abadi&acirc;nia</a> (GO)</li>
       </ul>
     """.encode(
         "utf-8"
     )
     rows_xpath = "//ul/li"
     fields_xpath = OrderedDict([("name", ".//text()"), ("link", ".//a/@href")])
     table = rows.import_from_xpath(
         BytesIO(html),
         rows_xpath=rows_xpath,
         fields_xpath=fields_xpath,
         encoding="utf-8",
     )
     self.assertEqual(table[0].name, "Abadia de Goiás (GO)")
     self.assertEqual(table[1].name, "Abadiânia (GO)")
Example #17
    def test_import_from_xpath_fobj(self):
        # TODO: may test with codecs.open passing an encoding
        with open(self.filename, mode='rb') as fobj:
            table = rows.import_from_xpath(fobj,
                                           encoding=self.encoding,
                                           **self.kwargs)

        expected_meta = {'imported_from': 'xpath',
                         'filename': self.filename,
                         'encoding': self.encoding, }
        self.assertEqual(table.meta, expected_meta)

        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        fobj = temp.file
        rows.export_to_csv(table, fobj)
        fobj.seek(0)
        table = rows.import_from_csv(fobj)

        self.assert_table_equal(table, self.expected_table)
Example #18
    def test_import_from_xpath_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        encoding = "iso-8859-15"
        kwargs = {"some_key": 123, "other": 456}
        self.kwargs.update(kwargs)

        result = rows.import_from_xpath(self.filename,
                                        encoding=encoding,
                                        **self.kwargs)
        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs["meta"] = {
            "imported_from": "xpath",
            "filename": self.filename,
            "encoding": encoding,
        }
        self.assertEqual(call[1], kwargs)
Example #19
    def parse_detale_verba(self, elem, budget_title, budget_subtitle):
        rows_xpath = u'//tbody/tr'
        fields_xpath = {
            u'nome': u'./td[position()=1]/text()',
            u'cpf_cnpj': u'./td[position()=2]/text()',
            u'date': u'./td[position()=3]/text()',
            u'number': u'./td[position()=4]/text()',
            u'value_presented': u'./td[position()=5]/text()',
            u'value_expensed': u'./td[position()=6]/text()',
        }
        table = rows.import_from_xpath(
            BytesIO(str(elem)), rows_xpath, fields_xpath)
        for row in table:
            data = dict(row.__dict__)
            data.update({
                'budget_title': budget_title,
                'budget_subtitle': budget_subtitle,
                'cpf_cnpj': self.normalize_cnpj_or_cpf(row.cpf_cnpj),
                'value_presented': self.parse_money(row.value_presented),
                'value_expensed': self.parse_money(row.value_expensed),
            })
            self.debug(u'Generated JSON: {0}'.format(data))

            yield data
Example #20
    def parse(self, response):
        meta = response.request.meta

        table = rows.import_from_xpath(
            io.BytesIO(response.body),
            rows_xpath='//article[@class="featured-attractions__item"]',
            fields_xpath=OrderedDict([
                ("name", ".//h3//a/text()"),
                ("category", ".//h4//a/text()"),
                ("url", ".//h4//a/@href"),
                ("thumbnail_url", ".//img/@src"),
            ]),
        )
        for row in table:
            data = row._asdict()
            del data["category"]  # Not needed here, will get on next request
            yield scrapy.Request(
                url=data["url"],
                meta={"data": data},
                callback=self.parse_event,
            )

        if response.xpath("//a[text() = 'Próxima >>']"):
            yield self.make_list_request(date=meta["date"], page_number=meta["page_number"] + 1)
Example #21
import re
from collections import OrderedDict
from io import BytesIO

import requests
import rows

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

# Get data from Portuguese Wikipedia
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict([("name", ".//text()"), ("link", ".//a/@href")]),
)

regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data["link"] = urljoin("https://pt.wikipedia.org", data["link"])
    data["name"], data["state"] = regexp_city_state.findall(data["name"])[0]
    return data

Example #22
import re
from collections import OrderedDict
from io import BytesIO
from urllib2 import urlparse

import requests
import rows


# Get data from Portuguese Wikipedia
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
        BytesIO(html),
        rows_xpath='//table/tr/td/ul/li',
        fields_xpath=OrderedDict([('name', './/text()'),
                                  ('link', './/a/@href')]))

regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')

def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data

new_fields = OrderedDict()
new_fields['name'] = cities.fields['name']
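
Note: the snippet above is cut off after the first new_fields entry. A sketch of how this example typically continues, applying the transform() function defined above via rows.transform; the extra 'state' field and the exact call below are assumptions inferred from that function (which returns name, state and link):

new_fields['state'] = rows.fields.TextField  # new column produced by transform()
new_fields['link'] = cities.fields['link']
cities = rows.transform(new_fields, transform, cities)
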
Example #23
# coding: utf-8

from __future__ import unicode_literals

import os
from collections import OrderedDict

import rows

# taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(
    os.path.dirname(__file__), "../../tests/data/ecuador-medios-radiodifusoras.html"
)
rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict(
    [
        ("url", ".//h2/a/@href"),
        ("name", ".//h2/a/text()"),
        ("address", './/div[@class="spField field_direccion"]/text()'),
        ("phone", './/div[@class="spField field_telefono"]/text()'),
        ("website", './/div[@class="spField field_sitio_web"]/text()'),
        ("email", './/div[@class="spField field_email"]/text()'),
    ]
)

table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, "ecuador-radiodifusoras.csv")
Example #24
import re
from collections import OrderedDict
from io import BytesIO

import requests
import rows

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3


# Get data from Portuguese Wikipedia
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict([("name", ".//text()"), ("link", ".//a/@href")]),
)

regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data["link"] = urljoin("https://pt.wikipedia.org", data["link"])
    data["name"], data["state"] = regexp_city_state.findall(data["name"])[0]
    return data

Example #25
# coding: utf-8

from __future__ import unicode_literals

import os
from collections import OrderedDict

import rows

# taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(os.path.dirname(__file__),
                        '../../tests/data/ecuador-medios-radiodifusoras.html')
rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict([
        ('url', './/h2/a/@href'),
        ('name', './/h2/a/text()'),
        ('address', './/div[@class="spField field_direccion"]/text()'),
        ('phone', './/div[@class="spField field_telefono"]/text()'),
        ('website', './/div[@class="spField field_sitio_web"]/text()'),
        ('email', './/div[@class="spField field_email"]/text()'), ])

table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, 'ecuador-radiodifusoras.csv')
Example #26
import re
from collections import OrderedDict
from io import BytesIO
from urllib2 import urlparse

import requests
import rows

# Get data from Portuguese Wikipedia
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(BytesIO(html),
                                rows_xpath='//table/tr/td/ul/li',
                                fields_xpath=OrderedDict([
                                    ('name', './/text()'),
                                    ('link', './/a/@href')
                                ]))

regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data